Commit 18d95a28, authored by Peter Zijlstra, committed by Ingo Molnar

sched: fair-group: SMP-nice for group scheduling

Implement SMP nice support for the full group hierarchy.

On each load-balance action, compile a sched_domain wide view of the full
task_group tree. We compute the domain wide view when walking down the
hierarchy, and readjust the weights when walking back up.

After collecting and readjusting the domain wide view, we try to balance the
tasks within the task_groups. The current approach is to naively balance each
task group until we've moved the targeted amount of load.

Inspired by Srivatsa Vaddagiri's previous code and Abhishek Chandra's H-SMP
paper.

XXX: there will be some numerical issues due to the limited nature of
     SCHED_LOAD_SCALE with respect to representing a task_group's influence
     on the total weight. When the tree is deep enough, or the task weight
     small enough, we'll run out of bits.
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
CC: Abhishek Chandra <chandra@cs.umn.edu>
CC: Srivatsa Vaddagiri <vatsa@linux.vnet.ibm.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
parent 1d3504fc
...@@ -758,6 +758,7 @@ struct sched_domain { ...@@ -758,6 +758,7 @@ struct sched_domain {
struct sched_domain *child; /* bottom domain must be null terminated */ struct sched_domain *child; /* bottom domain must be null terminated */
struct sched_group *groups; /* the balancing groups of the domain */ struct sched_group *groups; /* the balancing groups of the domain */
cpumask_t span; /* span of all CPUs in this domain */ cpumask_t span; /* span of all CPUs in this domain */
int first_cpu; /* cache of the first cpu in this domain */
unsigned long min_interval; /* Minimum balance interval ms */ unsigned long min_interval; /* Minimum balance interval ms */
unsigned long max_interval; /* Maximum balance interval ms */ unsigned long max_interval; /* Maximum balance interval ms */
unsigned int busy_factor; /* less balancing by factor if busy */ unsigned int busy_factor; /* less balancing by factor if busy */
......
...@@ -316,6 +316,8 @@ static DEFINE_MUTEX(doms_cur_mutex); ...@@ -316,6 +316,8 @@ static DEFINE_MUTEX(doms_cur_mutex);
# define INIT_TASK_GROUP_LOAD NICE_0_LOAD # define INIT_TASK_GROUP_LOAD NICE_0_LOAD
#endif #endif
#define MIN_SHARES 2
static int init_task_group_load = INIT_TASK_GROUP_LOAD; static int init_task_group_load = INIT_TASK_GROUP_LOAD;
#endif #endif
...@@ -403,6 +405,43 @@ struct cfs_rq { ...@@ -403,6 +405,43 @@ struct cfs_rq {
*/ */
struct list_head leaf_cfs_rq_list; struct list_head leaf_cfs_rq_list;
struct task_group *tg; /* group that "owns" this runqueue */ struct task_group *tg; /* group that "owns" this runqueue */
#ifdef CONFIG_SMP
unsigned long task_weight;
unsigned long shares;
/*
* We need space to build a sched_domain wide view of the full task
* group tree, in order to avoid depending on dynamic memory allocation
* during the load balancing we place this in the per cpu task group
* hierarchy. This limits the load balancing to one instance per cpu,
* but more should not be needed anyway.
*/
/*
* Per sched_domain aggregate numbers of this group; accessed through
* aggregate(), which picks the instance on the domain's first cpu.
*/
struct aggregate_struct {
/*
* load = weight(cpus) * f(tg)
*
* Where f(tg) is the recursive weight fraction assigned to
* this group.
*
* Written by aggregate_group_load().
*/
unsigned long load;
/*
* Part of the group weight distributed to this span.
*
* Written by aggregate_group_shares() and corrected for rounding
* by aggregate_group_set_shares().
*/
unsigned long shares;
/*
* The sum of all runqueue weights within this span.
*
* Written by aggregate_group_weight().
*/
unsigned long rq_weight;
/*
* Weight contributed by tasks; this is the part we can
* influence by moving tasks around.
*
* Written by aggregate_group_weight().
*/
unsigned long task_weight;
} aggregate;
#endif
#endif #endif
}; };
...@@ -1402,11 +1441,390 @@ static void cpuacct_charge(struct task_struct *tsk, u64 cputime); ...@@ -1402,11 +1441,390 @@ static void cpuacct_charge(struct task_struct *tsk, u64 cputime);
static inline void cpuacct_charge(struct task_struct *tsk, u64 cputime) {} static inline void cpuacct_charge(struct task_struct *tsk, u64 cputime) {}
#endif #endif
/*
 * Add @load to the load of runqueue @rq.
 */
static inline void inc_cpu_load(struct rq *rq, unsigned long load)
{
	update_load_add(&rq->load, load);
}
/*
 * Remove @load from the load of runqueue @rq.
 */
static inline void dec_cpu_load(struct rq *rq, unsigned long load)
{
	update_load_sub(&rq->load, load);
}
#ifdef CONFIG_SMP #ifdef CONFIG_SMP
static unsigned long source_load(int cpu, int type); static unsigned long source_load(int cpu, int type);
static unsigned long target_load(int cpu, int type); static unsigned long target_load(int cpu, int type);
static unsigned long cpu_avg_load_per_task(int cpu); static unsigned long cpu_avg_load_per_task(int cpu);
static int task_hot(struct task_struct *p, u64 now, struct sched_domain *sd); static int task_hot(struct task_struct *p, u64 now, struct sched_domain *sd);
#ifdef CONFIG_FAIR_GROUP_SCHED
/*
* Group load balancing.
*
* We calculate a few balance domain wide aggregate numbers; load and weight.
* Given the pictures below, and assuming each item has equal weight:
*
*        root          1 - thread
*       / | \          A - group
*      A  1  B
*     /|\   / \
*    C 2 D 3   4
*    |   |
*    5   6
*
* load:
* A and B get 1/3-rd of the total load. C and D get 1/3-rd of A's 1/3-rd,
* which equals 1/9-th of the total load.
*
* shares:
* The weight of this group on the selected cpus.
*
* rq_weight:
* Direct sum of the rq weights of all the cpus, e.g. A would get 3 while
* B would get 2.
*
* task_weight:
* Part of the rq_weight contributed by tasks; all groups except B would
* get 1, B gets 2.
*/
/*
 * Return the aggregate data of @tg for the domain @sd. It is kept in the
 * cfs_rq that @tg has on the first cpu of the domain.
 */
static inline struct aggregate_struct *
aggregate(struct task_group *tg, struct sched_domain *sd)
{
	int cpu = sd->first_cpu;

	return &tg->cfs_rq[cpu]->aggregate;
}
/* Per-group callback type used by aggregate_walk_tree() for its @down and @up hooks. */
typedef void (*aggregate_func)(struct task_group *, struct sched_domain *);
/*
* Iterate the full tree, calling @down when first entering a node and @up when
* leaving it for the final time.
*
* The walk starts at root_task_group and is done without recursion: descending
* into a child jumps back to the "down" label, and returning from a finished
* child jumps into the middle of the sibling loop at the "up" label. The
* whole walk runs between rcu_read_lock() and rcu_read_unlock().
*/
static
void aggregate_walk_tree(aggregate_func down, aggregate_func up,
struct sched_domain *sd)
{
struct task_group *parent, *child;
rcu_read_lock();
parent = &root_task_group;
down:
(*down)(parent, sd);
list_for_each_entry_rcu(child, &parent->children, siblings) {
/* descend: the child becomes the node being visited */
parent = child;
goto down;
up:
/*
* Re-entered from below with @child and @parent restored;
* carry on with the next sibling.
*/
continue;
}
/* every child of @parent has been handled (or it has none) */
(*up)(parent, sd);
/*
* Step back up one level. @child is set to the node just finished so
* the sibling loop above can resume right after it.
*/
child = parent;
parent = parent->parent;
if (parent)
goto up;
rcu_read_unlock();
}
/*
 * Sum the runqueue weight and the task weight of @tg over every cpu in the
 * span of @sd and record both totals in the group's aggregate.
 */
static
void aggregate_group_weight(struct task_group *tg, struct sched_domain *sd)
{
	unsigned long weight_sum = 0, task_sum = 0;
	int cpu;

	for_each_cpu_mask(cpu, sd->span) {
		weight_sum += tg->cfs_rq[cpu]->load.weight;
		task_sum += tg->cfs_rq[cpu]->task_weight;
	}

	aggregate(tg, sd)->rq_weight = weight_sum;
	aggregate(tg, sd)->task_weight = task_sum;
}
/*
* Redistribute tg->shares amongst all tg->cfs_rq[]s.
*/
static void __aggregate_redistribute_shares(struct task_group *tg)
{
int i, max_cpu = smp_processor_id();
unsigned long rq_weight = 0;
unsigned long shares, max_shares = 0, shares_rem = tg->shares;
for_each_possible_cpu(i)
rq_weight += tg->cfs_rq[i]->load.weight;
for_each_possible_cpu(i) {
/*
* divide shares proportional to the rq_weights.
*/
shares = tg->shares * tg->cfs_rq[i]->load.weight;
shares /= rq_weight + 1;
tg->cfs_rq[i]->shares = shares;
if (shares > max_shares) {
max_shares = shares;
max_cpu = i;
}
shares_rem -= shares;
}
/*
* Ensure it all adds up to tg->shares; we can loose a few
* due to rounding down when computing the per-cpu shares.
*/
if (shares_rem)
tg->cfs_rq[max_cpu]->shares += shares_rem;
}
/*
 * Compute the weight of this group on the given cpus.
 *
 * The per-cpu shares of @tg are summed over the span of @sd and the total
 * is stored in the group's aggregate for that domain.
 */
static
void aggregate_group_shares(struct task_group *tg, struct sched_domain *sd)
{
	unsigned long shares = 0;
	int retried = 0;
	int i;

again:
	for_each_cpu_mask(i, sd->span)
		shares += tg->cfs_rq[i]->shares;

	/*
	 * When the span doesn't have any shares assigned, but does have
	 * tasks to run do a machine wide rebalance (should be rare).
	 *
	 * Do this at most once. The redistribution rounds down and hands
	 * the remainder to a single cpu, which need not be in this span, so
	 * a span that is light compared to the rest of the machine can come
	 * out with zero shares again; retrying without bound would then
	 * never terminate. shares is still 0 whenever we jump back.
	 */
	if (unlikely(!shares && aggregate(tg, sd)->rq_weight) && !retried) {
		retried = 1;
		__aggregate_redistribute_shares(tg);
		goto again;
	}

	aggregate(tg, sd)->shares = shares;
}
/*
 * Compute the load fraction assigned to this group. This relies on the
 * aggregate weight and on the load of this group's parent, i.e. it must be
 * evaluated top-down.
 */
static
void aggregate_group_load(struct task_group *tg, struct sched_domain *sd)
{
	struct task_group *parent = tg->parent;
	unsigned long load;

	if (parent) {
		/*
		 * shares is our weight in the parent's rq, so
		 * shares/parent->rq_weight gives our fraction of the
		 * parent's load.
		 */
		load = aggregate(parent, sd)->load;
		load *= aggregate(tg, sd)->shares;
		load /= aggregate(parent, sd)->rq_weight + 1;
	} else {
		int cpu;

		/* no parent: take the summed rq load of the span */
		load = 0;
		for_each_cpu_mask(cpu, sd->span)
			load += cpu_rq(cpu)->load.weight;
	}

	aggregate(tg, sd)->load = load;
}
static void __set_se_shares(struct sched_entity *se, unsigned long shares);
/*
 * Calculate and set the shares of group @tg on cpu @tcpu, based on the
 * group's aggregate for @sd. Does nothing when the group has no entity on
 * that cpu.
 */
static void
__update_group_shares_cpu(struct task_group *tg, struct sched_domain *sd,
			  int tcpu)
{
	unsigned long weight, new_shares;
	int boosted;

	if (!tg->se[tcpu])
		return;

	weight = tg->cfs_rq[tcpu]->load.weight;

	/*
	 * If there are currently no tasks on the cpu pretend there is one of
	 * average load so that when a new task gets to run here it will not
	 * get delayed by group starvation.
	 */
	boosted = !weight;
	if (boosted)
		weight = NICE_0_LOAD;

	/*
	 *           \Sum shares * rq_weight
	 * shares =  -----------------------
	 *               \Sum rq_weight
	 */
	new_shares = aggregate(tg, sd)->shares * weight;
	new_shares /= aggregate(tg, sd)->rq_weight + 1;

	/*
	 * Record the actual number of shares, not the boosted amount.
	 */
	if (boosted)
		tg->cfs_rq[tcpu]->shares = 0;
	else
		tg->cfs_rq[tcpu]->shares = new_shares;

	/* the entity itself never gets less than MIN_SHARES */
	if (new_shares < MIN_SHARES)
		new_shares = MIN_SHARES;

	__set_se_shares(tg->se[tcpu], new_shares);
}
/*
 * Re-adjust the weights on the cpu the task came from (@scpu) and on the
 * cpu the task went to (@dcpu).
 */
static void
__move_group_shares(struct task_group *tg, struct sched_domain *sd,
		    int scpu, int dcpu)
{
	unsigned long before, delta;

	before = tg->cfs_rq[scpu]->shares + tg->cfs_rq[dcpu]->shares;

	__update_group_shares_cpu(tg, sd, scpu);
	__update_group_shares_cpu(tg, sd, dcpu);

	/*
	 * Ensure we never lose shares due to rounding errors in the above
	 * redistribution: any difference in the combined total is put on
	 * the destination cpu.
	 */
	delta = before - (tg->cfs_rq[scpu]->shares + tg->cfs_rq[dcpu]->shares);
	if (delta)
		tg->cfs_rq[dcpu]->shares += delta;
}
/*
 * Because changing a group's shares changes the weight of the super-group
 * we need to walk up the tree and change all shares until we hit the root.
 */
static void
move_group_shares(struct task_group *tg, struct sched_domain *sd,
		  int scpu, int dcpu)
{
	for (; tg; tg = tg->parent)
		__move_group_shares(tg, sd, scpu, dcpu);
}
/*
 * Recompute the shares of @tg on every cpu in the span of @sd, each under
 * that cpu's rq lock, then refresh the aggregate and compensate for any
 * rounding difference on the domain's first cpu.
 */
static
void aggregate_group_set_shares(struct task_group *tg, struct sched_domain *sd)
{
	unsigned long delta = aggregate(tg, sd)->shares;
	int cpu;

	for_each_cpu_mask(cpu, sd->span) {
		unsigned long flags;
		struct rq *rq = cpu_rq(cpu);

		spin_lock_irqsave(&rq->lock, flags);
		__update_group_shares_cpu(tg, sd, cpu);
		spin_unlock_irqrestore(&rq->lock, flags);
	}

	/* re-sum the per-cpu values that were just written */
	aggregate_group_shares(tg, sd);

	/*
	 * Ensure we never lose shares due to rounding errors in the above
	 * redistribution.
	 */
	delta -= aggregate(tg, sd)->shares;
	if (delta) {
		tg->cfs_rq[sd->first_cpu]->shares += delta;
		aggregate(tg, sd)->shares += delta;
	}
}
/*
 * Tree-walk "down" hook: calculate the accumulated weight, the shares and
 * the recursive load of @tg for @sd, in that order.
 */
static
void aggregate_get_down(struct task_group *tg, struct sched_domain *sd)
{
	aggregate_group_weight(tg, sd);
	aggregate_group_shares(tg, sd);
	aggregate_group_load(tg, sd);
}
/*
 * Tree-walk "up" hook: rebalance the cpu shares of @tg while walking back
 * up the tree.
 */
static
void aggregate_get_up(struct task_group *tg, struct sched_domain *sd)
{
	aggregate_group_set_shares(tg, sd);
}
static DEFINE_PER_CPU(spinlock_t, aggregate_lock);
/*
 * Initialise the per-cpu aggregate_lock of every possible cpu.
 */
static void __init init_aggregate(void)
{
	int i;

	for_each_possible_cpu(i) {
		spin_lock_init(&per_cpu(aggregate_lock, i));
	}
}
/*
 * Try to take the aggregate lock of the domain's first cpu and, if that
 * succeeds, build the aggregate view of the task group tree for @sd.
 *
 * Returns 1 with the lock held (release it with put_aggregate()), or 0 when
 * the lock could not be taken, in which case nothing is computed.
 */
static int get_aggregate(struct sched_domain *sd)
{
	if (spin_trylock(&per_cpu(aggregate_lock, sd->first_cpu))) {
		aggregate_walk_tree(aggregate_get_down, aggregate_get_up, sd);
		return 1;
	}

	return 0;
}
/*
 * Release the aggregate lock of the domain's first cpu, as taken by a
 * successful get_aggregate().
 */
static void put_aggregate(struct sched_domain *sd)
{
	spin_unlock(&per_cpu(aggregate_lock, sd->first_cpu));
}
/*
 * Set the recorded shares of @cfs_rq to @shares.
 */
static void cfs_rq_set_shares(struct cfs_rq *cfs_rq, unsigned long shares)
{
	cfs_rq->shares = shares;
}
#else
/* Without CONFIG_FAIR_GROUP_SCHED there is no aggregate state to set up. */
static inline void init_aggregate(void)
{
}
/*
 * Without CONFIG_FAIR_GROUP_SCHED nothing is aggregated or locked; always
 * returns 0.
 */
static inline int get_aggregate(struct sched_domain *sd)
{
	return 0;
}
/* Without CONFIG_FAIR_GROUP_SCHED there is nothing to release. */
static inline void put_aggregate(struct sched_domain *sd)
{
}
#endif
#else /* CONFIG_SMP */
#ifdef CONFIG_FAIR_GROUP_SCHED
/* Without CONFIG_SMP this variant does nothing with @cfs_rq or @shares. */
static void cfs_rq_set_shares(struct cfs_rq *cfs_rq, unsigned long shares)
{
}
#endif
#endif /* CONFIG_SMP */ #endif /* CONFIG_SMP */
#include "sched_stats.h" #include "sched_stats.h"
...@@ -1419,26 +1837,14 @@ static int task_hot(struct task_struct *p, u64 now, struct sched_domain *sd); ...@@ -1419,26 +1837,14 @@ static int task_hot(struct task_struct *p, u64 now, struct sched_domain *sd);
#define sched_class_highest (&rt_sched_class) #define sched_class_highest (&rt_sched_class)
static inline void inc_load(struct rq *rq, const struct task_struct *p) static void inc_nr_running(struct rq *rq)
{
update_load_add(&rq->load, p->se.load.weight);
}
static inline void dec_load(struct rq *rq, const struct task_struct *p)
{
update_load_sub(&rq->load, p->se.load.weight);
}
static void inc_nr_running(struct task_struct *p, struct rq *rq)
{ {
rq->nr_running++; rq->nr_running++;
inc_load(rq, p);
} }
static void dec_nr_running(struct task_struct *p, struct rq *rq) static void dec_nr_running(struct rq *rq)
{ {
rq->nr_running--; rq->nr_running--;
dec_load(rq, p);
} }
static void set_load_weight(struct task_struct *p) static void set_load_weight(struct task_struct *p)
...@@ -1530,7 +1936,7 @@ static void activate_task(struct rq *rq, struct task_struct *p, int wakeup) ...@@ -1530,7 +1936,7 @@ static void activate_task(struct rq *rq, struct task_struct *p, int wakeup)
rq->nr_uninterruptible--; rq->nr_uninterruptible--;
enqueue_task(rq, p, wakeup); enqueue_task(rq, p, wakeup);
inc_nr_running(p, rq); inc_nr_running(rq);
} }
/* /*
...@@ -1542,7 +1948,7 @@ static void deactivate_task(struct rq *rq, struct task_struct *p, int sleep) ...@@ -1542,7 +1948,7 @@ static void deactivate_task(struct rq *rq, struct task_struct *p, int sleep)
rq->nr_uninterruptible++; rq->nr_uninterruptible++;
dequeue_task(rq, p, sleep); dequeue_task(rq, p, sleep);
dec_nr_running(p, rq); dec_nr_running(rq);
} }
/** /**
...@@ -2194,7 +2600,7 @@ void wake_up_new_task(struct task_struct *p, unsigned long clone_flags) ...@@ -2194,7 +2600,7 @@ void wake_up_new_task(struct task_struct *p, unsigned long clone_flags)
* management (if any): * management (if any):
*/ */
p->sched_class->task_new(rq, p); p->sched_class->task_new(rq, p);
inc_nr_running(p, rq); inc_nr_running(rq);
} }
check_preempt_curr(rq, p); check_preempt_curr(rq, p);
#ifdef CONFIG_SMP #ifdef CONFIG_SMP
...@@ -3185,9 +3591,12 @@ static int load_balance(int this_cpu, struct rq *this_rq, ...@@ -3185,9 +3591,12 @@ static int load_balance(int this_cpu, struct rq *this_rq,
unsigned long imbalance; unsigned long imbalance;
struct rq *busiest; struct rq *busiest;
unsigned long flags; unsigned long flags;
int unlock_aggregate;
cpus_setall(*cpus); cpus_setall(*cpus);
unlock_aggregate = get_aggregate(sd);
/* /*
* When power savings policy is enabled for the parent domain, idle * When power savings policy is enabled for the parent domain, idle
* sibling can pick up load irrespective of busy siblings. In this case, * sibling can pick up load irrespective of busy siblings. In this case,
...@@ -3303,8 +3712,9 @@ redo: ...@@ -3303,8 +3712,9 @@ redo:
if (!ld_moved && !sd_idle && sd->flags & SD_SHARE_CPUPOWER && if (!ld_moved && !sd_idle && sd->flags & SD_SHARE_CPUPOWER &&
!test_sd_parent(sd, SD_POWERSAVINGS_BALANCE)) !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
return -1; ld_moved = -1;
return ld_moved;
goto out;
out_balanced: out_balanced:
schedstat_inc(sd, lb_balanced[idle]); schedstat_inc(sd, lb_balanced[idle]);
...@@ -3319,8 +3729,13 @@ out_one_pinned: ...@@ -3319,8 +3729,13 @@ out_one_pinned:
if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER && if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER &&
!test_sd_parent(sd, SD_POWERSAVINGS_BALANCE)) !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
return -1; ld_moved = -1;
return 0; else
ld_moved = 0;
out:
if (unlock_aggregate)
put_aggregate(sd);
return ld_moved;
} }
/* /*
...@@ -4535,10 +4950,8 @@ void set_user_nice(struct task_struct *p, long nice) ...@@ -4535,10 +4950,8 @@ void set_user_nice(struct task_struct *p, long nice)
goto out_unlock; goto out_unlock;
} }
on_rq = p->se.on_rq; on_rq = p->se.on_rq;
if (on_rq) { if (on_rq)
dequeue_task(rq, p, 0); dequeue_task(rq, p, 0);
dec_load(rq, p);
}
p->static_prio = NICE_TO_PRIO(nice); p->static_prio = NICE_TO_PRIO(nice);
set_load_weight(p); set_load_weight(p);
...@@ -4548,7 +4961,6 @@ void set_user_nice(struct task_struct *p, long nice) ...@@ -4548,7 +4961,6 @@ void set_user_nice(struct task_struct *p, long nice)
if (on_rq) { if (on_rq) {
enqueue_task(rq, p, 0); enqueue_task(rq, p, 0);
inc_load(rq, p);
/* /*
* If the task increased its priority or is running and * If the task increased its priority or is running and
* lowered its priority, then reschedule its CPU: * lowered its priority, then reschedule its CPU:
...@@ -6921,6 +7333,7 @@ static int __build_sched_domains(const cpumask_t *cpu_map, ...@@ -6921,6 +7333,7 @@ static int __build_sched_domains(const cpumask_t *cpu_map,
SD_INIT(sd, ALLNODES); SD_INIT(sd, ALLNODES);
set_domain_attribute(sd, attr); set_domain_attribute(sd, attr);
sd->span = *cpu_map; sd->span = *cpu_map;
sd->first_cpu = first_cpu(sd->span);
cpu_to_allnodes_group(i, cpu_map, &sd->groups, tmpmask); cpu_to_allnodes_group(i, cpu_map, &sd->groups, tmpmask);
p = sd; p = sd;
sd_allnodes = 1; sd_allnodes = 1;
...@@ -6931,6 +7344,7 @@ static int __build_sched_domains(const cpumask_t *cpu_map, ...@@ -6931,6 +7344,7 @@ static int __build_sched_domains(const cpumask_t *cpu_map,
SD_INIT(sd, NODE); SD_INIT(sd, NODE);
set_domain_attribute(sd, attr); set_domain_attribute(sd, attr);
sched_domain_node_span(cpu_to_node(i), &sd->span); sched_domain_node_span(cpu_to_node(i), &sd->span);
sd->first_cpu = first_cpu(sd->span);
sd->parent = p; sd->parent = p;
if (p) if (p)
p->child = sd; p->child = sd;
...@@ -6942,6 +7356,7 @@ static int __build_sched_domains(const cpumask_t *cpu_map, ...@@ -6942,6 +7356,7 @@ static int __build_sched_domains(const cpumask_t *cpu_map,
SD_INIT(sd, CPU); SD_INIT(sd, CPU);
set_domain_attribute(sd, attr); set_domain_attribute(sd, attr);
sd->span = *nodemask; sd->span = *nodemask;
sd->first_cpu = first_cpu(sd->span);
sd->parent = p; sd->parent = p;
if (p) if (p)
p->child = sd; p->child = sd;
...@@ -6953,6 +7368,7 @@ static int __build_sched_domains(const cpumask_t *cpu_map, ...@@ -6953,6 +7368,7 @@ static int __build_sched_domains(const cpumask_t *cpu_map,
SD_INIT(sd, MC); SD_INIT(sd, MC);
set_domain_attribute(sd, attr); set_domain_attribute(sd, attr);
sd->span = cpu_coregroup_map(i); sd->span = cpu_coregroup_map(i);
sd->first_cpu = first_cpu(sd->span);
cpus_and(sd->span, sd->span, *cpu_map); cpus_and(sd->span, sd->span, *cpu_map);
sd->parent = p; sd->parent = p;
p->child = sd; p->child = sd;
...@@ -6965,6 +7381,7 @@ static int __build_sched_domains(const cpumask_t *cpu_map, ...@@ -6965,6 +7381,7 @@ static int __build_sched_domains(const cpumask_t *cpu_map,
SD_INIT(sd, SIBLING); SD_INIT(sd, SIBLING);
set_domain_attribute(sd, attr); set_domain_attribute(sd, attr);
sd->span = per_cpu(cpu_sibling_map, i); sd->span = per_cpu(cpu_sibling_map, i);
sd->first_cpu = first_cpu(sd->span);
cpus_and(sd->span, sd->span, *cpu_map); cpus_and(sd->span, sd->span, *cpu_map);
sd->parent = p; sd->parent = p;
p->child = sd; p->child = sd;
...@@ -7633,6 +8050,7 @@ void __init sched_init(void) ...@@ -7633,6 +8050,7 @@ void __init sched_init(void)
} }
#ifdef CONFIG_SMP #ifdef CONFIG_SMP
init_aggregate();
init_defrootdomain(); init_defrootdomain();
#endif #endif
...@@ -8199,14 +8617,11 @@ void sched_move_task(struct task_struct *tsk) ...@@ -8199,14 +8617,11 @@ void sched_move_task(struct task_struct *tsk)
#endif #endif
#ifdef CONFIG_FAIR_GROUP_SCHED #ifdef CONFIG_FAIR_GROUP_SCHED
static void set_se_shares(struct sched_entity *se, unsigned long shares) static void __set_se_shares(struct sched_entity *se, unsigned long shares)
{ {
struct cfs_rq *cfs_rq = se->cfs_rq; struct cfs_rq *cfs_rq = se->cfs_rq;
struct rq *rq = cfs_rq->rq;
int on_rq; int on_rq;
spin_lock_irq(&rq->lock);
on_rq = se->on_rq; on_rq = se->on_rq;
if (on_rq) if (on_rq)
dequeue_entity(cfs_rq, se, 0); dequeue_entity(cfs_rq, se, 0);
...@@ -8216,8 +8631,17 @@ static void set_se_shares(struct sched_entity *se, unsigned long shares) ...@@ -8216,8 +8631,17 @@ static void set_se_shares(struct sched_entity *se, unsigned long shares)
if (on_rq) if (on_rq)
enqueue_entity(cfs_rq, se, 0); enqueue_entity(cfs_rq, se, 0);
}
spin_unlock_irq(&rq->lock); static void set_se_shares(struct sched_entity *se, unsigned long shares)
{
struct cfs_rq *cfs_rq = se->cfs_rq;
struct rq *rq = cfs_rq->rq;
unsigned long flags;
spin_lock_irqsave(&rq->lock, flags);
__set_se_shares(se, shares);
spin_unlock_irqrestore(&rq->lock, flags);
} }
static DEFINE_MUTEX(shares_mutex); static DEFINE_MUTEX(shares_mutex);
...@@ -8238,8 +8662,8 @@ int sched_group_set_shares(struct task_group *tg, unsigned long shares) ...@@ -8238,8 +8662,8 @@ int sched_group_set_shares(struct task_group *tg, unsigned long shares)
* (The default weight is 1024 - so there's no practical * (The default weight is 1024 - so there's no practical
* limitation from this.) * limitation from this.)
*/ */
if (shares < 2) if (shares < MIN_SHARES)
shares = 2; shares = MIN_SHARES;
mutex_lock(&shares_mutex); mutex_lock(&shares_mutex);
if (tg->shares == shares) if (tg->shares == shares)
...@@ -8259,8 +8683,13 @@ int sched_group_set_shares(struct task_group *tg, unsigned long shares) ...@@ -8259,8 +8683,13 @@ int sched_group_set_shares(struct task_group *tg, unsigned long shares)
* w/o tripping rebalance_share or load_balance_fair. * w/o tripping rebalance_share or load_balance_fair.
*/ */
tg->shares = shares; tg->shares = shares;
for_each_possible_cpu(i) for_each_possible_cpu(i) {
set_se_shares(tg->se[i], shares); /*
* force a rebalance
*/
cfs_rq_set_shares(tg->cfs_rq[i], 0);
set_se_shares(tg->se[i], shares/nr_cpu_ids);
}
/* /*
* Enable load balance activity on this group, by inserting it back on * Enable load balance activity on this group, by inserting it back on
......
...@@ -492,10 +492,27 @@ update_stats_curr_start(struct cfs_rq *cfs_rq, struct sched_entity *se) ...@@ -492,10 +492,27 @@ update_stats_curr_start(struct cfs_rq *cfs_rq, struct sched_entity *se)
* Scheduling class queueing methods: * Scheduling class queueing methods:
*/ */
#if defined CONFIG_SMP && defined CONFIG_FAIR_GROUP_SCHED
/*
 * Add @weight to the task weight of @cfs_rq. The dequeue path passes a
 * negated weight, so a subtraction is expressed as an unsigned addition.
 */
static void
add_cfs_task_weight(struct cfs_rq *cfs_rq, unsigned long weight)
{
	cfs_rq->task_weight += weight;
}
#else
/*
 * Task weight is only tracked with both CONFIG_SMP and
 * CONFIG_FAIR_GROUP_SCHED; otherwise this is a no-op.
 */
static inline void
add_cfs_task_weight(struct cfs_rq *cfs_rq, unsigned long weight)
{
}
#endif
static void static void
account_entity_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se) account_entity_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se)
{ {
update_load_add(&cfs_rq->load, se->load.weight); update_load_add(&cfs_rq->load, se->load.weight);
if (!parent_entity(se))
inc_cpu_load(rq_of(cfs_rq), se->load.weight);
if (entity_is_task(se))
add_cfs_task_weight(cfs_rq, se->load.weight);
cfs_rq->nr_running++; cfs_rq->nr_running++;
se->on_rq = 1; se->on_rq = 1;
} }
...@@ -504,6 +521,10 @@ static void ...@@ -504,6 +521,10 @@ static void
account_entity_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se) account_entity_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se)
{ {
update_load_sub(&cfs_rq->load, se->load.weight); update_load_sub(&cfs_rq->load, se->load.weight);
if (!parent_entity(se))
dec_cpu_load(rq_of(cfs_rq), se->load.weight);
if (entity_is_task(se))
add_cfs_task_weight(cfs_rq, -se->load.weight);
cfs_rq->nr_running--; cfs_rq->nr_running--;
se->on_rq = 0; se->on_rq = 0;
} }
...@@ -1286,75 +1307,90 @@ static struct task_struct *load_balance_next_fair(void *arg) ...@@ -1286,75 +1307,90 @@ static struct task_struct *load_balance_next_fair(void *arg)
return __load_balance_iterator(cfs_rq, cfs_rq->rb_load_balance_curr); return __load_balance_iterator(cfs_rq, cfs_rq->rb_load_balance_curr);
} }
#ifdef CONFIG_FAIR_GROUP_SCHED static unsigned long
static int cfs_rq_best_prio(struct cfs_rq *cfs_rq) __load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest,
unsigned long max_load_move, struct sched_domain *sd,
enum cpu_idle_type idle, int *all_pinned, int *this_best_prio,
struct cfs_rq *cfs_rq)
{ {
struct sched_entity *curr; struct rq_iterator cfs_rq_iterator;
struct task_struct *p;
if (!cfs_rq->nr_running || !first_fair(cfs_rq))
return MAX_PRIO;
curr = cfs_rq->curr;
if (!curr)
curr = __pick_next_entity(cfs_rq);
p = task_of(curr); cfs_rq_iterator.start = load_balance_start_fair;
cfs_rq_iterator.next = load_balance_next_fair;
cfs_rq_iterator.arg = cfs_rq;
return p->prio; return balance_tasks(this_rq, this_cpu, busiest,
max_load_move, sd, idle, all_pinned,
this_best_prio, &cfs_rq_iterator);
} }
#endif
#ifdef CONFIG_FAIR_GROUP_SCHED
static unsigned long static unsigned long
load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest, load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest,
unsigned long max_load_move, unsigned long max_load_move,
struct sched_domain *sd, enum cpu_idle_type idle, struct sched_domain *sd, enum cpu_idle_type idle,
int *all_pinned, int *this_best_prio) int *all_pinned, int *this_best_prio)
{ {
struct cfs_rq *busy_cfs_rq;
long rem_load_move = max_load_move; long rem_load_move = max_load_move;
struct rq_iterator cfs_rq_iterator; int busiest_cpu = cpu_of(busiest);
struct task_group *tg;
cfs_rq_iterator.start = load_balance_start_fair; rcu_read_lock();
cfs_rq_iterator.next = load_balance_next_fair; list_for_each_entry(tg, &task_groups, list) {
for_each_leaf_cfs_rq(busiest, busy_cfs_rq) {
#ifdef CONFIG_FAIR_GROUP_SCHED
struct cfs_rq *this_cfs_rq;
long imbalance; long imbalance;
unsigned long maxload; unsigned long this_weight, busiest_weight;
long rem_load, max_load, moved_load;
/*
* empty group
*/
if (!aggregate(tg, sd)->task_weight)
continue;
rem_load = rem_load_move * aggregate(tg, sd)->rq_weight;
rem_load /= aggregate(tg, sd)->load + 1;
this_cfs_rq = cpu_cfs_rq(busy_cfs_rq, this_cpu); this_weight = tg->cfs_rq[this_cpu]->task_weight;
busiest_weight = tg->cfs_rq[busiest_cpu]->task_weight;
imbalance = busy_cfs_rq->load.weight - this_cfs_rq->load.weight; imbalance = (busiest_weight - this_weight) / 2;
/* Don't pull if this_cfs_rq has more load than busy_cfs_rq */
if (imbalance <= 0) if (imbalance < 0)
imbalance = busiest_weight;
max_load = max(rem_load, imbalance);
moved_load = __load_balance_fair(this_rq, this_cpu, busiest,
max_load, sd, idle, all_pinned, this_best_prio,
tg->cfs_rq[busiest_cpu]);
if (!moved_load)
continue; continue;
/* Don't pull more than imbalance/2 */ move_group_shares(tg, sd, busiest_cpu, this_cpu);
imbalance /= 2;
maxload = min(rem_load_move, imbalance);
*this_best_prio = cfs_rq_best_prio(this_cfs_rq); moved_load *= aggregate(tg, sd)->load;
#else moved_load /= aggregate(tg, sd)->rq_weight + 1;
# define maxload rem_load_move
#endif
/*
* pass busy_cfs_rq argument into
* load_balance_[start|next]_fair iterators
*/
cfs_rq_iterator.arg = busy_cfs_rq;
rem_load_move -= balance_tasks(this_rq, this_cpu, busiest,
maxload, sd, idle, all_pinned,
this_best_prio,
&cfs_rq_iterator);
if (rem_load_move <= 0) rem_load_move -= moved_load;
if (rem_load_move < 0)
break; break;
} }
rcu_read_unlock();
return max_load_move - rem_load_move; return max_load_move - rem_load_move;
} }
#else
/*
 * Variant of load_balance_fair() for the #else branch of the group
 * scheduling conditional: balance the cfs runqueue embedded in @busiest
 * directly and return what __load_balance_fair() returns.
 */
static unsigned long
load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest,
		  unsigned long max_load_move,
		  struct sched_domain *sd, enum cpu_idle_type idle,
		  int *all_pinned, int *this_best_prio)
{
	return __load_balance_fair(this_rq, this_cpu, busiest, max_load_move,
				   sd, idle, all_pinned, this_best_prio,
				   &busiest->cfs);
}
#endif
static int static int
move_one_task_fair(struct rq *this_rq, int this_cpu, struct rq *busiest, move_one_task_fair(struct rq *this_rq, int this_cpu, struct rq *busiest,
......
...@@ -518,6 +518,8 @@ static void enqueue_task_rt(struct rq *rq, struct task_struct *p, int wakeup) ...@@ -518,6 +518,8 @@ static void enqueue_task_rt(struct rq *rq, struct task_struct *p, int wakeup)
*/ */
for_each_sched_rt_entity(rt_se) for_each_sched_rt_entity(rt_se)
enqueue_rt_entity(rt_se); enqueue_rt_entity(rt_se);
inc_cpu_load(rq, p->se.load.weight);
} }
static void dequeue_task_rt(struct rq *rq, struct task_struct *p, int sleep) static void dequeue_task_rt(struct rq *rq, struct task_struct *p, int sleep)
...@@ -537,6 +539,8 @@ static void dequeue_task_rt(struct rq *rq, struct task_struct *p, int sleep) ...@@ -537,6 +539,8 @@ static void dequeue_task_rt(struct rq *rq, struct task_struct *p, int sleep)
if (rt_rq && rt_rq->rt_nr_running) if (rt_rq && rt_rq->rt_nr_running)
enqueue_rt_entity(rt_se); enqueue_rt_entity(rt_se);
} }
dec_cpu_load(rq, p->se.load.weight);
} }
/* /*
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment