Commit ec4e0e2f authored by Ken Chen, committed by Ingo Molnar

sched: fix inconsistency when redistribute per-cpu tg->cfs_rq shares

Impact: make load-balancing more consistent

In the update_shares() path leading to tg_shares_up(), the calculation of
per-cpu cfs_rq shares is rather erratic even under moderate task wake up
rate.  The problem is that the per-cpu tg->cfs_rq load weight is sampled at
different times for the sd_rq_weight aggregation and for the actual
redistribution of cfs_rq->shares.  Under moderate system load, we've seen
quite a bit of variation in cfs_rq->shares, which in turn wildly affects
each sched_entity's load weight.
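
As a purely illustrative example (numbers made up, not taken from the patch):
with tg->shares = 1024 and two CPUs whose cfs_rq weights both read 1024 when
sd_rq_weight is summed (sd_rq_weight = 2048), each CPU should end up with
roughly 512 shares.  If one CPU's load.weight has grown to 2048 by the time
its shares are recomputed, that CPU gets roughly 1024 shares while the other
still gets roughly 512, so the per-cpu shares no longer add up to tg->shares.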

This patch caches each CPU's load weight while doing the sum calculation,
and then passes the cached values down to update_group_shares_cpu() for
redistributing the per-cpu cfs_rq shares.  This keeps the total cfs_rq
shares consistent across all CPUs.  It also simplifies the rounding and
the zero load weight check.
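
Below is a minimal userspace sketch of the two-pass scheme this patch moves
to, for illustration only: struct cfs_rq_sketch, clamp_ul and the numbers are
made up, and locking, the shares-change threshold and sched_domain spans are
ignored.  Pass one caches every CPU's weight while summing it; pass two
redistributes the group's shares against that same cached sum, so each CPU's
result comes from a single consistent snapshot.

#include <stdio.h>

#define NR_CPUS      4
#define NICE_0_LOAD  1024UL
#define MIN_SHARES   2UL
#define MAX_SHARES   (1UL << 18)

struct cfs_rq_sketch {
        unsigned long load_weight;      /* live per-CPU load, may change at any time */
        unsigned long rq_weight;        /* cached copy used for the whole update     */
        unsigned long shares;           /* result of the redistribution              */
};

static unsigned long clamp_ul(unsigned long v, unsigned long lo, unsigned long hi)
{
        return v < lo ? lo : (v > hi ? hi : v);
}

int main(void)
{
        struct cfs_rq_sketch cfs_rq[NR_CPUS] = {
                { .load_weight = 2048 }, { .load_weight = 1024 },
                { .load_weight = 0 },    { .load_weight = 512 },
        };
        unsigned long tg_shares = 1024; /* the group's total shares   */
        unsigned long rq_weight = 0;    /* \Sum of the cached weights */
        int i;

        /* Pass 1: cache each CPU's weight and accumulate the sum. */
        for (i = 0; i < NR_CPUS; i++) {
                unsigned long weight = cfs_rq[i].load_weight;

                /* An idle CPU pretends to carry one task of average load. */
                if (!weight)
                        weight = NICE_0_LOAD;

                cfs_rq[i].rq_weight = weight;
                rq_weight += weight;
        }

        /* Pass 2: redistribute shares against the same cached weights. */
        for (i = 0; i < NR_CPUS; i++) {
                unsigned long shares = tg_shares * cfs_rq[i].rq_weight / rq_weight;

                cfs_rq[i].shares = clamp_ul(shares, MIN_SHARES, MAX_SHARES);
                printf("cpu%d: rq_weight=%lu shares=%lu\n",
                       i, cfs_rq[i].rq_weight, cfs_rq[i].shares);
        }
        return 0;
}
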
Signed-off-by: Ken Chen <kenchen@google.com>
Acked-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
parent 3ac3ba0b
@@ -1453,27 +1453,13 @@ static void
 update_group_shares_cpu(struct task_group *tg, int cpu,
                        unsigned long sd_shares, unsigned long sd_rq_weight)
 {
-        int boost = 0;
         unsigned long shares;
         unsigned long rq_weight;
 
         if (!tg->se[cpu])
                 return;
 
-        rq_weight = tg->cfs_rq[cpu]->load.weight;
-
-        /*
-         * If there are currently no tasks on the cpu pretend there is one of
-         * average load so that when a new task gets to run here it will not
-         * get delayed by group starvation.
-         */
-        if (!rq_weight) {
-                boost = 1;
-                rq_weight = NICE_0_LOAD;
-        }
-
-        if (unlikely(rq_weight > sd_rq_weight))
-                rq_weight = sd_rq_weight;
+        rq_weight = tg->cfs_rq[cpu]->rq_weight;
 
         /*
          *           \Sum shares * rq_weight
@@ -1481,7 +1467,7 @@ update_group_shares_cpu(struct task_group *tg, int cpu,
          *               \Sum rq_weight
          *
          */
-        shares = (sd_shares * rq_weight) / (sd_rq_weight + 1);
+        shares = (sd_shares * rq_weight) / sd_rq_weight;
         shares = clamp_t(unsigned long, shares, MIN_SHARES, MAX_SHARES);
 
         if (abs(shares - tg->se[cpu]->load.weight) >
@@ -1490,11 +1476,7 @@ update_group_shares_cpu(struct task_group *tg, int cpu,
                 unsigned long flags;
 
                 spin_lock_irqsave(&rq->lock, flags);
-                /*
-                 * record the actual number of shares, not the boosted amount.
-                 */
-                tg->cfs_rq[cpu]->shares = boost ? 0 : shares;
-                tg->cfs_rq[cpu]->rq_weight = rq_weight;
+                tg->cfs_rq[cpu]->shares = shares;
 
                 __set_se_shares(tg->se[cpu], shares);
                 spin_unlock_irqrestore(&rq->lock, flags);
@@ -1508,13 +1490,23 @@ update_group_shares_cpu(struct task_group *tg, int cpu,
  */
 static int tg_shares_up(struct task_group *tg, void *data)
 {
-        unsigned long rq_weight = 0;
+        unsigned long weight, rq_weight = 0;
         unsigned long shares = 0;
         struct sched_domain *sd = data;
         int i;
 
         for_each_cpu_mask(i, sd->span) {
-                rq_weight += tg->cfs_rq[i]->load.weight;
+                /*
+                 * If there are currently no tasks on the cpu pretend there
+                 * is one of average load so that when a new task gets to
+                 * run here it will not get delayed by group starvation.
+                 */
+                weight = tg->cfs_rq[i]->load.weight;
+                if (!weight)
+                        weight = NICE_0_LOAD;
+
+                tg->cfs_rq[i]->rq_weight = weight;
+                rq_weight += weight;
                 shares += tg->cfs_rq[i]->shares;
         }
 
@@ -1524,9 +1516,6 @@ static int tg_shares_up(struct task_group *tg, void *data)
 
         if (!sd->parent || !(sd->parent->flags & SD_LOAD_BALANCE))
                 shares = tg->shares;
 
-        if (!rq_weight)
-                rq_weight = cpus_weight(sd->span) * NICE_0_LOAD;
-
         for_each_cpu_mask(i, sd->span)
                 update_group_shares_cpu(tg, i, shares, rq_weight);