Commit 7897986b, authored by Nick Piggin, committed by Linus Torvalds

[PATCH] sched: balance timers

Do CPU load averaging over a number of different intervals.  Allow each
interval to be chosen by passing a parameter to source_load and target_load.
0 is instantaneous; idx > 0 returns a decaying average with the most recent
sample weighted at 1/2^(idx-1), up to a maximum idx of 3 (could easily be increased).

So generally a higher number will result in more conservative balancing.
Signed-off-by: Nick Piggin <nickpiggin@yahoo.com.au>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
parent 99b61ccf
...@@ -74,6 +74,10 @@ static inline int node_to_first_cpu(int node) ...@@ -74,6 +74,10 @@ static inline int node_to_first_cpu(int node)
.imbalance_pct = 125, \ .imbalance_pct = 125, \
.cache_hot_time = (10*1000000), \ .cache_hot_time = (10*1000000), \
.cache_nice_tries = 1, \ .cache_nice_tries = 1, \
.busy_idx = 3, \
.idle_idx = 1, \
.newidle_idx = 2, \
.wake_idx = 1, \
.per_cpu_gain = 100, \ .per_cpu_gain = 100, \
.flags = SD_LOAD_BALANCE \ .flags = SD_LOAD_BALANCE \
| SD_BALANCE_EXEC \ | SD_BALANCE_EXEC \
......
...@@ -39,7 +39,11 @@ extern int __node_distance(int, int); ...@@ -39,7 +39,11 @@ extern int __node_distance(int, int);
.busy_factor = 32, \ .busy_factor = 32, \
.imbalance_pct = 125, \ .imbalance_pct = 125, \
.cache_hot_time = (10*1000000), \ .cache_hot_time = (10*1000000), \
.cache_nice_tries = 1, \ .cache_nice_tries = 2, \
.busy_idx = 3, \
.idle_idx = 2, \
.newidle_idx = 1, \
.wake_idx = 1, \
.per_cpu_gain = 100, \ .per_cpu_gain = 100, \
.flags = SD_LOAD_BALANCE \ .flags = SD_LOAD_BALANCE \
| SD_BALANCE_NEWIDLE \ | SD_BALANCE_NEWIDLE \
......
...@@ -488,6 +488,10 @@ struct sched_domain { ...@@ -488,6 +488,10 @@ struct sched_domain {
unsigned long long cache_hot_time; /* Task considered cache hot (ns) */ unsigned long long cache_hot_time; /* Task considered cache hot (ns) */
unsigned int cache_nice_tries; /* Leave cache hot tasks for # tries */ unsigned int cache_nice_tries; /* Leave cache hot tasks for # tries */
unsigned int per_cpu_gain; /* CPU % gained by adding domain cpus */ unsigned int per_cpu_gain; /* CPU % gained by adding domain cpus */
unsigned int busy_idx;
unsigned int idle_idx;
unsigned int newidle_idx;
unsigned int wake_idx;
int flags; /* See SD_* */ int flags; /* See SD_* */
/* Runtime fields. */ /* Runtime fields. */
......
...@@ -89,6 +89,10 @@ ...@@ -89,6 +89,10 @@
.cache_hot_time = 0, \ .cache_hot_time = 0, \
.cache_nice_tries = 0, \ .cache_nice_tries = 0, \
.per_cpu_gain = 25, \ .per_cpu_gain = 25, \
.busy_idx = 0, \
.idle_idx = 0, \
.newidle_idx = 0, \
.wake_idx = 0, \
.flags = SD_LOAD_BALANCE \ .flags = SD_LOAD_BALANCE \
| SD_BALANCE_NEWIDLE \ | SD_BALANCE_NEWIDLE \
| SD_BALANCE_EXEC \ | SD_BALANCE_EXEC \
...@@ -115,6 +119,10 @@ ...@@ -115,6 +119,10 @@
.cache_hot_time = (5*1000000/2), \ .cache_hot_time = (5*1000000/2), \
.cache_nice_tries = 1, \ .cache_nice_tries = 1, \
.per_cpu_gain = 100, \ .per_cpu_gain = 100, \
.busy_idx = 2, \
.idle_idx = 0, \
.newidle_idx = 1, \
.wake_idx = 1, \
.flags = SD_LOAD_BALANCE \ .flags = SD_LOAD_BALANCE \
| SD_BALANCE_NEWIDLE \ | SD_BALANCE_NEWIDLE \
| SD_BALANCE_EXEC \ | SD_BALANCE_EXEC \
......
...@@ -206,7 +206,7 @@ struct runqueue { ...@@ -206,7 +206,7 @@ struct runqueue {
*/ */
unsigned long nr_running; unsigned long nr_running;
#ifdef CONFIG_SMP #ifdef CONFIG_SMP
unsigned long cpu_load; unsigned long cpu_load[3];
#endif #endif
unsigned long long nr_switches; unsigned long long nr_switches;
...@@ -886,23 +886,27 @@ void kick_process(task_t *p) ...@@ -886,23 +886,27 @@ void kick_process(task_t *p)
* We want to under-estimate the load of migration sources, to * We want to under-estimate the load of migration sources, to
* balance conservatively. * balance conservatively.
*/ */
static inline unsigned long source_load(int cpu) static inline unsigned long source_load(int cpu, int type)
{ {
runqueue_t *rq = cpu_rq(cpu); runqueue_t *rq = cpu_rq(cpu);
unsigned long load_now = rq->nr_running * SCHED_LOAD_SCALE; unsigned long load_now = rq->nr_running * SCHED_LOAD_SCALE;
if (type == 0)
return load_now;
return min(rq->cpu_load, load_now); return min(rq->cpu_load[type-1], load_now);
} }
/* /*
* Return a high guess at the load of a migration-target cpu * Return a high guess at the load of a migration-target cpu
*/ */
static inline unsigned long target_load(int cpu) static inline unsigned long target_load(int cpu, int type)
{ {
runqueue_t *rq = cpu_rq(cpu); runqueue_t *rq = cpu_rq(cpu);
unsigned long load_now = rq->nr_running * SCHED_LOAD_SCALE; unsigned long load_now = rq->nr_running * SCHED_LOAD_SCALE;
if (type == 0)
return load_now;
return max(rq->cpu_load, load_now); return max(rq->cpu_load[type-1], load_now);
} }
#endif #endif
...@@ -967,7 +971,7 @@ static int try_to_wake_up(task_t * p, unsigned int state, int sync) ...@@ -967,7 +971,7 @@ static int try_to_wake_up(task_t * p, unsigned int state, int sync)
runqueue_t *rq; runqueue_t *rq;
#ifdef CONFIG_SMP #ifdef CONFIG_SMP
unsigned long load, this_load; unsigned long load, this_load;
struct sched_domain *sd; struct sched_domain *sd, *this_sd = NULL;
int new_cpu; int new_cpu;
#endif #endif
...@@ -986,72 +990,64 @@ static int try_to_wake_up(task_t * p, unsigned int state, int sync) ...@@ -986,72 +990,64 @@ static int try_to_wake_up(task_t * p, unsigned int state, int sync)
if (unlikely(task_running(rq, p))) if (unlikely(task_running(rq, p)))
goto out_activate; goto out_activate;
#ifdef CONFIG_SCHEDSTATS new_cpu = cpu;
schedstat_inc(rq, ttwu_cnt); schedstat_inc(rq, ttwu_cnt);
if (cpu == this_cpu) { if (cpu == this_cpu) {
schedstat_inc(rq, ttwu_local); schedstat_inc(rq, ttwu_local);
} else { goto out_set_cpu;
for_each_domain(this_cpu, sd) { }
if (cpu_isset(cpu, sd->span)) {
schedstat_inc(sd, ttwu_wake_remote); for_each_domain(this_cpu, sd) {
break; if (cpu_isset(cpu, sd->span)) {
} schedstat_inc(sd, ttwu_wake_remote);
this_sd = sd;
break;
} }
} }
#endif
new_cpu = cpu; if (unlikely(!cpu_isset(this_cpu, p->cpus_allowed)))
if (cpu == this_cpu || unlikely(!cpu_isset(this_cpu, p->cpus_allowed)))
goto out_set_cpu; goto out_set_cpu;
load = source_load(cpu);
this_load = target_load(this_cpu);
/* /*
* If sync wakeup then subtract the (maximum possible) effect of * Check for affine wakeup and passive balancing possibilities.
* the currently running task from the load of the current CPU:
*/ */
if (sync) if (this_sd) {
this_load -= SCHED_LOAD_SCALE; int idx = this_sd->wake_idx;
unsigned int imbalance;
/* Don't pull the task off an idle CPU to a busy one */
if (load < SCHED_LOAD_SCALE/2 && this_load > SCHED_LOAD_SCALE/2)
goto out_set_cpu;
new_cpu = this_cpu; /* Wake to this CPU if we can */ load = source_load(cpu, idx);
this_load = target_load(this_cpu, idx);
/*
* Scan domains for affine wakeup and passive balancing
* possibilities.
*/
for_each_domain(this_cpu, sd) {
unsigned int imbalance;
/* /*
* Start passive balancing when half the imbalance_pct * If sync wakeup then subtract the (maximum possible) effect of
* limit is reached. * the currently running task from the load of the current CPU:
*/ */
imbalance = sd->imbalance_pct + (sd->imbalance_pct - 100) / 2; if (sync)
this_load -= SCHED_LOAD_SCALE;
/* Don't pull the task off an idle CPU to a busy one */
if (load < SCHED_LOAD_SCALE/2 && this_load > SCHED_LOAD_SCALE/2)
goto out_set_cpu;
if ((sd->flags & SD_WAKE_AFFINE) && new_cpu = this_cpu; /* Wake to this CPU if we can */
!task_hot(p, rq->timestamp_last_tick, sd)) {
if ((this_sd->flags & SD_WAKE_AFFINE) &&
!task_hot(p, rq->timestamp_last_tick, this_sd)) {
/* /*
* This domain has SD_WAKE_AFFINE and p is cache cold * This domain has SD_WAKE_AFFINE and p is cache cold
* in this domain. * in this domain.
*/ */
if (cpu_isset(cpu, sd->span)) { schedstat_inc(this_sd, ttwu_move_affine);
schedstat_inc(sd, ttwu_move_affine); goto out_set_cpu;
goto out_set_cpu; } else if ((this_sd->flags & SD_WAKE_BALANCE) &&
}
} else if ((sd->flags & SD_WAKE_BALANCE) &&
imbalance*this_load <= 100*load) { imbalance*this_load <= 100*load) {
/* /*
* This domain has SD_WAKE_BALANCE and there is * This domain has SD_WAKE_BALANCE and there is
* an imbalance. * an imbalance.
*/ */
if (cpu_isset(cpu, sd->span)) { schedstat_inc(this_sd, ttwu_move_balance);
schedstat_inc(sd, ttwu_move_balance); goto out_set_cpu;
goto out_set_cpu;
}
} }
} }
...@@ -1509,7 +1505,7 @@ static int find_idlest_cpu(struct task_struct *p, int this_cpu, ...@@ -1509,7 +1505,7 @@ static int find_idlest_cpu(struct task_struct *p, int this_cpu,
cpus_and(mask, sd->span, p->cpus_allowed); cpus_and(mask, sd->span, p->cpus_allowed);
for_each_cpu_mask(i, mask) { for_each_cpu_mask(i, mask) {
load = target_load(i); load = target_load(i, sd->wake_idx);
if (load < min_load) { if (load < min_load) {
min_cpu = i; min_cpu = i;
...@@ -1522,7 +1518,7 @@ static int find_idlest_cpu(struct task_struct *p, int this_cpu, ...@@ -1522,7 +1518,7 @@ static int find_idlest_cpu(struct task_struct *p, int this_cpu,
} }
/* add +1 to account for the new task */ /* add +1 to account for the new task */
this_load = source_load(this_cpu) + SCHED_LOAD_SCALE; this_load = source_load(this_cpu, sd->wake_idx) + SCHED_LOAD_SCALE;
/* /*
* Would with the addition of the new task to the * Would with the addition of the new task to the
...@@ -1767,8 +1763,15 @@ find_busiest_group(struct sched_domain *sd, int this_cpu, ...@@ -1767,8 +1763,15 @@ find_busiest_group(struct sched_domain *sd, int this_cpu,
{ {
struct sched_group *busiest = NULL, *this = NULL, *group = sd->groups; struct sched_group *busiest = NULL, *this = NULL, *group = sd->groups;
unsigned long max_load, avg_load, total_load, this_load, total_pwr; unsigned long max_load, avg_load, total_load, this_load, total_pwr;
int load_idx;
max_load = this_load = total_load = total_pwr = 0; max_load = this_load = total_load = total_pwr = 0;
if (idle == NOT_IDLE)
load_idx = sd->busy_idx;
else if (idle == NEWLY_IDLE)
load_idx = sd->newidle_idx;
else
load_idx = sd->idle_idx;
do { do {
unsigned long load; unsigned long load;
...@@ -1783,9 +1786,9 @@ find_busiest_group(struct sched_domain *sd, int this_cpu, ...@@ -1783,9 +1786,9 @@ find_busiest_group(struct sched_domain *sd, int this_cpu,
for_each_cpu_mask(i, group->cpumask) { for_each_cpu_mask(i, group->cpumask) {
/* Bias balancing toward cpus of our domain */ /* Bias balancing toward cpus of our domain */
if (local_group) if (local_group)
load = target_load(i); load = target_load(i, load_idx);
else else
load = source_load(i); load = source_load(i, load_idx);
avg_load += load; avg_load += load;
} }
...@@ -1895,7 +1898,7 @@ static runqueue_t *find_busiest_queue(struct sched_group *group) ...@@ -1895,7 +1898,7 @@ static runqueue_t *find_busiest_queue(struct sched_group *group)
int i; int i;
for_each_cpu_mask(i, group->cpumask) { for_each_cpu_mask(i, group->cpumask) {
load = source_load(i); load = source_load(i, 0);
if (load > max_load) { if (load > max_load) {
max_load = load; max_load = load;
...@@ -2150,18 +2153,23 @@ static void rebalance_tick(int this_cpu, runqueue_t *this_rq, ...@@ -2150,18 +2153,23 @@ static void rebalance_tick(int this_cpu, runqueue_t *this_rq,
unsigned long old_load, this_load; unsigned long old_load, this_load;
unsigned long j = jiffies + CPU_OFFSET(this_cpu); unsigned long j = jiffies + CPU_OFFSET(this_cpu);
struct sched_domain *sd; struct sched_domain *sd;
int i;
/* Update our load */
old_load = this_rq->cpu_load;
this_load = this_rq->nr_running * SCHED_LOAD_SCALE; this_load = this_rq->nr_running * SCHED_LOAD_SCALE;
/* /* Update our load */
* Round up the averaging division if load is increasing. This for (i = 0; i < 3; i++) {
* prevents us from getting stuck on 9 if the load is 10, for unsigned long new_load = this_load;
* example. int scale = 1 << i;
*/ old_load = this_rq->cpu_load[i];
if (this_load > old_load) /*
old_load++; * Round up the averaging division if load is increasing. This
this_rq->cpu_load = (old_load + this_load) / 2; * prevents us from getting stuck on 9 if the load is 10, for
* example.
*/
if (new_load > old_load)
new_load += scale-1;
this_rq->cpu_load[i] = (old_load*(scale-1) + new_load) / scale;
}
for_each_domain(this_cpu, sd) { for_each_domain(this_cpu, sd) {
unsigned long interval; unsigned long interval;
...@@ -4921,13 +4929,15 @@ void __init sched_init(void) ...@@ -4921,13 +4929,15 @@ void __init sched_init(void)
rq = cpu_rq(i); rq = cpu_rq(i);
spin_lock_init(&rq->lock); spin_lock_init(&rq->lock);
rq->nr_running = 0;
rq->active = rq->arrays; rq->active = rq->arrays;
rq->expired = rq->arrays + 1; rq->expired = rq->arrays + 1;
rq->best_expired_prio = MAX_PRIO; rq->best_expired_prio = MAX_PRIO;
#ifdef CONFIG_SMP #ifdef CONFIG_SMP
rq->sd = &sched_domain_dummy; rq->sd = &sched_domain_dummy;
rq->cpu_load = 0; for (j = 1; j < 3; j++)
rq->cpu_load[j] = 0;
rq->active_balance = 0; rq->active_balance = 0;
rq->push_cpu = 0; rq->push_cpu = 0;
rq->migration_thread = NULL; rq->migration_thread = NULL;
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment