Commit c96d145e authored by Chen, Kenneth W's avatar Chen, Kenneth W Committed by Linus Torvalds

[PATCH] sched: fix smt nice lock contention and optimization

Initial report and lock contention fix from Chris Mason:

Recent benchmarks showed some performance regressions between 2.6.16 and
2.6.5.  We tracked down one of the regressions to lock contention in
schedule heavy workloads (~70,000 context switches per second)

kernel/sched.c:dependent_sleeper() was responsible for most of the lock
contention, hammering on the run queue locks.  The patch below is more of a
discussion point than a suggested fix (although it does reduce lock
contention significantly).  The dependent_sleeper code looks very expensive
to me, especially for using a spinlock to bounce control between two
different siblings in the same cpu.

It is further optimized:

* perform dependent_sleeper check after next task is determined
* convert wake_sleeping_dependent to use trylock
* skip smt runqueue check if trylock fails
* optimize double_rq_lock now that smt nice is converted to trylock
* early exit in searching first SD_SHARE_CPUPOWER domain
* speedup fast path of dependent_sleeper

[akpm@osdl.org: cleanup]
Signed-off-by: default avatarKen Chen <kenneth.w.chen@intel.com>
Acked-by: default avatarIngo Molnar <mingo@elte.hu>
Acked-by: default avatarCon Kolivas <kernel@kolivas.org>
Signed-off-by: default avatarNick Piggin <npiggin@suse.de>
Acked-by: default avatarChris Mason <mason@suse.com>
Signed-off-by: default avatarAndrew Morton <akpm@osdl.org>
Signed-off-by: default avatarLinus Torvalds <torvalds@osdl.org>
parent 7a8e2a5e
...@@ -239,7 +239,6 @@ struct runqueue { ...@@ -239,7 +239,6 @@ struct runqueue {
task_t *migration_thread; task_t *migration_thread;
struct list_head migration_queue; struct list_head migration_queue;
int cpu;
#endif #endif
#ifdef CONFIG_SCHEDSTATS #ifdef CONFIG_SCHEDSTATS
...@@ -1074,9 +1073,10 @@ static int sched_balance_self(int cpu, int flag) ...@@ -1074,9 +1073,10 @@ static int sched_balance_self(int cpu, int flag)
struct task_struct *t = current; struct task_struct *t = current;
struct sched_domain *tmp, *sd = NULL; struct sched_domain *tmp, *sd = NULL;
for_each_domain(cpu, tmp) for_each_domain(cpu, tmp) {
if (tmp->flags & flag) if (tmp->flags & flag)
sd = tmp; sd = tmp;
}
while (sd) { while (sd) {
cpumask_t span; cpumask_t span;
...@@ -1691,9 +1691,6 @@ unsigned long nr_active(void) ...@@ -1691,9 +1691,6 @@ unsigned long nr_active(void)
/* /*
* double_rq_lock - safely lock two runqueues * double_rq_lock - safely lock two runqueues
* *
* We must take them in cpu order to match code in
* dependent_sleeper and wake_dependent_sleeper.
*
* Note this does not disable interrupts like task_rq_lock, * Note this does not disable interrupts like task_rq_lock,
* you need to do so manually before calling. * you need to do so manually before calling.
*/ */
...@@ -1705,7 +1702,7 @@ static void double_rq_lock(runqueue_t *rq1, runqueue_t *rq2) ...@@ -1705,7 +1702,7 @@ static void double_rq_lock(runqueue_t *rq1, runqueue_t *rq2)
spin_lock(&rq1->lock); spin_lock(&rq1->lock);
__acquire(rq2->lock); /* Fake it out ;) */ __acquire(rq2->lock); /* Fake it out ;) */
} else { } else {
if (rq1->cpu < rq2->cpu) { if (rq1 < rq2) {
spin_lock(&rq1->lock); spin_lock(&rq1->lock);
spin_lock(&rq2->lock); spin_lock(&rq2->lock);
} else { } else {
...@@ -1741,7 +1738,7 @@ static void double_lock_balance(runqueue_t *this_rq, runqueue_t *busiest) ...@@ -1741,7 +1738,7 @@ static void double_lock_balance(runqueue_t *this_rq, runqueue_t *busiest)
__acquires(this_rq->lock) __acquires(this_rq->lock)
{ {
if (unlikely(!spin_trylock(&busiest->lock))) { if (unlikely(!spin_trylock(&busiest->lock))) {
if (busiest->cpu < this_rq->cpu) { if (busiest < this_rq) {
spin_unlock(&this_rq->lock); spin_unlock(&this_rq->lock);
spin_lock(&busiest->lock); spin_lock(&busiest->lock);
spin_lock(&this_rq->lock); spin_lock(&this_rq->lock);
...@@ -2352,10 +2349,11 @@ static void active_load_balance(runqueue_t *busiest_rq, int busiest_cpu) ...@@ -2352,10 +2349,11 @@ static void active_load_balance(runqueue_t *busiest_rq, int busiest_cpu)
double_lock_balance(busiest_rq, target_rq); double_lock_balance(busiest_rq, target_rq);
/* Search for an sd spanning us and the target CPU. */ /* Search for an sd spanning us and the target CPU. */
for_each_domain(target_cpu, sd) for_each_domain(target_cpu, sd) {
if ((sd->flags & SD_LOAD_BALANCE) && if ((sd->flags & SD_LOAD_BALANCE) &&
cpu_isset(busiest_cpu, sd->span)) cpu_isset(busiest_cpu, sd->span))
break; break;
}
if (unlikely(sd == NULL)) if (unlikely(sd == NULL))
goto out; goto out;
...@@ -2691,48 +2689,35 @@ static inline void wakeup_busy_runqueue(runqueue_t *rq) ...@@ -2691,48 +2689,35 @@ static inline void wakeup_busy_runqueue(runqueue_t *rq)
resched_task(rq->idle); resched_task(rq->idle);
} }
static void wake_sleeping_dependent(int this_cpu, runqueue_t *this_rq) /*
* Called with interrupt disabled and this_rq's runqueue locked.
*/
static void wake_sleeping_dependent(int this_cpu)
{ {
struct sched_domain *tmp, *sd = NULL; struct sched_domain *tmp, *sd = NULL;
cpumask_t sibling_map;
int i; int i;
for_each_domain(this_cpu, tmp) for_each_domain(this_cpu, tmp) {
if (tmp->flags & SD_SHARE_CPUPOWER) if (tmp->flags & SD_SHARE_CPUPOWER) {
sd = tmp; sd = tmp;
break;
}
}
if (!sd) if (!sd)
return; return;
/* for_each_cpu_mask(i, sd->span) {
* Unlock the current runqueue because we have to lock in
* CPU order to avoid deadlocks. Caller knows that we might
* unlock. We keep IRQs disabled.
*/
spin_unlock(&this_rq->lock);
sibling_map = sd->span;
for_each_cpu_mask(i, sibling_map)
spin_lock(&cpu_rq(i)->lock);
/*
* We clear this CPU from the mask. This both simplifies the
* inner loop and keps this_rq locked when we exit:
*/
cpu_clear(this_cpu, sibling_map);
for_each_cpu_mask(i, sibling_map) {
runqueue_t *smt_rq = cpu_rq(i); runqueue_t *smt_rq = cpu_rq(i);
if (i == this_cpu)
continue;
if (unlikely(!spin_trylock(&smt_rq->lock)))
continue;
wakeup_busy_runqueue(smt_rq); wakeup_busy_runqueue(smt_rq);
spin_unlock(&smt_rq->lock);
} }
for_each_cpu_mask(i, sibling_map)
spin_unlock(&cpu_rq(i)->lock);
/*
* We exit with this_cpu's rq still held and IRQs
* still disabled:
*/
} }
/* /*
...@@ -2745,52 +2730,46 @@ static inline unsigned long smt_slice(task_t *p, struct sched_domain *sd) ...@@ -2745,52 +2730,46 @@ static inline unsigned long smt_slice(task_t *p, struct sched_domain *sd)
return p->time_slice * (100 - sd->per_cpu_gain) / 100; return p->time_slice * (100 - sd->per_cpu_gain) / 100;
} }
static int dependent_sleeper(int this_cpu, runqueue_t *this_rq) /*
* To minimise lock contention and not have to drop this_rq's runlock we only
* trylock the sibling runqueues and bypass those runqueues if we fail to
* acquire their lock. As we only trylock the normal locking order does not
* need to be obeyed.
*/
static int dependent_sleeper(int this_cpu, runqueue_t *this_rq, task_t *p)
{ {
struct sched_domain *tmp, *sd = NULL; struct sched_domain *tmp, *sd = NULL;
cpumask_t sibling_map;
prio_array_t *array;
int ret = 0, i; int ret = 0, i;
task_t *p;
for_each_domain(this_cpu, tmp) /* kernel/rt threads do not participate in dependent sleeping */
if (tmp->flags & SD_SHARE_CPUPOWER) if (!p->mm || rt_task(p))
return 0;
for_each_domain(this_cpu, tmp) {
if (tmp->flags & SD_SHARE_CPUPOWER) {
sd = tmp; sd = tmp;
break;
}
}
if (!sd) if (!sd)
return 0; return 0;
/* for_each_cpu_mask(i, sd->span) {
* The same locking rules and details apply as for runqueue_t *smt_rq;
* wake_sleeping_dependent(): task_t *smt_curr;
*/
spin_unlock(&this_rq->lock);
sibling_map = sd->span;
for_each_cpu_mask(i, sibling_map)
spin_lock(&cpu_rq(i)->lock);
cpu_clear(this_cpu, sibling_map);
/* if (i == this_cpu)
* Establish next task to be run - it might have gone away because continue;
* we released the runqueue lock above:
*/
if (!this_rq->nr_running)
goto out_unlock;
array = this_rq->active;
if (!array->nr_active)
array = this_rq->expired;
BUG_ON(!array->nr_active);
p = list_entry(array->queue[sched_find_first_bit(array->bitmap)].next, smt_rq = cpu_rq(i);
task_t, run_list); if (unlikely(!spin_trylock(&smt_rq->lock)))
continue;
for_each_cpu_mask(i, sibling_map) { smt_curr = smt_rq->curr;
runqueue_t *smt_rq = cpu_rq(i);
task_t *smt_curr = smt_rq->curr;
/* Kernel threads do not participate in dependent sleeping */ if (!smt_curr->mm)
if (!p->mm || !smt_curr->mm || rt_task(p)) goto unlock;
goto check_smt_task;
/* /*
* If a user task with lower static priority than the * If a user task with lower static priority than the
...@@ -2808,49 +2787,24 @@ static int dependent_sleeper(int this_cpu, runqueue_t *this_rq) ...@@ -2808,49 +2787,24 @@ static int dependent_sleeper(int this_cpu, runqueue_t *this_rq)
if ((jiffies % DEF_TIMESLICE) > if ((jiffies % DEF_TIMESLICE) >
(sd->per_cpu_gain * DEF_TIMESLICE / 100)) (sd->per_cpu_gain * DEF_TIMESLICE / 100))
ret = 1; ret = 1;
} else } else {
if (smt_curr->static_prio < p->static_prio && if (smt_curr->static_prio < p->static_prio &&
!TASK_PREEMPTS_CURR(p, smt_rq) && !TASK_PREEMPTS_CURR(p, smt_rq) &&
smt_slice(smt_curr, sd) > task_timeslice(p)) smt_slice(smt_curr, sd) > task_timeslice(p))
ret = 1; ret = 1;
check_smt_task:
if ((!smt_curr->mm && smt_curr != smt_rq->idle) ||
rt_task(smt_curr))
continue;
if (!p->mm) {
wakeup_busy_runqueue(smt_rq);
continue;
}
/*
* Reschedule a lower priority task on the SMT sibling for
* it to be put to sleep, or wake it up if it has been put to
* sleep for priority reasons to see if it should run now.
*/
if (rt_task(p)) {
if ((jiffies % DEF_TIMESLICE) >
(sd->per_cpu_gain * DEF_TIMESLICE / 100))
resched_task(smt_curr);
} else {
if (TASK_PREEMPTS_CURR(p, smt_rq) &&
smt_slice(p, sd) > task_timeslice(smt_curr))
resched_task(smt_curr);
else
wakeup_busy_runqueue(smt_rq);
} }
unlock:
spin_unlock(&smt_rq->lock);
} }
out_unlock:
for_each_cpu_mask(i, sibling_map)
spin_unlock(&cpu_rq(i)->lock);
return ret; return ret;
} }
#else #else
static inline void wake_sleeping_dependent(int this_cpu, runqueue_t *this_rq) static inline void wake_sleeping_dependent(int this_cpu)
{ {
} }
static inline int dependent_sleeper(int this_cpu, runqueue_t *this_rq) static inline int dependent_sleeper(int this_cpu, runqueue_t *this_rq,
task_t *p)
{ {
return 0; return 0;
} }
...@@ -2972,32 +2926,13 @@ need_resched_nonpreemptible: ...@@ -2972,32 +2926,13 @@ need_resched_nonpreemptible:
cpu = smp_processor_id(); cpu = smp_processor_id();
if (unlikely(!rq->nr_running)) { if (unlikely(!rq->nr_running)) {
go_idle:
idle_balance(cpu, rq); idle_balance(cpu, rq);
if (!rq->nr_running) { if (!rq->nr_running) {
next = rq->idle; next = rq->idle;
rq->expired_timestamp = 0; rq->expired_timestamp = 0;
wake_sleeping_dependent(cpu, rq); wake_sleeping_dependent(cpu);
/*
* wake_sleeping_dependent() might have released
* the runqueue, so break out if we got new
* tasks meanwhile:
*/
if (!rq->nr_running)
goto switch_tasks;
}
} else {
if (dependent_sleeper(cpu, rq)) {
next = rq->idle;
goto switch_tasks; goto switch_tasks;
} }
/*
* dependent_sleeper() releases and reacquires the runqueue
* lock, hence go into the idle loop if the rq went
* empty meanwhile:
*/
if (unlikely(!rq->nr_running))
goto go_idle;
} }
array = rq->active; array = rq->active;
...@@ -3035,6 +2970,8 @@ go_idle: ...@@ -3035,6 +2970,8 @@ go_idle:
} }
} }
next->sleep_type = SLEEP_NORMAL; next->sleep_type = SLEEP_NORMAL;
if (dependent_sleeper(cpu, rq, next))
next = rq->idle;
switch_tasks: switch_tasks:
if (next == rq->idle) if (next == rq->idle)
schedstat_inc(rq, sched_goidle); schedstat_inc(rq, sched_goidle);
...@@ -6144,7 +6081,6 @@ void __init sched_init(void) ...@@ -6144,7 +6081,6 @@ void __init sched_init(void)
rq->push_cpu = 0; rq->push_cpu = 0;
rq->migration_thread = NULL; rq->migration_thread = NULL;
INIT_LIST_HEAD(&rq->migration_queue); INIT_LIST_HEAD(&rq->migration_queue);
rq->cpu = i;
#endif #endif
atomic_set(&rq->nr_iowait, 0); atomic_set(&rq->nr_iowait, 0);
...@@ -6205,7 +6141,7 @@ void normalize_rt_tasks(void) ...@@ -6205,7 +6141,7 @@ void normalize_rt_tasks(void)
runqueue_t *rq; runqueue_t *rq;
read_lock_irq(&tasklist_lock); read_lock_irq(&tasklist_lock);
for_each_process (p) { for_each_process(p) {
if (!rt_task(p)) if (!rt_task(p))
continue; continue;
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment