Commit 487ac708 authored by Thomas Gleixner

rtmutex: prevent missed wakeups

The sleeping locks implementation based on rtmutexes can miss wakeups
for two reasons:

1) The unconditional use of TASK_UNINTERRUPTIBLE for the blocking state

   Results in missed wakeups from wake_up_interruptible*()

   state = TASK_INTERRUPTIBLE;
   blocks_on_lock()
     state = TASK_UNINTERRUPTIBLE;
     schedule();
     ....
     acquires_lock();
     restore_state();

   Until the waiter has restored its state, wake_up_interruptible*()
   will fail.

2) The rtmutex wakeup intermediate state TASK_RUNNING_MUTEX

   Results in missed wakeups from wake_up*()

   waiter is woken by mutex wakeup
     waiter->state = TASK_RUNNING_MUTEX;
   ....
   acquires_lock();
   restore_state();

   Until the waiter has restored its state, wake_up*() will fail. (The
   sketch after this list models the wake-side check that drops both
   kinds of wakeups.)
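
Both cases come down to the state test on the wakeup side: try_to_wake_up()
delivers a wakeup only if the waiter's current state intersects the caller's
mask (TASK_INTERRUPTIBLE for wake_up_interruptible*(), TASK_NORMAL for plain
wake_up*()). Below is a small self-contained model of that test; the bit
values and the userspace framing are assumptions for illustration only, not
kernel code:

   #include <stdio.h>

   /* Hypothetical state/mask bit values, for illustration only; the real
    * definitions live in include/linux/sched.h (TASK_RUNNING_MUTEX is
    * RT-tree specific and its value is assumed here). */
   #define TASK_INTERRUPTIBLE      0x0001
   #define TASK_UNINTERRUPTIBLE    0x0002
   #define TASK_NORMAL             (TASK_INTERRUPTIBLE | TASK_UNINTERRUPTIBLE)
   #define TASK_RUNNING_MUTEX      0x0010

   /* Condensed model of the state test in try_to_wake_up(): a wakeup is
    * only delivered when the waiter's state intersects the caller's mask. */
   static int wakeup_seen(unsigned int waiter_state, unsigned int wake_mask)
   {
           return (waiter_state & wake_mask) != 0;
   }

   int main(void)
   {
           /* case 1: blocked in TASK_UNINTERRUPTIBLE, woken by
            * wake_up_interruptible() -> mask test fails, wakeup lost */
           printf("case 1 delivered: %d\n",
                  wakeup_seen(TASK_UNINTERRUPTIBLE, TASK_INTERRUPTIBLE));

           /* case 2: intermediate TASK_RUNNING_MUTEX state, woken by plain
            * wake_up() (TASK_NORMAL) -> mask test fails, wakeup lost */
           printf("case 2 delivered: %d\n",
                  wakeup_seen(TASK_RUNNING_MUTEX, TASK_NORMAL));
           return 0;
   }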

Solution:

Instead of setting the state to TASK_RUNNING_MUTEX in the mutex wakeup
case, we logically OR TASK_RUNNING_MUTEX into the current waiter
state. This keeps the original bits (TASK_INTERRUPTIBLE /
TASK_UNINTERRUPTIBLE) intact, so real wakeups still succeed. When a
task blocks on a lock in state TASK_INTERRUPTIBLE and is woken up by a
real wakeup, we store TASK_RUNNING as the state to restore and can
safely block in TASK_UNINTERRUPTIBLE from that point on, avoiding
further wakeups which would only make us loop in the lock code.
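
For illustration, here is a small self-contained walk-through of the state
bits in that sequence; the numeric bit values and the userspace framing are
assumptions for the example only, the real change is in the
rt_set_current_blocked_state() and try_to_wake_up() hunks below:

   #include <stdio.h>

   /* Same hypothetical bit values as in the sketch above; illustration
    * only, not kernel code. */
   #define TASK_RUNNING            0x0000
   #define TASK_INTERRUPTIBLE      0x0001
   #define TASK_UNINTERRUPTIBLE    0x0002
   #define TASK_RUNNING_MUTEX      0x0010

   int main(void)
   {
           unsigned int state = TASK_INTERRUPTIBLE;  /* waiter blocks interruptibly */
           unsigned int saved_state = state;         /* restored once the lock is taken */

           /* rtmutex wakeup: OR in the intermediate bit instead of
            * overwriting, so the INTERRUPTIBLE bit stays visible. */
           state |= TASK_RUNNING_MUTEX;

           /* a real wake_up_interruptible() now matches the state ... */
           if (state & TASK_INTERRUPTIBLE)
                   state = TASK_RUNNING;

           /* ... the lock code remembers the real wakeup for the restore ... */
           if (state == TASK_RUNNING)
                   saved_state = TASK_RUNNING;

           /* ... and blocks uninterruptibly from here on; further wakeups
            * would only make it loop in the lock code. */
           state = TASK_UNINTERRUPTIBLE;

           printf("blocked state now: %#x, state to restore: %#x\n",
                  state, saved_state);
           return 0;
   }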

This also removes the extra TASK_RUNNING_MUTEX bits from the wakeup
side (TASK_ALL, wake_up_state(), default_wake_function()), as they are
no longer necessary.
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
parent 640f0c05
@@ -204,8 +204,7 @@ extern struct semaphore kernel_sem;
 
 /* Convenience macros for the sake of wake_up */
 #define TASK_NORMAL             (TASK_INTERRUPTIBLE | TASK_UNINTERRUPTIBLE)
-#define TASK_ALL                (TASK_NORMAL | __TASK_STOPPED | __TASK_TRACED | \
-                                 TASK_RUNNING_MUTEX)
+#define TASK_ALL                (TASK_NORMAL | __TASK_STOPPED | __TASK_TRACED)
 
 /* get_task_state() */
 #define TASK_REPORT             (TASK_RUNNING | TASK_RUNNING_MUTEX | \
@@ -729,16 +729,32 @@ static int adaptive_wait(struct rt_mutex_waiter *waiter,
 /*
  * The state setting needs to preserve the original state and needs to
  * take care of non rtmutex wakeups.
+ *
+ * Called with rtmutex->wait_lock held to serialize against rtmutex
+ * wakeups().
  */
 static inline unsigned long
 rt_set_current_blocked_state(unsigned long saved_state)
 {
-        unsigned long state;
+        unsigned long state, block_state;
 
-        state = xchg(&current->state, TASK_UNINTERRUPTIBLE);
+        /*
+         * If state is TASK_INTERRUPTIBLE, then we set the state for
+         * blocking to TASK_INTERRUPTIBLE as well, otherwise we would
+         * miss real wakeups via wake_up_interruptible(). If such a
+         * wakeup happens we see the running state and preserve it in
+         * saved_state. Now we can ignore further wakeups as we will
+         * return in state running from our "spin" sleep.
+         */
+        if (saved_state == TASK_INTERRUPTIBLE)
+                block_state = TASK_INTERRUPTIBLE;
+        else
+                block_state = TASK_UNINTERRUPTIBLE;
+
+        state = xchg(&current->state, block_state);
         /*
          * Take care of non rtmutex wakeups. rtmutex wakeups
-         * set the state to TASK_RUNNING_MUTEX.
+         * or TASK_RUNNING_MUTEX to (UN)INTERRUPTIBLE.
          */
         if (state == TASK_RUNNING)
                 saved_state = TASK_RUNNING;
@@ -2530,8 +2530,16 @@ out_running:
         trace_sched_wakeup(rq, p, success);
         check_preempt_curr(rq, p, sync);
 
+        /*
+         * For a mutex wakeup we or TASK_RUNNING_MUTEX to the task
+         * state to preserve the original state, so a real wakeup
+         * still can see the (UN)INTERRUPTIBLE bits in the state check
+         * above. We dont have to worry about the | TASK_RUNNING_MUTEX
+         * here. The waiter is serialized by the mutex lock and nobody
+         * else can fiddle with p->state as we hold rq lock.
+         */
         if (mutex)
-                p->state = TASK_RUNNING_MUTEX;
+                p->state |= TASK_RUNNING_MUTEX;
         else
                 p->state = TASK_RUNNING;
 #ifdef CONFIG_SMP
@@ -2581,7 +2589,7 @@ EXPORT_SYMBOL(wake_up_process_mutex_sync);
 
 int wake_up_state(struct task_struct *p, unsigned int state)
 {
-        return try_to_wake_up(p, state | TASK_RUNNING_MUTEX, 0, 0);
+        return try_to_wake_up(p, state, 0, 0);
 }
 
 /*
@@ -5385,7 +5393,7 @@ need_resched_nonpreemptible:
         update_rq_clock(rq);
         clear_tsk_need_resched(prev);
 
-        if ((prev->state & ~TASK_RUNNING_MUTEX) &&
+        if (!(prev->state & TASK_RUNNING_MUTEX) && prev->state &&
             !(preempt_count() & PREEMPT_ACTIVE)) {
                 if (unlikely(signal_pending_state(prev->state, prev)))
                         prev->state = TASK_RUNNING;
@@ -5585,8 +5593,7 @@ asmlinkage void __sched preempt_schedule_irq(void)
 int default_wake_function(wait_queue_t *curr, unsigned mode, int sync,
                           void *key)
 {
-        return try_to_wake_up(curr->private, mode | TASK_RUNNING_MUTEX,
-                              sync, 0);
+        return try_to_wake_up(curr->private, mode, sync, 0);
 }
 EXPORT_SYMBOL(default_wake_function);
 