Commit 5b6e135f authored by Ingo Molnar, committed by Thomas Gleixner

sched: mmdrop needs to be delayed on -rt
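
In short: on PREEMPT_RT the final mmdrop() should not do its heavy work from within the scheduler, so finish_task_switch() now hands the mm to a new helper, mmdrop_delayed(), which queues it on a per-CPU delayed_drop_list and wakes a per-CPU "desched/N" kthread; that thread performs the actual __mmdrop() later, in normal schedulable context. Condensed from the hunks below (not a separate API; all identifiers are the ones the patch introduces):

    /* New inline helper, declared next to mmdrop() below: */
    static inline void mmdrop_delayed(struct mm_struct *mm)
    {
            if (atomic_dec_and_test(&mm->mm_count))
                    __mmdrop_delayed(mm);   /* queues the mm on this CPU's
                                             * delayed_drop_list and wakes the
                                             * "desched/N" kthread */
    }

    /* Scheduler side, in finish_task_switch(): */
    if (mm)
            mmdrop_delayed(mm);             /* was: mmdrop(mm) */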

Signed-off-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
parent 0a930ce9
@@ -247,6 +247,9 @@ struct mm_struct {
         /* Architecture-specific MM context */
         mm_context_t context;
 
+        /* realtime bits */
+        struct list_head delayed_drop;
+
         /* Swap token stuff */
         /*
          * Last value of global fault stamp as seen by this process.
...
@@ -2111,12 +2111,20 @@ extern struct mm_struct * mm_alloc(void);
 /* mmdrop drops the mm and the page tables */
 extern void __mmdrop(struct mm_struct *);
+extern void __mmdrop_delayed(struct mm_struct *);
+
 static inline void mmdrop(struct mm_struct * mm)
 {
         if (unlikely(atomic_dec_and_test(&mm->mm_count)))
                 __mmdrop(mm);
 }
+static inline void mmdrop_delayed(struct mm_struct * mm)
+{
+        if (atomic_dec_and_test(&mm->mm_count))
+                __mmdrop_delayed(mm);
+}
+
 /* mmput gets rid of the mappings and all user-space */
 extern void mmput(struct mm_struct *);
 
 /* Grab a reference to a task's mm, if it is not already going away */
...
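Callers that may drop the last mm_count reference from a context where __mmdrop() is unsafe on -rt are expected to switch to the new helper. A minimal caller sketch, mirroring the idle_task_exit() hunk further down (the function name here is hypothetical; only the two drop calls and the CONFIG_PREEMPT_RT guard come from the patch):

    /* Hypothetical example caller, for illustration only: */
    static void drop_borrowed_mm(struct mm_struct *mm)
    {
    #ifdef CONFIG_PREEMPT_RT
            mmdrop_delayed(mm);     /* freed later by the per-CPU desched thread */
    #else
            mmdrop(mm);             /* immediate __mmdrop() once mm_count hits 0 */
    #endif
    }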
@@ -38,6 +38,7 @@
 #include <linux/syscalls.h>
 #include <linux/jiffies.h>
 #include <linux/tracehook.h>
+#include <linux/interrupt.h>
 #include <linux/futex.h>
 #include <linux/compat.h>
 #include <linux/task_io_accounting_ops.h>
@@ -48,6 +49,8 @@
 #include <linux/memcontrol.h>
 #include <linux/ftrace.h>
 #include <linux/profile.h>
+#include <linux/kthread.h>
+#include <linux/notifier.h>
 #include <linux/rmap.h>
 #include <linux/acct.h>
 #include <linux/tsacct_kern.h>
@@ -88,6 +91,14 @@ DEFINE_RWLOCK(tasklist_lock);  /* outer */
 __cacheline_aligned DEFINE_RWLOCK(tasklist_lock);  /* outer */
 #endif
 
+/*
+ * Delayed mmdrop. In the PREEMPT_RT case we
+ * dont want to do this from the scheduling
+ * context.
+ */
+static DEFINE_PER_CPU(struct task_struct *, desched_task);
+static DEFINE_PER_CPU(struct list_head, delayed_drop_list);
+
 int nr_processes(void)
 {
         int cpu;
@@ -174,6 +185,8 @@ void __put_task_struct(struct task_struct *tsk)
 
 void __init fork_init(unsigned long mempages)
 {
+        int i;
+
 #ifndef __HAVE_ARCH_TASK_STRUCT_ALLOCATOR
 #ifndef ARCH_MIN_TASKALIGN
 #define ARCH_MIN_TASKALIGN L1_CACHE_BYTES
@@ -204,6 +217,9 @@ void __init fork_init(unsigned long mempages)
         init_task.signal->rlim[RLIMIT_NPROC].rlim_max = max_threads/2;
         init_task.signal->rlim[RLIMIT_SIGPENDING] =
                 init_task.signal->rlim[RLIMIT_NPROC];
+
+        for (i = 0; i < NR_CPUS; i++)
+                INIT_LIST_HEAD(&per_cpu(delayed_drop_list, i));
 }
 
 int __attribute__((weak)) arch_dup_task_struct(struct task_struct *dst,
@@ -285,6 +301,7 @@ static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm)
         mm->locked_vm = 0;
         mm->mmap = NULL;
         mm->mmap_cache = NULL;
+        INIT_LIST_HEAD(&mm->delayed_drop);
         mm->free_area_cache = oldmm->mmap_base;
         mm->cached_hole_size = ~0UL;
         mm->map_count = 0;
@@ -1270,7 +1287,9 @@ static struct task_struct *copy_process(unsigned long clone_flags,
                         attach_pid(p, PIDTYPE_PGID, task_pgrp(current));
                         attach_pid(p, PIDTYPE_SID, task_session(current));
                         list_add_tail_rcu(&p->tasks, &init_task.tasks);
+                        preempt_disable();
                         __get_cpu_var(process_counts)++;
+                        preempt_enable();
                 }
                 attach_pid(p, PIDTYPE_PID, pid);
                 nr_threads++;
@@ -1748,3 +1767,138 @@ int unshare_files(struct files_struct **displaced)
         task_unlock(task);
         return 0;
 }
+
+static int mmdrop_complete(void)
+{
+        struct list_head *head;
+        int ret = 0;
+
+        head = &get_cpu_var(delayed_drop_list);
+        while (!list_empty(head)) {
+                struct mm_struct *mm = list_entry(head->next,
+                                struct mm_struct, delayed_drop);
+                list_del(&mm->delayed_drop);
+                put_cpu_var(delayed_drop_list);
+
+                __mmdrop(mm);
+                ret = 1;
+
+                head = &get_cpu_var(delayed_drop_list);
+        }
+        put_cpu_var(delayed_drop_list);
+
+        return ret;
+}
+
+/*
+ * We dont want to do complex work from the scheduler, thus
+ * we delay the work to a per-CPU worker thread:
+ */
+void __mmdrop_delayed(struct mm_struct *mm)
+{
+        struct task_struct *desched_task;
+        struct list_head *head;
+
+        head = &get_cpu_var(delayed_drop_list);
+        list_add_tail(&mm->delayed_drop, head);
+        desched_task = __get_cpu_var(desched_task);
+        if (desched_task)
+                wake_up_process(desched_task);
+        put_cpu_var(delayed_drop_list);
+}
+
+static void takeover_delayed_drop(int hotcpu)
+{
+        struct list_head *head = &per_cpu(delayed_drop_list, hotcpu);
+
+        while (!list_empty(head)) {
+                struct mm_struct *mm = list_entry(head->next,
+                                struct mm_struct, delayed_drop);
+
+                list_del(&mm->delayed_drop);
+                __mmdrop_delayed(mm);
+        }
+}
+
+static int desched_thread(void * __bind_cpu)
+{
+        set_user_nice(current, -10);
+        current->flags |= PF_NOFREEZE | PF_SOFTIRQ;
+
+        set_current_state(TASK_INTERRUPTIBLE);
+
+        while (!kthread_should_stop()) {
+
+                if (mmdrop_complete())
+                        continue;
+                schedule();
+
+                /*
+                 * This must be called from time to time on ia64, and is a
+                 * no-op on other archs. Used to be in cpu_idle(), but with
+                 * the new -rt semantics it can't stay there.
+                 */
+                check_pgt_cache();
+
+                set_current_state(TASK_INTERRUPTIBLE);
+        }
+        __set_current_state(TASK_RUNNING);
+        return 0;
+}
+
+static int __devinit cpu_callback(struct notifier_block *nfb,
+                                  unsigned long action,
+                                  void *hcpu)
+{
+        int hotcpu = (unsigned long)hcpu;
+        struct task_struct *p;
+
+        switch (action) {
+        case CPU_UP_PREPARE:
+
+                BUG_ON(per_cpu(desched_task, hotcpu));
+                INIT_LIST_HEAD(&per_cpu(delayed_drop_list, hotcpu));
+
+                p = kthread_create(desched_thread, hcpu, "desched/%d", hotcpu);
+                if (IS_ERR(p)) {
+                        printk("desched_thread for %i failed\n", hotcpu);
+                        return NOTIFY_BAD;
+                }
+                per_cpu(desched_task, hotcpu) = p;
+                kthread_bind(p, hotcpu);
+                break;
+        case CPU_ONLINE:
+
+                wake_up_process(per_cpu(desched_task, hotcpu));
+                break;
+#ifdef CONFIG_HOTPLUG_CPU
+        case CPU_UP_CANCELED:
+
+                /* Unbind so it can run. Fall thru. */
+                kthread_bind(per_cpu(desched_task, hotcpu), smp_processor_id());
+        case CPU_DEAD:
+
+                p = per_cpu(desched_task, hotcpu);
+                per_cpu(desched_task, hotcpu) = NULL;
+                kthread_stop(p);
+                takeover_delayed_drop(hotcpu);
+                takeover_tasklets(hotcpu);
+                break;
+#endif /* CONFIG_HOTPLUG_CPU */
+        }
+
+        return NOTIFY_OK;
+}
+
+static struct notifier_block __devinitdata cpu_nfb = {
+        .notifier_call = cpu_callback
+};
+
+__init int spawn_desched_task(void)
+{
+        void *cpu = (void *)(long)smp_processor_id();
+
+        cpu_callback(&cpu_nfb, CPU_UP_PREPARE, cpu);
+        cpu_callback(&cpu_nfb, CPU_ONLINE, cpu);
+        register_cpu_notifier(&cpu_nfb);
+
+        return 0;
+}
@@ -2938,8 +2938,12 @@ static void finish_task_switch(struct rq *rq, struct task_struct *prev)
 #endif
 
         fire_sched_in_preempt_notifiers(current);
+        /*
+         * Delay the final freeing of the mm or task, so that we dont have
+         * to do complex work from within the scheduler:
+         */
         if (mm)
-                mmdrop(mm);
+                mmdrop_delayed(mm);
         if (unlikely(prev_state == TASK_DEAD)) {
                 /*
                  * Remove function-return probe instances associated with this
@@ -7573,7 +7577,11 @@ void idle_task_exit(void)
         if (mm != &init_mm)
                 switch_mm(mm, &init_mm, current);
+#ifdef CONFIG_PREEMPT_RT
+        mmdrop_delayed(mm);
+#else
         mmdrop(mm);
+#endif
 }
 
 /* called under rq->lock with disabled interrupts */
...