Commit ff3fd6af authored by Ingo Molnar, committed by Thomas Gleixner

mm: page_alloc: rt-friendly per-cpu pages

RT-friendly per-cpu pages: convert the irqs-off per-cpu locking
method into a preemptible method that takes explicit per-cpu locks.
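
Roughly, every pcp call site moves from the irqs-off pattern to an
explicit lock/unlock helper pair. A simplified sketch of the
free_hot_cold_page() change below (the elided body is unchanged):

	/* before: disabling interrupts pins the task and forbids preemption */
	pcp = &zone_pcp(zone, get_cpu())->pcp;
	local_irq_save(flags);
	...
	local_irq_restore(flags);
	put_cpu();

	/* after: an explicit per-cpu lock (a sleeping lock on PREEMPT_RT,
	 * plain local_irq_save() plus smp_processor_id() otherwise) */
	pset = get_zone_pcp(zone, &flags, &this_cpu);
	pcp = &pset->pcp;
	...
	put_zone_pcp(zone, flags, this_cpu);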

Contains fixes from:
	 Peter Zijlstra <a.p.zijlstra@chello.nl>
	 Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
parent 4672c440
@@ -161,6 +161,53 @@ static unsigned long __meminitdata dma_reserve;
EXPORT_SYMBOL(movable_zone);
#endif /* CONFIG_ARCH_POPULATES_NODE_MAP */
#ifdef CONFIG_PREEMPT_RT
static DEFINE_PER_CPU_LOCKED(int, pcp_locks);
#endif
static inline void __lock_cpu_pcp(unsigned long *flags, int cpu)
{
#ifdef CONFIG_PREEMPT_RT
spin_lock(&__get_cpu_lock(pcp_locks, cpu));
flags = 0;
#else
local_irq_save(*flags);
#endif
}
static inline void lock_cpu_pcp(unsigned long *flags, int *this_cpu)
{
#ifdef CONFIG_PREEMPT_RT
(void)get_cpu_var_locked(pcp_locks, this_cpu);
flags = 0;
#else
local_irq_save(*flags);
*this_cpu = smp_processor_id();
#endif
}
static inline void unlock_cpu_pcp(unsigned long flags, int this_cpu)
{
#ifdef CONFIG_PREEMPT_RT
put_cpu_var_locked(pcp_locks, this_cpu);
#else
local_irq_restore(flags);
#endif
}
static struct per_cpu_pageset *
get_zone_pcp(struct zone *zone, unsigned long *flags, int *this_cpu)
{
lock_cpu_pcp(flags, this_cpu);
return zone_pcp(zone, *this_cpu);
}
static void
put_zone_pcp(struct zone *zone, unsigned long flags, int this_cpu)
{
unlock_cpu_pcp(flags, this_cpu);
}
#if MAX_NUMNODES > 1
int nr_node_ids __read_mostly = MAX_NUMNODES;
int nr_online_nodes __read_mostly = 1;
@@ -555,8 +602,7 @@ static void free_one_page(struct zone *zone, struct page *page, int order,
static void __free_pages_ok(struct page *page, unsigned int order)
{
unsigned long flags;
int i;
int bad = 0;
int i, this_cpu, bad = 0;
int wasMlocked = TestClearPageMlocked(page);
kmemcheck_free_shadow(page, order);
@@ -574,13 +620,13 @@ static void __free_pages_ok(struct page *page, unsigned int order)
arch_free_page(page, order);
kernel_map_pages(page, 1 << order, 0);
local_irq_save(flags);
lock_cpu_pcp(&flags, &this_cpu);
if (unlikely(wasMlocked))
free_page_mlock(page);
__count_vm_events(PGFREE, 1 << order);
count_vm_events(PGFREE, 1 << order);
unlock_cpu_pcp(flags, this_cpu);
free_one_page(page_zone(page), page, order,
get_pageblock_migratetype(page));
local_irq_restore(flags);
get_pageblock_migratetype(page));
}
/*
@@ -923,15 +969,16 @@ void drain_zone_pages(struct zone *zone, struct per_cpu_pages *pcp)
{
unsigned long flags;
int to_drain;
int this_cpu;
local_irq_save(flags);
lock_cpu_pcp(&flags, &this_cpu);
if (pcp->count >= pcp->batch)
to_drain = pcp->batch;
else
to_drain = pcp->count;
free_pages_bulk(zone, to_drain, &pcp->list, 0);
pcp->count -= to_drain;
local_irq_restore(flags);
unlock_cpu_pcp(flags, this_cpu);
}
#endif
@@ -951,13 +998,17 @@ static void drain_pages(unsigned int cpu)
struct per_cpu_pageset *pset;
struct per_cpu_pages *pcp;
__lock_cpu_pcp(&flags, cpu);
pset = zone_pcp(zone, cpu);
if (!pset) {
unlock_cpu_pcp(flags, cpu);
WARN_ON(1);
continue;
}
pcp = &pset->pcp;
local_irq_save(flags);
free_pages_bulk(zone, pcp->count, &pcp->list, 0);
pcp->count = 0;
local_irq_restore(flags);
unlock_cpu_pcp(flags, cpu);
}
}
@@ -969,12 +1020,52 @@ void drain_local_pages(void *arg)
drain_pages(smp_processor_id());
}
#ifdef CONFIG_PREEMPT_RT
static void drain_local_pages_work(struct work_struct *wrk)
{
drain_pages(smp_processor_id());
}
#endif
/*
* Spill all the per-cpu pages from all CPUs back into the buddy allocator
*/
void drain_all_pages(void)
{
#ifdef CONFIG_PREEMPT_RT
/*
* HACK!!!!!
* For RT we can't use IPIs to run drain_local_pages, since
* that code will call spin_locks that will now sleep.
* But, schedule_on_each_cpu will call kzalloc, which will
* call page_alloc which was what calls this.
*
* Luckily, there's a condition to get here, and that is if
* the order passed in to alloc_pages is greater than 0
* (alloced more than a page size). The slabs only allocate
* what is needed, and the allocation made by schedule_on_each_cpu
* does an alloc of "sizeof(void *)*nr_cpu_ids".
*
* So we can safely call schedule_on_each_cpu if that number
* is less than a page. Otherwise don't bother. At least warn of
* this issue.
*
* And yes, this is one big hack. Please fix ;-)
*/
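/*
 * For example, assuming 8-byte pointers and a 4096-byte PAGE_SIZE:
 * with nr_cpu_ids >= 512 the helper allocation is 8 * 512 = 4096
 * bytes or more, so such configs take the fallback branch below.
 */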
if (sizeof(void *)*nr_cpu_ids < PAGE_SIZE)
schedule_on_each_cpu(drain_local_pages_work);
else {
static int once;
if (!once) {
printk(KERN_ERR "Can't drain all CPUS due to possible recursion\n");
once = 1;
}
drain_local_pages(NULL);
}
#else
on_each_cpu(drain_local_pages, NULL, 1);
#endif
}
#ifdef CONFIG_HIBERNATION
@@ -1019,9 +1110,10 @@ void mark_free_pages(struct zone *zone)
static void free_hot_cold_page(struct page *page, int cold)
{
struct zone *zone = page_zone(page);
struct per_cpu_pageset *pset;
struct per_cpu_pages *pcp;
unsigned long flags;
int wasMlocked = TestClearPageMlocked(page);
int this_cpu, wasMlocked = TestClearPageMlocked(page);
kmemcheck_free_shadow(page, 0);
@@ -1037,12 +1129,12 @@ static void free_hot_cold_page(struct page *page, int cold)
arch_free_page(page, 0);
kernel_map_pages(page, 1, 0);
pcp = &zone_pcp(zone, get_cpu())->pcp;
pset = get_zone_pcp(zone, &flags, &this_cpu);
pcp = &pset->pcp;
set_page_private(page, get_pageblock_migratetype(page));
local_irq_save(flags);
if (unlikely(wasMlocked))
free_page_mlock(page);
__count_vm_event(PGFREE);
count_vm_event(PGFREE);
if (cold)
list_add_tail(&page->lru, &pcp->list);
@@ -1053,8 +1145,7 @@ static void free_hot_cold_page(struct page *page, int cold)
free_pages_bulk(zone, pcp->batch, &pcp->list, 0);
pcp->count -= pcp->batch;
}
local_irq_restore(flags);
put_cpu();
put_zone_pcp(zone, flags, this_cpu);
}
void free_hot_page(struct page *page)
@@ -1108,15 +1199,15 @@ struct page *buffered_rmqueue(struct zone *preferred_zone,
unsigned long flags;
struct page *page;
int cold = !!(gfp_flags & __GFP_COLD);
int cpu;
struct per_cpu_pageset *pset;
int this_cpu;
again:
cpu = get_cpu();
pset = get_zone_pcp(zone, &flags, &this_cpu);
if (likely(order == 0)) {
struct per_cpu_pages *pcp;
struct per_cpu_pages *pcp = &pset->pcp;
pcp = &zone_pcp(zone, cpu)->pcp;
local_irq_save(flags);
if (!pcp->count) {
pcp->count = rmqueue_bulk(zone, 0,
pcp->batch, &pcp->list, migratetype);
@@ -1158,7 +1249,7 @@ again:
*/
WARN_ON_ONCE(order > 1);
}
spin_lock_irqsave(&zone->lock, flags);
spin_lock(&zone->lock);
page = __rmqueue(zone, order, migratetype);
__mod_zone_page_state(zone, NR_FREE_PAGES, -(1 << order));
spin_unlock(&zone->lock);
@@ -1168,8 +1259,7 @@ again:
__count_zone_vm_events(PGALLOC, zone, 1 << order);
zone_statistics(preferred_zone, zone);
local_irq_restore(flags);
put_cpu();
put_zone_pcp(zone, flags, this_cpu);
VM_BUG_ON(bad_range(zone, page));
if (prep_new_page(page, order, gfp_flags))
@@ -1177,8 +1267,7 @@ again:
return page;
failed:
local_irq_restore(flags);
put_cpu();
put_zone_pcp(zone, flags, this_cpu);
return NULL;
}
@@ -3036,7 +3125,23 @@ static inline void free_zone_pagesets(int cpu)
struct zone *zone;
for_each_zone(zone) {
struct per_cpu_pageset *pset = zone_pcp(zone, cpu);
unsigned long flags;
struct per_cpu_pageset *pset;
/*
* On PREEMPT_RT the allocator is preemptible, therefore
* kstopmachine can preempt a process in the middle of an
* allocation, freeing the pset underneath such a process
* isn't a good idea.
*
* Take the per-cpu pcp lock to allow the task to complete
* before we free it. New tasks will be held off by the
* cpu_online() check in get_cpu_var_locked().
*/
__lock_cpu_pcp(&flags, cpu);
pset = zone_pcp(zone, cpu);
zone_pcp(zone, cpu) = NULL;
unlock_cpu_pcp(flags, cpu);
/* Free per_cpu_pageset if it is slab allocated */
if (pset != &boot_pageset[cpu])
......