Commit ba36d1d9 authored by Thomas Gleixner

Merge branch 'rt/mm' into rt/base

Conflicts:
	include/linux/percpu.h
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
parents 55f9e9a3 104f75cb
@@ -345,6 +345,9 @@ static inline void list_splice_tail_init(struct list_head *list,
#define list_first_entry(ptr, type, member) \
list_entry((ptr)->next, type, member)
#define list_last_entry(ptr, type, member) \
list_entry((ptr)->prev, type, member)
/**
* list_for_each - iterate over a list
* @pos: the &struct list_head to use as a loop cursor.
......
@@ -18,7 +18,7 @@ struct quicklist {
int nr_pages;
};
DECLARE_PER_CPU(struct quicklist, quicklist)[CONFIG_NR_QUICK];
DECLARE_PER_CPU_LOCKED(struct quicklist, quicklist)[CONFIG_NR_QUICK];
/*
* The two key functions quicklist_alloc and quicklist_free are inline so
@@ -30,19 +30,27 @@ DECLARE_PER_CPU(struct quicklist, quicklist)[CONFIG_NR_QUICK];
* The fast path in quicklist_alloc touches only a per cpu cacheline and
* the first cacheline of the page itself. There is minimal overhead involved.
*/
static inline void *quicklist_alloc(int nr, gfp_t flags, void (*ctor)(void *))
static inline void *__quicklist_alloc(struct quicklist *q)
{
struct quicklist *q;
void **p = NULL;
void **p = q->page;
q =&get_cpu_var(quicklist)[nr];
p = q->page;
if (likely(p)) {
q->page = p[0];
p[0] = NULL;
q->nr_pages--;
}
put_cpu_var(quicklist);
return p;
}
static inline void *quicklist_alloc(int nr, gfp_t flags, void (*ctor)(void *))
{
struct quicklist *q;
void **p;
int cpu;
q = &get_cpu_var_locked(quicklist, &cpu)[nr];
p = __quicklist_alloc(q);
put_cpu_var_locked(quicklist, cpu);
if (likely(p))
return p;
@@ -56,12 +64,13 @@ static inline void __quicklist_free(int nr, void (*dtor)(void *), void *p,
struct page *page)
{
struct quicklist *q;
int cpu;
q = &get_cpu_var(quicklist)[nr];
q = &get_cpu_var_locked(quicklist, &cpu)[nr];
*(void **)p = q->page;
q->page = p;
q->nr_pages++;
put_cpu_var(quicklist);
put_cpu_var_locked(quicklist, cpu);
}
static inline void quicklist_free(int nr, void (*dtor)(void *), void *pp)
......
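The *_LOCKED per-CPU accessors used above (and throughout the rest of this series) are not part of this diff; they come from the -rt per-CPU infrastructure, presumably include/linux/percpu.h, which is also the file this merge resolves. As a hedged sketch of the idea only, not the authoritative definitions, and omitting details such as the CPU-hotplug handling hinted at later in the free_zone_pagesets() comment: each per-CPU variable is paired with a per-CPU spinlock (a sleeping lock on PREEMPT_RT), so the data can be accessed with preemption enabled.

/* Sketch only -- the real macros live in the -rt percpu headers. */
#define DECLARE_PER_CPU_LOCKED(type, name)				\
	DECLARE_PER_CPU(spinlock_t, __lock_##name);			\
	DECLARE_PER_CPU(type, name)

#define DEFINE_PER_CPU_LOCKED(type, name)				\
	DEFINE_PER_CPU(spinlock_t, __lock_##name) =			\
		__SPIN_LOCK_UNLOCKED(__lock_##name);			\
	DEFINE_PER_CPU(type, name)

#define __get_cpu_lock(name, cpu)	per_cpu(__lock_##name, cpu)
#define __get_cpu_var_locked(name, cpu)	per_cpu(name, cpu)
#define per_cpu_var_locked(name, cpu)	per_cpu(name, cpu)

/*
 * Lock one CPU's instance, report which CPU was chosen and yield an
 * lvalue, so callers can write &get_cpu_var_locked(var, &cpu)[idx].
 * Migrating after raw_smp_processor_id() is harmless here: the
 * spinlock, not preempt-disable, protects the chosen instance.
 */
#define get_cpu_var_locked(name, cpup)					\
(*({									\
	int __cpu = raw_smp_processor_id();				\
									\
	*(cpup) = __cpu;						\
	spin_lock(&__get_cpu_lock(name, __cpu));			\
	&__get_cpu_var_locked(name, __cpu);				\
}))

#define put_cpu_var_locked(name, cpu)					\
	spin_unlock(&__get_cpu_lock(name, cpu))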
@@ -76,7 +76,12 @@ DECLARE_PER_CPU(struct vm_event_state, vm_event_states);
static inline void __count_vm_event(enum vm_event_item item)
{
#ifdef CONFIG_PREEMPT_RT
get_cpu_var(vm_event_states).event[item]++;
put_cpu();
#else
__get_cpu_var(vm_event_states).event[item]++;
#endif
}
static inline void count_vm_event(enum vm_event_item item)
@@ -87,7 +92,12 @@ static inline void count_vm_event(enum vm_event_item item)
static inline void __count_vm_events(enum vm_event_item item, long delta)
{
#ifdef CONFIG_PREEMPT_RT
get_cpu_var(vm_event_states).event[item] += delta;
put_cpu();
#else
__get_cpu_var(vm_event_states).event[item] += delta;
#endif
}
static inline void count_vm_events(enum vm_event_item item, long delta)
......
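For orientation only, not part of the patch: approximate definitions of the stock accessors, to motivate the PREEMPT_RT branch above. __get_cpu_var() assumes the caller already keeps the task from migrating, while get_cpu_var() pins it explicitly; on RT most spinlocked sections stay preemptible, so the explicit form is needed. The patch pairs get_cpu_var() with put_cpu(), and both end in preempt_enable(), so the pairing balances.

/* Approximate mainline accessors (illustration, not this patch): */
#define __get_cpu_var(var)	per_cpu(var, smp_processor_id())
#define get_cpu_var(var)	(*({ preempt_disable(); &__get_cpu_var(var); }))
#define put_cpu_var(var)	preempt_enable()
#define put_cpu()		preempt_enable()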
@@ -948,13 +948,14 @@ void mem_cgroup_update_mapped_file_stat(struct page *page, int val)
goto done;
/*
* Preemption is already disabled, we don't need get_cpu()
* Preemption is already disabled, we don't need get_cpu(), but
* that's not true for RT :)
*/
cpu = smp_processor_id();
cpu = get_cpu();
stat = &mem->stat;
cpustat = &stat->cpustat[cpu];
__mem_cgroup_stat_add_safe(cpustat, MEM_CGROUP_STAT_MAPPED_FILE, val);
put_cpu();
done:
unlock_page_cgroup(pc);
}
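(The reason, spelled out: as the updated comment says, the locking in this path no longer disables preemption on PREEMPT_RT, so the task could migrate between reading the CPU id and updating that CPU's stat slot; get_cpu()/put_cpu() pin it for the duration of the update.)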
......
@@ -922,10 +922,13 @@ static unsigned long unmap_page_range(struct mmu_gather *tlb,
return addr;
}
#ifdef CONFIG_PREEMPT
#if defined(CONFIG_PREEMPT) && !defined(CONFIG_PREEMPT_RT)
# define ZAP_BLOCK_SIZE (8 * PAGE_SIZE)
#else
/* No preempt: go for improved straight-line efficiency */
/*
* No preempt: go for improved straight-line efficiency
* on PREEMPT_RT this is not a critical latency-path.
*/
# define ZAP_BLOCK_SIZE (1024 * PAGE_SIZE)
#endif
......
@@ -161,6 +161,53 @@ static unsigned long __meminitdata dma_reserve;
EXPORT_SYMBOL(movable_zone);
#endif /* CONFIG_ARCH_POPULATES_NODE_MAP */
#ifdef CONFIG_PREEMPT_RT
static DEFINE_PER_CPU_LOCKED(int, pcp_locks);
#endif
static inline void __lock_cpu_pcp(unsigned long *flags, int cpu)
{
#ifdef CONFIG_PREEMPT_RT
spin_lock(&__get_cpu_lock(pcp_locks, cpu));
flags = 0;
#else
local_irq_save(*flags);
#endif
}
static inline void lock_cpu_pcp(unsigned long *flags, int *this_cpu)
{
#ifdef CONFIG_PREEMPT_RT
(void)get_cpu_var_locked(pcp_locks, this_cpu);
flags = 0;
#else
local_irq_save(*flags);
*this_cpu = smp_processor_id();
#endif
}
static inline void unlock_cpu_pcp(unsigned long flags, int this_cpu)
{
#ifdef CONFIG_PREEMPT_RT
put_cpu_var_locked(pcp_locks, this_cpu);
#else
local_irq_restore(flags);
#endif
}
static struct per_cpu_pageset *
get_zone_pcp(struct zone *zone, unsigned long *flags, int *this_cpu)
{
lock_cpu_pcp(flags, this_cpu);
return zone_pcp(zone, *this_cpu);
}
static void
put_zone_pcp(struct zone *zone, unsigned long flags, int this_cpu)
{
unlock_cpu_pcp(flags, this_cpu);
}
#if MAX_NUMNODES > 1
int nr_node_ids __read_mostly = MAX_NUMNODES;
int nr_online_nodes __read_mostly = 1;
@@ -523,7 +570,9 @@ static inline int free_pages_check(struct page *page)
static void free_pages_bulk(struct zone *zone, int count,
struct list_head *list, int order)
{
spin_lock(&zone->lock);
unsigned long flags;
spin_lock_irqsave(&zone->lock, flags);
zone_clear_flag(zone, ZONE_ALL_UNRECLAIMABLE);
zone->pages_scanned = 0;
@@ -536,27 +585,31 @@ static void free_pages_bulk(struct zone *zone, int count,
/* have to delete it as __free_one_page list manipulates */
list_del(&page->lru);
__free_one_page(page, zone, order, page_private(page));
#ifdef CONFIG_PREEMPT_RT
cond_resched_lock(&zone->lock);
#endif
}
spin_unlock(&zone->lock);
spin_unlock_irqrestore(&zone->lock, flags);
}
static void free_one_page(struct zone *zone, struct page *page, int order,
int migratetype)
{
spin_lock(&zone->lock);
unsigned long flags;
spin_lock_irqsave(&zone->lock, flags);
zone_clear_flag(zone, ZONE_ALL_UNRECLAIMABLE);
zone->pages_scanned = 0;
__mod_zone_page_state(zone, NR_FREE_PAGES, 1 << order);
__free_one_page(page, zone, order, migratetype);
spin_unlock(&zone->lock);
spin_unlock_irqrestore(&zone->lock, flags);
}
static void __free_pages_ok(struct page *page, unsigned int order)
{
unsigned long flags;
int i;
int bad = 0;
int i, this_cpu, bad = 0;
int wasMlocked = TestClearPageMlocked(page);
kmemcheck_free_shadow(page, order);
@@ -574,13 +627,13 @@ static void __free_pages_ok(struct page *page, unsigned int order)
arch_free_page(page, order);
kernel_map_pages(page, 1 << order, 0);
local_irq_save(flags);
lock_cpu_pcp(&flags, &this_cpu);
if (unlikely(wasMlocked))
free_page_mlock(page);
__count_vm_events(PGFREE, 1 << order);
count_vm_events(PGFREE, 1 << order);
unlock_cpu_pcp(flags, this_cpu);
free_one_page(page_zone(page), page, order,
get_pageblock_migratetype(page));
local_irq_restore(flags);
get_pageblock_migratetype(page));
}
/*
@@ -910,6 +963,16 @@ static int rmqueue_bulk(struct zone *zone, unsigned int order,
return i;
}
static void
isolate_pcp_pages(int count, struct list_head *src, struct list_head *dst)
{
while (count--) {
struct page *page = list_last_entry(src, struct page, lru);
list_move(&page->lru, dst);
}
}
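A reading aid, not part of the patch: isolate_pcp_pages() establishes the pattern the rest of this file now follows. Pages are first detached from the per-CPU pcp list into a private list while the pcp lock is held, and only afterwards handed to free_pages_bulk(), which now takes zone->lock (with IRQs saved) itself; in the hunks shown, the pcp lock and the zone lock are therefore never held at the same time. This is also why the list.h hunk at the top adds list_last_entry().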
#ifdef CONFIG_NUMA
/*
* Called from the vmstat counter updater to drain pagesets of this
@@ -921,17 +984,20 @@ static int rmqueue_bulk(struct zone *zone, unsigned int order,
*/
void drain_zone_pages(struct zone *zone, struct per_cpu_pages *pcp)
{
LIST_HEAD(free_list);
unsigned long flags;
int to_drain;
int this_cpu;
local_irq_save(flags);
lock_cpu_pcp(&flags, &this_cpu);
if (pcp->count >= pcp->batch)
to_drain = pcp->batch;
else
to_drain = pcp->count;
free_pages_bulk(zone, to_drain, &pcp->list, 0);
isolate_pcp_pages(to_drain, &pcp->list, &free_list);
pcp->count -= to_drain;
local_irq_restore(flags);
unlock_cpu_pcp(flags, this_cpu);
free_pages_bulk(zone, to_drain, &free_list, 0);
}
#endif
@@ -950,14 +1016,22 @@ static void drain_pages(unsigned int cpu)
for_each_populated_zone(zone) {
struct per_cpu_pageset *pset;
struct per_cpu_pages *pcp;
LIST_HEAD(free_list);
int count;
__lock_cpu_pcp(&flags, cpu);
pset = zone_pcp(zone, cpu);
if (!pset) {
unlock_cpu_pcp(flags, cpu);
WARN_ON(1);
continue;
}
pcp = &pset->pcp;
local_irq_save(flags);
free_pages_bulk(zone, pcp->count, &pcp->list, 0);
isolate_pcp_pages(pcp->count, &pcp->list, &free_list);
count = pcp->count;
pcp->count = 0;
local_irq_restore(flags);
unlock_cpu_pcp(flags, cpu);
free_pages_bulk(zone, count, &free_list, 0);
}
}
@@ -969,12 +1043,52 @@ void drain_local_pages(void *arg)
drain_pages(smp_processor_id());
}
#ifdef CONFIG_PREEMPT_RT
static void drain_local_pages_work(struct work_struct *wrk)
{
drain_pages(smp_processor_id());
}
#endif
/*
* Spill all the per-cpu pages from all CPUs back into the buddy allocator
*/
void drain_all_pages(void)
{
#ifdef CONFIG_PREEMPT_RT
/*
* HACK!!!!!
* For RT we can't use IPIs to run drain_local_pages, since
* that code will call spin_locks that will now sleep.
* But, schedule_on_each_cpu will call kzalloc, which will
* call page_alloc which was what calls this.
*
* Luckily, there's a condition to get here, and that is if
* the order passed in to alloc_pages is greater than 0
* (alloced more than a page size). The slabs only allocate
* what is needed, and the allocation made by schedule_on_each_cpu
* does an alloc of "sizeof(void *)*nr_cpu_ids".
*
* So we can safely call schedule_on_each_cpu if that number
* is less than a page. Otherwise don't bother. At least warn of
* this issue.
*
* And yes, this is one big hack. Please fix ;-)
*/
if (sizeof(void *)*nr_cpu_ids < PAGE_SIZE)
schedule_on_each_cpu(drain_local_pages_work);
else {
static int once;
if (!once) {
printk(KERN_ERR "Can't drain all CPUS due to possible recursion\n");
once = 1;
}
drain_local_pages(NULL);
}
#else
on_each_cpu(drain_local_pages, NULL, 1);
#endif
}
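As a concrete check of that size argument (assuming 8-byte pointers and 4 KiB pages, neither of which the patch states): schedule_on_each_cpu()'s allocation is nr_cpu_ids * 8 bytes, so the workqueue path is taken for up to 511 possible CPUs, while a configuration with 512 or more possible CPUs hits the printk fallback and only drains the local CPU.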
#ifdef CONFIG_HIBERNATION
@@ -1019,9 +1133,10 @@ void mark_free_pages(struct zone *zone)
static void free_hot_cold_page(struct page *page, int cold)
{
struct zone *zone = page_zone(page);
struct per_cpu_pageset *pset;
struct per_cpu_pages *pcp;
unsigned long flags;
int wasMlocked = TestClearPageMlocked(page);
int count, this_cpu, wasMlocked = TestClearPageMlocked(page);
kmemcheck_free_shadow(page, 0);
@@ -1037,12 +1152,12 @@ static void free_hot_cold_page(struct page *page, int cold)
arch_free_page(page, 0);
kernel_map_pages(page, 1, 0);
pcp = &zone_pcp(zone, get_cpu())->pcp;
pset = get_zone_pcp(zone, &flags, &this_cpu);
pcp = &pset->pcp;
set_page_private(page, get_pageblock_migratetype(page));
local_irq_save(flags);
if (unlikely(wasMlocked))
free_page_mlock(page);
__count_vm_event(PGFREE);
count_vm_event(PGFREE);
if (cold)
list_add_tail(&page->lru, &pcp->list);
@@ -1050,11 +1165,15 @@ static void free_hot_cold_page(struct page *page, int cold)
list_add(&page->lru, &pcp->list);
pcp->count++;
if (pcp->count >= pcp->high) {
free_pages_bulk(zone, pcp->batch, &pcp->list, 0);
LIST_HEAD(free_list);
isolate_pcp_pages(pcp->batch, &pcp->list, &free_list);
pcp->count -= pcp->batch;
}
local_irq_restore(flags);
put_cpu();
count = pcp->batch;
put_zone_pcp(zone, flags, this_cpu);
free_pages_bulk(zone, count, &free_list, 0);
} else
put_zone_pcp(zone, flags, this_cpu);
}
void free_hot_page(struct page *page)
@@ -1108,15 +1227,15 @@ struct page *buffered_rmqueue(struct zone *preferred_zone,
unsigned long flags;
struct page *page;
int cold = !!(gfp_flags & __GFP_COLD);
int cpu;
struct per_cpu_pageset *pset;
int this_cpu;
again:
cpu = get_cpu();
pset = get_zone_pcp(zone, &flags, &this_cpu);
if (likely(order == 0)) {
struct per_cpu_pages *pcp;
struct per_cpu_pages *pcp = &pset->pcp;
pcp = &zone_pcp(zone, cpu)->pcp;
local_irq_save(flags);
if (!pcp->count) {
pcp->count = rmqueue_bulk(zone, 0,
pcp->batch, &pcp->list, migratetype);
@@ -1158,7 +1277,7 @@ again:
*/
WARN_ON_ONCE(order > 1);
}
spin_lock_irqsave(&zone->lock, flags);
spin_lock(&zone->lock);
page = __rmqueue(zone, order, migratetype);
__mod_zone_page_state(zone, NR_FREE_PAGES, -(1 << order));
spin_unlock(&zone->lock);
@@ -1168,8 +1287,7 @@ again:
__count_zone_vm_events(PGALLOC, zone, 1 << order);
zone_statistics(preferred_zone, zone);
local_irq_restore(flags);
put_cpu();
put_zone_pcp(zone, flags, this_cpu);
VM_BUG_ON(bad_range(zone, page));
if (prep_new_page(page, order, gfp_flags))
@@ -1177,8 +1295,7 @@ again:
return page;
failed:
local_irq_restore(flags);
put_cpu();
put_zone_pcp(zone, flags, this_cpu);
return NULL;
}
@@ -3036,7 +3153,23 @@ static inline void free_zone_pagesets(int cpu)
struct zone *zone;
for_each_zone(zone) {
struct per_cpu_pageset *pset = zone_pcp(zone, cpu);
unsigned long flags;
struct per_cpu_pageset *pset;
/*
* On PREEMPT_RT the allocator is preemptible, therefore
* kstopmachine can preempt a process in the middle of an
* allocation, freeing the pset underneath such a process
* isn't a good idea.
*
* Take the per-cpu pcp lock to allow the task to complete
* before we free it. New tasks will be held off by the
* cpu_online() check in get_cpu_var_locked().
*/
__lock_cpu_pcp(&flags, cpu);
pset = zone_pcp(zone, cpu);
zone_pcp(zone, cpu) = NULL;
unlock_cpu_pcp(flags, cpu);
/* Free per_cpu_pageset if it is slab allocated */
if (pset != &boot_pageset[cpu])
......
@@ -19,7 +19,7 @@
#include <linux/module.h>
#include <linux/quicklist.h>
DEFINE_PER_CPU(struct quicklist, quicklist)[CONFIG_NR_QUICK];
DEFINE_PER_CPU_LOCKED(struct quicklist, quicklist)[CONFIG_NR_QUICK];
#define FRACTION_OF_NODE_MEM 16
@@ -66,17 +66,14 @@ void quicklist_trim(int nr, void (*dtor)(void *),
{
long pages_to_free;
struct quicklist *q;
int cpu;
q = &get_cpu_var(quicklist)[nr];
q = &get_cpu_var_locked(quicklist, &cpu)[nr];
if (q->nr_pages > min_pages) {
pages_to_free = min_pages_to_free(q, min_pages, max_free);
while (pages_to_free > 0) {
/*
* We pass a gfp_t of 0 to quicklist_alloc here
* because we will never call into the page allocator.
*/
void *p = quicklist_alloc(nr, 0, NULL);
void *p = __quicklist_alloc(q);
if (dtor)
dtor(p);
@@ -84,7 +81,7 @@ void quicklist_trim(int nr, void (*dtor)(void *),
pages_to_free--;
}
}
put_cpu_var(quicklist);
put_cpu_var_locked(quicklist, cpu);
}
unsigned long quicklist_total_size(void)
@@ -94,7 +91,7 @@ unsigned long quicklist_total_size(void)
struct quicklist *ql, *q;
for_each_online_cpu(cpu) {
ql = per_cpu(quicklist, cpu);
ql = per_cpu_var_locked(quicklist, cpu);
for (q = ql; q < ql + CONFIG_NR_QUICK; q++)
count += q->nr_pages;
}
......
@@ -30,15 +30,92 @@
#include <linux/notifier.h>
#include <linux/backing-dev.h>
#include <linux/memcontrol.h>
#include <linux/interrupt.h>
#include "internal.h"
/* How many pages do we try to swap or page in/out together? */
int page_cluster;
#ifdef CONFIG_PREEMPT_RT
/*
* On PREEMPT_RT we don't want to disable preemption for cpu variables.
* We grab a cpu and then use that cpu to lock the variables accordingly.
*
* (On !PREEMPT_RT this turns into normal preempt-off sections, as before.)
*/
static DEFINE_PER_CPU_LOCKED(struct pagevec[NR_LRU_LISTS], lru_add_pvecs);
static DEFINE_PER_CPU_LOCKED(struct pagevec, lru_rotate_pvecs);
#define swap_get_cpu_var_irq_save(var, flags, cpu) \
({ \
(void)flags; \
&get_cpu_var_locked(var, &cpu); \
})
#define swap_put_cpu_var_irq_restore(var, flags, cpu) \
put_cpu_var_locked(var, cpu)
#define swap_get_cpu_var(var, cpu) \
&get_cpu_var_locked(var, &cpu)
#define swap_put_cpu_var(var, cpu) \
put_cpu_var_locked(var, cpu)
#define swap_per_cpu_lock(var, cpu) \
({ \
spin_lock(&__get_cpu_lock(var, cpu)); \
&__get_cpu_var_locked(var, cpu); \
})
#define swap_per_cpu_unlock(var, cpu) \
spin_unlock(&__get_cpu_lock(var, cpu));
#define swap_get_cpu() raw_smp_processor_id()
#define swap_put_cpu() do { } while (0)
#define swap_irq_save(flags) do { (void)flags; } while (0)
#define swap_irq_restore(flags) do { (void)flags; } while (0)
#else
static DEFINE_PER_CPU(struct pagevec[NR_LRU_LISTS], lru_add_pvecs);
static DEFINE_PER_CPU(struct pagevec, lru_rotate_pvecs);
#define swap_get_cpu_var_irq_save(var, flags, cpu) \
({ \
(void)cpu; \
local_irq_save(flags); \
&__get_cpu_var(var); \
})
#define swap_put_cpu_var_irq_restore(var, flags, cpu) \
local_irq_restore(flags)
#define swap_get_cpu_var(var, cpu) \
({ \
(void)cpu; \
&get_cpu_var(var); \
})
#define swap_put_cpu_var(var, cpu) put_cpu_var(var)
#define swap_per_cpu_lock(var, cpu) &per_cpu(var, cpu)
#define swap_per_cpu_unlock(var, cpu) do { } while (0)
#define swap_get_cpu() get_cpu()
#define swap_put_cpu() put_cpu()
#define swap_irq_save(flags) local_irq_save(flags)
#define swap_irq_restore(flags) local_irq_restore(flags)
#endif
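An illustration, not part of the commit: with these wrappers each call site is written once and only the expansion differs between configurations. For rotate_reclaimable_page() below, roughly:

/*
 * Approximate expansions of the wrappers above for one call site:
 *
 * PREEMPT_RT:
 *	pvec = &get_cpu_var_locked(lru_rotate_pvecs, &cpu);
 *	...				(per-CPU spinlock held, IRQs stay on)
 *	put_cpu_var_locked(lru_rotate_pvecs, cpu);
 *
 * !PREEMPT_RT:
 *	local_irq_save(flags);
 *	pvec = &__get_cpu_var(lru_rotate_pvecs);
 *	...				(IRQs off pin the CPU and exclude the IRQ path)
 *	local_irq_restore(flags);
 */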
/*
* This path almost never happens for VM activity - pages are normally
* freed via pagevecs. But it gets used by networking.
@@ -141,13 +218,13 @@ void rotate_reclaimable_page(struct page *page)
!PageUnevictable(page) && PageLRU(page)) {
struct pagevec *pvec;
unsigned long flags;
int cpu;
page_cache_get(page);
local_irq_save(flags);
pvec = &__get_cpu_var(lru_rotate_pvecs);
pvec = swap_get_cpu_var_irq_save(lru_rotate_pvecs, flags, cpu);
if (!pagevec_add(pvec, page))
pagevec_move_tail(pvec);
local_irq_restore(flags);
swap_put_cpu_var_irq_restore(lru_rotate_pvecs, flags, cpu);
}
}
@@ -216,12 +293,14 @@ EXPORT_SYMBOL(mark_page_accessed);
void __lru_cache_add(struct page *page, enum lru_list lru)
{
struct pagevec *pvec = &get_cpu_var(lru_add_pvecs)[lru];
struct pagevec *pvec;
int cpu;
pvec = swap_get_cpu_var(lru_add_pvecs, cpu)[lru];
page_cache_get(page);
if (!pagevec_add(pvec, page))
____pagevec_lru_add(pvec, lru);
put_cpu_var(lru_add_pvecs);
swap_put_cpu_var(lru_add_pvecs, cpu);
}
/**
@@ -271,31 +350,33 @@ void add_page_to_unevictable_list(struct page *page)
*/
static void drain_cpu_pagevecs(int cpu)
{
struct pagevec *pvecs = per_cpu(lru_add_pvecs, cpu);
struct pagevec *pvec;
struct pagevec *pvecs, *pvec;
int lru;
pvecs = swap_per_cpu_lock(lru_add_pvecs, cpu)[0];
for_each_lru(lru) {
pvec = &pvecs[lru - LRU_BASE];
if (pagevec_count(pvec))
____pagevec_lru_add(pvec, lru);
}
swap_per_cpu_unlock(lru_add_pvecs, cpu);
pvec = &per_cpu(lru_rotate_pvecs, cpu);
pvec = swap_per_cpu_lock(lru_rotate_pvecs, cpu);
if (pagevec_count(pvec)) {
unsigned long flags;
/* No harm done if a racing interrupt already did this */
local_irq_save(flags);
swap_irq_save(flags);
pagevec_move_tail(pvec);
local_irq_restore(flags);
swap_irq_restore(flags);
}
swap_per_cpu_unlock(lru_rotate_pvecs, cpu);
}
void lru_add_drain(void)
{
drain_cpu_pagevecs(get_cpu());
put_cpu();
drain_cpu_pagevecs(swap_get_cpu());
swap_put_cpu();
}
static void lru_add_drain_per_cpu(struct work_struct *dummy)
@@ -369,7 +450,7 @@ void release_pages(struct page **pages, int nr, int cold)
}
__pagevec_free(&pages_to_free);
pagevec_reinit(&pages_to_free);
}
}
}
if (zone)
spin_unlock_irqrestore(&zone->lru_lock, flags);
......
@@ -23,6 +23,7 @@
#include <linux/file.h>
#include <linux/writeback.h>
#include <linux/blkdev.h>
#include <linux/interrupt.h>
#include <linux/buffer_head.h> /* for try_to_release_page(),
buffer_heads_over_limit */
#include <linux/mm_inline.h>
@@ -1118,7 +1119,7 @@ static unsigned long shrink_inactive_list(unsigned long max_scan,
}
nr_reclaimed += nr_freed;
local_irq_disable();
local_irq_disable_nort();
if (current_is_kswapd()) {
__count_zone_vm_events(PGSCAN_KSWAPD, zone, nr_scan);
__count_vm_events(KSWAPD_STEAL, nr_freed);
@@ -1159,9 +1160,14 @@ static unsigned long shrink_inactive_list(unsigned long max_scan,
}
}
} while (nr_scanned < max_scan);
/*
* Non-PREEMPT_RT relies on IRQs-off protecting the page_states
* per-CPU data. PREEMPT_RT has that data protected even in
* __mod_page_state(), so no need to keep IRQs disabled.
*/
spin_unlock(&zone->lru_lock);
done:
local_irq_enable();
local_irq_enable_nort();
pagevec_release(&pvec);
return nr_reclaimed;
}
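The local_irq_disable_nort()/local_irq_enable_nort() helpers are not defined in this diff; they come from elsewhere in the RT patch queue (presumably the reason this file gains #include <linux/interrupt.h> above). A hedged sketch of their intent:

/* Sketch only: "nort" = skip the IRQ-off section on PREEMPT_RT. */
#ifdef CONFIG_PREEMPT_RT
# define local_irq_disable_nort()	do { } while (0)
# define local_irq_enable_nort()	do { } while (0)
#else
# define local_irq_disable_nort()	local_irq_disable()
# define local_irq_enable_nort()	local_irq_enable()
#endif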
......
@@ -149,17 +149,16 @@ static void refresh_zone_stat_thresholds(void)
void __mod_zone_page_state(struct zone *zone, enum zone_stat_item item,
int delta)
{
struct per_cpu_pageset *pcp = zone_pcp(zone, smp_processor_id());
struct per_cpu_pageset *pcp = zone_pcp(zone, get_cpu());
s8 *p = pcp->vm_stat_diff + item;
long x;
x = delta + *p;
long x = delta + *p;
if (unlikely(x > pcp->stat_threshold || x < -pcp->stat_threshold)) {
zone_page_state_add(x, zone, item);
x = 0;
}
*p = x;
put_cpu();
}
EXPORT_SYMBOL(__mod_zone_page_state);
@@ -202,7 +201,7 @@ EXPORT_SYMBOL(mod_zone_page_state);
*/
void __inc_zone_state(struct zone *zone, enum zone_stat_item item)
{
struct per_cpu_pageset *pcp = zone_pcp(zone, smp_processor_id());
struct per_cpu_pageset *pcp = zone_pcp(zone, get_cpu());
s8 *p = pcp->vm_stat_diff + item;
(*p)++;
@@ -213,17 +212,28 @@ void __inc_zone_state(struct zone *zone, enum zone_stat_item item)
zone_page_state_add(*p + overstep, zone, item);
*p = -overstep;
}
put_cpu();
}
void __inc_zone_page_state(struct page *page, enum zone_stat_item item)
{
#ifdef CONFIG_PREEMPT_RT
unsigned long flags;
struct zone *zone;
zone = page_zone(page);
local_irq_save(flags);
__inc_zone_state(zone, item);
local_irq_restore(flags);
#else
__inc_zone_state(page_zone(page), item);
#endif
}
EXPORT_SYMBOL(__inc_zone_page_state);
void __dec_zone_state(struct zone *zone, enum zone_stat_item item)
{
struct per_cpu_pageset *pcp = zone_pcp(zone, smp_processor_id());
struct per_cpu_pageset *pcp = zone_pcp(zone, get_cpu());
s8 *p = pcp->vm_stat_diff + item;
(*p)--;
@@ -234,6 +244,7 @@ static void __dec_zone_state(struct zone *zone, enum zone_stat_item item)
zone_page_state_add(*p - overstep, zone, item);
*p = overstep;
}
put_cpu();
}
void __dec_zone_page_state(struct page *page, enum zone_stat_item item)
......