Commit ff3fd6af authored by Ingo Molnar, committed by Thomas Gleixner

mm: page_alloc: rt-friendly per-cpu pages

RT-friendly per-cpu pages: convert the irqs-off per-cpu locking
method into a preemptible method that takes explicit per-cpu locks.
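
Roughly, every pcp call site moves from the irqs-off pattern to an
explicit lock/unlock helper pair. A simplified sketch of the
free_hot_cold_page() change below (the elided body is unchanged):

	/* before: disabling interrupts pins the task and forbids preemption */
	pcp = &zone_pcp(zone, get_cpu())->pcp;
	local_irq_save(flags);
	...
	local_irq_restore(flags);
	put_cpu();

	/* after: an explicit per-cpu lock (a sleeping lock on PREEMPT_RT,
	 * plain local_irq_save() plus smp_processor_id() otherwise) */
	pset = get_zone_pcp(zone, &flags, &this_cpu);
	pcp = &pset->pcp;
	...
	put_zone_pcp(zone, flags, this_cpu);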

Contains fixes from:
	 Peter Zijlstra <a.p.zijlstra@chello.nl>
	 Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
parent 4672c440
@@ -161,6 +161,53 @@ static unsigned long __meminitdata dma_reserve;
EXPORT_SYMBOL(movable_zone);
#endif /* CONFIG_ARCH_POPULATES_NODE_MAP */
#ifdef CONFIG_PREEMPT_RT
static DEFINE_PER_CPU_LOCKED(int, pcp_locks);
#endif
static inline void __lock_cpu_pcp(unsigned long *flags, int cpu)
{
#ifdef CONFIG_PREEMPT_RT
spin_lock(&__get_cpu_lock(pcp_locks, cpu));
flags = 0;
#else
local_irq_save(*flags);
#endif
}
static inline void lock_cpu_pcp(unsigned long *flags, int *this_cpu)
{
#ifdef CONFIG_PREEMPT_RT
(void)get_cpu_var_locked(pcp_locks, this_cpu);
flags = 0;
#else
local_irq_save(*flags);
*this_cpu = smp_processor_id();
#endif
}
static inline void unlock_cpu_pcp(unsigned long flags, int this_cpu)
{
#ifdef CONFIG_PREEMPT_RT
put_cpu_var_locked(pcp_locks, this_cpu);
#else
local_irq_restore(flags);
#endif
}
static struct per_cpu_pageset *
get_zone_pcp(struct zone *zone, unsigned long *flags, int *this_cpu)
{
lock_cpu_pcp(flags, this_cpu);
return zone_pcp(zone, *this_cpu);
}
static void
put_zone_pcp(struct zone *zone, unsigned long flags, int this_cpu)
{
unlock_cpu_pcp(flags, this_cpu);
}
#if MAX_NUMNODES > 1
int nr_node_ids __read_mostly = MAX_NUMNODES;
int nr_online_nodes __read_mostly = 1;
@@ -555,8 +602,7 @@ static void free_one_page(struct zone *zone, struct page *page, int order,
static void __free_pages_ok(struct page *page, unsigned int order)
{
unsigned long flags;
int i;
int bad = 0;
int i, this_cpu, bad = 0;
int wasMlocked = TestClearPageMlocked(page);
kmemcheck_free_shadow(page, order);
@@ -574,13 +620,13 @@ static void __free_pages_ok(struct page *page, unsigned int order)
arch_free_page(page, order);
kernel_map_pages(page, 1 << order, 0);
local_irq_save(flags);
lock_cpu_pcp(&flags, &this_cpu);
if (unlikely(wasMlocked))
free_page_mlock(page);
__count_vm_events(PGFREE, 1 << order);
count_vm_events(PGFREE, 1 << order);
unlock_cpu_pcp(flags, this_cpu);
free_one_page(page_zone(page), page, order,
get_pageblock_migratetype(page));
local_irq_restore(flags);
get_pageblock_migratetype(page));
}
/*
@@ -923,15 +969,16 @@ void drain_zone_pages(struct zone *zone, struct per_cpu_pages *pcp)
{
unsigned long flags;
int to_drain;
int this_cpu;
local_irq_save(flags);
lock_cpu_pcp(&flags, &this_cpu);
if (pcp->count >= pcp->batch)
to_drain = pcp->batch;
else
to_drain = pcp->count;
free_pages_bulk(zone, to_drain, &pcp->list, 0);
pcp->count -= to_drain;
local_irq_restore(flags);
unlock_cpu_pcp(flags, this_cpu);
}
#endif
@@ -951,13 +998,17 @@ static void drain_pages(unsigned int cpu)
struct per_cpu_pageset *pset;
struct per_cpu_pages *pcp;
__lock_cpu_pcp(&flags, cpu);
pset = zone_pcp(zone, cpu);
if (!pset) {
unlock_cpu_pcp(flags, cpu);
WARN_ON(1);
continue;
}
pcp = &pset->pcp;
local_irq_save(flags);
free_pages_bulk(zone, pcp->count, &pcp->list, 0);
pcp->count = 0;
local_irq_restore(flags);
unlock_cpu_pcp(flags, cpu);
}
}
@@ -969,12 +1020,52 @@ void drain_local_pages(void *arg)
drain_pages(smp_processor_id());
}
#ifdef CONFIG_PREEMPT_RT
static void drain_local_pages_work(struct work_struct *wrk)
{
drain_pages(smp_processor_id());
}
#endif
/*
* Spill all the per-cpu pages from all CPUs back into the buddy allocator
*/
void drain_all_pages(void)
{
#ifdef CONFIG_PREEMPT_RT
/*
* HACK!!!!!
* For RT we can't use IPIs to run drain_local_pages, since
* that code will call spin_locks that will now sleep.
* But, schedule_on_each_cpu will call kzalloc, which will
* call page_alloc which was what calls this.
*
* Luckily, there's a condition to get here, and that is if
* the order passed in to alloc_pages is greater than 0
* (alloced more than a page size). The slabs only allocate
* what is needed, and the allocation made by schedule_on_each_cpu
* does an alloc of "sizeof(void *)*nr_cpu_ids".
*
* So we can safely call schedule_on_each_cpu if that number
* is less than a page. Otherwise don't bother. At least warn of
* this issue.
*
* And yes, this is one big hack. Please fix ;-)
*/
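/*
 * For example, assuming 8-byte pointers and a 4096-byte PAGE_SIZE:
 * with nr_cpu_ids >= 512 the helper allocation is 8 * 512 = 4096
 * bytes or more, so such configs take the fallback branch below.
 */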
if (sizeof(void *)*nr_cpu_ids < PAGE_SIZE)
schedule_on_each_cpu(drain_local_pages_work);
else {
static int once;
if (!once) {
printk(KERN_ERR "Can't drain all CPUS due to possible recursion\n");
once = 1;
}
drain_local_pages(NULL);
}
#else
on_each_cpu(drain_local_pages, NULL, 1);
#endif
}
#ifdef CONFIG_HIBERNATION
@@ -1019,9 +1110,10 @@ void mark_free_pages(struct zone *zone)
static void free_hot_cold_page(struct page *page, int cold)
{
struct zone *zone = page_zone(page);
struct per_cpu_pageset *pset;
struct per_cpu_pages *pcp;
unsigned long flags;
int wasMlocked = TestClearPageMlocked(page);
int this_cpu, wasMlocked = TestClearPageMlocked(page);
kmemcheck_free_shadow(page, 0);
@@ -1037,12 +1129,12 @@ static void free_hot_cold_page(struct page *page, int cold)
arch_free_page(page, 0);
kernel_map_pages(page, 1, 0);
pcp = &zone_pcp(zone, get_cpu())->pcp;
pset = get_zone_pcp(zone, &flags, &this_cpu);
pcp = &pset->pcp;
set_page_private(page, get_pageblock_migratetype(page));
local_irq_save(flags);
if (unlikely(wasMlocked))
free_page_mlock(page);
__count_vm_event(PGFREE);
count_vm_event(PGFREE);
if (cold)
list_add_tail(&page->lru, &pcp->list);
@@ -1053,8 +1145,7 @@ static void free_hot_cold_page(struct page *page, int cold)
free_pages_bulk(zone, pcp->batch, &pcp->list, 0);
pcp->count -= pcp->batch;
}
local_irq_restore(flags);
put_cpu();
put_zone_pcp(zone, flags, this_cpu);
}
void free_hot_page(struct page *page)
@@ -1108,15 +1199,15 @@ struct page *buffered_rmqueue(struct zone *preferred_zone,
unsigned long flags;
struct page *page;
int cold = !!(gfp_flags & __GFP_COLD);
int cpu;
struct per_cpu_pageset *pset;
int this_cpu;
again:
cpu = get_cpu();
pset = get_zone_pcp(zone, &flags, &this_cpu);
if (likely(order == 0)) {
struct per_cpu_pages *pcp;
struct per_cpu_pages *pcp = &pset->pcp;
pcp = &zone_pcp(zone, cpu)->pcp;
local_irq_save(flags);
if (!pcp->count) {
pcp->count = rmqueue_bulk(zone, 0,
pcp->batch, &pcp->list, migratetype);
@@ -1158,7 +1249,7 @@ again:
*/
WARN_ON_ONCE(order > 1);
}
spin_lock_irqsave(&zone->lock, flags);
spin_lock(&zone->lock);
page = __rmqueue(zone, order, migratetype);
__mod_zone_page_state(zone, NR_FREE_PAGES, -(1 << order));
spin_unlock(&zone->lock);
@@ -1168,8 +1259,7 @@ again:
__count_zone_vm_events(PGALLOC, zone, 1 << order);
zone_statistics(preferred_zone, zone);
local_irq_restore(flags);
put_cpu();
put_zone_pcp(zone, flags, this_cpu);
VM_BUG_ON(bad_range(zone, page));
if (prep_new_page(page, order, gfp_flags))
@@ -1177,8 +1267,7 @@ again:
return page;
failed:
local_irq_restore(flags);
put_cpu();
put_zone_pcp(zone, flags, this_cpu);
return NULL;
}
@@ -3036,7 +3125,23 @@ static inline void free_zone_pagesets(int cpu)
struct zone *zone;
for_each_zone(zone) {
struct per_cpu_pageset *pset = zone_pcp(zone, cpu);
unsigned long flags;
struct per_cpu_pageset *pset;
/*
* On PREEMPT_RT the allocator is preemptible, therefore
* kstopmachine can preempt a process in the middle of an
* allocation, freeing the pset underneath such a process
* isn't a good idea.
*
* Take the per-cpu pcp lock to allow the task to complete
* before we free it. New tasks will be held off by the
* cpu_online() check in get_cpu_var_locked().
*/
__lock_cpu_pcp(&flags, cpu);
pset = zone_pcp(zone, cpu);
zone_pcp(zone, cpu) = NULL;
unlock_cpu_pcp(flags, cpu);
/* Free per_cpu_pageset if it is slab allocated */
if (pset != &boot_pageset[cpu])
......