Commit 8ad4b1fb authored by Rohit Seth, committed by Linus Torvalds

[PATCH] Make high and batch sizes of per_cpu_pagelists configurable

There has recently been a lot of discussion about the right values for the
batch and high water marks of the per_cpu_pagelists.  This patch makes these
two variables configurable through the /proc interface.

A new tunable /proc/sys/vm/percpu_pagelist_fraction is added.  This entry
controls the fraction of pages at most in each zone that are allocated for
each per cpu page list.  The min value for this is 8.  It means that we
don't allow more than 1/8th of pages in each zone to be allocated in any
single per_cpu_pagelist.

The batch value of each per cpu pagelist is also updated as a result.  It
is set to pcp->high/4, with a lower limit of 1 and an upper limit of
(PAGE_SHIFT * 8).
Signed-off-by: Rohit Seth <rohit.seth@intel.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
parent 9d0243bc
...@@ -103,3 +103,20 @@ This is used to force the Linux VM to keep a minimum number ...@@ -103,3 +103,20 @@ This is used to force the Linux VM to keep a minimum number
of kilobytes free. The VM uses this number to compute a pages_min of kilobytes free. The VM uses this number to compute a pages_min
value for each lowmem zone in the system. Each lowmem zone gets value for each lowmem zone in the system. Each lowmem zone gets
a number of reserved free pages based proportionally on its size. a number of reserved free pages based proportionally on its size.
==============================================================
percpu_pagelist_fraction
This sets the maximum fraction of pages in each zone (the high mark, pcp->high)
that may be allocated to each per cpu page list, expressed as a divisor. The
min value for this is 8. It means that we don't allow more than 1/8th of pages
in each zone to be allocated in any single per_cpu_pagelist. This entry only
changes the value of hot per cpu pagelists. The user can specify a number like
100 to allocate 1/100th of each zone to each per cpu page list.
The batch value of each per cpu pagelist is also updated as a result. It is
set to pcp->high/4, with a lower limit of 1 and an upper limit of
(PAGE_SHIFT * 8).
The initial value is zero. While it is zero, the kernel does not use this
value at boot time to set the high water marks for each per cpu page list.
...@@ -437,6 +437,8 @@ int min_free_kbytes_sysctl_handler(struct ctl_table *, int, struct file *, ...@@ -437,6 +437,8 @@ int min_free_kbytes_sysctl_handler(struct ctl_table *, int, struct file *,
extern int sysctl_lowmem_reserve_ratio[MAX_NR_ZONES-1]; extern int sysctl_lowmem_reserve_ratio[MAX_NR_ZONES-1];
int lowmem_reserve_ratio_sysctl_handler(struct ctl_table *, int, struct file *, int lowmem_reserve_ratio_sysctl_handler(struct ctl_table *, int, struct file *,
void __user *, size_t *, loff_t *); void __user *, size_t *, loff_t *);
int percpu_pagelist_fraction_sysctl_handler(struct ctl_table *, int, struct file *,
void __user *, size_t *, loff_t *);
#include <linux/topology.h> #include <linux/topology.h>
/* Returns the number of the current Node. */ /* Returns the number of the current Node. */
......
...@@ -181,6 +181,7 @@ enum ...@@ -181,6 +181,7 @@ enum
VM_LEGACY_VA_LAYOUT=27, /* legacy/compatibility virtual address space layout */ VM_LEGACY_VA_LAYOUT=27, /* legacy/compatibility virtual address space layout */
VM_SWAP_TOKEN_TIMEOUT=28, /* default time for token time out */ VM_SWAP_TOKEN_TIMEOUT=28, /* default time for token time out */
VM_DROP_PAGECACHE=29, /* int: nuke lots of pagecache */ VM_DROP_PAGECACHE=29, /* int: nuke lots of pagecache */
VM_PERCPU_PAGELIST_FRACTION=30,/* int: fraction of pages in each percpu_pagelist */
}; };
......
...@@ -69,6 +69,7 @@ extern int printk_ratelimit_jiffies; ...@@ -69,6 +69,7 @@ extern int printk_ratelimit_jiffies;
extern int printk_ratelimit_burst; extern int printk_ratelimit_burst;
extern int pid_max_min, pid_max_max; extern int pid_max_min, pid_max_max;
extern int sysctl_drop_caches; extern int sysctl_drop_caches;
extern int percpu_pagelist_fraction;
#if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_X86) #if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_X86)
int unknown_nmi_panic; int unknown_nmi_panic;
...@@ -79,6 +80,7 @@ extern int proc_unknown_nmi_panic(ctl_table *, int, struct file *, ...@@ -79,6 +80,7 @@ extern int proc_unknown_nmi_panic(ctl_table *, int, struct file *,
/* this is needed for the proc_dointvec_minmax for [fs_]overflow UID and GID */ /* this is needed for the proc_dointvec_minmax for [fs_]overflow UID and GID */
static int maxolduid = 65535; static int maxolduid = 65535;
static int minolduid; static int minolduid;
static int min_percpu_pagelist_fract = 8;
static int ngroups_max = NGROUPS_MAX; static int ngroups_max = NGROUPS_MAX;
...@@ -794,6 +796,16 @@ static ctl_table vm_table[] = { ...@@ -794,6 +796,16 @@ static ctl_table vm_table[] = {
.strategy = &sysctl_intvec, .strategy = &sysctl_intvec,
.extra1 = &zero, .extra1 = &zero,
}, },
{
.ctl_name = VM_PERCPU_PAGELIST_FRACTION,
.procname = "percpu_pagelist_fraction",
.data = &percpu_pagelist_fraction,
.maxlen = sizeof(percpu_pagelist_fraction),
.mode = 0644,
.proc_handler = &percpu_pagelist_fraction_sysctl_handler,
.strategy = &sysctl_intvec,
.extra1 = &min_percpu_pagelist_fract,
},
#ifdef CONFIG_MMU #ifdef CONFIG_MMU
{ {
.ctl_name = VM_MAX_MAP_COUNT, .ctl_name = VM_MAX_MAP_COUNT,
......
...@@ -53,6 +53,7 @@ struct pglist_data *pgdat_list __read_mostly; ...@@ -53,6 +53,7 @@ struct pglist_data *pgdat_list __read_mostly;
unsigned long totalram_pages __read_mostly; unsigned long totalram_pages __read_mostly;
unsigned long totalhigh_pages __read_mostly; unsigned long totalhigh_pages __read_mostly;
long nr_swap_pages; long nr_swap_pages;
int percpu_pagelist_fraction;
static void fastcall free_hot_cold_page(struct page *page, int cold); static void fastcall free_hot_cold_page(struct page *page, int cold);
...@@ -1831,6 +1832,24 @@ inline void setup_pageset(struct per_cpu_pageset *p, unsigned long batch) ...@@ -1831,6 +1832,24 @@ inline void setup_pageset(struct per_cpu_pageset *p, unsigned long batch)
INIT_LIST_HEAD(&pcp->list); INIT_LIST_HEAD(&pcp->list);
} }
/*
 * setup_pagelist_highmark() - apply a new high water mark to the hot
 * per-cpu page list of pageset @p.
 *
 * @p:    pageset whose hot list (pcp[0]) is updated
 * @high: new value for the hot list's ->high
 *
 * The list's ->batch is derived from @high: it is high/4, but never less
 * than 1 and never more than PAGE_SHIFT * 8.
 */
static void setup_pagelist_highmark(struct per_cpu_pageset *p,
				unsigned long high)
{
	struct per_cpu_pages *hot = &p->pcp[0];	/* hot list */
	unsigned long quarter = high / 4;

	hot->high = high;

	if (quarter > (PAGE_SHIFT * 8))
		hot->batch = PAGE_SHIFT * 8;	/* cap at the upper limit */
	else
		hot->batch = max(1UL, quarter);	/* keep batch at least 1 */
}
#ifdef CONFIG_NUMA #ifdef CONFIG_NUMA
/* /*
* Boot pageset table. One per cpu which is going to be used for all * Boot pageset table. One per cpu which is going to be used for all
...@@ -1868,6 +1887,10 @@ static int __devinit process_zones(int cpu) ...@@ -1868,6 +1887,10 @@ static int __devinit process_zones(int cpu)
goto bad; goto bad;
setup_pageset(zone->pageset[cpu], zone_batchsize(zone)); setup_pageset(zone->pageset[cpu], zone_batchsize(zone));
if (percpu_pagelist_fraction)
setup_pagelist_highmark(zone_pcp(zone, cpu),
(zone->present_pages / percpu_pagelist_fraction));
} }
return 0; return 0;
...@@ -2567,6 +2590,32 @@ int lowmem_reserve_ratio_sysctl_handler(ctl_table *table, int write, ...@@ -2567,6 +2590,32 @@ int lowmem_reserve_ratio_sysctl_handler(ctl_table *table, int write,
return 0; return 0;
} }
/*
 * percpu_pagelist_fraction - changes the pcp->high for each zone on each
 * cpu.  It is the fraction of total pages in each zone that a hot per cpu
 * pagelist can have before it gets flushed back to buddy allocator.
 *
 * The value is parsed by proc_dointvec_minmax().  On a read, or when that
 * helper reports any error, its return value is passed straight back to
 * the caller and no pageset is touched.  After a successful write, every
 * online cpu's hot pagelist in every zone gets
 *	high = zone->present_pages / percpu_pagelist_fraction
 * via setup_pagelist_highmark(), and 0 is returned.
 */
int percpu_pagelist_fraction_sysctl_handler(ctl_table *table, int write,
	struct file *file, void __user *buffer, size_t *length, loff_t *ppos)
{
	struct zone *zone;
	unsigned int cpu;
	int fraction;
	int ret;

	ret = proc_dointvec_minmax(table, write, file, buffer, length, ppos);
	/* Propagate every failure, not only -EINVAL. */
	if (!write || ret < 0)
		return ret;

	/*
	 * Read the divisor once so the check below and the divisions use
	 * the same value.  It is still at its initial value of zero if the
	 * write did not store anything; in that case there is nothing to
	 * apply, and dividing by it must be avoided.
	 */
	fraction = percpu_pagelist_fraction;
	if (fraction <= 0)
		return 0;

	for_each_zone(zone) {
		/* Same for every cpu of this zone, so compute it once. */
		unsigned long high = zone->present_pages / fraction;

		for_each_online_cpu(cpu)
			setup_pagelist_highmark(zone_pcp(zone, cpu), high);
	}
	return 0;
}
__initdata int hashdist = HASHDIST_DEFAULT; __initdata int hashdist = HASHDIST_DEFAULT;
#ifdef CONFIG_NUMA #ifdef CONFIG_NUMA
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment