Commit a530b795 authored by Tejun Heo

percpu: teach large page allocator about NUMA

The large page first chunk allocator is primarily used on NUMA machines;
however, its NUMA handling is extremely simplistic.  Regardless of
their proximity, each cpu is put into a separate large page, only to
return most of the allocated space, wasting a large amount of
vmalloc space and increasing the cache footprint.

This patch teaches the large page allocator about NUMA.  Given
processor proximity information, pcpu_lpage_build_unit_map() will find
a fitting cpu -> unit mapping in which cpus within LOCAL_DISTANCE share
the same large page and not too much virtual address space is wasted.

This greatly reduces the unit and thus chunk size and wastes much less
address space for the first chunk.  For example, on 4/4 NUMA machine,
the original code occupied 16MB of virtual space for the first chunk
while the new code only uses 4MB - one 2MB page for each node.

[ Impact: much better space efficiency on NUMA machines ]
Signed-off-by: Tejun Heo <tj@kernel.org>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: Jan Beulich <JBeulich@novell.com>
Cc: Andi Kleen <andi@firstfloor.org>
Cc: David Miller <davem@davemloft.net>
parent 2f39e637
......@@ -149,36 +149,73 @@ static void __init pcpul_map(void *ptr, size_t size, void *addr)
set_pmd(pmd, pmd_v);
}
/*
 * Distance callback handed to pcpu_lpage_build_unit_map().
 *
 * Two cpus are considered LOCAL_DISTANCE apart when early_cpu_to_node()
 * reports the same node for both, and REMOTE_DISTANCE apart otherwise.
 */
static int pcpu_lpage_cpu_distance(unsigned int from, unsigned int to)
{
	const int same_node = early_cpu_to_node(from) == early_cpu_to_node(to);

	return same_node ? LOCAL_DISTANCE : REMOTE_DISTANCE;
}
/*
 * NOTE(review): this listing comes from a rendered diff, so lines removed by
 * the patch and lines added by it appear interleaved without +/- markers
 * (tot_size is declared twice, statements follow a return, and
 * pcpu_lpage_first_chunk() is called twice).  Read it as a diff, not as
 * compilable C.
 *
 * Set up the first percpu chunk through the large page allocator.
 *
 * @static_size: forwarded to pcpu_lpage_build_unit_map() and
 *               pcpu_lpage_first_chunk()
 * @chosen: when false, the non-NUMA check and the vmalloc usage check below
 *          are applied; when true they are bypassed
 *
 * Returns the result of pcpu_lpage_first_chunk(), or a negative errno:
 * -EINVAL, -ENOMEM, or whatever pcpu_lpage_build_unit_map() failed with.
 */
static ssize_t __init setup_pcpu_lpage(size_t static_size, bool chosen)
{
size_t reserve = PERCPU_MODULE_RESERVE + PERCPU_DYNAMIC_RESERVE;
/*
 * NOTE(review): dyn_size is size_t here, but its address is passed to
 * pcpu_lpage_build_unit_map(), whose prototype takes ssize_t *dyn_sizep
 * -- confirm the intended type.
 */
size_t dyn_size = reserve - PERCPU_FIRST_CHUNK_RESERVE;
size_t unit_map_size, unit_size;
int *unit_map;
int nr_units;
ssize_t ret;
/* on non-NUMA, embedding is better */
if (!chosen && !pcpu_need_numa())
return -EINVAL;
/* need PSE */
if (!cpu_has_pse) {
pr_warning("PERCPU: lpage allocator requires PSE\n");
return -EINVAL;
}
/* allocate and build unit_map */
/* one int per possible cpu */
unit_map_size = num_possible_cpus() * sizeof(int);
unit_map = alloc_bootmem_nopanic(unit_map_size);
if (!unit_map) {
pr_warning("PERCPU: failed to allocate unit_map\n");
return -ENOMEM;
}
/* dyn_size and unit_size are passed by address; presumably updated by the callee */
ret = pcpu_lpage_build_unit_map(static_size,
PERCPU_FIRST_CHUNK_RESERVE,
&dyn_size, &unit_size, PMD_SIZE,
unit_map, pcpu_lpage_cpu_distance);
if (ret < 0) {
pr_warning("PERCPU: failed to build unit_map\n");
goto out_free;
}
/* a non-negative return value is taken as the number of units */
nr_units = ret;
/* do the parameters look okay? */
if (!chosen) {
size_t vm_size = VMALLOC_END - VMALLOC_START;
/* presumably the pre-patch computation: one PMD_SIZE page per possible cpu */
size_t tot_size = num_possible_cpus() * PMD_SIZE;
/* on non-NUMA, embedding is better */
/* presumably the pre-patch location of this check; it also appears at the top of the function */
if (!pcpu_need_numa())
return -EINVAL;
/* presumably the post-patch computation, based on the unit map just built */
size_t tot_size = nr_units * unit_size;
/* don't consume more than 20% of vmalloc area */
if (tot_size > vm_size / 5) {
pr_info("PERCPU: too large chunk size %zuMB for "
"large page remap\n", tot_size >> 20);
/* presumably the pre-patch return; the two lines after it route through out_free instead */
return -EINVAL;
ret = -EINVAL;
goto out_free;
}
}
/* need PSE */
/* presumably the pre-patch location of the PSE check; it also appears near the top of the function */
if (!cpu_has_pse) {
pr_warning("PERCPU: lpage allocator requires PSE\n");
return -EINVAL;
}
/* presumably the pre-patch call, without unit_size / unit_map / nr_units */
return pcpu_lpage_first_chunk(static_size, PERCPU_FIRST_CHUNK_RESERVE,
reserve - PERCPU_FIRST_CHUNK_RESERVE,
PMD_SIZE,
pcpu_fc_alloc, pcpu_fc_free, pcpul_map);
ret = pcpu_lpage_first_chunk(static_size, PERCPU_FIRST_CHUNK_RESERVE,
dyn_size, unit_size, PMD_SIZE,
unit_map, nr_units,
pcpu_fc_alloc, pcpu_fc_free, pcpul_map);
out_free:
/* unit_map is released only when ret is negative */
if (ret < 0)
free_bootmem(__pa(unit_map), unit_map_size);
return ret;
}
#else
static ssize_t __init setup_pcpu_lpage(size_t static_size, bool chosen)
......@@ -299,7 +336,8 @@ void __init setup_per_cpu_areas(void)
/* alrighty, percpu areas up and running */
delta = (unsigned long)pcpu_base_addr - (unsigned long)__per_cpu_start;
for_each_possible_cpu(cpu) {
per_cpu_offset(cpu) = delta + cpu * pcpu_unit_size;
per_cpu_offset(cpu) =
delta + pcpu_unit_map[cpu] * pcpu_unit_size;
per_cpu(this_cpu_off, cpu) = per_cpu_offset(cpu);
per_cpu(cpu_number, cpu) = cpu;
setup_percpu_segment(cpu);
......
......@@ -62,6 +62,7 @@ extern const int *pcpu_unit_map;
/* Callback types taken by the first chunk allocator entry points. */
/* allocation callback: receives a cpu number and a size, returns a pointer */
typedef void * (*pcpu_fc_alloc_fn_t)(unsigned int cpu, size_t size);
/* free callback: receives a pointer and a size */
typedef void (*pcpu_fc_free_fn_t)(void *ptr, size_t size);
/* receives an address; presumably populates the page table entry covering it */
typedef void (*pcpu_fc_populate_pte_fn_t)(unsigned long addr);
/*
 * Returns the distance between cpus @from and @to.
 * NOTE(review): unlike its neighbours this names a function type, not a
 * pointer-to-function type.  As a function parameter it is adjusted to a
 * pointer, so call sites behave the same, but a variable or struct member
 * would have to be declared as pcpu_fc_cpu_distance_fn_t * -- confirm this
 * is intentional.
 */
typedef int (pcpu_fc_cpu_distance_fn_t)(unsigned int from, unsigned int to);
/* map callback: receives a pointer, a size and an address */
typedef void (*pcpu_fc_map_fn_t)(void *ptr, size_t size, void *addr);
extern size_t __init pcpu_setup_first_chunk(
......@@ -80,18 +81,37 @@ extern ssize_t __init pcpu_4k_first_chunk(
pcpu_fc_populate_pte_fn_t populate_pte_fn);
#ifdef CONFIG_NEED_MULTIPLE_NODES
/*
 * Build the cpu -> unit map for the large page first chunk allocator.
 *
 * @dyn_sizep, @unit_sizep: passed by address; presumably in/out -- confirm
 *                          against the definition
 * @unit_map: caller-provided int array; the visible x86 caller sizes it as
 *            num_possible_cpus() entries
 * @cpu_distance_fn: callback reporting the distance between two cpus
 *
 * The visible x86 caller treats a negative return value as failure and a
 * non-negative one as the number of units.
 */
extern int __init pcpu_lpage_build_unit_map(
size_t static_size, size_t reserved_size,
ssize_t *dyn_sizep, size_t *unit_sizep,
size_t lpage_size, int *unit_map,
pcpu_fc_cpu_distance_fn_t cpu_distance_fn);
/*
 * Set up the first chunk using large pages.
 *
 * NOTE(review): this comes from a rendered diff.  The
 * "ssize_t dyn_size, size_t lpage_size," line is presumably the removed
 * pre-patch parameter line, and the three lines after it (dyn_size/unit_size,
 * lpage_size/unit_map, nr_units) presumably replace it; as printed the
 * parameter list is not valid C.
 */
extern ssize_t __init pcpu_lpage_first_chunk(
size_t static_size, size_t reserved_size,
ssize_t dyn_size, size_t lpage_size,
size_t dyn_size, size_t unit_size,
size_t lpage_size, const int *unit_map,
int nr_units,
pcpu_fc_alloc_fn_t alloc_fn,
pcpu_fc_free_fn_t free_fn,
pcpu_fc_map_fn_t map_fn);
extern void *pcpu_lpage_remapped(void *kaddr);
#else
/*
 * Stub for builds without CONFIG_NEED_MULTIPLE_NODES: no unit map is
 * built and every call reports -EINVAL.
 */
static inline int pcpu_lpage_build_unit_map(
		size_t static_size,
		size_t reserved_size,
		ssize_t *dyn_sizep,
		size_t *unit_sizep,
		size_t lpage_size,
		int *unit_map,
		pcpu_fc_cpu_distance_fn_t cpu_distance_fn)
{
	return -EINVAL;
}
static inline ssize_t __init pcpu_lpage_first_chunk(
size_t static_size, size_t reserved_size,
ssize_t dyn_size, size_t lpage_size,
size_t dyn_size, size_t unit_size,
size_t lpage_size, const int *unit_map,
int nr_units,
pcpu_fc_alloc_fn_t alloc_fn,
pcpu_fc_free_fn_t free_fn,
pcpu_fc_map_fn_t map_fn)
......
This diff is collapsed.
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment