Commit 8bccd85f authored by Christoph Lameter's avatar Christoph Lameter Committed by Linus Torvalds

[PATCH] Implement sys_* do_* layering in the memory policy layer.

- Separate the do_xxx and sys_xxx functions. sys_xxx functions
  take variable-sized bitmaps from user space as arguments. do_xxx functions
  take fixed-sized nodemask_t arguments and may be used from inside the
  kernel. Doing so simplifies the initialization code; there is no
  longer an fs = KERNEL_DS assumption.

- Split up get_nodes into get_nodes (which gets the node list) and
  contextualize_policy which restricts the nodes to those accessible
  to the task and updates cpusets.

- Add comments explaining limitations of bind policy
Signed-off-by: default avatarChristoph Lameter <clameter@sgi.com>
Cc: Andi Kleen <ak@muc.de>
Signed-off-by: default avatarAndrew Morton <akpm@osdl.org>
Signed-off-by: default avatarLinus Torvalds <torvalds@osdl.org>
parent bb7e7e03
...@@ -2,6 +2,7 @@ ...@@ -2,6 +2,7 @@
* Simple NUMA memory policy for the Linux kernel. * Simple NUMA memory policy for the Linux kernel.
* *
* Copyright 2003,2004 Andi Kleen, SuSE Labs. * Copyright 2003,2004 Andi Kleen, SuSE Labs.
* (C) Copyright 2005 Christoph Lameter, Silicon Graphics, Inc.
* Subject to the GNU Public License, version 2. * Subject to the GNU Public License, version 2.
* *
* NUMA policy allows the user to give hints in which node(s) memory should * NUMA policy allows the user to give hints in which node(s) memory should
...@@ -17,13 +18,19 @@ ...@@ -17,13 +18,19 @@
* offset into the backing object or offset into the mapping * offset into the backing object or offset into the mapping
* for anonymous memory. For process policy an process counter * for anonymous memory. For process policy an process counter
* is used. * is used.
*
* bind Only allocate memory on a specific set of nodes, * bind Only allocate memory on a specific set of nodes,
* no fallback. * no fallback.
* FIXME: memory is allocated starting with the first node
* to the last. It would be better if bind would truly restrict
* the allocation to memory nodes instead
*
* preferred Try a specific node first before normal fallback. * preferred Try a specific node first before normal fallback.
* As a special case node -1 here means do the allocation * As a special case node -1 here means do the allocation
* on the local CPU. This is normally identical to default, * on the local CPU. This is normally identical to default,
* but useful to set in a VMA when you have a non default * but useful to set in a VMA when you have a non default
* process policy. * process policy.
*
* default Allocate on the local node first, or when on a VMA * default Allocate on the local node first, or when on a VMA
* use the process policy. This is what Linux always did * use the process policy. This is what Linux always did
* in a NUMA aware kernel and still does by, ahem, default. * in a NUMA aware kernel and still does by, ahem, default.
...@@ -113,56 +120,6 @@ static int mpol_check_policy(int mode, nodemask_t *nodes) ...@@ -113,56 +120,6 @@ static int mpol_check_policy(int mode, nodemask_t *nodes)
} }
return nodes_subset(*nodes, node_online_map) ? 0 : -EINVAL; return nodes_subset(*nodes, node_online_map) ? 0 : -EINVAL;
} }
/* Copy a node mask from user space. */
/*
 * Pre-refactor variant: copies a variable-sized node bitmap from user
 * space into a fixed-size nodemask_t, then restricts it to the current
 * cpuset and validates it against @mode via mpol_check_policy().
 * Returns 0 on success, -EFAULT on a faulting user access, -EINVAL on
 * an over-long or out-of-range mask.
 */
static int get_nodes(nodemask_t *nodes, unsigned long __user *nmask,
unsigned long maxnode, int mode)
{
unsigned long k;
unsigned long nlongs;
unsigned long endmask;
/* maxnode counts bits [0, maxnode); the highest usable bit is maxnode-1. */
--maxnode;
nodes_clear(*nodes);
/* No bits requested or no user buffer: succeed with an empty mask. */
if (maxnode == 0 || !nmask)
return 0;
/* Longs needed to hold maxnode bits, and a mask selecting the valid
   bits of the final (possibly partial) long. */
nlongs = BITS_TO_LONGS(maxnode);
if ((maxnode % BITS_PER_LONG) == 0)
endmask = ~0UL;
else
endmask = (1UL << (maxnode % BITS_PER_LONG)) - 1;
/* When the user specified more nodes than supported just check
if the non supported part is all zero. */
if (nlongs > BITS_TO_LONGS(MAX_NUMNODES)) {
/* Refuse absurdly large masks outright. */
if (nlongs > PAGE_SIZE/sizeof(long))
return -EINVAL;
/* Any bit set beyond what the kernel supports is an error. */
for (k = BITS_TO_LONGS(MAX_NUMNODES); k < nlongs; k++) {
unsigned long t;
if (get_user(t, nmask + k))
return -EFAULT;
if (k == nlongs - 1) {
/* Last long: only inspect the bits below maxnode. */
if (t & endmask)
return -EINVAL;
} else if (t)
return -EINVAL;
}
/* Copy only the portion the kernel can represent. */
nlongs = BITS_TO_LONGS(MAX_NUMNODES);
endmask = ~0UL;
}
if (copy_from_user(nodes_addr(*nodes), nmask, nlongs*sizeof(unsigned long)))
return -EFAULT;
/* Clear stray bits above maxnode in the last copied long. */
nodes_addr(*nodes)[nlongs-1] &= endmask;
/* Update current mems_allowed */
cpuset_update_current_mems_allowed();
/* Ignore nodes not set in current->mems_allowed */
/* AK: shouldn't this error out instead? */
cpuset_restrict_to_mems_allowed(nodes_addr(*nodes));
return mpol_check_policy(mode, nodes);
}
/* Generate a custom zonelist for the BIND policy. */ /* Generate a custom zonelist for the BIND policy. */
static struct zonelist *bind_zonelist(nodemask_t *nodes) static struct zonelist *bind_zonelist(nodemask_t *nodes)
{ {
...@@ -380,17 +337,25 @@ static int mbind_range(struct vm_area_struct *vma, unsigned long start, ...@@ -380,17 +337,25 @@ static int mbind_range(struct vm_area_struct *vma, unsigned long start,
return err; return err;
} }
/* Change policy for a memory range */ static int contextualize_policy(int mode, nodemask_t *nodes)
asmlinkage long sys_mbind(unsigned long start, unsigned long len, {
unsigned long mode, if (!nodes)
unsigned long __user *nmask, unsigned long maxnode, return 0;
unsigned flags)
/* Update current mems_allowed */
cpuset_update_current_mems_allowed();
/* Ignore nodes not set in current->mems_allowed */
cpuset_restrict_to_mems_allowed(nodes->bits);
return mpol_check_policy(mode, nodes);
}
long do_mbind(unsigned long start, unsigned long len,
unsigned long mode, nodemask_t *nmask, unsigned long flags)
{ {
struct vm_area_struct *vma; struct vm_area_struct *vma;
struct mm_struct *mm = current->mm; struct mm_struct *mm = current->mm;
struct mempolicy *new; struct mempolicy *new;
unsigned long end; unsigned long end;
nodemask_t nodes;
int err; int err;
if ((flags & ~(unsigned long)(MPOL_MF_STRICT)) || mode > MPOL_MAX) if ((flags & ~(unsigned long)(MPOL_MF_STRICT)) || mode > MPOL_MAX)
...@@ -405,12 +370,9 @@ asmlinkage long sys_mbind(unsigned long start, unsigned long len, ...@@ -405,12 +370,9 @@ asmlinkage long sys_mbind(unsigned long start, unsigned long len,
return -EINVAL; return -EINVAL;
if (end == start) if (end == start)
return 0; return 0;
if (contextualize_policy(mode, nmask))
err = get_nodes(&nodes, nmask, maxnode, mode); return -EINVAL;
if (err) new = mpol_new(mode, nmask);
return err;
new = mpol_new(mode, &nodes);
if (IS_ERR(new)) if (IS_ERR(new))
return PTR_ERR(new); return PTR_ERR(new);
...@@ -418,7 +380,7 @@ asmlinkage long sys_mbind(unsigned long start, unsigned long len, ...@@ -418,7 +380,7 @@ asmlinkage long sys_mbind(unsigned long start, unsigned long len,
mode,nodes_addr(nodes)[0]); mode,nodes_addr(nodes)[0]);
down_write(&mm->mmap_sem); down_write(&mm->mmap_sem);
vma = check_range(mm, start, end, &nodes, flags); vma = check_range(mm, start, end, nmask, flags);
err = PTR_ERR(vma); err = PTR_ERR(vma);
if (!IS_ERR(vma)) if (!IS_ERR(vma))
err = mbind_range(vma, start, end, new); err = mbind_range(vma, start, end, new);
...@@ -428,19 +390,13 @@ asmlinkage long sys_mbind(unsigned long start, unsigned long len, ...@@ -428,19 +390,13 @@ asmlinkage long sys_mbind(unsigned long start, unsigned long len,
} }
/* Set the process memory policy */ /* Set the process memory policy */
asmlinkage long sys_set_mempolicy(int mode, unsigned long __user *nmask, long do_set_mempolicy(int mode, nodemask_t *nodes)
unsigned long maxnode)
{ {
int err;
struct mempolicy *new; struct mempolicy *new;
nodemask_t nodes;
if (mode < 0 || mode > MPOL_MAX) if (contextualize_policy(mode, nodes))
return -EINVAL; return -EINVAL;
err = get_nodes(&nodes, nmask, maxnode, mode); new = mpol_new(mode, nodes);
if (err)
return err;
new = mpol_new(mode, &nodes);
if (IS_ERR(new)) if (IS_ERR(new))
return PTR_ERR(new); return PTR_ERR(new);
mpol_free(current->mempolicy); mpol_free(current->mempolicy);
...@@ -459,7 +415,8 @@ static void get_zonemask(struct mempolicy *p, nodemask_t *nodes) ...@@ -459,7 +415,8 @@ static void get_zonemask(struct mempolicy *p, nodemask_t *nodes)
switch (p->policy) { switch (p->policy) {
case MPOL_BIND: case MPOL_BIND:
for (i = 0; p->v.zonelist->zones[i]; i++) for (i = 0; p->v.zonelist->zones[i]; i++)
node_set(p->v.zonelist->zones[i]->zone_pgdat->node_id, *nodes); node_set(p->v.zonelist->zones[i]->zone_pgdat->node_id,
*nodes);
break; break;
case MPOL_DEFAULT: case MPOL_DEFAULT:
break; break;
...@@ -491,38 +448,17 @@ static int lookup_node(struct mm_struct *mm, unsigned long addr) ...@@ -491,38 +448,17 @@ static int lookup_node(struct mm_struct *mm, unsigned long addr)
return err; return err;
} }
/* Copy a kernel node mask to user space */
/*
 * Writes @nodes into the user buffer @mask sized for @maxnode bits.
 * If the user asked for more bits than the kernel supports, the excess
 * tail is zero-filled. Returns 0, -EINVAL (buffer unreasonably large),
 * or -EFAULT on a faulting user access.
 */
static int copy_nodes_to_user(unsigned long __user *mask, unsigned long maxnode,
nodemask_t *nodes)
{
/* Bytes to copy: maxnode bits rounded up to a 64-bit unit —
   presumably to keep the layout uniform across ABIs; verify. */
unsigned long copy = ALIGN(maxnode-1, 64) / 8;
/* Bytes the kernel-side nodemask actually occupies. */
const int nbytes = BITS_TO_LONGS(MAX_NUMNODES) * sizeof(long);
if (copy > nbytes) {
if (copy > PAGE_SIZE)
return -EINVAL;
/* Zero the part of the user buffer beyond the kernel mask. */
if (clear_user((char __user *)mask + nbytes, copy - nbytes))
return -EFAULT;
copy = nbytes;
}
return copy_to_user(mask, nodes_addr(*nodes), copy) ? -EFAULT : 0;
}
/* Retrieve NUMA policy */ /* Retrieve NUMA policy */
asmlinkage long sys_get_mempolicy(int __user *policy, long do_get_mempolicy(int *policy, nodemask_t *nmask,
unsigned long __user *nmask, unsigned long addr, unsigned long flags)
unsigned long maxnode,
unsigned long addr, unsigned long flags)
{ {
int err, pval; int err;
struct mm_struct *mm = current->mm; struct mm_struct *mm = current->mm;
struct vm_area_struct *vma = NULL; struct vm_area_struct *vma = NULL;
struct mempolicy *pol = current->mempolicy; struct mempolicy *pol = current->mempolicy;
if (flags & ~(unsigned long)(MPOL_F_NODE|MPOL_F_ADDR)) if (flags & ~(unsigned long)(MPOL_F_NODE|MPOL_F_ADDR))
return -EINVAL; return -EINVAL;
if (nmask != NULL && maxnode < MAX_NUMNODES)
return -EINVAL;
if (flags & MPOL_F_ADDR) { if (flags & MPOL_F_ADDR) {
down_read(&mm->mmap_sem); down_read(&mm->mmap_sem);
vma = find_vma_intersection(mm, addr, addr+1); vma = find_vma_intersection(mm, addr, addr+1);
...@@ -545,31 +481,25 @@ asmlinkage long sys_get_mempolicy(int __user *policy, ...@@ -545,31 +481,25 @@ asmlinkage long sys_get_mempolicy(int __user *policy,
err = lookup_node(mm, addr); err = lookup_node(mm, addr);
if (err < 0) if (err < 0)
goto out; goto out;
pval = err; *policy = err;
} else if (pol == current->mempolicy && } else if (pol == current->mempolicy &&
pol->policy == MPOL_INTERLEAVE) { pol->policy == MPOL_INTERLEAVE) {
pval = current->il_next; *policy = current->il_next;
} else { } else {
err = -EINVAL; err = -EINVAL;
goto out; goto out;
} }
} else } else
pval = pol->policy; *policy = pol->policy;
if (vma) { if (vma) {
up_read(&current->mm->mmap_sem); up_read(&current->mm->mmap_sem);
vma = NULL; vma = NULL;
} }
if (policy && put_user(pval, policy))
return -EFAULT;
err = 0; err = 0;
if (nmask) { if (nmask)
nodemask_t nodes; get_zonemask(pol, nmask);
get_zonemask(pol, &nodes);
err = copy_nodes_to_user(nmask, maxnode, &nodes);
}
out: out:
if (vma) if (vma)
...@@ -577,6 +507,126 @@ asmlinkage long sys_get_mempolicy(int __user *policy, ...@@ -577,6 +507,126 @@ asmlinkage long sys_get_mempolicy(int __user *policy,
return err; return err;
} }
/*
* User space interface with variable sized bitmaps for nodelists.
*/
/* Copy a node mask from user space. */
/*
 * Converts a variable-sized user bitmap (@nmask, @maxnode bits) into a
 * fixed-size nodemask_t. Returns 0 on success, -EFAULT on a faulting
 * user access, -EINVAL if the mask is over-long or sets unsupported bits.
 */
static int get_nodes(nodemask_t *nodes, unsigned long __user *nmask,
unsigned long maxnode)
{
unsigned long k;
unsigned long nlongs;
unsigned long endmask;
/* maxnode counts bits [0, maxnode); the highest usable bit is maxnode-1. */
--maxnode;
nodes_clear(*nodes);
/* No bits requested or no user buffer: succeed with an empty mask. */
if (maxnode == 0 || !nmask)
return 0;
/* Longs needed to hold maxnode bits, and a mask selecting the valid
   bits of the final (possibly partial) long. */
nlongs = BITS_TO_LONGS(maxnode);
if ((maxnode % BITS_PER_LONG) == 0)
endmask = ~0UL;
else
endmask = (1UL << (maxnode % BITS_PER_LONG)) - 1;
/* When the user specified more nodes than supported just check
if the non supported part is all zero. */
if (nlongs > BITS_TO_LONGS(MAX_NUMNODES)) {
/* Refuse absurdly large masks outright. */
if (nlongs > PAGE_SIZE/sizeof(long))
return -EINVAL;
/* Any bit set beyond what the kernel supports is an error. */
for (k = BITS_TO_LONGS(MAX_NUMNODES); k < nlongs; k++) {
unsigned long t;
if (get_user(t, nmask + k))
return -EFAULT;
if (k == nlongs - 1) {
/* Last long: only inspect the bits below maxnode. */
if (t & endmask)
return -EINVAL;
} else if (t)
return -EINVAL;
}
/* Copy only the portion the kernel can represent. */
nlongs = BITS_TO_LONGS(MAX_NUMNODES);
endmask = ~0UL;
}
if (copy_from_user(nodes_addr(*nodes), nmask, nlongs*sizeof(unsigned long)))
return -EFAULT;
/* Clear stray bits above maxnode in the last copied long. */
nodes_addr(*nodes)[nlongs-1] &= endmask;
return 0;
}
/* Copy a kernel node mask to user space */
/*
 * Writes @nodes into the user buffer @mask sized for @maxnode bits.
 * If the user asked for more bits than the kernel supports, the excess
 * tail is zero-filled. Returns 0, -EINVAL (buffer unreasonably large),
 * or -EFAULT on a faulting user access.
 */
static int copy_nodes_to_user(unsigned long __user *mask, unsigned long maxnode,
nodemask_t *nodes)
{
/* Bytes to copy: maxnode bits rounded up to a 64-bit unit —
   presumably to keep the layout uniform across ABIs; verify. */
unsigned long copy = ALIGN(maxnode-1, 64) / 8;
/* Bytes the kernel-side nodemask actually occupies. */
const int nbytes = BITS_TO_LONGS(MAX_NUMNODES) * sizeof(long);
if (copy > nbytes) {
if (copy > PAGE_SIZE)
return -EINVAL;
/* Zero the part of the user buffer beyond the kernel mask. */
if (clear_user((char __user *)mask + nbytes, copy - nbytes))
return -EFAULT;
copy = nbytes;
}
return copy_to_user(mask, nodes_addr(*nodes), copy) ? -EFAULT : 0;
}
/*
 * mbind(2) entry point: convert the variable-sized user node bitmap
 * into a kernel nodemask_t, then hand off to do_mbind().
 */
asmlinkage long sys_mbind(unsigned long start, unsigned long len,
		unsigned long mode,
		unsigned long __user *nmask, unsigned long maxnode,
		unsigned flags)
{
	nodemask_t kernel_nodes;
	int ret;

	ret = get_nodes(&kernel_nodes, nmask, maxnode);
	if (ret)
		return ret;

	return do_mbind(start, len, mode, &kernel_nodes, flags);
}
/* Set the process memory policy */
/*
 * set_mempolicy(2) entry point: validate the mode, fetch the user
 * node bitmap, then delegate to do_set_mempolicy().
 */
asmlinkage long sys_set_mempolicy(int mode, unsigned long __user *nmask,
		unsigned long maxnode)
{
	nodemask_t kernel_nodes;
	int ret;

	/* Reject unknown policy modes before touching user memory. */
	if (mode < 0 || mode > MPOL_MAX)
		return -EINVAL;

	ret = get_nodes(&kernel_nodes, nmask, maxnode);
	if (ret)
		return ret;

	return do_set_mempolicy(mode, &kernel_nodes);
}
/* Retrieve NUMA policy */
/*
 * get_mempolicy(2) entry point: query the policy via do_get_mempolicy()
 * and copy the results (policy value and/or nodemask) back to user space.
 */
asmlinkage long sys_get_mempolicy(int __user *policy,
		unsigned long __user *nmask,
		unsigned long maxnode,
		unsigned long addr, unsigned long flags)
{
	nodemask_t mask;
	int rc;
	int value;

	/* A user nodemask buffer must be able to hold a full kernel mask. */
	if (nmask && maxnode < MAX_NUMNODES)
		return -EINVAL;

	rc = do_get_mempolicy(&value, &mask, addr, flags);
	if (rc)
		return rc;

	if (policy && put_user(value, policy))
		return -EFAULT;

	if (nmask)
		rc = copy_nodes_to_user(nmask, maxnode, &mask);

	return rc;
}
#ifdef CONFIG_COMPAT #ifdef CONFIG_COMPAT
asmlinkage long compat_sys_get_mempolicy(int __user *policy, asmlinkage long compat_sys_get_mempolicy(int __user *policy,
...@@ -664,7 +714,7 @@ get_vma_policy(struct task_struct *task, struct vm_area_struct *vma, unsigned lo ...@@ -664,7 +714,7 @@ get_vma_policy(struct task_struct *task, struct vm_area_struct *vma, unsigned lo
if (vma) { if (vma) {
if (vma->vm_ops && vma->vm_ops->get_policy) if (vma->vm_ops && vma->vm_ops->get_policy)
pol = vma->vm_ops->get_policy(vma, addr); pol = vma->vm_ops->get_policy(vma, addr);
else if (vma->vm_policy && else if (vma->vm_policy &&
vma->vm_policy->policy != MPOL_DEFAULT) vma->vm_policy->policy != MPOL_DEFAULT)
pol = vma->vm_policy; pol = vma->vm_policy;
...@@ -1147,14 +1197,12 @@ void __init numa_policy_init(void) ...@@ -1147,14 +1197,12 @@ void __init numa_policy_init(void)
/* Set interleaving policy for system init. This way not all /* Set interleaving policy for system init. This way not all
the data structures allocated at system boot end up in node zero. */ the data structures allocated at system boot end up in node zero. */
if (sys_set_mempolicy(MPOL_INTERLEAVE, nodes_addr(node_online_map), if (do_set_mempolicy(MPOL_INTERLEAVE, &node_online_map))
MAX_NUMNODES) < 0)
printk("numa_policy_init: interleaving failed\n"); printk("numa_policy_init: interleaving failed\n");
} }
/* Reset policy of current process to default. /* Reset policy of current process to default */
* Assumes fs == KERNEL_DS */
void numa_default_policy(void) void numa_default_policy(void)
{ {
sys_set_mempolicy(MPOL_DEFAULT, NULL, 0); do_set_mempolicy(MPOL_DEFAULT, NULL);
} }
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment