Commit dc9aa5b9 authored by Christoph Lameter, committed by Linus Torvalds

[PATCH] Swap Migration V5: MPOL_MF_MOVE interface

Add page migration support via swap to the NUMA policy layer

This patch adds page migration support to the NUMA policy layer.  An
additional flag MPOL_MF_MOVE is introduced for mbind.  If MPOL_MF_MOVE is
specified then pages that do not conform to the memory policy will be evicted
from memory.  When they are paged back in, new pages will be allocated
following the NUMA policy.
Signed-off-by: Christoph Lameter <clameter@sgi.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
parent 7cbe34cf
...@@ -22,6 +22,9 @@ ...@@ -22,6 +22,9 @@
/* Flags for mbind */ /* Flags for mbind */
#define MPOL_MF_STRICT (1<<0) /* Verify existing pages in the mapping */ #define MPOL_MF_STRICT (1<<0) /* Verify existing pages in the mapping */
#define MPOL_MF_MOVE (1<<1) /* Move pages owned by this process to conform to mapping */
#define MPOL_MF_MOVE_ALL (1<<2) /* Move every page to conform to mapping */
#define MPOL_MF_INTERNAL (1<<3) /* Internal flags start here */
#ifdef __KERNEL__ #ifdef __KERNEL__
......
...@@ -83,9 +83,14 @@ ...@@ -83,9 +83,14 @@
#include <linux/init.h> #include <linux/init.h>
#include <linux/compat.h> #include <linux/compat.h>
#include <linux/mempolicy.h> #include <linux/mempolicy.h>
#include <linux/swap.h>
#include <asm/tlbflush.h> #include <asm/tlbflush.h>
#include <asm/uaccess.h> #include <asm/uaccess.h>
/* Internal MPOL_MF_xxx flags */
#define MPOL_MF_DISCONTIG_OK (MPOL_MF_INTERNAL << 0) /* Skip checks for continuous vmas */
static kmem_cache_t *policy_cache; static kmem_cache_t *policy_cache;
static kmem_cache_t *sn_cache; static kmem_cache_t *sn_cache;
...@@ -174,9 +179,59 @@ static struct mempolicy *mpol_new(int mode, nodemask_t *nodes) ...@@ -174,9 +179,59 @@ static struct mempolicy *mpol_new(int mode, nodemask_t *nodes)
return policy; return policy;
} }
/* Check if we are the only process mapping the page in question */
static inline int single_mm_mapping(struct mm_struct *mm,
struct address_space *mapping)
{
struct vm_area_struct *vma;
struct prio_tree_iter iter;
int rc = 1;
spin_lock(&mapping->i_mmap_lock);
vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, 0, ULONG_MAX)
if (mm != vma->vm_mm) {
rc = 0;
goto out;
}
list_for_each_entry(vma, &mapping->i_mmap_nonlinear, shared.vm_set.list)
if (mm != vma->vm_mm) {
rc = 0;
goto out;
}
out:
spin_unlock(&mapping->i_mmap_lock);
return rc;
}
/*
 * Queue @page on @pagelist for migration if it is eligible.
 *
 * A page is eligible when any of the following holds:
 *   - MPOL_MF_MOVE_ALL is set in @flags,
 *   - the page has no mapping or is anonymous,
 *   - its mapping is writably mapped,
 *   - all vmas of its mapping belong to vma->vm_mm.
 * In other words, a page that is shared with others and not writable
 * is left alone.
 */
static void migrate_page_add(struct vm_area_struct *vma,
	struct page *page, struct list_head *pagelist, unsigned long flags)
{
	int isolated;

	/*
	 * Guard clause: bail out for ineligible pages.  The tests keep
	 * their original order so that page->mapping is known to be
	 * non-NULL (and the page non-anonymous) before it is handed to
	 * the mapping helpers.
	 */
	if (!(flags & MPOL_MF_MOVE_ALL) && page->mapping && !PageAnon(page) &&
	    !mapping_writably_mapped(page->mapping) &&
	    !single_mm_mapping(vma->vm_mm, page->mapping))
		return;

	isolated = isolate_lru_page(page);
	if (isolated == 1)
		list_add(&page->lru, pagelist);

	/*
	 * A result of 0 means the isolate attempt did not succeed, i.e.
	 * we ran into an unswappable page.  That is unexpected here, so
	 * make noise about it.
	 */
	WARN_ON(isolated == 0);
}
/* Ensure all existing pages follow the policy. */ /* Ensure all existing pages follow the policy. */
static int check_pte_range(struct vm_area_struct *vma, pmd_t *pmd, static int check_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
unsigned long addr, unsigned long end, nodemask_t *nodes) unsigned long addr, unsigned long end,
const nodemask_t *nodes, unsigned long flags,
struct list_head *pagelist)
{ {
pte_t *orig_pte; pte_t *orig_pte;
pte_t *pte; pte_t *pte;
...@@ -193,15 +248,21 @@ static int check_pte_range(struct vm_area_struct *vma, pmd_t *pmd, ...@@ -193,15 +248,21 @@ static int check_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
if (!page) if (!page)
continue; continue;
nid = page_to_nid(page); nid = page_to_nid(page);
if (!node_isset(nid, *nodes)) if (!node_isset(nid, *nodes)) {
break; if (pagelist)
migrate_page_add(vma, page, pagelist, flags);
else
break;
}
} while (pte++, addr += PAGE_SIZE, addr != end); } while (pte++, addr += PAGE_SIZE, addr != end);
pte_unmap_unlock(orig_pte, ptl); pte_unmap_unlock(orig_pte, ptl);
return addr != end; return addr != end;
} }
static inline int check_pmd_range(struct vm_area_struct *vma, pud_t *pud, static inline int check_pmd_range(struct vm_area_struct *vma, pud_t *pud,
unsigned long addr, unsigned long end, nodemask_t *nodes) unsigned long addr, unsigned long end,
const nodemask_t *nodes, unsigned long flags,
struct list_head *pagelist)
{ {
pmd_t *pmd; pmd_t *pmd;
unsigned long next; unsigned long next;
...@@ -211,14 +272,17 @@ static inline int check_pmd_range(struct vm_area_struct *vma, pud_t *pud, ...@@ -211,14 +272,17 @@ static inline int check_pmd_range(struct vm_area_struct *vma, pud_t *pud,
next = pmd_addr_end(addr, end); next = pmd_addr_end(addr, end);
if (pmd_none_or_clear_bad(pmd)) if (pmd_none_or_clear_bad(pmd))
continue; continue;
if (check_pte_range(vma, pmd, addr, next, nodes)) if (check_pte_range(vma, pmd, addr, next, nodes,
flags, pagelist))
return -EIO; return -EIO;
} while (pmd++, addr = next, addr != end); } while (pmd++, addr = next, addr != end);
return 0; return 0;
} }
static inline int check_pud_range(struct vm_area_struct *vma, pgd_t *pgd, static inline int check_pud_range(struct vm_area_struct *vma, pgd_t *pgd,
unsigned long addr, unsigned long end, nodemask_t *nodes) unsigned long addr, unsigned long end,
const nodemask_t *nodes, unsigned long flags,
struct list_head *pagelist)
{ {
pud_t *pud; pud_t *pud;
unsigned long next; unsigned long next;
...@@ -228,14 +292,17 @@ static inline int check_pud_range(struct vm_area_struct *vma, pgd_t *pgd, ...@@ -228,14 +292,17 @@ static inline int check_pud_range(struct vm_area_struct *vma, pgd_t *pgd,
next = pud_addr_end(addr, end); next = pud_addr_end(addr, end);
if (pud_none_or_clear_bad(pud)) if (pud_none_or_clear_bad(pud))
continue; continue;
if (check_pmd_range(vma, pud, addr, next, nodes)) if (check_pmd_range(vma, pud, addr, next, nodes,
flags, pagelist))
return -EIO; return -EIO;
} while (pud++, addr = next, addr != end); } while (pud++, addr = next, addr != end);
return 0; return 0;
} }
static inline int check_pgd_range(struct vm_area_struct *vma, static inline int check_pgd_range(struct vm_area_struct *vma,
unsigned long addr, unsigned long end, nodemask_t *nodes) unsigned long addr, unsigned long end,
const nodemask_t *nodes, unsigned long flags,
struct list_head *pagelist)
{ {
pgd_t *pgd; pgd_t *pgd;
unsigned long next; unsigned long next;
...@@ -245,16 +312,31 @@ static inline int check_pgd_range(struct vm_area_struct *vma, ...@@ -245,16 +312,31 @@ static inline int check_pgd_range(struct vm_area_struct *vma,
next = pgd_addr_end(addr, end); next = pgd_addr_end(addr, end);
if (pgd_none_or_clear_bad(pgd)) if (pgd_none_or_clear_bad(pgd))
continue; continue;
if (check_pud_range(vma, pgd, addr, next, nodes)) if (check_pud_range(vma, pgd, addr, next, nodes,
flags, pagelist))
return -EIO; return -EIO;
} while (pgd++, addr = next, addr != end); } while (pgd++, addr = next, addr != end);
return 0; return 0;
} }
/* Step 1: check the range */ /* Check if a vma is migratable */
static inline int vma_migratable(struct vm_area_struct *vma)
{
	/*
	 * Returns 1 when none of VM_LOCKED, VM_IO, VM_HUGETLB or VM_PFNMAP
	 * is set in vma->vm_flags, and 0 when at least one of them is set.
	 */
	return !(vma->vm_flags &
		 (VM_LOCKED | VM_IO | VM_HUGETLB | VM_PFNMAP));
}
/*
* Check if all pages in a range are on a set of nodes.
* If pagelist != NULL then isolate pages from the LRU and
* put them on the pagelist.
*/
static struct vm_area_struct * static struct vm_area_struct *
check_range(struct mm_struct *mm, unsigned long start, unsigned long end, check_range(struct mm_struct *mm, unsigned long start, unsigned long end,
nodemask_t *nodes, unsigned long flags) const nodemask_t *nodes, unsigned long flags,
struct list_head *pagelist)
{ {
int err; int err;
struct vm_area_struct *first, *vma, *prev; struct vm_area_struct *first, *vma, *prev;
...@@ -264,17 +346,24 @@ check_range(struct mm_struct *mm, unsigned long start, unsigned long end, ...@@ -264,17 +346,24 @@ check_range(struct mm_struct *mm, unsigned long start, unsigned long end,
return ERR_PTR(-EFAULT); return ERR_PTR(-EFAULT);
prev = NULL; prev = NULL;
for (vma = first; vma && vma->vm_start < end; vma = vma->vm_next) { for (vma = first; vma && vma->vm_start < end; vma = vma->vm_next) {
if (!vma->vm_next && vma->vm_end < end) if (!(flags & MPOL_MF_DISCONTIG_OK)) {
return ERR_PTR(-EFAULT); if (!vma->vm_next && vma->vm_end < end)
if (prev && prev->vm_end < vma->vm_start) return ERR_PTR(-EFAULT);
return ERR_PTR(-EFAULT); if (prev && prev->vm_end < vma->vm_start)
if ((flags & MPOL_MF_STRICT) && !is_vm_hugetlb_page(vma)) { return ERR_PTR(-EFAULT);
}
if (!is_vm_hugetlb_page(vma) &&
((flags & MPOL_MF_STRICT) ||
((flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) &&
vma_migratable(vma)))) {
unsigned long endvma = vma->vm_end; unsigned long endvma = vma->vm_end;
if (endvma > end) if (endvma > end)
endvma = end; endvma = end;
if (vma->vm_start > start) if (vma->vm_start > start)
start = vma->vm_start; start = vma->vm_start;
err = check_pgd_range(vma, start, endvma, nodes); err = check_pgd_range(vma, start, endvma, nodes,
flags, pagelist);
if (err) { if (err) {
first = ERR_PTR(err); first = ERR_PTR(err);
break; break;
...@@ -348,33 +437,59 @@ long do_mbind(unsigned long start, unsigned long len, ...@@ -348,33 +437,59 @@ long do_mbind(unsigned long start, unsigned long len,
struct mempolicy *new; struct mempolicy *new;
unsigned long end; unsigned long end;
int err; int err;
LIST_HEAD(pagelist);
if ((flags & ~(unsigned long)(MPOL_MF_STRICT)) || mode > MPOL_MAX) if ((flags & ~(unsigned long)(MPOL_MF_STRICT|MPOL_MF_MOVE|MPOL_MF_MOVE_ALL))
|| mode > MPOL_MAX)
return -EINVAL; return -EINVAL;
if ((flags & MPOL_MF_MOVE_ALL) && !capable(CAP_SYS_RESOURCE))
return -EPERM;
if (start & ~PAGE_MASK) if (start & ~PAGE_MASK)
return -EINVAL; return -EINVAL;
if (mode == MPOL_DEFAULT) if (mode == MPOL_DEFAULT)
flags &= ~MPOL_MF_STRICT; flags &= ~MPOL_MF_STRICT;
len = (len + PAGE_SIZE - 1) & PAGE_MASK; len = (len + PAGE_SIZE - 1) & PAGE_MASK;
end = start + len; end = start + len;
if (end < start) if (end < start)
return -EINVAL; return -EINVAL;
if (end == start) if (end == start)
return 0; return 0;
if (mpol_check_policy(mode, nmask)) if (mpol_check_policy(mode, nmask))
return -EINVAL; return -EINVAL;
new = mpol_new(mode, nmask); new = mpol_new(mode, nmask);
if (IS_ERR(new)) if (IS_ERR(new))
return PTR_ERR(new); return PTR_ERR(new);
/*
* If we are using the default policy then operation
* on discontinuous address spaces is okay after all
*/
if (!new)
flags |= MPOL_MF_DISCONTIG_OK;
PDprintk("mbind %lx-%lx mode:%ld nodes:%lx\n",start,start+len, PDprintk("mbind %lx-%lx mode:%ld nodes:%lx\n",start,start+len,
mode,nodes_addr(nodes)[0]); mode,nodes_addr(nodes)[0]);
down_write(&mm->mmap_sem); down_write(&mm->mmap_sem);
vma = check_range(mm, start, end, nmask, flags); vma = check_range(mm, start, end, nmask, flags,
(flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) ? &pagelist : NULL);
err = PTR_ERR(vma); err = PTR_ERR(vma);
if (!IS_ERR(vma)) if (!IS_ERR(vma)) {
err = mbind_range(vma, start, end, new); err = mbind_range(vma, start, end, new);
if (!list_empty(&pagelist))
migrate_pages(&pagelist, NULL);
if (!err && !list_empty(&pagelist) && (flags & MPOL_MF_STRICT))
err = -EIO;
}
if (!list_empty(&pagelist))
putback_lru_pages(&pagelist);
up_write(&mm->mmap_sem); up_write(&mm->mmap_sem);
mpol_free(new); mpol_free(new);
return err; return err;
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment