Commit db114b83 authored by Hugh Dickins's avatar Hugh Dickins Committed by Linus Torvalds

ksm: hold anon_vma in rmap_item

For full functionality, page_referenced_one() and try_to_unmap_one() need
to know the vma: to pass vma down to arch-dependent flushes, or to observe
VM_LOCKED or VM_EXEC.  But KSM keeps no record of vma: nor can it, since
vmas get split and merged without its knowledge.

Instead, note page's anon_vma in its rmap_item when adding to stable tree:
all the vmas which might map that page are listed by its anon_vma.

page_referenced_ksm() and try_to_unmap_ksm() then traverse the anon_vma,
first to find the probable vma, that which matches rmap_item's mm; but if
that is not enough to locate all instances, traverse again to try the
others.  This catches those occasions when fork has duplicated a pte of a
ksm page, but ksmd has not yet come around to assign it an rmap_item.

But each rmap_item in the stable tree which refers to an anon_vma needs to
take a reference to it.  Andrea's anon_vma design cleverly avoided a
reference count (an anon_vma was free when its list of vmas was empty),
but KSM now needs to add that.  Is a 32-bit count sufficient?  I believe
so - the anon_vma is only free when both count is 0 and list is empty.
Signed-off-by: default avatarHugh Dickins <hugh.dickins@tiscali.co.uk>
Cc: Izik Eidus <ieidus@redhat.com>
Cc: Andrea Arcangeli <aarcange@redhat.com>
Cc: Chris Wright <chrisw@redhat.com>
Signed-off-by: default avatarAndrew Morton <akpm@linux-foundation.org>
Signed-off-by: default avatarLinus Torvalds <torvalds@linux-foundation.org>
parent 5ad64688
...@@ -26,6 +26,9 @@ ...@@ -26,6 +26,9 @@
*/ */
struct anon_vma { struct anon_vma {
spinlock_t lock; /* Serialize access to vma list */ spinlock_t lock; /* Serialize access to vma list */
#ifdef CONFIG_KSM
atomic_t ksm_refcount;
#endif
/* /*
* NOTE: the LSB of the head.next is set by * NOTE: the LSB of the head.next is set by
* mm_take_all_locks() _after_ taking the above lock. So the * mm_take_all_locks() _after_ taking the above lock. So the
...@@ -38,6 +41,26 @@ struct anon_vma { ...@@ -38,6 +41,26 @@ struct anon_vma {
}; };
#ifdef CONFIG_MMU #ifdef CONFIG_MMU
#ifdef CONFIG_KSM
static inline void ksm_refcount_init(struct anon_vma *anon_vma)
{
atomic_set(&anon_vma->ksm_refcount, 0);
}
static inline int ksm_refcount(struct anon_vma *anon_vma)
{
return atomic_read(&anon_vma->ksm_refcount);
}
#else
static inline void ksm_refcount_init(struct anon_vma *anon_vma)
{
}
static inline int ksm_refcount(struct anon_vma *anon_vma)
{
return 0;
}
#endif /* CONFIG_KSM */
static inline struct anon_vma *page_anon_vma(struct page *page) static inline struct anon_vma *page_anon_vma(struct page *page)
{ {
...@@ -70,6 +93,7 @@ void __anon_vma_merge(struct vm_area_struct *, struct vm_area_struct *); ...@@ -70,6 +93,7 @@ void __anon_vma_merge(struct vm_area_struct *, struct vm_area_struct *);
void anon_vma_unlink(struct vm_area_struct *); void anon_vma_unlink(struct vm_area_struct *);
void anon_vma_link(struct vm_area_struct *); void anon_vma_link(struct vm_area_struct *);
void __anon_vma_link(struct vm_area_struct *); void __anon_vma_link(struct vm_area_struct *);
void anon_vma_free(struct anon_vma *);
/* /*
* rmap interfaces called when adding or removing pte of page * rmap interfaces called when adding or removing pte of page
......
...@@ -121,7 +121,7 @@ struct stable_node { ...@@ -121,7 +121,7 @@ struct stable_node {
/** /**
* struct rmap_item - reverse mapping item for virtual addresses * struct rmap_item - reverse mapping item for virtual addresses
* @rmap_list: next rmap_item in mm_slot's singly-linked rmap_list * @rmap_list: next rmap_item in mm_slot's singly-linked rmap_list
* @filler: unused space we're making available in this patch * @anon_vma: pointer to anon_vma for this mm,address, when in stable tree
* @mm: the memory structure this rmap_item is pointing into * @mm: the memory structure this rmap_item is pointing into
* @address: the virtual address this rmap_item tracks (+ flags in low bits) * @address: the virtual address this rmap_item tracks (+ flags in low bits)
* @oldchecksum: previous checksum of the page at that virtual address * @oldchecksum: previous checksum of the page at that virtual address
...@@ -131,7 +131,7 @@ struct stable_node { ...@@ -131,7 +131,7 @@ struct stable_node {
*/ */
struct rmap_item { struct rmap_item {
struct rmap_item *rmap_list; struct rmap_item *rmap_list;
unsigned long filler; struct anon_vma *anon_vma; /* when stable */
struct mm_struct *mm; struct mm_struct *mm;
unsigned long address; /* + low bits used for flags below */ unsigned long address; /* + low bits used for flags below */
unsigned int oldchecksum; /* when unstable */ unsigned int oldchecksum; /* when unstable */
...@@ -196,13 +196,6 @@ static DECLARE_WAIT_QUEUE_HEAD(ksm_thread_wait); ...@@ -196,13 +196,6 @@ static DECLARE_WAIT_QUEUE_HEAD(ksm_thread_wait);
static DEFINE_MUTEX(ksm_thread_mutex); static DEFINE_MUTEX(ksm_thread_mutex);
static DEFINE_SPINLOCK(ksm_mmlist_lock); static DEFINE_SPINLOCK(ksm_mmlist_lock);
/*
* Temporary hack for page_referenced_ksm() and try_to_unmap_ksm(),
* later we rework things a little to get the right vma to them.
*/
static DEFINE_SPINLOCK(ksm_fallback_vma_lock);
static struct vm_area_struct ksm_fallback_vma;
#define KSM_KMEM_CACHE(__struct, __flags) kmem_cache_create("ksm_"#__struct,\ #define KSM_KMEM_CACHE(__struct, __flags) kmem_cache_create("ksm_"#__struct,\
sizeof(struct __struct), __alignof__(struct __struct),\ sizeof(struct __struct), __alignof__(struct __struct),\
(__flags), NULL) (__flags), NULL)
...@@ -323,6 +316,25 @@ static inline int in_stable_tree(struct rmap_item *rmap_item) ...@@ -323,6 +316,25 @@ static inline int in_stable_tree(struct rmap_item *rmap_item)
return rmap_item->address & STABLE_FLAG; return rmap_item->address & STABLE_FLAG;
} }
static void hold_anon_vma(struct rmap_item *rmap_item,
struct anon_vma *anon_vma)
{
rmap_item->anon_vma = anon_vma;
atomic_inc(&anon_vma->ksm_refcount);
}
static void drop_anon_vma(struct rmap_item *rmap_item)
{
struct anon_vma *anon_vma = rmap_item->anon_vma;
if (atomic_dec_and_lock(&anon_vma->ksm_refcount, &anon_vma->lock)) {
int empty = list_empty(&anon_vma->head);
spin_unlock(&anon_vma->lock);
if (empty)
anon_vma_free(anon_vma);
}
}
/* /*
* ksmd, and unmerge_and_remove_all_rmap_items(), must not touch an mm's * ksmd, and unmerge_and_remove_all_rmap_items(), must not touch an mm's
* page tables after it has passed through ksm_exit() - which, if necessary, * page tables after it has passed through ksm_exit() - which, if necessary,
...@@ -472,6 +484,7 @@ static void remove_rmap_item_from_tree(struct rmap_item *rmap_item) ...@@ -472,6 +484,7 @@ static void remove_rmap_item_from_tree(struct rmap_item *rmap_item)
ksm_pages_shared--; ksm_pages_shared--;
} }
drop_anon_vma(rmap_item);
rmap_item->address &= PAGE_MASK; rmap_item->address &= PAGE_MASK;
} else if (rmap_item->address & UNSTABLE_FLAG) { } else if (rmap_item->address & UNSTABLE_FLAG) {
...@@ -752,6 +765,9 @@ static int try_to_merge_one_page(struct vm_area_struct *vma, ...@@ -752,6 +765,9 @@ static int try_to_merge_one_page(struct vm_area_struct *vma,
pte_t orig_pte = __pte(0); pte_t orig_pte = __pte(0);
int err = -EFAULT; int err = -EFAULT;
if (page == kpage) /* ksm page forked */
return 0;
if (!(vma->vm_flags & VM_MERGEABLE)) if (!(vma->vm_flags & VM_MERGEABLE))
goto out; goto out;
if (!PageAnon(page)) if (!PageAnon(page))
...@@ -805,9 +821,6 @@ static int try_to_merge_with_ksm_page(struct rmap_item *rmap_item, ...@@ -805,9 +821,6 @@ static int try_to_merge_with_ksm_page(struct rmap_item *rmap_item,
struct vm_area_struct *vma; struct vm_area_struct *vma;
int err = -EFAULT; int err = -EFAULT;
if (page == kpage) /* ksm page forked */
return 0;
down_read(&mm->mmap_sem); down_read(&mm->mmap_sem);
if (ksm_test_exit(mm)) if (ksm_test_exit(mm))
goto out; goto out;
...@@ -816,6 +829,11 @@ static int try_to_merge_with_ksm_page(struct rmap_item *rmap_item, ...@@ -816,6 +829,11 @@ static int try_to_merge_with_ksm_page(struct rmap_item *rmap_item,
goto out; goto out;
err = try_to_merge_one_page(vma, page, kpage); err = try_to_merge_one_page(vma, page, kpage);
if (err)
goto out;
/* Must get reference to anon_vma while still holding mmap_sem */
hold_anon_vma(rmap_item, vma->anon_vma);
out: out:
up_read(&mm->mmap_sem); up_read(&mm->mmap_sem);
return err; return err;
...@@ -869,6 +887,11 @@ static struct page *try_to_merge_two_pages(struct rmap_item *rmap_item, ...@@ -869,6 +887,11 @@ static struct page *try_to_merge_two_pages(struct rmap_item *rmap_item,
lru_cache_add_lru(kpage, LRU_ACTIVE_ANON); lru_cache_add_lru(kpage, LRU_ACTIVE_ANON);
err = try_to_merge_one_page(vma, page, kpage); err = try_to_merge_one_page(vma, page, kpage);
if (err)
goto up;
/* Must get reference to anon_vma while still holding mmap_sem */
hold_anon_vma(rmap_item, vma->anon_vma);
up: up:
up_read(&mm->mmap_sem); up_read(&mm->mmap_sem);
...@@ -879,9 +902,11 @@ up: ...@@ -879,9 +902,11 @@ up:
* If that fails, we have a ksm page with only one pte * If that fails, we have a ksm page with only one pte
* pointing to it: so break it. * pointing to it: so break it.
*/ */
if (err) if (err) {
drop_anon_vma(rmap_item);
break_cow(rmap_item); break_cow(rmap_item);
} }
}
if (err) { if (err) {
put_page(kpage); put_page(kpage);
kpage = NULL; kpage = NULL;
...@@ -1155,7 +1180,9 @@ static void cmp_and_merge_page(struct page *page, struct rmap_item *rmap_item) ...@@ -1155,7 +1180,9 @@ static void cmp_and_merge_page(struct page *page, struct rmap_item *rmap_item)
* in which case we need to break_cow on both. * in which case we need to break_cow on both.
*/ */
if (!stable_node) { if (!stable_node) {
drop_anon_vma(tree_rmap_item);
break_cow(tree_rmap_item); break_cow(tree_rmap_item);
drop_anon_vma(rmap_item);
break_cow(rmap_item); break_cow(rmap_item);
} }
} }
...@@ -1490,7 +1517,7 @@ int page_referenced_ksm(struct page *page, struct mem_cgroup *memcg, ...@@ -1490,7 +1517,7 @@ int page_referenced_ksm(struct page *page, struct mem_cgroup *memcg,
struct hlist_node *hlist; struct hlist_node *hlist;
unsigned int mapcount = page_mapcount(page); unsigned int mapcount = page_mapcount(page);
int referenced = 0; int referenced = 0;
struct vm_area_struct *vma; int search_new_forks = 0;
VM_BUG_ON(!PageKsm(page)); VM_BUG_ON(!PageKsm(page));
VM_BUG_ON(!PageLocked(page)); VM_BUG_ON(!PageLocked(page));
...@@ -1498,36 +1525,40 @@ int page_referenced_ksm(struct page *page, struct mem_cgroup *memcg, ...@@ -1498,36 +1525,40 @@ int page_referenced_ksm(struct page *page, struct mem_cgroup *memcg,
stable_node = page_stable_node(page); stable_node = page_stable_node(page);
if (!stable_node) if (!stable_node)
return 0; return 0;
again:
hlist_for_each_entry(rmap_item, hlist, &stable_node->hlist, hlist) {
struct anon_vma *anon_vma = rmap_item->anon_vma;
struct vm_area_struct *vma;
spin_lock(&anon_vma->lock);
list_for_each_entry(vma, &anon_vma->head, anon_vma_node) {
if (rmap_item->address < vma->vm_start ||
rmap_item->address >= vma->vm_end)
continue;
/* /*
* Temporary hack: really we need anon_vma in rmap_item, to * Initially we examine only the vma which covers this
* provide the correct vma, and to find recently forked instances. * rmap_item; but later, if there is still work to do,
* Use zalloc to avoid weirdness if any other fields are involved. * we examine covering vmas in other mms: in case they
* were forked from the original since ksmd passed.
*/ */
vma = kmem_cache_zalloc(vm_area_cachep, GFP_ATOMIC); if ((rmap_item->mm == vma->vm_mm) == search_new_forks)
if (!vma) {
spin_lock(&ksm_fallback_vma_lock);
vma = &ksm_fallback_vma;
}
hlist_for_each_entry(rmap_item, hlist, &stable_node->hlist, hlist) {
if (memcg && !mm_match_cgroup(rmap_item->mm, memcg))
continue; continue;
vma->vm_mm = rmap_item->mm; if (memcg && !mm_match_cgroup(vma->vm_mm, memcg))
vma->vm_start = rmap_item->address; continue;
vma->vm_end = vma->vm_start + PAGE_SIZE;
referenced += page_referenced_one(page, vma, referenced += page_referenced_one(page, vma,
rmap_item->address, &mapcount, vm_flags); rmap_item->address, &mapcount, vm_flags);
if (!search_new_forks || !mapcount)
break;
}
spin_unlock(&anon_vma->lock);
if (!mapcount) if (!mapcount)
goto out; goto out;
} }
if (!search_new_forks++)
goto again;
out: out:
if (vma == &ksm_fallback_vma)
spin_unlock(&ksm_fallback_vma_lock);
else
kmem_cache_free(vm_area_cachep, vma);
return referenced; return referenced;
} }
...@@ -1537,7 +1568,7 @@ int try_to_unmap_ksm(struct page *page, enum ttu_flags flags) ...@@ -1537,7 +1568,7 @@ int try_to_unmap_ksm(struct page *page, enum ttu_flags flags)
struct hlist_node *hlist; struct hlist_node *hlist;
struct rmap_item *rmap_item; struct rmap_item *rmap_item;
int ret = SWAP_AGAIN; int ret = SWAP_AGAIN;
struct vm_area_struct *vma; int search_new_forks = 0;
VM_BUG_ON(!PageKsm(page)); VM_BUG_ON(!PageKsm(page));
VM_BUG_ON(!PageLocked(page)); VM_BUG_ON(!PageLocked(page));
...@@ -1545,35 +1576,37 @@ int try_to_unmap_ksm(struct page *page, enum ttu_flags flags) ...@@ -1545,35 +1576,37 @@ int try_to_unmap_ksm(struct page *page, enum ttu_flags flags)
stable_node = page_stable_node(page); stable_node = page_stable_node(page);
if (!stable_node) if (!stable_node)
return SWAP_FAIL; return SWAP_FAIL;
again:
hlist_for_each_entry(rmap_item, hlist, &stable_node->hlist, hlist) {
struct anon_vma *anon_vma = rmap_item->anon_vma;
struct vm_area_struct *vma;
spin_lock(&anon_vma->lock);
list_for_each_entry(vma, &anon_vma->head, anon_vma_node) {
if (rmap_item->address < vma->vm_start ||
rmap_item->address >= vma->vm_end)
continue;
/* /*
* Temporary hack: really we need anon_vma in rmap_item, to * Initially we examine only the vma which covers this
* provide the correct vma, and to find recently forked instances. * rmap_item; but later, if there is still work to do,
* Use zalloc to avoid weirdness if any other fields are involved. * we examine covering vmas in other mms: in case they
* were forked from the original since ksmd passed.
*/ */
if (TTU_ACTION(flags) != TTU_UNMAP) if ((rmap_item->mm == vma->vm_mm) == search_new_forks)
return SWAP_FAIL; continue;
vma = kmem_cache_zalloc(vm_area_cachep, GFP_ATOMIC);
if (!vma) {
spin_lock(&ksm_fallback_vma_lock);
vma = &ksm_fallback_vma;
}
hlist_for_each_entry(rmap_item, hlist, &stable_node->hlist, hlist) {
vma->vm_mm = rmap_item->mm;
vma->vm_start = rmap_item->address;
vma->vm_end = vma->vm_start + PAGE_SIZE;
ret = try_to_unmap_one(page, vma, rmap_item->address, flags); ret = try_to_unmap_one(page, vma,
if (ret != SWAP_AGAIN || !page_mapped(page)) rmap_item->address, flags);
if (ret != SWAP_AGAIN || !page_mapped(page)) {
spin_unlock(&anon_vma->lock);
goto out; goto out;
} }
}
spin_unlock(&anon_vma->lock);
}
if (!search_new_forks++)
goto again;
out: out:
if (vma == &ksm_fallback_vma)
spin_unlock(&ksm_fallback_vma_lock);
else
kmem_cache_free(vm_area_cachep, vma);
return ret; return ret;
} }
......
...@@ -68,7 +68,7 @@ static inline struct anon_vma *anon_vma_alloc(void) ...@@ -68,7 +68,7 @@ static inline struct anon_vma *anon_vma_alloc(void)
return kmem_cache_alloc(anon_vma_cachep, GFP_KERNEL); return kmem_cache_alloc(anon_vma_cachep, GFP_KERNEL);
} }
static inline void anon_vma_free(struct anon_vma *anon_vma) void anon_vma_free(struct anon_vma *anon_vma)
{ {
kmem_cache_free(anon_vma_cachep, anon_vma); kmem_cache_free(anon_vma_cachep, anon_vma);
} }
...@@ -172,7 +172,7 @@ void anon_vma_unlink(struct vm_area_struct *vma) ...@@ -172,7 +172,7 @@ void anon_vma_unlink(struct vm_area_struct *vma)
list_del(&vma->anon_vma_node); list_del(&vma->anon_vma_node);
/* We must garbage collect the anon_vma if it's empty */ /* We must garbage collect the anon_vma if it's empty */
empty = list_empty(&anon_vma->head); empty = list_empty(&anon_vma->head) && !ksm_refcount(anon_vma);
spin_unlock(&anon_vma->lock); spin_unlock(&anon_vma->lock);
if (empty) if (empty)
...@@ -184,6 +184,7 @@ static void anon_vma_ctor(void *data) ...@@ -184,6 +184,7 @@ static void anon_vma_ctor(void *data)
struct anon_vma *anon_vma = data; struct anon_vma *anon_vma = data;
spin_lock_init(&anon_vma->lock); spin_lock_init(&anon_vma->lock);
ksm_refcount_init(anon_vma);
INIT_LIST_HEAD(&anon_vma->head); INIT_LIST_HEAD(&anon_vma->head);
} }
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment