Commit 570a335b authored by Hugh Dickins, committed by Linus Torvalds

swap_info: swap count continuations

Swap is duplicated (reference count incremented by one) whenever the same
swap page is inserted into another mm (when forking finds a swap entry in
place of a pte, or when reclaim unmaps a pte to insert the swap entry).

swap_info_struct's vmalloc'ed swap_map is the array of these reference
counts: but what happens when the unsigned short (or unsigned char since
the preceding patch) is full? (Its high bit is already reserved for a
cache flag, so the usable count is smaller still.)

We then lose track of it, never freeing it, leaving it in use until
swapoff: at which point we _hope_ that a single pass will have found all
instances, assume there are no more, and lose user data if we're wrong.

Swapping of KSM pages has not yet been enabled; but it is implemented,
and makes it very easy for a user to overflow the maximum swap count:
possible with ordinary process pages, but unlikely, even when pid_max
has been raised from PID_MAX_DEFAULT.

This patch implements swap count continuations: when the count overflows,
a continuation page is allocated and linked to the original vmalloc'ed
map page, and this is used to hold the continuation counts for that entry
and its neighbours.  These continuation pages are seldom referenced:
the common paths all work on the original swap_map, only referring to
a continuation page when the low "digit" of a count is incremented or
decremented through SWAP_MAP_MAX.
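
To make the scheme concrete, here is a minimal userspace model of the digit
arithmetic (illustrative only: struct toy_count, toy_inc and NDIGITS are
inventions of this note; the kernel instead keeps each extra digit at the
same offset within a chain of continuation pages, flagging bytes with
COUNT_CONTINUED rather than storing plain positional digits):

#include <stdbool.h>
#include <stdio.h>

#define SWAP_MAP_MAX	0x3e	/* max low "digit", held in swap_map itself */
#define SWAP_CONT_MAX	0x7f	/* max digit held in a continuation byte */
#define NDIGITS		2	/* stands in for the chain of continuation pages */

struct toy_count {
	unsigned char map;		/* models swap_map[offset] */
	unsigned char cont[NDIGITS];	/* higher digits, lowest first */
};

/* Increment; false means yet another continuation "page" would be needed. */
static bool toy_inc(struct toy_count *c)
{
	int i;

	if (c->map < SWAP_MAP_MAX) {
		c->map++;		/* the common, continuation-free case */
		return true;
	}
	/* low digit saturated: add 1 to the higher digits, base 0x80 */
	for (i = 0; i < NDIGITS && c->cont[i] == SWAP_CONT_MAX; i++)
		;
	if (i == NDIGITS)
		return false;		/* add_swap_count_continuation() time */
	c->cont[i]++;
	while (--i >= 0)
		c->cont[i] = 0;		/* digits below the carry wrap to zero */
	return true;
}

int main(void)
{
	struct toy_count c = { 0 };
	long n = 0;

	while (toy_inc(&c))
		n++;
	printf("%ld increments before another continuation is needed\n", n);
	return 0;
}

With two continuation digits this prints 16445, i.e. 0x3e + 0x80*0x80 - 1:
the common paths touch only the first byte, and each continuation level
multiplies the representable count by 128.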
Signed-off-by: Hugh Dickins <hugh.dickins@tiscali.co.uk>
Cc: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Cc: Rik van Riel <riel@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
parent 8d69aaee
--- a/include/linux/swap.h
+++ b/include/linux/swap.h
@@ -145,15 +145,18 @@ enum {
 	SWP_DISCARDABLE = (1 << 2),	/* blkdev supports discard */
 	SWP_DISCARDING	= (1 << 3),	/* now discarding a free cluster */
 	SWP_SOLIDSTATE	= (1 << 4),	/* blkdev seeks are cheap */
+	SWP_CONTINUED	= (1 << 5),	/* swap_map has count continuation */
 					/* add others here before... */
 	SWP_SCANNING	= (1 << 8),	/* refcount in scan_swap_map */
 };
 
 #define SWAP_CLUSTER_MAX 32
 
-#define SWAP_MAP_MAX	0x7e
-#define SWAP_MAP_BAD	0x7f
-#define SWAP_HAS_CACHE	0x80		/* There is a swap cache of entry. */
+#define SWAP_MAP_MAX	0x3e	/* Max duplication count, in first swap_map */
+#define SWAP_MAP_BAD	0x3f	/* Note pageblock is bad, in first swap_map */
+#define SWAP_HAS_CACHE	0x40	/* Flag page is cached, in first swap_map */
+#define SWAP_CONT_MAX	0x7f	/* Max count, in each swap_map continuation */
+#define COUNT_CONTINUED	0x80	/* See swap_map continuation for full count */
 
 /*
  * The in-memory structure used to track swap areas.
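
Read the new defines as a bit layout for each first-level swap_map byte:
bits 0-5 hold the duplication count, with 0x3f reserved to mark a bad
pageblock; bit 6 is SWAP_HAS_CACHE; bit 7 is COUNT_CONTINUED. A hypothetical
decoder, just to make the layout concrete (describe_map_byte is not part of
the patch, and assumes the defines above plus <stdio.h>):

static void describe_map_byte(unsigned char ent)
{
	unsigned char count = ent & ~(SWAP_HAS_CACHE | COUNT_CONTINUED);

	if (count == SWAP_MAP_BAD) {		/* never set on real entries */
		printf("bad pageblock\n");
		return;
	}
	printf("count=%u%s%s\n", count,
	       (ent & SWAP_HAS_CACHE) ? ", page in swap cache" : "",
	       (ent & COUNT_CONTINUED) ? ", more count in continuation" : "");
}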
@@ -311,9 +314,10 @@ extern long total_swap_pages;
 extern void si_swapinfo(struct sysinfo *);
 extern swp_entry_t get_swap_page(void);
 extern swp_entry_t get_swap_page_of_type(int);
-extern void swap_duplicate(swp_entry_t);
-extern int swapcache_prepare(swp_entry_t);
 extern int valid_swaphandles(swp_entry_t, unsigned long *);
+extern int add_swap_count_continuation(swp_entry_t, gfp_t);
+extern int swap_duplicate(swp_entry_t);
+extern int swapcache_prepare(swp_entry_t);
 extern void swap_free(swp_entry_t);
 extern void swapcache_free(swp_entry_t, struct page *page);
 extern int free_swap_and_cache(swp_entry_t);
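
The changed prototypes carry the new contract: swap_duplicate() now returns
0 on success, or -ENOMEM when the count overflows and a continuation page
could not be allocated on the spot; a caller who is allowed to sleep can
then provide one via add_swap_count_continuation() with GFP_KERNEL and
retry. A hypothetical caller spelling that out (take_swap_reference is
illustrative, not from the patch; the real in-tree users follow in
mm/memory.c and mm/rmap.c below):

static int take_swap_reference(swp_entry_t entry)
{
	int err = swap_duplicate(entry);

	if (err == -ENOMEM) {
		/* only safe in contexts where GFP_KERNEL may sleep */
		err = add_swap_count_continuation(entry, GFP_KERNEL);
		if (!err)
			err = swap_duplicate(entry);	/* retry, room now */
	}
	return err;
}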
@@ -385,8 +389,14 @@ static inline void show_swap_cache_info(void)
 #define free_swap_and_cache(swp)	is_migration_entry(swp)
 #define swapcache_prepare(swp)		is_migration_entry(swp)
 
-static inline void swap_duplicate(swp_entry_t swp)
+static inline int add_swap_count_continuation(swp_entry_t swp, gfp_t gfp_mask)
+{
+	return 0;
+}
+
+static inline int swap_duplicate(swp_entry_t swp)
 {
+	return 0;
 }
 
 static inline void swap_free(swp_entry_t swp)
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -572,7 +572,7 @@ out:
  * covered by this vma.
  */
 
-static inline void
+static inline unsigned long
 copy_one_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm,
 		pte_t *dst_pte, pte_t *src_pte, struct vm_area_struct *vma,
 		unsigned long addr, int *rss)
@@ -586,7 +586,9 @@ copy_one_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm,
 		if (!pte_file(pte)) {
 			swp_entry_t entry = pte_to_swp_entry(pte);
 
-			swap_duplicate(entry);
+			if (swap_duplicate(entry) < 0)
+				return entry.val;
+
 			/* make sure dst_mm is on swapoff's mmlist. */
 			if (unlikely(list_empty(&dst_mm->mmlist))) {
 				spin_lock(&mmlist_lock);
@@ -635,6 +637,7 @@ copy_one_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm,
 
 out_set_pte:
 	set_pte_at(dst_mm, addr, dst_pte, pte);
+	return 0;
 }
 
 static int copy_pte_range(struct mm_struct *dst_mm, struct mm_struct *src_mm,
@@ -646,6 +649,7 @@ static int copy_pte_range(struct mm_struct *dst_mm, struct mm_struct *src_mm,
 	spinlock_t *src_ptl, *dst_ptl;
 	int progress = 0;
 	int rss[2];
+	swp_entry_t entry = (swp_entry_t){0};
 
 again:
 	rss[1] = rss[0] = 0;
@@ -674,7 +678,10 @@ again:
 			progress++;
 			continue;
 		}
-		copy_one_pte(dst_mm, src_mm, dst_pte, src_pte, vma, addr, rss);
+		entry.val = copy_one_pte(dst_mm, src_mm, dst_pte, src_pte,
+							vma, addr, rss);
+		if (entry.val)
+			break;
 		progress += 8;
 	} while (dst_pte++, src_pte++, addr += PAGE_SIZE, addr != end);
@@ -684,6 +691,12 @@ again:
 	add_mm_rss(dst_mm, rss[0], rss[1]);
 	pte_unmap_unlock(orig_dst_pte, dst_ptl);
 	cond_resched();
+
+	if (entry.val) {
+		if (add_swap_count_continuation(entry, GFP_KERNEL) < 0)
+			return -ENOMEM;
+		progress = 0;
+	}
 	if (addr != end)
 		goto again;
 	return 0;
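
The shape of the copy_pte_range() change above is the usual "cannot sleep
under a spinlock" dance: copy_one_pte() runs under both page-table locks,
so on overflow it only reports the offending swap entry back; the caller
drops the locks, allocates the continuation with GFP_KERNEL, zeroes
progress and loops back to resume at the same address. A runnable toy of
that structure, with all names invented here and a mutex standing in for
the pte lock:

#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>

static pthread_mutex_t ptl = PTHREAD_MUTEX_INITIALIZER;	/* "pte lock" */
static int need_continuation = 1;	/* pretend item 5 overflows its count */

static int copy_range(int nitems)
{
	int done = 0;
	int pending;
again:
	pending = 0;
	pthread_mutex_lock(&ptl);
	while (done < nitems) {
		if (done == 5 && need_continuation) {
			pending = 1;	/* must not allocate under the lock */
			break;
		}
		done++;			/* the common, lock-held fast path */
	}
	pthread_mutex_unlock(&ptl);

	if (pending) {
		/* stands in for add_swap_count_continuation(entry, GFP_KERNEL) */
		void *cont = malloc(4096);

		if (!cont)
			return -1;	/* -ENOMEM */
		free(cont);		/* toy: pretend the page was linked in */
		need_continuation = 0;
	}
	if (done < nitems)
		goto again;		/* resume where we broke off */
	return 0;
}

int main(void)
{
	printf("copy_range: %s\n", copy_range(10) == 0 ? "ok" : "ENOMEM");
	return 0;
}

In try_to_unmap_one() below there is no such retry point: reclaim simply
puts the original pte back and returns SWAP_FAIL, leaving the page for a
later pass.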
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -822,7 +822,11 @@ static int try_to_unmap_one(struct page *page, struct vm_area_struct *vma,
 			 * Store the swap location in the pte.
 			 * See handle_pte_fault() ...
 			 */
-			swap_duplicate(entry);
+			if (swap_duplicate(entry) < 0) {
+				set_pte_at(mm, address, pte, pteval);
+				ret = SWAP_FAIL;
+				goto out_unmap;
+			}
 			if (list_empty(&mm->mmlist)) {
 				spin_lock(&mmlist_lock);
 				if (list_empty(&mm->mmlist))