Commit 705e87c0, authored by Hugh Dickins, committed by Linus Torvalds

[PATCH] mm: pte_offset_map_lock loops

Convert those common loops using page_table_lock on the outside and
pte_offset_map within to use just pte_offset_map_lock within instead.

These all hold mmap_sem (some exclusively, some not), so at no level can a
page table be whipped away from beneath them.  But whereas pte_alloc loops
tested with the "atomic" pmd_present, these loops are testing with pmd_none,
which on i386 PAE tests both lower and upper halves.

That's now unsafe, so add a cast into pmd_none to test only the vital lower
half: we lose a little sensitivity to a corrupt middle directory, but not
enough to worry about.  It appears that i386 and UML were the only
architectures vulnerable in this way, and that pgd and pud pose no problem.
Signed-off-by: Hugh Dickins <hugh@veritas.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
parent 8f4e2101
...@@ -203,13 +203,14 @@ static void smaps_pte_range(struct vm_area_struct *vma, pmd_t *pmd, ...@@ -203,13 +203,14 @@ static void smaps_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
struct mem_size_stats *mss) struct mem_size_stats *mss)
{ {
pte_t *pte, ptent; pte_t *pte, ptent;
spinlock_t *ptl;
unsigned long pfn; unsigned long pfn;
struct page *page; struct page *page;
pte = pte_offset_map(pmd, addr); pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
do { do {
ptent = *pte; ptent = *pte;
if (pte_none(ptent) || !pte_present(ptent)) if (!pte_present(ptent))
continue; continue;
mss->resident += PAGE_SIZE; mss->resident += PAGE_SIZE;
...@@ -230,8 +231,8 @@ static void smaps_pte_range(struct vm_area_struct *vma, pmd_t *pmd, ...@@ -230,8 +231,8 @@ static void smaps_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
mss->private_clean += PAGE_SIZE; mss->private_clean += PAGE_SIZE;
} }
} while (pte++, addr += PAGE_SIZE, addr != end); } while (pte++, addr += PAGE_SIZE, addr != end);
pte_unmap(pte - 1); pte_unmap_unlock(pte - 1, ptl);
cond_resched_lock(&vma->vm_mm->page_table_lock); cond_resched();
} }
static inline void smaps_pmd_range(struct vm_area_struct *vma, pud_t *pud, static inline void smaps_pmd_range(struct vm_area_struct *vma, pud_t *pud,
...@@ -285,17 +286,11 @@ static inline void smaps_pgd_range(struct vm_area_struct *vma, ...@@ -285,17 +286,11 @@ static inline void smaps_pgd_range(struct vm_area_struct *vma,
static int show_smap(struct seq_file *m, void *v) static int show_smap(struct seq_file *m, void *v)
{ {
struct vm_area_struct *vma = v; struct vm_area_struct *vma = v;
struct mm_struct *mm = vma->vm_mm;
struct mem_size_stats mss; struct mem_size_stats mss;
memset(&mss, 0, sizeof mss); memset(&mss, 0, sizeof mss);
if (vma->vm_mm)
if (mm) {
spin_lock(&mm->page_table_lock);
smaps_pgd_range(vma, vma->vm_start, vma->vm_end, &mss); smaps_pgd_range(vma, vma->vm_start, vma->vm_end, &mss);
spin_unlock(&mm->page_table_lock);
}
return show_map_internal(m, v, &mss); return show_map_internal(m, v, &mss);
} }
......
...@@ -203,7 +203,8 @@ extern unsigned long pg0[]; ...@@ -203,7 +203,8 @@ extern unsigned long pg0[];
#define pte_present(x) ((x).pte_low & (_PAGE_PRESENT | _PAGE_PROTNONE)) #define pte_present(x) ((x).pte_low & (_PAGE_PRESENT | _PAGE_PROTNONE))
#define pte_clear(mm,addr,xp) do { set_pte_at(mm, addr, xp, __pte(0)); } while (0) #define pte_clear(mm,addr,xp) do { set_pte_at(mm, addr, xp, __pte(0)); } while (0)
#define pmd_none(x) (!pmd_val(x)) /* To avoid harmful races, pmd_none(x) should check only the lower when PAE */
#define pmd_none(x) (!(unsigned long)pmd_val(x))
#define pmd_present(x) (pmd_val(x) & _PAGE_PRESENT) #define pmd_present(x) (pmd_val(x) & _PAGE_PRESENT)
#define pmd_clear(xp) do { set_pmd(xp, __pmd(0)); } while (0) #define pmd_clear(xp) do { set_pmd(xp, __pmd(0)); } while (0)
#define pmd_bad(x) ((pmd_val(x) & (~PAGE_MASK & ~_PAGE_USER)) != _KERNPG_TABLE) #define pmd_bad(x) ((pmd_val(x) & (~PAGE_MASK & ~_PAGE_USER)) != _KERNPG_TABLE)
......
...@@ -138,7 +138,7 @@ extern unsigned long pg0[1024]; ...@@ -138,7 +138,7 @@ extern unsigned long pg0[1024];
#define pte_clear(mm,addr,xp) pte_set_val(*(xp), (phys_t) 0, __pgprot(_PAGE_NEWPAGE)) #define pte_clear(mm,addr,xp) pte_set_val(*(xp), (phys_t) 0, __pgprot(_PAGE_NEWPAGE))
#define pmd_none(x) (!(pmd_val(x) & ~_PAGE_NEWPAGE)) #define pmd_none(x) (!((unsigned long)pmd_val(x) & ~_PAGE_NEWPAGE))
#define pmd_bad(x) ((pmd_val(x) & (~PAGE_MASK & ~_PAGE_USER)) != _KERNPG_TABLE) #define pmd_bad(x) ((pmd_val(x) & (~PAGE_MASK & ~_PAGE_USER)) != _KERNPG_TABLE)
#define pmd_present(x) (pmd_val(x) & _PAGE_PRESENT) #define pmd_present(x) (pmd_val(x) & _PAGE_PRESENT)
#define pmd_clear(xp) do { pmd_val(*(xp)) = _PAGE_NEWPAGE; } while (0) #define pmd_clear(xp) do { pmd_val(*(xp)) = _PAGE_NEWPAGE; } while (0)
......
...@@ -228,9 +228,9 @@ static int check_pte_range(struct vm_area_struct *vma, pmd_t *pmd, ...@@ -228,9 +228,9 @@ static int check_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
{ {
pte_t *orig_pte; pte_t *orig_pte;
pte_t *pte; pte_t *pte;
spinlock_t *ptl;
spin_lock(&vma->vm_mm->page_table_lock); orig_pte = pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
orig_pte = pte = pte_offset_map(pmd, addr);
do { do {
unsigned long pfn; unsigned long pfn;
unsigned int nid; unsigned int nid;
...@@ -246,8 +246,7 @@ static int check_pte_range(struct vm_area_struct *vma, pmd_t *pmd, ...@@ -246,8 +246,7 @@ static int check_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
if (!node_isset(nid, *nodes)) if (!node_isset(nid, *nodes))
break; break;
} while (pte++, addr += PAGE_SIZE, addr != end); } while (pte++, addr += PAGE_SIZE, addr != end);
pte_unmap(orig_pte); pte_unmap_unlock(orig_pte, ptl);
spin_unlock(&vma->vm_mm->page_table_lock);
return addr != end; return addr != end;
} }
......
...@@ -29,8 +29,9 @@ static void change_pte_range(struct mm_struct *mm, pmd_t *pmd, ...@@ -29,8 +29,9 @@ static void change_pte_range(struct mm_struct *mm, pmd_t *pmd,
unsigned long addr, unsigned long end, pgprot_t newprot) unsigned long addr, unsigned long end, pgprot_t newprot)
{ {
pte_t *pte; pte_t *pte;
spinlock_t *ptl;
pte = pte_offset_map(pmd, addr); pte = pte_offset_map_lock(mm, pmd, addr, &ptl);
do { do {
if (pte_present(*pte)) { if (pte_present(*pte)) {
pte_t ptent; pte_t ptent;
...@@ -44,7 +45,7 @@ static void change_pte_range(struct mm_struct *mm, pmd_t *pmd, ...@@ -44,7 +45,7 @@ static void change_pte_range(struct mm_struct *mm, pmd_t *pmd,
lazy_mmu_prot_update(ptent); lazy_mmu_prot_update(ptent);
} }
} while (pte++, addr += PAGE_SIZE, addr != end); } while (pte++, addr += PAGE_SIZE, addr != end);
pte_unmap(pte - 1); pte_unmap_unlock(pte - 1, ptl);
} }
static inline void change_pmd_range(struct mm_struct *mm, pud_t *pud, static inline void change_pmd_range(struct mm_struct *mm, pud_t *pud,
...@@ -88,7 +89,6 @@ static void change_protection(struct vm_area_struct *vma, ...@@ -88,7 +89,6 @@ static void change_protection(struct vm_area_struct *vma,
BUG_ON(addr >= end); BUG_ON(addr >= end);
pgd = pgd_offset(mm, addr); pgd = pgd_offset(mm, addr);
flush_cache_range(vma, addr, end); flush_cache_range(vma, addr, end);
spin_lock(&mm->page_table_lock);
do { do {
next = pgd_addr_end(addr, end); next = pgd_addr_end(addr, end);
if (pgd_none_or_clear_bad(pgd)) if (pgd_none_or_clear_bad(pgd))
...@@ -96,7 +96,6 @@ static void change_protection(struct vm_area_struct *vma, ...@@ -96,7 +96,6 @@ static void change_protection(struct vm_area_struct *vma,
change_pud_range(mm, pgd, addr, next, newprot); change_pud_range(mm, pgd, addr, next, newprot);
} while (pgd++, addr = next, addr != end); } while (pgd++, addr = next, addr != end);
flush_tlb_range(vma, start, end); flush_tlb_range(vma, start, end);
spin_unlock(&mm->page_table_lock);
} }
static int static int
......
...@@ -17,28 +17,22 @@ ...@@ -17,28 +17,22 @@
#include <asm/pgtable.h> #include <asm/pgtable.h>
#include <asm/tlbflush.h> #include <asm/tlbflush.h>
/*
* Called with mm->page_table_lock held to protect against other
* threads/the swapper from ripping pte's out from under us.
*/
static void msync_pte_range(struct vm_area_struct *vma, pmd_t *pmd, static void msync_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
unsigned long addr, unsigned long end) unsigned long addr, unsigned long end)
{ {
struct mm_struct *mm = vma->vm_mm;
pte_t *pte; pte_t *pte;
spinlock_t *ptl;
int progress = 0; int progress = 0;
again: again:
pte = pte_offset_map(pmd, addr); pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
do { do {
unsigned long pfn; unsigned long pfn;
struct page *page; struct page *page;
if (progress >= 64) { if (progress >= 64) {
progress = 0; progress = 0;
if (need_resched() || if (need_resched() || need_lockbreak(ptl))
need_lockbreak(&mm->page_table_lock))
break; break;
} }
progress++; progress++;
...@@ -58,8 +52,8 @@ again: ...@@ -58,8 +52,8 @@ again:
set_page_dirty(page); set_page_dirty(page);
progress += 3; progress += 3;
} while (pte++, addr += PAGE_SIZE, addr != end); } while (pte++, addr += PAGE_SIZE, addr != end);
pte_unmap(pte - 1); pte_unmap_unlock(pte - 1, ptl);
cond_resched_lock(&mm->page_table_lock); cond_resched();
if (addr != end) if (addr != end)
goto again; goto again;
} }
...@@ -97,7 +91,6 @@ static inline void msync_pud_range(struct vm_area_struct *vma, pgd_t *pgd, ...@@ -97,7 +91,6 @@ static inline void msync_pud_range(struct vm_area_struct *vma, pgd_t *pgd,
static void msync_page_range(struct vm_area_struct *vma, static void msync_page_range(struct vm_area_struct *vma,
unsigned long addr, unsigned long end) unsigned long addr, unsigned long end)
{ {
struct mm_struct *mm = vma->vm_mm;
pgd_t *pgd; pgd_t *pgd;
unsigned long next; unsigned long next;
...@@ -110,16 +103,14 @@ static void msync_page_range(struct vm_area_struct *vma, ...@@ -110,16 +103,14 @@ static void msync_page_range(struct vm_area_struct *vma,
return; return;
BUG_ON(addr >= end); BUG_ON(addr >= end);
pgd = pgd_offset(mm, addr); pgd = pgd_offset(vma->vm_mm, addr);
flush_cache_range(vma, addr, end); flush_cache_range(vma, addr, end);
spin_lock(&mm->page_table_lock);
do { do {
next = pgd_addr_end(addr, end); next = pgd_addr_end(addr, end);
if (pgd_none_or_clear_bad(pgd)) if (pgd_none_or_clear_bad(pgd))
continue; continue;
msync_pud_range(vma, pgd, addr, next); msync_pud_range(vma, pgd, addr, next);
} while (pgd++, addr = next, addr != end); } while (pgd++, addr = next, addr != end);
spin_unlock(&mm->page_table_lock);
} }
/* /*
......
...@@ -401,8 +401,6 @@ void free_swap_and_cache(swp_entry_t entry) ...@@ -401,8 +401,6 @@ void free_swap_and_cache(swp_entry_t entry)
* No need to decide whether this PTE shares the swap entry with others, * No need to decide whether this PTE shares the swap entry with others,
* just let do_wp_page work it out if a write is requested later - to * just let do_wp_page work it out if a write is requested later - to
* force COW, vm_page_prot omits write permission from any private vma. * force COW, vm_page_prot omits write permission from any private vma.
*
* vma->vm_mm->page_table_lock is held.
*/ */
static void unuse_pte(struct vm_area_struct *vma, pte_t *pte, static void unuse_pte(struct vm_area_struct *vma, pte_t *pte,
unsigned long addr, swp_entry_t entry, struct page *page) unsigned long addr, swp_entry_t entry, struct page *page)
...@@ -424,23 +422,25 @@ static int unuse_pte_range(struct vm_area_struct *vma, pmd_t *pmd, ...@@ -424,23 +422,25 @@ static int unuse_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
unsigned long addr, unsigned long end, unsigned long addr, unsigned long end,
swp_entry_t entry, struct page *page) swp_entry_t entry, struct page *page)
{ {
pte_t *pte;
pte_t swp_pte = swp_entry_to_pte(entry); pte_t swp_pte = swp_entry_to_pte(entry);
pte_t *pte;
spinlock_t *ptl;
int found = 0;
pte = pte_offset_map(pmd, addr); pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
do { do {
/* /*
* swapoff spends a _lot_ of time in this loop! * swapoff spends a _lot_ of time in this loop!
* Test inline before going to call unuse_pte. * Test inline before going to call unuse_pte.
*/ */
if (unlikely(pte_same(*pte, swp_pte))) { if (unlikely(pte_same(*pte, swp_pte))) {
unuse_pte(vma, pte, addr, entry, page); unuse_pte(vma, pte++, addr, entry, page);
pte_unmap(pte); found = 1;
return 1; break;
} }
} while (pte++, addr += PAGE_SIZE, addr != end); } while (pte++, addr += PAGE_SIZE, addr != end);
pte_unmap(pte - 1); pte_unmap_unlock(pte - 1, ptl);
return 0; return found;
} }
static inline int unuse_pmd_range(struct vm_area_struct *vma, pud_t *pud, static inline int unuse_pmd_range(struct vm_area_struct *vma, pud_t *pud,
...@@ -522,12 +522,10 @@ static int unuse_mm(struct mm_struct *mm, ...@@ -522,12 +522,10 @@ static int unuse_mm(struct mm_struct *mm,
down_read(&mm->mmap_sem); down_read(&mm->mmap_sem);
lock_page(page); lock_page(page);
} }
spin_lock(&mm->page_table_lock);
for (vma = mm->mmap; vma; vma = vma->vm_next) { for (vma = mm->mmap; vma; vma = vma->vm_next) {
if (vma->anon_vma && unuse_vma(vma, entry, page)) if (vma->anon_vma && unuse_vma(vma, entry, page))
break; break;
} }
spin_unlock(&mm->page_table_lock);
up_read(&mm->mmap_sem); up_read(&mm->mmap_sem);
/* /*
* Currently unuse_mm cannot fail, but leave error handling * Currently unuse_mm cannot fail, but leave error handling
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment