Commit 6aab341e authored by Linus Torvalds

mm: re-architect the VM_UNPAGED logic

This replaces the (in my opinion horrible) VM_UNPAGED logic with very
explicit support for a "remapped page range" aka VM_PFNMAP.  It allows a
VM area to contain an arbitrary range of page table entries that the VM
never touches, and never considers to be normal pages.
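
To make the rule concrete, here is a minimal sketch of the pattern the converted call sites in this patch follow when walking page tables; touch_normal_pages() is a hypothetical helper invented for this illustration, not part of the patch:

#include <linux/mm.h>
#include <linux/swap.h>

/*
 * Hypothetical walker following the new rule: ask vm_normal_page() for
 * the "struct page" behind a pte, and skip the entry entirely when it
 * returns NULL (a raw-PFN entry in a VM_PFNMAP area, which the VM must
 * never treat as a normal page).
 */
static void touch_normal_pages(struct vm_area_struct *vma, pte_t *pte,
			       unsigned long addr, unsigned long end)
{
	for (; addr != end; pte++, addr += PAGE_SIZE) {
		struct page *page;

		if (!pte_present(*pte))
			continue;
		page = vm_normal_page(vma, addr, *pte);
		if (!page)
			continue;	/* pure PFN: not a normal page */
		/* safe here: rmap, dirty accounting, refcounts all apply */
		mark_page_accessed(page);
	}
}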

Any user of "remap_pfn_range()" automatically gets this new
functionality, and doesn't even have to mark the pages reserved or
indeed mark them any other way.  It just works.  As a side effect, doing
mmap() on /dev/mem works for arbitrary ranges.
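
For illustration, a minimal sketch of a driver mmap handler that relies on this, assuming an imaginary character device whose registers live at MYDEV_PHYS_BASE (both names are invented for the example):

#include <linux/fs.h>
#include <linux/mm.h>

/* Assumed physical base address of the imaginary device's register window. */
#define MYDEV_PHYS_BASE	0xfd000000UL

static int mydev_mmap(struct file *file, struct vm_area_struct *vma)
{
	unsigned long size = vma->vm_end - vma->vm_start;

	/*
	 * remap_pfn_range() sets the whole range up as a VM_PFNMAP
	 * mapping; no PageReserved games or other per-page marking are
	 * needed on the driver side.
	 */
	return remap_pfn_range(vma, vma->vm_start,
			       MYDEV_PHYS_BASE >> PAGE_SHIFT,
			       size, vma->vm_page_prot);
}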

Sparc update from David in the next commit.
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
parent 458af543
@@ -145,8 +145,7 @@ static void dump_vdso_pages(struct vm_area_struct * vma)
 		struct page *pg = virt_to_page(vdso32_kbase +
 					       i*PAGE_SIZE);
 		struct page *upg = (vma && vma->vm_mm) ?
-			follow_page(vma->vm_mm, vma->vm_start +
-					i*PAGE_SIZE, 0)
+			follow_page(vma, vma->vm_start + i*PAGE_SIZE, 0)
 			: NULL;
 		dump_one_vdso_page(pg, upg);
 	}
@@ -157,8 +156,7 @@ static void dump_vdso_pages(struct vm_area_struct * vma)
 		struct page *pg = virt_to_page(vdso64_kbase +
 					       i*PAGE_SIZE);
 		struct page *upg = (vma && vma->vm_mm) ?
-			follow_page(vma->vm_mm, vma->vm_start +
-					i*PAGE_SIZE, 0)
+			follow_page(vma, vma->vm_start + i*PAGE_SIZE, 0)
 			: NULL;
 		dump_one_vdso_page(pg, upg);
 	}
...
@@ -591,7 +591,7 @@ static inline size_t read_zero_pagealigned(char __user * buf, size_t size)
 		if (vma->vm_start > addr || (vma->vm_flags & VM_WRITE) == 0)
 			goto out_up;
-		if (vma->vm_flags & (VM_SHARED | VM_HUGETLB | VM_UNPAGED))
+		if (vma->vm_flags & (VM_SHARED | VM_HUGETLB))
 			break;
 		count = vma->vm_end - addr;
 		if (count > size)
...
@@ -402,12 +402,11 @@ struct numa_maps {
 /*
  * Calculate numa node maps for a vma
  */
-static struct numa_maps *get_numa_maps(const struct vm_area_struct *vma)
+static struct numa_maps *get_numa_maps(struct vm_area_struct *vma)
 {
-	int i;
 	struct page *page;
 	unsigned long vaddr;
-	struct mm_struct *mm = vma->vm_mm;
+	int i;
 	struct numa_maps *md = kmalloc(sizeof(struct numa_maps), GFP_KERNEL);
 	if (!md)
@@ -420,7 +419,7 @@ static struct numa_maps *get_numa_maps(const struct vm_area_struct *vma)
 		md->node[i] =0;
 	for (vaddr = vma->vm_start; vaddr < vma->vm_end; vaddr += PAGE_SIZE) {
-		page = follow_page(mm, vaddr, 0);
+		page = follow_page(vma, vaddr, 0);
 		if (page) {
 			int count = page_mapcount(page);
...
@@ -145,7 +145,7 @@ extern unsigned int kobjsize(const void *objp);
 #define VM_GROWSDOWN	0x00000100	/* general info on the segment */
 #define VM_GROWSUP	0x00000200
 #define VM_SHM		0x00000000	/* Means nothing: delete it later */
-#define VM_UNPAGED	0x00000400	/* Pages managed without map count */
+#define VM_PFNMAP	0x00000400	/* Page-ranges managed without "struct page", just pure PFN */
 #define VM_DENYWRITE	0x00000800	/* ETXTBSY on write attempts.. */
 #define VM_EXECUTABLE	0x00001000
@@ -664,6 +664,7 @@ struct zap_details {
 	unsigned long truncate_count;		/* Compare vm_truncate_count */
 };
+struct page *vm_normal_page(struct vm_area_struct *, unsigned long, pte_t);
 unsigned long zap_page_range(struct vm_area_struct *vma, unsigned long address,
 		unsigned long size, struct zap_details *);
 unsigned long unmap_vmas(struct mmu_gather **tlb,
@@ -953,7 +954,7 @@ unsigned long vmalloc_to_pfn(void *addr);
 int remap_pfn_range(struct vm_area_struct *, unsigned long addr,
 			unsigned long pfn, unsigned long size, pgprot_t);
-struct page *follow_page(struct mm_struct *, unsigned long address,
+struct page *follow_page(struct vm_area_struct *, unsigned long address,
 			unsigned int foll_flags);
 #define FOLL_WRITE	0x01	/* check pte is writable */
 #define FOLL_TOUCH	0x02	/* mark page accessed */
...
@@ -27,24 +27,20 @@ static int zap_pte(struct mm_struct *mm, struct vm_area_struct *vma,
 	struct page *page = NULL;
 	if (pte_present(pte)) {
-		unsigned long pfn = pte_pfn(pte);
-
-		flush_cache_page(vma, addr, pfn);
+		flush_cache_page(vma, addr, pte_pfn(pte));
 		pte = ptep_clear_flush(vma, addr, ptep);
-		if (unlikely(!pfn_valid(pfn))) {
-			print_bad_pte(vma, pte, addr);
-			goto out;
+		page = vm_normal_page(vma, addr, pte);
+		if (page) {
+			if (pte_dirty(pte))
+				set_page_dirty(page);
+			page_remove_rmap(page);
+			page_cache_release(page);
 		}
-		page = pfn_to_page(pfn);
-		if (pte_dirty(pte))
-			set_page_dirty(page);
-		page_remove_rmap(page);
-		page_cache_release(page);
 	} else {
 		if (!pte_file(pte))
 			free_swap_and_cache(pte_to_swp_entry(pte));
 		pte_clear(mm, addr, ptep);
 	}
-out:
 	return !!page;
 }
@@ -65,8 +61,6 @@ int install_page(struct mm_struct *mm, struct vm_area_struct *vma,
 	pte_t pte_val;
 	spinlock_t *ptl;
-	BUG_ON(vma->vm_flags & VM_UNPAGED);
-
 	pgd = pgd_offset(mm, addr);
 	pud = pud_alloc(mm, pgd, addr);
 	if (!pud)
@@ -122,8 +116,6 @@ int install_file_pte(struct mm_struct *mm, struct vm_area_struct *vma,
 	pte_t pte_val;
 	spinlock_t *ptl;
-	BUG_ON(vma->vm_flags & VM_UNPAGED);
-
 	pgd = pgd_offset(mm, addr);
 	pud = pud_alloc(mm, pgd, addr);
 	if (!pud)
...
@@ -126,7 +126,7 @@ static long madvise_dontneed(struct vm_area_struct * vma,
 			     unsigned long start, unsigned long end)
 {
 	*prev = vma;
-	if (vma->vm_flags & (VM_LOCKED|VM_HUGETLB|VM_UNPAGED))
+	if (vma->vm_flags & (VM_LOCKED|VM_HUGETLB|VM_PFNMAP))
 		return -EINVAL;
 	if (unlikely(vma->vm_flags & VM_NONLINEAR)) {
...
This diff is collapsed.
@@ -189,17 +189,15 @@ static int check_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
 	orig_pte = pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
 	do {
-		unsigned long pfn;
+		struct page *page;
 		unsigned int nid;
 		if (!pte_present(*pte))
 			continue;
-		pfn = pte_pfn(*pte);
-		if (!pfn_valid(pfn)) {
-			print_bad_pte(vma, *pte, addr);
+		page = vm_normal_page(vma, addr, *pte);
+		if (!page)
 			continue;
-		}
-		nid = pfn_to_nid(pfn);
+		nid = page_to_nid(page);
 		if (!node_isset(nid, *nodes))
 			break;
 	} while (pte++, addr += PAGE_SIZE, addr != end);
@@ -269,8 +267,6 @@ check_range(struct mm_struct *mm, unsigned long start, unsigned long end,
 	first = find_vma(mm, start);
 	if (!first)
 		return ERR_PTR(-EFAULT);
-	if (first->vm_flags & VM_UNPAGED)
-		return ERR_PTR(-EACCES);
 	prev = NULL;
 	for (vma = first; vma && vma->vm_start < end; vma = vma->vm_next) {
 		if (!vma->vm_next && vma->vm_end < end)
...
@@ -27,7 +27,6 @@ static void msync_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
 again:
 	pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
 	do {
-		unsigned long pfn;
 		struct page *page;
 		if (progress >= 64) {
@@ -40,13 +39,9 @@ again:
 			continue;
 		if (!pte_maybe_dirty(*pte))
 			continue;
-		pfn = pte_pfn(*pte);
-		if (unlikely(!pfn_valid(pfn))) {
-			print_bad_pte(vma, *pte, addr);
+		page = vm_normal_page(vma, addr, *pte);
+		if (!page)
 			continue;
-		}
-		page = pfn_to_page(pfn);
 		if (ptep_clear_flush_dirty(vma, addr, pte) ||
 		    page_test_and_clear_dirty(page))
 			set_page_dirty(page);
@@ -97,9 +92,8 @@ static void msync_page_range(struct vm_area_struct *vma,
 	/* For hugepages we can't go walking the page table normally,
 	 * but that's ok, hugetlbfs is memory based, so we don't need
 	 * to do anything more on an msync().
-	 * Can't do anything with VM_UNPAGED regions either.
 	 */
-	if (vma->vm_flags & (VM_HUGETLB|VM_UNPAGED))
+	if (vma->vm_flags & VM_HUGETLB)
 		return;
 	BUG_ON(addr >= end);
...
@@ -1045,7 +1045,7 @@ struct vm_area_struct *find_vma(struct mm_struct *mm, unsigned long addr)
 EXPORT_SYMBOL(find_vma);
-struct page *follow_page(struct mm_struct *mm, unsigned long address,
+struct page *follow_page(struct vm_area_struct *vma, unsigned long address,
 			unsigned int foll_flags)
 {
 	return NULL;
...
@@ -226,8 +226,6 @@ vma_address(struct page *page, struct vm_area_struct *vma)
 /*
  * At what user virtual address is page expected in vma? checking that the
  * page matches the vma: currently only used on anon pages, by unuse_vma;
- * and by extraordinary checks on anon pages in VM_UNPAGED vmas, taking
- * care that an mmap of /dev/mem might window free and foreign pages.
  */
 unsigned long page_address_in_vma(struct page *page, struct vm_area_struct *vma)
 {
@@ -614,7 +612,6 @@ static void try_to_unmap_cluster(unsigned long cursor,
 	struct page *page;
 	unsigned long address;
 	unsigned long end;
-	unsigned long pfn;
 	address = (vma->vm_start + cursor) & CLUSTER_MASK;
 	end = address + CLUSTER_SIZE;
@@ -643,15 +640,8 @@ static void try_to_unmap_cluster(unsigned long cursor,
 	for (; address < end; pte++, address += PAGE_SIZE) {
 		if (!pte_present(*pte))
 			continue;
-
-		pfn = pte_pfn(*pte);
-		if (unlikely(!pfn_valid(pfn))) {
-			print_bad_pte(vma, *pte, address);
-			continue;
-		}
-
-		page = pfn_to_page(pfn);
-		BUG_ON(PageAnon(page));
+		page = vm_normal_page(vma, address, *pte);
+		BUG_ON(!page || PageAnon(page));
 		if (ptep_clear_flush_young(vma, address, pte))
 			continue;
...