[PATCH] powerpc: Fix pagetable bloat for hugepages

At present, ARCH=powerpc kernels can waste considerable space in pagetables when making large hugepage mappings. Hugepage PTEs go in PMD pages, but each PMD page maps 256M and so contains only 16 hugepage PTEs (128 bytes of data), but takes up a 1024 byte allocation. With CONFIG_PPC_64K_PAGES enabled (64k base page size), the situation is worse. Now hugepage PTEs are at the PTE page level (also mapping 256M), so we store 16 hugepage PTEs in a 64k allocation. The PowerPC MMU already means that any 256M region is either all hugepage, or all normal pages. Thus, with some care, we can use a different allocation for the hugepage PTE tables and only allocate the 128 bytes necessary. Signed-off-by: Paul Mackerras <paulus@samba.org>

[PATCH] powerpc: Fix pagetable bloat for hugepages
At present, ARCH=powerpc kernels can waste considerable space in pagetables when making large hugepage mappings. Hugepage PTEs go in PMD pages, but each PMD page maps 256M and so contains only 16 hugepage PTEs (128 bytes of data), but takes up a 1024 byte allocation. With CONFIG_PPC_64K_PAGES enabled (64k base page size), the situation is worse. Now hugepage PTEs are at the PTE page level (also mapping 256M), so we store 16 hugepage PTEs in a 64k allocation. The PowerPC MMU already means that any 256M region is either all hugepage, or all normal pages. Thus, with some care, we can use a different allocation for the hugepage PTE tables and only allocate the 128 bytes necessary. Signed-off-by: Paul Mackerras <paulus@samba.org>
f10a04c0 · David Gibson · Paul Mackerras · 37e53db8 · f10a04c0 · f10a04c0
Commit f10a04c0 authored Apr 28, 2006 by David Gibson Committed by Paul Mackerras Apr 28, 2006
4 changed files
--- a/arch/powerpc/mm/hugetlbpage.c
+++ b/arch/powerpc/mm/hugetlbpage.c
@@ -30,13 +30,66 @@
 #define NUM_LOW_AREAS	(0x100000000UL >> SID_SHIFT)
 #define NUM_HIGH_AREAS	(PGTABLE_RANGE >> HTLB_AREA_SHIFT)
+#ifdef CONFIG_PPC_64K_PAGES
+#define HUGEPTE_INDEX_SIZE	(PMD_SHIFT-HPAGE_SHIFT)
+#else
+#define HUGEPTE_INDEX_SIZE	(PUD_SHIFT-HPAGE_SHIFT)
+#endif
+#define PTRS_PER_HUGEPTE	(1 << HUGEPTE_INDEX_SIZE)
+#define HUGEPTE_TABLE_SIZE	(sizeof(pte_t) << HUGEPTE_INDEX_SIZE)
+#define HUGEPD_SHIFT		(HPAGE_SHIFT + HUGEPTE_INDEX_SIZE)
+#define HUGEPD_SIZE		(1UL << HUGEPD_SHIFT)
+#define HUGEPD_MASK		(~(HUGEPD_SIZE-1))
+#define huge_pgtable_cache	(pgtable_cache[HUGEPTE_CACHE_NUM])
+/* Flag to mark huge PD pointers.  This means pmd_bad() and pud_bad()
+ * will choke on pointers to hugepte tables, which is handy for
+ * catching screwups early. */
+#define HUGEPD_OK	0x1
+typedef struct { unsigned long pd; } hugepd_t;
+#define hugepd_none(hpd)	((hpd).pd == 0)
+static inline pte_t *hugepd_page(hugepd_t hpd)
+{
+	BUG_ON(!(hpd.pd & HUGEPD_OK));
+	return (pte_t *)(hpd.pd & ~HUGEPD_OK);
+}
+static inline pte_t *hugepte_offset(hugepd_t *hpdp, unsigned long addr)
+{
+	unsigned long idx = ((addr >> HPAGE_SHIFT) & (PTRS_PER_HUGEPTE-1));
+	pte_t *dir = hugepd_page(*hpdp);
+	return dir + idx;
+}
+static int __hugepte_alloc(struct mm_struct *mm, hugepd_t *hpdp,
+			   unsigned long address)
+{
+	pte_t *new = kmem_cache_alloc(huge_pgtable_cache,
+				      GFP_KERNEL|__GFP_REPEAT);
+	if (! new)
+		return -ENOMEM;
+	spin_lock(&mm->page_table_lock);
+	if (!hugepd_none(*hpdp))
+		kmem_cache_free(huge_pgtable_cache, new);
+	else
+		hpdp->pd = (unsigned long)new | HUGEPD_OK;
+	spin_unlock(&mm->page_table_lock);
+	return 0;
+}
 /* Modelled after find_linux_pte() */
 pte_t *huge_pte_offset(struct mm_struct *mm, unsigned long addr)
 {
 	pgd_t *pg;
 	pud_t *pu;
-	pmd_t *pm;
-	pte_t *pt;
 	BUG_ON(! in_hugepage_area(mm->context, addr));
@@ -46,26 +99,14 @@ pte_t *huge_pte_offset(struct mm_struct *mm, unsigned long addr)
 	if (!pgd_none(*pg)) {
 		pu = pud_offset(pg, addr);
 		if (!pud_none(*pu)) {
-			pm = pmd_offset(pu, addr);
 #ifdef CONFIG_PPC_64K_PAGES
-			/* Currently, we use the normal PTE offset within full
+			pmd_t *pm;
-			 * size PTE pages, thus our huge PTEs are scattered in
+			pm = pmd_offset(pu, addr);
-			 * the PTE page and we do waste some. We may change
+			if (!pmd_none(*pm))
-			 * that in the future, but the current mecanism keeps
+				return hugepte_offset((hugepd_t *)pm, addr);
-			 * things much simpler
+#else
-			 */
+			return hugepte_offset((hugepd_t *)pu, addr);
-			if (!pmd_none(*pm)) {
+#endif
-				/* Note: pte_offset_* are all equivalent on
-				 * ppc64 as we don't have HIGHMEM
-				 */
-				pt = pte_offset_kernel(pm, addr);
-				return pt;
-			}
-#else /* CONFIG_PPC_64K_PAGES */
-			/* On 4k pages, we put huge PTEs in the PMD page */
-			pt = (pte_t *)pm;
-			return pt;
-#endif /* CONFIG_PPC_64K_PAGES */
 		}
 	}
@@ -76,8 +117,7 @@ pte_t *huge_pte_alloc(struct mm_struct *mm, unsigned long addr)
 {
 	pgd_t *pg;
 	pud_t *pu;
-	pmd_t *pm;
+	hugepd_t *hpdp = NULL;
-	pte_t *pt;
 	BUG_ON(! in_hugepage_area(mm->context, addr));
@@ -87,23 +127,182 @@ pte_t *huge_pte_alloc(struct mm_struct *mm, unsigned long addr)
 	pu = pud_alloc(mm, pg, addr);
 	if (pu) {
+#ifdef CONFIG_PPC_64K_PAGES
+		pmd_t *pm;
 		pm = pmd_alloc(mm, pu, addr);
-		if (pm) {
+		if (pm)
+			hpdp = (hugepd_t *)pm;
+#else
+		hpdp = (hugepd_t *)pu;
+#endif
+	}
+	if (! hpdp)
+		return NULL;
+	if (hugepd_none(*hpdp) && __hugepte_alloc(mm, hpdp, addr))
+		return NULL;
+	return hugepte_offset(hpdp, addr);
+}
+static void free_hugepte_range(struct mmu_gather *tlb, hugepd_t *hpdp)
+{
+	pte_t *hugepte = hugepd_page(*hpdp);
+	hpdp->pd = 0;
+	tlb->need_flush = 1;
+	pgtable_free_tlb(tlb, pgtable_free_cache(hugepte, HUGEPTE_CACHE_NUM,
+						 HUGEPTE_TABLE_SIZE-1));
+}
 #ifdef CONFIG_PPC_64K_PAGES
-			/* See comment in huge_pte_offset. Note that if we ever
+static void hugetlb_free_pmd_range(struct mmu_gather *tlb, pud_t *pud,
-			 * want to put the page size in the PMD, we would have
+				   unsigned long addr, unsigned long end,
-			 * to open code our own pte_alloc* function in order
+				   unsigned long floor, unsigned long ceiling)
-			 * to populate and set the size atomically
+{
-			 */
+	pmd_t *pmd;
-			pt = pte_alloc_map(mm, pm, addr);
+	unsigned long next;
-#else /* CONFIG_PPC_64K_PAGES */
+	unsigned long start;
-			pt = (pte_t *)pm;
-#endif /* CONFIG_PPC_64K_PAGES */
+	start = addr;
-			return pt;
+	pmd = pmd_offset(pud, addr);
-		}
+	do {
+		next = pmd_addr_end(addr, end);
+		if (pmd_none(*pmd))
+			continue;
+		free_hugepte_range(tlb, (hugepd_t *)pmd);
+	} while (pmd++, addr = next, addr != end);
+	start &= PUD_MASK;
+	if (start < floor)
+		return;
+	if (ceiling) {
+		ceiling &= PUD_MASK;
+		if (!ceiling)
+			return;
 	}
+	if (end - 1 > ceiling - 1)
+		return;
-	return NULL;
+	pmd = pmd_offset(pud, start);
+	pud_clear(pud);
+	pmd_free_tlb(tlb, pmd);
+}
+#endif
+static void hugetlb_free_pud_range(struct mmu_gather *tlb, pgd_t *pgd,
+				   unsigned long addr, unsigned long end,
+				   unsigned long floor, unsigned long ceiling)
+{
+	pud_t *pud;
+	unsigned long next;
+	unsigned long start;
+	start = addr;
+	pud = pud_offset(pgd, addr);
+	do {
+		next = pud_addr_end(addr, end);
+#ifdef CONFIG_PPC_64K_PAGES
+		if (pud_none_or_clear_bad(pud))
+			continue;
+		hugetlb_free_pmd_range(tlb, pud, addr, next, floor, ceiling);
+#else
+		if (pud_none(*pud))
+			continue;
+		free_hugepte_range(tlb, (hugepd_t *)pud);
+#endif
+	} while (pud++, addr = next, addr != end);
+	start &= PGDIR_MASK;
+	if (start < floor)
+		return;
+	if (ceiling) {
+		ceiling &= PGDIR_MASK;
+		if (!ceiling)
+			return;
+	}
+	if (end - 1 > ceiling - 1)
+		return;
+	pud = pud_offset(pgd, start);
+	pgd_clear(pgd);
+	pud_free_tlb(tlb, pud);
+}
+/*
+ * This function frees user-level page tables of a process.
+ *
+ * Must be called with pagetable lock held.
+ */
+void hugetlb_free_pgd_range(struct mmu_gather **tlb,
+			    unsigned long addr, unsigned long end,
+			    unsigned long floor, unsigned long ceiling)
+{
+	pgd_t *pgd;
+	unsigned long next;
+	unsigned long start;
+	/*
+	 * Comments below take from the normal free_pgd_range().  They
+	 * apply here too.  The tests against HUGEPD_MASK below are
+	 * essential, because we *don't* test for this at the bottom
+	 * level.  Without them we'll attempt to free a hugepte table
+	 * when we unmap just part of it, even if there are other
+	 * active mappings using it.
+	 *
+	 * The next few lines have given us lots of grief...
+	 *
+	 * Why are we testing HUGEPD* at this top level?  Because
+	 * often there will be no work to do at all, and we'd prefer
+	 * not to go all the way down to the bottom just to discover
+	 * that.
+	 *
+	 * Why all these "- 1"s?  Because 0 represents both the bottom
+	 * of the address space and the top of it (using -1 for the
+	 * top wouldn't help much: the masks would do the wrong thing).
+	 * The rule is that addr 0 and floor 0 refer to the bottom of
+	 * the address space, but end 0 and ceiling 0 refer to the top
+	 * Comparisons need to use "end - 1" and "ceiling - 1" (though
+	 * that end 0 case should be mythical).
+	 *
+	 * Wherever addr is brought up or ceiling brought down, we
+	 * must be careful to reject "the opposite 0" before it
+	 * confuses the subsequent tests.  But what about where end is
+	 * brought down by HUGEPD_SIZE below? no, end can't go down to
+	 * 0 there.
+	 *
+	 * Whereas we round start (addr) and ceiling down, by different
+	 * masks at different levels, in order to test whether a table
+	 * now has no other vmas using it, so can be freed, we don't
+	 * bother to round floor or end up - the tests don't need that.
+	 */
+	addr &= HUGEPD_MASK;
+	if (addr < floor) {
+		addr += HUGEPD_SIZE;
+		if (!addr)
+			return;
+	}
+	if (ceiling) {
+		ceiling &= HUGEPD_MASK;
+		if (!ceiling)
+			return;
+	}
+	if (end - 1 > ceiling - 1)
+		end -= HUGEPD_SIZE;
+	if (addr > end - 1)
+		return;
+	start = addr;
+	pgd = pgd_offset((*tlb)->mm, addr);
+	do {
+		BUG_ON(! in_hugepage_area((*tlb)->mm->context, addr));
+		next = pgd_addr_end(addr, end);
+		if (pgd_none_or_clear_bad(pgd))
+			continue;
+		hugetlb_free_pud_range(*tlb, pgd, addr, next, floor, ceiling);
+	} while (pgd++, addr = next, addr != end);
 }
 void set_huge_pte_at(struct mm_struct *mm, unsigned long addr,
@@ -841,3 +1040,27 @@ repeat:
 out:
 	return err;
 }
+static void zero_ctor(void *addr, kmem_cache_t *cache, unsigned long flags)
+{
+	memset(addr, 0, kmem_cache_size(cache));
+}
+static int __init hugetlbpage_init(void)
+{
+	if (!cpu_has_feature(CPU_FTR_16M_PAGE))
+		return -ENODEV;
+	huge_pgtable_cache = kmem_cache_create("hugepte_cache",
+					       HUGEPTE_TABLE_SIZE,
+					       HUGEPTE_TABLE_SIZE,
+					       SLAB_HWCACHE_ALIGN |
+					       SLAB_MUST_HWCACHE_ALIGN,
+					       zero_ctor, NULL);
+	if (! huge_pgtable_cache)
+		panic("hugetlbpage_init(): could not create hugepte cache\n");
+	return 0;
+}
+module_init(hugetlbpage_init);
--- a/arch/powerpc/mm/init_64.c
+++ b/arch/powerpc/mm/init_64.c
@@ -162,7 +162,14 @@ static const char *pgtable_cache_name[ARRAY_SIZE(pgtable_cache_size)] = {
 };
 #endif /* CONFIG_PPC_64K_PAGES */
+#ifdef CONFIG_HUGETLB_PAGE
+/* Hugepages need one extra cache, initialized in hugetlbpage.c.  We
+ * can't put into the tables above, because HPAGE_SHIFT is not compile
+ * time constant. */
+kmem_cache_t *pgtable_cache[ARRAY_SIZE(pgtable_cache_size)+1];
+#else
 kmem_cache_t *pgtable_cache[ARRAY_SIZE(pgtable_cache_size)];
+#endif
 void pgtable_cache_init(void)
 {

--- a/include/asm-powerpc/page_64.h
+++ b/include/asm-powerpc/page_64.h
@@ -101,6 +101,7 @@ extern unsigned int HPAGE_SHIFT;
 		                      - (1U << GET_HTLB_AREA(addr))) & 0xffff)
 #define ARCH_HAS_HUGEPAGE_ONLY_RANGE
+#define ARCH_HAS_HUGETLB_FREE_PGD_RANGE
 #define ARCH_HAS_PREPARE_HUGEPAGE_RANGE
 #define ARCH_HAS_SETCLEAR_HUGE_PTE

--- a/include/asm-powerpc/pgalloc.h
+++ b/include/asm-powerpc/pgalloc.h
@@ -17,11 +17,13 @@ extern kmem_cache_t *pgtable_cache[];
 #define PTE_CACHE_NUM	0
 #define PMD_CACHE_NUM	1
 #define PGD_CACHE_NUM	2
+#define HUGEPTE_CACHE_NUM 3
 #else
 #define PTE_CACHE_NUM	0
 #define PMD_CACHE_NUM	1
 #define PUD_CACHE_NUM	1
 #define PGD_CACHE_NUM	0
+#define HUGEPTE_CACHE_NUM 2
 #endif
 /*