powerpc: Switch 16GB and 16MB explicit hugepages to a different page table format
author: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
Sun, 28 Apr 2013 09:37:30 +0000 (09:37 +0000)
committer: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Tue, 30 Apr 2013 05:59:56 +0000 (15:59 +1000)
We will be switching PMD_SHIFT to 24 bits to facilitate THP implementation.
With PMD_SHIFT set to 24, we now have 16MB huge pages allocated at PGD level.
That means that with a 32-bit process we cannot allocate normal pages at
all, because we cover the entire address space with one pgd entry. Fix this
by switching to a new page table format for hugepages. With the new page table
format for 16GB and 16MB hugepages we won't allocate hugepage directory. Instead
we encode the PTE information directly at the directory level. This forces 16MB
hugepage at PMD level. This will also make the page table walk much simpler later
when we add the THP support.

With the new table format we have 4 cases for pgds and pmds:
(1) invalid (all zeroes)
(2) pointer to next table, as normal; bottom 6 bits == 0
(3) leaf pte for huge page, bottom two bits != 00
(4) hugepd pointer, bottom two bits == 00, next 4 bits indicate size of table

Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
Acked-by: Paul Mackerras <paulus@samba.org>
Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>
arch/powerpc/include/asm/page.h
arch/powerpc/include/asm/pgtable.h
arch/powerpc/mm/gup.c
arch/powerpc/mm/hugetlbpage.c

index 652719ccd2e9a34b74f9bc8134f430ccf8b87bad..711e83a79e3db49c857cabebd23662736a3c65a0 100644 (file)
@@ -373,8 +373,10 @@ static inline int hugepd_ok(hugepd_t hpd)
 #endif
 
 #define is_hugepd(pdep)               (hugepd_ok(*((hugepd_t *)(pdep))))
+int pgd_huge(pgd_t pgd);
 #else /* CONFIG_HUGETLB_PAGE */
 #define is_hugepd(pdep)                        0
+#define pgd_huge(pgd)                  0
 #endif /* CONFIG_HUGETLB_PAGE */
 
 struct page;
index 4b52726e01ca5cea007cb745a9508a83f13f33b1..7aeb9555f6eac40a69cfb11bddf924b221add6ee 100644 (file)
@@ -218,6 +218,8 @@ extern void update_mmu_cache(struct vm_area_struct *, unsigned long, pte_t *);
 extern int gup_hugepd(hugepd_t *hugepd, unsigned pdshift, unsigned long addr,
                      unsigned long end, int write, struct page **pages, int *nr);
 
+extern int gup_hugepte(pte_t *ptep, unsigned long sz, unsigned long addr,
+                      unsigned long end, int write, struct page **pages, int *nr);
 #endif /* __ASSEMBLY__ */
 
 #endif /* __KERNEL__ */
index d7efdbf640c7d5e39cd8c3c55f79aa0b8ca64f10..4b921affa495d2e315e0598a06c34c0d2231be38 100644 (file)
@@ -68,7 +68,11 @@ static int gup_pmd_range(pud_t pud, unsigned long addr, unsigned long end,
                next = pmd_addr_end(addr, end);
                if (pmd_none(pmd))
                        return 0;
-               if (is_hugepd(pmdp)) {
+               if (pmd_huge(pmd)) {
+                       if (!gup_hugepte((pte_t *)pmdp, PMD_SIZE, addr, next,
+                                        write, pages, nr))
+                               return 0;
+               } else if (is_hugepd(pmdp)) {
                        if (!gup_hugepd((hugepd_t *)pmdp, PMD_SHIFT,
                                        addr, next, write, pages, nr))
                                return 0;
@@ -92,7 +96,11 @@ static int gup_pud_range(pgd_t pgd, unsigned long addr, unsigned long end,
                next = pud_addr_end(addr, end);
                if (pud_none(pud))
                        return 0;
-               if (is_hugepd(pudp)) {
+               if (pud_huge(pud)) {
+                       if (!gup_hugepte((pte_t *)pudp, PUD_SIZE, addr, next,
+                                        write, pages, nr))
+                               return 0;
+               } else if (is_hugepd(pudp)) {
                        if (!gup_hugepd((hugepd_t *)pudp, PUD_SHIFT,
                                        addr, next, write, pages, nr))
                                return 0;
@@ -153,7 +161,11 @@ int get_user_pages_fast(unsigned long start, int nr_pages, int write,
                next = pgd_addr_end(addr, end);
                if (pgd_none(pgd))
                        goto slow;
-               if (is_hugepd(pgdp)) {
+               if (pgd_huge(pgd)) {
+                       if (!gup_hugepte((pte_t *)pgdp, PGDIR_SIZE, addr, next,
+                                        write, pages, &nr))
+                               goto slow;
+               } else if (is_hugepd(pgdp)) {
                        if (!gup_hugepd((hugepd_t *)pgdp, PGDIR_SHIFT,
                                        addr, next, write, pages, &nr))
                                goto slow;
index b4e2f24a9b8f5c84e5ec24ad696fe27c21d9dc56..237c8e5f2640b79dce83c4bce7f7f66ef09fe02f 100644 (file)
@@ -50,11 +50,69 @@ static unsigned nr_gpages;
 
 #define hugepd_none(hpd)       ((hpd).pd == 0)
 
+#ifdef CONFIG_PPC_BOOK3S_64
+/*
+ * At this point we do the placement change only for BOOK3S 64. This would
+ * possibly work on other subarchs.
+ */
+
+/*
+ * We have PGD_INDEX_SIZE = 12 and PTE_INDEX_SIZE = 8, so that we can have
+ * 16GB hugepage pte in PGD and 16MB hugepage pte at PMD;
+ */
+int pmd_huge(pmd_t pmd)
+{
+       /*
+        * leaf pte for huge page, bottom two bits != 00
+        */
+       return ((pmd_val(pmd) & 0x3) != 0x0);
+}
+
+int pud_huge(pud_t pud)
+{
+       /*
+        * leaf pte for huge page, bottom two bits != 00
+        */
+       return ((pud_val(pud) & 0x3) != 0x0);
+}
+
+int pgd_huge(pgd_t pgd)
+{
+       /*
+        * leaf pte for huge page, bottom two bits != 00
+        */
+       return ((pgd_val(pgd) & 0x3) != 0x0);
+}
+#else
+int pmd_huge(pmd_t pmd)
+{
+       return 0;
+}
+
+int pud_huge(pud_t pud)
+{
+       return 0;
+}
+
+int pgd_huge(pgd_t pgd)
+{
+       return 0;
+}
+#endif
+
+/*
+ * We have 4 cases for pgds and pmds:
+ * (1) invalid (all zeroes)
+ * (2) pointer to next table, as normal; bottom 6 bits == 0
+ * (3) leaf pte for huge page, bottom two bits != 00
+ * (4) hugepd pointer, bottom two bits == 00, next 4 bits indicate size of table
+ */
 pte_t *find_linux_pte_or_hugepte(pgd_t *pgdir, unsigned long ea, unsigned *shift)
 {
        pgd_t *pg;
        pud_t *pu;
        pmd_t *pm;
+       pte_t *ret_pte;
        hugepd_t *hpdp = NULL;
        unsigned pdshift = PGDIR_SHIFT;
 
@@ -62,30 +120,43 @@ pte_t *find_linux_pte_or_hugepte(pgd_t *pgdir, unsigned long ea, unsigned *shift
                *shift = 0;
 
        pg = pgdir + pgd_index(ea);
-       if (is_hugepd(pg)) {
+
+       if (pgd_huge(*pg)) {
+               ret_pte = (pte_t *) pg;
+               goto out;
+       } else if (is_hugepd(pg))
                hpdp = (hugepd_t *)pg;
-       else if (!pgd_none(*pg)) {
+       else if (!pgd_none(*pg)) {
                pdshift = PUD_SHIFT;
                pu = pud_offset(pg, ea);
-               if (is_hugepd(pu))
+
+               if (pud_huge(*pu)) {
+                       ret_pte = (pte_t *) pu;
+                       goto out;
+               } else if (is_hugepd(pu))
                        hpdp = (hugepd_t *)pu;
                else if (!pud_none(*pu)) {
                        pdshift = PMD_SHIFT;
                        pm = pmd_offset(pu, ea);
-                       if (is_hugepd(pm))
+
+                       if (pmd_huge(*pm)) {
+                               ret_pte = (pte_t *) pm;
+                               goto out;
+                       } else if (is_hugepd(pm))
                                hpdp = (hugepd_t *)pm;
-                       else if (!pmd_none(*pm)) {
+                       else if (!pmd_none(*pm))
                                return pte_offset_kernel(pm, ea);
-                       }
                }
        }
-
        if (!hpdp)
                return NULL;
 
+       ret_pte = hugepte_offset(hpdp, ea, pdshift);
+       pdshift = hugepd_shift(*hpdp);
+out:
        if (shift)
-               *shift = hugepd_shift(*hpdp);
-       return hugepte_offset(hpdp, ea, pdshift);
+               *shift = pdshift;
+       return ret_pte;
 }
 EXPORT_SYMBOL_GPL(find_linux_pte_or_hugepte);
 
@@ -165,6 +236,61 @@ static int __hugepte_alloc(struct mm_struct *mm, hugepd_t *hpdp,
 #define HUGEPD_PUD_SHIFT PMD_SHIFT
 #endif
 
+#ifdef CONFIG_PPC_BOOK3S_64
+/*
+ * At this point we do the placement change only for BOOK3S 64. This would
+ * possibly work on other subarchs.
+ */
+pte_t *huge_pte_alloc(struct mm_struct *mm, unsigned long addr, unsigned long sz)
+{
+       pgd_t *pg;
+       pud_t *pu;
+       pmd_t *pm;
+       hugepd_t *hpdp = NULL;
+       unsigned pshift = __ffs(sz);
+       unsigned pdshift = PGDIR_SHIFT;
+
+       addr &= ~(sz-1);
+       pg = pgd_offset(mm, addr);
+
+       if (pshift == PGDIR_SHIFT)
+               /* 16GB huge page */
+               return (pte_t *) pg;
+       else if (pshift > PUD_SHIFT)
+               /*
+                * We need to use hugepd table
+                */
+               hpdp = (hugepd_t *)pg;
+       else {
+               pdshift = PUD_SHIFT;
+               pu = pud_alloc(mm, pg, addr);
+               if (pshift == PUD_SHIFT)
+                       return (pte_t *)pu;
+               else if (pshift > PMD_SHIFT)
+                       hpdp = (hugepd_t *)pu;
+               else {
+                       pdshift = PMD_SHIFT;
+                       pm = pmd_alloc(mm, pu, addr);
+                       if (pshift == PMD_SHIFT)
+                               /* 16MB hugepage */
+                               return (pte_t *)pm;
+                       else
+                               hpdp = (hugepd_t *)pm;
+               }
+       }
+       if (!hpdp)
+               return NULL;
+
+       BUG_ON(!hugepd_none(*hpdp) && !hugepd_ok(*hpdp));
+
+       if (hugepd_none(*hpdp) && __hugepte_alloc(mm, hpdp, addr, pdshift, pshift))
+               return NULL;
+
+       return hugepte_offset(hpdp, addr, pdshift);
+}
+
+#else
+
 pte_t *huge_pte_alloc(struct mm_struct *mm, unsigned long addr, unsigned long sz)
 {
        pgd_t *pg;
@@ -202,6 +328,7 @@ pte_t *huge_pte_alloc(struct mm_struct *mm, unsigned long addr, unsigned long sz
 
        return hugepte_offset(hpdp, addr, pdshift);
 }
+#endif
 
 #ifdef CONFIG_PPC_FSL_BOOK3E
 /* Build list of addresses of gigantic pages.  This function is used in early
@@ -465,7 +592,7 @@ static void hugetlb_free_pmd_range(struct mmu_gather *tlb, pud_t *pud,
        do {
                pmd = pmd_offset(pud, addr);
                next = pmd_addr_end(addr, end);
-               if (pmd_none(*pmd))
+               if (pmd_none_or_clear_bad(pmd))
                        continue;
 #ifdef CONFIG_PPC_FSL_BOOK3E
                /*
@@ -618,16 +745,6 @@ follow_huge_addr(struct mm_struct *mm, unsigned long address, int write)
        return page;
 }
 
-int pmd_huge(pmd_t pmd)
-{
-       return 0;
-}
-
-int pud_huge(pud_t pud)
-{
-       return 0;
-}
-
 struct page *
 follow_huge_pmd(struct mm_struct *mm, unsigned long address,
                pmd_t *pmd, int write)
@@ -636,8 +753,8 @@ follow_huge_pmd(struct mm_struct *mm, unsigned long address,
        return NULL;
 }
 
-static noinline int gup_hugepte(pte_t *ptep, unsigned long sz, unsigned long addr,
-                      unsigned long end, int write, struct page **pages, int *nr)
+int gup_hugepte(pte_t *ptep, unsigned long sz, unsigned long addr,
+               unsigned long end, int write, struct page **pages, int *nr)
 {
        unsigned long mask;
        unsigned long pte_end;
@@ -873,11 +990,16 @@ static int __init hugetlbpage_init(void)
                        pdshift = PUD_SHIFT;
                else
                        pdshift = PGDIR_SHIFT;
-
-               pgtable_cache_add(pdshift - shift, NULL);
-               if (!PGT_CACHE(pdshift - shift))
-                       panic("hugetlbpage_init(): could not create "
-                             "pgtable cache for %d bit pagesize\n", shift);
+               /*
+                * if we have pdshift and shift value same, we don't
+                * use pgt cache for hugepd.
+                */
+               if (pdshift != shift) {
+                       pgtable_cache_add(pdshift - shift, NULL);
+                       if (!PGT_CACHE(pdshift - shift))
+                               panic("hugetlbpage_init(): could not create "
+                                     "pgtable cache for %d bit pagesize\n", shift);
+               }
        }
 
        /* Set default large page size. Currently, we pick 16M or 1M