/*
 * PPC Huge TLB Page Support for Kernel.
 *
 * Copyright (C) 2003 David Gibson, IBM Corporation.
 * Copyright (C) 2011 Becky Bruce, Freescale Semiconductor
 *
 * Based on the IA-32 version:
 * Copyright (C) 2002, Rohit Seth <rohit.seth@intel.com>
 */

#include <linux/mm.h>
#include <linux/io.h>
#include <linux/slab.h>
#include <linux/hugetlb.h>
#include <linux/export.h>
#include <linux/of_fdt.h>
#include <linux/memblock.h>
#include <linux/moduleparam.h>
#include <linux/swap.h>
#include <linux/swapops.h>
#include <linux/kmemleak.h>
#include <asm/pgalloc.h>
#include <asm/tlb.h>
#include <asm/setup.h>
#include <asm/hugetlb.h>
#include <asm/pte-walk.h>
#include <asm/firmware.h>

bool hugetlb_disabled = false;

#define hugepd_none(hpd)	(hpd_val(hpd) == 0)

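/*
 * Allocation order, relative to a pointer, of an object holding one huge
 * PTE: log2(sizeof(pte_basic_t)) - log2(sizeof(void *)), since
 * __builtin_ffs() of a power of two is log2 + 1.
 */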
#define PTE_T_ORDER	(__builtin_ffs(sizeof(pte_basic_t)) - \
			 __builtin_ffs(sizeof(void *)))

pte_t *huge_pte_offset(struct mm_struct *mm, unsigned long addr, unsigned long sz)
{
	/*
	 * Only called for hugetlbfs pages, hence can ignore THP and the
	 * irq disabled walk.
	 */
	return __find_linux_pte(mm->pgd, addr, NULL, NULL);
}

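/*
 * Allocate a hugepte table from the matching kmem cache and install it in
 * the hugepage directory entry (or the run of entries that must share it),
 * backing out again if another thread populated an entry first.
 */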
static int __hugepte_alloc(struct mm_struct *mm, hugepd_t *hpdp,
			   unsigned long address, unsigned int pdshift,
			   unsigned int pshift, spinlock_t *ptl)
{
	struct kmem_cache *cachep;
	pte_t *new;
	int i;
	int num_hugepd;

	if (pshift >= pdshift) {
		cachep = PGT_CACHE(PTE_T_ORDER);
		num_hugepd = 1 << (pshift - pdshift);
	} else {
		cachep = PGT_CACHE(pdshift - pshift);
		num_hugepd = 1;
	}

	if (!cachep) {
		WARN_ONCE(1, "No page table cache created for hugetlb tables");
		return -ENOMEM;
	}

	new = kmem_cache_alloc(cachep, pgtable_gfp_flags(mm, GFP_KERNEL));

	BUG_ON(pshift > HUGEPD_SHIFT_MASK);
	BUG_ON((unsigned long)new & HUGEPD_SHIFT_MASK);

	if (!new)
		return -ENOMEM;

	/*
	 * Make sure other cpus find the hugepd set only after a
	 * properly initialized page table is visible to them.
	 * For more details look for comment in __pte_alloc().
	 */
	smp_wmb();

	spin_lock(ptl);

	/*
	 * We have multiple higher-level entries that point to the same
	 * actual pte location.  Fill in each as we go and backtrack on
	 * error.  The DTLB pgtable walk code needs all of them to find
	 * the right higher-level entry without knowing if it's a hugepage.
	 */
	for (i = 0; i < num_hugepd; i++, hpdp++) {
		if (unlikely(!hugepd_none(*hpdp)))
			break;
		hugepd_populate(hpdp, new, pshift);
	}

	/* A concurrent update raced with us: back out the entries we set */
	if (i < num_hugepd) {
		for (i = i - 1; i >= 0; i--, hpdp--)
			*hpdp = __hugepd(0);
		kmem_cache_free(cachep, new);
	} else {
		kmemleak_ignore(new);
	}
	spin_unlock(ptl);
	return 0;
}

/*
 * At this point we do the placement change only for BOOK3S 64. This would
 * possibly work on other subarchs.
 */
pte_t *huge_pte_alloc(struct mm_struct *mm, struct vm_area_struct *vma,
		      unsigned long addr, unsigned long sz)
{
	pgd_t *pg;
	p4d_t *p4;
	pud_t *pu;
	pmd_t *pm;
	hugepd_t *hpdp = NULL;
	unsigned pshift = __ffs(sz);
	unsigned pdshift = PGDIR_SHIFT;
	spinlock_t *ptl;

	addr &= ~(sz - 1);
	pg = pgd_offset(mm, addr);
	p4 = p4d_offset(pg, addr);

#ifdef CONFIG_PPC_BOOK3S_64
	if (pshift == PGDIR_SHIFT)
		/* 16GB huge page */
		return (pte_t *)p4;
	else if (pshift > PUD_SHIFT) {
		/*
		 * We need to use hugepd table
		 */
		ptl = &mm->page_table_lock;
		hpdp = (hugepd_t *)p4;
	} else {
		pdshift = PUD_SHIFT;
		pu = pud_alloc(mm, p4, addr);
		if (!pu)
			return NULL;
		if (pshift == PUD_SHIFT)
			return (pte_t *)pu;
		else if (pshift > PMD_SHIFT) {
			ptl = pud_lockptr(mm, pu);
			hpdp = (hugepd_t *)pu;
		} else {
			pdshift = PMD_SHIFT;
			pm = pmd_alloc(mm, pu, addr);
			if (!pm)
				return NULL;
			if (pshift == PMD_SHIFT)
				/* 16MB hugepage */
				return (pte_t *)pm;
			else {
				ptl = pmd_lockptr(mm, pm);
				hpdp = (hugepd_t *)pm;
			}
		}
	}
#else
	if (pshift >= PGDIR_SHIFT) {
		ptl = &mm->page_table_lock;
		hpdp = (hugepd_t *)p4;
	} else {
		pdshift = PUD_SHIFT;
		pu = pud_alloc(mm, p4, addr);
		if (!pu)
			return NULL;
		if (pshift >= PUD_SHIFT) {
			ptl = pud_lockptr(mm, pu);
			hpdp = (hugepd_t *)pu;
		} else {
			pdshift = PMD_SHIFT;
			pm = pmd_alloc(mm, pu, addr);
			if (!pm)
				return NULL;
			ptl = pmd_lockptr(mm, pm);
			hpdp = (hugepd_t *)pm;
		}
	}
#endif
	if (!hpdp)
		return NULL;

	if (IS_ENABLED(CONFIG_PPC_8xx) && pshift < PMD_SHIFT)
		return pte_alloc_map(mm, (pmd_t *)hpdp, addr);

	BUG_ON(!hugepd_none(*hpdp) && !hugepd_ok(*hpdp));

	if (hugepd_none(*hpdp) && __hugepte_alloc(mm, hpdp, addr,
						  pdshift, pshift, ptl))
		return NULL;

	return hugepte_offset(*hpdp, addr, pdshift);
}

#ifdef CONFIG_PPC_BOOK3S_64
/*
 * Tracks gpages after the device tree is scanned and before the
 * huge_boot_pages list is ready on pseries.
 */
#define MAX_NUMBER_GPAGES	1024
__initdata static u64 gpage_freearray[MAX_NUMBER_GPAGES];
__initdata static unsigned nr_gpages;

/*
 * Build list of addresses of gigantic pages.  This function is used in early
 * boot before the buddy allocator is setup.
 */
void __init pseries_add_gpage(u64 addr, u64 page_size, unsigned long number_of_pages)
{
	if (!addr)
		return;
	while (number_of_pages > 0) {
		gpage_freearray[nr_gpages] = addr;
		nr_gpages++;
		number_of_pages--;
		addr += page_size;
	}
}

static int __init pseries_alloc_bootmem_huge_page(struct hstate *hstate)
{
	struct huge_bootmem_page *m;
	if (nr_gpages == 0)
		return 0;
	m = phys_to_virt(gpage_freearray[--nr_gpages]);
	gpage_freearray[nr_gpages] = 0;
	list_add(&m->list, &huge_boot_pages);
	m->hstate = hstate;
	return 1;
}

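/*
 * Boot-time gigantic pages may come from the gpage pool recorded above,
 * which carries no NUMA node information, so node-targeted "hugepages="
 * allocation is not supported.
 */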
bool __init hugetlb_node_alloc_supported(void)
{
	return false;
}
#endif

int __init alloc_bootmem_huge_page(struct hstate *h, int nid)
{
#ifdef CONFIG_PPC_BOOK3S_64
	if (firmware_has_feature(FW_FEATURE_LPAR) && !radix_enabled())
		return pseries_alloc_bootmem_huge_page(h);
#endif
	return __alloc_bootmem_huge_page(h, nid);
}

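/*
 * On platforms other than Book3S 64, a hugepte table may still be visible
 * to a concurrent software page table walker after its hugepd entry has
 * been cleared, so tables are freed in per-cpu batches under RCU unless
 * the mm is known to be local to this thread.
 */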
#ifndef CONFIG_PPC_BOOK3S_64
#define HUGEPD_FREELIST_SIZE \
	((PAGE_SIZE - sizeof(struct hugepd_freelist)) / sizeof(pte_t))

struct hugepd_freelist {
	struct rcu_head rcu;
	unsigned int index;
	void *ptes[];
};

static DEFINE_PER_CPU(struct hugepd_freelist *, hugepd_freelist_cur);

static void hugepd_free_rcu_callback(struct rcu_head *head)
{
	struct hugepd_freelist *batch =
		container_of(head, struct hugepd_freelist, rcu);
	unsigned int i;

	for (i = 0; i < batch->index; i++)
		kmem_cache_free(PGT_CACHE(PTE_T_ORDER), batch->ptes[i]);

	free_page((unsigned long)batch);
}

static void hugepd_free(struct mmu_gather *tlb, void *hugepte)
{
	struct hugepd_freelist **batchp;

	batchp = &get_cpu_var(hugepd_freelist_cur);

	if (atomic_read(&tlb->mm->mm_users) < 2 ||
	    mm_is_thread_local(tlb->mm)) {
		kmem_cache_free(PGT_CACHE(PTE_T_ORDER), hugepte);
		put_cpu_var(hugepd_freelist_cur);
		return;
	}

	if (*batchp == NULL) {
		*batchp = (struct hugepd_freelist *)__get_free_page(GFP_ATOMIC);
		(*batchp)->index = 0;
	}

	(*batchp)->ptes[(*batchp)->index++] = hugepte;
	if ((*batchp)->index == HUGEPD_FREELIST_SIZE) {
		call_rcu(&(*batchp)->rcu, hugepd_free_rcu_callback);
		*batchp = NULL;
	}
	put_cpu_var(hugepd_freelist_cur);
}
#else
static inline void hugepd_free(struct mmu_gather *tlb, void *hugepte) {}
#endif

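/*
 * Return true if the page table page spanning [start, end) extends outside
 * the floor..ceiling window, in which case it must be left in place.
 */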
static bool range_is_outside_limits(unsigned long start, unsigned long end,
				    unsigned long floor, unsigned long ceiling,
				    unsigned long mask)
{
	if ((start & mask) < floor)
		return true;
	if (ceiling) {
		ceiling &= mask;
		if (!ceiling)
			return true;
	}
	return end - 1 > ceiling - 1;
}

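/*
 * Clear the hugepd entry (or the run of entries sharing one hugepte table)
 * and queue the table itself for freeing with the TLB flush.
 */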
static void free_hugepd_range(struct mmu_gather *tlb, hugepd_t *hpdp, int pdshift,
			      unsigned long start, unsigned long end,
			      unsigned long floor, unsigned long ceiling)
{
	pte_t *hugepte = hugepd_page(*hpdp);
	int i;

	unsigned long pdmask = ~((1UL << pdshift) - 1);
	unsigned int num_hugepd = 1;
	unsigned int shift = hugepd_shift(*hpdp);

	/* Note: On fsl the hpdp may be the first of several */
	if (shift > pdshift)
		num_hugepd = 1 << (shift - pdshift);

	if (range_is_outside_limits(start, end, floor, ceiling, pdmask))
		return;

	for (i = 0; i < num_hugepd; i++, hpdp++)
		*hpdp = __hugepd(0);

	if (shift >= pdshift)
		hugepd_free(tlb, hugepte);
	else
		pgtable_free_tlb(tlb, hugepte,
				 get_hugepd_cache_index(pdshift - shift));
}

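/*
 * Free a normal PTE page used for sub-PMD-size hugepages (8xx 512k pages),
 * clearing the PMD entry that points at it.
 */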
static void hugetlb_free_pte_range(struct mmu_gather *tlb, pmd_t *pmd,
				   unsigned long addr, unsigned long end,
				   unsigned long floor, unsigned long ceiling)
{
	pgtable_t token = pmd_pgtable(*pmd);

	if (range_is_outside_limits(addr, end, floor, ceiling, PMD_MASK))
		return;

	pmd_clear(pmd);
	pte_free_tlb(tlb, token, addr);
	mm_dec_nr_ptes(tlb->mm);
}

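/*
 * Walk the PMD entries under one PUD, freeing hugepd ranges (and, on 8xx,
 * PTE pages), then free the PMD page itself if the whole span lies within
 * the floor..ceiling window.
 */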
static void hugetlb_free_pmd_range(struct mmu_gather *tlb, pud_t *pud,
				   unsigned long addr, unsigned long end,
				   unsigned long floor, unsigned long ceiling)
{
	pmd_t *pmd;
	unsigned long next;
	unsigned long start;

	start = addr;
	do {
		unsigned long more;

		pmd = pmd_offset(pud, addr);
		next = pmd_addr_end(addr, end);
		if (!is_hugepd(__hugepd(pmd_val(*pmd)))) {
			if (pmd_none_or_clear_bad(pmd))
				continue;

			/*
			 * if it is not hugepd pointer, we should already find
			 * it cleared.
			 */
			WARN_ON(!IS_ENABLED(CONFIG_PPC_8xx));

			hugetlb_free_pte_range(tlb, pmd, addr, end, floor, ceiling);

			continue;
		}

		/*
		 * Increment next by the size of the huge mapping since
		 * there may be more than one entry at this level for a
		 * single hugepage, but all of them point to the same
		 * kmem cache that holds the hugepte.
		 */
		more = addr + (1 << hugepd_shift(*(hugepd_t *)pmd));
		if (more > next)
			next = more;

		free_hugepd_range(tlb, (hugepd_t *)pmd, PMD_SHIFT,
				  addr, next, floor, ceiling);
	} while (addr = next, addr != end);

	if (range_is_outside_limits(start, end, floor, ceiling, PUD_MASK))
		return;

	pmd = pmd_offset(pud, start & PUD_MASK);
	pud_clear(pud);
	pmd_free_tlb(tlb, pmd, start & PUD_MASK);
	mm_dec_nr_pmds(tlb->mm);
}

static void hugetlb_free_pud_range(struct mmu_gather *tlb, p4d_t *p4d,
				   unsigned long addr, unsigned long end,
				   unsigned long floor, unsigned long ceiling)
{
	pud_t *pud;
	unsigned long next;
	unsigned long start;

	start = addr;
	do {
		pud = pud_offset(p4d, addr);
		next = pud_addr_end(addr, end);
		if (!is_hugepd(__hugepd(pud_val(*pud)))) {
			if (pud_none_or_clear_bad(pud))
				continue;
			hugetlb_free_pmd_range(tlb, pud, addr, next, floor,
					       ceiling);
		} else {
			unsigned long more;

			/*
			 * Increment next by the size of the huge mapping since
			 * there may be more than one entry at this level for a
			 * single hugepage, but all of them point to the same
			 * kmem cache that holds the hugepte.
			 */
			more = addr + (1 << hugepd_shift(*(hugepd_t *)pud));
			if (more > next)
				next = more;

			free_hugepd_range(tlb, (hugepd_t *)pud, PUD_SHIFT,
					  addr, next, floor, ceiling);
		}
	} while (addr = next, addr != end);

	if (range_is_outside_limits(start, end, floor, ceiling, PGDIR_MASK))
		return;

	pud = pud_offset(p4d, start & PGDIR_MASK);
	p4d_clear(p4d);
	pud_free_tlb(tlb, pud, start & PGDIR_MASK);
	mm_dec_nr_puds(tlb->mm);
}

/*
 * This function frees user-level page tables of a process.
 */
void hugetlb_free_pgd_range(struct mmu_gather *tlb,
			    unsigned long addr, unsigned long end,
			    unsigned long floor, unsigned long ceiling)
{
	pgd_t *pgd;
	p4d_t *p4d;
	unsigned long next;

	/*
	 * Because there are a number of different possible pagetable
	 * layouts for hugepage ranges, we limit knowledge of how
	 * things should be laid out to the allocation path
	 * (huge_pte_alloc(), above).  Everything else works out the
	 * structure as it goes from information in the hugepd
	 * pointers.  That means that we can't here use the
	 * optimization used in the normal page free_pgd_range(), of
	 * checking whether we're actually covering a large enough
	 * range to have to do anything at the top level of the walk
	 * instead of at the bottom.
	 *
	 * To make sense of this, you should probably go read the big
	 * block comment at the top of the normal free_pgd_range(),
	 * first.
	 */
	do {
		next = pgd_addr_end(addr, end);
		pgd = pgd_offset(tlb->mm, addr);
		p4d = p4d_offset(pgd, addr);
		if (!is_hugepd(__hugepd(pgd_val(*pgd)))) {
			if (p4d_none_or_clear_bad(p4d))
				continue;
			hugetlb_free_pud_range(tlb, p4d, addr, next, floor, ceiling);
		} else {
			unsigned long more;

			/*
			 * Increment next by the size of the huge mapping since
			 * there may be more than one entry at the pgd level
			 * for a single hugepage, but all of them point to the
			 * same kmem cache that holds the hugepte.
			 */
			more = addr + (1 << hugepd_shift(*(hugepd_t *)pgd));
			if (more > next)
				next = more;

			free_hugepd_range(tlb, (hugepd_t *)p4d, PGDIR_SHIFT,
					  addr, next, floor, ceiling);
		}
	} while (addr = next, addr != end);
}

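/*
 * follow_page() helper for hugepd-mapped hugepages: return the page backing
 * @address, taking a reference if FOLL_GET is set and retrying after any
 * in-flight migration entry has been resolved.
 */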
struct page *follow_huge_pd(struct vm_area_struct *vma,
			    unsigned long address, hugepd_t hpd,
			    int flags, int pdshift)
{
	pte_t *ptep;
	spinlock_t *ptl;
	struct page *page = NULL;
	unsigned long mask;
	int shift = hugepd_shift(hpd);
	struct mm_struct *mm = vma->vm_mm;

retry:
	/*
	 * hugepage directory entries are protected by mm->page_table_lock
	 * Use this instead of huge_pte_lockptr
	 */
	ptl = &mm->page_table_lock;
	spin_lock(ptl);

	ptep = hugepte_offset(hpd, address, pdshift);
	if (pte_present(*ptep)) {
		mask = (1UL << shift) - 1;
		page = pte_page(*ptep);
		page += ((address & mask) >> PAGE_SHIFT);
		if (flags & FOLL_GET)
			get_page(page);
	} else {
		if (is_hugetlb_entry_migration(*ptep)) {
			spin_unlock(ptl);
			__migration_entry_wait(mm, ptep, ptl);
			goto retry;
		}
	}
	spin_unlock(ptl);
	return page;
}

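/*
 * A size is a valid huge page size if it is a power of two above PAGE_SIZE
 * and the MMU has a matching page size definition.
 */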
bool __init arch_hugetlb_valid_size(unsigned long size)
{
	int shift = __ffs(size);
	int mmu_psize;

	/* Check that it is a page size supported by the hardware and
	 * that it fits within pagetable and slice limits. */
	if (size <= PAGE_SIZE || !is_power_of_2(size))
		return false;

	mmu_psize = check_and_get_huge_psize(shift);
	if (mmu_psize < 0)
		return false;

	BUG_ON(mmu_psize_defs[mmu_psize].shift != shift);

	return true;
}

static int __init add_huge_page_size(unsigned long long size)
{
	int shift = __ffs(size);

	if (!arch_hugetlb_valid_size((unsigned long)size))
		return -EINVAL;

	hugetlb_add_hstate(shift - PAGE_SHIFT);
	return 0;
}

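/*
 * Register every huge page size the MMU supports with the generic hugetlb
 * layer and create the page table caches those sizes will need.
 */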
static int __init hugetlbpage_init(void)
{
	bool configured = false;
	int psize;

	if (hugetlb_disabled) {
		pr_info("HugeTLB support is disabled!\n");
		return 0;
	}

	if (IS_ENABLED(CONFIG_PPC_BOOK3S_64) && !radix_enabled() &&
	    !mmu_has_feature(MMU_FTR_16M_PAGE))
		return -ENODEV;

	for (psize = 0; psize < MMU_PAGE_COUNT; ++psize) {
		unsigned shift;
		unsigned pdshift;

		if (!mmu_psize_defs[psize].shift)
			continue;

		shift = mmu_psize_to_shift(psize);

#ifdef CONFIG_PPC_BOOK3S_64
		if (shift > PGDIR_SHIFT)
			continue;
		else if (shift > PUD_SHIFT)
			pdshift = PGDIR_SHIFT;
		else if (shift > PMD_SHIFT)
			pdshift = PUD_SHIFT;
		else
			pdshift = PMD_SHIFT;
#else
		if (shift < PUD_SHIFT)
			pdshift = PMD_SHIFT;
		else if (shift < PGDIR_SHIFT)
			pdshift = PUD_SHIFT;
		else
			pdshift = PGDIR_SHIFT;
#endif

		if (add_huge_page_size(1ULL << shift) < 0)
			continue;

		/*
		 * If we have pdshift and shift value same, we don't
		 * use pgt cache for hugepd.
		 */
		if (pdshift > shift) {
			if (!IS_ENABLED(CONFIG_PPC_8xx))
				pgtable_cache_add(pdshift - shift);
		} else if (IS_ENABLED(CONFIG_PPC_FSL_BOOK3E) ||
			   IS_ENABLED(CONFIG_PPC_8xx)) {
			pgtable_cache_add(PTE_T_ORDER);
		}

		configured = true;
	}

	if (!configured)
		pr_info("Failed to initialize. Disabling HugeTLB");

	return 0;
}

arch_initcall(hugetlbpage_init);

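/*
 * Reserve a CMA area for gigantic pages: PUD-sized pages on radix, 16G
 * pages on hash when running bare metal with 16G support.
 */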
void __init gigantic_hugetlb_cma_reserve(void)
{
	unsigned long order = 0;

	if (radix_enabled())
		order = PUD_SHIFT - PAGE_SHIFT;
	else if (!firmware_has_feature(FW_FEATURE_LPAR) && mmu_psize_defs[MMU_PAGE_16G].shift)
		/*
		 * For pseries we do use ibm,expected#pages for reserving 16G pages.
		 */
		order = mmu_psize_to_shift(MMU_PAGE_16G) - PAGE_SHIFT;

	if (order) {
		VM_WARN_ON(order < MAX_ORDER);
		hugetlb_cma_reserve(order);
	}
}