/*
 * PPC Huge TLB Page Support for Kernel.
 *
 * Copyright (C) 2003 David Gibson, IBM Corporation.
 * Copyright (C) 2011 Becky Bruce, Freescale Semiconductor
 *
 * Based on the IA-32 version:
 * Copyright (C) 2002, Rohit Seth <rohit.seth@intel.com>
 */

#include <linux/mm.h>
#include <linux/io.h>
#include <linux/slab.h>
#include <linux/hugetlb.h>
#include <linux/export.h>
#include <linux/of_fdt.h>
#include <linux/memblock.h>
#include <linux/moduleparam.h>
#include <linux/swap.h>
#include <linux/swapops.h>
#include <linux/kmemleak.h>
#include <asm/pgalloc.h>
#include <asm/tlb.h>
#include <asm/setup.h>
#include <asm/hugetlb.h>
#include <asm/pte-walk.h>
#include <asm/firmware.h>

bool hugetlb_disabled = false;

#define hugepd_none(hpd)	(hpd_val(hpd) == 0)

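/*
 * Allocation order, relative to a pointer, of an object holding one huge
 * PTE: log2(sizeof(pte_basic_t)) - log2(sizeof(void *)), since
 * __builtin_ffs() of a power of two is log2 + 1.
 */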
#define PTE_T_ORDER	(__builtin_ffs(sizeof(pte_basic_t)) - \
			 __builtin_ffs(sizeof(void *)))

pte_t *huge_pte_offset(struct mm_struct *mm, unsigned long addr, unsigned long sz)
{
	/*
	 * Only called for hugetlbfs pages, hence can ignore THP and the
	 * irq disabled walk.
	 */
	return __find_linux_pte(mm->pgd, addr, NULL, NULL);
}

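/*
 * Allocate a hugepte table from the matching kmem cache and install it in
 * the hugepage directory entry (or the run of entries that must share it),
 * backing out again if another thread populated an entry first.
 */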
static int __hugepte_alloc(struct mm_struct *mm, hugepd_t *hpdp,
			   unsigned long address, unsigned int pdshift,
			   unsigned int pshift, spinlock_t *ptl)
{
	struct kmem_cache *cachep;
	pte_t *new;
	int i;
	int num_hugepd;

	if (pshift >= pdshift) {
		cachep = PGT_CACHE(PTE_T_ORDER);
		num_hugepd = 1 << (pshift - pdshift);
	} else {
		cachep = PGT_CACHE(pdshift - pshift);
		num_hugepd = 1;
	}

	if (!cachep) {
		WARN_ONCE(1, "No page table cache created for hugetlb tables");
		return -ENOMEM;
	}

	new = kmem_cache_alloc(cachep, pgtable_gfp_flags(mm, GFP_KERNEL));

	BUG_ON(pshift > HUGEPD_SHIFT_MASK);
	BUG_ON((unsigned long)new & HUGEPD_SHIFT_MASK);

	if (!new)
		return -ENOMEM;

	/*
	 * Make sure other cpus find the hugepd set only after a
	 * properly initialized page table is visible to them.
	 * For more details look for comment in __pte_alloc().
	 */
	smp_wmb();

	spin_lock(ptl);

	/*
	 * We have multiple higher-level entries that point to the same
	 * actual pte location.  Fill in each as we go and backtrack on
	 * error.  The DTLB pgtable walk code needs all of them to find
	 * the right higher-level entry without knowing if it's a hugepage.
	 */
	for (i = 0; i < num_hugepd; i++, hpdp++) {
		if (unlikely(!hugepd_none(*hpdp)))
			break;
		hugepd_populate(hpdp, new, pshift);
	}

	/* A concurrent update raced with us: back out the entries we set */
	if (i < num_hugepd) {
		for (i = i - 1; i >= 0; i--, hpdp--)
			*hpdp = __hugepd(0);
		kmem_cache_free(cachep, new);
	} else {
		kmemleak_ignore(new);
	}
	spin_unlock(ptl);
	return 0;
}

/*
 * At this point we do the placement change only for BOOK3S 64. This would
 * possibly work on other subarchs.
 */
pte_t *huge_pte_alloc(struct mm_struct *mm, struct vm_area_struct *vma,
		      unsigned long addr, unsigned long sz)
{
	pgd_t *pg;
	p4d_t *p4;
	pud_t *pu;
	pmd_t *pm;
	hugepd_t *hpdp = NULL;
	unsigned pshift = __ffs(sz);
	unsigned pdshift = PGDIR_SHIFT;
	spinlock_t *ptl;

	addr &= ~(sz - 1);
	pg = pgd_offset(mm, addr);
	p4 = p4d_offset(pg, addr);

#ifdef CONFIG_PPC_BOOK3S_64
	if (pshift == PGDIR_SHIFT)
		/* 16GB huge page */
		return (pte_t *)p4;
	else if (pshift > PUD_SHIFT) {
		/*
		 * We need to use hugepd table
		 */
		ptl = &mm->page_table_lock;
		hpdp = (hugepd_t *)p4;
	} else {
		pdshift = PUD_SHIFT;
		pu = pud_alloc(mm, p4, addr);
		if (!pu)
			return NULL;
		if (pshift == PUD_SHIFT)
			return (pte_t *)pu;
		else if (pshift > PMD_SHIFT) {
			ptl = pud_lockptr(mm, pu);
			hpdp = (hugepd_t *)pu;
		} else {
			pdshift = PMD_SHIFT;
			pm = pmd_alloc(mm, pu, addr);
			if (!pm)
				return NULL;
			if (pshift == PMD_SHIFT)
				/* 16MB hugepage */
				return (pte_t *)pm;
			else {
				ptl = pmd_lockptr(mm, pm);
				hpdp = (hugepd_t *)pm;
			}
		}
	}
#else
	if (pshift >= PGDIR_SHIFT) {
		ptl = &mm->page_table_lock;
		hpdp = (hugepd_t *)p4;
	} else {
		pdshift = PUD_SHIFT;
		pu = pud_alloc(mm, p4, addr);
		if (!pu)
			return NULL;
		if (pshift >= PUD_SHIFT) {
			ptl = pud_lockptr(mm, pu);
			hpdp = (hugepd_t *)pu;
		} else {
			pdshift = PMD_SHIFT;
			pm = pmd_alloc(mm, pu, addr);
			if (!pm)
				return NULL;
			ptl = pmd_lockptr(mm, pm);
			hpdp = (hugepd_t *)pm;
		}
	}
#endif
	if (!hpdp)
		return NULL;

	if (IS_ENABLED(CONFIG_PPC_8xx) && pshift < PMD_SHIFT)
		return pte_alloc_map(mm, (pmd_t *)hpdp, addr);

	BUG_ON(!hugepd_none(*hpdp) && !hugepd_ok(*hpdp));

	if (hugepd_none(*hpdp) && __hugepte_alloc(mm, hpdp, addr,
						  pdshift, pshift, ptl))
		return NULL;

	return hugepte_offset(*hpdp, addr, pdshift);
}

#ifdef CONFIG_PPC_BOOK3S_64
/*
 * Tracks gpages after the device tree is scanned and before the
 * huge_boot_pages list is ready on pseries.
 */
#define MAX_NUMBER_GPAGES	1024
__initdata static u64 gpage_freearray[MAX_NUMBER_GPAGES];
__initdata static unsigned nr_gpages;

/*
 * Build list of addresses of gigantic pages.  This function is used in early
 * boot before the buddy allocator is setup.
 */
void __init pseries_add_gpage(u64 addr, u64 page_size, unsigned long number_of_pages)
{
	if (!addr)
		return;
	while (number_of_pages > 0) {
		gpage_freearray[nr_gpages] = addr;
		nr_gpages++;
		number_of_pages--;
		addr += page_size;
	}
}

static int __init pseries_alloc_bootmem_huge_page(struct hstate *hstate)
{
	struct huge_bootmem_page *m;
	if (nr_gpages == 0)
		return 0;
	m = phys_to_virt(gpage_freearray[--nr_gpages]);
	gpage_freearray[nr_gpages] = 0;
	list_add(&m->list, &huge_boot_pages);
	m->hstate = hstate;
	return 1;
}

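/*
 * Boot-time gigantic pages may come from the gpage pool recorded above,
 * which carries no NUMA node information, so node-targeted "hugepages="
 * allocation is not supported.
 */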
bool __init hugetlb_node_alloc_supported(void)
{
	return false;
}
#endif

int __init alloc_bootmem_huge_page(struct hstate *h, int nid)
{
#ifdef CONFIG_PPC_BOOK3S_64
	if (firmware_has_feature(FW_FEATURE_LPAR) && !radix_enabled())
		return pseries_alloc_bootmem_huge_page(h);
#endif
	return __alloc_bootmem_huge_page(h, nid);
}

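/*
 * On platforms other than Book3S 64, a hugepte table may still be visible
 * to a concurrent software page table walker after its hugepd entry has
 * been cleared, so tables are freed in per-cpu batches under RCU unless
 * the mm is known to be local to this thread.
 */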
#ifndef CONFIG_PPC_BOOK3S_64
#define HUGEPD_FREELIST_SIZE \
	((PAGE_SIZE - sizeof(struct hugepd_freelist)) / sizeof(pte_t))

struct hugepd_freelist {
	struct rcu_head rcu;
	unsigned int index;
	void *ptes[];
};

static DEFINE_PER_CPU(struct hugepd_freelist *, hugepd_freelist_cur);

static void hugepd_free_rcu_callback(struct rcu_head *head)
{
	struct hugepd_freelist *batch =
		container_of(head, struct hugepd_freelist, rcu);
	unsigned int i;

	for (i = 0; i < batch->index; i++)
		kmem_cache_free(PGT_CACHE(PTE_T_ORDER), batch->ptes[i]);

	free_page((unsigned long)batch);
}

static void hugepd_free(struct mmu_gather *tlb, void *hugepte)
{
	struct hugepd_freelist **batchp;

	batchp = &get_cpu_var(hugepd_freelist_cur);

	if (atomic_read(&tlb->mm->mm_users) < 2 ||
	    mm_is_thread_local(tlb->mm)) {
		kmem_cache_free(PGT_CACHE(PTE_T_ORDER), hugepte);
		put_cpu_var(hugepd_freelist_cur);
		return;
	}

	if (*batchp == NULL) {
		*batchp = (struct hugepd_freelist *)__get_free_page(GFP_ATOMIC);
		(*batchp)->index = 0;
	}

	(*batchp)->ptes[(*batchp)->index++] = hugepte;
	if ((*batchp)->index == HUGEPD_FREELIST_SIZE) {
		call_rcu(&(*batchp)->rcu, hugepd_free_rcu_callback);
		*batchp = NULL;
	}
	put_cpu_var(hugepd_freelist_cur);
}
#else
static inline void hugepd_free(struct mmu_gather *tlb, void *hugepte) {}
#endif

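/*
 * Return true if the page table page spanning [start, end) extends outside
 * the floor..ceiling window, in which case it must be left in place.
 */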
static bool range_is_outside_limits(unsigned long start, unsigned long end,
				    unsigned long floor, unsigned long ceiling,
				    unsigned long mask)
{
	if ((start & mask) < floor)
		return true;
	if (ceiling) {
		ceiling &= mask;
		if (!ceiling)
			return true;
	}
	return end - 1 > ceiling - 1;
}

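/*
 * Clear the hugepd entry (or the run of entries sharing one hugepte table)
 * and queue the table itself for freeing with the TLB flush.
 */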
static void free_hugepd_range(struct mmu_gather *tlb, hugepd_t *hpdp, int pdshift,
			      unsigned long start, unsigned long end,
			      unsigned long floor, unsigned long ceiling)
{
	pte_t *hugepte = hugepd_page(*hpdp);
	int i;

	unsigned long pdmask = ~((1UL << pdshift) - 1);
	unsigned int num_hugepd = 1;
	unsigned int shift = hugepd_shift(*hpdp);

	/* Note: On fsl the hpdp may be the first of several */
	if (shift > pdshift)
		num_hugepd = 1 << (shift - pdshift);

	if (range_is_outside_limits(start, end, floor, ceiling, pdmask))
		return;

	for (i = 0; i < num_hugepd; i++, hpdp++)
		*hpdp = __hugepd(0);

	if (shift >= pdshift)
		hugepd_free(tlb, hugepte);
	else
		pgtable_free_tlb(tlb, hugepte,
				 get_hugepd_cache_index(pdshift - shift));
}

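/*
 * Free a normal PTE page used for sub-PMD-size hugepages (8xx 512k pages),
 * clearing the PMD entry that points at it.
 */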
static void hugetlb_free_pte_range(struct mmu_gather *tlb, pmd_t *pmd,
				   unsigned long addr, unsigned long end,
				   unsigned long floor, unsigned long ceiling)
{
	pgtable_t token = pmd_pgtable(*pmd);

	if (range_is_outside_limits(addr, end, floor, ceiling, PMD_MASK))
		return;

	pmd_clear(pmd);
	pte_free_tlb(tlb, token, addr);
	mm_dec_nr_ptes(tlb->mm);
}

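/*
 * Walk the PMD entries under one PUD, freeing hugepd ranges (and, on 8xx,
 * PTE pages), then free the PMD page itself if the whole span lies within
 * the floor..ceiling window.
 */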
static void hugetlb_free_pmd_range(struct mmu_gather *tlb, pud_t *pud,
				   unsigned long addr, unsigned long end,
				   unsigned long floor, unsigned long ceiling)
{
	pmd_t *pmd;
	unsigned long next;
	unsigned long start;

	start = addr;
	do {
		unsigned long more;

		pmd = pmd_offset(pud, addr);
		next = pmd_addr_end(addr, end);
		if (!is_hugepd(__hugepd(pmd_val(*pmd)))) {
			if (pmd_none_or_clear_bad(pmd))
				continue;

			/*
			 * if it is not hugepd pointer, we should already find
			 * it cleared.
			 */
			WARN_ON(!IS_ENABLED(CONFIG_PPC_8xx));

			hugetlb_free_pte_range(tlb, pmd, addr, end, floor, ceiling);

			continue;
		}

		/*
		 * Increment next by the size of the huge mapping since
		 * there may be more than one entry at this level for a
		 * single hugepage, but all of them point to the same
		 * kmem cache that holds the hugepte.
		 */
		more = addr + (1 << hugepd_shift(*(hugepd_t *)pmd));
		if (more > next)
			next = more;

		free_hugepd_range(tlb, (hugepd_t *)pmd, PMD_SHIFT,
				  addr, next, floor, ceiling);
	} while (addr = next, addr != end);

	if (range_is_outside_limits(start, end, floor, ceiling, PUD_MASK))
		return;

	pmd = pmd_offset(pud, start & PUD_MASK);
	pud_clear(pud);
	pmd_free_tlb(tlb, pmd, start & PUD_MASK);
	mm_dec_nr_pmds(tlb->mm);
}

static void hugetlb_free_pud_range(struct mmu_gather *tlb, p4d_t *p4d,
				   unsigned long addr, unsigned long end,
				   unsigned long floor, unsigned long ceiling)
{
	pud_t *pud;
	unsigned long next;
	unsigned long start;

	start = addr;
	do {
		pud = pud_offset(p4d, addr);
		next = pud_addr_end(addr, end);
		if (!is_hugepd(__hugepd(pud_val(*pud)))) {
			if (pud_none_or_clear_bad(pud))
				continue;
			hugetlb_free_pmd_range(tlb, pud, addr, next, floor,
					       ceiling);
		} else {
			unsigned long more;

			/*
			 * Increment next by the size of the huge mapping since
			 * there may be more than one entry at this level for a
			 * single hugepage, but all of them point to the same
			 * kmem cache that holds the hugepte.
			 */
			more = addr + (1 << hugepd_shift(*(hugepd_t *)pud));
			if (more > next)
				next = more;

			free_hugepd_range(tlb, (hugepd_t *)pud, PUD_SHIFT,
					  addr, next, floor, ceiling);
		}
	} while (addr = next, addr != end);

	if (range_is_outside_limits(start, end, floor, ceiling, PGDIR_MASK))
		return;

	pud = pud_offset(p4d, start & PGDIR_MASK);
	p4d_clear(p4d);
	pud_free_tlb(tlb, pud, start & PGDIR_MASK);
	mm_dec_nr_puds(tlb->mm);
}

/*
 * This function frees user-level page tables of a process.
 */
void hugetlb_free_pgd_range(struct mmu_gather *tlb,
			    unsigned long addr, unsigned long end,
			    unsigned long floor, unsigned long ceiling)
{
	pgd_t *pgd;
	p4d_t *p4d;
	unsigned long next;

	/*
	 * Because there are a number of different possible pagetable
	 * layouts for hugepage ranges, we limit knowledge of how
	 * things should be laid out to the allocation path
	 * (huge_pte_alloc(), above).  Everything else works out the
	 * structure as it goes from information in the hugepd
	 * pointers.  That means that we can't here use the
	 * optimization used in the normal page free_pgd_range(), of
	 * checking whether we're actually covering a large enough
	 * range to have to do anything at the top level of the walk
	 * instead of at the bottom.
	 *
	 * To make sense of this, you should probably go read the big
	 * block comment at the top of the normal free_pgd_range(),
	 * first.
	 */
	do {
		next = pgd_addr_end(addr, end);
		pgd = pgd_offset(tlb->mm, addr);
		p4d = p4d_offset(pgd, addr);
		if (!is_hugepd(__hugepd(pgd_val(*pgd)))) {
			if (p4d_none_or_clear_bad(p4d))
				continue;
			hugetlb_free_pud_range(tlb, p4d, addr, next, floor, ceiling);
		} else {
			unsigned long more;

			/*
			 * Increment next by the size of the huge mapping since
			 * there may be more than one entry at the pgd level
			 * for a single hugepage, but all of them point to the
			 * same kmem cache that holds the hugepte.
			 */
			more = addr + (1 << hugepd_shift(*(hugepd_t *)pgd));
			if (more > next)
				next = more;

			free_hugepd_range(tlb, (hugepd_t *)p4d, PGDIR_SHIFT,
					  addr, next, floor, ceiling);
		}
	} while (addr = next, addr != end);
}

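/*
 * follow_page() helper for hugepd-mapped hugepages: return the page backing
 * @address, taking a reference if FOLL_GET is set and retrying after any
 * in-flight migration entry has been resolved.
 */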
struct page *follow_huge_pd(struct vm_area_struct *vma,
			    unsigned long address, hugepd_t hpd,
			    int flags, int pdshift)
{
	pte_t *ptep;
	spinlock_t *ptl;
	struct page *page = NULL;
	unsigned long mask;
	int shift = hugepd_shift(hpd);
	struct mm_struct *mm = vma->vm_mm;

retry:
	/*
	 * hugepage directory entries are protected by mm->page_table_lock
	 * Use this instead of huge_pte_lockptr
	 */
	ptl = &mm->page_table_lock;
	spin_lock(ptl);

	ptep = hugepte_offset(hpd, address, pdshift);
	if (pte_present(*ptep)) {
		mask = (1UL << shift) - 1;
		page = pte_page(*ptep);
		page += ((address & mask) >> PAGE_SHIFT);
		if (flags & FOLL_GET)
			get_page(page);
	} else {
		if (is_hugetlb_entry_migration(*ptep)) {
			spin_unlock(ptl);
			__migration_entry_wait(mm, ptep, ptl);
			goto retry;
		}
	}
	spin_unlock(ptl);
	return page;
}

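/*
 * A size is a valid huge page size if it is a power of two above PAGE_SIZE
 * and the MMU has a matching page size definition.
 */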
bool __init arch_hugetlb_valid_size(unsigned long size)
{
	int shift = __ffs(size);
	int mmu_psize;

	/* Check that it is a page size supported by the hardware and
	 * that it fits within pagetable and slice limits. */
	if (size <= PAGE_SIZE || !is_power_of_2(size))
		return false;

	mmu_psize = check_and_get_huge_psize(shift);
	if (mmu_psize < 0)
		return false;

	BUG_ON(mmu_psize_defs[mmu_psize].shift != shift);

	return true;
}

static int __init add_huge_page_size(unsigned long long size)
{
	int shift = __ffs(size);

	if (!arch_hugetlb_valid_size((unsigned long)size))
		return -EINVAL;

	hugetlb_add_hstate(shift - PAGE_SHIFT);
	return 0;
}

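/*
 * Register every huge page size the MMU supports with the generic hugetlb
 * layer and create the page table caches those sizes will need.
 */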
static int __init hugetlbpage_init(void)
{
	bool configured = false;
	int psize;

	if (hugetlb_disabled) {
		pr_info("HugeTLB support is disabled!\n");
		return 0;
	}

	if (IS_ENABLED(CONFIG_PPC_BOOK3S_64) && !radix_enabled() &&
	    !mmu_has_feature(MMU_FTR_16M_PAGE))
		return -ENODEV;

	for (psize = 0; psize < MMU_PAGE_COUNT; ++psize) {
		unsigned shift;
		unsigned pdshift;

		if (!mmu_psize_defs[psize].shift)
			continue;

		shift = mmu_psize_to_shift(psize);

#ifdef CONFIG_PPC_BOOK3S_64
		if (shift > PGDIR_SHIFT)
			continue;
		else if (shift > PUD_SHIFT)
			pdshift = PGDIR_SHIFT;
		else if (shift > PMD_SHIFT)
			pdshift = PUD_SHIFT;
		else
			pdshift = PMD_SHIFT;
#else
		if (shift < PUD_SHIFT)
			pdshift = PMD_SHIFT;
		else if (shift < PGDIR_SHIFT)
			pdshift = PUD_SHIFT;
		else
			pdshift = PGDIR_SHIFT;
#endif

		if (add_huge_page_size(1ULL << shift) < 0)
			continue;

		/*
		 * If we have pdshift and shift value same, we don't
		 * use pgt cache for hugepd.
		 */
		if (pdshift > shift) {
			if (!IS_ENABLED(CONFIG_PPC_8xx))
				pgtable_cache_add(pdshift - shift);
		} else if (IS_ENABLED(CONFIG_PPC_FSL_BOOK3E) ||
			   IS_ENABLED(CONFIG_PPC_8xx)) {
			pgtable_cache_add(PTE_T_ORDER);
		}

		configured = true;
	}

	if (!configured)
		pr_info("Failed to initialize. Disabling HugeTLB");

	return 0;
}

arch_initcall(hugetlbpage_init);

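/*
 * Reserve a CMA area for gigantic pages: PUD-sized pages on radix, 16G
 * pages on hash when running bare metal with 16G support.
 */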
void __init gigantic_hugetlb_cma_reserve(void)
{
	unsigned long order = 0;

	if (radix_enabled())
		order = PUD_SHIFT - PAGE_SHIFT;
	else if (!firmware_has_feature(FW_FEATURE_LPAR) && mmu_psize_defs[MMU_PAGE_16G].shift)
		/*
		 * For pseries we do use ibm,expected#pages for reserving 16G pages.
		 */
		order = mmu_psize_to_shift(MMU_PAGE_16G) - PAGE_SHIFT;

	if (order) {
		VM_WARN_ON(order < MAX_ORDER);
		hugetlb_cma_reserve(order);
	}
}