// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * Copyright 2005, Paul Mackerras, IBM Corporation.
 * Copyright 2009, Benjamin Herrenschmidt, IBM Corporation.
 * Copyright 2015-2016, Aneesh Kumar K.V, IBM Corporation.
 */

#include <linux/sched.h>
#include <linux/mm_types.h>
#include <linux/mm.h>
#include <linux/stop_machine.h>

#include <asm/sections.h>
#include <asm/mmu.h>
#include <asm/tlb.h>
#include <asm/firmware.h>

#include <mm/mmu_decl.h>

#include <trace/events/thp.h>

#if H_PGTABLE_RANGE > (USER_VSID_RANGE * (TASK_SIZE_USER64 / TASK_CONTEXT_SIZE))
#warning Limited user VSID range means pagetable space is wasted
#endif

#ifdef CONFIG_SPARSEMEM_VMEMMAP
/*
 * vmemmap is the starting address of the virtual address space where
 * struct pages are allocated for all possible PFNs present on the system
 * including holes and bad memory (hence sparse). These virtual struct
 * pages are stored in sequence in this virtual address space irrespective
 * of whether the corresponding PFN is valid. This achieves a constant
 * relationship between the address of a struct page and its PFN.
 *
 * During boot or a memory hotplug operation, when a new memory section is
 * added, physical memory allocation (including hash table bolting) will
 * be performed for the set of struct pages which are part of the memory
 * section. This saves memory by not allocating struct pages for PFNs
 * which are not valid.
 *
 *      ----------------------------------------------
 *      | PHYSICAL ALLOCATION OF VIRTUAL STRUCT PAGES|
 *      ----------------------------------------------
 *
 *     f000000000000000                  c000000000000000
 * vmemmap +--------------+                  +--------------+
 *  +      |  page struct | +--------------> |  page struct |
 *  |      +--------------+                  +--------------+
 *  |      |  page struct | +--------------> |  page struct |
 *  |      +--------------+ |                +--------------+
 *  |      |  page struct | +       +------> |  page struct |
 *  |      +--------------+         |        +--------------+
 *  |      |  page struct |         |   +--> |  page struct |
 *  |      +--------------+         |   |    +--------------+
 *  |      |  page struct |         |   |
 *  |      +--------------+         |   |
 *  |      |  page struct |         |   |
 *  |      +--------------+         |   |
 *  |      |  page struct |         |   |
 *  |      +--------------+         |   |
 *  |      |  page struct |         |   |
 *  |      +--------------+         |   |
 *  |      |  page struct | +-------+   |
 *  |      +--------------+             |
 *  |      |  page struct | +-----------+
 *  |      +--------------+
 *  |      |  page struct | No mapping
 *  |      +--------------+
 *  |      |  page struct | No mapping
 *  v      +--------------+
 *
 *      -----------------------------------------
 *      | RELATION BETWEEN STRUCT PAGES AND PFNS|
 *      -----------------------------------------
 *
 * vmemmap +--------------+                 +---------------+
 *  +      |  page struct | +-------------> |      PFN      |
 *  |      +--------------+                 +---------------+
 *  |      |  page struct | +-------------> |      PFN      |
 *  |      +--------------+                 +---------------+
 *  |      |  page struct | +-------------> |      PFN      |
 *  |      +--------------+                 +---------------+
 *  |      |  page struct | +-------------> |      PFN      |
 *  |      +--------------+                 +---------------+
 *  |      |              |
 *  |      +--------------+
 *  |      |              |
 *  |      +--------------+
 *  |      |              |
 *  |      +--------------+                 +---------------+
 *  |      |  page struct | +-------------> |      PFN      |
 *  |      +--------------+                 +---------------+
 *  |      |              |
 *  |      +--------------+
 *  |      |              |
 *  |      +--------------+                 +---------------+
 *  |      |  page struct | +-------------> |      PFN      |
 *  |      +--------------+                 +---------------+
 *  |      |  page struct | +-------------> |      PFN      |
 *  v      +--------------+                 +---------------+
 */
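
/*
 * For reference, this constant struct page <-> PFN relationship is what
 * the generic SPARSEMEM_VMEMMAP conversion helpers rely on; roughly
 * (see include/asm-generic/memory_model.h for the canonical definitions):
 *
 *      #define __pfn_to_page(pfn)      (vmemmap + (pfn))
 *      #define __page_to_pfn(page)     (unsigned long)((page) - vmemmap)
 *
 * i.e. plain pointer arithmetic on the virtually contiguous vmemmap
 * array, with no per-section lookup.
 */
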
/*
 * On hash-based CPUs, the vmemmap is bolted in the hash table.
 */
int __meminit hash__vmemmap_create_mapping(unsigned long start,
                       unsigned long page_size,
                       unsigned long phys)
{
    int rc;

    if ((start + page_size) >= H_VMEMMAP_END) {
        pr_warn("Outside the supported range\n");
        return -1;
    }

    rc = htab_bolt_mapping(start, start + page_size, phys,
                   pgprot_val(PAGE_KERNEL),
                   mmu_vmemmap_psize, mmu_kernel_ssize);
    if (rc < 0) {
        int rc2 = htab_remove_mapping(start, start + page_size,
                          mmu_vmemmap_psize,
                          mmu_kernel_ssize);
        BUG_ON(rc2 && (rc2 != -ENOENT));
    }
    return rc;
}

#ifdef CONFIG_MEMORY_HOTPLUG
void hash__vmemmap_remove_mapping(unsigned long start,
                  unsigned long page_size)
{
    int rc = htab_remove_mapping(start, start + page_size,
                     mmu_vmemmap_psize,
                     mmu_kernel_ssize);
    BUG_ON((rc < 0) && (rc != -ENOENT));
    WARN_ON(rc == -ENOENT);
}
#endif
#endif /* CONFIG_SPARSEMEM_VMEMMAP */

/*
 * map_kernel_page is currently only called by __ioremap. It adds an entry
 * to the ioremap page table and an entry to the HPT, possibly bolting it.
 */
int hash__map_kernel_page(unsigned long ea, unsigned long pa, pgprot_t prot)
{
    pgd_t *pgdp;
    p4d_t *p4dp;
    pud_t *pudp;
    pmd_t *pmdp;
    pte_t *ptep;

    BUILD_BUG_ON(TASK_SIZE_USER64 > H_PGTABLE_RANGE);
    if (slab_is_available()) {
        pgdp = pgd_offset_k(ea);
        p4dp = p4d_offset(pgdp, ea);
        pudp = pud_alloc(&init_mm, p4dp, ea);
        if (!pudp)
            return -ENOMEM;
        pmdp = pmd_alloc(&init_mm, pudp, ea);
        if (!pmdp)
            return -ENOMEM;
        ptep = pte_alloc_kernel(pmdp, ea);
        if (!ptep)
            return -ENOMEM;
        set_pte_at(&init_mm, ea, ptep, pfn_pte(pa >> PAGE_SHIFT, prot));
    } else {
        /*
         * If the mm subsystem is not fully up, we cannot create a
         * linux page table entry for this mapping.  Simply bolt an
         * entry in the hardware page table.
         */
        if (htab_bolt_mapping(ea, ea + PAGE_SIZE, pa, pgprot_val(prot),
                      mmu_io_psize, mmu_kernel_ssize)) {
            printk(KERN_ERR "Failed to do bolted mapping IO "
                   "memory at %016lx !\n", pa);
            return -ENOMEM;
        }
    }

    smp_wmb();
    return 0;
}

#ifdef CONFIG_TRANSPARENT_HUGEPAGE

unsigned long hash__pmd_hugepage_update(struct mm_struct *mm, unsigned long addr,
                    pmd_t *pmdp, unsigned long clr,
                    unsigned long set)
{
    __be64 old_be, tmp;
    unsigned long old;

#ifdef CONFIG_DEBUG_VM
    WARN_ON(!hash__pmd_trans_huge(*pmdp) && !pmd_devmap(*pmdp));
    assert_spin_locked(pmd_lockptr(mm, pmdp));
#endif

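    /*
     * A rough C sketch of the ldarx/stdcx. sequence below (the real thing
     * must stay in asm so the busy-bit check and the update happen under
     * a single reservation; the in-memory value is big-endian, hence the
     * cpu_to_be64()/be64_to_cpu() around it):
     *
     *      do {
     *              old = *pmdp;                    // ldarx
     *      } while (old & H_PAGE_BUSY);            // and. / bne- 1b
     *      new = (old & ~clr) | set;               // andc / or
     *      *pmdp = new;                            // stdcx., retried if the
     *                                              // store loses the race
     */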
    __asm__ __volatile__(
    "1: ldarx   %0,0,%3\n\
        and.    %1,%0,%6\n\
        bne-    1b \n\
        andc    %1,%0,%4 \n\
        or  %1,%1,%7\n\
        stdcx.  %1,0,%3 \n\
        bne-    1b"
    : "=&r" (old_be), "=&r" (tmp), "=m" (*pmdp)
    : "r" (pmdp), "r" (cpu_to_be64(clr)), "m" (*pmdp),
      "r" (cpu_to_be64(H_PAGE_BUSY)), "r" (cpu_to_be64(set))
    : "cc" );

    old = be64_to_cpu(old_be);

    trace_hugepage_update(addr, old, clr, set);
    if (old & H_PAGE_HASHPTE)
        hpte_do_hugepage_flush(mm, addr, pmdp, old);
    return old;
}

pmd_t hash__pmdp_collapse_flush(struct vm_area_struct *vma, unsigned long address,
                pmd_t *pmdp)
{
    pmd_t pmd;

    VM_BUG_ON(address & ~HPAGE_PMD_MASK);
    VM_BUG_ON(pmd_trans_huge(*pmdp));
    VM_BUG_ON(pmd_devmap(*pmdp));

    pmd = *pmdp;
    pmd_clear(pmdp);
    /*
     * Wait for all pending hash_page to finish. This is needed
     * in case of subpage collapse. When we collapse normal pages
     * to hugepage, we first clear the pmd, then invalidate all
     * the PTE entries. The assumption here is that any low level
     * page fault will see a none pmd and take the slow path that
     * will wait on mmap_lock. But we could very well be in a
     * hash_page with local ptep pointer value. Such a hash page
     * can result in adding new HPTE entries for normal subpages.
     * That means we could be modifying the page content as we
     * copy them to a huge page. So wait for parallel hash_page
     * to finish before invalidating HPTE entries. We can do this
     * by sending an IPI to all the cpus and executing a dummy
     * function there.
     */
    serialize_against_pte_lookup(vma->vm_mm);
    /*
     * Now invalidate the hpte entries in the range
     * covered by pmd. This makes sure we take a
     * fault and will find the pmd as none, which will
     * result in a major fault which takes mmap_lock and
     * hence wait for collapse to complete. Without this
     * the __collapse_huge_page_copy can result in copying
     * the old content.
     */
    flush_tlb_pmd_range(vma->vm_mm, &pmd, address);
    return pmd;
}

/*
 * We want to put the pgtable in the pmd and use the pgtable to track
 * the base page size hptes.
 */
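/*
 * Resulting layout of the PMD page (sketch, one deposit slot per PMD
 * entry; pmdp below points at the entry being operated on, so its slot
 * sits PTRS_PER_PMD entries further on, in the second half of the page):
 *
 *      pmd_page[0 .. PTRS_PER_PMD - 1]                 PMD entries
 *      pmd_page[PTRS_PER_PMD .. 2*PTRS_PER_PMD - 1]    deposited pgtable_t
 *                                                      pointers
 */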
void hash__pgtable_trans_huge_deposit(struct mm_struct *mm, pmd_t *pmdp,
                  pgtable_t pgtable)
{
    pgtable_t *pgtable_slot;

    assert_spin_locked(pmd_lockptr(mm, pmdp));
    /*
     * We store the pgtable in the second half of the PMD page.
     */
    pgtable_slot = (pgtable_t *)pmdp + PTRS_PER_PMD;
    *pgtable_slot = pgtable;
    /*
     * Expose the deposited pgtable to other cpus before we set the
     * hugepage PTE at the pmd level. The hash fault code looks at the
     * deposited pgtable to store hash index values.
     */
    smp_wmb();
}

pgtable_t hash__pgtable_trans_huge_withdraw(struct mm_struct *mm, pmd_t *pmdp)
{
    pgtable_t pgtable;
    pgtable_t *pgtable_slot;

    assert_spin_locked(pmd_lockptr(mm, pmdp));

    pgtable_slot = (pgtable_t *)pmdp + PTRS_PER_PMD;
    pgtable = *pgtable_slot;
    /*
     * Once we withdraw, mark the entry NULL.
     */
    *pgtable_slot = NULL;
    /*
     * We store HPTE information in the deposited PTE fragment.
     * Zero out the content on withdraw.
     */
    memset(pgtable, 0, PTE_FRAG_SIZE);
    return pgtable;
}

/*
 * A linux hugepage PMD was changed and the corresponding hash table entries
 * need to be flushed.
 */
void hpte_do_hugepage_flush(struct mm_struct *mm, unsigned long addr,
                pmd_t *pmdp, unsigned long old_pmd)
{
    int ssize;
    unsigned int psize;
    unsigned long vsid;
    unsigned long flags = 0;

    /* get the base page size, vsid and segment size */
#ifdef CONFIG_DEBUG_VM
    psize = get_slice_psize(mm, addr);
    BUG_ON(psize == MMU_PAGE_16M);
#endif
    if (old_pmd & H_PAGE_COMBO)
        psize = MMU_PAGE_4K;
    else
        psize = MMU_PAGE_64K;

    if (!is_kernel_addr(addr)) {
        ssize = user_segment_size(addr);
        vsid = get_user_vsid(&mm->context, addr, ssize);
        WARN_ON(vsid == 0);
    } else {
        vsid = get_kernel_vsid(addr, mmu_kernel_ssize);
        ssize = mmu_kernel_ssize;
    }

    if (mm_is_thread_local(mm))
        flags |= HPTE_LOCAL_UPDATE;

    return flush_hash_hugepage(vsid, addr, pmdp, psize, ssize, flags);
}

pmd_t hash__pmdp_huge_get_and_clear(struct mm_struct *mm,
                unsigned long addr, pmd_t *pmdp)
{
    pmd_t old_pmd;
    pgtable_t pgtable;
    unsigned long old;
    pgtable_t *pgtable_slot;

    old = pmd_hugepage_update(mm, addr, pmdp, ~0UL, 0);
    old_pmd = __pmd(old);
    /*
     * We have pmd == none and we are holding page_table_lock.
     * So we can safely go and clear the pgtable hash
     * index info.
     */
    pgtable_slot = (pgtable_t *)pmdp + PTRS_PER_PMD;
    pgtable = *pgtable_slot;
    /*
     * Zero out the old valid and hash index details; the hash
     * fault code looks at them.
     */
    memset(pgtable, 0, PTE_FRAG_SIZE);
    return old_pmd;
}

int hash__has_transparent_hugepage(void)
{
    if (!mmu_has_feature(MMU_FTR_16M_PAGE))
        return 0;
    /*
     * We support THP only if PMD_SIZE is 16MB.
     */
    if (mmu_psize_defs[MMU_PAGE_16M].shift != PMD_SHIFT)
        return 0;
    /*
     * We need to make sure that we support 16MB hugepage in a segment
     * with base page size 64K or 4K. We only enable THP with a PAGE_SIZE
     * of 64K.
     */
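    /*
     * (penc[] is the hardware page-size encoding used for an actual page
     * size within a given segment base page size; -1 means the MMU has no
     * encoding for that combination, i.e. it is unsupported.)
     */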
    /*
     * If we have 64K HPTE, we will be using that by default
     */
    if (mmu_psize_defs[MMU_PAGE_64K].shift &&
        (mmu_psize_defs[MMU_PAGE_64K].penc[MMU_PAGE_16M] == -1))
        return 0;
    /*
     * Ok we only have 4K HPTE
     */
    if (mmu_psize_defs[MMU_PAGE_4K].penc[MMU_PAGE_16M] == -1)
        return 0;

    return 1;
}
EXPORT_SYMBOL_GPL(hash__has_transparent_hugepage);

#endif /* CONFIG_TRANSPARENT_HUGEPAGE */

#ifdef CONFIG_STRICT_KERNEL_RWX

struct change_memory_parms {
    unsigned long start, end, newpp;
    unsigned int step, nr_cpus, master_cpu;
    atomic_t cpu_counter;
};

// We'd rather this was on the stack but it has to be in the RMO
static struct change_memory_parms chmem_parms;

// And therefore we need a lock to protect it from concurrent use
static DEFINE_MUTEX(chmem_lock);

static void change_memory_range(unsigned long start, unsigned long end,
                unsigned int step, unsigned long newpp)
{
    unsigned long idx;

    pr_debug("Changing page protection on range 0x%lx-0x%lx, to 0x%lx, step 0x%x\n",
         start, end, newpp, step);

    for (idx = start; idx < end; idx += step)
        /* Not sure if we can do much with the return value */
        mmu_hash_ops.hpte_updateboltedpp(newpp, idx, mmu_linear_psize,
                            mmu_kernel_ssize);
}

static int notrace chmem_secondary_loop(struct change_memory_parms *parms)
{
    unsigned long msr, tmp, flags;
    int *p;

    p = &parms->cpu_counter.counter;

    local_irq_save(flags);
    hard_irq_disable();

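    /*
     * C-level sketch of the asm below; it stays in asm because once
     * translation is off every access must hit memory that is valid in
     * real mode (which is also why chmem_parms must live in the RMO):
     *
     *      msr = mfmsr();
     *      mtmsrd(msr & ~(MSR_IR | MSR_DR));       // switch to real mode
     *      atomic_dec(&parms->cpu_counter);        // report in to the master
     *      while (atomic_read(&parms->cpu_counter) != 0)
     *              ;                               // spin until master is done
     *      mtmsrd(msr);                            // back to virtual mode
     */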
    asm volatile (
    // Switch to real mode and leave interrupts off
    "mfmsr  %[msr]          ;"
    "li %[tmp], %[MSR_IR_DR]    ;"
    "andc   %[tmp], %[msr], %[tmp]  ;"
    "mtmsrd %[tmp]          ;"

    // Tell the master we are in real mode
    "1:             "
    "lwarx  %[tmp], 0, %[p]     ;"
    "addic  %[tmp], %[tmp], -1  ;"
    "stwcx. %[tmp], 0, %[p]     ;"
    "bne-   1b          ;"

    // Spin until the counter goes to zero
    "2:             ;"
    "lwz    %[tmp], 0(%[p])     ;"
    "cmpwi  %[tmp], 0       ;"
    "bne-   2b          ;"

    // Switch back to virtual mode
    "mtmsrd %[msr]          ;"

    : // outputs
      [msr] "=&r" (msr), [tmp] "=&b" (tmp), "+m" (*p)
    : // inputs
      [p] "b" (p), [MSR_IR_DR] "i" (MSR_IR | MSR_DR)
    : // clobbers
      "cc", "xer"
    );

    local_irq_restore(flags);

    return 0;
}

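/*
 * Rendezvous used by hash__change_memory_range() under stop_machine():
 * cpu_counter starts at num_online_cpus(); each secondary decrements it
 * and spins in real mode, the master waits for the count to drop to one,
 * updates the bolted HPTEs, then decrements it to zero to release the
 * secondaries.
 */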
static int change_memory_range_fn(void *data)
{
    struct change_memory_parms *parms = data;

    if (parms->master_cpu != smp_processor_id())
        return chmem_secondary_loop(parms);

    // Wait for all but one CPU (this one) to call in
    while (atomic_read(&parms->cpu_counter) > 1)
        barrier();

    change_memory_range(parms->start, parms->end, parms->step, parms->newpp);

    mb();

    // Signal the other CPUs that we're done
    atomic_dec(&parms->cpu_counter);

    return 0;
}

static bool hash__change_memory_range(unsigned long start, unsigned long end,
                      unsigned long newpp)
{
    unsigned int step, shift;

    shift = mmu_psize_defs[mmu_linear_psize].shift;
    step = 1 << shift;

    start = ALIGN_DOWN(start, step);
    end = ALIGN(end, step); // aligns up

    if (start >= end)
        return false;

    if (firmware_has_feature(FW_FEATURE_LPAR)) {
        mutex_lock(&chmem_lock);

        chmem_parms.start = start;
        chmem_parms.end = end;
        chmem_parms.step = step;
        chmem_parms.newpp = newpp;
        chmem_parms.master_cpu = smp_processor_id();

        cpus_read_lock();

        atomic_set(&chmem_parms.cpu_counter, num_online_cpus());

        // Ensure state is consistent before we call the other CPUs
        mb();

        stop_machine_cpuslocked(change_memory_range_fn, &chmem_parms,
                    cpu_online_mask);

        cpus_read_unlock();
        mutex_unlock(&chmem_lock);
    } else
        change_memory_range(start, end, step, newpp);

    return true;
}

void hash__mark_rodata_ro(void)
{
    unsigned long start, end, pp;

    start = (unsigned long)_stext;
    end = (unsigned long)__init_begin;

    pp = htab_convert_pte_flags(pgprot_val(PAGE_KERNEL_ROX), HPTE_USE_KERNEL_KEY);

    WARN_ON(!hash__change_memory_range(start, end, pp));
}

void hash__mark_initmem_nx(void)
{
    unsigned long start, end, pp;

    start = (unsigned long)__init_begin;
    end = (unsigned long)__init_end;

    pp = htab_convert_pte_flags(pgprot_val(PAGE_KERNEL), HPTE_USE_KERNEL_KEY);

    WARN_ON(!hash__change_memory_range(start, end, pp));
}
#endif /* CONFIG_STRICT_KERNEL_RWX */