0001 // SPDX-License-Identifier: GPL-2.0-or-later
0002 /*
0003  * Page table handling routines for radix page table.
0004  *
0005  * Copyright 2015-2016, Aneesh Kumar K.V, IBM Corporation.
0006  */
0007 
0008 #define pr_fmt(fmt) "radix-mmu: " fmt
0009 
0010 #include <linux/io.h>
0011 #include <linux/kernel.h>
0012 #include <linux/sched/mm.h>
0013 #include <linux/memblock.h>
0014 #include <linux/of.h>
0015 #include <linux/of_fdt.h>
0016 #include <linux/mm.h>
0017 #include <linux/hugetlb.h>
0018 #include <linux/string_helpers.h>
0019 #include <linux/memory.h>
0020 
0021 #include <asm/pgalloc.h>
0022 #include <asm/mmu_context.h>
0023 #include <asm/dma.h>
0024 #include <asm/machdep.h>
0025 #include <asm/mmu.h>
0026 #include <asm/firmware.h>
0027 #include <asm/powernv.h>
0028 #include <asm/sections.h>
0029 #include <asm/smp.h>
0030 #include <asm/trace.h>
0031 #include <asm/uaccess.h>
0032 #include <asm/ultravisor.h>
0033 
0034 #include <trace/events/thp.h>
0035 
0036 unsigned int mmu_base_pid;
0037 unsigned long radix_mem_block_size __ro_after_init;
0038 
0039 static __ref void *early_alloc_pgtable(unsigned long size, int nid,
0040             unsigned long region_start, unsigned long region_end)
0041 {
0042     phys_addr_t min_addr = MEMBLOCK_LOW_LIMIT;
0043     phys_addr_t max_addr = MEMBLOCK_ALLOC_ANYWHERE;
0044     void *ptr;
0045 
0046     if (region_start)
0047         min_addr = region_start;
0048     if (region_end)
0049         max_addr = region_end;
0050 
0051     ptr = memblock_alloc_try_nid(size, size, min_addr, max_addr, nid);
0052 
0053     if (!ptr)
0054         panic("%s: Failed to allocate %lu bytes align=0x%lx nid=%d from=%pa max_addr=%pa\n",
0055               __func__, size, size, nid, &min_addr, &max_addr);
0056 
0057     return ptr;
0058 }
0059 
0060 /*
0061  * When allocating pud or pmd pointers, we allocate a complete page
0062  * of PAGE_SIZE rather than PUD_TABLE_SIZE or PMD_TABLE_SIZE. This
0063  * is to ensure that the page obtained from the memblock allocator
0064  * can be completely used as a page table page and can be freed
0065  * correctly when the page table entries are removed.
0066  */
0067 static int early_map_kernel_page(unsigned long ea, unsigned long pa,
0068               pgprot_t flags,
0069               unsigned int map_page_size,
0070               int nid,
0071               unsigned long region_start, unsigned long region_end)
0072 {
0073     unsigned long pfn = pa >> PAGE_SHIFT;
0074     pgd_t *pgdp;
0075     p4d_t *p4dp;
0076     pud_t *pudp;
0077     pmd_t *pmdp;
0078     pte_t *ptep;
0079 
0080     pgdp = pgd_offset_k(ea);
0081     p4dp = p4d_offset(pgdp, ea);
0082     if (p4d_none(*p4dp)) {
0083         pudp = early_alloc_pgtable(PAGE_SIZE, nid,
0084                        region_start, region_end);
0085         p4d_populate(&init_mm, p4dp, pudp);
0086     }
0087     pudp = pud_offset(p4dp, ea);
0088     if (map_page_size == PUD_SIZE) {
0089         ptep = (pte_t *)pudp;
0090         goto set_the_pte;
0091     }
0092     if (pud_none(*pudp)) {
0093         pmdp = early_alloc_pgtable(PAGE_SIZE, nid, region_start,
0094                        region_end);
0095         pud_populate(&init_mm, pudp, pmdp);
0096     }
0097     pmdp = pmd_offset(pudp, ea);
0098     if (map_page_size == PMD_SIZE) {
0099         ptep = pmdp_ptep(pmdp);
0100         goto set_the_pte;
0101     }
0102     if (!pmd_present(*pmdp)) {
0103         ptep = early_alloc_pgtable(PAGE_SIZE, nid,
0104                         region_start, region_end);
0105         pmd_populate_kernel(&init_mm, pmdp, ptep);
0106     }
0107     ptep = pte_offset_kernel(pmdp, ea);
0108 
0109 set_the_pte:
0110     set_pte_at(&init_mm, ea, ptep, pfn_pte(pfn, flags));
0111     asm volatile("ptesync": : :"memory");
0112     return 0;
0113 }
0114 
0115 /*
0116  * nid, region_start, and region_end are hints to try to place the page
0117  * table memory in the same node or region.
0118  */
0119 static int __map_kernel_page(unsigned long ea, unsigned long pa,
0120               pgprot_t flags,
0121               unsigned int map_page_size,
0122               int nid,
0123               unsigned long region_start, unsigned long region_end)
0124 {
0125     unsigned long pfn = pa >> PAGE_SHIFT;
0126     pgd_t *pgdp;
0127     p4d_t *p4dp;
0128     pud_t *pudp;
0129     pmd_t *pmdp;
0130     pte_t *ptep;
0131     /*
0132      * Make sure task size is correct as per the max addr
0133      */
0134     BUILD_BUG_ON(TASK_SIZE_USER64 > RADIX_PGTABLE_RANGE);
0135 
0136 #ifdef CONFIG_PPC_64K_PAGES
0137     BUILD_BUG_ON(RADIX_KERN_MAP_SIZE != (1UL << MAX_EA_BITS_PER_CONTEXT));
0138 #endif
0139 
0140     if (unlikely(!slab_is_available()))
0141         return early_map_kernel_page(ea, pa, flags, map_page_size,
0142                         nid, region_start, region_end);
0143 
0144     /*
0145      * Should make page table allocation functions be able to take a
0146      * node, so we can place kernel page tables on the right nodes after
0147      * boot.
0148      */
0149     pgdp = pgd_offset_k(ea);
0150     p4dp = p4d_offset(pgdp, ea);
0151     pudp = pud_alloc(&init_mm, p4dp, ea);
0152     if (!pudp)
0153         return -ENOMEM;
0154     if (map_page_size == PUD_SIZE) {
0155         ptep = (pte_t *)pudp;
0156         goto set_the_pte;
0157     }
0158     pmdp = pmd_alloc(&init_mm, pudp, ea);
0159     if (!pmdp)
0160         return -ENOMEM;
0161     if (map_page_size == PMD_SIZE) {
0162         ptep = pmdp_ptep(pmdp);
0163         goto set_the_pte;
0164     }
0165     ptep = pte_alloc_kernel(pmdp, ea);
0166     if (!ptep)
0167         return -ENOMEM;
0168 
0169 set_the_pte:
0170     set_pte_at(&init_mm, ea, ptep, pfn_pte(pfn, flags));
0171     asm volatile("ptesync": : :"memory");
0172     return 0;
0173 }
0174 
0175 int radix__map_kernel_page(unsigned long ea, unsigned long pa,
0176               pgprot_t flags,
0177               unsigned int map_page_size)
0178 {
0179     return __map_kernel_page(ea, pa, flags, map_page_size, -1, 0, 0);
0180 }
0181 
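/*
 * Editor's illustrative sketch, not part of the original file: a
 * hypothetical caller of radix__map_kernel_page() above. The effective
 * and physical addresses are made-up values for illustration only.
 */
static int __init radix_map_example_page(void)
{
	unsigned long ea = 0xc000000012340000UL;	/* hypothetical EA */
	unsigned long pa = 0x12340000UL;		/* hypothetical PA */

	/* Installs a base-page-size PTE in init_mm and issues a ptesync. */
	return radix__map_kernel_page(ea, pa, PAGE_KERNEL, PAGE_SIZE);
}
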
0182 #ifdef CONFIG_STRICT_KERNEL_RWX
0183 static void radix__change_memory_range(unsigned long start, unsigned long end,
0184                        unsigned long clear)
0185 {
0186     unsigned long idx;
0187     pgd_t *pgdp;
0188     p4d_t *p4dp;
0189     pud_t *pudp;
0190     pmd_t *pmdp;
0191     pte_t *ptep;
0192 
0193     start = ALIGN_DOWN(start, PAGE_SIZE);
0194     end = PAGE_ALIGN(end); // aligns up
0195 
0196     pr_debug("Changing flags on range %lx-%lx removing 0x%lx\n",
0197          start, end, clear);
0198 
0199     for (idx = start; idx < end; idx += PAGE_SIZE) {
0200         pgdp = pgd_offset_k(idx);
0201         p4dp = p4d_offset(pgdp, idx);
0202         pudp = pud_alloc(&init_mm, p4dp, idx);
0203         if (!pudp)
0204             continue;
0205         if (pud_is_leaf(*pudp)) {
0206             ptep = (pte_t *)pudp;
0207             goto update_the_pte;
0208         }
0209         pmdp = pmd_alloc(&init_mm, pudp, idx);
0210         if (!pmdp)
0211             continue;
0212         if (pmd_is_leaf(*pmdp)) {
0213             ptep = pmdp_ptep(pmdp);
0214             goto update_the_pte;
0215         }
0216         ptep = pte_alloc_kernel(pmdp, idx);
0217         if (!ptep)
0218             continue;
0219 update_the_pte:
0220         radix__pte_update(&init_mm, idx, ptep, clear, 0, 0);
0221     }
0222 
0223     radix__flush_tlb_kernel_range(start, end);
0224 }
0225 
0226 void radix__mark_rodata_ro(void)
0227 {
0228     unsigned long start, end;
0229 
0230     start = (unsigned long)_stext;
0231     end = (unsigned long)__init_begin;
0232 
0233     radix__change_memory_range(start, end, _PAGE_WRITE);
0234 }
0235 
0236 void radix__mark_initmem_nx(void)
0237 {
0238     unsigned long start = (unsigned long)__init_begin;
0239     unsigned long end = (unsigned long)__init_end;
0240 
0241     radix__change_memory_range(start, end, _PAGE_EXEC);
0242 }
0243 #endif /* CONFIG_STRICT_KERNEL_RWX */
0244 
0245 static inline void __meminit
0246 print_mapping(unsigned long start, unsigned long end, unsigned long size, bool exec)
0247 {
0248     char buf[10];
0249 
0250     if (end <= start)
0251         return;
0252 
0253     string_get_size(size, 1, STRING_UNITS_2, buf, sizeof(buf));
0254 
0255     pr_info("Mapped 0x%016lx-0x%016lx with %s pages%s\n", start, end, buf,
0256         exec ? " (exec)" : "");
0257 }
0258 
0259 static unsigned long next_boundary(unsigned long addr, unsigned long end)
0260 {
0261 #ifdef CONFIG_STRICT_KERNEL_RWX
0262     if (addr < __pa_symbol(__init_begin))
0263         return __pa_symbol(__init_begin);
0264 #endif
0265     return end;
0266 }
0267 
0268 static int __meminit create_physical_mapping(unsigned long start,
0269                          unsigned long end,
0270                          unsigned long max_mapping_size,
0271                          int nid, pgprot_t _prot)
0272 {
0273     unsigned long vaddr, addr, mapping_size = 0;
0274     bool prev_exec, exec = false;
0275     pgprot_t prot;
0276     int psize;
0277 
0278     start = ALIGN(start, PAGE_SIZE);
0279     end   = ALIGN_DOWN(end, PAGE_SIZE);
0280     for (addr = start; addr < end; addr += mapping_size) {
0281         unsigned long gap, previous_size;
0282         int rc;
0283 
0284         gap = next_boundary(addr, end) - addr;
0285         if (gap > max_mapping_size)
0286             gap = max_mapping_size;
0287         previous_size = mapping_size;
0288         prev_exec = exec;
0289 
0290         if (IS_ALIGNED(addr, PUD_SIZE) && gap >= PUD_SIZE &&
0291             mmu_psize_defs[MMU_PAGE_1G].shift) {
0292             mapping_size = PUD_SIZE;
0293             psize = MMU_PAGE_1G;
0294         } else if (IS_ALIGNED(addr, PMD_SIZE) && gap >= PMD_SIZE &&
0295                mmu_psize_defs[MMU_PAGE_2M].shift) {
0296             mapping_size = PMD_SIZE;
0297             psize = MMU_PAGE_2M;
0298         } else {
0299             mapping_size = PAGE_SIZE;
0300             psize = mmu_virtual_psize;
0301         }
0302 
0303         vaddr = (unsigned long)__va(addr);
0304 
0305         if (overlaps_kernel_text(vaddr, vaddr + mapping_size) ||
0306             overlaps_interrupt_vector_text(vaddr, vaddr + mapping_size)) {
0307             prot = PAGE_KERNEL_X;
0308             exec = true;
0309         } else {
0310             prot = _prot;
0311             exec = false;
0312         }
0313 
0314         if (mapping_size != previous_size || exec != prev_exec) {
0315             print_mapping(start, addr, previous_size, prev_exec);
0316             start = addr;
0317         }
0318 
0319         rc = __map_kernel_page(vaddr, addr, prot, mapping_size, nid, start, end);
0320         if (rc)
0321             return rc;
0322 
0323         update_page_count(psize, 1);
0324     }
0325 
0326     print_mapping(start, addr, mapping_size, exec);
0327     return 0;
0328 }
0329 
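/*
 * Editor's illustrative sketch, not part of the original file: the page
 * size selection used by create_physical_mapping() above, pulled out on
 * its own. Given an address and the distance to the next boundary, it
 * returns the mapping size the loop would choose, assuming both 2M and
 * 1G page sizes are available.
 */
static unsigned long __init example_pick_mapping_size(unsigned long addr,
						      unsigned long gap)
{
	if (IS_ALIGNED(addr, PUD_SIZE) && gap >= PUD_SIZE)
		return PUD_SIZE;	/* 1G mapping */
	if (IS_ALIGNED(addr, PMD_SIZE) && gap >= PMD_SIZE)
		return PMD_SIZE;	/* 2M mapping */
	return PAGE_SIZE;		/* fall back to base pages */
}
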
0330 static void __init radix_init_pgtable(void)
0331 {
0332     unsigned long rts_field;
0333     phys_addr_t start, end;
0334     u64 i;
0335 
0336     /* We don't support slb for radix */
0337     slb_set_size(0);
0338 
0339     /*
0340      * Create the linear mapping
0341      */
0342     for_each_mem_range(i, &start, &end) {
0343         /*
0344          * The memblock allocator is up at this point, so the
0345          * page tables will be allocated within the range. No
0346          * need for a node (which we don't have yet).
0347          */
0348 
0349         if (end >= RADIX_VMALLOC_START) {
0350             pr_warn("Outside the supported range\n");
0351             continue;
0352         }
0353 
0354         WARN_ON(create_physical_mapping(start, end,
0355                         radix_mem_block_size,
0356                         -1, PAGE_KERNEL));
0357     }
0358 
0359     if (!cpu_has_feature(CPU_FTR_HVMODE) &&
0360             cpu_has_feature(CPU_FTR_P9_RADIX_PREFETCH_BUG)) {
0361         /*
0362          * Older versions of KVM on these machines prefer that the
0363          * guest only use the low 19 PID bits.
0364          */
0365         mmu_pid_bits = 19;
0366     }
0367     mmu_base_pid = 1;
0368 
0369     /*
0370      * Allocate Partition table and process table for the
0371      * host.
0372      */
0373     BUG_ON(PRTB_SIZE_SHIFT > 36);
0374     process_tb = early_alloc_pgtable(1UL << PRTB_SIZE_SHIFT, -1, 0, 0);
0375     /*
0376      * Fill in the process table.
0377      */
0378     rts_field = radix__get_tree_size();
0379     process_tb->prtb0 = cpu_to_be64(rts_field | __pa(init_mm.pgd) | RADIX_PGD_INDEX_SIZE);
0380 
0381     /*
0382      * The init_mm context is given the first available (non-zero) PID,
0383      * which is the "guard PID" and contains no page table. PIDR should
0384      * never be set to zero because that duplicates the kernel address
0385      * space at the 0x0... offset (quadrant 0)!
0386      *
0387      * An arbitrary PID that may later be allocated by the PID allocator
0388      * for userspace processes must not be used either, because that
0389      * would cause stale user mappings for that PID on CPUs outside of
0390      * the TLB invalidation scheme (because it won't be in mm_cpumask).
0391      *
0392      * So permanently carve out one PID for the purpose of a guard PID.
0393      */
0394     init_mm.context.id = mmu_base_pid;
0395     mmu_base_pid++;
0396 }
0397 
0398 static void __init radix_init_partition_table(void)
0399 {
0400     unsigned long rts_field, dw0, dw1;
0401 
0402     mmu_partition_table_init();
0403     rts_field = radix__get_tree_size();
0404     dw0 = rts_field | __pa(init_mm.pgd) | RADIX_PGD_INDEX_SIZE | PATB_HR;
0405     dw1 = __pa(process_tb) | (PRTB_SIZE_SHIFT - 12) | PATB_GR;
0406     mmu_partition_table_set_entry(0, dw0, dw1, false);
0407 
0408     pr_info("Initializing Radix MMU\n");
0409 }
0410 
0411 static int __init get_idx_from_shift(unsigned int shift)
0412 {
0413     int idx = -1;
0414 
0415     switch (shift) {
0416     case 0xc:
0417         idx = MMU_PAGE_4K;
0418         break;
0419     case 0x10:
0420         idx = MMU_PAGE_64K;
0421         break;
0422     case 0x15:
0423         idx = MMU_PAGE_2M;
0424         break;
0425     case 0x1e:
0426         idx = MMU_PAGE_1G;
0427         break;
0428     }
0429     return idx;
0430 }
0431 
0432 static int __init radix_dt_scan_page_sizes(unsigned long node,
0433                        const char *uname, int depth,
0434                        void *data)
0435 {
0436     int size = 0;
0437     int shift, idx;
0438     unsigned int ap;
0439     const __be32 *prop;
0440     const char *type = of_get_flat_dt_prop(node, "device_type", NULL);
0441 
0442     /* We are scanning "cpu" nodes only */
0443     if (type == NULL || strcmp(type, "cpu") != 0)
0444         return 0;
0445 
0446     /* Grab page size encodings */
0447     prop = of_get_flat_dt_prop(node, "ibm,processor-radix-AP-encodings", &size);
0448     if (!prop)
0449         return 0;
0450 
0451     pr_info("Page sizes from device-tree:\n");
0452     for (; size >= 4; size -= 4, ++prop) {
0453 
0454         struct mmu_psize_def *def;
0455 
0456         /* top 3 bits are the AP encoding */
0457         shift = be32_to_cpu(prop[0]) & ~(0xe << 28);
0458         ap = be32_to_cpu(prop[0]) >> 29;
0459         pr_info("Page size shift = %d AP=0x%x\n", shift, ap);
0460 
0461         idx = get_idx_from_shift(shift);
0462         if (idx < 0)
0463             continue;
0464 
0465         def = &mmu_psize_defs[idx];
0466         def->shift = shift;
0467         def->ap  = ap;
0468         def->h_rpt_pgsize = psize_to_rpti_pgsize(idx);
0469     }
0470 
0471     /* needed ? */
0472     cur_cpu_spec->mmu_features &= ~MMU_FTR_NO_SLBIE_B;
0473     return 1;
0474 }
0475 
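/*
 * Editor's illustrative sketch, not part of the original file: how one
 * cell of "ibm,processor-radix-AP-encodings" decodes with the masks used
 * above. The cell value 0xA0000010 is an assumed example (AP field 0x5,
 * shift 16, i.e. 64K pages).
 */
static void __init radix_decode_ap_cell_example(void)
{
	u32 cell = 0xA0000010;				/* assumed example value */
	unsigned int ap = cell >> 29;			/* top 3 bits: 0x5 */
	unsigned int shift = cell & ~(0xe << 28);	/* low 29 bits: 0x10 */

	pr_info("example AP cell: shift=%u ap=0x%x\n", shift, ap);
}
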
0476 #ifdef CONFIG_MEMORY_HOTPLUG
0477 static int __init probe_memory_block_size(unsigned long node, const char *uname, int
0478                       depth, void *data)
0479 {
0480     unsigned long *mem_block_size = (unsigned long *)data;
0481     const __be32 *prop;
0482     int len;
0483 
0484     if (depth != 1)
0485         return 0;
0486 
0487     if (strcmp(uname, "ibm,dynamic-reconfiguration-memory"))
0488         return 0;
0489 
0490     prop = of_get_flat_dt_prop(node, "ibm,lmb-size", &len);
0491 
0492     if (!prop || len < dt_root_size_cells * sizeof(__be32))
0493         /*
0494          * Nothing in the device tree
0495          */
0496         *mem_block_size = MIN_MEMORY_BLOCK_SIZE;
0497     else
0498         *mem_block_size = of_read_number(prop, dt_root_size_cells);
0499     return 1;
0500 }
0501 
0502 static unsigned long __init radix_memory_block_size(void)
0503 {
0504     unsigned long mem_block_size = MIN_MEMORY_BLOCK_SIZE;
0505 
0506     /*
0507      * The OPAL firmware feature flag has been set up by now,
0508      * so it is safe to test for it here.
0509      */
0510     if (firmware_has_feature(FW_FEATURE_OPAL))
0511         mem_block_size = 1UL * 1024 * 1024 * 1024;
0512     else
0513         of_scan_flat_dt(probe_memory_block_size, &mem_block_size);
0514 
0515     return mem_block_size;
0516 }
0517 
0518 #else   /* CONFIG_MEMORY_HOTPLUG */
0519 
0520 static unsigned long __init radix_memory_block_size(void)
0521 {
0522     return 1UL * 1024 * 1024 * 1024;
0523 }
0524 
0525 #endif /* CONFIG_MEMORY_HOTPLUG */
0526 
0527 
0528 void __init radix__early_init_devtree(void)
0529 {
0530     int rc;
0531 
0532     /*
0533      * Try to find the available page sizes in the device-tree
0534      */
0535     rc = of_scan_flat_dt(radix_dt_scan_page_sizes, NULL);
0536     if (!rc) {
0537         /*
0538          * No page size details found in the device tree.
0539          * Let's assume we have 4K and 64K page support.
0540          */
0541         mmu_psize_defs[MMU_PAGE_4K].shift = 12;
0542         mmu_psize_defs[MMU_PAGE_4K].ap = 0x0;
0543         mmu_psize_defs[MMU_PAGE_4K].h_rpt_pgsize =
0544             psize_to_rpti_pgsize(MMU_PAGE_4K);
0545 
0546         mmu_psize_defs[MMU_PAGE_64K].shift = 16;
0547         mmu_psize_defs[MMU_PAGE_64K].ap = 0x5;
0548         mmu_psize_defs[MMU_PAGE_64K].h_rpt_pgsize =
0549             psize_to_rpti_pgsize(MMU_PAGE_64K);
0550     }
0551 
0552     /*
0553      * Max mapping size used when mapping pages. We don't use
0554      * ppc_md.memory_block_size() here because this gets called
0555      * early, before the machine probe has run. Also, the
0556      * pseries implementation only checks for ibm,lmb-size.
0557      * All hypervisors supporting radix expose that device
0558      * tree node.
0559      */
0560     radix_mem_block_size = radix_memory_block_size();
0561     return;
0562 }
0563 
0564 void __init radix__early_init_mmu(void)
0565 {
0566     unsigned long lpcr;
0567 
0568 #ifdef CONFIG_PPC_64S_HASH_MMU
0569 #ifdef CONFIG_PPC_64K_PAGES
0570     /* PAGE_SIZE mappings */
0571     mmu_virtual_psize = MMU_PAGE_64K;
0572 #else
0573     mmu_virtual_psize = MMU_PAGE_4K;
0574 #endif
0575 
0576 #ifdef CONFIG_SPARSEMEM_VMEMMAP
0577     /* vmemmap mapping */
0578     if (mmu_psize_defs[MMU_PAGE_2M].shift) {
0579         /*
0580          * map vmemmap using 2M if available
0581          */
0582         mmu_vmemmap_psize = MMU_PAGE_2M;
0583     } else
0584         mmu_vmemmap_psize = mmu_virtual_psize;
0585 #endif
0586 #endif
0587     /*
0588      * initialize page table size
0589      */
0590     __pte_index_size = RADIX_PTE_INDEX_SIZE;
0591     __pmd_index_size = RADIX_PMD_INDEX_SIZE;
0592     __pud_index_size = RADIX_PUD_INDEX_SIZE;
0593     __pgd_index_size = RADIX_PGD_INDEX_SIZE;
0594     __pud_cache_index = RADIX_PUD_INDEX_SIZE;
0595     __pte_table_size = RADIX_PTE_TABLE_SIZE;
0596     __pmd_table_size = RADIX_PMD_TABLE_SIZE;
0597     __pud_table_size = RADIX_PUD_TABLE_SIZE;
0598     __pgd_table_size = RADIX_PGD_TABLE_SIZE;
0599 
0600     __pmd_val_bits = RADIX_PMD_VAL_BITS;
0601     __pud_val_bits = RADIX_PUD_VAL_BITS;
0602     __pgd_val_bits = RADIX_PGD_VAL_BITS;
0603 
0604     __kernel_virt_start = RADIX_KERN_VIRT_START;
0605     __vmalloc_start = RADIX_VMALLOC_START;
0606     __vmalloc_end = RADIX_VMALLOC_END;
0607     __kernel_io_start = RADIX_KERN_IO_START;
0608     __kernel_io_end = RADIX_KERN_IO_END;
0609     vmemmap = (struct page *)RADIX_VMEMMAP_START;
0610     ioremap_bot = IOREMAP_BASE;
0611 
0612 #ifdef CONFIG_PCI
0613     pci_io_base = ISA_IO_BASE;
0614 #endif
0615     __pte_frag_nr = RADIX_PTE_FRAG_NR;
0616     __pte_frag_size_shift = RADIX_PTE_FRAG_SIZE_SHIFT;
0617     __pmd_frag_nr = RADIX_PMD_FRAG_NR;
0618     __pmd_frag_size_shift = RADIX_PMD_FRAG_SIZE_SHIFT;
0619 
0620     radix_init_pgtable();
0621 
0622     if (!firmware_has_feature(FW_FEATURE_LPAR)) {
0623         lpcr = mfspr(SPRN_LPCR);
0624         mtspr(SPRN_LPCR, lpcr | LPCR_UPRT | LPCR_HR);
0625         radix_init_partition_table();
0626     } else {
0627         radix_init_pseries();
0628     }
0629 
0630     memblock_set_current_limit(MEMBLOCK_ALLOC_ANYWHERE);
0631 
0632     /* Switch to the guard PID before turning on MMU */
0633     radix__switch_mmu_context(NULL, &init_mm);
0634     tlbiel_all();
0635 }
0636 
0637 void radix__early_init_mmu_secondary(void)
0638 {
0639     unsigned long lpcr;
0640     /*
0641      * update partition table control register and UPRT
0642      */
0643     if (!firmware_has_feature(FW_FEATURE_LPAR)) {
0644         lpcr = mfspr(SPRN_LPCR);
0645         mtspr(SPRN_LPCR, lpcr | LPCR_UPRT | LPCR_HR);
0646 
0647         set_ptcr_when_no_uv(__pa(partition_tb) |
0648                     (PATB_SIZE_SHIFT - 12));
0649     }
0650 
0651     radix__switch_mmu_context(NULL, &init_mm);
0652     tlbiel_all();
0653 
0654     /* Make sure userspace can't change the AMR */
0655     mtspr(SPRN_UAMOR, 0);
0656 }
0657 
0658 /* Called during kexec sequence with MMU off */
0659 notrace void radix__mmu_cleanup_all(void)
0660 {
0661     unsigned long lpcr;
0662 
0663     if (!firmware_has_feature(FW_FEATURE_LPAR)) {
0664         lpcr = mfspr(SPRN_LPCR);
0665         mtspr(SPRN_LPCR, lpcr & ~LPCR_UPRT);
0666         set_ptcr_when_no_uv(0);
0667         powernv_set_nmmu_ptcr(0);
0668         radix__flush_tlb_all();
0669     }
0670 }
0671 
0672 #ifdef CONFIG_MEMORY_HOTPLUG
0673 static void free_pte_table(pte_t *pte_start, pmd_t *pmd)
0674 {
0675     pte_t *pte;
0676     int i;
0677 
0678     for (i = 0; i < PTRS_PER_PTE; i++) {
0679         pte = pte_start + i;
0680         if (!pte_none(*pte))
0681             return;
0682     }
0683 
0684     pte_free_kernel(&init_mm, pte_start);
0685     pmd_clear(pmd);
0686 }
0687 
0688 static void free_pmd_table(pmd_t *pmd_start, pud_t *pud)
0689 {
0690     pmd_t *pmd;
0691     int i;
0692 
0693     for (i = 0; i < PTRS_PER_PMD; i++) {
0694         pmd = pmd_start + i;
0695         if (!pmd_none(*pmd))
0696             return;
0697     }
0698 
0699     pmd_free(&init_mm, pmd_start);
0700     pud_clear(pud);
0701 }
0702 
0703 static void free_pud_table(pud_t *pud_start, p4d_t *p4d)
0704 {
0705     pud_t *pud;
0706     int i;
0707 
0708     for (i = 0; i < PTRS_PER_PUD; i++) {
0709         pud = pud_start + i;
0710         if (!pud_none(*pud))
0711             return;
0712     }
0713 
0714     pud_free(&init_mm, pud_start);
0715     p4d_clear(p4d);
0716 }
0717 
0718 static void remove_pte_table(pte_t *pte_start, unsigned long addr,
0719                  unsigned long end)
0720 {
0721     unsigned long next;
0722     pte_t *pte;
0723 
0724     pte = pte_start + pte_index(addr);
0725     for (; addr < end; addr = next, pte++) {
0726         next = (addr + PAGE_SIZE) & PAGE_MASK;
0727         if (next > end)
0728             next = end;
0729 
0730         if (!pte_present(*pte))
0731             continue;
0732 
0733         if (!PAGE_ALIGNED(addr) || !PAGE_ALIGNED(next)) {
0734             /*
0735              * The vmemmap_free() and remove_section_mapping()
0736              * codepaths call us with aligned addresses.
0737              */
0738             WARN_ONCE(1, "%s: unaligned range\n", __func__);
0739             continue;
0740         }
0741 
0742         pte_clear(&init_mm, addr, pte);
0743     }
0744 }
0745 
0746 static void __meminit remove_pmd_table(pmd_t *pmd_start, unsigned long addr,
0747                  unsigned long end)
0748 {
0749     unsigned long next;
0750     pte_t *pte_base;
0751     pmd_t *pmd;
0752 
0753     pmd = pmd_start + pmd_index(addr);
0754     for (; addr < end; addr = next, pmd++) {
0755         next = pmd_addr_end(addr, end);
0756 
0757         if (!pmd_present(*pmd))
0758             continue;
0759 
0760         if (pmd_is_leaf(*pmd)) {
0761             if (!IS_ALIGNED(addr, PMD_SIZE) ||
0762                 !IS_ALIGNED(next, PMD_SIZE)) {
0763                 WARN_ONCE(1, "%s: unaligned range\n", __func__);
0764                 continue;
0765             }
0766             pte_clear(&init_mm, addr, (pte_t *)pmd);
0767             continue;
0768         }
0769 
0770         pte_base = (pte_t *)pmd_page_vaddr(*pmd);
0771         remove_pte_table(pte_base, addr, next);
0772         free_pte_table(pte_base, pmd);
0773     }
0774 }
0775 
0776 static void __meminit remove_pud_table(pud_t *pud_start, unsigned long addr,
0777                  unsigned long end)
0778 {
0779     unsigned long next;
0780     pmd_t *pmd_base;
0781     pud_t *pud;
0782 
0783     pud = pud_start + pud_index(addr);
0784     for (; addr < end; addr = next, pud++) {
0785         next = pud_addr_end(addr, end);
0786 
0787         if (!pud_present(*pud))
0788             continue;
0789 
0790         if (pud_is_leaf(*pud)) {
0791             if (!IS_ALIGNED(addr, PUD_SIZE) ||
0792                 !IS_ALIGNED(next, PUD_SIZE)) {
0793                 WARN_ONCE(1, "%s: unaligned range\n", __func__);
0794                 continue;
0795             }
0796             pte_clear(&init_mm, addr, (pte_t *)pud);
0797             continue;
0798         }
0799 
0800         pmd_base = pud_pgtable(*pud);
0801         remove_pmd_table(pmd_base, addr, next);
0802         free_pmd_table(pmd_base, pud);
0803     }
0804 }
0805 
0806 static void __meminit remove_pagetable(unsigned long start, unsigned long end)
0807 {
0808     unsigned long addr, next;
0809     pud_t *pud_base;
0810     pgd_t *pgd;
0811     p4d_t *p4d;
0812 
0813     spin_lock(&init_mm.page_table_lock);
0814 
0815     for (addr = start; addr < end; addr = next) {
0816         next = pgd_addr_end(addr, end);
0817 
0818         pgd = pgd_offset_k(addr);
0819         p4d = p4d_offset(pgd, addr);
0820         if (!p4d_present(*p4d))
0821             continue;
0822 
0823         if (p4d_is_leaf(*p4d)) {
0824             if (!IS_ALIGNED(addr, P4D_SIZE) ||
0825                 !IS_ALIGNED(next, P4D_SIZE)) {
0826                 WARN_ONCE(1, "%s: unaligned range\n", __func__);
0827                 continue;
0828             }
0829 
0830             pte_clear(&init_mm, addr, (pte_t *)pgd);
0831             continue;
0832         }
0833 
0834         pud_base = p4d_pgtable(*p4d);
0835         remove_pud_table(pud_base, addr, next);
0836         free_pud_table(pud_base, p4d);
0837     }
0838 
0839     spin_unlock(&init_mm.page_table_lock);
0840     radix__flush_tlb_kernel_range(start, end);
0841 }
0842 
0843 int __meminit radix__create_section_mapping(unsigned long start,
0844                         unsigned long end, int nid,
0845                         pgprot_t prot)
0846 {
0847     if (end >= RADIX_VMALLOC_START) {
0848         pr_warn("Outside the supported range\n");
0849         return -1;
0850     }
0851 
0852     return create_physical_mapping(__pa(start), __pa(end),
0853                        radix_mem_block_size, nid, prot);
0854 }
0855 
0856 int __meminit radix__remove_section_mapping(unsigned long start, unsigned long end)
0857 {
0858     remove_pagetable(start, end);
0859     return 0;
0860 }
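
/*
 * Editor's illustrative sketch, not part of the original file: the two
 * section-mapping hooks above are what the powerpc memory hotplug path
 * ultimately calls when a memory block is added and later removed again.
 * start/end are kernel virtual addresses of the hypothetical block.
 */
static int __meminit example_hotplug_roundtrip(unsigned long start,
					       unsigned long end, int nid)
{
	int rc;

	rc = radix__create_section_mapping(start, end, nid, PAGE_KERNEL);
	if (rc)
		return rc;

	return radix__remove_section_mapping(start, end);
}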
0861 #endif /* CONFIG_MEMORY_HOTPLUG */
0862 
0863 #ifdef CONFIG_SPARSEMEM_VMEMMAP
0864 static int __map_kernel_page_nid(unsigned long ea, unsigned long pa,
0865                  pgprot_t flags, unsigned int map_page_size,
0866                  int nid)
0867 {
0868     return __map_kernel_page(ea, pa, flags, map_page_size, nid, 0, 0);
0869 }
0870 
0871 int __meminit radix__vmemmap_create_mapping(unsigned long start,
0872                       unsigned long page_size,
0873                       unsigned long phys)
0874 {
0875     /* Create a PTE encoding */
0876     unsigned long flags = _PAGE_PRESENT | _PAGE_ACCESSED | _PAGE_KERNEL_RW;
0877     int nid = early_pfn_to_nid(phys >> PAGE_SHIFT);
0878     int ret;
0879 
0880     if ((start + page_size) >= RADIX_VMEMMAP_END) {
0881         pr_warn("Outside the supported range\n");
0882         return -1;
0883     }
0884 
0885     ret = __map_kernel_page_nid(start, phys, __pgprot(flags), page_size, nid);
0886     BUG_ON(ret);
0887 
0888     return 0;
0889 }
0890 
0891 #ifdef CONFIG_MEMORY_HOTPLUG
0892 void __meminit radix__vmemmap_remove_mapping(unsigned long start, unsigned long page_size)
0893 {
0894     remove_pagetable(start, start + page_size);
0895 }
0896 #endif
0897 #endif
0898 
0899 #ifdef CONFIG_DEBUG_PAGEALLOC
0900 void radix__kernel_map_pages(struct page *page, int numpages, int enable)
0901 {
0902     pr_warn_once("DEBUG_PAGEALLOC not supported in radix mode\n");
0903 }
0904 #endif
0905 
0906 #ifdef CONFIG_TRANSPARENT_HUGEPAGE
0907 
0908 unsigned long radix__pmd_hugepage_update(struct mm_struct *mm, unsigned long addr,
0909                   pmd_t *pmdp, unsigned long clr,
0910                   unsigned long set)
0911 {
0912     unsigned long old;
0913 
0914 #ifdef CONFIG_DEBUG_VM
0915     WARN_ON(!radix__pmd_trans_huge(*pmdp) && !pmd_devmap(*pmdp));
0916     assert_spin_locked(pmd_lockptr(mm, pmdp));
0917 #endif
0918 
0919     old = radix__pte_update(mm, addr, (pte_t *)pmdp, clr, set, 1);
0920     trace_hugepage_update(addr, old, clr, set);
0921 
0922     return old;
0923 }
0924 
0925 pmd_t radix__pmdp_collapse_flush(struct vm_area_struct *vma, unsigned long address,
0926             pmd_t *pmdp)
0927 
0928 {
0929     pmd_t pmd;
0930 
0931     VM_BUG_ON(address & ~HPAGE_PMD_MASK);
0932     VM_BUG_ON(radix__pmd_trans_huge(*pmdp));
0933     VM_BUG_ON(pmd_devmap(*pmdp));
0934     /*
0935      * khugepaged calls this for normal pmd
0936      */
0937     pmd = *pmdp;
0938     pmd_clear(pmdp);
0939 
0940     radix__flush_tlb_collapsed_pmd(vma->vm_mm, address);
0941 
0942     return pmd;
0943 }
0944 
0945 /*
0946  * For us pgtable_t is pte_t *. In order to save the deposited
0947  * page table, we consider the allocated page table as a list
0948  * head. On withdraw we need to make sure we zero out the used
0949  * list_head memory area.
0950  */
0951 void radix__pgtable_trans_huge_deposit(struct mm_struct *mm, pmd_t *pmdp,
0952                  pgtable_t pgtable)
0953 {
0954     struct list_head *lh = (struct list_head *) pgtable;
0955 
0956     assert_spin_locked(pmd_lockptr(mm, pmdp));
0957 
0958     /* FIFO */
0959     if (!pmd_huge_pte(mm, pmdp))
0960         INIT_LIST_HEAD(lh);
0961     else
0962         list_add(lh, (struct list_head *) pmd_huge_pte(mm, pmdp));
0963     pmd_huge_pte(mm, pmdp) = pgtable;
0964 }
0965 
0966 pgtable_t radix__pgtable_trans_huge_withdraw(struct mm_struct *mm, pmd_t *pmdp)
0967 {
0968     pte_t *ptep;
0969     pgtable_t pgtable;
0970     struct list_head *lh;
0971 
0972     assert_spin_locked(pmd_lockptr(mm, pmdp));
0973 
0974     /* FIFO */
0975     pgtable = pmd_huge_pte(mm, pmdp);
0976     lh = (struct list_head *) pgtable;
0977     if (list_empty(lh))
0978         pmd_huge_pte(mm, pmdp) = NULL;
0979     else {
0980         pmd_huge_pte(mm, pmdp) = (pgtable_t) lh->next;
0981         list_del(lh);
0982     }
0983     ptep = (pte_t *) pgtable;
0984     *ptep = __pte(0);
0985     ptep++;
0986     *ptep = __pte(0);
0987     return pgtable;
0988 }
0989 
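/*
 * Editor's illustrative sketch, not part of the original file: how the
 * deposit/withdraw pair above is meant to be used. The pte page doubles
 * as the list_head, so a caller (the generic THP code in practice)
 * stashes the page under the pmd lock and gets the same page back later.
 */
static void example_deposit_withdraw(struct mm_struct *mm, pmd_t *pmdp,
				     pgtable_t pgtable)
{
	spinlock_t *ptl = pmd_lockptr(mm, pmdp);

	spin_lock(ptl);
	radix__pgtable_trans_huge_deposit(mm, pmdp, pgtable);
	pgtable = radix__pgtable_trans_huge_withdraw(mm, pmdp);
	spin_unlock(ptl);

	pte_free(mm, pgtable);		/* the caller owns the page again */
}
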
0990 pmd_t radix__pmdp_huge_get_and_clear(struct mm_struct *mm,
0991                      unsigned long addr, pmd_t *pmdp)
0992 {
0993     pmd_t old_pmd;
0994     unsigned long old;
0995 
0996     old = radix__pmd_hugepage_update(mm, addr, pmdp, ~0UL, 0);
0997     old_pmd = __pmd(old);
0998     return old_pmd;
0999 }
1000 
1001 #endif /* CONFIG_TRANSPARENT_HUGEPAGE */
1002 
1003 void radix__ptep_set_access_flags(struct vm_area_struct *vma, pte_t *ptep,
1004                   pte_t entry, unsigned long address, int psize)
1005 {
1006     struct mm_struct *mm = vma->vm_mm;
1007     unsigned long set = pte_val(entry) & (_PAGE_DIRTY | _PAGE_ACCESSED |
1008                           _PAGE_RW | _PAGE_EXEC);
1009 
1010     unsigned long change = pte_val(entry) ^ pte_val(*ptep);
1011     /*
1012      * On POWER9, the NMMU is not able to relax PTE access permissions
1013      * for a translation cached in the TLB. The PTE must be invalidated
1014      * and the TLB flushed before the new PTE is installed.
1015      *
1016      * This only needs to be done for radix, because hash translation does
1017      * flush when updating the linux pte (and we don't support NMMU
1018      * accelerators on HPT on POWER9 anyway XXX: do we?).
1019      *
1020      * POWER10 (and P9P) NMMU does behave as per ISA.
1021      */
1022     if (!cpu_has_feature(CPU_FTR_ARCH_31) && (change & _PAGE_RW) &&
1023         atomic_read(&mm->context.copros) > 0) {
1024         unsigned long old_pte, new_pte;
1025 
1026         old_pte = __radix_pte_update(ptep, _PAGE_PRESENT, _PAGE_INVALID);
1027         new_pte = old_pte | set;
1028         radix__flush_tlb_page_psize(mm, address, psize);
1029         __radix_pte_update(ptep, _PAGE_INVALID, new_pte);
1030     } else {
1031         __radix_pte_update(ptep, 0, set);
1032         /*
1033          * Book3S does not require a TLB flush when relaxing access
1034          * restrictions when the address space (modulo the POWER9 nest
1035          * MMU issue above) because the MMU will reload the PTE after
1036          * taking an access fault, as defined by the architecture. See
1037          * "Setting a Reference or Change Bit or Upgrading Access
1038          *  Authority (PTE Subject to Atomic Hardware Updates)" in
1039          *  Power ISA Version 3.1B.
1040          */
1041     }
1042     /* See ptesync comment in radix__set_pte_at */
1043 }
1044 
1045 void radix__ptep_modify_prot_commit(struct vm_area_struct *vma,
1046                     unsigned long addr, pte_t *ptep,
1047                     pte_t old_pte, pte_t pte)
1048 {
1049     struct mm_struct *mm = vma->vm_mm;
1050 
1051     /*
1052      * POWER9 NMMU must flush the TLB after clearing the PTE before
1053      * installing a PTE with more relaxed access permissions, see
1054      * radix__ptep_set_access_flags.
1055      */
1056     if (!cpu_has_feature(CPU_FTR_ARCH_31) &&
1057         is_pte_rw_upgrade(pte_val(old_pte), pte_val(pte)) &&
1058         (atomic_read(&mm->context.copros) > 0))
1059         radix__flush_tlb_page(vma, addr);
1060 
1061     set_pte_at(mm, addr, ptep, pte);
1062 }
1063 
1064 int pud_set_huge(pud_t *pud, phys_addr_t addr, pgprot_t prot)
1065 {
1066     pte_t *ptep = (pte_t *)pud;
1067     pte_t new_pud = pfn_pte(__phys_to_pfn(addr), prot);
1068 
1069     if (!radix_enabled())
1070         return 0;
1071 
1072     set_pte_at(&init_mm, 0 /* radix unused */, ptep, new_pud);
1073 
1074     return 1;
1075 }
1076 
1077 int pud_clear_huge(pud_t *pud)
1078 {
1079     if (pud_is_leaf(*pud)) {
1080         pud_clear(pud);
1081         return 1;
1082     }
1083 
1084     return 0;
1085 }
1086 
1087 int pud_free_pmd_page(pud_t *pud, unsigned long addr)
1088 {
1089     pmd_t *pmd;
1090     int i;
1091 
1092     pmd = pud_pgtable(*pud);
1093     pud_clear(pud);
1094 
1095     flush_tlb_kernel_range(addr, addr + PUD_SIZE);
1096 
1097     for (i = 0; i < PTRS_PER_PMD; i++) {
1098         if (!pmd_none(pmd[i])) {
1099             pte_t *pte;
1100             pte = (pte_t *)pmd_page_vaddr(pmd[i]);
1101 
1102             pte_free_kernel(&init_mm, pte);
1103         }
1104     }
1105 
1106     pmd_free(&init_mm, pmd);
1107 
1108     return 1;
1109 }
1110 
1111 int pmd_set_huge(pmd_t *pmd, phys_addr_t addr, pgprot_t prot)
1112 {
1113     pte_t *ptep = (pte_t *)pmd;
1114     pte_t new_pmd = pfn_pte(__phys_to_pfn(addr), prot);
1115 
1116     if (!radix_enabled())
1117         return 0;
1118 
1119     set_pte_at(&init_mm, 0 /* radix unused */, ptep, new_pmd);
1120 
1121     return 1;
1122 }
1123 
1124 int pmd_clear_huge(pmd_t *pmd)
1125 {
1126     if (pmd_is_leaf(*pmd)) {
1127         pmd_clear(pmd);
1128         return 1;
1129     }
1130 
1131     return 0;
1132 }
1133 
1134 int pmd_free_pte_page(pmd_t *pmd, unsigned long addr)
1135 {
1136     pte_t *pte;
1137 
1138     pte = (pte_t *)pmd_page_vaddr(*pmd);
1139     pmd_clear(pmd);
1140 
1141     flush_tlb_kernel_range(addr, addr + PMD_SIZE);
1142 
1143     pte_free_kernel(&init_mm, pte);
1144 
1145     return 1;
1146 }