// SPDX-License-Identifier: GPL-2.0
/*
 *    Copyright IBM Corp. 2006
 */

#include <linux/memory_hotplug.h>
#include <linux/memblock.h>
#include <linux/pfn.h>
#include <linux/mm.h>
#include <linux/init.h>
#include <linux/list.h>
#include <linux/hugetlb.h>
#include <linux/slab.h>
#include <asm/cacheflush.h>
#include <asm/nospec-branch.h>
#include <asm/pgalloc.h>
#include <asm/setup.h>
#include <asm/tlbflush.h>
#include <asm/sections.h>
#include <asm/set_memory.h>

static DEFINE_MUTEX(vmem_mutex);

static void __ref *vmem_alloc_pages(unsigned int order)
{
    unsigned long size = PAGE_SIZE << order;

    if (slab_is_available())
        return (void *)__get_free_pages(GFP_KERNEL, order);
    return memblock_alloc(size, size);
}

static void vmem_free_pages(unsigned long addr, int order)
{
    /* We don't expect boot memory to be removed ever. */
    if (!slab_is_available() ||
        WARN_ON_ONCE(PageReserved(virt_to_page(addr))))
        return;
    free_pages(addr, order);
}
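
/*
 * Note on allocation: before the slab and buddy allocators are up, the
 * helpers in this file fall back to memblock. Such boot-time
 * allocations end up as reserved pages and are never freed again,
 * which is why the free paths above and below simply bail out on
 * PageReserved pages.
 */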

void *vmem_crst_alloc(unsigned long val)
{
    unsigned long *table;

    table = vmem_alloc_pages(CRST_ALLOC_ORDER);
    if (table)
        crst_table_init(table, val);
    return table;
}

pte_t __ref *vmem_pte_alloc(void)
{
    unsigned long size = PTRS_PER_PTE * sizeof(pte_t);
    pte_t *pte;

    if (slab_is_available())
        pte = (pte_t *) page_table_alloc(&init_mm);
    else
        pte = (pte_t *) memblock_alloc(size, size);
    if (!pte)
        return NULL;
    memset64((u64 *)pte, _PAGE_INVALID, PTRS_PER_PTE);
    return pte;
}

static void vmem_pte_free(unsigned long *table)
{
    /* We don't expect boot memory to be removed ever. */
    if (!slab_is_available() ||
        WARN_ON_ONCE(PageReserved(virt_to_page(table))))
        return;
    page_table_free(&init_mm, table);
}

#define PAGE_UNUSED 0xFD

/*
 * The unused vmemmap range, which was not yet memset(PAGE_UNUSED),
 * ranges from unused_sub_pmd_start to the next PMD_SIZE boundary.
 */
static unsigned long unused_sub_pmd_start;
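
/*
 * The vmemmap of a memory section does not have to start or end on a
 * PMD_SIZE boundary. To still back the vmemmap with large frames,
 * partially used frames are tracked with the PAGE_UNUSED marker:
 * unused parts are filled with PAGE_UNUSED bytes, and a frame is only
 * freed once it consists entirely of PAGE_UNUSED (see
 * vmemmap_unuse_sub_pmd()). When consecutive sections are populated
 * back to back, memsetting the tail of the last frame is deferred via
 * unused_sub_pmd_start, so the next section can simply continue where
 * the previous one left off.
 */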

static void vmemmap_flush_unused_sub_pmd(void)
{
    if (!unused_sub_pmd_start)
        return;
    memset((void *)unused_sub_pmd_start, PAGE_UNUSED,
           ALIGN(unused_sub_pmd_start, PMD_SIZE) - unused_sub_pmd_start);
    unused_sub_pmd_start = 0;
}

static void vmemmap_mark_sub_pmd_used(unsigned long start, unsigned long end)
{
    /*
     * As we expect to add in the same granularity as we remove, it's
     * sufficient to mark only some piece used to block the memmap page from
     * getting removed (just in case the memmap never gets initialized,
     * e.g., because the memory block never gets onlined).
     */
    memset((void *)start, 0, sizeof(struct page));
}

static void vmemmap_use_sub_pmd(unsigned long start, unsigned long end)
{
    /*
     * We only optimize if the new used range directly follows the
     * previously unused range (esp., when populating consecutive sections).
     */
    if (unused_sub_pmd_start == start) {
        unused_sub_pmd_start = end;
        if (likely(IS_ALIGNED(unused_sub_pmd_start, PMD_SIZE)))
            unused_sub_pmd_start = 0;
        return;
    }
    vmemmap_flush_unused_sub_pmd();
    vmemmap_mark_sub_pmd_used(start, end);
}

static void vmemmap_use_new_sub_pmd(unsigned long start, unsigned long end)
{
    unsigned long page = ALIGN_DOWN(start, PMD_SIZE);

    vmemmap_flush_unused_sub_pmd();

    /* Could be our memmap page is filled with PAGE_UNUSED already ... */
    vmemmap_mark_sub_pmd_used(start, end);

    /* Mark the unused parts of the new memmap page PAGE_UNUSED. */
    if (!IS_ALIGNED(start, PMD_SIZE))
        memset((void *)page, PAGE_UNUSED, start - page);
    /*
     * We want to avoid memset(PAGE_UNUSED) when populating the vmemmap of
     * consecutive sections. Remember for the last added PMD the last
     * unused range in the populated PMD.
     */
    if (!IS_ALIGNED(end, PMD_SIZE))
        unused_sub_pmd_start = end;
}

/* Returns true if the PMD is completely unused and can be freed. */
static bool vmemmap_unuse_sub_pmd(unsigned long start, unsigned long end)
{
    unsigned long page = ALIGN_DOWN(start, PMD_SIZE);

    vmemmap_flush_unused_sub_pmd();
    memset((void *)start, PAGE_UNUSED, end - start);
    return !memchr_inv((void *)page, PAGE_UNUSED, PMD_SIZE);
}

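/*
 * The modify_*_table() helpers below all take the same pair of flags:
 * "add" selects between mapping and unmapping the range, and "direct"
 * distinguishes the 1:1 (identity) mapping from the virtual memmap.
 * For the identity mapping the range is mapped 1:1 and the
 * PG_DIRECT_MAP_* counters are updated; for the vmemmap, backing
 * pages are allocated on add and freed on remove.
 */
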
/* __ref: we'll only call vmemmap_alloc_block() via vmemmap_populate() */
static int __ref modify_pte_table(pmd_t *pmd, unsigned long addr,
                  unsigned long end, bool add, bool direct)
{
    unsigned long prot, pages = 0;
    int ret = -ENOMEM;
    pte_t *pte;

    prot = pgprot_val(PAGE_KERNEL);
    if (!MACHINE_HAS_NX)
        prot &= ~_PAGE_NOEXEC;

    pte = pte_offset_kernel(pmd, addr);
    for (; addr < end; addr += PAGE_SIZE, pte++) {
        if (!add) {
            if (pte_none(*pte))
                continue;
            if (!direct)
                vmem_free_pages((unsigned long) pfn_to_virt(pte_pfn(*pte)), 0);
            pte_clear(&init_mm, addr, pte);
        } else if (pte_none(*pte)) {
            if (!direct) {
                void *new_page = vmemmap_alloc_block(PAGE_SIZE, NUMA_NO_NODE);

                if (!new_page)
                    goto out;
                set_pte(pte, __pte(__pa(new_page) | prot));
            } else {
                set_pte(pte, __pte(__pa(addr) | prot));
            }
        } else {
            continue;
        }
        pages++;
    }
    ret = 0;
out:
    if (direct)
        update_page_count(PG_DIRECT_MAP_4K, add ? pages : -pages);
    return ret;
}

static void try_free_pte_table(pmd_t *pmd, unsigned long start)
{
    pte_t *pte;
    int i;

    /* We can safely assume this is fully in 1:1 mapping & vmemmap area */
    pte = pte_offset_kernel(pmd, start);
    for (i = 0; i < PTRS_PER_PTE; i++, pte++) {
        if (!pte_none(*pte))
            return;
    }
    vmem_pte_free((unsigned long *) pmd_deref(*pmd));
    pmd_clear(pmd);
}

/* __ref: we'll only call vmemmap_alloc_block() via vmemmap_populate() */
static int __ref modify_pmd_table(pud_t *pud, unsigned long addr,
                  unsigned long end, bool add, bool direct)
{
    unsigned long next, prot, pages = 0;
    int ret = -ENOMEM;
    pmd_t *pmd;
    pte_t *pte;

    prot = pgprot_val(SEGMENT_KERNEL);
    if (!MACHINE_HAS_NX)
        prot &= ~_SEGMENT_ENTRY_NOEXEC;

    pmd = pmd_offset(pud, addr);
    for (; addr < end; addr = next, pmd++) {
        next = pmd_addr_end(addr, end);
        if (!add) {
            if (pmd_none(*pmd))
                continue;
            if (pmd_large(*pmd)) {
                if (IS_ALIGNED(addr, PMD_SIZE) &&
                    IS_ALIGNED(next, PMD_SIZE)) {
                    if (!direct)
                        vmem_free_pages(pmd_deref(*pmd), get_order(PMD_SIZE));
                    pmd_clear(pmd);
                    pages++;
                } else if (!direct && vmemmap_unuse_sub_pmd(addr, next)) {
                    vmem_free_pages(pmd_deref(*pmd), get_order(PMD_SIZE));
                    pmd_clear(pmd);
                }
                continue;
            }
        } else if (pmd_none(*pmd)) {
            if (IS_ALIGNED(addr, PMD_SIZE) &&
                IS_ALIGNED(next, PMD_SIZE) &&
                MACHINE_HAS_EDAT1 && addr && direct &&
                !debug_pagealloc_enabled()) {
                set_pmd(pmd, __pmd(__pa(addr) | prot));
                pages++;
                continue;
            } else if (!direct && MACHINE_HAS_EDAT1) {
                void *new_page;

                /*
                 * Use 1MB frames for vmemmap if available. We always
                 * use large frames even if they are only partially
                 * used. Otherwise we would also end up with page
                 * tables, since vmemmap_populate() gets called for
                 * each section separately.
                 */
                new_page = vmemmap_alloc_block(PMD_SIZE, NUMA_NO_NODE);
                if (new_page) {
                    set_pmd(pmd, __pmd(__pa(new_page) | prot));
                    if (!IS_ALIGNED(addr, PMD_SIZE) ||
                        !IS_ALIGNED(next, PMD_SIZE)) {
                        vmemmap_use_new_sub_pmd(addr, next);
                    }
                    continue;
                }
            }
            pte = vmem_pte_alloc();
            if (!pte)
                goto out;
            pmd_populate(&init_mm, pmd, pte);
        } else if (pmd_large(*pmd)) {
            if (!direct)
                vmemmap_use_sub_pmd(addr, next);
            continue;
        }
        ret = modify_pte_table(pmd, addr, next, add, direct);
        if (ret)
            goto out;
        if (!add)
            try_free_pte_table(pmd, addr & PMD_MASK);
    }
    ret = 0;
out:
    if (direct)
        update_page_count(PG_DIRECT_MAP_1M, add ? pages : -pages);
    return ret;
}

static void try_free_pmd_table(pud_t *pud, unsigned long start)
{
    const unsigned long end = start + PUD_SIZE;
    pmd_t *pmd;
    int i;

    /* Don't mess with any tables not fully in 1:1 mapping & vmemmap area */
    if (end > VMALLOC_START)
        return;
#ifdef CONFIG_KASAN
    if (start < KASAN_SHADOW_END && KASAN_SHADOW_START > end)
        return;
#endif
    pmd = pmd_offset(pud, start);
    for (i = 0; i < PTRS_PER_PMD; i++, pmd++)
        if (!pmd_none(*pmd))
            return;
    vmem_free_pages(pud_deref(*pud), CRST_ALLOC_ORDER);
    pud_clear(pud);
}

static int modify_pud_table(p4d_t *p4d, unsigned long addr, unsigned long end,
                bool add, bool direct)
{
    unsigned long next, prot, pages = 0;
    int ret = -ENOMEM;
    pud_t *pud;
    pmd_t *pmd;

    prot = pgprot_val(REGION3_KERNEL);
    if (!MACHINE_HAS_NX)
        prot &= ~_REGION_ENTRY_NOEXEC;
    pud = pud_offset(p4d, addr);
    for (; addr < end; addr = next, pud++) {
        next = pud_addr_end(addr, end);
        if (!add) {
            if (pud_none(*pud))
                continue;
            if (pud_large(*pud)) {
                if (IS_ALIGNED(addr, PUD_SIZE) &&
                    IS_ALIGNED(next, PUD_SIZE)) {
                    pud_clear(pud);
                    pages++;
                }
                continue;
            }
        } else if (pud_none(*pud)) {
            if (IS_ALIGNED(addr, PUD_SIZE) &&
                IS_ALIGNED(next, PUD_SIZE) &&
                MACHINE_HAS_EDAT2 && addr && direct &&
                !debug_pagealloc_enabled()) {
                set_pud(pud, __pud(__pa(addr) | prot));
                pages++;
                continue;
            }
            pmd = vmem_crst_alloc(_SEGMENT_ENTRY_EMPTY);
            if (!pmd)
                goto out;
            pud_populate(&init_mm, pud, pmd);
        } else if (pud_large(*pud)) {
            continue;
        }
        ret = modify_pmd_table(pud, addr, next, add, direct);
        if (ret)
            goto out;
        if (!add)
            try_free_pmd_table(pud, addr & PUD_MASK);
    }
    ret = 0;
out:
    if (direct)
        update_page_count(PG_DIRECT_MAP_2G, add ? pages : -pages);
    return ret;
}

static void try_free_pud_table(p4d_t *p4d, unsigned long start)
{
    const unsigned long end = start + P4D_SIZE;
    pud_t *pud;
    int i;

    /* Don't mess with any tables not fully in 1:1 mapping & vmemmap area */
    if (end > VMALLOC_START)
        return;
#ifdef CONFIG_KASAN
    if (start < KASAN_SHADOW_END && KASAN_SHADOW_START > end)
        return;
#endif

    pud = pud_offset(p4d, start);
    for (i = 0; i < PTRS_PER_PUD; i++, pud++) {
        if (!pud_none(*pud))
            return;
    }
    vmem_free_pages(p4d_deref(*p4d), CRST_ALLOC_ORDER);
    p4d_clear(p4d);
}

static int modify_p4d_table(pgd_t *pgd, unsigned long addr, unsigned long end,
                bool add, bool direct)
{
    unsigned long next;
    int ret = -ENOMEM;
    p4d_t *p4d;
    pud_t *pud;

    p4d = p4d_offset(pgd, addr);
    for (; addr < end; addr = next, p4d++) {
        next = p4d_addr_end(addr, end);
        if (!add) {
            if (p4d_none(*p4d))
                continue;
        } else if (p4d_none(*p4d)) {
            pud = vmem_crst_alloc(_REGION3_ENTRY_EMPTY);
            if (!pud)
                goto out;
            p4d_populate(&init_mm, p4d, pud);
        }
        ret = modify_pud_table(p4d, addr, next, add, direct);
        if (ret)
            goto out;
        if (!add)
            try_free_pud_table(p4d, addr & P4D_MASK);
    }
    ret = 0;
out:
    return ret;
}

static void try_free_p4d_table(pgd_t *pgd, unsigned long start)
{
    const unsigned long end = start + PGDIR_SIZE;
    p4d_t *p4d;
    int i;

    /* Don't mess with any tables not fully in 1:1 mapping & vmemmap area */
    if (end > VMALLOC_START)
        return;
#ifdef CONFIG_KASAN
    if (start < KASAN_SHADOW_END && KASAN_SHADOW_START > end)
        return;
#endif

    p4d = p4d_offset(pgd, start);
    for (i = 0; i < PTRS_PER_P4D; i++, p4d++) {
        if (!p4d_none(*p4d))
            return;
    }
    vmem_free_pages(pgd_deref(*pgd), CRST_ALLOC_ORDER);
    pgd_clear(pgd);
}

static int modify_pagetable(unsigned long start, unsigned long end, bool add,
                bool direct)
{
    unsigned long addr, next;
    int ret = -ENOMEM;
    pgd_t *pgd;
    p4d_t *p4d;

    if (WARN_ON_ONCE(!PAGE_ALIGNED(start | end)))
        return -EINVAL;
    for (addr = start; addr < end; addr = next) {
        next = pgd_addr_end(addr, end);
        pgd = pgd_offset_k(addr);

        if (!add) {
            if (pgd_none(*pgd))
                continue;
        } else if (pgd_none(*pgd)) {
            p4d = vmem_crst_alloc(_REGION2_ENTRY_EMPTY);
            if (!p4d)
                goto out;
            pgd_populate(&init_mm, pgd, p4d);
        }
        ret = modify_p4d_table(pgd, addr, next, add, direct);
        if (ret)
            goto out;
        if (!add)
            try_free_p4d_table(pgd, addr & PGDIR_MASK);
    }
    ret = 0;
out:
    if (!add)
        flush_tlb_kernel_range(start, end);
    return ret;
}

static int add_pagetable(unsigned long start, unsigned long end, bool direct)
{
    return modify_pagetable(start, end, true, direct);
}

static int remove_pagetable(unsigned long start, unsigned long end, bool direct)
{
    return modify_pagetable(start, end, false, direct);
}
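
/*
 * The entry points below are thin wrappers around modify_pagetable():
 * vmem_add_range()/vmem_remove_range() maintain the 1:1 mapping of
 * physical memory (direct = true), while vmemmap_populate() and
 * vmemmap_free() maintain the virtual memmap used for memory hotplug
 * (direct = false). Callers serialize on vmem_mutex; the boot-time
 * caller vmem_map_init() runs long before memory hotplug is possible
 * and therefore takes no lock.
 */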

/*
 * Add a physical memory range to the 1:1 mapping.
 */
static int vmem_add_range(unsigned long start, unsigned long size)
{
    return add_pagetable(start, start + size, true);
}

/*
 * Remove a physical memory range from the 1:1 mapping.
 */
static void vmem_remove_range(unsigned long start, unsigned long size)
{
    remove_pagetable(start, start + size, true);
}

/*
 * Add a backed mem_map array to the virtual mem_map array.
 */
int __meminit vmemmap_populate(unsigned long start, unsigned long end, int node,
                   struct vmem_altmap *altmap)
{
    int ret;

    mutex_lock(&vmem_mutex);
    /* We don't care about the node, just use NUMA_NO_NODE on allocations */
    ret = add_pagetable(start, end, false);
    if (ret)
        remove_pagetable(start, end, false);
    mutex_unlock(&vmem_mutex);
    return ret;
}

void vmemmap_free(unsigned long start, unsigned long end,
          struct vmem_altmap *altmap)
{
    mutex_lock(&vmem_mutex);
    remove_pagetable(start, end, false);
    mutex_unlock(&vmem_mutex);
}

void vmem_remove_mapping(unsigned long start, unsigned long size)
{
    mutex_lock(&vmem_mutex);
    vmem_remove_range(start, size);
    mutex_unlock(&vmem_mutex);
}

struct range arch_get_mappable_range(void)
{
    struct range mhp_range;

    mhp_range.start = 0;
    mhp_range.end = VMEM_MAX_PHYS - 1;
    return mhp_range;
}

int vmem_add_mapping(unsigned long start, unsigned long size)
{
    struct range range = arch_get_mappable_range();
    int ret;

    if (start < range.start ||
        start + size > range.end + 1 ||
        start + size < start)
        return -ERANGE;

    mutex_lock(&vmem_mutex);
    ret = vmem_add_range(start, size);
    if (ret)
        vmem_remove_range(start, size);
    mutex_unlock(&vmem_mutex);
    return ret;
}
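
/*
 * Example (sketch, not taken from a particular caller): code that
 * needs a physical region in the 1:1 mapping would pair the helpers
 * roughly like this:
 *
 *     rc = vmem_add_mapping(start, size);
 *     if (rc)
 *             return rc;
 *     ...
 *     vmem_remove_mapping(start, size);
 *
 * where "start" and "size" describe the physical range and must stay
 * within arch_get_mappable_range(), otherwise -ERANGE is returned.
 */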

/*
 * Map the whole of physical memory into virtual memory (identity mapping).
 * We reserve enough space in the vmalloc area for vmemmap to hotplug
 * additional memory segments.
 */
void __init vmem_map_init(void)
{
    phys_addr_t base, end;
    u64 i;

    for_each_mem_range(i, &base, &end)
        vmem_add_range(base, end - base);
    __set_memory((unsigned long)_stext,
             (unsigned long)(_etext - _stext) >> PAGE_SHIFT,
             SET_MEMORY_RO | SET_MEMORY_X);
    __set_memory((unsigned long)_etext,
             (unsigned long)(__end_rodata - _etext) >> PAGE_SHIFT,
             SET_MEMORY_RO);
    __set_memory((unsigned long)_sinittext,
             (unsigned long)(_einittext - _sinittext) >> PAGE_SHIFT,
             SET_MEMORY_RO | SET_MEMORY_X);
    __set_memory(__stext_amode31, (__etext_amode31 - __stext_amode31) >> PAGE_SHIFT,
             SET_MEMORY_RO | SET_MEMORY_X);

    /* lowcore must be executable for LPSWE */
    if (!static_key_enabled(&cpu_has_bear))
        set_memory_x(0, 1);

    pr_info("Write protected kernel read-only data: %luk\n",
        (unsigned long)(__end_rodata - _stext) >> 10);
}