// SPDX-License-Identifier: GPL-2.0
/*
 * HugeTLB Vmemmap Optimization (HVO)
 *
 * Copyright (c) 2020, ByteDance. All rights reserved.
 *
 *     Author: Muchun Song <songmuchun@bytedance.com>
 *
 * See Documentation/mm/vmemmap_dedup.rst
 */
#define pr_fmt(fmt) "HugeTLB: " fmt

#include <linux/pgtable.h>
#include <linux/bootmem_info.h>
#include <asm/pgalloc.h>
#include <asm/tlbflush.h>
#include "hugetlb_vmemmap.h"

/**
 * struct vmemmap_remap_walk - walk vmemmap page table
 *
 * @remap_pte:      called for each lowest-level entry (PTE).
 * @nr_walked:      the number of PTEs walked.
 * @reuse_page:     the page which is reused for the tail vmemmap pages.
 * @reuse_addr:     the virtual address of the @reuse_page page.
 * @vmemmap_pages:  the list head of the vmemmap pages that can be freed
 *          (when remapping) or that the range is remapped to (when restoring).
 */
struct vmemmap_remap_walk {
    void            (*remap_pte)(pte_t *pte, unsigned long addr,
                         struct vmemmap_remap_walk *walk);
    unsigned long       nr_walked;
    struct page     *reuse_page;
    unsigned long       reuse_addr;
    struct list_head    *vmemmap_pages;
};

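/*
 * Split the vmemmap huge PMD mapping at @start into a PTE-level table that
 * maps the same underlying pages one by one.  The new PTE page is populated
 * outside the page_table_lock; if the PMD is still a leaf once the lock is
 * taken, the new table is installed and the old huge mapping is flushed,
 * otherwise another splitter won the race and the PTE page is freed again.
 */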
static int __split_vmemmap_huge_pmd(pmd_t *pmd, unsigned long start)
{
    pmd_t __pmd;
    int i;
    unsigned long addr = start;
    struct page *page = pmd_page(*pmd);
    pte_t *pgtable = pte_alloc_one_kernel(&init_mm);

    if (!pgtable)
        return -ENOMEM;

    pmd_populate_kernel(&init_mm, &__pmd, pgtable);

    for (i = 0; i < PTRS_PER_PTE; i++, addr += PAGE_SIZE) {
        pte_t entry, *pte;
        pgprot_t pgprot = PAGE_KERNEL;

        entry = mk_pte(page + i, pgprot);
        pte = pte_offset_kernel(&__pmd, addr);
        set_pte_at(&init_mm, addr, pte, entry);
    }

    spin_lock(&init_mm.page_table_lock);
    if (likely(pmd_leaf(*pmd))) {
        /*
         * Higher order allocations from the buddy allocator must be able
         * to be treated as independent small pages (as they can be freed
         * individually).
         */
        if (!PageReserved(page))
            split_page(page, get_order(PMD_SIZE));

        /* Make pte visible before pmd. See comment in pmd_install(). */
        smp_wmb();
        pmd_populate_kernel(&init_mm, pmd, pgtable);
        flush_tlb_kernel_range(start, start + PMD_SIZE);
    } else {
        pte_free_kernel(&init_mm, pgtable);
    }
    spin_unlock(&init_mm.page_table_lock);

    return 0;
}

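/*
 * Split the huge PMD covering @start if it is still mapped as a leaf;
 * if it has already been split to PTE level there is nothing to do.
 */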
static int split_vmemmap_huge_pmd(pmd_t *pmd, unsigned long start)
{
    int leaf;

    spin_lock(&init_mm.page_table_lock);
    leaf = pmd_leaf(*pmd);
    spin_unlock(&init_mm.page_table_lock);

    if (!leaf)
        return 0;

    return __split_vmemmap_huge_pmd(pmd, start);
}

static void vmemmap_pte_range(pmd_t *pmd, unsigned long addr,
                  unsigned long end,
                  struct vmemmap_remap_walk *walk)
{
    pte_t *pte = pte_offset_kernel(pmd, addr);

    /*
     * The reuse_page is found 'first' in the table walk, before we start
     * remapping (i.e. before calling @walk->remap_pte).
     */
    if (!walk->reuse_page) {
        walk->reuse_page = pte_page(*pte);
        /*
         * Because the reuse address is part of the range that we are
         * walking, skip the reuse address range.
         */
        addr += PAGE_SIZE;
        pte++;
        walk->nr_walked++;
    }

    for (; addr != end; addr += PAGE_SIZE, pte++) {
        walk->remap_pte(pte, addr, walk);
        walk->nr_walked++;
    }
}

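/*
 * Walk the PMD entries of @pud covering [@addr, @end).  Any huge PMD is
 * split down to PTE level first so that individual vmemmap pages can be
 * remapped by vmemmap_pte_range().
 */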
static int vmemmap_pmd_range(pud_t *pud, unsigned long addr,
                 unsigned long end,
                 struct vmemmap_remap_walk *walk)
{
    pmd_t *pmd;
    unsigned long next;

    pmd = pmd_offset(pud, addr);
    do {
        int ret;

        ret = split_vmemmap_huge_pmd(pmd, addr & PMD_MASK);
        if (ret)
            return ret;

        next = pmd_addr_end(addr, end);
        vmemmap_pte_range(pmd, addr, next, walk);
    } while (pmd++, addr = next, addr != end);

    return 0;
}

static int vmemmap_pud_range(p4d_t *p4d, unsigned long addr,
                 unsigned long end,
                 struct vmemmap_remap_walk *walk)
{
    pud_t *pud;
    unsigned long next;

    pud = pud_offset(p4d, addr);
    do {
        int ret;

        next = pud_addr_end(addr, end);
        ret = vmemmap_pmd_range(pud, addr, next, walk);
        if (ret)
            return ret;
    } while (pud++, addr = next, addr != end);

    return 0;
}

static int vmemmap_p4d_range(pgd_t *pgd, unsigned long addr,
                 unsigned long end,
                 struct vmemmap_remap_walk *walk)
{
    p4d_t *p4d;
    unsigned long next;

    p4d = p4d_offset(pgd, addr);
    do {
        int ret;

        next = p4d_addr_end(addr, end);
        ret = vmemmap_pud_range(p4d, addr, next, walk);
        if (ret)
            return ret;
    } while (p4d++, addr = next, addr != end);

    return 0;
}

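/*
 * Walk the kernel page tables over the vmemmap range [@start, @end) and
 * invoke @walk->remap_pte on every PTE.  Callers pass the reuse address as
 * @start so that it is visited (and recorded) first; only
 * [@start + PAGE_SIZE, @end) is actually remapped, which is why the TLB
 * flush below skips the first page.
 */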
static int vmemmap_remap_range(unsigned long start, unsigned long end,
                   struct vmemmap_remap_walk *walk)
{
    unsigned long addr = start;
    unsigned long next;
    pgd_t *pgd;

    VM_BUG_ON(!PAGE_ALIGNED(start));
    VM_BUG_ON(!PAGE_ALIGNED(end));

    pgd = pgd_offset_k(addr);
    do {
        int ret;

        next = pgd_addr_end(addr, end);
        ret = vmemmap_p4d_range(pgd, addr, next, walk);
        if (ret)
            return ret;
    } while (pgd++, addr = next, addr != end);

    /*
     * We only change the mapping of the vmemmap virtual address range
     * [@start + PAGE_SIZE, end), so we only need to flush the TLB which
     * belongs to the range.
     */
    flush_tlb_kernel_range(start + PAGE_SIZE, end);

    return 0;
}

/*
 * Free a vmemmap page. A vmemmap page can be allocated from the memblock
 * allocator or the buddy allocator. If the PG_reserved flag is set, it means
 * that it was allocated from the memblock allocator, so free it via
 * free_bootmem_page(). Otherwise, use __free_page().
 */
static inline void free_vmemmap_page(struct page *page)
{
    if (PageReserved(page))
        free_bootmem_page(page);
    else
        __free_page(page);
}

/* Free a list of the vmemmap pages */
static void free_vmemmap_page_list(struct list_head *list)
{
    struct page *page, *next;

    list_for_each_entry_safe(page, next, list, lru) {
        list_del(&page->lru);
        free_vmemmap_page(page);
    }
}

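/*
 * remap_pte callback for the freeing path: point the PTE at the shared
 * @walk->reuse_page (read-only) and queue the page it used to map on
 * @walk->vmemmap_pages so that it can be freed afterwards.
 */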
static void vmemmap_remap_pte(pte_t *pte, unsigned long addr,
                  struct vmemmap_remap_walk *walk)
{
    /*
     * Remap the tail pages as read-only to catch illegal write operations
     * to the tail pages.
     */
    pgprot_t pgprot = PAGE_KERNEL_RO;
    pte_t entry = mk_pte(walk->reuse_page, pgprot);
    struct page *page = pte_page(*pte);

    list_add_tail(&page->lru, walk->vmemmap_pages);
    set_pte_at(&init_mm, addr, pte, entry);
}

/*
 * How many struct page structs need to be reset. When we reuse the head
 * struct page, the special metadata (e.g. page->flags or page->mapping)
 * cannot be copied to the tail struct page structs. The invalid values
 * will be caught by free_tail_pages_check(). To avoid the "corrupted
 * mapping in tail page" message, we need to reset at least 3 struct page
 * structs (one head struct page and two tail struct pages).
 */
#define NR_RESET_STRUCT_PAGE        3

static inline void reset_struct_pages(struct page *start)
{
    int i;
    struct page *from = start + NR_RESET_STRUCT_PAGE;

    for (i = 0; i < NR_RESET_STRUCT_PAGE; i++)
        memcpy(start + i, from, sizeof(*from));
}

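/*
 * remap_pte callback for the restore path: take a freshly allocated page
 * from @walk->vmemmap_pages, copy the contents of the shared reuse page
 * into it, reset the first NR_RESET_STRUCT_PAGE struct pages so they do
 * not carry over the head page's metadata, and remap the PTE to the new
 * page read-write.
 */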
static void vmemmap_restore_pte(pte_t *pte, unsigned long addr,
                struct vmemmap_remap_walk *walk)
{
    pgprot_t pgprot = PAGE_KERNEL;
    struct page *page;
    void *to;

    BUG_ON(pte_page(*pte) != walk->reuse_page);

    page = list_first_entry(walk->vmemmap_pages, struct page, lru);
    list_del(&page->lru);
    to = page_to_virt(page);
    copy_page(to, (void *)walk->reuse_addr);
    reset_struct_pages(to);

    set_pte_at(&init_mm, addr, pte, mk_pte(page, pgprot));
}

/**
 * vmemmap_remap_free - remap the vmemmap virtual address range [@start, @end)
 *          to the page which @reuse is mapped to, then free the vmemmap
 *          pages which the range was mapped to.
 * @start:  start address of the vmemmap virtual address range that we want
 *      to remap.
 * @end:    end address of the vmemmap virtual address range that we want to
 *      remap.
 * @reuse:  reuse address.
 *
 * Return: %0 on success, negative error code otherwise.
 */
static int vmemmap_remap_free(unsigned long start, unsigned long end,
                  unsigned long reuse)
{
    int ret;
    LIST_HEAD(vmemmap_pages);
    struct vmemmap_remap_walk walk = {
        .remap_pte  = vmemmap_remap_pte,
        .reuse_addr = reuse,
        .vmemmap_pages  = &vmemmap_pages,
    };

    /*
     * To make the remapping routine most efficient for huge pages, the
     * routine of vmemmap page table walking has the following rules
     * (see more details in vmemmap_pte_range()):
     *
     * - The range [@start, @end) and the range [@reuse, @reuse + PAGE_SIZE)
     *   should be contiguous.
     * - The @reuse address is part of the range [@reuse, @end) that we are
     *   walking which is passed to vmemmap_remap_range().
     * - The @reuse address is the first in the complete range.
     *
     * So we need to make sure that @start and @reuse meet the above rules.
     */
    BUG_ON(start - reuse != PAGE_SIZE);

    mmap_read_lock(&init_mm);
    ret = vmemmap_remap_range(reuse, end, &walk);
    if (ret && walk.nr_walked) {
        end = reuse + walk.nr_walked * PAGE_SIZE;
        /*
         * vmemmap_pages contains pages from the previous
         * vmemmap_remap_range call which failed.  These
         * are pages which were removed from the vmemmap.
         * They will be restored in the following call.
         */
        walk = (struct vmemmap_remap_walk) {
            .remap_pte  = vmemmap_restore_pte,
            .reuse_addr = reuse,
            .vmemmap_pages  = &vmemmap_pages,
        };

        vmemmap_remap_range(reuse, end, &walk);
    }
    mmap_read_unlock(&init_mm);

    free_vmemmap_page_list(&vmemmap_pages);

    return ret;
}

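/*
 * Allocate one page per vmemmap page in [@start, @end) on the same NUMA node
 * as the pages described by the range, and queue them on @list.  On
 * allocation failure, every page already allocated is freed and -ENOMEM is
 * returned.
 */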
static int alloc_vmemmap_page_list(unsigned long start, unsigned long end,
                   gfp_t gfp_mask, struct list_head *list)
{
    unsigned long nr_pages = (end - start) >> PAGE_SHIFT;
    int nid = page_to_nid((struct page *)start);
    struct page *page, *next;

    while (nr_pages--) {
        page = alloc_pages_node(nid, gfp_mask, 0);
        if (!page)
            goto out;
        list_add_tail(&page->lru, list);
    }

    return 0;
out:
    list_for_each_entry_safe(page, next, list, lru)
        __free_pages(page, 0);
    return -ENOMEM;
}

/**
 * vmemmap_remap_alloc - remap the vmemmap virtual address range [@start, @end)
 *           to the pages which are from the freshly allocated
 *           vmemmap page list, respectively.
 * @start:  start address of the vmemmap virtual address range that we want
 *      to remap.
 * @end:    end address of the vmemmap virtual address range that we want to
 *      remap.
 * @reuse:  reuse address.
 * @gfp_mask:   GFP flag for allocating vmemmap pages.
 *
 * Return: %0 on success, negative error code otherwise.
 */
static int vmemmap_remap_alloc(unsigned long start, unsigned long end,
                   unsigned long reuse, gfp_t gfp_mask)
{
    LIST_HEAD(vmemmap_pages);
    struct vmemmap_remap_walk walk = {
        .remap_pte  = vmemmap_restore_pte,
        .reuse_addr = reuse,
        .vmemmap_pages  = &vmemmap_pages,
    };

    /* See the comment in vmemmap_remap_free(). */
    BUG_ON(start - reuse != PAGE_SIZE);

    if (alloc_vmemmap_page_list(start, end, gfp_mask, &vmemmap_pages))
        return -ENOMEM;

    mmap_read_lock(&init_mm);
    vmemmap_remap_range(reuse, end, &walk);
    mmap_read_unlock(&init_mm);

    return 0;
}

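/*
 * hugetlb_optimize_vmemmap_key tracks whether any HugeTLB pages currently
 * have their vmemmap optimized: it is incremented for each page whose
 * vmemmap is freed and decremented when the vmemmap is restored (or when
 * the optimization fails).  vmemmap_optimize_enabled is the runtime switch,
 * settable at boot via the hugetlb_free_vmemmap= parameter and later via
 * the vm.hugetlb_optimize_vmemmap sysctl registered below.
 */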
DEFINE_STATIC_KEY_FALSE(hugetlb_optimize_vmemmap_key);
EXPORT_SYMBOL(hugetlb_optimize_vmemmap_key);

static bool vmemmap_optimize_enabled = IS_ENABLED(CONFIG_HUGETLB_PAGE_OPTIMIZE_VMEMMAP_DEFAULT_ON);
core_param(hugetlb_free_vmemmap, vmemmap_optimize_enabled, bool, 0);

/**
 * hugetlb_vmemmap_restore - restore previously optimized (by
 *               hugetlb_vmemmap_optimize()) vmemmap pages which
 *               will be reallocated and remapped.
 * @h:      struct hstate.
 * @head:   the head page whose vmemmap pages will be restored.
 *
 * Return: %0 if @head's vmemmap pages have been reallocated and remapped,
 * negative error code otherwise.
 */
int hugetlb_vmemmap_restore(const struct hstate *h, struct page *head)
{
    int ret;
    unsigned long vmemmap_start = (unsigned long)head, vmemmap_end;
    unsigned long vmemmap_reuse;

    if (!HPageVmemmapOptimized(head))
        return 0;

    vmemmap_end = vmemmap_start + hugetlb_vmemmap_size(h);
    vmemmap_reuse   = vmemmap_start;
    vmemmap_start   += HUGETLB_VMEMMAP_RESERVE_SIZE;

    /*
     * The pages which the vmemmap virtual address range [@vmemmap_start,
     * @vmemmap_end) are mapped to are freed to the buddy allocator, and
     * the range is mapped to the page which @vmemmap_reuse is mapped to.
     * When a HugeTLB page is freed to the buddy allocator, previously
     * discarded vmemmap pages must be allocated and remapped.
     */
    ret = vmemmap_remap_alloc(vmemmap_start, vmemmap_end, vmemmap_reuse,
                  GFP_KERNEL | __GFP_NORETRY | __GFP_THISNODE);
    if (!ret) {
        ClearHPageVmemmapOptimized(head);
        static_branch_dec(&hugetlb_optimize_vmemmap_key);
    }

    return ret;
}

/* Return true iff a HugeTLB page's vmemmap should and can be optimized. */
static bool vmemmap_should_optimize(const struct hstate *h, const struct page *head)
{
    if (!READ_ONCE(vmemmap_optimize_enabled))
        return false;

    if (!hugetlb_vmemmap_optimizable(h))
        return false;

    if (IS_ENABLED(CONFIG_MEMORY_HOTPLUG)) {
        pmd_t *pmdp, pmd;
        struct page *vmemmap_page;
        unsigned long vaddr = (unsigned long)head;

        /*
         * Only the vmemmap page's vmemmap page can be self-hosted.
         * Walk the page tables to find the backing page of the
         * vmemmap page.
         */
        pmdp = pmd_off_k(vaddr);
        /*
         * The READ_ONCE() is used to stabilize *pmdp in a register or
         * on the stack so that it will stop changing under the code.
         * The only concurrent operation where it can be changed is
         * split_vmemmap_huge_pmd() (*pmdp will be stable after this
         * operation).
         */
        pmd = READ_ONCE(*pmdp);
        if (pmd_leaf(pmd))
            vmemmap_page = pmd_page(pmd) + pte_index(vaddr);
        else
            vmemmap_page = pte_page(*pte_offset_kernel(pmdp, vaddr));
        /*
         * Due to HugeTLB alignment requirements and the vmemmap pages
         * being at the start of the hotplugged memory region in the
         * memory_hotplug.memmap_on_memory case, checking whether any
         * vmemmap page's vmemmap page is marked as VmemmapSelfHosted
         * is sufficient.
         *
         * [                  hotplugged memory                  ]
         * [        section        ][...][        section        ]
         * [ vmemmap ][              usable memory               ]
         *   ^   |     |                                        |
         *   +---+     |                                        |
         *     ^       |                                        |
         *     +-------+                                        |
         *          ^                                           |
         *          +-------------------------------------------+
         */
        if (PageVmemmapSelfHosted(vmemmap_page))
            return false;
    }

    return true;
}

/**
 * hugetlb_vmemmap_optimize - optimize @head page's vmemmap pages.
 * @h:      struct hstate.
 * @head:   the head page whose vmemmap pages will be optimized.
 *
 * This function only tries to optimize @head's vmemmap pages and does not
 * guarantee that the optimization will succeed after it returns. The caller
 * can use HPageVmemmapOptimized(@head) to detect if @head's vmemmap pages
 * have been optimized.
 */
void hugetlb_vmemmap_optimize(const struct hstate *h, struct page *head)
{
    unsigned long vmemmap_start = (unsigned long)head, vmemmap_end;
    unsigned long vmemmap_reuse;

    if (!vmemmap_should_optimize(h, head))
        return;

    static_branch_inc(&hugetlb_optimize_vmemmap_key);

    vmemmap_end = vmemmap_start + hugetlb_vmemmap_size(h);
    vmemmap_reuse   = vmemmap_start;
    vmemmap_start   += HUGETLB_VMEMMAP_RESERVE_SIZE;

    /*
     * Remap the vmemmap virtual address range [@vmemmap_start, @vmemmap_end)
     * to the page which @vmemmap_reuse is mapped to, then free the pages
     * which the range [@vmemmap_start, @vmemmap_end) is mapped to.
     */
    if (vmemmap_remap_free(vmemmap_start, vmemmap_end, vmemmap_reuse))
        static_branch_dec(&hugetlb_optimize_vmemmap_key);
    else
        SetHPageVmemmapOptimized(head);
}

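/*
 * Expose vm.hugetlb_optimize_vmemmap so that the optimization can be toggled
 * at runtime; the knob is only registered when at least one hstate is
 * actually optimizable (see hugetlb_vmemmap_init() below).
 */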
static struct ctl_table hugetlb_vmemmap_sysctls[] = {
    {
        .procname   = "hugetlb_optimize_vmemmap",
        .data       = &vmemmap_optimize_enabled,
        .maxlen     = sizeof(int),
        .mode       = 0644,
        .proc_handler   = proc_dobool,
    },
    { }
};

static int __init hugetlb_vmemmap_init(void)
{
    /* HUGETLB_VMEMMAP_RESERVE_SIZE should cover all used struct pages */
    BUILD_BUG_ON(__NR_USED_SUBPAGE * sizeof(struct page) > HUGETLB_VMEMMAP_RESERVE_SIZE);

    if (IS_ENABLED(CONFIG_PROC_SYSCTL)) {
        const struct hstate *h;

        for_each_hstate(h) {
            if (hugetlb_vmemmap_optimizable(h)) {
                register_sysctl_init("vm", hugetlb_vmemmap_sysctls);
                break;
            }
        }
    }
    return 0;
}
late_initcall(hugetlb_vmemmap_init);