// SPDX-License-Identifier: GPL-2.0
/*
 * HugeTLB Vmemmap Optimization (HVO)
 *
 * See Documentation/mm/vmemmap_dedup.rst for how the vmemmap pages that
 * back a HugeTLB page are deduplicated.
 */
#define pr_fmt(fmt) "HugeTLB: " fmt

#include <linux/pgtable.h>
#include <linux/moduleparam.h>
#include <linux/bootmem_info.h>
#include <asm/pgalloc.h>
#include <asm/tlbflush.h>
#include "hugetlb_vmemmap.h"
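
/**
 * struct vmemmap_remap_walk - walk vmemmap page table
 *
 * @remap_pte:          called for each lowest-level entry (PTE).
 * @nr_walked:          the number of walked pte.
 * @reuse_page:         the page which is reused for the tail vmemmap pages.
 * @reuse_addr:         the virtual address of the @reuse_page page.
 * @vmemmap_pages:      the list head of the vmemmap pages that can be freed
 *                      or is mapped from.
 */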
struct vmemmap_remap_walk {
        void                    (*remap_pte)(pte_t *pte, unsigned long addr,
                                             struct vmemmap_remap_walk *walk);
        unsigned long           nr_walked;
        struct page             *reuse_page;
        unsigned long           reuse_addr;
        struct list_head        *vmemmap_pages;
};

static int __split_vmemmap_huge_pmd(pmd_t *pmd, unsigned long start)
{
        pmd_t __pmd;
        int i;
        unsigned long addr = start;
        struct page *page = pmd_page(*pmd);
        pte_t *pgtable = pte_alloc_one_kernel(&init_mm);

        if (!pgtable)
                return -ENOMEM;

        pmd_populate_kernel(&init_mm, &__pmd, pgtable);

        for (i = 0; i < PTRS_PER_PTE; i++, addr += PAGE_SIZE) {
                pte_t entry, *pte;
                pgprot_t pgprot = PAGE_KERNEL;

                entry = mk_pte(page + i, pgprot);
                pte = pte_offset_kernel(&__pmd, addr);
                set_pte_at(&init_mm, addr, pte, entry);
        }

        spin_lock(&init_mm.page_table_lock);
        if (likely(pmd_leaf(*pmd))) {
                /*
                 * Higher order allocations from the buddy allocator must be
                 * split so that they can be treated as independent small
                 * pages (and freed individually later).
                 */
                if (!PageReserved(page))
                        split_page(page, get_order(PMD_SIZE));

                /* Make pte visible before pmd. See comment in pmd_install(). */
                smp_wmb();
                pmd_populate_kernel(&init_mm, pmd, pgtable);
                flush_tlb_kernel_range(start, start + PMD_SIZE);
        } else {
                pte_free_kernel(&init_mm, pgtable);
        }
        spin_unlock(&init_mm.page_table_lock);

        return 0;
}

static int split_vmemmap_huge_pmd(pmd_t *pmd, unsigned long start)
{
        int leaf;

        spin_lock(&init_mm.page_table_lock);
        leaf = pmd_leaf(*pmd);
        spin_unlock(&init_mm.page_table_lock);

        if (!leaf)
                return 0;

        return __split_vmemmap_huge_pmd(pmd, start);
}

static void vmemmap_pte_range(pmd_t *pmd, unsigned long addr,
                              unsigned long end,
                              struct vmemmap_remap_walk *walk)
{
        pte_t *pte = pte_offset_kernel(pmd, addr);

        /*
         * The reuse_page is found 'first' in the page table walk before we
         * start remapping (which is calling @walk->remap_pte).
         */
        if (!walk->reuse_page) {
                walk->reuse_page = pte_page(*pte);
                /*
                 * Because the reuse address is part of the range that we are
                 * walking, skip the reuse address range.
                 */
                addr += PAGE_SIZE;
                pte++;
                walk->nr_walked++;
        }

        for (; addr != end; addr += PAGE_SIZE, pte++) {
                walk->remap_pte(pte, addr, walk);
                walk->nr_walked++;
        }
}

static int vmemmap_pmd_range(pud_t *pud, unsigned long addr,
                             unsigned long end,
                             struct vmemmap_remap_walk *walk)
{
        pmd_t *pmd;
        unsigned long next;

        pmd = pmd_offset(pud, addr);
        do {
                int ret;

                ret = split_vmemmap_huge_pmd(pmd, addr & PMD_MASK);
                if (ret)
                        return ret;

                next = pmd_addr_end(addr, end);
                vmemmap_pte_range(pmd, addr, next, walk);
        } while (pmd++, addr = next, addr != end);

        return 0;
}

static int vmemmap_pud_range(p4d_t *p4d, unsigned long addr,
                             unsigned long end,
                             struct vmemmap_remap_walk *walk)
{
        pud_t *pud;
        unsigned long next;

        pud = pud_offset(p4d, addr);
        do {
                int ret;

                next = pud_addr_end(addr, end);
                ret = vmemmap_pmd_range(pud, addr, next, walk);
                if (ret)
                        return ret;
        } while (pud++, addr = next, addr != end);

        return 0;
}

static int vmemmap_p4d_range(pgd_t *pgd, unsigned long addr,
                             unsigned long end,
                             struct vmemmap_remap_walk *walk)
{
        p4d_t *p4d;
        unsigned long next;

        p4d = p4d_offset(pgd, addr);
        do {
                int ret;

                next = p4d_addr_end(addr, end);
                ret = vmemmap_pud_range(p4d, addr, next, walk);
                if (ret)
                        return ret;
        } while (p4d++, addr = next, addr != end);

        return 0;
}

static int vmemmap_remap_range(unsigned long start, unsigned long end,
                               struct vmemmap_remap_walk *walk)
{
        unsigned long addr = start;
        unsigned long next;
        pgd_t *pgd;

        VM_BUG_ON(!PAGE_ALIGNED(start));
        VM_BUG_ON(!PAGE_ALIGNED(end));

        pgd = pgd_offset_k(addr);
        do {
                int ret;

                next = pgd_addr_end(addr, end);
                ret = vmemmap_p4d_range(pgd, addr, next, walk);
                if (ret)
                        return ret;
        } while (pgd++, addr = next, addr != end);

        /*
         * We only change the mapping of the vmemmap virtual address range
         * [@start + PAGE_SIZE, end), so we only need to flush the TLB which
         * belongs to that range.
         */
        flush_tlb_kernel_range(start + PAGE_SIZE, end);

        return 0;
}
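
/*
 * Free a vmemmap page. A vmemmap page can be allocated from the memblock
 * allocator or the buddy allocator. If the PG_reserved flag is set, the page
 * was allocated by the memblock allocator and must be returned via
 * free_bootmem_page(); otherwise it goes back to the buddy allocator.
 */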
static inline void free_vmemmap_page(struct page *page)
{
        if (PageReserved(page))
                free_bootmem_page(page);
        else
                __free_page(page);
}

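/* Free a list of vmemmap struct pages */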
static void free_vmemmap_page_list(struct list_head *list)
{
        struct page *page, *next;

        list_for_each_entry_safe(page, next, list, lru) {
                list_del(&page->lru);
                free_vmemmap_page(page);
        }
}

static void vmemmap_remap_pte(pte_t *pte, unsigned long addr,
                              struct vmemmap_remap_walk *walk)
{
        /*
         * Remap the tail pages as read-only to catch illegal write operations
         * to the tail pages.
         */
        pgprot_t pgprot = PAGE_KERNEL_RO;
        pte_t entry = mk_pte(walk->reuse_page, pgprot);
        struct page *page = pte_page(*pte);

        list_add_tail(&page->lru, walk->vmemmap_pages);
        set_pte_at(&init_mm, addr, pte, entry);
}
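
/*
 * How many struct page structs need to be reset. When we reuse the head
 * struct page, its special metadata (e.g. page->flags or page->mapping)
 * cannot be copied to the tail struct page structs. Resetting at least 3
 * struct pages (one head and two tails) avoids tripping the tail-page
 * sanity checks when the HugeTLB page is later freed.
 */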
#define NR_RESET_STRUCT_PAGE 3

static inline void reset_struct_pages(struct page *start)
{
        int i;
        struct page *from = start + NR_RESET_STRUCT_PAGE;

        for (i = 0; i < NR_RESET_STRUCT_PAGE; i++)
                memcpy(start + i, from, sizeof(*from));
}

static void vmemmap_restore_pte(pte_t *pte, unsigned long addr,
                                struct vmemmap_remap_walk *walk)
{
        pgprot_t pgprot = PAGE_KERNEL;
        struct page *page;
        void *to;

        BUG_ON(pte_page(*pte) != walk->reuse_page);

        page = list_first_entry(walk->vmemmap_pages, struct page, lru);
        list_del(&page->lru);
        to = page_to_virt(page);
        copy_page(to, (void *)walk->reuse_addr);
        reset_struct_pages(to);

        set_pte_at(&init_mm, addr, pte, mk_pte(page, pgprot));
}
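
/**
 * vmemmap_remap_free - remap the vmemmap virtual address range [@start, @end)
 *                      to the page which @reuse is mapped to, then free the
 *                      vmemmap pages which the range was mapped to.
 * @start:      start address of the vmemmap virtual address range to remap.
 * @end:        end address of the vmemmap virtual address range to remap.
 * @reuse:      reuse address.
 *
 * Return: %0 on success, negative error code otherwise.
 */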
static int vmemmap_remap_free(unsigned long start, unsigned long end,
                              unsigned long reuse)
{
        int ret;
        LIST_HEAD(vmemmap_pages);
        struct vmemmap_remap_walk walk = {
                .remap_pte      = vmemmap_remap_pte,
                .reuse_addr     = reuse,
                .vmemmap_pages  = &vmemmap_pages,
        };

        /*
         * The vmemmap page table walk (see vmemmap_pte_range()) relies on the
         * following rules:
         *
         * - The range [@start, @end) and the range [@reuse, @reuse + PAGE_SIZE)
         *   must be contiguous.
         * - The @reuse address is part of the range [@reuse, @end) that is
         *   passed to vmemmap_remap_range().
         * - The @reuse address is the first address in the complete range.
         *
         * So make sure that @start and @reuse meet the above rules.
         */
        BUG_ON(start - reuse != PAGE_SIZE);

        mmap_read_lock(&init_mm);
        ret = vmemmap_remap_range(reuse, end, &walk);
        if (ret && walk.nr_walked) {
                end = reuse + walk.nr_walked * PAGE_SIZE;
                /*
                 * vmemmap_pages contains pages from the previous
                 * vmemmap_remap_range call which failed. These are pages
                 * which were removed from the vmemmap. They will be restored
                 * in the following call.
                 */
                walk = (struct vmemmap_remap_walk) {
                        .remap_pte      = vmemmap_restore_pte,
                        .reuse_addr     = reuse,
                        .vmemmap_pages  = &vmemmap_pages,
                };

                vmemmap_remap_range(reuse, end, &walk);
        }
        mmap_read_unlock(&init_mm);

        free_vmemmap_page_list(&vmemmap_pages);

        return ret;
}

static int alloc_vmemmap_page_list(unsigned long start, unsigned long end,
                                   gfp_t gfp_mask, struct list_head *list)
{
        unsigned long nr_pages = (end - start) >> PAGE_SHIFT;
        int nid = page_to_nid((struct page *)start);
        struct page *page, *next;

        while (nr_pages--) {
                page = alloc_pages_node(nid, gfp_mask, 0);
                if (!page)
                        goto out;
                list_add_tail(&page->lru, list);
        }

        return 0;
out:
        list_for_each_entry_safe(page, next, list, lru)
                __free_pages(page, 0);
        return -ENOMEM;
}
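
/**
 * vmemmap_remap_alloc - remap the vmemmap virtual address range [@start, @end)
 *                       to pages which are newly allocated onto
 *                       @vmemmap_pages.
 * @start:      start address of the vmemmap virtual address range to remap.
 * @end:        end address of the vmemmap virtual address range to remap.
 * @reuse:      reuse address.
 * @gfp_mask:   GFP flags for allocating vmemmap pages.
 *
 * Return: %0 on success, negative error code otherwise.
 */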
static int vmemmap_remap_alloc(unsigned long start, unsigned long end,
                               unsigned long reuse, gfp_t gfp_mask)
{
        LIST_HEAD(vmemmap_pages);
        struct vmemmap_remap_walk walk = {
                .remap_pte      = vmemmap_restore_pte,
                .reuse_addr     = reuse,
                .vmemmap_pages  = &vmemmap_pages,
        };

        /* See the comment in vmemmap_remap_free(). */
        BUG_ON(start - reuse != PAGE_SIZE);

        if (alloc_vmemmap_page_list(start, end, gfp_mask, &vmemmap_pages))
                return -ENOMEM;

        mmap_read_lock(&init_mm);
        vmemmap_remap_range(reuse, end, &walk);
        mmap_read_unlock(&init_mm);

        return 0;
}

DEFINE_STATIC_KEY_FALSE(hugetlb_optimize_vmemmap_key);
EXPORT_SYMBOL(hugetlb_optimize_vmemmap_key);

static bool vmemmap_optimize_enabled = IS_ENABLED(CONFIG_HUGETLB_PAGE_OPTIMIZE_VMEMMAP_DEFAULT_ON);
core_param(hugetlb_free_vmemmap, vmemmap_optimize_enabled, bool, 0);
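
/**
 * hugetlb_vmemmap_restore - restore previously optimized (by
 *                           hugetlb_vmemmap_optimize()) vmemmap pages which
 *                           will be reallocated and remapped.
 * @h:          struct hstate.
 * @head:       the head page whose vmemmap pages will be restored.
 *
 * Return: %0 if @head's vmemmap pages have been reallocated and remapped,
 * negative error code otherwise.
 */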
int hugetlb_vmemmap_restore(const struct hstate *h, struct page *head)
{
        int ret;
        unsigned long vmemmap_start = (unsigned long)head, vmemmap_end;
        unsigned long vmemmap_reuse;

        if (!HPageVmemmapOptimized(head))
                return 0;

        vmemmap_end     = vmemmap_start + hugetlb_vmemmap_size(h);
        vmemmap_reuse   = vmemmap_start;
        vmemmap_start   += HUGETLB_VMEMMAP_RESERVE_SIZE;

        /*
         * The pages which the vmemmap virtual address range [@vmemmap_start,
         * @vmemmap_end) maps to were freed to the buddy allocator, and the
         * range was remapped to the page which @vmemmap_reuse maps to. Before
         * the HugeTLB page can be freed to the buddy allocator, the discarded
         * vmemmap pages must be allocated and remapped again.
         */
        ret = vmemmap_remap_alloc(vmemmap_start, vmemmap_end, vmemmap_reuse,
                                  GFP_KERNEL | __GFP_NORETRY | __GFP_THISNODE);
        if (!ret) {
                ClearHPageVmemmapOptimized(head);
                static_branch_dec(&hugetlb_optimize_vmemmap_key);
        }

        return ret;
}
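
/* Return true iff a HugeTLB whose vmemmap should and can be optimized. */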
static bool vmemmap_should_optimize(const struct hstate *h, const struct page *head)
{
        if (!READ_ONCE(vmemmap_optimize_enabled))
                return false;

        if (!hugetlb_vmemmap_optimizable(h))
                return false;

        if (IS_ENABLED(CONFIG_MEMORY_HOTPLUG)) {
                pmd_t *pmdp, pmd;
                struct page *vmemmap_page;
                unsigned long vaddr = (unsigned long)head;

                /*
                 * Walk the kernel page table to find the page that backs
                 * @head's struct page, i.e. the vmemmap page itself.
                 */
                pmdp = pmd_off_k(vaddr);
                /*
                 * The READ_ONCE() is used to stabilize *pmdp in a register or
                 * on the stack so that it will stop changing under the code.
                 * The only concurrent operation where it can be changed is
                 * split_vmemmap_huge_pmd() (*pmdp will be stable after this
                 * operation).
                 */
                pmd = READ_ONCE(*pmdp);
                if (pmd_leaf(pmd))
                        vmemmap_page = pmd_page(pmd) + pte_index(vaddr);
                else
                        vmemmap_page = pte_page(*pte_offset_kernel(pmdp, vaddr));
                /*
                 * With memory_hotplug.memmap_on_memory, the vmemmap pages of a
                 * hotplugged memory block sit at the start of that block and
                 * are marked VmemmapSelfHosted. Due to HugeTLB alignment
                 * requirements, checking the vmemmap page that backs any one
                 * of @head's vmemmap pages is sufficient to detect this case,
                 * in which the optimization must be skipped.
                 */
                if (PageVmemmapSelfHosted(vmemmap_page))
                        return false;
        }

        return true;
}
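
/**
 * hugetlb_vmemmap_optimize - optimize @head page's vmemmap pages.
 * @h:          struct hstate.
 * @head:       the head page whose vmemmap pages will be optimized.
 *
 * This function only tries to optimize @head's vmemmap pages and does not
 * guarantee that the optimization will succeed after it returns. The caller
 * can use HPageVmemmapOptimized(@head) to detect if @head's vmemmap pages
 * have been optimized.
 */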
void hugetlb_vmemmap_optimize(const struct hstate *h, struct page *head)
{
        unsigned long vmemmap_start = (unsigned long)head, vmemmap_end;
        unsigned long vmemmap_reuse;

        if (!vmemmap_should_optimize(h, head))
                return;

        static_branch_inc(&hugetlb_optimize_vmemmap_key);

        vmemmap_end     = vmemmap_start + hugetlb_vmemmap_size(h);
        vmemmap_reuse   = vmemmap_start;
        vmemmap_start   += HUGETLB_VMEMMAP_RESERVE_SIZE;

        /*
         * Remap the vmemmap virtual address range [@vmemmap_start,
         * @vmemmap_end) to the page which @vmemmap_reuse is mapped to, then
         * free the pages which that range was mapped to.
         */
        if (vmemmap_remap_free(vmemmap_start, vmemmap_end, vmemmap_reuse))
                static_branch_dec(&hugetlb_optimize_vmemmap_key);
        else
                SetHPageVmemmapOptimized(head);
}

static struct ctl_table hugetlb_vmemmap_sysctls[] = {
        {
                .procname       = "hugetlb_optimize_vmemmap",
                .data           = &vmemmap_optimize_enabled,
                .maxlen         = sizeof(int),
                .mode           = 0644,
                .proc_handler   = proc_dobool,
        },
        { }
};

static int __init hugetlb_vmemmap_init(void)
{
        /* HUGETLB_VMEMMAP_RESERVE_SIZE should cover all used struct pages */
        BUILD_BUG_ON(__NR_USED_SUBPAGE * sizeof(struct page) > HUGETLB_VMEMMAP_RESERVE_SIZE);

        if (IS_ENABLED(CONFIG_PROC_SYSCTL)) {
                const struct hstate *h;

                for_each_hstate(h) {
                        if (hugetlb_vmemmap_optimizable(h)) {
                                register_sysctl_init("vm", hugetlb_vmemmap_sysctls);
                                break;
                        }
                }
        }
        return 0;
}
late_initcall(hugetlb_vmemmap_init);