// SPDX-License-Identifier: GPL-2.0-only
/*
 * Generic hugetlb support.
 */
0006 #include <linux/list.h>
0007 #include <linux/init.h>
0008 #include <linux/mm.h>
0009 #include <linux/seq_file.h>
0010 #include <linux/sysctl.h>
0011 #include <linux/highmem.h>
0012 #include <linux/mmu_notifier.h>
0013 #include <linux/nodemask.h>
0014 #include <linux/pagemap.h>
0015 #include <linux/mempolicy.h>
0016 #include <linux/compiler.h>
0017 #include <linux/cpuset.h>
0018 #include <linux/mutex.h>
0019 #include <linux/memblock.h>
0020 #include <linux/sysfs.h>
0021 #include <linux/slab.h>
0022 #include <linux/sched/mm.h>
0023 #include <linux/mmdebug.h>
0024 #include <linux/sched/signal.h>
0025 #include <linux/rmap.h>
0026 #include <linux/string_helpers.h>
0027 #include <linux/swap.h>
0028 #include <linux/swapops.h>
0029 #include <linux/jhash.h>
0030 #include <linux/numa.h>
0031 #include <linux/llist.h>
0032 #include <linux/cma.h>
0033 #include <linux/migrate.h>
0034 #include <linux/nospec.h>
0035 #include <linux/delayacct.h>
0036
0037 #include <asm/page.h>
0038 #include <asm/pgalloc.h>
0039 #include <asm/tlb.h>
0040
0041 #include <linux/io.h>
0042 #include <linux/hugetlb.h>
0043 #include <linux/hugetlb_cgroup.h>
0044 #include <linux/node.h>
0045 #include <linux/page_owner.h>
0046 #include "internal.h"
0047 #include "hugetlb_vmemmap.h"
0048
0049 int hugetlb_max_hstate __read_mostly;
0050 unsigned int default_hstate_idx;
0051 struct hstate hstates[HUGE_MAX_HSTATE];
0052
0053 #ifdef CONFIG_CMA
0054 static struct cma *hugetlb_cma[MAX_NUMNODES];
0055 static unsigned long hugetlb_cma_size_in_node[MAX_NUMNODES] __initdata;
0056 static bool hugetlb_cma_page(struct page *page, unsigned int order)
0057 {
0058 return cma_pages_valid(hugetlb_cma[page_to_nid(page)], page,
0059 1 << order);
0060 }
0061 #else
0062 static bool hugetlb_cma_page(struct page *page, unsigned int order)
0063 {
0064 return false;
0065 }
0066 #endif
0067 static unsigned long hugetlb_cma_size __initdata;
0068
0069 __initdata LIST_HEAD(huge_boot_pages);

/* for command line parsing */
0072 static struct hstate * __initdata parsed_hstate;
0073 static unsigned long __initdata default_hstate_max_huge_pages;
0074 static bool __initdata parsed_valid_hugepagesz = true;
0075 static bool __initdata parsed_default_hugepagesz;
0076 static unsigned int default_hugepages_in_node[MAX_NUMNODES] __initdata;
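
/*
 * hugetlb_lock protects the hugetlb free lists and the nr/free/surplus/
 * resv huge page counters updated throughout this file.
 */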
0082 DEFINE_SPINLOCK(hugetlb_lock);
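
/*
 * Serializes hugetlb faults on the same logical page, so that concurrent
 * faults on one page index do not race to instantiate it.
 */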
0088 static int num_fault_mutexes;
0089 struct mutex *hugetlb_fault_mutex_table ____cacheline_aligned_in_smp;

/* Forward declaration */
0092 static int hugetlb_acct_memory(struct hstate *h, long delta);
0093
0094 static inline bool subpool_is_free(struct hugepage_subpool *spool)
0095 {
0096 if (spool->count)
0097 return false;
0098 if (spool->max_hpages != -1)
0099 return spool->used_hpages == 0;
0100 if (spool->min_hpages != -1)
0101 return spool->rsv_hpages == spool->min_hpages;
0102
0103 return true;
0104 }
0105
0106 static inline void unlock_or_release_subpool(struct hugepage_subpool *spool,
0107 unsigned long irq_flags)
0108 {
0109 spin_unlock_irqrestore(&spool->lock, irq_flags);
0110
0111
0112
0113
0114 if (subpool_is_free(spool)) {
0115 if (spool->min_hpages != -1)
0116 hugetlb_acct_memory(spool->hstate,
0117 -spool->min_hpages);
0118 kfree(spool);
0119 }
0120 }
0121
0122 struct hugepage_subpool *hugepage_new_subpool(struct hstate *h, long max_hpages,
0123 long min_hpages)
0124 {
0125 struct hugepage_subpool *spool;
0126
0127 spool = kzalloc(sizeof(*spool), GFP_KERNEL);
0128 if (!spool)
0129 return NULL;
0130
0131 spin_lock_init(&spool->lock);
0132 spool->count = 1;
0133 spool->max_hpages = max_hpages;
0134 spool->hstate = h;
0135 spool->min_hpages = min_hpages;
0136
0137 if (min_hpages != -1 && hugetlb_acct_memory(h, min_hpages)) {
0138 kfree(spool);
0139 return NULL;
0140 }
0141 spool->rsv_hpages = min_hpages;
0142
0143 return spool;
0144 }
0145
0146 void hugepage_put_subpool(struct hugepage_subpool *spool)
0147 {
0148 unsigned long flags;
0149
0150 spin_lock_irqsave(&spool->lock, flags);
0151 BUG_ON(!spool->count);
0152 spool->count--;
0153 unlock_or_release_subpool(spool, flags);
0154 }
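
/*
 * Subpool accounting for allocating and reserving pages.
 * Returns -ENOMEM if the request would exceed the subpool maximum.
 * Otherwise returns the number of pages the caller must still charge to
 * the global pool, i.e. delta minus whatever could be satisfied from the
 * subpool's own minimum-size reserve.
 */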
0164 static long hugepage_subpool_get_pages(struct hugepage_subpool *spool,
0165 long delta)
0166 {
0167 long ret = delta;
0168
0169 if (!spool)
0170 return ret;
0171
0172 spin_lock_irq(&spool->lock);
0173
0174 if (spool->max_hpages != -1) {
0175 if ((spool->used_hpages + delta) <= spool->max_hpages)
0176 spool->used_hpages += delta;
0177 else {
0178 ret = -ENOMEM;
0179 goto unlock_ret;
0180 }
0181 }
0182
0183
0184 if (spool->min_hpages != -1 && spool->rsv_hpages) {
0185 if (delta > spool->rsv_hpages) {
0186
0187
0188
0189
0190 ret = delta - spool->rsv_hpages;
0191 spool->rsv_hpages = 0;
0192 } else {
0193 ret = 0;
0194 spool->rsv_hpages -= delta;
0195 }
0196 }
0197
0198 unlock_ret:
0199 spin_unlock_irq(&spool->lock);
0200 return ret;
0201 }
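
/*
 * Subpool accounting for freeing and unreserving pages.
 * Returns the number of global page reservations that must be dropped.
 * The return value may differ from the passed delta only when a subpool
 * minimum size is being maintained.
 */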
0209 static long hugepage_subpool_put_pages(struct hugepage_subpool *spool,
0210 long delta)
0211 {
0212 long ret = delta;
0213 unsigned long flags;
0214
0215 if (!spool)
0216 return delta;
0217
0218 spin_lock_irqsave(&spool->lock, flags);
0219
0220 if (spool->max_hpages != -1)
0221 spool->used_hpages -= delta;
0222
0223
0224 if (spool->min_hpages != -1 && spool->used_hpages < spool->min_hpages) {
0225 if (spool->rsv_hpages + delta <= spool->min_hpages)
0226 ret = 0;
0227 else
0228 ret = spool->rsv_hpages + delta - spool->min_hpages;
0229
0230 spool->rsv_hpages += delta;
0231 if (spool->rsv_hpages > spool->min_hpages)
0232 spool->rsv_hpages = spool->min_hpages;
0233 }
0234
0235
0236
0237
0238
0239 unlock_or_release_subpool(spool, flags);
0240
0241 return ret;
0242 }
0243
0244 static inline struct hugepage_subpool *subpool_inode(struct inode *inode)
0245 {
0246 return HUGETLBFS_SB(inode->i_sb)->spool;
0247 }
0248
0249 static inline struct hugepage_subpool *subpool_vma(struct vm_area_struct *vma)
0250 {
0251 return subpool_inode(file_inode(vma->vm_file));
0252 }
0253
0254
0255
0256
0257 static struct file_region *
0258 get_file_region_entry_from_cache(struct resv_map *resv, long from, long to)
0259 {
0260 struct file_region *nrg = NULL;
0261
0262 VM_BUG_ON(resv->region_cache_count <= 0);
0263
0264 resv->region_cache_count--;
0265 nrg = list_first_entry(&resv->region_cache, struct file_region, link);
0266 list_del(&nrg->link);
0267
0268 nrg->from = from;
0269 nrg->to = to;
0270
0271 return nrg;
0272 }
0273
0274 static void copy_hugetlb_cgroup_uncharge_info(struct file_region *nrg,
0275 struct file_region *rg)
0276 {
0277 #ifdef CONFIG_CGROUP_HUGETLB
0278 nrg->reservation_counter = rg->reservation_counter;
0279 nrg->css = rg->css;
0280 if (rg->css)
0281 css_get(rg->css);
0282 #endif
0283 }
0284
0285
0286 static void record_hugetlb_cgroup_uncharge_info(struct hugetlb_cgroup *h_cg,
0287 struct hstate *h,
0288 struct resv_map *resv,
0289 struct file_region *nrg)
0290 {
0291 #ifdef CONFIG_CGROUP_HUGETLB
0292 if (h_cg) {
0293 nrg->reservation_counter =
0294 &h_cg->rsvd_hugepage[hstate_index(h)];
0295 nrg->css = &h_cg->css;
0296
0297
0298
0299
0300
0301
0302
0303
0304
0305
0306 css_get(&h_cg->css);
0307 if (!resv->pages_per_hpage)
0308 resv->pages_per_hpage = pages_per_huge_page(h);
0309
0310
0311
0312 VM_BUG_ON(resv->pages_per_hpage != pages_per_huge_page(h));
0313 } else {
0314 nrg->reservation_counter = NULL;
0315 nrg->css = NULL;
0316 }
0317 #endif
0318 }
0319
0320 static void put_uncharge_info(struct file_region *rg)
0321 {
0322 #ifdef CONFIG_CGROUP_HUGETLB
0323 if (rg->css)
0324 css_put(rg->css);
0325 #endif
0326 }
0327
0328 static bool has_same_uncharge_info(struct file_region *rg,
0329 struct file_region *org)
0330 {
0331 #ifdef CONFIG_CGROUP_HUGETLB
0332 return rg->reservation_counter == org->reservation_counter &&
0333 rg->css == org->css;
0334
0335 #else
0336 return true;
0337 #endif
0338 }
0339
0340 static void coalesce_file_region(struct resv_map *resv, struct file_region *rg)
0341 {
0342 struct file_region *nrg = NULL, *prg = NULL;
0343
0344 prg = list_prev_entry(rg, link);
0345 if (&prg->link != &resv->regions && prg->to == rg->from &&
0346 has_same_uncharge_info(prg, rg)) {
0347 prg->to = rg->to;
0348
0349 list_del(&rg->link);
0350 put_uncharge_info(rg);
0351 kfree(rg);
0352
0353 rg = prg;
0354 }
0355
0356 nrg = list_next_entry(rg, link);
0357 if (&nrg->link != &resv->regions && nrg->from == rg->to &&
0358 has_same_uncharge_info(nrg, rg)) {
0359 nrg->from = rg->from;
0360
0361 list_del(&rg->link);
0362 put_uncharge_info(rg);
0363 kfree(rg);
0364 }
0365 }
0366
0367 static inline long
0368 hugetlb_resv_map_add(struct resv_map *map, struct list_head *rg, long from,
0369 long to, struct hstate *h, struct hugetlb_cgroup *cg,
0370 long *regions_needed)
0371 {
0372 struct file_region *nrg;
0373
0374 if (!regions_needed) {
0375 nrg = get_file_region_entry_from_cache(map, from, to);
0376 record_hugetlb_cgroup_uncharge_info(cg, h, map, nrg);
0377 list_add(&nrg->link, rg);
0378 coalesce_file_region(map, nrg);
0379 } else
0380 *regions_needed += 1;
0381
0382 return to - from;
0383 }
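
/*
 * Walk the resv_map for the range [f, t).  When regions_needed is NULL,
 * add file_regions (taken from the region cache) covering any part of the
 * range not already present.  When regions_needed is non-NULL, only count
 * how many new entries a later add would require.  Returns the number of
 * huge pages in the range that are not already covered.  Callers must
 * hold resv->lock.
 */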
0393 static long add_reservation_in_range(struct resv_map *resv, long f, long t,
0394 struct hugetlb_cgroup *h_cg,
0395 struct hstate *h, long *regions_needed)
0396 {
0397 long add = 0;
0398 struct list_head *head = &resv->regions;
0399 long last_accounted_offset = f;
0400 struct file_region *iter, *trg = NULL;
0401 struct list_head *rg = NULL;
0402
0403 if (regions_needed)
0404 *regions_needed = 0;
0405
0406
0407
0408
0409
0410 list_for_each_entry_safe(iter, trg, head, link) {
0411
0412 if (iter->from < f) {
0413
0414
0415
0416 if (iter->to > last_accounted_offset)
0417 last_accounted_offset = iter->to;
0418 continue;
0419 }
0420
0421
0422
0423
0424 if (iter->from >= t) {
0425 rg = iter->link.prev;
0426 break;
0427 }
0428
0429
0430
0431
0432 if (iter->from > last_accounted_offset)
0433 add += hugetlb_resv_map_add(resv, iter->link.prev,
0434 last_accounted_offset,
0435 iter->from, h, h_cg,
0436 regions_needed);
0437
0438 last_accounted_offset = iter->to;
0439 }
0440
0441
0442
0443
0444 if (!rg)
0445 rg = head->prev;
0446 if (last_accounted_offset < t)
0447 add += hugetlb_resv_map_add(resv, rg, last_accounted_offset,
0448 t, h, h_cg, regions_needed);
0449
0450 return add;
0451 }
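
/*
 * Make sure the region cache holds enough spare entries for all adds in
 * progress plus regions_needed more, allocating with resv->lock
 * temporarily dropped.  Returns 0 on success, -ENOMEM on failure.
 */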
0455 static int allocate_file_region_entries(struct resv_map *resv,
0456 int regions_needed)
0457 __must_hold(&resv->lock)
0458 {
0459 struct list_head allocated_regions;
0460 int to_allocate = 0, i = 0;
0461 struct file_region *trg = NULL, *rg = NULL;
0462
0463 VM_BUG_ON(regions_needed < 0);
0464
0465 INIT_LIST_HEAD(&allocated_regions);
0466
0467
0468
0469
0470
0471
0472
0473
0474
0475
0476 while (resv->region_cache_count <
0477 (resv->adds_in_progress + regions_needed)) {
0478 to_allocate = resv->adds_in_progress + regions_needed -
0479 resv->region_cache_count;
0480
0481
0482
0483
0484
0485 VM_BUG_ON(resv->region_cache_count < resv->adds_in_progress);
0486
0487 spin_unlock(&resv->lock);
0488 for (i = 0; i < to_allocate; i++) {
0489 trg = kmalloc(sizeof(*trg), GFP_KERNEL);
0490 if (!trg)
0491 goto out_of_memory;
0492 list_add(&trg->link, &allocated_regions);
0493 }
0494
0495 spin_lock(&resv->lock);
0496
0497 list_splice(&allocated_regions, &resv->region_cache);
0498 resv->region_cache_count += to_allocate;
0499 }
0500
0501 return 0;
0502
0503 out_of_memory:
0504 list_for_each_entry_safe(rg, trg, &allocated_regions, link) {
0505 list_del(&rg->link);
0506 kfree(rg);
0507 }
0508 return -ENOMEM;
0509 }
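
/*
 * Add the huge page range [f, t) to the reserve map.  Entries are taken
 * from the region cache, which was stocked by a prior region_chg() call
 * for this range (in_regions_needed is the count that call returned).
 * If racing updates to the map mean more entries are now required, the
 * cache is refilled and the operation retried.  Returns the number of
 * huge pages newly added to the map, or -ENOMEM if the extra entries
 * could not be allocated.
 */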
0528 static long region_add(struct resv_map *resv, long f, long t,
0529 long in_regions_needed, struct hstate *h,
0530 struct hugetlb_cgroup *h_cg)
0531 {
0532 long add = 0, actual_regions_needed = 0;
0533
0534 spin_lock(&resv->lock);
0535 retry:
0536
0537
0538 add_reservation_in_range(resv, f, t, NULL, NULL,
0539 &actual_regions_needed);
0540
0541
0542
0543
0544
0545
0546
0547
0548
0549
0550 if (actual_regions_needed > in_regions_needed &&
0551 resv->region_cache_count <
0552 resv->adds_in_progress +
0553 (actual_regions_needed - in_regions_needed)) {
0554
0555
0556
0557 VM_BUG_ON(t - f <= 1);
0558
0559 if (allocate_file_region_entries(
0560 resv, actual_regions_needed - in_regions_needed)) {
0561 return -ENOMEM;
0562 }
0563
0564 goto retry;
0565 }
0566
0567 add = add_reservation_in_range(resv, f, t, h_cg, h, NULL);
0568
0569 resv->adds_in_progress -= in_regions_needed;
0570
0571 spin_unlock(&resv->lock);
0572 return add;
0573 }
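
/*
 * Determine how many huge pages in the range [f, t) are not yet covered
 * by the reserve map, without modifying the map itself.  The number of
 * file_region entries a follow-up region_add() may need is returned in
 * *out_regions_needed (at least 1); that many cache entries are
 * pre-allocated and adds_in_progress is raised accordingly, so that the
 * subsequent region_add() (or region_abort()) normally has all the
 * entries it needs.  Returns the number of pages that would be added,
 * or -ENOMEM if the entries could not be allocated.
 */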
0595 static long region_chg(struct resv_map *resv, long f, long t,
0596 long *out_regions_needed)
0597 {
0598 long chg = 0;
0599
0600 spin_lock(&resv->lock);
0601
0602
0603 chg = add_reservation_in_range(resv, f, t, NULL, NULL,
0604 out_regions_needed);
0605
0606 if (*out_regions_needed == 0)
0607 *out_regions_needed = 1;
0608
0609 if (allocate_file_region_entries(resv, *out_regions_needed))
0610 return -ENOMEM;
0611
0612 resv->adds_in_progress += *out_regions_needed;
0613
0614 spin_unlock(&resv->lock);
0615 return chg;
0616 }
0617
0618
0619
0620
0621
0622
0623
0624
0625
0626
0627
0628
0629
0630
0631 static void region_abort(struct resv_map *resv, long f, long t,
0632 long regions_needed)
0633 {
0634 spin_lock(&resv->lock);
0635 VM_BUG_ON(!resv->region_cache_count);
0636 resv->adds_in_progress -= regions_needed;
0637 spin_unlock(&resv->lock);
0638 }
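
/*
 * Delete the range [f, t) from the reserve map.  Regions that straddle
 * the boundaries are trimmed or split; a split may need a spare
 * file_region, in which case the lock is dropped, an entry allocated and
 * the walk retried.  Deleted reservations are uncharged from the hugetlb
 * cgroup.  Returns the number of huge pages removed, or -ENOMEM if a
 * required entry could not be allocated.
 */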
0654 static long region_del(struct resv_map *resv, long f, long t)
0655 {
0656 struct list_head *head = &resv->regions;
0657 struct file_region *rg, *trg;
0658 struct file_region *nrg = NULL;
0659 long del = 0;
0660
0661 retry:
0662 spin_lock(&resv->lock);
0663 list_for_each_entry_safe(rg, trg, head, link) {
0664
0665
0666
0667
0668
0669
0670
0671 if (rg->to <= f && (rg->to != rg->from || rg->to != f))
0672 continue;
0673
0674 if (rg->from >= t)
0675 break;
0676
0677 if (f > rg->from && t < rg->to) {
0678
0679
0680
0681
0682 if (!nrg &&
0683 resv->region_cache_count > resv->adds_in_progress) {
0684 nrg = list_first_entry(&resv->region_cache,
0685 struct file_region,
0686 link);
0687 list_del(&nrg->link);
0688 resv->region_cache_count--;
0689 }
0690
0691 if (!nrg) {
0692 spin_unlock(&resv->lock);
0693 nrg = kmalloc(sizeof(*nrg), GFP_KERNEL);
0694 if (!nrg)
0695 return -ENOMEM;
0696 goto retry;
0697 }
0698
0699 del += t - f;
0700 hugetlb_cgroup_uncharge_file_region(
0701 resv, rg, t - f, false);
0702
0703
0704 nrg->from = t;
0705 nrg->to = rg->to;
0706
0707 copy_hugetlb_cgroup_uncharge_info(nrg, rg);
0708
0709 INIT_LIST_HEAD(&nrg->link);
0710
0711
0712 rg->to = f;
0713
0714 list_add(&nrg->link, &rg->link);
0715 nrg = NULL;
0716 break;
0717 }
0718
0719 if (f <= rg->from && t >= rg->to) {
0720 del += rg->to - rg->from;
0721 hugetlb_cgroup_uncharge_file_region(resv, rg,
0722 rg->to - rg->from, true);
0723 list_del(&rg->link);
0724 kfree(rg);
0725 continue;
0726 }
0727
0728 if (f <= rg->from) {
0729 hugetlb_cgroup_uncharge_file_region(resv, rg,
0730 t - rg->from, false);
0731
0732 del += t - rg->from;
0733 rg->from = t;
0734 } else {
0735 hugetlb_cgroup_uncharge_file_region(resv, rg,
0736 rg->to - f, false);
0737
0738 del += rg->to - f;
0739 rg->to = f;
0740 }
0741 }
0742
0743 spin_unlock(&resv->lock);
0744 kfree(nrg);
0745 return del;
0746 }
0747
0748
0749
0750
0751
0752
0753
0754
0755
0756
0757 void hugetlb_fix_reserve_counts(struct inode *inode)
0758 {
0759 struct hugepage_subpool *spool = subpool_inode(inode);
0760 long rsv_adjust;
0761 bool reserved = false;
0762
0763 rsv_adjust = hugepage_subpool_get_pages(spool, 1);
0764 if (rsv_adjust > 0) {
0765 struct hstate *h = hstate_inode(inode);
0766
0767 if (!hugetlb_acct_memory(h, 1))
0768 reserved = true;
0769 } else if (!rsv_adjust) {
0770 reserved = true;
0771 }
0772
0773 if (!reserved)
0774 pr_warn("hugetlb: Huge Page Reserved count may go negative.\n");
0775 }
0776
0777
0778
0779
0780
0781 static long region_count(struct resv_map *resv, long f, long t)
0782 {
0783 struct list_head *head = &resv->regions;
0784 struct file_region *rg;
0785 long chg = 0;
0786
0787 spin_lock(&resv->lock);
0788
0789 list_for_each_entry(rg, head, link) {
0790 long seg_from;
0791 long seg_to;
0792
0793 if (rg->to <= f)
0794 continue;
0795 if (rg->from >= t)
0796 break;
0797
0798 seg_from = max(rg->from, f);
0799 seg_to = min(rg->to, t);
0800
0801 chg += seg_to - seg_from;
0802 }
0803 spin_unlock(&resv->lock);
0804
0805 return chg;
0806 }
0807
0808
0809
0810
0811
0812 static pgoff_t vma_hugecache_offset(struct hstate *h,
0813 struct vm_area_struct *vma, unsigned long address)
0814 {
0815 return ((address - vma->vm_start) >> huge_page_shift(h)) +
0816 (vma->vm_pgoff >> huge_page_order(h));
0817 }
0818
0819 pgoff_t linear_hugepage_index(struct vm_area_struct *vma,
0820 unsigned long address)
0821 {
0822 return vma_hugecache_offset(hstate_vma(vma), vma, address);
0823 }
0824 EXPORT_SYMBOL_GPL(linear_hugepage_index);
0825
0826
0827
0828
0829
0830 unsigned long vma_kernel_pagesize(struct vm_area_struct *vma)
0831 {
0832 if (vma->vm_ops && vma->vm_ops->pagesize)
0833 return vma->vm_ops->pagesize(vma);
0834 return PAGE_SIZE;
0835 }
0836 EXPORT_SYMBOL_GPL(vma_kernel_pagesize);
0837
0838
0839
0840
0841
0842
0843
0844 __weak unsigned long vma_mmu_pagesize(struct vm_area_struct *vma)
0845 {
0846 return vma_kernel_pagesize(vma);
0847 }
0848
0849
0850
0851
0852
0853
0854 #define HPAGE_RESV_OWNER (1UL << 0)
0855 #define HPAGE_RESV_UNMAPPED (1UL << 1)
0856 #define HPAGE_RESV_MASK (HPAGE_RESV_OWNER | HPAGE_RESV_UNMAPPED)
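
/*
 * Helpers for the per-VMA reservation bookkeeping.  Shared (VM_MAYSHARE)
 * mappings keep their reserve map on the inode; private mappings store a
 * pointer to their reserve map in vm_private_data, packed together with
 * the HPAGE_RESV_* flags above.  HPAGE_RESV_OWNER marks the VMA that
 * created (and therefore owns) the reservation.
 */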
0877 static unsigned long get_vma_private_data(struct vm_area_struct *vma)
0878 {
0879 return (unsigned long)vma->vm_private_data;
0880 }
0881
0882 static void set_vma_private_data(struct vm_area_struct *vma,
0883 unsigned long value)
0884 {
0885 vma->vm_private_data = (void *)value;
0886 }
0887
0888 static void
0889 resv_map_set_hugetlb_cgroup_uncharge_info(struct resv_map *resv_map,
0890 struct hugetlb_cgroup *h_cg,
0891 struct hstate *h)
0892 {
0893 #ifdef CONFIG_CGROUP_HUGETLB
0894 if (!h_cg || !h) {
0895 resv_map->reservation_counter = NULL;
0896 resv_map->pages_per_hpage = 0;
0897 resv_map->css = NULL;
0898 } else {
0899 resv_map->reservation_counter =
0900 &h_cg->rsvd_hugepage[hstate_index(h)];
0901 resv_map->pages_per_hpage = pages_per_huge_page(h);
0902 resv_map->css = &h_cg->css;
0903 }
0904 #endif
0905 }
0906
0907 struct resv_map *resv_map_alloc(void)
0908 {
0909 struct resv_map *resv_map = kmalloc(sizeof(*resv_map), GFP_KERNEL);
0910 struct file_region *rg = kmalloc(sizeof(*rg), GFP_KERNEL);
0911
0912 if (!resv_map || !rg) {
0913 kfree(resv_map);
0914 kfree(rg);
0915 return NULL;
0916 }
0917
0918 kref_init(&resv_map->refs);
0919 spin_lock_init(&resv_map->lock);
0920 INIT_LIST_HEAD(&resv_map->regions);
0921
0922 resv_map->adds_in_progress = 0;
0923
0924
0925
0926
0927
0928
0929 resv_map_set_hugetlb_cgroup_uncharge_info(resv_map, NULL, NULL);
0930
0931 INIT_LIST_HEAD(&resv_map->region_cache);
0932 list_add(&rg->link, &resv_map->region_cache);
0933 resv_map->region_cache_count = 1;
0934
0935 return resv_map;
0936 }
0937
0938 void resv_map_release(struct kref *ref)
0939 {
0940 struct resv_map *resv_map = container_of(ref, struct resv_map, refs);
0941 struct list_head *head = &resv_map->region_cache;
0942 struct file_region *rg, *trg;
0943
0944
0945 region_del(resv_map, 0, LONG_MAX);
0946
0947
0948 list_for_each_entry_safe(rg, trg, head, link) {
0949 list_del(&rg->link);
0950 kfree(rg);
0951 }
0952
0953 VM_BUG_ON(resv_map->adds_in_progress);
0954
0955 kfree(resv_map);
0956 }
0957
0958 static inline struct resv_map *inode_resv_map(struct inode *inode)
0959 {
0960
0961
0962
0963
0964
0965
0966
0967
0968 return (struct resv_map *)(&inode->i_data)->private_data;
0969 }
0970
0971 static struct resv_map *vma_resv_map(struct vm_area_struct *vma)
0972 {
0973 VM_BUG_ON_VMA(!is_vm_hugetlb_page(vma), vma);
0974 if (vma->vm_flags & VM_MAYSHARE) {
0975 struct address_space *mapping = vma->vm_file->f_mapping;
0976 struct inode *inode = mapping->host;
0977
0978 return inode_resv_map(inode);
0979
0980 } else {
0981 return (struct resv_map *)(get_vma_private_data(vma) &
0982 ~HPAGE_RESV_MASK);
0983 }
0984 }
0985
0986 static void set_vma_resv_map(struct vm_area_struct *vma, struct resv_map *map)
0987 {
0988 VM_BUG_ON_VMA(!is_vm_hugetlb_page(vma), vma);
0989 VM_BUG_ON_VMA(vma->vm_flags & VM_MAYSHARE, vma);
0990
0991 set_vma_private_data(vma, (get_vma_private_data(vma) &
0992 HPAGE_RESV_MASK) | (unsigned long)map);
0993 }
0994
0995 static void set_vma_resv_flags(struct vm_area_struct *vma, unsigned long flags)
0996 {
0997 VM_BUG_ON_VMA(!is_vm_hugetlb_page(vma), vma);
0998 VM_BUG_ON_VMA(vma->vm_flags & VM_MAYSHARE, vma);
0999
1000 set_vma_private_data(vma, get_vma_private_data(vma) | flags);
1001 }
1002
1003 static int is_vma_resv_set(struct vm_area_struct *vma, unsigned long flag)
1004 {
1005 VM_BUG_ON_VMA(!is_vm_hugetlb_page(vma), vma);
1006
1007 return (get_vma_private_data(vma) & flag) != 0;
1008 }
1009
1010
1011 void reset_vma_resv_huge_pages(struct vm_area_struct *vma)
1012 {
1013 VM_BUG_ON_VMA(!is_vm_hugetlb_page(vma), vma);
1014 if (!(vma->vm_flags & VM_MAYSHARE))
1015 vma->vm_private_data = (void *)0;
1016 }
1017
1018
1019
1020
1021
1022
1023
1024
1025 void clear_vma_resv_huge_pages(struct vm_area_struct *vma)
1026 {
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039 struct resv_map *reservations = vma_resv_map(vma);
1040
1041 if (reservations && is_vma_resv_set(vma, HPAGE_RESV_OWNER)) {
1042 resv_map_put_hugetlb_cgroup_uncharge_info(reservations);
1043 kref_put(&reservations->refs, resv_map_release);
1044 }
1045
1046 reset_vma_resv_huge_pages(vma);
1047 }
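
/* Returns true if the VMA has associated reserve pages */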
1050 static bool vma_has_reserves(struct vm_area_struct *vma, long chg)
1051 {
1052 if (vma->vm_flags & VM_NORESERVE) {
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062 if (vma->vm_flags & VM_MAYSHARE && chg == 0)
1063 return true;
1064 else
1065 return false;
1066 }
1067
1068
1069 if (vma->vm_flags & VM_MAYSHARE) {
1070
1071
1072
1073
1074
1075
1076
1077 if (chg)
1078 return false;
1079 else
1080 return true;
1081 }
1082
1083
1084
1085
1086
1087 if (is_vma_resv_set(vma, HPAGE_RESV_OWNER)) {
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103 if (chg)
1104 return false;
1105 else
1106 return true;
1107 }
1108
1109 return false;
1110 }
1111
1112 static void enqueue_huge_page(struct hstate *h, struct page *page)
1113 {
1114 int nid = page_to_nid(page);
1115
1116 lockdep_assert_held(&hugetlb_lock);
1117 VM_BUG_ON_PAGE(page_count(page), page);
1118
1119 list_move(&page->lru, &h->hugepage_freelists[nid]);
1120 h->free_huge_pages++;
1121 h->free_huge_pages_node[nid]++;
1122 SetHPageFreed(page);
1123 }
1124
1125 static struct page *dequeue_huge_page_node_exact(struct hstate *h, int nid)
1126 {
1127 struct page *page;
1128 bool pin = !!(current->flags & PF_MEMALLOC_PIN);
1129
1130 lockdep_assert_held(&hugetlb_lock);
1131 list_for_each_entry(page, &h->hugepage_freelists[nid], lru) {
1132 if (pin && !is_longterm_pinnable_page(page))
1133 continue;
1134
1135 if (PageHWPoison(page))
1136 continue;
1137
1138 list_move(&page->lru, &h->hugepage_activelist);
1139 set_page_refcounted(page);
1140 ClearHPageFreed(page);
1141 h->free_huge_pages--;
1142 h->free_huge_pages_node[nid]--;
1143 return page;
1144 }
1145
1146 return NULL;
1147 }
1148
1149 static struct page *dequeue_huge_page_nodemask(struct hstate *h, gfp_t gfp_mask, int nid,
1150 nodemask_t *nmask)
1151 {
1152 unsigned int cpuset_mems_cookie;
1153 struct zonelist *zonelist;
1154 struct zone *zone;
1155 struct zoneref *z;
1156 int node = NUMA_NO_NODE;
1157
1158 zonelist = node_zonelist(nid, gfp_mask);
1159
1160 retry_cpuset:
1161 cpuset_mems_cookie = read_mems_allowed_begin();
1162 for_each_zone_zonelist_nodemask(zone, z, zonelist, gfp_zone(gfp_mask), nmask) {
1163 struct page *page;
1164
1165 if (!cpuset_zone_allowed(zone, gfp_mask))
1166 continue;
1167
1168
1169
1170
1171 if (zone_to_nid(zone) == node)
1172 continue;
1173 node = zone_to_nid(zone);
1174
1175 page = dequeue_huge_page_node_exact(h, node);
1176 if (page)
1177 return page;
1178 }
1179 if (unlikely(read_mems_allowed_retry(cpuset_mems_cookie)))
1180 goto retry_cpuset;
1181
1182 return NULL;
1183 }
1184
1185 static struct page *dequeue_huge_page_vma(struct hstate *h,
1186 struct vm_area_struct *vma,
1187 unsigned long address, int avoid_reserve,
1188 long chg)
1189 {
1190 struct page *page = NULL;
1191 struct mempolicy *mpol;
1192 gfp_t gfp_mask;
1193 nodemask_t *nodemask;
1194 int nid;
1195
1196
1197
1198
1199
1200
1201 if (!vma_has_reserves(vma, chg) &&
1202 h->free_huge_pages - h->resv_huge_pages == 0)
1203 goto err;
1204
1205
1206 if (avoid_reserve && h->free_huge_pages - h->resv_huge_pages == 0)
1207 goto err;
1208
1209 gfp_mask = htlb_alloc_mask(h);
1210 nid = huge_node(vma, address, gfp_mask, &mpol, &nodemask);
1211
1212 if (mpol_is_preferred_many(mpol)) {
1213 page = dequeue_huge_page_nodemask(h, gfp_mask, nid, nodemask);
1214
1215
1216 nodemask = NULL;
1217 }
1218
1219 if (!page)
1220 page = dequeue_huge_page_nodemask(h, gfp_mask, nid, nodemask);
1221
1222 if (page && !avoid_reserve && vma_has_reserves(vma, chg)) {
1223 SetHPageRestoreReserve(page);
1224 h->resv_huge_pages--;
1225 }
1226
1227 mpol_cond_put(mpol);
1228 return page;
1229
1230 err:
1231 return NULL;
1232 }
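
/*
 * Common helpers for hstate_next_node_to_{alloc|free}.  The saved
 * h->next_nid_to_{alloc|free} may have been set with a different
 * nodes_allowed mask, so clamp it to an allowed node before use.
 */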
1241 static int next_node_allowed(int nid, nodemask_t *nodes_allowed)
1242 {
1243 nid = next_node_in(nid, *nodes_allowed);
1244 VM_BUG_ON(nid >= MAX_NUMNODES);
1245
1246 return nid;
1247 }
1248
1249 static int get_valid_node_allowed(int nid, nodemask_t *nodes_allowed)
1250 {
1251 if (!node_isset(nid, *nodes_allowed))
1252 nid = next_node_allowed(nid, nodes_allowed);
1253 return nid;
1254 }
1255
1256
1257
1258
1259
1260
1261
1262 static int hstate_next_node_to_alloc(struct hstate *h,
1263 nodemask_t *nodes_allowed)
1264 {
1265 int nid;
1266
1267 VM_BUG_ON(!nodes_allowed);
1268
1269 nid = get_valid_node_allowed(h->next_nid_to_alloc, nodes_allowed);
1270 h->next_nid_to_alloc = next_node_allowed(nid, nodes_allowed);
1271
1272 return nid;
1273 }
1274
1275
1276
1277
1278
1279
1280
1281 static int hstate_next_node_to_free(struct hstate *h, nodemask_t *nodes_allowed)
1282 {
1283 int nid;
1284
1285 VM_BUG_ON(!nodes_allowed);
1286
1287 nid = get_valid_node_allowed(h->next_nid_to_free, nodes_allowed);
1288 h->next_nid_to_free = next_node_allowed(nid, nodes_allowed);
1289
1290 return nid;
1291 }
1292
1293 #define for_each_node_mask_to_alloc(hs, nr_nodes, node, mask) \
1294 for (nr_nodes = nodes_weight(*mask); \
1295 nr_nodes > 0 && \
1296 ((node = hstate_next_node_to_alloc(hs, mask)) || 1); \
1297 nr_nodes--)
1298
1299 #define for_each_node_mask_to_free(hs, nr_nodes, node, mask) \
1300 for (nr_nodes = nodes_weight(*mask); \
1301 nr_nodes > 0 && \
1302 ((node = hstate_next_node_to_free(hs, mask)) || 1); \
1303 nr_nodes--)
1304
1305
1306 static void __destroy_compound_gigantic_page(struct page *page,
1307 unsigned int order, bool demote)
1308 {
1309 int i;
1310 int nr_pages = 1 << order;
1311 struct page *p = page + 1;
1312
1313 atomic_set(compound_mapcount_ptr(page), 0);
1314 atomic_set(compound_pincount_ptr(page), 0);
1315
1316 for (i = 1; i < nr_pages; i++, p = mem_map_next(p, page, i)) {
1317 p->mapping = NULL;
1318 clear_compound_head(p);
1319 if (!demote)
1320 set_page_refcounted(p);
1321 }
1322
1323 set_compound_order(page, 0);
1324 #ifdef CONFIG_64BIT
1325 page[1].compound_nr = 0;
1326 #endif
1327 __ClearPageHead(page);
1328 }
1329
1330 static void destroy_compound_hugetlb_page_for_demote(struct page *page,
1331 unsigned int order)
1332 {
1333 __destroy_compound_gigantic_page(page, order, true);
1334 }
1335
1336 #ifdef CONFIG_ARCH_HAS_GIGANTIC_PAGE
1337 static void destroy_compound_gigantic_page(struct page *page,
1338 unsigned int order)
1339 {
1340 __destroy_compound_gigantic_page(page, order, false);
1341 }
1342
1343 static void free_gigantic_page(struct page *page, unsigned int order)
1344 {
1345
1346
1347
1348
1349 #ifdef CONFIG_CMA
1350 if (cma_release(hugetlb_cma[page_to_nid(page)], page, 1 << order))
1351 return;
1352 #endif
1353
1354 free_contig_range(page_to_pfn(page), 1 << order);
1355 }
1356
1357 #ifdef CONFIG_CONTIG_ALLOC
1358 static struct page *alloc_gigantic_page(struct hstate *h, gfp_t gfp_mask,
1359 int nid, nodemask_t *nodemask)
1360 {
1361 unsigned long nr_pages = pages_per_huge_page(h);
1362 if (nid == NUMA_NO_NODE)
1363 nid = numa_mem_id();
1364
1365 #ifdef CONFIG_CMA
1366 {
1367 struct page *page;
1368 int node;
1369
1370 if (hugetlb_cma[nid]) {
1371 page = cma_alloc(hugetlb_cma[nid], nr_pages,
1372 huge_page_order(h), true);
1373 if (page)
1374 return page;
1375 }
1376
1377 if (!(gfp_mask & __GFP_THISNODE)) {
1378 for_each_node_mask(node, *nodemask) {
1379 if (node == nid || !hugetlb_cma[node])
1380 continue;
1381
1382 page = cma_alloc(hugetlb_cma[node], nr_pages,
1383 huge_page_order(h), true);
1384 if (page)
1385 return page;
1386 }
1387 }
1388 }
1389 #endif
1390
1391 return alloc_contig_pages(nr_pages, gfp_mask, nid, nodemask);
1392 }
1393
1394 #else
1395 static struct page *alloc_gigantic_page(struct hstate *h, gfp_t gfp_mask,
1396 int nid, nodemask_t *nodemask)
1397 {
1398 return NULL;
1399 }
1400 #endif
1401
1402 #else
1403 static struct page *alloc_gigantic_page(struct hstate *h, gfp_t gfp_mask,
1404 int nid, nodemask_t *nodemask)
1405 {
1406 return NULL;
1407 }
1408 static inline void free_gigantic_page(struct page *page, unsigned int order) { }
1409 static inline void destroy_compound_gigantic_page(struct page *page,
1410 unsigned int order) { }
1411 #endif
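
/*
 * Remove hugetlb page from lists and update the destructor so the page
 * appears as just a compound page.  A reference is taken on the page,
 * except in the demote case.  Must be called with hugetlb_lock held;
 * callers complete the free with update_and_free_page() after dropping
 * the lock.
 */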
1421 static void __remove_hugetlb_page(struct hstate *h, struct page *page,
1422 bool adjust_surplus,
1423 bool demote)
1424 {
1425 int nid = page_to_nid(page);
1426
1427 VM_BUG_ON_PAGE(hugetlb_cgroup_from_page(page), page);
1428 VM_BUG_ON_PAGE(hugetlb_cgroup_from_page_rsvd(page), page);
1429
1430 lockdep_assert_held(&hugetlb_lock);
1431 if (hstate_is_gigantic(h) && !gigantic_page_runtime_supported())
1432 return;
1433
1434 list_del(&page->lru);
1435
1436 if (HPageFreed(page)) {
1437 h->free_huge_pages--;
1438 h->free_huge_pages_node[nid]--;
1439 }
1440 if (adjust_surplus) {
1441 h->surplus_huge_pages--;
1442 h->surplus_huge_pages_node[nid]--;
1443 }
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465 if (!demote)
1466 set_page_refcounted(page);
1467 if (hstate_is_gigantic(h))
1468 set_compound_page_dtor(page, NULL_COMPOUND_DTOR);
1469 else
1470 set_compound_page_dtor(page, COMPOUND_PAGE_DTOR);
1471
1472 h->nr_huge_pages--;
1473 h->nr_huge_pages_node[nid]--;
1474 }
1475
1476 static void remove_hugetlb_page(struct hstate *h, struct page *page,
1477 bool adjust_surplus)
1478 {
1479 __remove_hugetlb_page(h, page, adjust_surplus, false);
1480 }
1481
1482 static void remove_hugetlb_page_for_demote(struct hstate *h, struct page *page,
1483 bool adjust_surplus)
1484 {
1485 __remove_hugetlb_page(h, page, adjust_surplus, true);
1486 }
1487
1488 static void add_hugetlb_page(struct hstate *h, struct page *page,
1489 bool adjust_surplus)
1490 {
1491 int zeroed;
1492 int nid = page_to_nid(page);
1493
1494 VM_BUG_ON_PAGE(!HPageVmemmapOptimized(page), page);
1495
1496 lockdep_assert_held(&hugetlb_lock);
1497
1498 INIT_LIST_HEAD(&page->lru);
1499 h->nr_huge_pages++;
1500 h->nr_huge_pages_node[nid]++;
1501
1502 if (adjust_surplus) {
1503 h->surplus_huge_pages++;
1504 h->surplus_huge_pages_node[nid]++;
1505 }
1506
1507 set_compound_page_dtor(page, HUGETLB_PAGE_DTOR);
1508 set_page_private(page, 0);
1509 SetHPageVmemmapOptimized(page);
1510
1511
1512
1513
1514
1515
1516 zeroed = put_page_testzero(page);
1517 if (!zeroed)
1518
1519
1520
1521
1522
1523
1524 return;
1525
1526 arch_clear_hugepage_flags(page);
1527 enqueue_huge_page(h, page);
1528 }
1529
1530 static void __update_and_free_page(struct hstate *h, struct page *page)
1531 {
1532 int i;
1533 struct page *subpage = page;
1534
1535 if (hstate_is_gigantic(h) && !gigantic_page_runtime_supported())
1536 return;
1537
1538
1539
1540
1541
1542 if (HPageRawHwpUnreliable(page))
1543 return;
1544
1545 if (hugetlb_vmemmap_restore(h, page)) {
1546 spin_lock_irq(&hugetlb_lock);
1547
1548
1549
1550
1551
1552 add_hugetlb_page(h, page, true);
1553 spin_unlock_irq(&hugetlb_lock);
1554 return;
1555 }
1556
1557
1558
1559
1560
1561 if (unlikely(PageHWPoison(page)))
1562 hugetlb_clear_page_hwpoison(page);
1563
1564 for (i = 0; i < pages_per_huge_page(h);
1565 i++, subpage = mem_map_next(subpage, page, i)) {
1566 subpage->flags &= ~(1 << PG_locked | 1 << PG_error |
1567 1 << PG_referenced | 1 << PG_dirty |
1568 1 << PG_active | 1 << PG_private |
1569 1 << PG_writeback);
1570 }
1571
1572
1573
1574
1575
1576 if (hstate_is_gigantic(h) ||
1577 hugetlb_cma_page(page, huge_page_order(h))) {
1578 destroy_compound_gigantic_page(page, huge_page_order(h));
1579 free_gigantic_page(page, huge_page_order(h));
1580 } else {
1581 __free_pages(page, huge_page_order(h));
1582 }
1583 }
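
/*
 * free_huge_page() may run in contexts where restoring the vmemmap (which
 * can sleep) is not allowed.  Pages whose vmemmap has been optimized are
 * therefore pushed onto this lockless list and freed later from a
 * workqueue; while queued, page->mapping is reused as the llist_node.
 */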
1596 static LLIST_HEAD(hpage_freelist);
1597
1598 static void free_hpage_workfn(struct work_struct *work)
1599 {
1600 struct llist_node *node;
1601
1602 node = llist_del_all(&hpage_freelist);
1603
1604 while (node) {
1605 struct page *page;
1606 struct hstate *h;
1607
1608 page = container_of((struct address_space **)node,
1609 struct page, mapping);
1610 node = node->next;
1611 page->mapping = NULL;
1612
1613
1614
1615
1616
1617
1618 h = size_to_hstate(page_size(page));
1619
1620 __update_and_free_page(h, page);
1621
1622 cond_resched();
1623 }
1624 }
1625 static DECLARE_WORK(free_hpage_work, free_hpage_workfn);
1626
1627 static inline void flush_free_hpage_work(struct hstate *h)
1628 {
1629 if (hugetlb_vmemmap_optimizable(h))
1630 flush_work(&free_hpage_work);
1631 }
1632
1633 static void update_and_free_page(struct hstate *h, struct page *page,
1634 bool atomic)
1635 {
1636 if (!HPageVmemmapOptimized(page) || !atomic) {
1637 __update_and_free_page(h, page);
1638 return;
1639 }
1640
1641
1642
1643
1644
1645
1646
1647
1648 if (llist_add((struct llist_node *)&page->mapping, &hpage_freelist))
1649 schedule_work(&free_hpage_work);
1650 }
1651
1652 static void update_and_free_pages_bulk(struct hstate *h, struct list_head *list)
1653 {
1654 struct page *page, *t_page;
1655
1656 list_for_each_entry_safe(page, t_page, list, lru) {
1657 update_and_free_page(h, page, false);
1658 cond_resched();
1659 }
1660 }
1661
1662 struct hstate *size_to_hstate(unsigned long size)
1663 {
1664 struct hstate *h;
1665
1666 for_each_hstate(h) {
1667 if (huge_page_size(h) == size)
1668 return h;
1669 }
1670 return NULL;
1671 }
1672
1673 void free_huge_page(struct page *page)
1674 {
1675
1676
1677
1678
1679 struct hstate *h = page_hstate(page);
1680 int nid = page_to_nid(page);
1681 struct hugepage_subpool *spool = hugetlb_page_subpool(page);
1682 bool restore_reserve;
1683 unsigned long flags;
1684
1685 VM_BUG_ON_PAGE(page_count(page), page);
1686 VM_BUG_ON_PAGE(page_mapcount(page), page);
1687
1688 hugetlb_set_page_subpool(page, NULL);
1689 if (PageAnon(page))
1690 __ClearPageAnonExclusive(page);
1691 page->mapping = NULL;
1692 restore_reserve = HPageRestoreReserve(page);
1693 ClearHPageRestoreReserve(page);
1694
1695
1696
1697
1698
1699
1700
1701
1702
1703 if (!restore_reserve) {
1704
1705
1706
1707
1708
1709
1710 if (hugepage_subpool_put_pages(spool, 1) == 0)
1711 restore_reserve = true;
1712 }
1713
1714 spin_lock_irqsave(&hugetlb_lock, flags);
1715 ClearHPageMigratable(page);
1716 hugetlb_cgroup_uncharge_page(hstate_index(h),
1717 pages_per_huge_page(h), page);
1718 hugetlb_cgroup_uncharge_page_rsvd(hstate_index(h),
1719 pages_per_huge_page(h), page);
1720 if (restore_reserve)
1721 h->resv_huge_pages++;
1722
1723 if (HPageTemporary(page)) {
1724 remove_hugetlb_page(h, page, false);
1725 spin_unlock_irqrestore(&hugetlb_lock, flags);
1726 update_and_free_page(h, page, true);
1727 } else if (h->surplus_huge_pages_node[nid]) {
1728
1729 remove_hugetlb_page(h, page, true);
1730 spin_unlock_irqrestore(&hugetlb_lock, flags);
1731 update_and_free_page(h, page, true);
1732 } else {
1733 arch_clear_hugepage_flags(page);
1734 enqueue_huge_page(h, page);
1735 spin_unlock_irqrestore(&hugetlb_lock, flags);
1736 }
1737 }
1738
1739
1740
1741
1742 static void __prep_account_new_huge_page(struct hstate *h, int nid)
1743 {
1744 lockdep_assert_held(&hugetlb_lock);
1745 h->nr_huge_pages++;
1746 h->nr_huge_pages_node[nid]++;
1747 }
1748
1749 static void __prep_new_huge_page(struct hstate *h, struct page *page)
1750 {
1751 hugetlb_vmemmap_optimize(h, page);
1752 INIT_LIST_HEAD(&page->lru);
1753 set_compound_page_dtor(page, HUGETLB_PAGE_DTOR);
1754 hugetlb_set_page_subpool(page, NULL);
1755 set_hugetlb_cgroup(page, NULL);
1756 set_hugetlb_cgroup_rsvd(page, NULL);
1757 }
1758
1759 static void prep_new_huge_page(struct hstate *h, struct page *page, int nid)
1760 {
1761 __prep_new_huge_page(h, page);
1762 spin_lock_irq(&hugetlb_lock);
1763 __prep_account_new_huge_page(h, nid);
1764 spin_unlock_irq(&hugetlb_lock);
1765 }
1766
1767 static bool __prep_compound_gigantic_page(struct page *page, unsigned int order,
1768 bool demote)
1769 {
1770 int i, j;
1771 int nr_pages = 1 << order;
1772 struct page *p = page + 1;
1773
1774
1775 set_compound_order(page, order);
1776 __ClearPageReserved(page);
1777 __SetPageHead(page);
1778 for (i = 1; i < nr_pages; i++, p = mem_map_next(p, page, i)) {
1779
1780
1781
1782
1783
1784
1785
1786
1787
1788
1789
1790
1791 __ClearPageReserved(p);
1792
1793
1794
1795
1796
1797
1798
1799
1800
1801
1802
1803
1804
1805
1806
1807
1808
1809 if (!demote) {
1810 if (!page_ref_freeze(p, 1)) {
1811 pr_warn("HugeTLB page can not be used due to unexpected inflated ref count\n");
1812 goto out_error;
1813 }
1814 } else {
1815 VM_BUG_ON_PAGE(page_count(p), p);
1816 }
1817 set_compound_head(p, page);
1818 }
1819 atomic_set(compound_mapcount_ptr(page), -1);
1820 atomic_set(compound_pincount_ptr(page), 0);
1821 return true;
1822
1823 out_error:
1824
1825 p = page + 1;
1826 for (j = 1; j < i; j++, p = mem_map_next(p, page, j)) {
1827 clear_compound_head(p);
1828 set_page_refcounted(p);
1829 }
1830
1831 for (; j < nr_pages; j++, p = mem_map_next(p, page, j))
1832 __ClearPageReserved(p);
1833 set_compound_order(page, 0);
1834 #ifdef CONFIG_64BIT
1835 page[1].compound_nr = 0;
1836 #endif
1837 __ClearPageHead(page);
1838 return false;
1839 }
1840
1841 static bool prep_compound_gigantic_page(struct page *page, unsigned int order)
1842 {
1843 return __prep_compound_gigantic_page(page, order, false);
1844 }
1845
1846 static bool prep_compound_gigantic_page_for_demote(struct page *page,
1847 unsigned int order)
1848 {
1849 return __prep_compound_gigantic_page(page, order, true);
1850 }
1851
1852
1853
1854
1855
1856
1857 int PageHuge(struct page *page)
1858 {
1859 if (!PageCompound(page))
1860 return 0;
1861
1862 page = compound_head(page);
1863 return page[1].compound_dtor == HUGETLB_PAGE_DTOR;
1864 }
1865 EXPORT_SYMBOL_GPL(PageHuge);
1866
1867
1868
1869
1870
1871 int PageHeadHuge(struct page *page_head)
1872 {
1873 if (!PageHead(page_head))
1874 return 0;
1875
1876 return page_head[1].compound_dtor == HUGETLB_PAGE_DTOR;
1877 }
1878 EXPORT_SYMBOL_GPL(PageHeadHuge);
1879
1880
1881
1882
1883
1884
1885
1886
1887 struct address_space *hugetlb_page_mapping_lock_write(struct page *hpage)
1888 {
1889 struct address_space *mapping = page_mapping(hpage);
1890
1891 if (!mapping)
1892 return mapping;
1893
1894 if (i_mmap_trylock_write(mapping))
1895 return mapping;
1896
1897 return NULL;
1898 }
1899
1900 pgoff_t hugetlb_basepage_index(struct page *page)
1901 {
1902 struct page *page_head = compound_head(page);
1903 pgoff_t index = page_index(page_head);
1904 unsigned long compound_idx;
1905
1906 if (compound_order(page_head) >= MAX_ORDER)
1907 compound_idx = page_to_pfn(page) - page_to_pfn(page_head);
1908 else
1909 compound_idx = page - page_head;
1910
1911 return (index << compound_order(page_head)) + compound_idx;
1912 }
1913
1914 static struct page *alloc_buddy_huge_page(struct hstate *h,
1915 gfp_t gfp_mask, int nid, nodemask_t *nmask,
1916 nodemask_t *node_alloc_noretry)
1917 {
1918 int order = huge_page_order(h);
1919 struct page *page;
1920 bool alloc_try_hard = true;
1921
1922
1923
1924
1925
1926
1927
1928
1929 if (node_alloc_noretry && node_isset(nid, *node_alloc_noretry))
1930 alloc_try_hard = false;
1931 gfp_mask |= __GFP_COMP|__GFP_NOWARN;
1932 if (alloc_try_hard)
1933 gfp_mask |= __GFP_RETRY_MAYFAIL;
1934 if (nid == NUMA_NO_NODE)
1935 nid = numa_mem_id();
1936 page = __alloc_pages(gfp_mask, order, nid, nmask);
1937 if (page)
1938 __count_vm_event(HTLB_BUDDY_PGALLOC);
1939 else
1940 __count_vm_event(HTLB_BUDDY_PGALLOC_FAIL);
1941
1942
1943
1944
1945
1946
1947 if (node_alloc_noretry && page && !alloc_try_hard)
1948 node_clear(nid, *node_alloc_noretry);
1949
1950
1951
1952
1953
1954
1955 if (node_alloc_noretry && !page && alloc_try_hard)
1956 node_set(nid, *node_alloc_noretry);
1957
1958 return page;
1959 }
1960
1961
1962
1963
1964
1965 static struct page *alloc_fresh_huge_page(struct hstate *h,
1966 gfp_t gfp_mask, int nid, nodemask_t *nmask,
1967 nodemask_t *node_alloc_noretry)
1968 {
1969 struct page *page;
1970 bool retry = false;
1971
1972 retry:
1973 if (hstate_is_gigantic(h))
1974 page = alloc_gigantic_page(h, gfp_mask, nid, nmask);
1975 else
1976 page = alloc_buddy_huge_page(h, gfp_mask,
1977 nid, nmask, node_alloc_noretry);
1978 if (!page)
1979 return NULL;
1980
1981 if (hstate_is_gigantic(h)) {
1982 if (!prep_compound_gigantic_page(page, huge_page_order(h))) {
1983
1984
1985
1986
1987 free_gigantic_page(page, huge_page_order(h));
1988 if (!retry) {
1989 retry = true;
1990 goto retry;
1991 }
1992 return NULL;
1993 }
1994 }
1995 prep_new_huge_page(h, page, page_to_nid(page));
1996
1997 return page;
1998 }
1999
2000
2001
2002
2003
2004 static int alloc_pool_huge_page(struct hstate *h, nodemask_t *nodes_allowed,
2005 nodemask_t *node_alloc_noretry)
2006 {
2007 struct page *page;
2008 int nr_nodes, node;
2009 gfp_t gfp_mask = htlb_alloc_mask(h) | __GFP_THISNODE;
2010
2011 for_each_node_mask_to_alloc(h, nr_nodes, node, nodes_allowed) {
2012 page = alloc_fresh_huge_page(h, gfp_mask, node, nodes_allowed,
2013 node_alloc_noretry);
2014 if (page)
2015 break;
2016 }
2017
2018 if (!page)
2019 return 0;
2020
2021 put_page(page);
2022
2023 return 1;
2024 }
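
/*
 * Remove a huge page from the pool, taking it from the next node in the
 * free rotation so that persistent huge pages stay roughly balanced over
 * the allowed nodes.  Only 'removes' the page; the caller must free it to
 * the low level allocators afterwards.  Called with hugetlb_lock held.
 */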
2033 static struct page *remove_pool_huge_page(struct hstate *h,
2034 nodemask_t *nodes_allowed,
2035 bool acct_surplus)
2036 {
2037 int nr_nodes, node;
2038 struct page *page = NULL;
2039
2040 lockdep_assert_held(&hugetlb_lock);
2041 for_each_node_mask_to_free(h, nr_nodes, node, nodes_allowed) {
2042
2043
2044
2045
2046 if ((!acct_surplus || h->surplus_huge_pages_node[node]) &&
2047 !list_empty(&h->hugepage_freelists[node])) {
2048 page = list_entry(h->hugepage_freelists[node].next,
2049 struct page, lru);
2050 remove_hugetlb_page(h, page, acct_surplus);
2051 break;
2052 }
2053 }
2054
2055 return page;
2056 }
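
/*
 * Dissolve a given free hugepage into free buddy pages.  Does nothing for
 * in-use hugepages and non-hugepages.  Returns 0 on success (or if the
 * page is not a hugepage), -EBUSY if the page cannot currently be
 * dissolved, and an error if its vmemmap could not be restored (in which
 * case the page is returned to the pool).
 */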
2072 int dissolve_free_huge_page(struct page *page)
2073 {
2074 int rc = -EBUSY;
2075
2076 retry:
2077
2078 if (!PageHuge(page))
2079 return 0;
2080
2081 spin_lock_irq(&hugetlb_lock);
2082 if (!PageHuge(page)) {
2083 rc = 0;
2084 goto out;
2085 }
2086
2087 if (!page_count(page)) {
2088 struct page *head = compound_head(page);
2089 struct hstate *h = page_hstate(head);
2090 if (h->free_huge_pages - h->resv_huge_pages == 0)
2091 goto out;
2092
2093
2094
2095
2096
2097 if (unlikely(!HPageFreed(head))) {
2098 spin_unlock_irq(&hugetlb_lock);
2099 cond_resched();
2100
2101
2102
2103
2104
2105
2106
2107
2108
2109 goto retry;
2110 }
2111
2112 remove_hugetlb_page(h, head, false);
2113 h->max_huge_pages--;
2114 spin_unlock_irq(&hugetlb_lock);
2115
2116
2117
2118
2119
2120
2121
2122
2123
2124 rc = hugetlb_vmemmap_restore(h, head);
2125 if (!rc) {
2126 update_and_free_page(h, head, false);
2127 } else {
2128 spin_lock_irq(&hugetlb_lock);
2129 add_hugetlb_page(h, head, false);
2130 h->max_huge_pages++;
2131 spin_unlock_irq(&hugetlb_lock);
2132 }
2133
2134 return rc;
2135 }
2136 out:
2137 spin_unlock_irq(&hugetlb_lock);
2138 return rc;
2139 }
2140
2141
2142
2143
2144
2145
2146
2147
2148
2149 int dissolve_free_huge_pages(unsigned long start_pfn, unsigned long end_pfn)
2150 {
2151 unsigned long pfn;
2152 struct page *page;
2153 int rc = 0;
2154 unsigned int order;
2155 struct hstate *h;
2156
2157 if (!hugepages_supported())
2158 return rc;
2159
2160 order = huge_page_order(&default_hstate);
2161 for_each_hstate(h)
2162 order = min(order, huge_page_order(h));
2163
2164 for (pfn = start_pfn; pfn < end_pfn; pfn += 1 << order) {
2165 page = pfn_to_page(pfn);
2166 rc = dissolve_free_huge_page(page);
2167 if (rc)
2168 break;
2169 }
2170
2171 return rc;
2172 }
2173
2174
2175
2176
2177 static struct page *alloc_surplus_huge_page(struct hstate *h, gfp_t gfp_mask,
2178 int nid, nodemask_t *nmask, bool zero_ref)
2179 {
2180 struct page *page = NULL;
2181 bool retry = false;
2182
2183 if (hstate_is_gigantic(h))
2184 return NULL;
2185
2186 spin_lock_irq(&hugetlb_lock);
2187 if (h->surplus_huge_pages >= h->nr_overcommit_huge_pages)
2188 goto out_unlock;
2189 spin_unlock_irq(&hugetlb_lock);
2190
2191 retry:
2192 page = alloc_fresh_huge_page(h, gfp_mask, nid, nmask, NULL);
2193 if (!page)
2194 return NULL;
2195
2196 spin_lock_irq(&hugetlb_lock);
2197
2198
2199
2200
2201
2202
2203
2204 if (h->surplus_huge_pages >= h->nr_overcommit_huge_pages) {
2205 SetHPageTemporary(page);
2206 spin_unlock_irq(&hugetlb_lock);
2207 put_page(page);
2208 return NULL;
2209 }
2210
2211 if (zero_ref) {
2212
2213
2214
2215
2216
2217
2218 SetHPageTemporary(page);
2219 if (!put_page_testzero(page)) {
2220
2221
2222
2223
2224 pr_info("HugeTLB unexpected inflated ref count on freshly allocated page\n");
2225 spin_unlock_irq(&hugetlb_lock);
2226 if (retry)
2227 return NULL;
2228
2229 retry = true;
2230 goto retry;
2231 }
2232 ClearHPageTemporary(page);
2233 }
2234
2235 h->surplus_huge_pages++;
2236 h->surplus_huge_pages_node[page_to_nid(page)]++;
2237
2238 out_unlock:
2239 spin_unlock_irq(&hugetlb_lock);
2240
2241 return page;
2242 }
2243
2244 static struct page *alloc_migrate_huge_page(struct hstate *h, gfp_t gfp_mask,
2245 int nid, nodemask_t *nmask)
2246 {
2247 struct page *page;
2248
2249 if (hstate_is_gigantic(h))
2250 return NULL;
2251
2252 page = alloc_fresh_huge_page(h, gfp_mask, nid, nmask, NULL);
2253 if (!page)
2254 return NULL;
2255
2256
2257
2258
2259
2260 SetHPageTemporary(page);
2261
2262 return page;
2263 }
2264
2265
2266
2267
2268 static
2269 struct page *alloc_buddy_huge_page_with_mpol(struct hstate *h,
2270 struct vm_area_struct *vma, unsigned long addr)
2271 {
2272 struct page *page = NULL;
2273 struct mempolicy *mpol;
2274 gfp_t gfp_mask = htlb_alloc_mask(h);
2275 int nid;
2276 nodemask_t *nodemask;
2277
2278 nid = huge_node(vma, addr, gfp_mask, &mpol, &nodemask);
2279 if (mpol_is_preferred_many(mpol)) {
2280 gfp_t gfp = gfp_mask | __GFP_NOWARN;
2281
2282 gfp &= ~(__GFP_DIRECT_RECLAIM | __GFP_NOFAIL);
2283 page = alloc_surplus_huge_page(h, gfp, nid, nodemask, false);
2284
2285
2286 nodemask = NULL;
2287 }
2288
2289 if (!page)
2290 page = alloc_surplus_huge_page(h, gfp_mask, nid, nodemask, false);
2291 mpol_cond_put(mpol);
2292 return page;
2293 }
2294
2295
2296 struct page *alloc_huge_page_nodemask(struct hstate *h, int preferred_nid,
2297 nodemask_t *nmask, gfp_t gfp_mask)
2298 {
2299 spin_lock_irq(&hugetlb_lock);
2300 if (h->free_huge_pages - h->resv_huge_pages > 0) {
2301 struct page *page;
2302
2303 page = dequeue_huge_page_nodemask(h, gfp_mask, preferred_nid, nmask);
2304 if (page) {
2305 spin_unlock_irq(&hugetlb_lock);
2306 return page;
2307 }
2308 }
2309 spin_unlock_irq(&hugetlb_lock);
2310
2311 return alloc_migrate_huge_page(h, gfp_mask, preferred_nid, nmask);
2312 }
2313
2314
2315 struct page *alloc_huge_page_vma(struct hstate *h, struct vm_area_struct *vma,
2316 unsigned long address)
2317 {
2318 struct mempolicy *mpol;
2319 nodemask_t *nodemask;
2320 struct page *page;
2321 gfp_t gfp_mask;
2322 int node;
2323
2324 gfp_mask = htlb_alloc_mask(h);
2325 node = huge_node(vma, address, gfp_mask, &mpol, &nodemask);
2326 page = alloc_huge_page_nodemask(h, node, nodemask, gfp_mask);
2327 mpol_cond_put(mpol);
2328
2329 return page;
2330 }
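
/*
 * Increase the hugetlb pool so it can accommodate a reservation of size
 * 'delta'.  Surplus pages are allocated with hugetlb_lock temporarily
 * dropped and any excess is given back.  Returns 0 on success, -ENOMEM if
 * not enough pages could be allocated.
 */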
2336 static int gather_surplus_pages(struct hstate *h, long delta)
2337 __must_hold(&hugetlb_lock)
2338 {
2339 struct list_head surplus_list;
2340 struct page *page, *tmp;
2341 int ret;
2342 long i;
2343 long needed, allocated;
2344 bool alloc_ok = true;
2345
2346 lockdep_assert_held(&hugetlb_lock);
2347 needed = (h->resv_huge_pages + delta) - h->free_huge_pages;
2348 if (needed <= 0) {
2349 h->resv_huge_pages += delta;
2350 return 0;
2351 }
2352
2353 allocated = 0;
2354 INIT_LIST_HEAD(&surplus_list);
2355
2356 ret = -ENOMEM;
2357 retry:
2358 spin_unlock_irq(&hugetlb_lock);
2359 for (i = 0; i < needed; i++) {
2360 page = alloc_surplus_huge_page(h, htlb_alloc_mask(h),
2361 NUMA_NO_NODE, NULL, true);
2362 if (!page) {
2363 alloc_ok = false;
2364 break;
2365 }
2366 list_add(&page->lru, &surplus_list);
2367 cond_resched();
2368 }
2369 allocated += i;
2370
2371
2372
2373
2374
2375 spin_lock_irq(&hugetlb_lock);
2376 needed = (h->resv_huge_pages + delta) -
2377 (h->free_huge_pages + allocated);
2378 if (needed > 0) {
2379 if (alloc_ok)
2380 goto retry;
2381
2382
2383
2384
2385
2386 goto free;
2387 }
2388
2389
2390
2391
2392
2393
2394
2395
2396 needed += allocated;
2397 h->resv_huge_pages += delta;
2398 ret = 0;
2399
2400
2401 list_for_each_entry_safe(page, tmp, &surplus_list, lru) {
2402 if ((--needed) < 0)
2403 break;
2404
2405 enqueue_huge_page(h, page);
2406 }
2407 free:
2408 spin_unlock_irq(&hugetlb_lock);
2409
2410
2411
2412
2413
2414 list_for_each_entry_safe(page, tmp, &surplus_list, lru)
2415 free_huge_page(page);
2416 spin_lock_irq(&hugetlb_lock);
2417
2418 return ret;
2419 }
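
/*
 * This routine has two main purposes:
 * 1) Decrement the reservation count (resv_huge_pages) by the value passed
 *    in unused_resv_pages, matching the adjustments previously made by the
 *    reservation routines.
 * 2) Free any unused surplus pages that were allocated to satisfy the
 *    reservation; at most unused_resv_pages are freed.
 */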
2429 static void return_unused_surplus_pages(struct hstate *h,
2430 unsigned long unused_resv_pages)
2431 {
2432 unsigned long nr_pages;
2433 struct page *page;
2434 LIST_HEAD(page_list);
2435
2436 lockdep_assert_held(&hugetlb_lock);
2437
2438 h->resv_huge_pages -= unused_resv_pages;
2439
2440 if (hstate_is_gigantic(h) && !gigantic_page_runtime_supported())
2441 goto out;
2442
2443
2444
2445
2446
2447 nr_pages = min(unused_resv_pages, h->surplus_huge_pages);
2448
2449
2450
2451
2452
2453
2454
2455
2456
2457 while (nr_pages--) {
2458 page = remove_pool_huge_page(h, &node_states[N_MEMORY], 1);
2459 if (!page)
2460 goto out;
2461
2462 list_add(&page->lru, &page_list);
2463 }
2464
2465 out:
2466 spin_unlock_irq(&hugetlb_lock);
2467 update_and_free_pages_bulk(h, &page_list);
2468 spin_lock_irq(&hugetlb_lock);
2469 }
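
/*
 * vma_needs_reservation, vma_commit_reservation and vma_end_reservation
 * are used by the huge page allocation routines to manage reservations.
 *
 * vma_needs_reservation is called to determine if the huge page at addr
 * within the vma has an associated reservation.  If so, 0 is returned;
 * otherwise 1 is returned (or a negative error) and the caller must
 * manage the global reservation and subpool usage counts.  After the page
 * is allocated, vma_commit_reservation adds it to the reservation map; if
 * allocation fails, vma_end_reservation is called instead to abort the
 * pending add.
 *
 * vma_add_reservation and vma_del_reservation are used on error paths to
 * restore, respectively remove, a reserve map entry for a page that is
 * being freed.
 */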
2501 enum vma_resv_mode {
2502 VMA_NEEDS_RESV,
2503 VMA_COMMIT_RESV,
2504 VMA_END_RESV,
2505 VMA_ADD_RESV,
2506 VMA_DEL_RESV,
2507 };
2508 static long __vma_reservation_common(struct hstate *h,
2509 struct vm_area_struct *vma, unsigned long addr,
2510 enum vma_resv_mode mode)
2511 {
2512 struct resv_map *resv;
2513 pgoff_t idx;
2514 long ret;
2515 long dummy_out_regions_needed;
2516
2517 resv = vma_resv_map(vma);
2518 if (!resv)
2519 return 1;
2520
2521 idx = vma_hugecache_offset(h, vma, addr);
2522 switch (mode) {
2523 case VMA_NEEDS_RESV:
2524 ret = region_chg(resv, idx, idx + 1, &dummy_out_regions_needed);
2525
2526
2527
2528
2529 VM_BUG_ON(dummy_out_regions_needed != 1);
2530 break;
2531 case VMA_COMMIT_RESV:
2532 ret = region_add(resv, idx, idx + 1, 1, NULL, NULL);
2533
2534 VM_BUG_ON(ret < 0);
2535 break;
2536 case VMA_END_RESV:
2537 region_abort(resv, idx, idx + 1, 1);
2538 ret = 0;
2539 break;
2540 case VMA_ADD_RESV:
2541 if (vma->vm_flags & VM_MAYSHARE) {
2542 ret = region_add(resv, idx, idx + 1, 1, NULL, NULL);
2543
2544 VM_BUG_ON(ret < 0);
2545 } else {
2546 region_abort(resv, idx, idx + 1, 1);
2547 ret = region_del(resv, idx, idx + 1);
2548 }
2549 break;
2550 case VMA_DEL_RESV:
2551 if (vma->vm_flags & VM_MAYSHARE) {
2552 region_abort(resv, idx, idx + 1, 1);
2553 ret = region_del(resv, idx, idx + 1);
2554 } else {
2555 ret = region_add(resv, idx, idx + 1, 1, NULL, NULL);
2556
2557 VM_BUG_ON(ret < 0);
2558 }
2559 break;
2560 default:
2561 BUG();
2562 }
2563
2564 if (vma->vm_flags & VM_MAYSHARE || mode == VMA_DEL_RESV)
2565 return ret;
2566
2567
2568
2569
2570
2571
2572
2573
2574
2575
2576
2577
2578
2579
2580
2581 if (ret > 0)
2582 return 0;
2583 if (ret == 0)
2584 return 1;
2585 return ret;
2586 }
2587
2588 static long vma_needs_reservation(struct hstate *h,
2589 struct vm_area_struct *vma, unsigned long addr)
2590 {
2591 return __vma_reservation_common(h, vma, addr, VMA_NEEDS_RESV);
2592 }
2593
2594 static long vma_commit_reservation(struct hstate *h,
2595 struct vm_area_struct *vma, unsigned long addr)
2596 {
2597 return __vma_reservation_common(h, vma, addr, VMA_COMMIT_RESV);
2598 }
2599
2600 static void vma_end_reservation(struct hstate *h,
2601 struct vm_area_struct *vma, unsigned long addr)
2602 {
2603 (void)__vma_reservation_common(h, vma, addr, VMA_END_RESV);
2604 }
2605
2606 static long vma_add_reservation(struct hstate *h,
2607 struct vm_area_struct *vma, unsigned long addr)
2608 {
2609 return __vma_reservation_common(h, vma, addr, VMA_ADD_RESV);
2610 }
2611
2612 static long vma_del_reservation(struct hstate *h,
2613 struct vm_area_struct *vma, unsigned long addr)
2614 {
2615 return __vma_reservation_common(h, vma, addr, VMA_DEL_RESV);
2616 }
2617
2618
2619
2620
2621
2622
2623
2624
2625
2626
2627
2628
2629
2630
2631
2632
2633
2634
2635
2636
2637
2638 void restore_reserve_on_error(struct hstate *h, struct vm_area_struct *vma,
2639 unsigned long address, struct page *page)
2640 {
2641 long rc = vma_needs_reservation(h, vma, address);
2642
2643 if (HPageRestoreReserve(page)) {
2644 if (unlikely(rc < 0))
2645
2646
2647
2648
2649
2650
2651
2652
2653
2654
2655
2656 ClearHPageRestoreReserve(page);
2657 else if (rc)
2658 (void)vma_add_reservation(h, vma, address);
2659 else
2660 vma_end_reservation(h, vma, address);
2661 } else {
2662 if (!rc) {
2663
2664
2665
2666
2667
2668
2669
2670
2671 rc = vma_del_reservation(h, vma, address);
2672 if (rc < 0)
2673
2674
2675
2676
2677
2678
2679
2680
2681 SetHPageRestoreReserve(page);
2682 } else if (rc < 0) {
2683
2684
2685
2686
2687
2688
2689
2690
2691
2692
2693 if (!(vma->vm_flags & VM_MAYSHARE))
2694
2695
2696
2697
2698
2699
2700
2701
2702 SetHPageRestoreReserve(page);
2703 } else
2704
2705
2706
2707 vma_end_reservation(h, vma, address);
2708 }
2709 }
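
/*
 * alloc_and_dissolve_huge_page - Allocate a new page and dissolve @old_page
 * @h: struct hstate @old_page belongs to
 * @old_page: Old page to dissolve
 * @list: List to isolate the page in case we need to
 * Returns 0 on success, otherwise negated error.
 */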
2718 static int alloc_and_dissolve_huge_page(struct hstate *h, struct page *old_page,
2719 struct list_head *list)
2720 {
2721 gfp_t gfp_mask = htlb_alloc_mask(h) | __GFP_THISNODE;
2722 int nid = page_to_nid(old_page);
2723 bool alloc_retry = false;
2724 struct page *new_page;
2725 int ret = 0;
2726
2727
2728
2729
2730
2731
2732
2733
2734 alloc_retry:
2735 new_page = alloc_buddy_huge_page(h, gfp_mask, nid, NULL, NULL);
2736 if (!new_page)
2737 return -ENOMEM;
2738
2739
2740
2741
2742
2743
2744
2745
2746
2747
2748 SetHPageTemporary(new_page);
2749 if (!put_page_testzero(new_page)) {
2750 if (alloc_retry)
2751 return -EBUSY;
2752
2753 alloc_retry = true;
2754 goto alloc_retry;
2755 }
2756 ClearHPageTemporary(new_page);
2757
2758 __prep_new_huge_page(h, new_page);
2759
2760 retry:
2761 spin_lock_irq(&hugetlb_lock);
2762 if (!PageHuge(old_page)) {
2763
2764
2765
2766 goto free_new;
2767 } else if (page_count(old_page)) {
2768
2769
2770
2771
2772 spin_unlock_irq(&hugetlb_lock);
2773 ret = isolate_hugetlb(old_page, list);
2774 spin_lock_irq(&hugetlb_lock);
2775 goto free_new;
2776 } else if (!HPageFreed(old_page)) {
2777
2778
2779
2780
2781
2782 spin_unlock_irq(&hugetlb_lock);
2783 cond_resched();
2784 goto retry;
2785 } else {
2786
2787
2788
2789
2790
2791
2792
2793 remove_hugetlb_page(h, old_page, false);
2794
2795
2796
2797
2798
2799 __prep_account_new_huge_page(h, nid);
2800 enqueue_huge_page(h, new_page);
2801
2802
2803
2804
2805 spin_unlock_irq(&hugetlb_lock);
2806 update_and_free_page(h, old_page, false);
2807 }
2808
2809 return ret;
2810
2811 free_new:
2812 spin_unlock_irq(&hugetlb_lock);
2813
2814 set_page_refcounted(new_page);
2815 update_and_free_page(h, new_page, false);
2816
2817 return ret;
2818 }
2819
2820 int isolate_or_dissolve_huge_page(struct page *page, struct list_head *list)
2821 {
2822 struct hstate *h;
2823 struct page *head;
2824 int ret = -EBUSY;
2825
2826 /*
2827 * The page may have been dissolved or freed from under us, so
2828 * (re)check its state under hugetlb_lock before deciding what
2829 * to do with it.
2830 */
2831 spin_lock_irq(&hugetlb_lock);
2832 if (PageHuge(page)) {
2833 head = compound_head(page);
2834 h = page_hstate(head);
2835 } else {
2836 spin_unlock_irq(&hugetlb_lock);
2837 return 0;
2838 }
2839 spin_unlock_irq(&hugetlb_lock);
2840
2841 /*
2842 * Fence off gigantic pages: there is a circular dependency
2843 * between alloc_contig_range() and gigantic page allocation.
2844 * Returning -ENOMEM makes the caller bail out right away.
2845 */
2846 if (hstate_is_gigantic(h))
2847 return -ENOMEM;
2848
2849 if (page_count(head) && !isolate_hugetlb(head, list))
2850 ret = 0;
2851 else if (!page_count(head))
2852 ret = alloc_and_dissolve_huge_page(h, head, list);
2853
2854 return ret;
2855 }
2856
2857 struct page *alloc_huge_page(struct vm_area_struct *vma,
2858 unsigned long addr, int avoid_reserve)
2859 {
2860 struct hugepage_subpool *spool = subpool_vma(vma);
2861 struct hstate *h = hstate_vma(vma);
2862 struct page *page;
2863 long map_chg, map_commit;
2864 long gbl_chg;
2865 int ret, idx;
2866 struct hugetlb_cgroup *h_cg;
2867 bool deferred_reserve;
2868
2869 idx = hstate_index(h);
2870
2871 /*
2872 * Examine the region/reserve map: a return value of zero means the
2873 * process already has a reservation for this page (no map change).
2874 */
2875 map_chg = gbl_chg = vma_needs_reservation(h, vma, addr);
2876 if (map_chg < 0)
2877 return ERR_PTR(-ENOMEM);
2878
2879 /*
2880 * Processes that did not create the mapping have no reserves, and
2881 * nothing was charged to the subpool on their behalf up front; the
2882 * same is true when avoid_reserve is set.  In those cases a page
2883 * must be charged against the subpool limit now, which may fail if
2884 * the subpool is exhausted.
2885 */
2886 if (map_chg || avoid_reserve) {
2887 gbl_chg = hugepage_subpool_get_pages(spool, 1);
2888 if (gbl_chg < 0) {
2889 vma_end_reservation(h, vma, addr);
2890 return ERR_PTR(-ENOSPC);
2891 }
2892
2893
2894
2895
2896
2897
2898
2899
2900
2901 if (avoid_reserve)
2902 gbl_chg = 1;
2903 }
2904
2905
2906
2907 deferred_reserve = map_chg || avoid_reserve;
2908 if (deferred_reserve) {
2909 ret = hugetlb_cgroup_charge_cgroup_rsvd(
2910 idx, pages_per_huge_page(h), &h_cg);
2911 if (ret)
2912 goto out_subpool_put;
2913 }
2914
2915 ret = hugetlb_cgroup_charge_cgroup(idx, pages_per_huge_page(h), &h_cg);
2916 if (ret)
2917 goto out_uncharge_cgroup_reservation;
2918
2919 spin_lock_irq(&hugetlb_lock);
2920 /*
2921 * gbl_chg tells dequeue_huge_page_vma() whether a page must be
2922 * taken from the global free pool; gbl_chg == 0 means a
2923 * reservation already covers this allocation.
2924 */
2925 page = dequeue_huge_page_vma(h, vma, addr, avoid_reserve, gbl_chg);
2926 if (!page) {
2927 spin_unlock_irq(&hugetlb_lock);
2928 page = alloc_buddy_huge_page_with_mpol(h, vma, addr);
2929 if (!page)
2930 goto out_uncharge_cgroup;
2931 if (!avoid_reserve && vma_has_reserves(vma, gbl_chg)) {
2932 SetHPageRestoreReserve(page);
2933 h->resv_huge_pages--;
2934 }
2935 spin_lock_irq(&hugetlb_lock);
2936 list_add(&page->lru, &h->hugepage_activelist);
2937
2938 }
2939 hugetlb_cgroup_commit_charge(idx, pages_per_huge_page(h), h_cg, page);
2940
2941
2942
2943 if (deferred_reserve) {
2944 hugetlb_cgroup_commit_charge_rsvd(idx, pages_per_huge_page(h),
2945 h_cg, page);
2946 }
2947
2948 spin_unlock_irq(&hugetlb_lock);
2949
2950 hugetlb_set_page_subpool(page, spool);
2951
2952 map_commit = vma_commit_reservation(h, vma, addr);
2953 if (unlikely(map_chg > map_commit)) {
2954 /*
2955 * An entry was added to the reserve map between the
2956 * vma_needs_reservation() call above and the
2957 * vma_commit_reservation() call, i.e. this allocation raced
2958 * with hugetlb_reserve_pages().  Both paths charged the
2959 * subpool for the same page, so give one unit back and adjust
2960 * the global (and, if deferred, the cgroup) reservation
2961 * accounting to match.
2962 */
2963 long rsv_adjust;
2964
2965 rsv_adjust = hugepage_subpool_put_pages(spool, 1);
2966 hugetlb_acct_memory(h, -rsv_adjust);
2967 if (deferred_reserve)
2968 hugetlb_cgroup_uncharge_page_rsvd(hstate_index(h),
2969 pages_per_huge_page(h), page);
2970 }
2971 return page;
2972
2973 out_uncharge_cgroup:
2974 hugetlb_cgroup_uncharge_cgroup(idx, pages_per_huge_page(h), h_cg);
2975 out_uncharge_cgroup_reservation:
2976 if (deferred_reserve)
2977 hugetlb_cgroup_uncharge_cgroup_rsvd(idx, pages_per_huge_page(h),
2978 h_cg);
2979 out_subpool_put:
2980 if (map_chg || avoid_reserve)
2981 hugepage_subpool_put_pages(spool, 1);
2982 vma_end_reservation(h, vma, addr);
2983 return ERR_PTR(-ENOSPC);
2984 }
2985
2986 int alloc_bootmem_huge_page(struct hstate *h, int nid)
2987 __attribute__ ((weak, alias("__alloc_bootmem_huge_page")));
2988 int __alloc_bootmem_huge_page(struct hstate *h, int nid)
2989 {
2990 struct huge_bootmem_page *m = NULL;
2991 int nr_nodes, node;
2992
2993
2994 if (nid != NUMA_NO_NODE) {
2995 m = memblock_alloc_try_nid_raw(huge_page_size(h), huge_page_size(h),
2996 0, MEMBLOCK_ALLOC_ACCESSIBLE, nid);
2997 if (!m)
2998 return 0;
2999 goto found;
3000 }
3001
3002 for_each_node_mask_to_alloc(h, nr_nodes, node, &node_states[N_MEMORY]) {
3003 m = memblock_alloc_try_nid_raw(
3004 huge_page_size(h), huge_page_size(h),
3005 0, MEMBLOCK_ALLOC_ACCESSIBLE, node);
3006
3007
3008
3009
3010
3011 if (!m)
3012 return 0;
3013 goto found;
3014 }
3015
3016 found:
3017
3018 INIT_LIST_HEAD(&m->list);
3019 list_add(&m->list, &huge_boot_pages);
3020 m->hstate = h;
3021 return 1;
3022 }
3023
3024
3025
3026
3027
3028 static void __init gather_bootmem_prealloc(void)
3029 {
3030 struct huge_bootmem_page *m;
3031
3032 list_for_each_entry(m, &huge_boot_pages, list) {
3033 struct page *page = virt_to_page(m);
3034 struct hstate *h = m->hstate;
3035
3036 VM_BUG_ON(!hstate_is_gigantic(h));
3037 WARN_ON(page_count(page) != 1);
3038 if (prep_compound_gigantic_page(page, huge_page_order(h))) {
3039 WARN_ON(PageReserved(page));
3040 prep_new_huge_page(h, page, page_to_nid(page));
3041 put_page(page);
3042 } else {
3043
3044 free_gigantic_page(page, huge_page_order(h));
3045 }
3046
3047
3048
3049
3050
3051
3052 adjust_managed_page_count(page, pages_per_huge_page(h));
3053 cond_resched();
3054 }
3055 }
3056 static void __init hugetlb_hstate_alloc_pages_onenode(struct hstate *h, int nid)
3057 {
3058 unsigned long i;
3059 char buf[32];
3060
3061 for (i = 0; i < h->max_huge_pages_node[nid]; ++i) {
3062 if (hstate_is_gigantic(h)) {
3063 if (!alloc_bootmem_huge_page(h, nid))
3064 break;
3065 } else {
3066 struct page *page;
3067 gfp_t gfp_mask = htlb_alloc_mask(h) | __GFP_THISNODE;
3068
3069 page = alloc_fresh_huge_page(h, gfp_mask, nid,
3070 &node_states[N_MEMORY], NULL);
3071 if (!page)
3072 break;
3073 put_page(page);
3074 }
3075 cond_resched();
3076 }
3077 if (i == h->max_huge_pages_node[nid])
3078 return;
3079
3080 string_get_size(huge_page_size(h), 1, STRING_UNITS_2, buf, 32);
3081 pr_warn("HugeTLB: allocating %u of page size %s failed node%d. Only allocated %lu hugepages.\n",
3082 h->max_huge_pages_node[nid], buf, nid, i);
3083 h->max_huge_pages -= (h->max_huge_pages_node[nid] - i);
3084 h->max_huge_pages_node[nid] = i;
3085 }
3086
3087 static void __init hugetlb_hstate_alloc_pages(struct hstate *h)
3088 {
3089 unsigned long i;
3090 nodemask_t *node_alloc_noretry;
3091 bool node_specific_alloc = false;
3092
3093
3094 if (hstate_is_gigantic(h) && hugetlb_cma_size) {
3095 pr_warn_once("HugeTLB: hugetlb_cma is enabled, skip boot time allocation\n");
3096 return;
3097 }
3098
3099
3100 for_each_online_node(i) {
3101 if (h->max_huge_pages_node[i] > 0) {
3102 hugetlb_hstate_alloc_pages_onenode(h, i);
3103 node_specific_alloc = true;
3104 }
3105 }
3106
3107 if (node_specific_alloc)
3108 return;
3109
3110
3111 if (!hstate_is_gigantic(h)) {
3112
3113
3114
3115
3116
3117
3118 node_alloc_noretry = kmalloc(sizeof(*node_alloc_noretry),
3119 GFP_KERNEL);
3120 } else {
3121
3122 node_alloc_noretry = NULL;
3123 }
3124
3125
3126 if (node_alloc_noretry)
3127 nodes_clear(*node_alloc_noretry);
3128
3129 for (i = 0; i < h->max_huge_pages; ++i) {
3130 if (hstate_is_gigantic(h)) {
3131 if (!alloc_bootmem_huge_page(h, NUMA_NO_NODE))
3132 break;
3133 } else if (!alloc_pool_huge_page(h,
3134 &node_states[N_MEMORY],
3135 node_alloc_noretry))
3136 break;
3137 cond_resched();
3138 }
3139 if (i < h->max_huge_pages) {
3140 char buf[32];
3141
3142 string_get_size(huge_page_size(h), 1, STRING_UNITS_2, buf, 32);
3143 pr_warn("HugeTLB: allocating %lu of page size %s failed. Only allocated %lu hugepages.\n",
3144 h->max_huge_pages, buf, i);
3145 h->max_huge_pages = i;
3146 }
3147 kfree(node_alloc_noretry);
3148 }
3149
3150 static void __init hugetlb_init_hstates(void)
3151 {
3152 struct hstate *h, *h2;
3153
3154 for_each_hstate(h) {
3155
3156 if (!hstate_is_gigantic(h))
3157 hugetlb_hstate_alloc_pages(h);
3158
3159
3160
3161
3162
3163
3164
3165
3166
3167 if (hstate_is_gigantic(h) && !gigantic_page_runtime_supported())
3168 continue;
3169 if (hugetlb_cma_size && h->order <= HUGETLB_PAGE_ORDER)
3170 continue;
3171 for_each_hstate(h2) {
3172 if (h2 == h)
3173 continue;
3174 if (h2->order < h->order &&
3175 h2->order > h->demote_order)
3176 h->demote_order = h2->order;
3177 }
3178 }
3179 }
3180
3181 static void __init report_hugepages(void)
3182 {
3183 struct hstate *h;
3184
3185 for_each_hstate(h) {
3186 char buf[32];
3187
3188 string_get_size(huge_page_size(h), 1, STRING_UNITS_2, buf, 32);
3189 pr_info("HugeTLB: registered %s page size, pre-allocated %ld pages\n",
3190 buf, h->free_huge_pages);
3191 pr_info("HugeTLB: %d KiB vmemmap can be freed for a %s page\n",
3192 hugetlb_vmemmap_optimizable_size(h) / SZ_1K, buf);
3193 }
3194 }
3195
3196 #ifdef CONFIG_HIGHMEM
3197 static void try_to_free_low(struct hstate *h, unsigned long count,
3198 nodemask_t *nodes_allowed)
3199 {
3200 int i;
3201 LIST_HEAD(page_list);
3202
3203 lockdep_assert_held(&hugetlb_lock);
3204 if (hstate_is_gigantic(h))
3205 return;
3206
3207
3208
3209
3210 for_each_node_mask(i, *nodes_allowed) {
3211 struct page *page, *next;
3212 struct list_head *freel = &h->hugepage_freelists[i];
3213 list_for_each_entry_safe(page, next, freel, lru) {
3214 if (count >= h->nr_huge_pages)
3215 goto out;
3216 if (PageHighMem(page))
3217 continue;
3218 remove_hugetlb_page(h, page, false);
3219 list_add(&page->lru, &page_list);
3220 }
3221 }
3222
3223 out:
3224 spin_unlock_irq(&hugetlb_lock);
3225 update_and_free_pages_bulk(h, &page_list);
3226 spin_lock_irq(&hugetlb_lock);
3227 }
3228 #else
3229 static inline void try_to_free_low(struct hstate *h, unsigned long count,
3230 nodemask_t *nodes_allowed)
3231 {
3232 }
3233 #endif
3234
3235
3236
3237
3238
3239
3240 static int adjust_pool_surplus(struct hstate *h, nodemask_t *nodes_allowed,
3241 int delta)
3242 {
3243 int nr_nodes, node;
3244
3245 lockdep_assert_held(&hugetlb_lock);
3246 VM_BUG_ON(delta != -1 && delta != 1);
3247
3248 if (delta < 0) {
3249 for_each_node_mask_to_alloc(h, nr_nodes, node, nodes_allowed) {
3250 if (h->surplus_huge_pages_node[node])
3251 goto found;
3252 }
3253 } else {
3254 for_each_node_mask_to_free(h, nr_nodes, node, nodes_allowed) {
3255 if (h->surplus_huge_pages_node[node] <
3256 h->nr_huge_pages_node[node])
3257 goto found;
3258 }
3259 }
3260 return 0;
3261
3262 found:
3263 h->surplus_huge_pages += delta;
3264 h->surplus_huge_pages_node[node] += delta;
3265 return 1;
3266 }
3267
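/*
 * "Persistent" huge pages are those under administrator control: the
 * total pool minus the surplus pages that only exist to satisfy overcommit.
 */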
3268 #define persistent_huge_pages(h) (h->nr_huge_pages - h->surplus_huge_pages)
3269 static int set_max_huge_pages(struct hstate *h, unsigned long count, int nid,
3270 nodemask_t *nodes_allowed)
3271 {
3272 unsigned long min_count, ret;
3273 struct page *page;
3274 LIST_HEAD(page_list);
3275 NODEMASK_ALLOC(nodemask_t, node_alloc_noretry, GFP_KERNEL);
3276
3277
3278
3279
3280
3281
3282 if (node_alloc_noretry)
3283 nodes_clear(*node_alloc_noretry);
3284 else
3285 return -ENOMEM;
3286
3287
3288
3289
3290
3291 mutex_lock(&h->resize_lock);
3292 flush_free_hpage_work(h);
3293 spin_lock_irq(&hugetlb_lock);
3294
3295
3296
3297
3298
3299
3300
3301 if (nid != NUMA_NO_NODE) {
3302 unsigned long old_count = count;
3303
3304 count += h->nr_huge_pages - h->nr_huge_pages_node[nid];
3305
3306
3307
3308
3309
3310
3311 if (count < old_count)
3312 count = ULONG_MAX;
3313 }
3314
3315
3316
3317
3318
3319
3320
3321
3322 if (hstate_is_gigantic(h) && !IS_ENABLED(CONFIG_CONTIG_ALLOC)) {
3323 if (count > persistent_huge_pages(h)) {
3324 spin_unlock_irq(&hugetlb_lock);
3325 mutex_unlock(&h->resize_lock);
3326 NODEMASK_FREE(node_alloc_noretry);
3327 return -EINVAL;
3328 }
3329
3330 }
3331
3332 /*
3333 * Increase the pool size.
3334 * First take pages out of surplus state, then make up the
3335 * remaining difference by allocating fresh huge pages.
3336 *
3337 * We might race with alloc_surplus_huge_page() here and fail to
3338 * convert a surplus huge page to a normal one.  That is not
3339 * critical; it only means the pool can end up one page larger
3340 * than strictly needed, still within the configured limits.
3341 */
3343 while (h->surplus_huge_pages && count > persistent_huge_pages(h)) {
3344 if (!adjust_pool_surplus(h, nodes_allowed, -1))
3345 break;
3346 }
3347
3348 while (count > persistent_huge_pages(h)) {
3349
3350
3351
3352
3353
3354 spin_unlock_irq(&hugetlb_lock);
3355
3356
3357 cond_resched();
3358
3359 ret = alloc_pool_huge_page(h, nodes_allowed,
3360 node_alloc_noretry);
3361 spin_lock_irq(&hugetlb_lock);
3362 if (!ret)
3363 goto out;
3364
3365
3366 if (signal_pending(current))
3367 goto out;
3368 }
3369
3370 /*
3371 * Decrease the pool size.
3372 * First return free pages to the buddy allocator, being careful
3373 * to keep enough pages around to satisfy existing reservations.
3374 * Then place the remaining excess into surplus state so the pool
3375 * shrinks to the requested size as those pages are freed.
3376 *
3377 * Pages are moved to surplus independent of the overcommit value,
3378 * so the surplus count may temporarily exceed it; the allocation
3379 * side checks the global counters, so the pool will not grow
3380 * further until the sysctls change or the surplus pages fall out
3381 * of use.
3382 *
3383 * min_count is the lowest page count the loop below may reach.
3384 */
3385 min_count = h->resv_huge_pages + h->nr_huge_pages - h->free_huge_pages;
3386 min_count = max(count, min_count);
3387 try_to_free_low(h, min_count, nodes_allowed);
3388
3389
3390
3391
3392 while (min_count < persistent_huge_pages(h)) {
3393 page = remove_pool_huge_page(h, nodes_allowed, 0);
3394 if (!page)
3395 break;
3396
3397 list_add(&page->lru, &page_list);
3398 }
3399
3400 spin_unlock_irq(&hugetlb_lock);
3401 update_and_free_pages_bulk(h, &page_list);
3402 flush_free_hpage_work(h);
3403 spin_lock_irq(&hugetlb_lock);
3404
3405 while (count < persistent_huge_pages(h)) {
3406 if (!adjust_pool_surplus(h, nodes_allowed, 1))
3407 break;
3408 }
3409 out:
3410 h->max_huge_pages = persistent_huge_pages(h);
3411 spin_unlock_irq(&hugetlb_lock);
3412 mutex_unlock(&h->resize_lock);
3413
3414 NODEMASK_FREE(node_alloc_noretry);
3415
3416 return 0;
3417 }
3418
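/*
 * demote_free_huge_page() takes one free huge page off h's free list and
 * splits it into free pages of h->demote_order, updating the counters of
 * both hstates.  Called with hugetlb_lock held; the lock is dropped and
 * retaken around the vmemmap restore and the split itself.
 */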
3419 static int demote_free_huge_page(struct hstate *h, struct page *page)
3420 {
3421 int i, nid = page_to_nid(page);
3422 struct hstate *target_hstate;
3423 struct page *subpage;
3424 int rc = 0;
3425
3426 target_hstate = size_to_hstate(PAGE_SIZE << h->demote_order);
3427
3428 remove_hugetlb_page_for_demote(h, page, false);
3429 spin_unlock_irq(&hugetlb_lock);
3430
3431 rc = hugetlb_vmemmap_restore(h, page);
3432 if (rc) {
3433
3434 spin_lock_irq(&hugetlb_lock);
3435 set_page_refcounted(page);
3436 add_hugetlb_page(h, page, false);
3437 return rc;
3438 }
3439
3440
3441
3442
3443
3444 destroy_compound_hugetlb_page_for_demote(page, huge_page_order(h));
3445
3446
3447
3448
3449
3450
3451
3452
3453
3454 mutex_lock(&target_hstate->resize_lock);
3455 for (i = 0; i < pages_per_huge_page(h);
3456 i += pages_per_huge_page(target_hstate)) {
3457 subpage = nth_page(page, i);
3458 if (hstate_is_gigantic(target_hstate))
3459 prep_compound_gigantic_page_for_demote(subpage,
3460 target_hstate->order);
3461 else
3462 prep_compound_page(subpage, target_hstate->order);
3463 set_page_private(subpage, 0);
3464 set_page_refcounted(subpage);
3465 prep_new_huge_page(target_hstate, subpage, nid);
3466 put_page(subpage);
3467 }
3468 mutex_unlock(&target_hstate->resize_lock);
3469
3470 spin_lock_irq(&hugetlb_lock);
3471
3472
3473
3474
3475
3476 h->max_huge_pages--;
3477 target_hstate->max_huge_pages += pages_per_huge_page(h);
3478
3479 return rc;
3480 }
3481
3482 static int demote_pool_huge_page(struct hstate *h, nodemask_t *nodes_allowed)
3483 __must_hold(&hugetlb_lock)
3484 {
3485 int nr_nodes, node;
3486 struct page *page;
3487
3488 lockdep_assert_held(&hugetlb_lock);
3489
3490
3491 if (!h->demote_order) {
3492 pr_warn("HugeTLB: NULL demote order passed to demote_pool_huge_page.\n");
3493 return -EINVAL;
3494 }
3495
3496 for_each_node_mask_to_free(h, nr_nodes, node, nodes_allowed) {
3497 list_for_each_entry(page, &h->hugepage_freelists[node], lru) {
3498 if (PageHWPoison(page))
3499 continue;
3500
3501 return demote_free_huge_page(h, page);
3502 }
3503 }
3504
3505
3506
3507
3508
3509 return -EBUSY;
3510 }
3511
3512 #define HSTATE_ATTR_RO(_name) \
3513 static struct kobj_attribute _name##_attr = __ATTR_RO(_name)
3514
3515 #define HSTATE_ATTR_WO(_name) \
3516 static struct kobj_attribute _name##_attr = __ATTR_WO(_name)
3517
3518 #define HSTATE_ATTR(_name) \
3519 static struct kobj_attribute _name##_attr = __ATTR_RW(_name)
3520
3521 static struct kobject *hugepages_kobj;
3522 static struct kobject *hstate_kobjs[HUGE_MAX_HSTATE];
3523
3524 static struct hstate *kobj_to_node_hstate(struct kobject *kobj, int *nidp);
3525
3526 static struct hstate *kobj_to_hstate(struct kobject *kobj, int *nidp)
3527 {
3528 int i;
3529
3530 for (i = 0; i < HUGE_MAX_HSTATE; i++)
3531 if (hstate_kobjs[i] == kobj) {
3532 if (nidp)
3533 *nidp = NUMA_NO_NODE;
3534 return &hstates[i];
3535 }
3536
3537 return kobj_to_node_hstate(kobj, nidp);
3538 }
3539
3540 static ssize_t nr_hugepages_show_common(struct kobject *kobj,
3541 struct kobj_attribute *attr, char *buf)
3542 {
3543 struct hstate *h;
3544 unsigned long nr_huge_pages;
3545 int nid;
3546
3547 h = kobj_to_hstate(kobj, &nid);
3548 if (nid == NUMA_NO_NODE)
3549 nr_huge_pages = h->nr_huge_pages;
3550 else
3551 nr_huge_pages = h->nr_huge_pages_node[nid];
3552
3553 return sysfs_emit(buf, "%lu\n", nr_huge_pages);
3554 }
3555
3556 static ssize_t __nr_hugepages_store_common(bool obey_mempolicy,
3557 struct hstate *h, int nid,
3558 unsigned long count, size_t len)
3559 {
3560 int err;
3561 nodemask_t nodes_allowed, *n_mask;
3562
3563 if (hstate_is_gigantic(h) && !gigantic_page_runtime_supported())
3564 return -EINVAL;
3565
3566 if (nid == NUMA_NO_NODE) {
3567
3568
3569
3570 if (!(obey_mempolicy &&
3571 init_nodemask_of_mempolicy(&nodes_allowed)))
3572 n_mask = &node_states[N_MEMORY];
3573 else
3574 n_mask = &nodes_allowed;
3575 } else {
3576
3577
3578
3579
3580 init_nodemask_of_node(&nodes_allowed, nid);
3581 n_mask = &nodes_allowed;
3582 }
3583
3584 err = set_max_huge_pages(h, count, nid, n_mask);
3585
3586 return err ? err : len;
3587 }
3588
3589 static ssize_t nr_hugepages_store_common(bool obey_mempolicy,
3590 struct kobject *kobj, const char *buf,
3591 size_t len)
3592 {
3593 struct hstate *h;
3594 unsigned long count;
3595 int nid;
3596 int err;
3597
3598 err = kstrtoul(buf, 10, &count);
3599 if (err)
3600 return err;
3601
3602 h = kobj_to_hstate(kobj, &nid);
3603 return __nr_hugepages_store_common(obey_mempolicy, h, nid, count, len);
3604 }
3605
3606 static ssize_t nr_hugepages_show(struct kobject *kobj,
3607 struct kobj_attribute *attr, char *buf)
3608 {
3609 return nr_hugepages_show_common(kobj, attr, buf);
3610 }
3611
3612 static ssize_t nr_hugepages_store(struct kobject *kobj,
3613 struct kobj_attribute *attr, const char *buf, size_t len)
3614 {
3615 return nr_hugepages_store_common(false, kobj, buf, len);
3616 }
3617 HSTATE_ATTR(nr_hugepages);
3618
3619 #ifdef CONFIG_NUMA
3620
3621
3622
3623
3624
3625 static ssize_t nr_hugepages_mempolicy_show(struct kobject *kobj,
3626 struct kobj_attribute *attr,
3627 char *buf)
3628 {
3629 return nr_hugepages_show_common(kobj, attr, buf);
3630 }
3631
3632 static ssize_t nr_hugepages_mempolicy_store(struct kobject *kobj,
3633 struct kobj_attribute *attr, const char *buf, size_t len)
3634 {
3635 return nr_hugepages_store_common(true, kobj, buf, len);
3636 }
3637 HSTATE_ATTR(nr_hugepages_mempolicy);
3638 #endif
3639
3640
3641 static ssize_t nr_overcommit_hugepages_show(struct kobject *kobj,
3642 struct kobj_attribute *attr, char *buf)
3643 {
3644 struct hstate *h = kobj_to_hstate(kobj, NULL);
3645 return sysfs_emit(buf, "%lu\n", h->nr_overcommit_huge_pages);
3646 }
3647
3648 static ssize_t nr_overcommit_hugepages_store(struct kobject *kobj,
3649 struct kobj_attribute *attr, const char *buf, size_t count)
3650 {
3651 int err;
3652 unsigned long input;
3653 struct hstate *h = kobj_to_hstate(kobj, NULL);
3654
3655 if (hstate_is_gigantic(h))
3656 return -EINVAL;
3657
3658 err = kstrtoul(buf, 10, &input);
3659 if (err)
3660 return err;
3661
3662 spin_lock_irq(&hugetlb_lock);
3663 h->nr_overcommit_huge_pages = input;
3664 spin_unlock_irq(&hugetlb_lock);
3665
3666 return count;
3667 }
3668 HSTATE_ATTR(nr_overcommit_hugepages);
3669
3670 static ssize_t free_hugepages_show(struct kobject *kobj,
3671 struct kobj_attribute *attr, char *buf)
3672 {
3673 struct hstate *h;
3674 unsigned long free_huge_pages;
3675 int nid;
3676
3677 h = kobj_to_hstate(kobj, &nid);
3678 if (nid == NUMA_NO_NODE)
3679 free_huge_pages = h->free_huge_pages;
3680 else
3681 free_huge_pages = h->free_huge_pages_node[nid];
3682
3683 return sysfs_emit(buf, "%lu\n", free_huge_pages);
3684 }
3685 HSTATE_ATTR_RO(free_hugepages);
3686
3687 static ssize_t resv_hugepages_show(struct kobject *kobj,
3688 struct kobj_attribute *attr, char *buf)
3689 {
3690 struct hstate *h = kobj_to_hstate(kobj, NULL);
3691 return sysfs_emit(buf, "%lu\n", h->resv_huge_pages);
3692 }
3693 HSTATE_ATTR_RO(resv_hugepages);
3694
3695 static ssize_t surplus_hugepages_show(struct kobject *kobj,
3696 struct kobj_attribute *attr, char *buf)
3697 {
3698 struct hstate *h;
3699 unsigned long surplus_huge_pages;
3700 int nid;
3701
3702 h = kobj_to_hstate(kobj, &nid);
3703 if (nid == NUMA_NO_NODE)
3704 surplus_huge_pages = h->surplus_huge_pages;
3705 else
3706 surplus_huge_pages = h->surplus_huge_pages_node[nid];
3707
3708 return sysfs_emit(buf, "%lu\n", surplus_huge_pages);
3709 }
3710 HSTATE_ATTR_RO(surplus_hugepages);
3711
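/*
 * Writing N to the sysfs "demote" attribute attempts to demote up to N
 * free huge pages of this size (restricted to one node for the per-node
 * copy of the attribute) to the size previously selected via "demote_size".
 */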
3712 static ssize_t demote_store(struct kobject *kobj,
3713 struct kobj_attribute *attr, const char *buf, size_t len)
3714 {
3715 unsigned long nr_demote;
3716 unsigned long nr_available;
3717 nodemask_t nodes_allowed, *n_mask;
3718 struct hstate *h;
3719 int err = 0;
3720 int nid;
3721
3722 err = kstrtoul(buf, 10, &nr_demote);
3723 if (err)
3724 return err;
3725 h = kobj_to_hstate(kobj, &nid);
3726
3727 if (nid != NUMA_NO_NODE) {
3728 init_nodemask_of_node(&nodes_allowed, nid);
3729 n_mask = &nodes_allowed;
3730 } else {
3731 n_mask = &node_states[N_MEMORY];
3732 }
3733
3734
3735 mutex_lock(&h->resize_lock);
3736 spin_lock_irq(&hugetlb_lock);
3737
3738 while (nr_demote) {
3739
3740
3741
3742
3743 if (nid != NUMA_NO_NODE)
3744 nr_available = h->free_huge_pages_node[nid];
3745 else
3746 nr_available = h->free_huge_pages;
3747 nr_available -= h->resv_huge_pages;
3748 if (!nr_available)
3749 break;
3750
3751 err = demote_pool_huge_page(h, n_mask);
3752 if (err)
3753 break;
3754
3755 nr_demote--;
3756 }
3757
3758 spin_unlock_irq(&hugetlb_lock);
3759 mutex_unlock(&h->resize_lock);
3760
3761 if (err)
3762 return err;
3763 return len;
3764 }
3765 HSTATE_ATTR_WO(demote);
3766
3767 static ssize_t demote_size_show(struct kobject *kobj,
3768 struct kobj_attribute *attr, char *buf)
3769 {
3770 int nid;
3771 struct hstate *h = kobj_to_hstate(kobj, &nid);
3772 unsigned long demote_size = (PAGE_SIZE << h->demote_order) / SZ_1K;
3773
3774 return sysfs_emit(buf, "%lukB\n", demote_size);
3775 }
3776
3777 static ssize_t demote_size_store(struct kobject *kobj,
3778 struct kobj_attribute *attr,
3779 const char *buf, size_t count)
3780 {
3781 struct hstate *h, *demote_hstate;
3782 unsigned long demote_size;
3783 unsigned int demote_order;
3784 int nid;
3785
3786 demote_size = (unsigned long)memparse(buf, NULL);
3787
3788 demote_hstate = size_to_hstate(demote_size);
3789 if (!demote_hstate)
3790 return -EINVAL;
3791 demote_order = demote_hstate->order;
3792 if (demote_order < HUGETLB_PAGE_ORDER)
3793 return -EINVAL;
3794
3795
3796 h = kobj_to_hstate(kobj, &nid);
3797 if (demote_order >= h->order)
3798 return -EINVAL;
3799
3800
3801 mutex_lock(&h->resize_lock);
3802 h->demote_order = demote_order;
3803 mutex_unlock(&h->resize_lock);
3804
3805 return count;
3806 }
3807 HSTATE_ATTR(demote_size);
3808
3809 static struct attribute *hstate_attrs[] = {
3810 &nr_hugepages_attr.attr,
3811 &nr_overcommit_hugepages_attr.attr,
3812 &free_hugepages_attr.attr,
3813 &resv_hugepages_attr.attr,
3814 &surplus_hugepages_attr.attr,
3815 #ifdef CONFIG_NUMA
3816 &nr_hugepages_mempolicy_attr.attr,
3817 #endif
3818 NULL,
3819 };
3820
3821 static const struct attribute_group hstate_attr_group = {
3822 .attrs = hstate_attrs,
3823 };
3824
3825 static struct attribute *hstate_demote_attrs[] = {
3826 &demote_size_attr.attr,
3827 &demote_attr.attr,
3828 NULL,
3829 };
3830
3831 static const struct attribute_group hstate_demote_attr_group = {
3832 .attrs = hstate_demote_attrs,
3833 };
3834
3835 static int hugetlb_sysfs_add_hstate(struct hstate *h, struct kobject *parent,
3836 struct kobject **hstate_kobjs,
3837 const struct attribute_group *hstate_attr_group)
3838 {
3839 int retval;
3840 int hi = hstate_index(h);
3841
3842 hstate_kobjs[hi] = kobject_create_and_add(h->name, parent);
3843 if (!hstate_kobjs[hi])
3844 return -ENOMEM;
3845
3846 retval = sysfs_create_group(hstate_kobjs[hi], hstate_attr_group);
3847 if (retval) {
3848 kobject_put(hstate_kobjs[hi]);
3849 hstate_kobjs[hi] = NULL;
3850 }
3851
3852 if (h->demote_order) {
3853 if (sysfs_create_group(hstate_kobjs[hi],
3854 &hstate_demote_attr_group))
3855 pr_warn("HugeTLB unable to create demote interfaces for %s\n", h->name);
3856 }
3857
3858 return retval;
3859 }
3860
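/*
 * Create the global sysfs directory (hugepages/ under the mm kobject,
 * i.e. /sys/kernel/mm/hugepages/) with one hugepages-<size>kB
 * subdirectory per hstate.
 */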
3861 static void __init hugetlb_sysfs_init(void)
3862 {
3863 struct hstate *h;
3864 int err;
3865
3866 hugepages_kobj = kobject_create_and_add("hugepages", mm_kobj);
3867 if (!hugepages_kobj)
3868 return;
3869
3870 for_each_hstate(h) {
3871 err = hugetlb_sysfs_add_hstate(h, hugepages_kobj,
3872 hstate_kobjs, &hstate_attr_group);
3873 if (err)
3874 pr_err("HugeTLB: Unable to add hstate %s", h->name);
3875 }
3876 }
3877
3878 #ifdef CONFIG_NUMA
3879
3880
3881
3882
3883
3884
3885
3886
3887 struct node_hstate {
3888 struct kobject *hugepages_kobj;
3889 struct kobject *hstate_kobjs[HUGE_MAX_HSTATE];
3890 };
3891 static struct node_hstate node_hstates[MAX_NUMNODES];
3892
3893
3894
3895
3896 static struct attribute *per_node_hstate_attrs[] = {
3897 &nr_hugepages_attr.attr,
3898 &free_hugepages_attr.attr,
3899 &surplus_hugepages_attr.attr,
3900 NULL,
3901 };
3902
3903 static const struct attribute_group per_node_hstate_attr_group = {
3904 .attrs = per_node_hstate_attrs,
3905 };
3906
3907
3908
3909
3910
3911 static struct hstate *kobj_to_node_hstate(struct kobject *kobj, int *nidp)
3912 {
3913 int nid;
3914
3915 for (nid = 0; nid < nr_node_ids; nid++) {
3916 struct node_hstate *nhs = &node_hstates[nid];
3917 int i;
3918 for (i = 0; i < HUGE_MAX_HSTATE; i++)
3919 if (nhs->hstate_kobjs[i] == kobj) {
3920 if (nidp)
3921 *nidp = nid;
3922 return &hstates[i];
3923 }
3924 }
3925
3926 BUG();
3927 return NULL;
3928 }
3929
3930
3931
3932
3933
3934 static void hugetlb_unregister_node(struct node *node)
3935 {
3936 struct hstate *h;
3937 struct node_hstate *nhs = &node_hstates[node->dev.id];
3938
3939 if (!nhs->hugepages_kobj)
3940 return;
3941
3942 for_each_hstate(h) {
3943 int idx = hstate_index(h);
3944 if (nhs->hstate_kobjs[idx]) {
3945 kobject_put(nhs->hstate_kobjs[idx]);
3946 nhs->hstate_kobjs[idx] = NULL;
3947 }
3948 }
3949
3950 kobject_put(nhs->hugepages_kobj);
3951 nhs->hugepages_kobj = NULL;
3952 }
3953
3954
3955
3956
3957
3958
3959 static void hugetlb_register_node(struct node *node)
3960 {
3961 struct hstate *h;
3962 struct node_hstate *nhs = &node_hstates[node->dev.id];
3963 int err;
3964
3965 if (nhs->hugepages_kobj)
3966 return;
3967
3968 nhs->hugepages_kobj = kobject_create_and_add("hugepages",
3969 &node->dev.kobj);
3970 if (!nhs->hugepages_kobj)
3971 return;
3972
3973 for_each_hstate(h) {
3974 err = hugetlb_sysfs_add_hstate(h, nhs->hugepages_kobj,
3975 nhs->hstate_kobjs,
3976 &per_node_hstate_attr_group);
3977 if (err) {
3978 pr_err("HugeTLB: Unable to add hstate %s for node %d\n",
3979 h->name, node->dev.id);
3980 hugetlb_unregister_node(node);
3981 break;
3982 }
3983 }
3984 }
3985
3986
3987
3988
3989
3990
3991 static void __init hugetlb_register_all_nodes(void)
3992 {
3993 int nid;
3994
3995 for_each_node_state(nid, N_MEMORY) {
3996 struct node *node = node_devices[nid];
3997 if (node->dev.id == nid)
3998 hugetlb_register_node(node);
3999 }
4000
4001
4002
4003
4004
4005 register_hugetlbfs_with_node(hugetlb_register_node,
4006 hugetlb_unregister_node);
4007 }
4008 #else
4009
4010 static struct hstate *kobj_to_node_hstate(struct kobject *kobj, int *nidp)
4011 {
4012 BUG();
4013 if (nidp)
4014 *nidp = -1;
4015 return NULL;
4016 }
4017
4018 static void hugetlb_register_all_nodes(void) { }
4019
4020 #endif
4021
4022 static int __init hugetlb_init(void)
4023 {
4024 int i;
4025
4026 BUILD_BUG_ON(sizeof_field(struct page, private) * BITS_PER_BYTE <
4027 __NR_HPAGEFLAGS);
4028
4029 if (!hugepages_supported()) {
4030 if (hugetlb_max_hstate || default_hstate_max_huge_pages)
4031 pr_warn("HugeTLB: huge pages not supported, ignoring associated command-line parameters\n");
4032 return 0;
4033 }
4034
4035
4036
4037
4038
4039 hugetlb_add_hstate(HUGETLB_PAGE_ORDER);
4040 if (!parsed_default_hugepagesz) {
4041
4042
4043
4044
4045
4046
4047
4048
4049 default_hstate_idx = hstate_index(size_to_hstate(HPAGE_SIZE));
4050 if (default_hstate_max_huge_pages) {
4051 if (default_hstate.max_huge_pages) {
4052 char buf[32];
4053
4054 string_get_size(huge_page_size(&default_hstate),
4055 1, STRING_UNITS_2, buf, 32);
4056 pr_warn("HugeTLB: Ignoring hugepages=%lu associated with %s page size\n",
4057 default_hstate.max_huge_pages, buf);
4058 pr_warn("HugeTLB: Using hugepages=%lu for number of default huge pages\n",
4059 default_hstate_max_huge_pages);
4060 }
4061 default_hstate.max_huge_pages =
4062 default_hstate_max_huge_pages;
4063
4064 for_each_online_node(i)
4065 default_hstate.max_huge_pages_node[i] =
4066 default_hugepages_in_node[i];
4067 }
4068 }
4069
4070 hugetlb_cma_check();
4071 hugetlb_init_hstates();
4072 gather_bootmem_prealloc();
4073 report_hugepages();
4074
4075 hugetlb_sysfs_init();
4076 hugetlb_register_all_nodes();
4077 hugetlb_cgroup_file_init();
4078
4079 #ifdef CONFIG_SMP
4080 num_fault_mutexes = roundup_pow_of_two(8 * num_possible_cpus());
4081 #else
4082 num_fault_mutexes = 1;
4083 #endif
4084 hugetlb_fault_mutex_table =
4085 kmalloc_array(num_fault_mutexes, sizeof(struct mutex),
4086 GFP_KERNEL);
4087 BUG_ON(!hugetlb_fault_mutex_table);
4088
4089 for (i = 0; i < num_fault_mutexes; i++)
4090 mutex_init(&hugetlb_fault_mutex_table[i]);
4091 return 0;
4092 }
4093 subsys_initcall(hugetlb_init);
4094
4095
4096 bool __init __attribute((weak)) arch_hugetlb_valid_size(unsigned long size)
4097 {
4098 return size == HPAGE_SIZE;
4099 }
4100
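/*
 * Register an hstate for pages of size PAGE_SIZE << order.  Called from
 * architecture setup code and from the hugepagesz=/default_hugepagesz=
 * parsing below; duplicate sizes are silently ignored.
 */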
4101 void __init hugetlb_add_hstate(unsigned int order)
4102 {
4103 struct hstate *h;
4104 unsigned long i;
4105
4106 if (size_to_hstate(PAGE_SIZE << order)) {
4107 return;
4108 }
4109 BUG_ON(hugetlb_max_hstate >= HUGE_MAX_HSTATE);
4110 BUG_ON(order == 0);
4111 h = &hstates[hugetlb_max_hstate++];
4112 mutex_init(&h->resize_lock);
4113 h->order = order;
4114 h->mask = ~(huge_page_size(h) - 1);
4115 for (i = 0; i < MAX_NUMNODES; ++i)
4116 INIT_LIST_HEAD(&h->hugepage_freelists[i]);
4117 INIT_LIST_HEAD(&h->hugepage_activelist);
4118 h->next_nid_to_alloc = first_memory_node;
4119 h->next_nid_to_free = first_memory_node;
4120 snprintf(h->name, HSTATE_NAME_LEN, "hugepages-%lukB",
4121 huge_page_size(h)/1024);
4122
4123 parsed_hstate = h;
4124 }
4125
4126 bool __init __weak hugetlb_node_alloc_supported(void)
4127 {
4128 return true;
4129 }
4130
4131 static void __init hugepages_clear_pages_in_node(void)
4132 {
4133 if (!hugetlb_max_hstate) {
4134 default_hstate_max_huge_pages = 0;
4135 memset(default_hugepages_in_node, 0,
4136 MAX_NUMNODES * sizeof(unsigned int));
4137 } else {
4138 parsed_hstate->max_huge_pages = 0;
4139 memset(parsed_hstate->max_huge_pages_node, 0,
4140 MAX_NUMNODES * sizeof(unsigned int));
4141 }
4142 }
4143
4144
4145
4146
4147
4148
4149
4150
4151 static int __init hugepages_setup(char *s)
4152 {
4153 unsigned long *mhp;
4154 static unsigned long *last_mhp;
4155 int node = NUMA_NO_NODE;
4156 int count;
4157 unsigned long tmp;
4158 char *p = s;
4159
4160 if (!parsed_valid_hugepagesz) {
4161 pr_warn("HugeTLB: hugepages=%s does not follow a valid hugepagesz, ignoring\n", s);
4162 parsed_valid_hugepagesz = true;
4163 return 1;
4164 }
4165
4166 /*
4167 * !hugetlb_max_hstate means no hugepagesz= parameter has been
4168 * parsed yet, so this hugepages= value applies to the default
4169 * hstate; otherwise it applies to the most recently parsed
4170 * hugepagesz=/default_hugepagesz= size (parsed_hstate).
4171 */
4172 else if (!hugetlb_max_hstate)
4173 mhp = &default_hstate_max_huge_pages;
4174 else
4175 mhp = &parsed_hstate->max_huge_pages;
4176
4177 if (mhp == last_mhp) {
4178 pr_warn("HugeTLB: hugepages= specified twice without interleaving hugepagesz=, ignoring hugepages=%s\n", s);
4179 return 1;
4180 }
4181
4182 while (*p) {
4183 count = 0;
4184 if (sscanf(p, "%lu%n", &tmp, &count) != 1)
4185 goto invalid;
4186
4187 if (p[count] == ':') {
4188 if (!hugetlb_node_alloc_supported()) {
4189 pr_warn("HugeTLB: architecture can't support node specific alloc, ignoring!\n");
4190 return 1;
4191 }
4192 if (tmp >= MAX_NUMNODES || !node_online(tmp))
4193 goto invalid;
4194 node = array_index_nospec(tmp, MAX_NUMNODES);
4195 p += count + 1;
4196
4197 if (sscanf(p, "%lu%n", &tmp, &count) != 1)
4198 goto invalid;
4199 if (!hugetlb_max_hstate)
4200 default_hugepages_in_node[node] = tmp;
4201 else
4202 parsed_hstate->max_huge_pages_node[node] = tmp;
4203 *mhp += tmp;
4204
4205 if (p[count] == ',')
4206 p += count + 1;
4207 else
4208 break;
4209 } else {
4210 if (p != s)
4211 goto invalid;
4212 *mhp = tmp;
4213 break;
4214 }
4215 }
4216
4217
4218
4219
4220
4221
4222 if (hugetlb_max_hstate && hstate_is_gigantic(parsed_hstate))
4223 hugetlb_hstate_alloc_pages(parsed_hstate);
4224
4225 last_mhp = mhp;
4226
4227 return 1;
4228
4229 invalid:
4230 pr_warn("HugeTLB: Invalid hugepages parameter %s\n", p);
4231 hugepages_clear_pages_in_node();
4232 return 1;
4233 }
4234 __setup("hugepages=", hugepages_setup);
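/*
 * Examples accepted by the parser above:
 *   hugepages=512           - 512 huge pages of the most recently specified
 *                             size (or of the default size if none was given)
 *   hugepages=0:256,1:256   - per-node counts as node:count pairs, where the
 *                             architecture supports node specific allocation
 */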
4235
4236
4237
4238
4239
4240
4241
4242
4243 static int __init hugepagesz_setup(char *s)
4244 {
4245 unsigned long size;
4246 struct hstate *h;
4247
4248 parsed_valid_hugepagesz = false;
4249 size = (unsigned long)memparse(s, NULL);
4250
4251 if (!arch_hugetlb_valid_size(size)) {
4252 pr_err("HugeTLB: unsupported hugepagesz=%s\n", s);
4253 return 1;
4254 }
4255
4256 h = size_to_hstate(size);
4257 if (h) {
4258 /*
4259 * An hstate for this size already exists.  That is normally an
4260 * error (the size was specified twice), but it is allowed when
4261 * the existing hstate is the default hstate and no page count
4262 * has been given for it yet: default_hugepagesz= creates the
4263 * default hstate before any matching hugepagesz= is seen.
4264 */
4265 if (!parsed_default_hugepagesz || h != &default_hstate ||
4266 default_hstate.max_huge_pages) {
4267 pr_warn("HugeTLB: hugepagesz=%s specified twice, ignoring\n", s);
4268 return 1;
4269 }
4270 /*
4271 * No need to call hugetlb_add_hstate() as the hstate already
4272 * exists.  But, do set parsed_hstate so that a following
4273 * hugepages= parameter is associated with this hstate.
4274 */
4276 parsed_hstate = h;
4277 parsed_valid_hugepagesz = true;
4278 return 1;
4279 }
4280
4281 hugetlb_add_hstate(ilog2(size) - PAGE_SHIFT);
4282 parsed_valid_hugepagesz = true;
4283 return 1;
4284 }
4285 __setup("hugepagesz=", hugepagesz_setup);
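/*
 * Example: "hugepagesz=1G hugepages=16" creates an hstate for 1 GiB pages
 * (provided arch_hugetlb_valid_size() accepts that size) and reserves 16
 * of them at boot.
 */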
4286
4287
4288
4289
4290
4291 static int __init default_hugepagesz_setup(char *s)
4292 {
4293 unsigned long size;
4294 int i;
4295
4296 parsed_valid_hugepagesz = false;
4297 if (parsed_default_hugepagesz) {
4298 pr_err("HugeTLB: default_hugepagesz previously specified, ignoring %s\n", s);
4299 return 1;
4300 }
4301
4302 size = (unsigned long)memparse(s, NULL);
4303
4304 if (!arch_hugetlb_valid_size(size)) {
4305 pr_err("HugeTLB: unsupported default_hugepagesz=%s\n", s);
4306 return 1;
4307 }
4308
4309 hugetlb_add_hstate(ilog2(size) - PAGE_SHIFT);
4310 parsed_valid_hugepagesz = true;
4311 parsed_default_hugepagesz = true;
4312 default_hstate_idx = hstate_index(size_to_hstate(size));
4313
4314
4315
4316
4317
4318
4319
4320
4321 if (default_hstate_max_huge_pages) {
4322 default_hstate.max_huge_pages = default_hstate_max_huge_pages;
4323 for_each_online_node(i)
4324 default_hstate.max_huge_pages_node[i] =
4325 default_hugepages_in_node[i];
4326 if (hstate_is_gigantic(&default_hstate))
4327 hugetlb_hstate_alloc_pages(&default_hstate);
4328 default_hstate_max_huge_pages = 0;
4329 }
4330
4331 return 1;
4332 }
4333 __setup("default_hugepagesz=", default_hugepagesz_setup);
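/* Example: "default_hugepagesz=2M" selects 2 MiB as the default huge page size (if valid for the architecture). */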
4334
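/*
 * Count the free huge pages on the nodes permitted by the current cpuset
 * and (if any) the task's memory policy.  Used by hugetlb_acct_memory()
 * below as a best-effort availability check.
 */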
4335 static unsigned int allowed_mems_nr(struct hstate *h)
4336 {
4337 int node;
4338 unsigned int nr = 0;
4339 nodemask_t *mpol_allowed;
4340 unsigned int *array = h->free_huge_pages_node;
4341 gfp_t gfp_mask = htlb_alloc_mask(h);
4342
4343 mpol_allowed = policy_nodemask_current(gfp_mask);
4344
4345 for_each_node_mask(node, cpuset_current_mems_allowed) {
4346 if (!mpol_allowed || node_isset(node, *mpol_allowed))
4347 nr += array[node];
4348 }
4349
4350 return nr;
4351 }
4352
4353 #ifdef CONFIG_SYSCTL
4354 static int proc_hugetlb_doulongvec_minmax(struct ctl_table *table, int write,
4355 void *buffer, size_t *length,
4356 loff_t *ppos, unsigned long *out)
4357 {
4358 struct ctl_table dup_table;
4359
4360
4361
4362
4363
4364 dup_table = *table;
4365 dup_table.data = out;
4366
4367 return proc_doulongvec_minmax(&dup_table, write, buffer, length, ppos);
4368 }
4369
4370 static int hugetlb_sysctl_handler_common(bool obey_mempolicy,
4371 struct ctl_table *table, int write,
4372 void *buffer, size_t *length, loff_t *ppos)
4373 {
4374 struct hstate *h = &default_hstate;
4375 unsigned long tmp = h->max_huge_pages;
4376 int ret;
4377
4378 if (!hugepages_supported())
4379 return -EOPNOTSUPP;
4380
4381 ret = proc_hugetlb_doulongvec_minmax(table, write, buffer, length, ppos,
4382 &tmp);
4383 if (ret)
4384 goto out;
4385
4386 if (write)
4387 ret = __nr_hugepages_store_common(obey_mempolicy, h,
4388 NUMA_NO_NODE, tmp, *length);
4389 out:
4390 return ret;
4391 }
4392
4393 int hugetlb_sysctl_handler(struct ctl_table *table, int write,
4394 void *buffer, size_t *length, loff_t *ppos)
4395 {
4396
4397 return hugetlb_sysctl_handler_common(false, table, write,
4398 buffer, length, ppos);
4399 }
4400
4401 #ifdef CONFIG_NUMA
4402 int hugetlb_mempolicy_sysctl_handler(struct ctl_table *table, int write,
4403 void *buffer, size_t *length, loff_t *ppos)
4404 {
4405 return hugetlb_sysctl_handler_common(true, table, write,
4406 buffer, length, ppos);
4407 }
4408 #endif
4409
4410 int hugetlb_overcommit_handler(struct ctl_table *table, int write,
4411 void *buffer, size_t *length, loff_t *ppos)
4412 {
4413 struct hstate *h = &default_hstate;
4414 unsigned long tmp;
4415 int ret;
4416
4417 if (!hugepages_supported())
4418 return -EOPNOTSUPP;
4419
4420 tmp = h->nr_overcommit_huge_pages;
4421
4422 if (write && hstate_is_gigantic(h))
4423 return -EINVAL;
4424
4425 ret = proc_hugetlb_doulongvec_minmax(table, write, buffer, length, ppos,
4426 &tmp);
4427 if (ret)
4428 goto out;
4429
4430 if (write) {
4431 spin_lock_irq(&hugetlb_lock);
4432 h->nr_overcommit_huge_pages = tmp;
4433 spin_unlock_irq(&hugetlb_lock);
4434 }
4435 out:
4436 return ret;
4437 }
4438
4439 #endif
4440
4441 void hugetlb_report_meminfo(struct seq_file *m)
4442 {
4443 struct hstate *h;
4444 unsigned long total = 0;
4445
4446 if (!hugepages_supported())
4447 return;
4448
4449 for_each_hstate(h) {
4450 unsigned long count = h->nr_huge_pages;
4451
4452 total += huge_page_size(h) * count;
4453
4454 if (h == &default_hstate)
4455 seq_printf(m,
4456 "HugePages_Total: %5lu\n"
4457 "HugePages_Free: %5lu\n"
4458 "HugePages_Rsvd: %5lu\n"
4459 "HugePages_Surp: %5lu\n"
4460 "Hugepagesize: %8lu kB\n",
4461 count,
4462 h->free_huge_pages,
4463 h->resv_huge_pages,
4464 h->surplus_huge_pages,
4465 huge_page_size(h) / SZ_1K);
4466 }
4467
4468 seq_printf(m, "Hugetlb: %8lu kB\n", total / SZ_1K);
4469 }
4470
4471 int hugetlb_report_node_meminfo(char *buf, int len, int nid)
4472 {
4473 struct hstate *h = &default_hstate;
4474
4475 if (!hugepages_supported())
4476 return 0;
4477
4478 return sysfs_emit_at(buf, len,
4479 "Node %d HugePages_Total: %5u\n"
4480 "Node %d HugePages_Free: %5u\n"
4481 "Node %d HugePages_Surp: %5u\n",
4482 nid, h->nr_huge_pages_node[nid],
4483 nid, h->free_huge_pages_node[nid],
4484 nid, h->surplus_huge_pages_node[nid]);
4485 }
4486
4487 void hugetlb_show_meminfo_node(int nid)
4488 {
4489 struct hstate *h;
4490
4491 if (!hugepages_supported())
4492 return;
4493
4494 for_each_hstate(h)
4495 printk("Node %d hugepages_total=%u hugepages_free=%u hugepages_surp=%u hugepages_size=%lukB\n",
4496 nid,
4497 h->nr_huge_pages_node[nid],
4498 h->free_huge_pages_node[nid],
4499 h->surplus_huge_pages_node[nid],
4500 huge_page_size(h) / SZ_1K);
4501 }
4502
4503 void hugetlb_report_usage(struct seq_file *m, struct mm_struct *mm)
4504 {
4505 seq_printf(m, "HugetlbPages:\t%8lu kB\n",
4506 atomic_long_read(&mm->hugetlb_usage) << (PAGE_SHIFT - 10));
4507 }
4508
4509
4510 unsigned long hugetlb_total_pages(void)
4511 {
4512 struct hstate *h;
4513 unsigned long nr_total_pages = 0;
4514
4515 for_each_hstate(h)
4516 nr_total_pages += h->nr_huge_pages * pages_per_huge_page(h);
4517 return nr_total_pages;
4518 }
4519
4520 static int hugetlb_acct_memory(struct hstate *h, long delta)
4521 {
4522 int ret = -ENOMEM;
4523
4524 if (!delta)
4525 return 0;
4526
4527 spin_lock_irq(&hugetlb_lock);
4528 /*
4529 * When cpusets are configured, strict hugetlb page reservation is
4530 * effectively broken: reservations are accounted in global counters,
4531 * while allocation at fault time honours the faulting task's cpuset
4532 * (and memory policy), so a reservation is never checked against what
4533 * is actually available on the permitted nodes.  A task with a
4534 * "reserved" mapping can therefore still get SIGBUS at fault time if
4535 * those nodes run out of free huge pages, and the semantics of shared
4536 * mappings are correspondingly weaker than without cpusets.
4537 *
4538 * To preserve some of the reservation guarantee, the code below at
4539 * least checks the requested increase against the number of huge
4540 * pages currently free on the nodes allowed by the current cpuset and
4541 * memory policy (allowed_mems_nr()), as a best-effort bound.
4542 */
4551 if (delta > 0) {
4552 if (gather_surplus_pages(h, delta) < 0)
4553 goto out;
4554
4555 if (delta > allowed_mems_nr(h)) {
4556 return_unused_surplus_pages(h, delta);
4557 goto out;
4558 }
4559 }
4560
4561 ret = 0;
4562 if (delta < 0)
4563 return_unused_surplus_pages(h, (unsigned long) -delta);
4564
4565 out:
4566 spin_unlock_irq(&hugetlb_lock);
4567 return ret;
4568 }
4569
4570 static void hugetlb_vm_op_open(struct vm_area_struct *vma)
4571 {
4572 struct resv_map *resv = vma_resv_map(vma);
4573
4574
4575
4576
4577
4578
4579
4580
4581
4582 if (resv && is_vma_resv_set(vma, HPAGE_RESV_OWNER)) {
4583 resv_map_dup_hugetlb_cgroup_uncharge_info(resv);
4584 kref_get(&resv->refs);
4585 }
4586 }
4587
4588 static void hugetlb_vm_op_close(struct vm_area_struct *vma)
4589 {
4590 struct hstate *h = hstate_vma(vma);
4591 struct resv_map *resv = vma_resv_map(vma);
4592 struct hugepage_subpool *spool = subpool_vma(vma);
4593 unsigned long reserve, start, end;
4594 long gbl_reserve;
4595
4596 if (!resv || !is_vma_resv_set(vma, HPAGE_RESV_OWNER))
4597 return;
4598
4599 start = vma_hugecache_offset(h, vma, vma->vm_start);
4600 end = vma_hugecache_offset(h, vma, vma->vm_end);
4601
4602 reserve = (end - start) - region_count(resv, start, end);
4603 hugetlb_cgroup_uncharge_counter(resv, start, end);
4604 if (reserve) {
4605
4606
4607
4608
4609 gbl_reserve = hugepage_subpool_put_pages(spool, reserve);
4610 hugetlb_acct_memory(h, -gbl_reserve);
4611 }
4612
4613 kref_put(&resv->refs, resv_map_release);
4614 }
4615
4616 static int hugetlb_vm_op_split(struct vm_area_struct *vma, unsigned long addr)
4617 {
4618 if (addr & ~(huge_page_mask(hstate_vma(vma))))
4619 return -EINVAL;
4620 return 0;
4621 }
4622
4623 static unsigned long hugetlb_vm_op_pagesize(struct vm_area_struct *vma)
4624 {
4625 return huge_page_size(hstate_vma(vma));
4626 }
4627
4628
4629
4630
4631
4632
4633
4634 static vm_fault_t hugetlb_vm_op_fault(struct vm_fault *vmf)
4635 {
4636 BUG();
4637 return 0;
4638 }
4639
4640
4641
4642
4643
4644
4645
4646
4647 const struct vm_operations_struct hugetlb_vm_ops = {
4648 .fault = hugetlb_vm_op_fault,
4649 .open = hugetlb_vm_op_open,
4650 .close = hugetlb_vm_op_close,
4651 .may_split = hugetlb_vm_op_split,
4652 .pagesize = hugetlb_vm_op_pagesize,
4653 };
4654
4655 static pte_t make_huge_pte(struct vm_area_struct *vma, struct page *page,
4656 int writable)
4657 {
4658 pte_t entry;
4659 unsigned int shift = huge_page_shift(hstate_vma(vma));
4660
4661 if (writable) {
4662 entry = huge_pte_mkwrite(huge_pte_mkdirty(mk_huge_pte(page,
4663 vma->vm_page_prot)));
4664 } else {
4665 entry = huge_pte_wrprotect(mk_huge_pte(page,
4666 vma->vm_page_prot));
4667 }
4668 entry = pte_mkyoung(entry);
4669 entry = arch_make_huge_pte(entry, shift, vma->vm_flags);
4670
4671 return entry;
4672 }
4673
4674 static void set_huge_ptep_writable(struct vm_area_struct *vma,
4675 unsigned long address, pte_t *ptep)
4676 {
4677 pte_t entry;
4678
4679 entry = huge_pte_mkwrite(huge_pte_mkdirty(huge_ptep_get(ptep)));
4680 if (huge_ptep_set_access_flags(vma, address, ptep, entry, 1))
4681 update_mmu_cache(vma, address, ptep);
4682 }
4683
4684 bool is_hugetlb_entry_migration(pte_t pte)
4685 {
4686 swp_entry_t swp;
4687
4688 if (huge_pte_none(pte) || pte_present(pte))
4689 return false;
4690 swp = pte_to_swp_entry(pte);
4691 if (is_migration_entry(swp))
4692 return true;
4693 else
4694 return false;
4695 }
4696
4697 static bool is_hugetlb_entry_hwpoisoned(pte_t pte)
4698 {
4699 swp_entry_t swp;
4700
4701 if (huge_pte_none(pte) || pte_present(pte))
4702 return false;
4703 swp = pte_to_swp_entry(pte);
4704 if (is_hwpoison_entry(swp))
4705 return true;
4706 else
4707 return false;
4708 }
4709
4710 static void
4711 hugetlb_install_page(struct vm_area_struct *vma, pte_t *ptep, unsigned long addr,
4712 struct page *new_page)
4713 {
4714 __SetPageUptodate(new_page);
4715 hugepage_add_new_anon_rmap(new_page, vma, addr);
4716 set_huge_pte_at(vma->vm_mm, addr, ptep, make_huge_pte(vma, new_page, 1));
4717 hugetlb_count_add(pages_per_huge_page(hstate_vma(vma)), vma->vm_mm);
4718 ClearHPageRestoreReserve(new_page);
4719 SetHPageMigratable(new_page);
4720 }
4721
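/*
 * Copy src_vma's huge page table entries into dst_vma (the fork path).
 * For copy-on-write private mappings the parent's PTEs are write
 * protected as well, and anonymous pages that cannot simply be duplicated
 * (page_try_dup_anon_rmap() fails) are copied into freshly allocated huge
 * pages for the child.
 */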
4722 int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src,
4723 struct vm_area_struct *dst_vma,
4724 struct vm_area_struct *src_vma)
4725 {
4726 pte_t *src_pte, *dst_pte, entry, dst_entry;
4727 struct page *ptepage;
4728 unsigned long addr;
4729 bool cow = is_cow_mapping(src_vma->vm_flags);
4730 struct hstate *h = hstate_vma(src_vma);
4731 unsigned long sz = huge_page_size(h);
4732 unsigned long npages = pages_per_huge_page(h);
4733 struct address_space *mapping = src_vma->vm_file->f_mapping;
4734 struct mmu_notifier_range range;
4735 unsigned long last_addr_mask;
4736 int ret = 0;
4737
4738 if (cow) {
4739 mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, src_vma, src,
4740 src_vma->vm_start,
4741 src_vma->vm_end);
4742 mmu_notifier_invalidate_range_start(&range);
4743 mmap_assert_write_locked(src);
4744 raw_write_seqcount_begin(&src->write_protect_seq);
4745 } else {
4746
4747
4748
4749
4750
4751
4752 i_mmap_lock_read(mapping);
4753 }
4754
4755 last_addr_mask = hugetlb_mask_last_page(h);
4756 for (addr = src_vma->vm_start; addr < src_vma->vm_end; addr += sz) {
4757 spinlock_t *src_ptl, *dst_ptl;
4758 src_pte = huge_pte_offset(src, addr, sz);
4759 if (!src_pte) {
4760 addr |= last_addr_mask;
4761 continue;
4762 }
4763 dst_pte = huge_pte_alloc(dst, dst_vma, addr, sz);
4764 if (!dst_pte) {
4765 ret = -ENOMEM;
4766 break;
4767 }
4768
4769
4770
4771
4772
4773
4774
4775
4776
4777
4778 dst_entry = huge_ptep_get(dst_pte);
4779 if ((dst_pte == src_pte) || !huge_pte_none(dst_entry)) {
4780 addr |= last_addr_mask;
4781 continue;
4782 }
4783
4784 dst_ptl = huge_pte_lock(h, dst, dst_pte);
4785 src_ptl = huge_pte_lockptr(h, src, src_pte);
4786 spin_lock_nested(src_ptl, SINGLE_DEPTH_NESTING);
4787 entry = huge_ptep_get(src_pte);
4788 dst_entry = huge_ptep_get(dst_pte);
4789 again:
4790 if (huge_pte_none(entry) || !huge_pte_none(dst_entry)) {
4791
4792
4793
4794
4795
4796 ;
4797 } else if (unlikely(is_hugetlb_entry_hwpoisoned(entry))) {
4798 bool uffd_wp = huge_pte_uffd_wp(entry);
4799
4800 if (!userfaultfd_wp(dst_vma) && uffd_wp)
4801 entry = huge_pte_clear_uffd_wp(entry);
4802 set_huge_pte_at(dst, addr, dst_pte, entry);
4803 } else if (unlikely(is_hugetlb_entry_migration(entry))) {
4804 swp_entry_t swp_entry = pte_to_swp_entry(entry);
4805 bool uffd_wp = huge_pte_uffd_wp(entry);
4806
4807 if (!is_readable_migration_entry(swp_entry) && cow) {
4808
4809
4810
4811
4812 swp_entry = make_readable_migration_entry(
4813 swp_offset(swp_entry));
4814 entry = swp_entry_to_pte(swp_entry);
4815 if (userfaultfd_wp(src_vma) && uffd_wp)
4816 entry = huge_pte_mkuffd_wp(entry);
4817 set_huge_pte_at(src, addr, src_pte, entry);
4818 }
4819 if (!userfaultfd_wp(dst_vma) && uffd_wp)
4820 entry = huge_pte_clear_uffd_wp(entry);
4821 set_huge_pte_at(dst, addr, dst_pte, entry);
4822 } else if (unlikely(is_pte_marker(entry))) {
4823
4824
4825
4826
4827 if (userfaultfd_wp(dst_vma))
4828 set_huge_pte_at(dst, addr, dst_pte, entry);
4829 } else {
4830 entry = huge_ptep_get(src_pte);
4831 ptepage = pte_page(entry);
4832 get_page(ptepage);
4833
4834
4835
4836
4837
4838
4839
4840
4841
4842
4843
4844 if (!PageAnon(ptepage)) {
4845 page_dup_file_rmap(ptepage, true);
4846 } else if (page_try_dup_anon_rmap(ptepage, true,
4847 src_vma)) {
4848 pte_t src_pte_old = entry;
4849 struct page *new;
4850
4851 spin_unlock(src_ptl);
4852 spin_unlock(dst_ptl);
4853
4854 new = alloc_huge_page(dst_vma, addr, 1);
4855 if (IS_ERR(new)) {
4856 put_page(ptepage);
4857 ret = PTR_ERR(new);
4858 break;
4859 }
4860 copy_user_huge_page(new, ptepage, addr, dst_vma,
4861 npages);
4862 put_page(ptepage);
4863
4864
4865 dst_ptl = huge_pte_lock(h, dst, dst_pte);
4866 src_ptl = huge_pte_lockptr(h, src, src_pte);
4867 spin_lock_nested(src_ptl, SINGLE_DEPTH_NESTING);
4868 entry = huge_ptep_get(src_pte);
4869 if (!pte_same(src_pte_old, entry)) {
4870 restore_reserve_on_error(h, dst_vma, addr,
4871 new);
4872 put_page(new);
4873
4874 goto again;
4875 }
4876 hugetlb_install_page(dst_vma, dst_pte, addr, new);
4877 spin_unlock(src_ptl);
4878 spin_unlock(dst_ptl);
4879 continue;
4880 }
4881
4882 if (cow) {
4883
4884
4885
4886
4887
4888
4889
4890 huge_ptep_set_wrprotect(src, addr, src_pte);
4891 entry = huge_pte_wrprotect(entry);
4892 }
4893
4894 set_huge_pte_at(dst, addr, dst_pte, entry);
4895 hugetlb_count_add(npages, dst);
4896 }
4897 spin_unlock(src_ptl);
4898 spin_unlock(dst_ptl);
4899 }
4900
4901 if (cow) {
4902 raw_write_seqcount_end(&src->write_protect_seq);
4903 mmu_notifier_invalidate_range_end(&range);
4904 } else {
4905 i_mmap_unlock_read(mapping);
4906 }
4907
4908 return ret;
4909 }
4910
4911 static void move_huge_pte(struct vm_area_struct *vma, unsigned long old_addr,
4912 unsigned long new_addr, pte_t *src_pte, pte_t *dst_pte)
4913 {
4914 struct hstate *h = hstate_vma(vma);
4915 struct mm_struct *mm = vma->vm_mm;
4916 spinlock_t *src_ptl, *dst_ptl;
4917 pte_t pte;
4918
4919 dst_ptl = huge_pte_lock(h, mm, dst_pte);
4920 src_ptl = huge_pte_lockptr(h, mm, src_pte);
4921
4922
4923
4924
4925
4926 if (src_ptl != dst_ptl)
4927 spin_lock_nested(src_ptl, SINGLE_DEPTH_NESTING);
4928
4929 pte = huge_ptep_get_and_clear(mm, old_addr, src_pte);
4930 set_huge_pte_at(mm, new_addr, dst_pte, pte);
4931
4932 if (src_ptl != dst_ptl)
4933 spin_unlock(src_ptl);
4934 spin_unlock(dst_ptl);
4935 }
4936
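/*
 * mremap() support: move the huge PTEs covering [old_addr, old_addr + len)
 * over to new_addr, unsharing any shared PMDs that are encountered and
 * flushing the TLB for the affected range.  Returns the number of bytes
 * actually moved.
 */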
4937 int move_hugetlb_page_tables(struct vm_area_struct *vma,
4938 struct vm_area_struct *new_vma,
4939 unsigned long old_addr, unsigned long new_addr,
4940 unsigned long len)
4941 {
4942 struct hstate *h = hstate_vma(vma);
4943 struct address_space *mapping = vma->vm_file->f_mapping;
4944 unsigned long sz = huge_page_size(h);
4945 struct mm_struct *mm = vma->vm_mm;
4946 unsigned long old_end = old_addr + len;
4947 unsigned long last_addr_mask;
4948 pte_t *src_pte, *dst_pte;
4949 struct mmu_notifier_range range;
4950 bool shared_pmd = false;
4951
4952 mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, mm, old_addr,
4953 old_end);
4954 adjust_range_if_pmd_sharing_possible(vma, &range.start, &range.end);
4955
4956
4957
4958
4959 flush_cache_range(vma, range.start, range.end);
4960
4961 mmu_notifier_invalidate_range_start(&range);
4962 last_addr_mask = hugetlb_mask_last_page(h);
4963
4964 i_mmap_lock_write(mapping);
4965 for (; old_addr < old_end; old_addr += sz, new_addr += sz) {
4966 src_pte = huge_pte_offset(mm, old_addr, sz);
4967 if (!src_pte) {
4968 old_addr |= last_addr_mask;
4969 new_addr |= last_addr_mask;
4970 continue;
4971 }
4972 if (huge_pte_none(huge_ptep_get(src_pte)))
4973 continue;
4974
4975 if (huge_pmd_unshare(mm, vma, old_addr, src_pte)) {
4976 shared_pmd = true;
4977 old_addr |= last_addr_mask;
4978 new_addr |= last_addr_mask;
4979 continue;
4980 }
4981
4982 dst_pte = huge_pte_alloc(mm, new_vma, new_addr, sz);
4983 if (!dst_pte)
4984 break;
4985
4986 move_huge_pte(vma, old_addr, new_addr, src_pte, dst_pte);
4987 }
4988
4989 if (shared_pmd)
4990 flush_tlb_range(vma, range.start, range.end);
4991 else
4992 flush_tlb_range(vma, old_end - len, old_end);
4993 mmu_notifier_invalidate_range_end(&range);
4994 i_mmap_unlock_write(mapping);
4995
4996 return len + old_addr - old_end;
4997 }
4998
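/*
 * Tear down the huge PTEs in [start, end) of @vma.  If @ref_page is
 * non-NULL, only mappings of that specific page are removed (as done by
 * unmap_ref_private()).  Unsharing of shared PMDs is handled here as well
 * and requires an explicit TLB flush (force_flush) once the walk finishes.
 */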
4999 static void __unmap_hugepage_range(struct mmu_gather *tlb, struct vm_area_struct *vma,
5000 unsigned long start, unsigned long end,
5001 struct page *ref_page, zap_flags_t zap_flags)
5002 {
5003 struct mm_struct *mm = vma->vm_mm;
5004 unsigned long address;
5005 pte_t *ptep;
5006 pte_t pte;
5007 spinlock_t *ptl;
5008 struct page *page;
5009 struct hstate *h = hstate_vma(vma);
5010 unsigned long sz = huge_page_size(h);
5011 struct mmu_notifier_range range;
5012 unsigned long last_addr_mask;
5013 bool force_flush = false;
5014
5015 WARN_ON(!is_vm_hugetlb_page(vma));
5016 BUG_ON(start & ~huge_page_mask(h));
5017 BUG_ON(end & ~huge_page_mask(h));
5018
5019
5020
5021
5022
5023 tlb_change_page_size(tlb, sz);
5024 tlb_start_vma(tlb, vma);
5025
5026
5027
5028
5029 mmu_notifier_range_init(&range, MMU_NOTIFY_UNMAP, 0, vma, mm, start,
5030 end);
5031 adjust_range_if_pmd_sharing_possible(vma, &range.start, &range.end);
5032 mmu_notifier_invalidate_range_start(&range);
5033 last_addr_mask = hugetlb_mask_last_page(h);
5034 address = start;
5035 for (; address < end; address += sz) {
5036 ptep = huge_pte_offset(mm, address, sz);
5037 if (!ptep) {
5038 address |= last_addr_mask;
5039 continue;
5040 }
5041
5042 ptl = huge_pte_lock(h, mm, ptep);
5043 if (huge_pmd_unshare(mm, vma, address, ptep)) {
5044 spin_unlock(ptl);
5045 tlb_flush_pmd_range(tlb, address & PUD_MASK, PUD_SIZE);
5046 force_flush = true;
5047 address |= last_addr_mask;
5048 continue;
5049 }
5050
5051 pte = huge_ptep_get(ptep);
5052 if (huge_pte_none(pte)) {
5053 spin_unlock(ptl);
5054 continue;
5055 }
5056
5057
5058
5059
5060
5061 if (unlikely(!pte_present(pte))) {
5062
5063
5064
5065
5066
5067
5068 if (pte_swp_uffd_wp_any(pte) &&
5069 !(zap_flags & ZAP_FLAG_DROP_MARKER))
5070 set_huge_pte_at(mm, address, ptep,
5071 make_pte_marker(PTE_MARKER_UFFD_WP));
5072 else
5073 huge_pte_clear(mm, address, ptep, sz);
5074 spin_unlock(ptl);
5075 continue;
5076 }
5077
5078 page = pte_page(pte);
5079
5080
5081
5082
5083
5084 if (ref_page) {
5085 if (page != ref_page) {
5086 spin_unlock(ptl);
5087 continue;
5088 }
5089
5090
5091
5092
5093
5094 set_vma_resv_flags(vma, HPAGE_RESV_UNMAPPED);
5095 }
5096
5097 pte = huge_ptep_get_and_clear(mm, address, ptep);
5098 tlb_remove_huge_tlb_entry(h, tlb, ptep, address);
5099 if (huge_pte_dirty(pte))
5100 set_page_dirty(page);
5101
5102 if (huge_pte_uffd_wp(pte) &&
5103 !(zap_flags & ZAP_FLAG_DROP_MARKER))
5104 set_huge_pte_at(mm, address, ptep,
5105 make_pte_marker(PTE_MARKER_UFFD_WP));
5106 hugetlb_count_sub(pages_per_huge_page(h), mm);
5107 page_remove_rmap(page, vma, true);
5108
5109 spin_unlock(ptl);
5110 tlb_remove_page_size(tlb, page, huge_page_size(h));
5111
5112
5113
5114 if (ref_page)
5115 break;
5116 }
5117 mmu_notifier_invalidate_range_end(&range);
5118 tlb_end_vma(tlb, vma);
5119
5120
5121
5122
5123
5124
5125
5126
5127
5128
5129
5130
5131
5132
5133 if (force_flush)
5134 tlb_flush_mmu_tlbonly(tlb);
5135 }
5136
5137 void __unmap_hugepage_range_final(struct mmu_gather *tlb,
5138 struct vm_area_struct *vma, unsigned long start,
5139 unsigned long end, struct page *ref_page,
5140 zap_flags_t zap_flags)
5141 {
5142 __unmap_hugepage_range(tlb, vma, start, end, ref_page, zap_flags);
5143
5144
5145
5146
5147
5148
5149
5150
5151
5152
5153
5154 vma->vm_flags &= ~VM_MAYSHARE;
5155 }
5156
5157 void unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start,
5158 unsigned long end, struct page *ref_page,
5159 zap_flags_t zap_flags)
5160 {
5161 struct mmu_gather tlb;
5162
5163 tlb_gather_mmu(&tlb, vma->vm_mm);
5164 __unmap_hugepage_range(&tlb, vma, start, end, ref_page, zap_flags);
5165 tlb_finish_mmu(&tlb);
5166 }
5167
5168
5169
5170
5171
5172
5173
5174 static void unmap_ref_private(struct mm_struct *mm, struct vm_area_struct *vma,
5175 struct page *page, unsigned long address)
5176 {
5177 struct hstate *h = hstate_vma(vma);
5178 struct vm_area_struct *iter_vma;
5179 struct address_space *mapping;
5180 pgoff_t pgoff;
5181
5182
5183
5184
5185
5186 address = address & huge_page_mask(h);
5187 pgoff = ((address - vma->vm_start) >> PAGE_SHIFT) +
5188 vma->vm_pgoff;
5189 mapping = vma->vm_file->f_mapping;
5190
5191
5192
5193
5194
5195
5196 i_mmap_lock_write(mapping);
5197 vma_interval_tree_foreach(iter_vma, &mapping->i_mmap, pgoff, pgoff) {
5198
5199 if (iter_vma == vma)
5200 continue;
5201
5202
5203
5204
5205
5206
5207 if (iter_vma->vm_flags & VM_MAYSHARE)
5208 continue;
5209
5210
5211
5212
5213
5214
5215
5216
5217 if (!is_vma_resv_set(iter_vma, HPAGE_RESV_OWNER))
5218 unmap_hugepage_range(iter_vma, address,
5219 address + huge_page_size(h), page, 0);
5220 }
5221 i_mmap_unlock_write(mapping);
5222 }
5223
5224
5225
5226
5227
5228
5229
5230 static vm_fault_t hugetlb_wp(struct mm_struct *mm, struct vm_area_struct *vma,
5231 unsigned long address, pte_t *ptep, unsigned int flags,
5232 struct page *pagecache_page, spinlock_t *ptl)
5233 {
5234 const bool unshare = flags & FAULT_FLAG_UNSHARE;
5235 pte_t pte;
5236 struct hstate *h = hstate_vma(vma);
5237 struct page *old_page, *new_page;
5238 int outside_reserve = 0;
5239 vm_fault_t ret = 0;
5240 unsigned long haddr = address & huge_page_mask(h);
5241 struct mmu_notifier_range range;
5242
5243 VM_BUG_ON(unshare && (flags & FOLL_WRITE));
5244 VM_BUG_ON(!unshare && !(flags & FOLL_WRITE));
5245
5246
5247
5248
5249
5250 if (WARN_ON_ONCE(!unshare && !(vma->vm_flags & VM_WRITE)))
5251 return VM_FAULT_SIGSEGV;
5252
5253
5254 if (vma->vm_flags & VM_MAYSHARE) {
5255 if (unlikely(unshare))
5256 return 0;
5257 set_huge_ptep_writable(vma, haddr, ptep);
5258 return 0;
5259 }
5260
5261 pte = huge_ptep_get(ptep);
5262 old_page = pte_page(pte);
5263
5264 delayacct_wpcopy_start();
5265
5266 retry_avoidcopy:
5267
5268
5269
5270
5271 if (page_mapcount(old_page) == 1 && PageAnon(old_page)) {
5272 if (!PageAnonExclusive(old_page))
5273 page_move_anon_rmap(old_page, vma);
5274 if (likely(!unshare))
5275 set_huge_ptep_writable(vma, haddr, ptep);
5276
5277 delayacct_wpcopy_end();
5278 return 0;
5279 }
5280 VM_BUG_ON_PAGE(PageAnon(old_page) && PageAnonExclusive(old_page),
5281 old_page);
5282
/*
 * If the owner of a MAP_PRIVATE reservation is the one doing the copy,
 * try to satisfy the allocation without touching the remaining
 * reserves.  The page-cache page is used to tell whether the
 * reservation for this address has already been consumed.
 */
5292 if (is_vma_resv_set(vma, HPAGE_RESV_OWNER) &&
5293 old_page != pagecache_page)
5294 outside_reserve = 1;
5295
5296 get_page(old_page);
5297
5298
5299
5300
5301
5302 spin_unlock(ptl);
5303 new_page = alloc_huge_page(vma, haddr, outside_reserve);
5304
5305 if (IS_ERR(new_page)) {
/*
 * Allocation failed.  If the faulting task owns the reservation, the
 * failure is caused by references held by other private mappings;
 * unmap the page from them so the owner can reuse it and retry.
 */
5313 if (outside_reserve) {
5314 struct address_space *mapping = vma->vm_file->f_mapping;
5315 pgoff_t idx;
5316 u32 hash;
5317
5318 put_page(old_page);
5319 BUG_ON(huge_pte_none(pte));
5320
/*
 * Drop the hugetlb fault mutex and i_mmap_rwsem before unmapping the
 * page from the other mappings; unmap_ref_private() needs to take
 * i_mmap_rwsem in write mode.  Both locks are re-taken below before
 * the PTE is re-checked.
 */
5329 idx = vma_hugecache_offset(h, vma, haddr);
5330 hash = hugetlb_fault_mutex_hash(mapping, idx);
5331 mutex_unlock(&hugetlb_fault_mutex_table[hash]);
5332 i_mmap_unlock_read(mapping);
5333
5334 unmap_ref_private(mm, vma, old_page, haddr);
5335
5336 i_mmap_lock_read(mapping);
5337 mutex_lock(&hugetlb_fault_mutex_table[hash]);
5338 spin_lock(ptl);
5339 ptep = huge_pte_offset(mm, haddr, huge_page_size(h));
5340 if (likely(ptep &&
5341 pte_same(huge_ptep_get(ptep), pte)))
5342 goto retry_avoidcopy;
5343
5344
5345
5346
5347 delayacct_wpcopy_end();
5348 return 0;
5349 }
5350
5351 ret = vmf_error(PTR_ERR(new_page));
5352 goto out_release_old;
5353 }
5354
5355
5356
5357
5358
5359 if (unlikely(anon_vma_prepare(vma))) {
5360 ret = VM_FAULT_OOM;
5361 goto out_release_all;
5362 }
5363
5364 copy_user_huge_page(new_page, old_page, address, vma,
5365 pages_per_huge_page(h));
5366 __SetPageUptodate(new_page);
5367
5368 mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, mm, haddr,
5369 haddr + huge_page_size(h));
5370 mmu_notifier_invalidate_range_start(&range);
5371
5372
5373
5374
5375
5376 spin_lock(ptl);
5377 ptep = huge_pte_offset(mm, haddr, huge_page_size(h));
5378 if (likely(ptep && pte_same(huge_ptep_get(ptep), pte))) {
5379 ClearHPageRestoreReserve(new_page);
5380
5381
5382 huge_ptep_clear_flush(vma, haddr, ptep);
5383 mmu_notifier_invalidate_range(mm, range.start, range.end);
5384 page_remove_rmap(old_page, vma, true);
5385 hugepage_add_new_anon_rmap(new_page, vma, haddr);
5386 set_huge_pte_at(mm, haddr, ptep,
5387 make_huge_pte(vma, new_page, !unshare));
5388 SetHPageMigratable(new_page);
5389
5390 new_page = old_page;
5391 }
5392 spin_unlock(ptl);
5393 mmu_notifier_invalidate_range_end(&range);
5394 out_release_all:
5395
5396
5397
5398
5399 if (new_page != old_page)
5400 restore_reserve_on_error(h, vma, haddr, new_page);
5401 put_page(new_page);
5402 out_release_old:
5403 put_page(old_page);
5404
5405 spin_lock(ptl);
5406
5407 delayacct_wpcopy_end();
5408 return ret;
5409 }
5410
5411
5412 static struct page *hugetlbfs_pagecache_page(struct hstate *h,
5413 struct vm_area_struct *vma, unsigned long address)
5414 {
5415 struct address_space *mapping;
5416 pgoff_t idx;
5417
5418 mapping = vma->vm_file->f_mapping;
5419 idx = vma_hugecache_offset(h, vma, address);
5420
5421 return find_lock_page(mapping, idx);
5422 }
5423
/*
 * Return whether a page-cache page backs @address in @vma.  Only a reference
 * is taken and dropped; the page is not locked, so this can be used where
 * sleeping on the page lock is not allowed.
 */
5428 static bool hugetlbfs_pagecache_present(struct hstate *h,
5429 struct vm_area_struct *vma, unsigned long address)
5430 {
5431 struct address_space *mapping;
5432 pgoff_t idx;
5433 struct page *page;
5434
5435 mapping = vma->vm_file->f_mapping;
5436 idx = vma_hugecache_offset(h, vma, address);
5437
5438 page = find_get_page(mapping, idx);
5439 if (page)
5440 put_page(page);
5441 return page != NULL;
5442 }
5443
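/*
 * Insert a freshly allocated huge page into the hugetlbfs page cache at
 * index @idx and charge the inode for its blocks.  The folio is returned
 * locked; on failure the lock is dropped and the error is returned.
 */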
5444 int huge_add_to_page_cache(struct page *page, struct address_space *mapping,
5445 pgoff_t idx)
5446 {
5447 struct folio *folio = page_folio(page);
5448 struct inode *inode = mapping->host;
5449 struct hstate *h = hstate_inode(inode);
5450 int err;
5451
5452 __folio_set_locked(folio);
5453 err = __filemap_add_folio(mapping, folio, idx, GFP_KERNEL, NULL);
5454
5455 if (unlikely(err)) {
5456 __folio_clear_locked(folio);
5457 return err;
5458 }
5459 ClearHPageRestoreReserve(page);
5460
/*
 * Mark the folio dirty so that generic reclaim/truncation paths never
 * quietly drop it; hugetlbfs removes its pages itself.
 */
5465 folio_mark_dirty(folio);
5466
5467 spin_lock(&inode->i_lock);
5468 inode->i_blocks += blocks_per_huge_page(h);
5469 spin_unlock(&inode->i_lock);
5470 return 0;
5471 }
5472
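/*
 * Hand a fault in a userfaultfd-registered range over to userspace.  The
 * hugetlb fault mutex and i_mmap_rwsem are dropped around handle_userfault()
 * and re-taken before returning, so the caller's locking state is unchanged.
 */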
5473 static inline vm_fault_t hugetlb_handle_userfault(struct vm_area_struct *vma,
5474 struct address_space *mapping,
5475 pgoff_t idx,
5476 unsigned int flags,
5477 unsigned long haddr,
5478 unsigned long addr,
5479 unsigned long reason)
5480 {
5481 vm_fault_t ret;
5482 u32 hash;
5483 struct vm_fault vmf = {
5484 .vma = vma,
5485 .address = haddr,
5486 .real_address = addr,
5487 .flags = flags,
5488
5489
5490
5491
5492
5493
5494
5495
5496 };
5497
/*
 * The hugetlb fault mutex and i_mmap_rwsem must be dropped before
 * handing the fault to userspace; re-take them afterwards so the
 * caller sees the same locking state.
 */
5503 hash = hugetlb_fault_mutex_hash(mapping, idx);
5504 mutex_unlock(&hugetlb_fault_mutex_table[hash]);
5505 i_mmap_unlock_read(mapping);
5506 ret = handle_userfault(&vmf, reason);
5507 i_mmap_lock_read(mapping);
5508 mutex_lock(&hugetlb_fault_mutex_table[hash]);
5509
5510 return ret;
5511 }
5512
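/*
 * Handle a fault on a huge PTE that is (mostly) none: look the page up in
 * the page cache or allocate a new one, optionally hand the fault to
 * userfaultfd, and install the new mapping under the page-table lock.
 */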
5513 static vm_fault_t hugetlb_no_page(struct mm_struct *mm,
5514 struct vm_area_struct *vma,
5515 struct address_space *mapping, pgoff_t idx,
5516 unsigned long address, pte_t *ptep,
5517 pte_t old_pte, unsigned int flags)
5518 {
5519 struct hstate *h = hstate_vma(vma);
5520 vm_fault_t ret = VM_FAULT_SIGBUS;
5521 int anon_rmap = 0;
5522 unsigned long size;
5523 struct page *page;
5524 pte_t new_pte;
5525 spinlock_t *ptl;
5526 unsigned long haddr = address & huge_page_mask(h);
5527 bool new_page, new_pagecache_page = false;
5528
/*
 * The reservation owner unmapped this page after a failed COW (see
 * unmap_ref_private()).  There is currently no better option than
 * failing the fault, which kills the task, so warn about it.
 */
5535 if (is_vma_resv_set(vma, HPAGE_RESV_UNMAPPED)) {
5536 pr_warn_ratelimited("PID %d killed due to inadequate hugepage pool\n",
5537 current->pid);
5538 return ret;
5539 }
5540
/*
 * Check i_size once and fail faults beyond end of file; truncation
 * cannot race with us here because it changes i_size while holding
 * i_mmap_rwsem, which the caller holds in read mode.
 */
5546 size = i_size_read(mapping->host) >> huge_page_shift(h);
5547 if (idx >= size)
5548 goto out;
5549
5550 retry:
5551 new_page = false;
5552 page = find_lock_page(mapping, idx);
5553 if (!page) {
5554
5555 if (userfaultfd_missing(vma)) {
5556 ret = hugetlb_handle_userfault(vma, mapping, idx,
5557 flags, haddr, address,
5558 VM_UFFD_MISSING);
5559 goto out;
5560 }
5561
5562 page = alloc_huge_page(vma, haddr, 0);
5563 if (IS_ERR(page)) {
/*
 * Allocation failed; the task will get SIGBUS.  Only report the
 * error if the PTE is still empty: page migration clears and
 * re-writes PTEs without taking the fault mutex, so re-check under
 * the page-table lock to avoid returning a spurious failure for a
 * page that has been mapped in the meantime.
 */
5576 ptl = huge_pte_lock(h, mm, ptep);
5577 ret = 0;
5578 if (huge_pte_none(huge_ptep_get(ptep)))
5579 ret = vmf_error(PTR_ERR(page));
5580 spin_unlock(ptl);
5581 goto out;
5582 }
5583 clear_huge_page(page, address, pages_per_huge_page(h));
5584 __SetPageUptodate(page);
5585 new_page = true;
5586
5587 if (vma->vm_flags & VM_MAYSHARE) {
5588 int err = huge_add_to_page_cache(page, mapping, idx);
5589 if (err) {
5590 put_page(page);
5591 if (err == -EEXIST)
5592 goto retry;
5593 goto out;
5594 }
5595 new_pagecache_page = true;
5596 } else {
5597 lock_page(page);
5598 if (unlikely(anon_vma_prepare(vma))) {
5599 ret = VM_FAULT_OOM;
5600 goto backout_unlocked;
5601 }
5602 anon_rmap = 1;
5603 }
5604 } else {
5605
5606
5607
5608
5609
5610 if (unlikely(PageHWPoison(page))) {
5611 ret = VM_FAULT_HWPOISON_LARGE |
5612 VM_FAULT_SET_HINDEX(hstate_index(h));
5613 goto backout_unlocked;
5614 }
5615
5616
5617 if (userfaultfd_minor(vma)) {
5618 unlock_page(page);
5619 put_page(page);
5620 ret = hugetlb_handle_userfault(vma, mapping, idx,
5621 flags, haddr, address,
5622 VM_UFFD_MINOR);
5623 goto out;
5624 }
5625 }
5626
/*
 * If a later COW of this private page is possible, record the
 * reservation now, while sleeping is still allowed, so that the copy in
 * hugetlb_wp() does not have to set up reservation entries under the
 * page-table lock.
 */
5633 if ((flags & FAULT_FLAG_WRITE) && !(vma->vm_flags & VM_SHARED)) {
5634 if (vma_needs_reservation(h, vma, haddr) < 0) {
5635 ret = VM_FAULT_OOM;
5636 goto backout_unlocked;
5637 }
5638
5639 vma_end_reservation(h, vma, haddr);
5640 }
5641
5642 ptl = huge_pte_lock(h, mm, ptep);
5643 ret = 0;
5644
5645 if (!pte_same(huge_ptep_get(ptep), old_pte))
5646 goto backout;
5647
5648 if (anon_rmap) {
5649 ClearHPageRestoreReserve(page);
5650 hugepage_add_new_anon_rmap(page, vma, haddr);
5651 } else
5652 page_dup_file_rmap(page, true);
5653 new_pte = make_huge_pte(vma, page, ((vma->vm_flags & VM_WRITE)
5654 && (vma->vm_flags & VM_SHARED)));
5655
5656
5657
5658
5659 if (unlikely(pte_marker_uffd_wp(old_pte)))
5660 new_pte = huge_pte_wrprotect(huge_pte_mkuffd_wp(new_pte));
5661 set_huge_pte_at(mm, haddr, ptep, new_pte);
5662
5663 hugetlb_count_add(pages_per_huge_page(h), mm);
5664 if ((flags & FAULT_FLAG_WRITE) && !(vma->vm_flags & VM_SHARED)) {
5665
5666 ret = hugetlb_wp(mm, vma, address, ptep, flags, page, ptl);
5667 }
5668
5669 spin_unlock(ptl);
5670
/*
 * Only newly allocated pages are marked migratable here; a page found
 * in the page cache may have been isolated for migration and must not
 * get the flag back.
 */
5676 if (new_page)
5677 SetHPageMigratable(page);
5678
5679 unlock_page(page);
5680 out:
5681 return ret;
5682
5683 backout:
5684 spin_unlock(ptl);
5685 backout_unlocked:
5686 unlock_page(page);
5687
5688 if (new_page && !new_pagecache_page)
5689 restore_reserve_on_error(h, vma, haddr, page);
5690 put_page(page);
5691 goto out;
5692 }
5693
5694 #ifdef CONFIG_SMP
5695 u32 hugetlb_fault_mutex_hash(struct address_space *mapping, pgoff_t idx)
5696 {
5697 unsigned long key[2];
5698 u32 hash;
5699
5700 key[0] = (unsigned long) mapping;
5701 key[1] = idx;
5702
5703 hash = jhash2((u32 *)&key, sizeof(key)/(sizeof(u32)), 0);
5704
5705 return hash & (num_fault_mutexes - 1);
5706 }
5707 #else
/*
 * With a single fault mutex (!CONFIG_SMP) the hash is meaningless, so always
 * return index 0.
 */
5712 u32 hugetlb_fault_mutex_hash(struct address_space *mapping, pgoff_t idx)
5713 {
5714 return 0;
5715 }
5716 #endif
5717
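/*
 * Top-level hugetlb fault handler.  Faults are serialized per
 * (mapping, index) on the hugetlb fault mutex, and i_mmap_rwsem is held in
 * read mode across the fault so the page tables stay stable.
 */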
5718 vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
5719 unsigned long address, unsigned int flags)
5720 {
5721 pte_t *ptep, entry;
5722 spinlock_t *ptl;
5723 vm_fault_t ret;
5724 u32 hash;
5725 pgoff_t idx;
5726 struct page *page = NULL;
5727 struct page *pagecache_page = NULL;
5728 struct hstate *h = hstate_vma(vma);
5729 struct address_space *mapping;
5730 int need_wait_lock = 0;
5731 unsigned long haddr = address & huge_page_mask(h);
5732
5733 ptep = huge_pte_offset(mm, haddr, huge_page_size(h));
5734 if (ptep) {
5735
5736
5737
5738
5739
5740 entry = huge_ptep_get(ptep);
5741 if (unlikely(is_hugetlb_entry_migration(entry))) {
5742 migration_entry_wait_huge(vma, ptep);
5743 return 0;
5744 } else if (unlikely(is_hugetlb_entry_hwpoisoned(entry)))
5745 return VM_FAULT_HWPOISON_LARGE |
5746 VM_FAULT_SET_HINDEX(hstate_index(h));
5747 }
5748
/*
 * Take i_mmap_rwsem in read mode before allocating the PTE and hold it
 * for the rest of the fault: it keeps shared page tables from being
 * unshared (and freed) underneath us and serializes against truncation.
 * huge_pte_alloc() returns the same PTE found by huge_pte_offset()
 * above unless the page tables have changed in the meantime.
 */
5760 mapping = vma->vm_file->f_mapping;
5761 i_mmap_lock_read(mapping);
5762 ptep = huge_pte_alloc(mm, vma, haddr, huge_page_size(h));
5763 if (!ptep) {
5764 i_mmap_unlock_read(mapping);
5765 return VM_FAULT_OOM;
5766 }
5767
/*
 * Serialize allocation and instantiation on the per-(mapping, index)
 * fault mutex so that two CPUs faulting the same page do not race and
 * one of them fail spuriously.
 */
5773 idx = vma_hugecache_offset(h, vma, haddr);
5774 hash = hugetlb_fault_mutex_hash(mapping, idx);
5775 mutex_lock(&hugetlb_fault_mutex_table[hash]);
5776
5777 entry = huge_ptep_get(ptep);
5778
5779 if (huge_pte_none_mostly(entry)) {
5780 ret = hugetlb_no_page(mm, vma, mapping, idx, address, ptep,
5781 entry, flags);
5782 goto out_mutex;
5783 }
5784
5785 ret = 0;
5786
/*
 * The entry may be a migration or hwpoison entry at this point.  Bail
 * out and let a second fault run the is_hugetlb_entry_migration()/
 * is_hugetlb_entry_hwpoisoned() checks at the top of this function
 * instead of assuming an active page is mapped.
 */
5794 if (!pte_present(entry))
5795 goto out_mutex;
5796
5797
5798
5799
5800
5801
5802
5803
5804 if ((flags & (FAULT_FLAG_WRITE|FAULT_FLAG_UNSHARE)) &&
5805 !(vma->vm_flags & VM_MAYSHARE) && !huge_pte_write(entry)) {
5806 if (vma_needs_reservation(h, vma, haddr) < 0) {
5807 ret = VM_FAULT_OOM;
5808 goto out_mutex;
5809 }
5810
5811 vma_end_reservation(h, vma, haddr);
5812
5813 pagecache_page = hugetlbfs_pagecache_page(h, vma, haddr);
5814 }
5815
5816 ptl = huge_pte_lock(h, mm, ptep);
5817
5818
5819 if (unlikely(!pte_same(entry, huge_ptep_get(ptep))))
5820 goto out_ptl;
5821
5822
5823 if (userfaultfd_wp(vma) && huge_pte_uffd_wp(huge_ptep_get(ptep)) &&
5824 (flags & FAULT_FLAG_WRITE) && !huge_pte_write(entry)) {
5825 struct vm_fault vmf = {
5826 .vma = vma,
5827 .address = haddr,
5828 .real_address = address,
5829 .flags = flags,
5830 };
5831
5832 spin_unlock(ptl);
5833 if (pagecache_page) {
5834 unlock_page(pagecache_page);
5835 put_page(pagecache_page);
5836 }
5837 mutex_unlock(&hugetlb_fault_mutex_table[hash]);
5838 i_mmap_unlock_read(mapping);
5839 return handle_userfault(&vmf, VM_UFFD_WP);
5840 }
5841
/*
 * hugetlb_wp() needs both the faulted page and the page-cache page
 * locked.  The page-cache page is already locked; take the other lock
 * only when the two differ, and back off if it is contended.
 */
5847 page = pte_page(entry);
5848 if (page != pagecache_page)
5849 if (!trylock_page(page)) {
5850 need_wait_lock = 1;
5851 goto out_ptl;
5852 }
5853
5854 get_page(page);
5855
5856 if (flags & (FAULT_FLAG_WRITE|FAULT_FLAG_UNSHARE)) {
5857 if (!huge_pte_write(entry)) {
5858 ret = hugetlb_wp(mm, vma, address, ptep, flags,
5859 pagecache_page, ptl);
5860 goto out_put_page;
5861 } else if (likely(flags & FAULT_FLAG_WRITE)) {
5862 entry = huge_pte_mkdirty(entry);
5863 }
5864 }
5865 entry = pte_mkyoung(entry);
5866 if (huge_ptep_set_access_flags(vma, haddr, ptep, entry,
5867 flags & FAULT_FLAG_WRITE))
5868 update_mmu_cache(vma, haddr, ptep);
5869 out_put_page:
5870 if (page != pagecache_page)
5871 unlock_page(page);
5872 put_page(page);
5873 out_ptl:
5874 spin_unlock(ptl);
5875
5876 if (pagecache_page) {
5877 unlock_page(pagecache_page);
5878 put_page(pagecache_page);
5879 }
5880 out_mutex:
5881 mutex_unlock(&hugetlb_fault_mutex_table[hash]);
5882 i_mmap_unlock_read(mapping);
5883
5884
5885
5886
5887
5888
5889
5890 if (need_wait_lock)
5891 wait_on_page_locked(page);
5892 return ret;
5893 }
5894
5895 #ifdef CONFIG_USERFAULTFD
/*
 * Install a page for userfaultfd UFFDIO_COPY/UFFDIO_CONTINUE on a hugetlb
 * mapping; the hugetlb counterpart of the generic mfill atomic path.
 */
5900 int hugetlb_mcopy_atomic_pte(struct mm_struct *dst_mm,
5901 pte_t *dst_pte,
5902 struct vm_area_struct *dst_vma,
5903 unsigned long dst_addr,
5904 unsigned long src_addr,
5905 enum mcopy_atomic_mode mode,
5906 struct page **pagep,
5907 bool wp_copy)
5908 {
5909 bool is_continue = (mode == MCOPY_ATOMIC_CONTINUE);
5910 struct hstate *h = hstate_vma(dst_vma);
5911 struct address_space *mapping = dst_vma->vm_file->f_mapping;
5912 pgoff_t idx = vma_hugecache_offset(h, dst_vma, dst_addr);
5913 unsigned long size;
5914 int vm_shared = dst_vma->vm_flags & VM_SHARED;
5915 pte_t _dst_pte;
5916 spinlock_t *ptl;
5917 int ret = -ENOMEM;
5918 struct page *page;
5919 int writable;
5920 bool page_in_pagecache = false;
5921
5922 if (is_continue) {
5923 ret = -EFAULT;
5924 page = find_lock_page(mapping, idx);
5925 if (!page)
5926 goto out;
5927 page_in_pagecache = true;
5928 } else if (!*pagep) {
5929
5930
5931
5932 if (vm_shared &&
5933 hugetlbfs_pagecache_present(h, dst_vma, dst_addr)) {
5934 ret = -EEXIST;
5935 goto out;
5936 }
5937
5938 page = alloc_huge_page(dst_vma, dst_addr, 0);
5939 if (IS_ERR(page)) {
5940 ret = -ENOMEM;
5941 goto out;
5942 }
5943
5944 ret = copy_huge_page_from_user(page,
5945 (const void __user *) src_addr,
5946 pages_per_huge_page(h), false);
5947
5948
5949 if (unlikely(ret)) {
5950 ret = -ENOENT;
5951
5952
5953
5954 restore_reserve_on_error(h, dst_vma, dst_addr, page);
5955 put_page(page);
5956
5957
5958
5959
5960 page = alloc_huge_page_vma(h, dst_vma, dst_addr);
5961 if (!page) {
5962 ret = -ENOMEM;
5963 goto out;
5964 }
5965 *pagep = page;
5966
5967
5968
5969
5970 goto out;
5971 }
5972 } else {
5973 if (vm_shared &&
5974 hugetlbfs_pagecache_present(h, dst_vma, dst_addr)) {
5975 put_page(*pagep);
5976 ret = -EEXIST;
5977 *pagep = NULL;
5978 goto out;
5979 }
5980
5981 page = alloc_huge_page(dst_vma, dst_addr, 0);
5982 if (IS_ERR(page)) {
5983 put_page(*pagep);
5984 ret = -ENOMEM;
5985 *pagep = NULL;
5986 goto out;
5987 }
5988 copy_user_huge_page(page, *pagep, dst_addr, dst_vma,
5989 pages_per_huge_page(h));
5990 put_page(*pagep);
5991 *pagep = NULL;
5992 }
5993
/*
 * __SetPageUptodate() provides the memory barrier that makes the copied
 * contents visible before the PTE is installed below.
 */
5999 __SetPageUptodate(page);
6000
6001
6002 if (vm_shared && !is_continue) {
6003 size = i_size_read(mapping->host) >> huge_page_shift(h);
6004 ret = -EFAULT;
6005 if (idx >= size)
6006 goto out_release_nounlock;
6007
6008
6009
6010
6011
6012
6013
6014 ret = huge_add_to_page_cache(page, mapping, idx);
6015 if (ret)
6016 goto out_release_nounlock;
6017 page_in_pagecache = true;
6018 }
6019
6020 ptl = huge_pte_lockptr(h, dst_mm, dst_pte);
6021 spin_lock(ptl);
6022
/*
 * Re-check i_size under the page-table lock so that no page is left
 * mapped beyond end of file; truncation is strict about that.  If we
 * bail out here, a shared page may be left in the page cache past
 * i_size, but it is cleaned up once the caller drops the hugetlb fault
 * mutex.
 */
6032 size = i_size_read(mapping->host) >> huge_page_shift(h);
6033 ret = -EFAULT;
6034 if (idx >= size)
6035 goto out_release_unlock;
6036
6037 ret = -EEXIST;
6038
6039
6040
6041
6042
6043 if (!huge_pte_none_mostly(huge_ptep_get(dst_pte)))
6044 goto out_release_unlock;
6045
6046 if (page_in_pagecache) {
6047 page_dup_file_rmap(page, true);
6048 } else {
6049 ClearHPageRestoreReserve(page);
6050 hugepage_add_new_anon_rmap(page, dst_vma, dst_addr);
6051 }
6052
/*
 * Do not make the PTE writable for UFFDIO_CONTINUE on a non-shared VMA,
 * or when the caller asked for write protection (wp_copy).
 */
6057 if (wp_copy || (is_continue && !vm_shared))
6058 writable = 0;
6059 else
6060 writable = dst_vma->vm_flags & VM_WRITE;
6061
6062 _dst_pte = make_huge_pte(dst_vma, page, writable);
6063
6064
6065
6066
6067
6068
6069 _dst_pte = huge_pte_mkdirty(_dst_pte);
6070 _dst_pte = pte_mkyoung(_dst_pte);
6071
6072 if (wp_copy)
6073 _dst_pte = huge_pte_mkuffd_wp(_dst_pte);
6074
6075 set_huge_pte_at(dst_mm, dst_addr, dst_pte, _dst_pte);
6076
6077 hugetlb_count_add(pages_per_huge_page(h), dst_mm);
6078
6079
6080 update_mmu_cache(dst_vma, dst_addr, dst_pte);
6081
6082 spin_unlock(ptl);
6083 if (!is_continue)
6084 SetHPageMigratable(page);
6085 if (vm_shared || is_continue)
6086 unlock_page(page);
6087 ret = 0;
6088 out:
6089 return ret;
6090 out_release_unlock:
6091 spin_unlock(ptl);
6092 if (vm_shared || is_continue)
6093 unlock_page(page);
6094 out_release_nounlock:
6095 if (!page_in_pagecache)
6096 restore_reserve_on_error(h, dst_vma, dst_addr, page);
6097 put_page(page);
6098 goto out;
6099 }
6100 #endif
6101
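/*
 * Fill the caller-supplied pages[]/vmas[] arrays with @refs consecutive
 * subpages of @page and the VMA they were found in.
 */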
6102 static void record_subpages_vmas(struct page *page, struct vm_area_struct *vma,
6103 int refs, struct page **pages,
6104 struct vm_area_struct **vmas)
6105 {
6106 int nr;
6107
6108 for (nr = 0; nr < refs; nr++) {
6109 if (likely(pages))
6110 pages[nr] = mem_map_offset(page, nr);
6111 if (vmas)
6112 vmas[nr] = vma;
6113 }
6114 }
6115
6116 static inline bool __follow_hugetlb_must_fault(unsigned int flags, pte_t *pte,
6117 bool *unshare)
6118 {
6119 pte_t pteval = huge_ptep_get(pte);
6120
6121 *unshare = false;
6122 if (is_swap_pte(pteval))
6123 return true;
6124 if (huge_pte_write(pteval))
6125 return false;
6126 if (flags & FOLL_WRITE)
6127 return true;
6128 if (gup_must_unshare(flags, pte_page(pteval))) {
6129 *unshare = true;
6130 return true;
6131 }
6132 return false;
6133 }
6134
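/*
 * GUP walker for hugetlb VMAs: resolve up to *nr_pages pages starting at
 * *position, faulting pages in as needed.  Returns the number of pages
 * processed, or a negative errno if nothing was processed.
 */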
6135 long follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma,
6136 struct page **pages, struct vm_area_struct **vmas,
6137 unsigned long *position, unsigned long *nr_pages,
6138 long i, unsigned int flags, int *locked)
6139 {
6140 unsigned long pfn_offset;
6141 unsigned long vaddr = *position;
6142 unsigned long remainder = *nr_pages;
6143 struct hstate *h = hstate_vma(vma);
6144 int err = -EFAULT, refs;
6145
6146 while (vaddr < vma->vm_end && remainder) {
6147 pte_t *pte;
6148 spinlock_t *ptl = NULL;
6149 bool unshare = false;
6150 int absent;
6151 struct page *page;
6152
6153
6154
6155
6156
6157 if (fatal_signal_pending(current)) {
6158 remainder = 0;
6159 break;
6160 }
6161
6162
6163
6164
6165
6166
6167
6168
6169 pte = huge_pte_offset(mm, vaddr & huge_page_mask(h),
6170 huge_page_size(h));
6171 if (pte)
6172 ptl = huge_pte_lock(h, mm, pte);
6173 absent = !pte || huge_pte_none(huge_ptep_get(pte));
6174
/*
 * When dumping core (FOLL_DUMP), an absent page with no page-cache
 * backing is reported as an error so the dump stays sparse instead of
 * allocating huge pages full of zeroes.
 */
6182 if (absent && (flags & FOLL_DUMP) &&
6183 !hugetlbfs_pagecache_present(h, vma, vaddr)) {
6184 if (pte)
6185 spin_unlock(ptl);
6186 remainder = 0;
6187 break;
6188 }
6189
/*
 * Fault the page in if it is absent or cannot be followed directly:
 * a swap/migration/hwpoison entry, a missing write permission, or an
 * anonymous page that must be unshared before it may be pinned.
 */
6200 if (absent ||
6201 __follow_hugetlb_must_fault(flags, pte, &unshare)) {
6202 vm_fault_t ret;
6203 unsigned int fault_flags = 0;
6204
6205 if (pte)
6206 spin_unlock(ptl);
6207 if (flags & FOLL_WRITE)
6208 fault_flags |= FAULT_FLAG_WRITE;
6209 else if (unshare)
6210 fault_flags |= FAULT_FLAG_UNSHARE;
6211 if (locked)
6212 fault_flags |= FAULT_FLAG_ALLOW_RETRY |
6213 FAULT_FLAG_KILLABLE;
6214 if (flags & FOLL_NOWAIT)
6215 fault_flags |= FAULT_FLAG_ALLOW_RETRY |
6216 FAULT_FLAG_RETRY_NOWAIT;
6217 if (flags & FOLL_TRIED) {
6218
6219
6220
6221
6222 fault_flags |= FAULT_FLAG_TRIED;
6223 }
6224 ret = hugetlb_fault(mm, vma, vaddr, fault_flags);
6225 if (ret & VM_FAULT_ERROR) {
6226 err = vm_fault_to_errno(ret, flags);
6227 remainder = 0;
6228 break;
6229 }
6230 if (ret & VM_FAULT_RETRY) {
6231 if (locked &&
6232 !(fault_flags & FAULT_FLAG_RETRY_NOWAIT))
6233 *locked = 0;
6234 *nr_pages = 0;
/*
 * VM_FAULT_RETRY must not be turned into an error; return the
 * progress made so far.  *position is not updated because the
 * caller ignores it once *nr_pages is zero.
 */
6244 return i;
6245 }
6246 continue;
6247 }
6248
6249 pfn_offset = (vaddr & ~huge_page_mask(h)) >> PAGE_SHIFT;
6250 page = pte_page(huge_ptep_get(pte));
6251
6252 VM_BUG_ON_PAGE((flags & FOLL_PIN) && PageAnon(page) &&
6253 !PageAnonExclusive(page), page);
6254
6255
6256
6257
6258
6259 if (!pages && !vmas && !pfn_offset &&
6260 (vaddr + huge_page_size(h) < vma->vm_end) &&
6261 (remainder >= pages_per_huge_page(h))) {
6262 vaddr += huge_page_size(h);
6263 remainder -= pages_per_huge_page(h);
6264 i += pages_per_huge_page(h);
6265 spin_unlock(ptl);
6266 continue;
6267 }
6268
6269
6270 refs = min3(pages_per_huge_page(h) - pfn_offset, remainder,
6271 (vma->vm_end - ALIGN_DOWN(vaddr, PAGE_SIZE)) >> PAGE_SHIFT);
6272
6273 if (pages || vmas)
6274 record_subpages_vmas(mem_map_offset(page, pfn_offset),
6275 vma, refs,
6276 likely(pages) ? pages + i : NULL,
6277 vmas ? vmas + i : NULL);
6278
6279 if (pages) {
/*
 * try_grab_folio() should always succeed here: the page-table lock
 * is held and the huge page was just seen present, so its head and
 * tail pages cannot be rearranged.  The only failure mode is a
 * reference count overflow, reported as -ENOMEM.
 */
6290 if (WARN_ON_ONCE(!try_grab_folio(pages[i], refs,
6291 flags))) {
6292 spin_unlock(ptl);
6293 remainder = 0;
6294 err = -ENOMEM;
6295 break;
6296 }
6297 }
6298
6299 vaddr += (refs << PAGE_SHIFT);
6300 remainder -= refs;
6301 i += refs;
6302
6303 spin_unlock(ptl);
6304 }
6305 *nr_pages = remainder;
6306
6307
6308
6309
6310
6311 *position = vaddr;
6312
6313 return i ? i : err;
6314 }
6315
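/*
 * Change the protection of all huge PTEs in [address, end) to @newprot,
 * also applying or clearing userfault write protection as requested by
 * @cp_flags.  Returns the number of base pages whose PTEs were changed.
 */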
6316 unsigned long hugetlb_change_protection(struct vm_area_struct *vma,
6317 unsigned long address, unsigned long end,
6318 pgprot_t newprot, unsigned long cp_flags)
6319 {
6320 struct mm_struct *mm = vma->vm_mm;
6321 unsigned long start = address;
6322 pte_t *ptep;
6323 pte_t pte;
6324 struct hstate *h = hstate_vma(vma);
6325 unsigned long pages = 0, psize = huge_page_size(h);
6326 bool shared_pmd = false;
6327 struct mmu_notifier_range range;
6328 unsigned long last_addr_mask;
6329 bool uffd_wp = cp_flags & MM_CP_UFFD_WP;
6330 bool uffd_wp_resolve = cp_flags & MM_CP_UFFD_WP_RESOLVE;
6331
/*
 * With shared PMDs the range that has to be flushed can extend beyond
 * [start, end), so widen the notifier range to the worst case before
 * starting.
 */
6337 mmu_notifier_range_init(&range, MMU_NOTIFY_PROTECTION_VMA,
6338 0, vma, mm, start, end);
6339 adjust_range_if_pmd_sharing_possible(vma, &range.start, &range.end);
6340
6341 BUG_ON(address >= end);
6342 flush_cache_range(vma, range.start, range.end);
6343
6344 mmu_notifier_invalidate_range_start(&range);
6345 last_addr_mask = hugetlb_mask_last_page(h);
6346 i_mmap_lock_write(vma->vm_file->f_mapping);
6347 for (; address < end; address += psize) {
6348 spinlock_t *ptl;
6349 ptep = huge_pte_offset(mm, address, psize);
6350 if (!ptep) {
6351 address |= last_addr_mask;
6352 continue;
6353 }
6354 ptl = huge_pte_lock(h, mm, ptep);
6355 if (huge_pmd_unshare(mm, vma, address, ptep)) {
6356
6357
6358
6359
6360
6361 WARN_ON_ONCE(uffd_wp || uffd_wp_resolve);
6362 pages++;
6363 spin_unlock(ptl);
6364 shared_pmd = true;
6365 address |= last_addr_mask;
6366 continue;
6367 }
6368 pte = huge_ptep_get(ptep);
6369 if (unlikely(is_hugetlb_entry_hwpoisoned(pte))) {
6370 spin_unlock(ptl);
6371 continue;
6372 }
6373 if (unlikely(is_hugetlb_entry_migration(pte))) {
6374 swp_entry_t entry = pte_to_swp_entry(pte);
6375 struct page *page = pfn_swap_entry_to_page(entry);
6376
6377 if (!is_readable_migration_entry(entry)) {
6378 pte_t newpte;
6379
6380 if (PageAnon(page))
6381 entry = make_readable_exclusive_migration_entry(
6382 swp_offset(entry));
6383 else
6384 entry = make_readable_migration_entry(
6385 swp_offset(entry));
6386 newpte = swp_entry_to_pte(entry);
6387 if (uffd_wp)
6388 newpte = pte_swp_mkuffd_wp(newpte);
6389 else if (uffd_wp_resolve)
6390 newpte = pte_swp_clear_uffd_wp(newpte);
6391 set_huge_pte_at(mm, address, ptep, newpte);
6392 pages++;
6393 }
6394 spin_unlock(ptl);
6395 continue;
6396 }
6397 if (unlikely(pte_marker_uffd_wp(pte))) {
6398
6399
6400
6401
6402 if (uffd_wp_resolve)
6403 huge_pte_clear(mm, address, ptep, psize);
6404 }
6405 if (!huge_pte_none(pte)) {
6406 pte_t old_pte;
6407 unsigned int shift = huge_page_shift(hstate_vma(vma));
6408
6409 old_pte = huge_ptep_modify_prot_start(vma, address, ptep);
6410 pte = huge_pte_modify(old_pte, newprot);
6411 pte = arch_make_huge_pte(pte, shift, vma->vm_flags);
6412 if (uffd_wp)
6413 pte = huge_pte_mkuffd_wp(huge_pte_wrprotect(pte));
6414 else if (uffd_wp_resolve)
6415 pte = huge_pte_clear_uffd_wp(pte);
6416 huge_ptep_modify_prot_commit(vma, address, ptep, old_pte, pte);
6417 pages++;
6418 } else {
6419
6420 if (unlikely(uffd_wp))
6421
6422 set_huge_pte_at(mm, address, ptep,
6423 make_pte_marker(PTE_MARKER_UFFD_WP));
6424 }
6425 spin_unlock(ptl);
6426 }
/*
 * Flush the TLB before dropping i_mmap_rwsem: huge_pmd_unshare() may have
 * cleared our PUD entry and put the page-table page, and once the lock is
 * released that page could be freed and reused.  If a shared PMD was
 * unshared, flush the widened range.
 */
6434 if (shared_pmd)
6435 flush_hugetlb_tlb_range(vma, range.start, range.end);
6436 else
6437 flush_hugetlb_tlb_range(vma, start, end);
6438
6439
6440
6441
6442
6443
6444 i_mmap_unlock_write(vma->vm_file->f_mapping);
6445 mmu_notifier_invalidate_range_end(&range);
6446
6447 return pages << h->order;
6448 }
6449
/* Return true if the reservation succeeded, false otherwise. */
6451 bool hugetlb_reserve_pages(struct inode *inode,
6452 long from, long to,
6453 struct vm_area_struct *vma,
6454 vm_flags_t vm_flags)
6455 {
6456 long chg, add = -1;
6457 struct hstate *h = hstate_inode(inode);
6458 struct hugepage_subpool *spool = subpool_inode(inode);
6459 struct resv_map *resv_map;
6460 struct hugetlb_cgroup *h_cg = NULL;
6461 long gbl_reserve, regions_needed = 0;
6462
6463
6464 if (from > to) {
6465 VM_WARN(1, "%s called with a negative range\n", __func__);
6466 return false;
6467 }
6468
6469
6470
6471
6472
6473
6474 if (vm_flags & VM_NORESERVE)
6475 return true;
6476
/*
 * Shared mappings base the reservation on what is already recorded for
 * the file; private mappings must reserve the whole range, even if
 * currently read-only, since mprotect() can later make it writable.
 * A NULL @vma is treated as a shared (shm) mapping.
 */
6483 if (!vma || vma->vm_flags & VM_MAYSHARE) {
6484
6485
6486
6487
6488
6489 resv_map = inode_resv_map(inode);
6490
6491 chg = region_chg(resv_map, from, to, &regions_needed);
6492
6493 } else {
6494
6495 resv_map = resv_map_alloc();
6496 if (!resv_map)
6497 return false;
6498
6499 chg = to - from;
6500
6501 set_vma_resv_map(vma, resv_map);
6502 set_vma_resv_flags(vma, HPAGE_RESV_OWNER);
6503 }
6504
6505 if (chg < 0)
6506 goto out_err;
6507
6508 if (hugetlb_cgroup_charge_cgroup_rsvd(hstate_index(h),
6509 chg * pages_per_huge_page(h), &h_cg) < 0)
6510 goto out_err;
6511
6512 if (vma && !(vma->vm_flags & VM_MAYSHARE) && h_cg) {
6513
6514
6515
6516 resv_map_set_hugetlb_cgroup_uncharge_info(resv_map, h_cg, h);
6517 }
6518
6519
6520
6521
6522
6523
6524 gbl_reserve = hugepage_subpool_get_pages(spool, chg);
6525 if (gbl_reserve < 0)
6526 goto out_uncharge_cgroup;
6527
6528
6529
6530
6531
6532 if (hugetlb_acct_memory(h, gbl_reserve) < 0)
6533 goto out_put_pages;
6534
/*
 * Commit the reservation.  Shared mappings record the reserved regions
 * in the file's reserve map, shared by all VMAs; private mappings track
 * only consumed reservations in their per-VMA map, so nothing more is
 * needed for them here.
 */
6546 if (!vma || vma->vm_flags & VM_MAYSHARE) {
6547 add = region_add(resv_map, from, to, regions_needed, h, h_cg);
6548
6549 if (unlikely(add < 0)) {
6550 hugetlb_acct_memory(h, -gbl_reserve);
6551 goto out_put_pages;
6552 } else if (unlikely(chg > add)) {
6553
6554
6555
6556
6557
6558
6559
6560 long rsv_adjust;
6561
6562
6563
6564
6565
6566 hugetlb_cgroup_uncharge_cgroup_rsvd(
6567 hstate_index(h),
6568 (chg - add) * pages_per_huge_page(h), h_cg);
6569
6570 rsv_adjust = hugepage_subpool_put_pages(spool,
6571 chg - add);
6572 hugetlb_acct_memory(h, -rsv_adjust);
6573 } else if (h_cg) {
6574
6575
6576
6577
6578
6579
6580 hugetlb_cgroup_put_rsvd_cgroup(h_cg);
6581 }
6582 }
6583 return true;
6584
6585 out_put_pages:
6586
6587 (void)hugepage_subpool_put_pages(spool, chg);
6588 out_uncharge_cgroup:
6589 hugetlb_cgroup_uncharge_cgroup_rsvd(hstate_index(h),
6590 chg * pages_per_huge_page(h), h_cg);
6591 out_err:
6592 if (!vma || vma->vm_flags & VM_MAYSHARE)
6593
6594
6595
6596 if (chg >= 0 && add < 0)
6597 region_abort(resv_map, from, to, regions_needed);
6598 if (vma && is_vma_resv_set(vma, HPAGE_RESV_OWNER))
6599 kref_put(&resv_map->refs, resv_map_release);
6600 return false;
6601 }
6602
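/*
 * Release the reservations for [start, end) of @inode after @freed pages
 * have been freed, updating the reserve map, the inode block count and the
 * subpool/global accounting.
 */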
6603 long hugetlb_unreserve_pages(struct inode *inode, long start, long end,
6604 long freed)
6605 {
6606 struct hstate *h = hstate_inode(inode);
6607 struct resv_map *resv_map = inode_resv_map(inode);
6608 long chg = 0;
6609 struct hugepage_subpool *spool = subpool_inode(inode);
6610 long gbl_reserve;
6611
6612
6613
6614
6615
6616 if (resv_map) {
6617 chg = region_del(resv_map, start, end);
6618
6619
6620
6621
6622
6623 if (chg < 0)
6624 return chg;
6625 }
6626
6627 spin_lock(&inode->i_lock);
6628 inode->i_blocks -= (blocks_per_huge_page(h) * freed);
6629 spin_unlock(&inode->i_lock);
6630
/*
 * If the subpool has a minimum size, the number of global reservations
 * to release may be adjusted.  Note that !resv_map implies freed == 0,
 * so (chg - freed) cannot go negative.
 */
6638 gbl_reserve = hugepage_subpool_put_pages(spool, (chg - freed));
6639 hugetlb_acct_memory(h, -gbl_reserve);
6640
6641 return 0;
6642 }
6643
6644 #ifdef CONFIG_ARCH_WANT_HUGE_PMD_SHARE
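/*
 * Return the address at which @svma maps the same PUD-sized region that
 * @vma maps at @addr, or 0 if the two mappings cannot share a PMD page
 * table (different flags, misaligned, or out of range).
 */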
6645 static unsigned long page_table_shareable(struct vm_area_struct *svma,
6646 struct vm_area_struct *vma,
6647 unsigned long addr, pgoff_t idx)
6648 {
6649 unsigned long saddr = ((idx - svma->vm_pgoff) << PAGE_SHIFT) +
6650 svma->vm_start;
6651 unsigned long sbase = saddr & PUD_MASK;
6652 unsigned long s_end = sbase + PUD_SIZE;
6653
6654
6655 unsigned long vm_flags = vma->vm_flags & VM_LOCKED_CLEAR_MASK;
6656 unsigned long svm_flags = svma->vm_flags & VM_LOCKED_CLEAR_MASK;
6657
6658
6659
6660
6661
6662 if (pmd_index(addr) != pmd_index(saddr) ||
6663 vm_flags != svm_flags ||
6664 !range_in_vma(svma, sbase, s_end))
6665 return 0;
6666
6667 return saddr;
6668 }
6669
6670 static bool vma_shareable(struct vm_area_struct *vma, unsigned long addr)
6671 {
6672 unsigned long base = addr & PUD_MASK;
6673 unsigned long end = base + PUD_SIZE;
6674
6675
6676
6677
6678 if (vma->vm_flags & VM_MAYSHARE && range_in_vma(vma, base, end))
6679 return true;
6680 return false;
6681 }
6682
6683 bool want_pmd_share(struct vm_area_struct *vma, unsigned long addr)
6684 {
6685 #ifdef CONFIG_USERFAULTFD
6686 if (uffd_disable_huge_pmd_share(vma))
6687 return false;
6688 #endif
6689 return vma_shareable(vma, addr);
6690 }
6691
/*
 * If the range [start, end) in @vma could be mapped through a shared PMD,
 * widen it to the PUD-aligned range that such sharing would cover.
 */
6697 void adjust_range_if_pmd_sharing_possible(struct vm_area_struct *vma,
6698 unsigned long *start, unsigned long *end)
6699 {
6700 unsigned long v_start = ALIGN(vma->vm_start, PUD_SIZE),
6701 v_end = ALIGN_DOWN(vma->vm_end, PUD_SIZE);
6702
6703
6704
6705
6706
6707 if (!(vma->vm_flags & VM_MAYSHARE) || !(v_end > v_start) ||
6708 (*end <= v_start) || (*start >= v_end))
6709 return;
6710
6711
6712 if (*start > v_start)
6713 *start = ALIGN_DOWN(*start, PUD_SIZE);
6714
6715 if (*end < v_end)
6716 *end = ALIGN(*end, PUD_SIZE);
6717 }
6718
/*
 * Search the mapping for another VMA whose page table for the same PUD-sized
 * region can be shared; if one is found, reuse its PMD page (taking a
 * reference on it), otherwise fall back to pmd_alloc().  Must be called with
 * i_mmap_rwsem held, which keeps the shared page tables from disappearing
 * while they are being examined.
 */
6730 pte_t *huge_pmd_share(struct mm_struct *mm, struct vm_area_struct *vma,
6731 unsigned long addr, pud_t *pud)
6732 {
6733 struct address_space *mapping = vma->vm_file->f_mapping;
6734 pgoff_t idx = ((addr - vma->vm_start) >> PAGE_SHIFT) +
6735 vma->vm_pgoff;
6736 struct vm_area_struct *svma;
6737 unsigned long saddr;
6738 pte_t *spte = NULL;
6739 pte_t *pte;
6740 spinlock_t *ptl;
6741
6742 i_mmap_assert_locked(mapping);
6743 vma_interval_tree_foreach(svma, &mapping->i_mmap, idx, idx) {
6744 if (svma == vma)
6745 continue;
6746
6747 saddr = page_table_shareable(svma, vma, addr, idx);
6748 if (saddr) {
6749 spte = huge_pte_offset(svma->vm_mm, saddr,
6750 vma_mmu_pagesize(svma));
6751 if (spte) {
6752 get_page(virt_to_page(spte));
6753 break;
6754 }
6755 }
6756 }
6757
6758 if (!spte)
6759 goto out;
6760
6761 ptl = huge_pte_lock(hstate_vma(vma), mm, spte);
6762 if (pud_none(*pud)) {
6763 pud_populate(mm, pud,
6764 (pmd_t *)((unsigned long)spte & PAGE_MASK));
6765 mm_inc_nr_pmds(mm);
6766 } else {
6767 put_page(virt_to_page(spte));
6768 }
6769 spin_unlock(ptl);
6770 out:
6771 pte = (pte_t *)pmd_alloc(mm, pud, addr);
6772 return pte;
6773 }
6774
/*
 * Unmap a huge PTE backed by a shared PMD page table.  The page-table page is
 * refcounted at mapping time: if the count is greater than one, the PUD entry
 * is cleared and the reference dropped, returning 1; if this is the last
 * user, nothing is done and 0 is returned.  Called with the page-table lock
 * held and i_mmap_rwsem held in write mode.
 */
6787 int huge_pmd_unshare(struct mm_struct *mm, struct vm_area_struct *vma,
6788 unsigned long addr, pte_t *ptep)
6789 {
6790 pgd_t *pgd = pgd_offset(mm, addr);
6791 p4d_t *p4d = p4d_offset(pgd, addr);
6792 pud_t *pud = pud_offset(p4d, addr);
6793
6794 i_mmap_assert_write_locked(vma->vm_file->f_mapping);
6795 BUG_ON(page_count(virt_to_page(ptep)) == 0);
6796 if (page_count(virt_to_page(ptep)) == 1)
6797 return 0;
6798
6799 pud_clear(pud);
6800 put_page(virt_to_page(ptep));
6801 mm_dec_nr_pmds(mm);
6802 return 1;
6803 }
6804
6805 #else
6806 pte_t *huge_pmd_share(struct mm_struct *mm, struct vm_area_struct *vma,
6807 unsigned long addr, pud_t *pud)
6808 {
6809 return NULL;
6810 }
6811
6812 int huge_pmd_unshare(struct mm_struct *mm, struct vm_area_struct *vma,
6813 unsigned long addr, pte_t *ptep)
6814 {
6815 return 0;
6816 }
6817
6818 void adjust_range_if_pmd_sharing_possible(struct vm_area_struct *vma,
6819 unsigned long *start, unsigned long *end)
6820 {
6821 }
6822
6823 bool want_pmd_share(struct vm_area_struct *vma, unsigned long addr)
6824 {
6825 return false;
6826 }
6827 #endif
6828
6829 #ifdef CONFIG_ARCH_WANT_GENERAL_HUGETLB
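/*
 * Generic page-table allocation for a huge mapping of size @sz at @addr:
 * a PUD entry is used directly for PUD_SIZE pages, otherwise a PMD is
 * allocated (shared with other mappings when possible).
 */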
6830 pte_t *huge_pte_alloc(struct mm_struct *mm, struct vm_area_struct *vma,
6831 unsigned long addr, unsigned long sz)
6832 {
6833 pgd_t *pgd;
6834 p4d_t *p4d;
6835 pud_t *pud;
6836 pte_t *pte = NULL;
6837
6838 pgd = pgd_offset(mm, addr);
6839 p4d = p4d_alloc(mm, pgd, addr);
6840 if (!p4d)
6841 return NULL;
6842 pud = pud_alloc(mm, p4d, addr);
6843 if (pud) {
6844 if (sz == PUD_SIZE) {
6845 pte = (pte_t *)pud;
6846 } else {
6847 BUG_ON(sz != PMD_SIZE);
6848 if (want_pmd_share(vma, addr) && pud_none(*pud))
6849 pte = huge_pmd_share(mm, vma, addr, pud);
6850 else
6851 pte = (pte_t *)pmd_alloc(mm, pud, addr);
6852 }
6853 }
6854 BUG_ON(pte && pte_present(*pte) && !pte_huge(*pte));
6855
6856 return pte;
6857 }
6858
/*
 * huge_pte_offset() - walk the page table and return the entry mapping @addr
 * for a huge page of size @sz, i.e. the PUD for PUD_SIZE pages or the PMD
 * otherwise.  Returns NULL if an upper level of the walk is not present; the
 * returned entry itself may still be none or non-present, so callers must
 * check it.
 */
6868 pte_t *huge_pte_offset(struct mm_struct *mm,
6869 unsigned long addr, unsigned long sz)
6870 {
6871 pgd_t *pgd;
6872 p4d_t *p4d;
6873 pud_t *pud;
6874 pmd_t *pmd;
6875
6876 pgd = pgd_offset(mm, addr);
6877 if (!pgd_present(*pgd))
6878 return NULL;
6879 p4d = p4d_offset(pgd, addr);
6880 if (!p4d_present(*p4d))
6881 return NULL;
6882
6883 pud = pud_offset(p4d, addr);
6884 if (sz == PUD_SIZE)
6885
6886 return (pte_t *)pud;
6887 if (!pud_present(*pud))
6888 return NULL;
6889
6890
6891 pmd = pmd_offset(pud, addr);
6892
6893 return (pte_t *)pmd;
6894 }
6895
/*
 * Return a mask that, OR-ed into an address, yields the last huge page
 * covered by the current page-table page.  Used to skip over non-present
 * upper-level entries when scanning address ranges linearly; architectures
 * with different layouts can override this.
 */
6903 unsigned long hugetlb_mask_last_page(struct hstate *h)
6904 {
6905 unsigned long hp_size = huge_page_size(h);
6906
6907 if (hp_size == PUD_SIZE)
6908 return P4D_SIZE - PUD_SIZE;
6909 else if (hp_size == PMD_SIZE)
6910 return PUD_SIZE - PMD_SIZE;
6911 else
6912 return 0UL;
6913 }
6914
6915 #else
6916
6917
6918 __weak unsigned long hugetlb_mask_last_page(struct hstate *h)
6919 {
6920 #ifdef CONFIG_ARCH_WANT_HUGE_PMD_SHARE
6921 if (huge_page_size(h) == PMD_SIZE)
6922 return PUD_SIZE - PMD_SIZE;
6923 #endif
6924 return 0UL;
6925 }
6926
6927 #endif
6928
/*
 * The follow_huge_*() helpers below are __weak so that architectures with
 * unusual huge page table layouts can override them.
 */
6933 struct page * __weak
6934 follow_huge_addr(struct mm_struct *mm, unsigned long address,
6935 int write)
6936 {
6937 return ERR_PTR(-EINVAL);
6938 }
6939
6940 struct page * __weak
6941 follow_huge_pd(struct vm_area_struct *vma,
6942 unsigned long address, hugepd_t hpd, int flags, int pdshift)
6943 {
6944 WARN(1, "hugepd follow called with no support for hugepage directory format\n");
6945 return NULL;
6946 }
6947
6948 struct page * __weak
6949 follow_huge_pmd(struct mm_struct *mm, unsigned long address,
6950 pmd_t *pmd, int flags)
6951 {
6952 struct page *page = NULL;
6953 spinlock_t *ptl;
6954 pte_t pte;
6955
6956
6957
6958
6959
6960 if (WARN_ON_ONCE(flags & FOLL_PIN))
6961 return NULL;
6962
6963 retry:
6964 ptl = pmd_lockptr(mm, pmd);
6965 spin_lock(ptl);
6966
6967
6968
6969
6970 if (!pmd_huge(*pmd))
6971 goto out;
6972 pte = huge_ptep_get((pte_t *)pmd);
6973 if (pte_present(pte)) {
6974 page = pmd_page(*pmd) + ((address & ~PMD_MASK) >> PAGE_SHIFT);
/*
 * try_grab_page() should always succeed here: the PMD lock is held and
 * the huge page was just seen present, so it cannot go away.  The only
 * failure mode is a reference count overflow.
 */
6983 if (WARN_ON_ONCE(!try_grab_page(page, flags))) {
6984 page = NULL;
6985 goto out;
6986 }
6987 } else {
6988 if (is_hugetlb_entry_migration(pte)) {
6989 spin_unlock(ptl);
6990 __migration_entry_wait_huge((pte_t *)pmd, ptl);
6991 goto retry;
6992 }
6993
6994
6995
6996
6997 }
6998 out:
6999 spin_unlock(ptl);
7000 return page;
7001 }
7002
7003 struct page * __weak
7004 follow_huge_pud(struct mm_struct *mm, unsigned long address,
7005 pud_t *pud, int flags)
7006 {
7007 struct page *page = NULL;
7008 spinlock_t *ptl;
7009 pte_t pte;
7010
7011 if (WARN_ON_ONCE(flags & FOLL_PIN))
7012 return NULL;
7013
7014 retry:
7015 ptl = huge_pte_lock(hstate_sizelog(PUD_SHIFT), mm, (pte_t *)pud);
7016 if (!pud_huge(*pud))
7017 goto out;
7018 pte = huge_ptep_get((pte_t *)pud);
7019 if (pte_present(pte)) {
7020 page = pud_page(*pud) + ((address & ~PUD_MASK) >> PAGE_SHIFT);
7021 if (WARN_ON_ONCE(!try_grab_page(page, flags))) {
7022 page = NULL;
7023 goto out;
7024 }
7025 } else {
7026 if (is_hugetlb_entry_migration(pte)) {
7027 spin_unlock(ptl);
7028 __migration_entry_wait(mm, (pte_t *)pud, ptl);
7029 goto retry;
7030 }
7031
7032
7033
7034
7035 }
7036 out:
7037 spin_unlock(ptl);
7038 return page;
7039 }
7040
7041 struct page * __weak
7042 follow_huge_pgd(struct mm_struct *mm, unsigned long address, pgd_t *pgd, int flags)
7043 {
7044 if (flags & (FOLL_GET | FOLL_PIN))
7045 return NULL;
7046
7047 return pte_page(*(pte_t *)pgd) + ((address & ~PGDIR_MASK) >> PAGE_SHIFT);
7048 }
7049
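/*
 * Try to isolate a huge page for migration: it must be a migratable head
 * page with a non-zero refcount.  On success the page is moved to @list
 * with an extra reference held; returns -EBUSY otherwise.
 */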
7050 int isolate_hugetlb(struct page *page, struct list_head *list)
7051 {
7052 int ret = 0;
7053
7054 spin_lock_irq(&hugetlb_lock);
7055 if (!PageHeadHuge(page) ||
7056 !HPageMigratable(page) ||
7057 !get_page_unless_zero(page)) {
7058 ret = -EBUSY;
7059 goto unlock;
7060 }
7061 ClearHPageMigratable(page);
7062 list_move_tail(&page->lru, list);
7063 unlock:
7064 spin_unlock_irq(&hugetlb_lock);
7065 return ret;
7066 }
7067
7068 int get_hwpoison_huge_page(struct page *page, bool *hugetlb)
7069 {
7070 int ret = 0;
7071
7072 *hugetlb = false;
7073 spin_lock_irq(&hugetlb_lock);
7074 if (PageHeadHuge(page)) {
7075 *hugetlb = true;
7076 if (HPageFreed(page))
7077 ret = 0;
7078 else if (HPageMigratable(page))
7079 ret = get_page_unless_zero(page);
7080 else
7081 ret = -EBUSY;
7082 }
7083 spin_unlock_irq(&hugetlb_lock);
7084 return ret;
7085 }
7086
7087 int get_huge_page_for_hwpoison(unsigned long pfn, int flags)
7088 {
7089 int ret;
7090
7091 spin_lock_irq(&hugetlb_lock);
7092 ret = __get_huge_page_for_hwpoison(pfn, flags);
7093 spin_unlock_irq(&hugetlb_lock);
7094 return ret;
7095 }
7096
7097 void putback_active_hugepage(struct page *page)
7098 {
7099 spin_lock_irq(&hugetlb_lock);
7100 SetHPageMigratable(page);
7101 list_move_tail(&page->lru, &(page_hstate(page))->hugepage_activelist);
7102 spin_unlock_irq(&hugetlb_lock);
7103 put_page(page);
7104 }
7105
7106 void move_hugetlb_state(struct page *oldpage, struct page *newpage, int reason)
7107 {
7108 struct hstate *h = page_hstate(oldpage);
7109
7110 hugetlb_cgroup_migrate(oldpage, newpage);
7111 set_page_owner_migrate_reason(newpage, reason);
7112
/*
 * Transfer the temporary state from the new page to the old one: the new
 * page becomes the permanent copy while the old one is about to be freed.
 * The per-node surplus counts must follow, otherwise that state is lost
 * when the two pages live on different nodes.
 */
7123 if (HPageTemporary(newpage)) {
7124 int old_nid = page_to_nid(oldpage);
7125 int new_nid = page_to_nid(newpage);
7126
7127 SetHPageTemporary(oldpage);
7128 ClearHPageTemporary(newpage);
7129
7130
7131
7132
7133
7134 if (new_nid == old_nid)
7135 return;
7136 spin_lock_irq(&hugetlb_lock);
7137 if (h->surplus_huge_pages_node[old_nid]) {
7138 h->surplus_huge_pages_node[old_nid]--;
7139 h->surplus_huge_pages_node[new_nid]++;
7140 }
7141 spin_unlock_irq(&hugetlb_lock);
7142 }
7143 }
7144
/*
 * Unconditionally unshare all shared PMD page tables within the PUD-aligned
 * portion of a hugetlbfs VMA.
 */
7149 void hugetlb_unshare_all_pmds(struct vm_area_struct *vma)
7150 {
7151 struct hstate *h = hstate_vma(vma);
7152 unsigned long sz = huge_page_size(h);
7153 struct mm_struct *mm = vma->vm_mm;
7154 struct mmu_notifier_range range;
7155 unsigned long address, start, end;
7156 spinlock_t *ptl;
7157 pte_t *ptep;
7158
7159 if (!(vma->vm_flags & VM_MAYSHARE))
7160 return;
7161
7162 start = ALIGN(vma->vm_start, PUD_SIZE);
7163 end = ALIGN_DOWN(vma->vm_end, PUD_SIZE);
7164
7165 if (start >= end)
7166 return;
7167
7168 flush_cache_range(vma, start, end);
7169
7170
7171
7172
7173 mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, mm,
7174 start, end);
7175 mmu_notifier_invalidate_range_start(&range);
7176 i_mmap_lock_write(vma->vm_file->f_mapping);
7177 for (address = start; address < end; address += PUD_SIZE) {
7178 ptep = huge_pte_offset(mm, address, sz);
7179 if (!ptep)
7180 continue;
7181 ptl = huge_pte_lock(h, mm, ptep);
7182 huge_pmd_unshare(mm, vma, address, ptep);
7183 spin_unlock(ptl);
7184 }
7185 flush_hugetlb_tlb_range(vma, start, end);
7186 i_mmap_unlock_write(vma->vm_file->f_mapping);
7187
7188
7189
7190
7191 mmu_notifier_invalidate_range_end(&range);
7192 }
7193
7194 #ifdef CONFIG_CMA
7195 static bool cma_reserve_called __initdata;
7196
7197 static int __init cmdline_parse_hugetlb_cma(char *p)
7198 {
7199 int nid, count = 0;
7200 unsigned long tmp;
7201 char *s = p;
7202
7203 while (*s) {
7204 if (sscanf(s, "%lu%n", &tmp, &count) != 1)
7205 break;
7206
7207 if (s[count] == ':') {
7208 if (tmp >= MAX_NUMNODES)
7209 break;
7210 nid = array_index_nospec(tmp, MAX_NUMNODES);
7211
7212 s += count + 1;
7213 tmp = memparse(s, &s);
7214 hugetlb_cma_size_in_node[nid] = tmp;
7215 hugetlb_cma_size += tmp;
7216
7217
7218
7219
7220
7221 if (*s == ',')
7222 s++;
7223 else
7224 break;
7225 } else {
7226 hugetlb_cma_size = memparse(p, &p);
7227 break;
7228 }
7229 }
7230
7231 return 0;
7232 }
7233
7234 early_param("hugetlb_cma", cmdline_parse_hugetlb_cma);
7235
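/*
 * Reserve CMA areas for gigantic pages of the given @order, either split
 * evenly across the online nodes or per node as requested on the
 * "hugetlb_cma=" command line.
 */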
7236 void __init hugetlb_cma_reserve(int order)
7237 {
7238 unsigned long size, reserved, per_node;
7239 bool node_specific_cma_alloc = false;
7240 int nid;
7241
7242 cma_reserve_called = true;
7243
7244 if (!hugetlb_cma_size)
7245 return;
7246
7247 for (nid = 0; nid < MAX_NUMNODES; nid++) {
7248 if (hugetlb_cma_size_in_node[nid] == 0)
7249 continue;
7250
7251 if (!node_online(nid)) {
7252 pr_warn("hugetlb_cma: invalid node %d specified\n", nid);
7253 hugetlb_cma_size -= hugetlb_cma_size_in_node[nid];
7254 hugetlb_cma_size_in_node[nid] = 0;
7255 continue;
7256 }
7257
7258 if (hugetlb_cma_size_in_node[nid] < (PAGE_SIZE << order)) {
7259 pr_warn("hugetlb_cma: cma area of node %d should be at least %lu MiB\n",
7260 nid, (PAGE_SIZE << order) / SZ_1M);
7261 hugetlb_cma_size -= hugetlb_cma_size_in_node[nid];
7262 hugetlb_cma_size_in_node[nid] = 0;
7263 } else {
7264 node_specific_cma_alloc = true;
7265 }
7266 }
7267
7268
7269 if (!hugetlb_cma_size)
7270 return;
7271
7272 if (hugetlb_cma_size < (PAGE_SIZE << order)) {
7273 pr_warn("hugetlb_cma: cma area should be at least %lu MiB\n",
7274 (PAGE_SIZE << order) / SZ_1M);
7275 hugetlb_cma_size = 0;
7276 return;
7277 }
7278
7279 if (!node_specific_cma_alloc) {
/*
 * Without per-node sizes, spread the requested CMA size evenly over the
 * online nodes (rounded up to the huge page size per node), stopping
 * once the total is covered.
 */
7284 per_node = DIV_ROUND_UP(hugetlb_cma_size, nr_online_nodes);
7285 pr_info("hugetlb_cma: reserve %lu MiB, up to %lu MiB per node\n",
7286 hugetlb_cma_size / SZ_1M, per_node / SZ_1M);
7287 }
7288
7289 reserved = 0;
7290 for_each_online_node(nid) {
7291 int res;
7292 char name[CMA_MAX_NAME];
7293
7294 if (node_specific_cma_alloc) {
7295 if (hugetlb_cma_size_in_node[nid] == 0)
7296 continue;
7297
7298 size = hugetlb_cma_size_in_node[nid];
7299 } else {
7300 size = min(per_node, hugetlb_cma_size - reserved);
7301 }
7302
7303 size = round_up(size, PAGE_SIZE << order);
7304
7305 snprintf(name, sizeof(name), "hugetlb%d", nid);
7306
7307
7308
7309
7310
7311 res = cma_declare_contiguous_nid(0, size, 0,
7312 PAGE_SIZE << HUGETLB_PAGE_ORDER,
7313 0, false, name,
7314 &hugetlb_cma[nid], nid);
7315 if (res) {
7316 pr_warn("hugetlb_cma: reservation failed: err %d, node %d",
7317 res, nid);
7318 continue;
7319 }
7320
7321 reserved += size;
7322 pr_info("hugetlb_cma: reserved %lu MiB on node %d\n",
7323 size / SZ_1M, nid);
7324
7325 if (reserved >= hugetlb_cma_size)
7326 break;
7327 }
7328
7329 if (!reserved)
7330
7331
7332
7333
7334 hugetlb_cma_size = 0;
7335 }
7336
7337 void __init hugetlb_cma_check(void)
7338 {
7339 if (!hugetlb_cma_size || cma_reserve_called)
7340 return;
7341
7342 pr_warn("hugetlb_cma: the option isn't supported by current arch\n");
7343 }
7344
7345 #endif