the-tree/mm/hugetlb.c

0001 // SPDX-License-Identifier: GPL-2.0-only
0002 /*
0003  * Generic hugetlb support.
0004  * (C) Nadia Yvette Chambers, April 2004
0005  */
0006 #include <linux/list.h>
0007 #include <linux/init.h>
0008 #include <linux/mm.h>
0009 #include <linux/seq_file.h>
0010 #include <linux/sysctl.h>
0011 #include <linux/highmem.h>
0012 #include <linux/mmu_notifier.h>
0013 #include <linux/nodemask.h>
0014 #include <linux/pagemap.h>
0015 #include <linux/mempolicy.h>
0016 #include <linux/compiler.h>
0017 #include <linux/cpuset.h>
0018 #include <linux/mutex.h>
0019 #include <linux/memblock.h>
0020 #include <linux/sysfs.h>
0021 #include <linux/slab.h>
0022 #include <linux/sched/mm.h>
0023 #include <linux/mmdebug.h>
0024 #include <linux/sched/signal.h>
0025 #include <linux/rmap.h>
0026 #include <linux/string_helpers.h>
0027 #include <linux/swap.h>
0028 #include <linux/swapops.h>
0029 #include <linux/jhash.h>
0030 #include <linux/numa.h>
0031 #include <linux/llist.h>
0032 #include <linux/cma.h>
0033 #include <linux/migrate.h>
0034 #include <linux/nospec.h>
0035 #include <linux/delayacct.h>
0036
0037 #include <asm/page.h>
0038 #include <asm/pgalloc.h>
0039 #include <asm/tlb.h>
0040
0041 #include <linux/io.h>
0042 #include <linux/hugetlb.h>
0043 #include <linux/hugetlb_cgroup.h>
0044 #include <linux/node.h>
0045 #include <linux/page_owner.h>
0046 #include "internal.h"
0047 #include "hugetlb_vmemmap.h"
0048
0049 int hugetlb_max_hstate __read_mostly;
0050 unsigned int default_hstate_idx;
0051 struct hstate hstates[HUGE_MAX_HSTATE];
0052
0053 #ifdef CONFIG_CMA
0054 static struct cma *hugetlb_cma[MAX_NUMNODES];
0055 static unsigned long hugetlb_cma_size_in_node[MAX_NUMNODES] __initdata;
0056 static bool hugetlb_cma_page(struct page *page, unsigned int order)
0057 {
0058     return cma_pages_valid(hugetlb_cma[page_to_nid(page)], page,
0059                 1 << order);
0060 }
0061 #else
0062 static bool hugetlb_cma_page(struct page *page, unsigned int order)
0063 {
0064     return false;
0065 }
0066 #endif
0067 static unsigned long hugetlb_cma_size __initdata;
0068
0069 __initdata LIST_HEAD(huge_boot_pages);
0070
0071 /* for command line parsing */
0072 static struct hstate * __initdata parsed_hstate;
0073 static unsigned long __initdata default_hstate_max_huge_pages;
0074 static bool __initdata parsed_valid_hugepagesz = true;
0075 static bool __initdata parsed_default_hugepagesz;
0076 static unsigned int default_hugepages_in_node[MAX_NUMNODES] __initdata;
0077
0078 /*
0079  * Protects updates to hugepage_freelists, hugepage_activelist, nr_huge_pages,
0080  * free_huge_pages, and surplus_huge_pages.
0081  */
0082 DEFINE_SPINLOCK(hugetlb_lock);
0083
0084 /*
0085  * Serializes faults on the same logical page.  This is used to
0086  * prevent spurious OOMs when the hugepage pool is fully utilized.
0087  */
0088 static int num_fault_mutexes;
0089 struct mutex *hugetlb_fault_mutex_table ____cacheline_aligned_in_smp;
0090
0091 /* Forward declaration */
0092 static int hugetlb_acct_memory(struct hstate *h, long delta);
0093
0094 static inline bool subpool_is_free(struct hugepage_subpool *spool)
0095 {
0096     if (spool->count)
0097         return false;
0098     if (spool->max_hpages != -1)
0099         return spool->used_hpages == 0;
0100     if (spool->min_hpages != -1)
0101         return spool->rsv_hpages == spool->min_hpages;
0102
0103     return true;
0104 }
0105
0106 static inline void unlock_or_release_subpool(struct hugepage_subpool *spool,
0107                         unsigned long irq_flags)
0108 {
0109     spin_unlock_irqrestore(&spool->lock, irq_flags);
0110
0111     /* If no pages are used, and no other handles to the subpool
0112      * remain, give up any reservations based on minimum size and
0113      * free the subpool */
0114     if (subpool_is_free(spool)) {
0115         if (spool->min_hpages != -1)
0116             hugetlb_acct_memory(spool->hstate,
0117                         -spool->min_hpages);
0118         kfree(spool);
0119     }
0120 }
0121
0122 struct hugepage_subpool *hugepage_new_subpool(struct hstate *h, long max_hpages,
0123                         long min_hpages)
0124 {
0125     struct hugepage_subpool *spool;
0126
0127     spool = kzalloc(sizeof(*spool), GFP_KERNEL);
0128     if (!spool)
0129         return NULL;
0130
0131     spin_lock_init(&spool->lock);
0132     spool->count = 1;
0133     spool->max_hpages = max_hpages;
0134     spool->hstate = h;
0135     spool->min_hpages = min_hpages;
0136
0137     if (min_hpages != -1 && hugetlb_acct_memory(h, min_hpages)) {
0138         kfree(spool);
0139         return NULL;
0140     }
0141     spool->rsv_hpages = min_hpages;
0142
0143     return spool;
0144 }
0145
0146 void hugepage_put_subpool(struct hugepage_subpool *spool)
0147 {
0148     unsigned long flags;
0149
0150     spin_lock_irqsave(&spool->lock, flags);
0151     BUG_ON(!spool->count);
0152     spool->count--;
0153     unlock_or_release_subpool(spool, flags);
0154 }
0155
0156 /*
0157  * Subpool accounting for allocating and reserving pages.
0158  * Return -ENOMEM if there are not enough resources to satisfy the
0159  * request.  Otherwise, return the number of pages by which the
0160  * global pools must be adjusted (upward).  The returned value may
0161  * only be different than the passed value (delta) in the case where
0162  * a subpool minimum size must be maintained.
0163  */
0164 static long hugepage_subpool_get_pages(struct hugepage_subpool *spool,
0165                       long delta)
0166 {
0167     long ret = delta;
0168
0169     if (!spool)
0170         return ret;
0171
0172     spin_lock_irq(&spool->lock);
0173
0174     if (spool->max_hpages != -1) {      /* maximum size accounting */
0175         if ((spool->used_hpages + delta) <= spool->max_hpages)
0176             spool->used_hpages += delta;
0177         else {
0178             ret = -ENOMEM;
0179             goto unlock_ret;
0180         }
0181     }
0182
0183     /* minimum size accounting */
0184     if (spool->min_hpages != -1 && spool->rsv_hpages) {
0185         if (delta > spool->rsv_hpages) {
0186             /*
0187              * Asking for more reserves than those already taken on
0188              * behalf of subpool.  Return difference.
0189              */
0190             ret = delta - spool->rsv_hpages;
0191             spool->rsv_hpages = 0;
0192         } else {
0193             ret = 0;    /* reserves already accounted for */
0194             spool->rsv_hpages -= delta;
0195         }
0196     }
0197
0198 unlock_ret:
0199     spin_unlock_irq(&spool->lock);
0200     return ret;
0201 }
0202
0203 /*
0204  * Subpool accounting for freeing and unreserving pages.
0205  * Return the number of global page reservations that must be dropped.
0206  * The return value may only be different than the passed value (delta)
0207  * in the case where a subpool minimum size must be maintained.
0208  */
0209 static long hugepage_subpool_put_pages(struct hugepage_subpool *spool,
0210                        long delta)
0211 {
0212     long ret = delta;
0213     unsigned long flags;
0214
0215     if (!spool)
0216         return delta;
0217
0218     spin_lock_irqsave(&spool->lock, flags);
0219
0220     if (spool->max_hpages != -1)        /* maximum size accounting */
0221         spool->used_hpages -= delta;
0222
0223      /* minimum size accounting */
0224     if (spool->min_hpages != -1 && spool->used_hpages < spool->min_hpages) {
0225         if (spool->rsv_hpages + delta <= spool->min_hpages)
0226             ret = 0;
0227         else
0228             ret = spool->rsv_hpages + delta - spool->min_hpages;
0229
0230         spool->rsv_hpages += delta;
0231         if (spool->rsv_hpages > spool->min_hpages)
0232             spool->rsv_hpages = spool->min_hpages;
0233     }
0234
0235     /*
0236      * If hugetlbfs_put_super couldn't free spool due to an outstanding
0237      * quota reference, free it now.
0238      */
0239     unlock_or_release_subpool(spool, flags);
0240
0241     return ret;
0242 }
0243
0244 static inline struct hugepage_subpool *subpool_inode(struct inode *inode)
0245 {
0246     return HUGETLBFS_SB(inode->i_sb)->spool;
0247 }
0248
0249 static inline struct hugepage_subpool *subpool_vma(struct vm_area_struct *vma)
0250 {
0251     return subpool_inode(file_inode(vma->vm_file));
0252 }
0253
0254 /* Helper that removes a struct file_region from the resv_map cache and returns
0255  * it for use.
0256  */
0257 static struct file_region *
0258 get_file_region_entry_from_cache(struct resv_map *resv, long from, long to)
0259 {
0260     struct file_region *nrg = NULL;
0261
0262     VM_BUG_ON(resv->region_cache_count <= 0);
0263
0264     resv->region_cache_count--;
0265     nrg = list_first_entry(&resv->region_cache, struct file_region, link);
0266     list_del(&nrg->link);
0267
0268     nrg->from = from;
0269     nrg->to = to;
0270
0271     return nrg;
0272 }
0273
0274 static void copy_hugetlb_cgroup_uncharge_info(struct file_region *nrg,
0275                           struct file_region *rg)
0276 {
0277 #ifdef CONFIG_CGROUP_HUGETLB
0278     nrg->reservation_counter = rg->reservation_counter;
0279     nrg->css = rg->css;
0280     if (rg->css)
0281         css_get(rg->css);
0282 #endif
0283 }
0284
0285 /* Helper that records hugetlb_cgroup uncharge info. */
0286 static void record_hugetlb_cgroup_uncharge_info(struct hugetlb_cgroup *h_cg,
0287                         struct hstate *h,
0288                         struct resv_map *resv,
0289                         struct file_region *nrg)
0290 {
0291 #ifdef CONFIG_CGROUP_HUGETLB
0292     if (h_cg) {
0293         nrg->reservation_counter =
0294             &h_cg->rsvd_hugepage[hstate_index(h)];
0295         nrg->css = &h_cg->css;
0296         /*
0297          * The caller will hold exactly one h_cg->css reference for the
0298          * whole contiguous reservation region. But this area might be
0299          * scattered when there are already some file_regions reside in
0300          * it. As a result, many file_regions may share only one css
0301          * reference. In order to ensure that one file_region must hold
0302          * exactly one h_cg->css reference, we should do css_get for
0303          * each file_region and leave the reference held by caller
0304          * untouched.
0305          */
0306         css_get(&h_cg->css);
0307         if (!resv->pages_per_hpage)
0308             resv->pages_per_hpage = pages_per_huge_page(h);
0309         /* pages_per_hpage should be the same for all entries in
0310          * a resv_map.
0311          */
0312         VM_BUG_ON(resv->pages_per_hpage != pages_per_huge_page(h));
0313     } else {
0314         nrg->reservation_counter = NULL;
0315         nrg->css = NULL;
0316     }
0317 #endif
0318 }
0319
0320 static void put_uncharge_info(struct file_region *rg)
0321 {
0322 #ifdef CONFIG_CGROUP_HUGETLB
0323     if (rg->css)
0324         css_put(rg->css);
0325 #endif
0326 }
0327
0328 static bool has_same_uncharge_info(struct file_region *rg,
0329                    struct file_region *org)
0330 {
0331 #ifdef CONFIG_CGROUP_HUGETLB
0332     return rg->reservation_counter == org->reservation_counter &&
0333            rg->css == org->css;
0334
0335 #else
0336     return true;
0337 #endif
0338 }
0339
0340 static void coalesce_file_region(struct resv_map *resv, struct file_region *rg)
0341 {
0342     struct file_region *nrg = NULL, *prg = NULL;
0343
0344     prg = list_prev_entry(rg, link);
0345     if (&prg->link != &resv->regions && prg->to == rg->from &&
0346         has_same_uncharge_info(prg, rg)) {
0347         prg->to = rg->to;
0348
0349         list_del(&rg->link);
0350         put_uncharge_info(rg);
0351         kfree(rg);
0352
0353         rg = prg;
0354     }
0355
0356     nrg = list_next_entry(rg, link);
0357     if (&nrg->link != &resv->regions && nrg->from == rg->to &&
0358         has_same_uncharge_info(nrg, rg)) {
0359         nrg->from = rg->from;
0360
0361         list_del(&rg->link);
0362         put_uncharge_info(rg);
0363         kfree(rg);
0364     }
0365 }
0366
0367 static inline long
0368 hugetlb_resv_map_add(struct resv_map *map, struct list_head *rg, long from,
0369              long to, struct hstate *h, struct hugetlb_cgroup *cg,
0370              long *regions_needed)
0371 {
0372     struct file_region *nrg;
0373
0374     if (!regions_needed) {
0375         nrg = get_file_region_entry_from_cache(map, from, to);
0376         record_hugetlb_cgroup_uncharge_info(cg, h, map, nrg);
0377         list_add(&nrg->link, rg);
0378         coalesce_file_region(map, nrg);
0379     } else
0380         *regions_needed += 1;
0381
0382     return to - from;
0383 }
0384
0385 /*
0386  * Must be called with resv->lock held.
0387  *
0388  * Calling this with regions_needed != NULL will count the number of pages
0389  * to be added but will not modify the linked list. And regions_needed will
0390  * indicate the number of file_regions needed in the cache to carry out to add
0391  * the regions for this range.
0392  */
0393 static long add_reservation_in_range(struct resv_map *resv, long f, long t,
0394                      struct hugetlb_cgroup *h_cg,
0395                      struct hstate *h, long *regions_needed)
0396 {
0397     long add = 0;
0398     struct list_head *head = &resv->regions;
0399     long last_accounted_offset = f;
0400     struct file_region *iter, *trg = NULL;
0401     struct list_head *rg = NULL;
0402
0403     if (regions_needed)
0404         *regions_needed = 0;
0405
0406     /* In this loop, we essentially handle an entry for the range
0407      * [last_accounted_offset, iter->from), at every iteration, with some
0408      * bounds checking.
0409      */
0410     list_for_each_entry_safe(iter, trg, head, link) {
0411         /* Skip irrelevant regions that start before our range. */
0412         if (iter->from < f) {
0413             /* If this region ends after the last accounted offset,
0414              * then we need to update last_accounted_offset.
0415              */
0416             if (iter->to > last_accounted_offset)
0417                 last_accounted_offset = iter->to;
0418             continue;
0419         }
0420
0421         /* When we find a region that starts beyond our range, we've
0422          * finished.
0423          */
0424         if (iter->from >= t) {
0425             rg = iter->link.prev;
0426             break;
0427         }
0428
0429         /* Add an entry for last_accounted_offset -> iter->from, and
0430          * update last_accounted_offset.
0431          */
0432         if (iter->from > last_accounted_offset)
0433             add += hugetlb_resv_map_add(resv, iter->link.prev,
0434                             last_accounted_offset,
0435                             iter->from, h, h_cg,
0436                             regions_needed);
0437
0438         last_accounted_offset = iter->to;
0439     }
0440
0441     /* Handle the case where our range extends beyond
0442      * last_accounted_offset.
0443      */
0444     if (!rg)
0445         rg = head->prev;
0446     if (last_accounted_offset < t)
0447         add += hugetlb_resv_map_add(resv, rg, last_accounted_offset,
0448                         t, h, h_cg, regions_needed);
0449
0450     return add;
0451 }
0452
0453 /* Must be called with resv->lock acquired. Will drop lock to allocate entries.
0454  */
0455 static int allocate_file_region_entries(struct resv_map *resv,
0456                     int regions_needed)
0457     __must_hold(&resv->lock)
0458 {
0459     struct list_head allocated_regions;
0460     int to_allocate = 0, i = 0;
0461     struct file_region *trg = NULL, *rg = NULL;
0462
0463     VM_BUG_ON(regions_needed < 0);
0464
0465     INIT_LIST_HEAD(&allocated_regions);
0466
0467     /*
0468      * Check for sufficient descriptors in the cache to accommodate
0469      * the number of in progress add operations plus regions_needed.
0470      *
0471      * This is a while loop because when we drop the lock, some other call
0472      * to region_add or region_del may have consumed some region_entries,
0473      * so we keep looping here until we finally have enough entries for
0474      * (adds_in_progress + regions_needed).
0475      */
0476     while (resv->region_cache_count <
0477            (resv->adds_in_progress + regions_needed)) {
0478         to_allocate = resv->adds_in_progress + regions_needed -
0479                   resv->region_cache_count;
0480
0481         /* At this point, we should have enough entries in the cache
0482          * for all the existing adds_in_progress. We should only be
0483          * needing to allocate for regions_needed.
0484          */
0485         VM_BUG_ON(resv->region_cache_count < resv->adds_in_progress);
0486
0487         spin_unlock(&resv->lock);
0488         for (i = 0; i < to_allocate; i++) {
0489             trg = kmalloc(sizeof(*trg), GFP_KERNEL);
0490             if (!trg)
0491                 goto out_of_memory;
0492             list_add(&trg->link, &allocated_regions);
0493         }
0494
0495         spin_lock(&resv->lock);
0496
0497         list_splice(&allocated_regions, &resv->region_cache);
0498         resv->region_cache_count += to_allocate;
0499     }
0500
0501     return 0;
0502
0503 out_of_memory:
0504     list_for_each_entry_safe(rg, trg, &allocated_regions, link) {
0505         list_del(&rg->link);
0506         kfree(rg);
0507     }
0508     return -ENOMEM;
0509 }
0510
0511 /*
0512  * Add the huge page range represented by [f, t) to the reserve
0513  * map.  Regions will be taken from the cache to fill in this range.
0514  * Sufficient regions should exist in the cache due to the previous
0515  * call to region_chg with the same range, but in some cases the cache will not
0516  * have sufficient entries due to races with other code doing region_add or
0517  * region_del.  The extra needed entries will be allocated.
0518  *
0519  * regions_needed is the out value provided by a previous call to region_chg.
0520  *
0521  * Return the number of new huge pages added to the map.  This number is greater
0522  * than or equal to zero.  If file_region entries needed to be allocated for
0523  * this operation and we were not able to allocate, it returns -ENOMEM.
0524  * region_add of regions of length 1 never allocate file_regions and cannot
0525  * fail; region_chg will always allocate at least 1 entry and a region_add for
0526  * 1 page will only require at most 1 entry.
0527  */
0528 static long region_add(struct resv_map *resv, long f, long t,
0529                long in_regions_needed, struct hstate *h,
0530                struct hugetlb_cgroup *h_cg)
0531 {
0532     long add = 0, actual_regions_needed = 0;
0533
0534     spin_lock(&resv->lock);
0535 retry:
0536
0537     /* Count how many regions are actually needed to execute this add. */
0538     add_reservation_in_range(resv, f, t, NULL, NULL,
0539                  &actual_regions_needed);
0540
0541     /*
0542      * Check for sufficient descriptors in the cache to accommodate
0543      * this add operation. Note that actual_regions_needed may be greater
0544      * than in_regions_needed, as the resv_map may have been modified since
0545      * the region_chg call. In this case, we need to make sure that we
0546      * allocate extra entries, such that we have enough for all the
0547      * existing adds_in_progress, plus the excess needed for this
0548      * operation.
0549      */
0550     if (actual_regions_needed > in_regions_needed &&
0551         resv->region_cache_count <
0552             resv->adds_in_progress +
0553                 (actual_regions_needed - in_regions_needed)) {
0554         /* region_add operation of range 1 should never need to
0555          * allocate file_region entries.
0556          */
0557         VM_BUG_ON(t - f <= 1);
0558
0559         if (allocate_file_region_entries(
0560                 resv, actual_regions_needed - in_regions_needed)) {
0561             return -ENOMEM;
0562         }
0563
0564         goto retry;
0565     }
0566
0567     add = add_reservation_in_range(resv, f, t, h_cg, h, NULL);
0568
0569     resv->adds_in_progress -= in_regions_needed;
0570
0571     spin_unlock(&resv->lock);
0572     return add;
0573 }
0574
0575 /*
0576  * Examine the existing reserve map and determine how many
0577  * huge pages in the specified range [f, t) are NOT currently
0578  * represented.  This routine is called before a subsequent
0579  * call to region_add that will actually modify the reserve
0580  * map to add the specified range [f, t).  region_chg does
0581  * not change the number of huge pages represented by the
0582  * map.  A number of new file_region structures is added to the cache as a
0583  * placeholder, for the subsequent region_add call to use. At least 1
0584  * file_region structure is added.
0585  *
0586  * out_regions_needed is the number of regions added to the
0587  * resv->adds_in_progress.  This value needs to be provided to a follow up call
0588  * to region_add or region_abort for proper accounting.
0589  *
0590  * Returns the number of huge pages that need to be added to the existing
0591  * reservation map for the range [f, t).  This number is greater or equal to
0592  * zero.  -ENOMEM is returned if a new file_region structure or cache entry
0593  * is needed and can not be allocated.
0594  */
0595 static long region_chg(struct resv_map *resv, long f, long t,
0596                long *out_regions_needed)
0597 {
0598     long chg = 0;
0599
0600     spin_lock(&resv->lock);
0601
0602     /* Count how many hugepages in this range are NOT represented. */
0603     chg = add_reservation_in_range(resv, f, t, NULL, NULL,
0604                        out_regions_needed);
0605
0606     if (*out_regions_needed == 0)
0607         *out_regions_needed = 1;
0608
0609     if (allocate_file_region_entries(resv, *out_regions_needed))
0610         return -ENOMEM;
0611
0612     resv->adds_in_progress += *out_regions_needed;
0613
0614     spin_unlock(&resv->lock);
0615     return chg;
0616 }
0617
0618 /*
0619  * Abort the in progress add operation.  The adds_in_progress field
0620  * of the resv_map keeps track of the operations in progress between
0621  * calls to region_chg and region_add.  Operations are sometimes
0622  * aborted after the call to region_chg.  In such cases, region_abort
0623  * is called to decrement the adds_in_progress counter. regions_needed
0624  * is the value returned by the region_chg call, it is used to decrement
0625  * the adds_in_progress counter.
0626  *
0627  * NOTE: The range arguments [f, t) are not needed or used in this
0628  * routine.  They are kept to make reading the calling code easier as
0629  * arguments will match the associated region_chg call.
0630  */
0631 static void region_abort(struct resv_map *resv, long f, long t,
0632              long regions_needed)
0633 {
0634     spin_lock(&resv->lock);
0635     VM_BUG_ON(!resv->region_cache_count);
0636     resv->adds_in_progress -= regions_needed;
0637     spin_unlock(&resv->lock);
0638 }
0639
0640 /*
0641  * Delete the specified range [f, t) from the reserve map.  If the
0642  * t parameter is LONG_MAX, this indicates that ALL regions after f
0643  * should be deleted.  Locate the regions which intersect [f, t)
0644  * and either trim, delete or split the existing regions.
0645  *
0646  * Returns the number of huge pages deleted from the reserve map.
0647  * In the normal case, the return value is zero or more.  In the
0648  * case where a region must be split, a new region descriptor must
0649  * be allocated.  If the allocation fails, -ENOMEM will be returned.
0650  * NOTE: If the parameter t == LONG_MAX, then we will never split
0651  * a region and possibly return -ENOMEM.  Callers specifying
0652  * t == LONG_MAX do not need to check for -ENOMEM error.
0653  */
0654 static long region_del(struct resv_map *resv, long f, long t)
0655 {
0656     struct list_head *head = &resv->regions;
0657     struct file_region *rg, *trg;
0658     struct file_region *nrg = NULL;
0659     long del = 0;
0660
0661 retry:
0662     spin_lock(&resv->lock);
0663     list_for_each_entry_safe(rg, trg, head, link) {
0664         /*
0665          * Skip regions before the range to be deleted.  file_region
0666          * ranges are normally of the form [from, to).  However, there
0667          * may be a "placeholder" entry in the map which is of the form
0668          * (from, to) with from == to.  Check for placeholder entries
0669          * at the beginning of the range to be deleted.
0670          */
0671         if (rg->to <= f && (rg->to != rg->from || rg->to != f))
0672             continue;
0673
0674         if (rg->from >= t)
0675             break;
0676
0677         if (f > rg->from && t < rg->to) { /* Must split region */
0678             /*
0679              * Check for an entry in the cache before dropping
0680              * lock and attempting allocation.
0681              */
0682             if (!nrg &&
0683                 resv->region_cache_count > resv->adds_in_progress) {
0684                 nrg = list_first_entry(&resv->region_cache,
0685                             struct file_region,
0686                             link);
0687                 list_del(&nrg->link);
0688                 resv->region_cache_count--;
0689             }
0690
0691             if (!nrg) {
0692                 spin_unlock(&resv->lock);
0693                 nrg = kmalloc(sizeof(*nrg), GFP_KERNEL);
0694                 if (!nrg)
0695                     return -ENOMEM;
0696                 goto retry;
0697             }
0698
0699             del += t - f;
0700             hugetlb_cgroup_uncharge_file_region(
0701                 resv, rg, t - f, false);
0702
0703             /* New entry for end of split region */
0704             nrg->from = t;
0705             nrg->to = rg->to;
0706
0707             copy_hugetlb_cgroup_uncharge_info(nrg, rg);
0708
0709             INIT_LIST_HEAD(&nrg->link);
0710
0711             /* Original entry is trimmed */
0712             rg->to = f;
0713
0714             list_add(&nrg->link, &rg->link);
0715             nrg = NULL;
0716             break;
0717         }
0718
0719         if (f <= rg->from && t >= rg->to) { /* Remove entire region */
0720             del += rg->to - rg->from;
0721             hugetlb_cgroup_uncharge_file_region(resv, rg,
0722                                 rg->to - rg->from, true);
0723             list_del(&rg->link);
0724             kfree(rg);
0725             continue;
0726         }
0727
0728         if (f <= rg->from) {    /* Trim beginning of region */
0729             hugetlb_cgroup_uncharge_file_region(resv, rg,
0730                                 t - rg->from, false);
0731
0732             del += t - rg->from;
0733             rg->from = t;
0734         } else {        /* Trim end of region */
0735             hugetlb_cgroup_uncharge_file_region(resv, rg,
0736                                 rg->to - f, false);
0737
0738             del += rg->to - f;
0739             rg->to = f;
0740         }
0741     }
0742
0743     spin_unlock(&resv->lock);
0744     kfree(nrg);
0745     return del;
0746 }
0747
0748 /*
0749  * A rare out of memory error was encountered which prevented removal of
0750  * the reserve map region for a page.  The huge page itself was free'ed
0751  * and removed from the page cache.  This routine will adjust the subpool
0752  * usage count, and the global reserve count if needed.  By incrementing
0753  * these counts, the reserve map entry which could not be deleted will
0754  * appear as a "reserved" entry instead of simply dangling with incorrect
0755  * counts.
0756  */
0757 void hugetlb_fix_reserve_counts(struct inode *inode)
0758 {
0759     struct hugepage_subpool *spool = subpool_inode(inode);
0760     long rsv_adjust;
0761     bool reserved = false;
0762
0763     rsv_adjust = hugepage_subpool_get_pages(spool, 1);
0764     if (rsv_adjust > 0) {
0765         struct hstate *h = hstate_inode(inode);
0766
0767         if (!hugetlb_acct_memory(h, 1))
0768             reserved = true;
0769     } else if (!rsv_adjust) {
0770         reserved = true;
0771     }
0772
0773     if (!reserved)
0774         pr_warn("hugetlb: Huge Page Reserved count may go negative.\n");
0775 }
0776
0777 /*
0778  * Count and return the number of huge pages in the reserve map
0779  * that intersect with the range [f, t).
0780  */
0781 static long region_count(struct resv_map *resv, long f, long t)
0782 {
0783     struct list_head *head = &resv->regions;
0784     struct file_region *rg;
0785     long chg = 0;
0786
0787     spin_lock(&resv->lock);
0788     /* Locate each segment we overlap with, and count that overlap. */
0789     list_for_each_entry(rg, head, link) {
0790         long seg_from;
0791         long seg_to;
0792
0793         if (rg->to <= f)
0794             continue;
0795         if (rg->from >= t)
0796             break;
0797
0798         seg_from = max(rg->from, f);
0799         seg_to = min(rg->to, t);
0800
0801         chg += seg_to - seg_from;
0802     }
0803     spin_unlock(&resv->lock);
0804
0805     return chg;
0806 }
0807
0808 /*
0809  * Convert the address within this vma to the page offset within
0810  * the mapping, in pagecache page units; huge pages here.
0811  */
0812 static pgoff_t vma_hugecache_offset(struct hstate *h,
0813             struct vm_area_struct *vma, unsigned long address)
0814 {
0815     return ((address - vma->vm_start) >> huge_page_shift(h)) +
0816             (vma->vm_pgoff >> huge_page_order(h));
0817 }
0818
0819 pgoff_t linear_hugepage_index(struct vm_area_struct *vma,
0820                      unsigned long address)
0821 {
0822     return vma_hugecache_offset(hstate_vma(vma), vma, address);
0823 }
0824 EXPORT_SYMBOL_GPL(linear_hugepage_index);
0825
0826 /*
0827  * Return the size of the pages allocated when backing a VMA. In the majority
0828  * cases this will be same size as used by the page table entries.
0829  */
0830 unsigned long vma_kernel_pagesize(struct vm_area_struct *vma)
0831 {
0832     if (vma->vm_ops && vma->vm_ops->pagesize)
0833         return vma->vm_ops->pagesize(vma);
0834     return PAGE_SIZE;
0835 }
0836 EXPORT_SYMBOL_GPL(vma_kernel_pagesize);
0837
0838 /*
0839  * Return the page size being used by the MMU to back a VMA. In the majority
0840  * of cases, the page size used by the kernel matches the MMU size. On
0841  * architectures where it differs, an architecture-specific 'strong'
0842  * version of this symbol is required.
0843  */
0844 __weak unsigned long vma_mmu_pagesize(struct vm_area_struct *vma)
0845 {
0846     return vma_kernel_pagesize(vma);
0847 }
0848
0849 /*
0850  * Flags for MAP_PRIVATE reservations.  These are stored in the bottom
0851  * bits of the reservation map pointer, which are always clear due to
0852  * alignment.
0853  */
0854 #define HPAGE_RESV_OWNER    (1UL << 0)
0855 #define HPAGE_RESV_UNMAPPED (1UL << 1)
0856 #define HPAGE_RESV_MASK (HPAGE_RESV_OWNER | HPAGE_RESV_UNMAPPED)
0857
0858 /*
0859  * These helpers are used to track how many pages are reserved for
0860  * faults in a MAP_PRIVATE mapping. Only the process that called mmap()
0861  * is guaranteed to have their future faults succeed.
0862  *
0863  * With the exception of reset_vma_resv_huge_pages() which is called at fork(),
0864  * the reserve counters are updated with the hugetlb_lock held. It is safe
0865  * to reset the VMA at fork() time as it is not in use yet and there is no
0866  * chance of the global counters getting corrupted as a result of the values.
0867  *
0868  * The private mapping reservation is represented in a subtly different
0869  * manner to a shared mapping.  A shared mapping has a region map associated
0870  * with the underlying file, this region map represents the backing file
0871  * pages which have ever had a reservation assigned which this persists even
0872  * after the page is instantiated.  A private mapping has a region map
0873  * associated with the original mmap which is attached to all VMAs which
0874  * reference it, this region map represents those offsets which have consumed
0875  * reservation ie. where pages have been instantiated.
0876  */
0877 static unsigned long get_vma_private_data(struct vm_area_struct *vma)
0878 {
0879     return (unsigned long)vma->vm_private_data;
0880 }
0881
0882 static void set_vma_private_data(struct vm_area_struct *vma,
0883                             unsigned long value)
0884 {
0885     vma->vm_private_data = (void *)value;
0886 }
0887
0888 static void
0889 resv_map_set_hugetlb_cgroup_uncharge_info(struct resv_map *resv_map,
0890                       struct hugetlb_cgroup *h_cg,
0891                       struct hstate *h)
0892 {
0893 #ifdef CONFIG_CGROUP_HUGETLB
0894     if (!h_cg || !h) {
0895         resv_map->reservation_counter = NULL;
0896         resv_map->pages_per_hpage = 0;
0897         resv_map->css = NULL;
0898     } else {
0899         resv_map->reservation_counter =
0900             &h_cg->rsvd_hugepage[hstate_index(h)];
0901         resv_map->pages_per_hpage = pages_per_huge_page(h);
0902         resv_map->css = &h_cg->css;
0903     }
0904 #endif
0905 }
0906
0907 struct resv_map *resv_map_alloc(void)
0908 {
0909     struct resv_map *resv_map = kmalloc(sizeof(*resv_map), GFP_KERNEL);
0910     struct file_region *rg = kmalloc(sizeof(*rg), GFP_KERNEL);
0911
0912     if (!resv_map || !rg) {
0913         kfree(resv_map);
0914         kfree(rg);
0915         return NULL;
0916     }
0917
0918     kref_init(&resv_map->refs);
0919     spin_lock_init(&resv_map->lock);
0920     INIT_LIST_HEAD(&resv_map->regions);
0921
0922     resv_map->adds_in_progress = 0;
0923     /*
0924      * Initialize these to 0. On shared mappings, 0's here indicate these
0925      * fields don't do cgroup accounting. On private mappings, these will be
0926      * re-initialized to the proper values, to indicate that hugetlb cgroup
0927      * reservations are to be un-charged from here.
0928      */
0929     resv_map_set_hugetlb_cgroup_uncharge_info(resv_map, NULL, NULL);
0930
0931     INIT_LIST_HEAD(&resv_map->region_cache);
0932     list_add(&rg->link, &resv_map->region_cache);
0933     resv_map->region_cache_count = 1;
0934
0935     return resv_map;
0936 }
0937
0938 void resv_map_release(struct kref *ref)
0939 {
0940     struct resv_map *resv_map = container_of(ref, struct resv_map, refs);
0941     struct list_head *head = &resv_map->region_cache;
0942     struct file_region *rg, *trg;
0943
0944     /* Clear out any active regions before we release the map. */
0945     region_del(resv_map, 0, LONG_MAX);
0946
0947     /* ... and any entries left in the cache */
0948     list_for_each_entry_safe(rg, trg, head, link) {
0949         list_del(&rg->link);
0950         kfree(rg);
0951     }
0952
0953     VM_BUG_ON(resv_map->adds_in_progress);
0954
0955     kfree(resv_map);
0956 }
0957
0958 static inline struct resv_map *inode_resv_map(struct inode *inode)
0959 {
0960     /*
0961      * At inode evict time, i_mapping may not point to the original
0962      * address space within the inode.  This original address space
0963      * contains the pointer to the resv_map.  So, always use the
0964      * address space embedded within the inode.
0965      * The VERY common case is inode->mapping == &inode->i_data but,
0966      * this may not be true for device special inodes.
0967      */
0968     return (struct resv_map *)(&inode->i_data)->private_data;
0969 }
0970
0971 static struct resv_map *vma_resv_map(struct vm_area_struct *vma)
0972 {
0973     VM_BUG_ON_VMA(!is_vm_hugetlb_page(vma), vma);
0974     if (vma->vm_flags & VM_MAYSHARE) {
0975         struct address_space *mapping = vma->vm_file->f_mapping;
0976         struct inode *inode = mapping->host;
0977
0978         return inode_resv_map(inode);
0979
0980     } else {
0981         return (struct resv_map *)(get_vma_private_data(vma) &
0982                             ~HPAGE_RESV_MASK);
0983     }
0984 }
0985
0986 static void set_vma_resv_map(struct vm_area_struct *vma, struct resv_map *map)
0987 {
0988     VM_BUG_ON_VMA(!is_vm_hugetlb_page(vma), vma);
0989     VM_BUG_ON_VMA(vma->vm_flags & VM_MAYSHARE, vma);
0990
0991     set_vma_private_data(vma, (get_vma_private_data(vma) &
0992                 HPAGE_RESV_MASK) | (unsigned long)map);
0993 }
0994
0995 static void set_vma_resv_flags(struct vm_area_struct *vma, unsigned long flags)
0996 {
0997     VM_BUG_ON_VMA(!is_vm_hugetlb_page(vma), vma);
0998     VM_BUG_ON_VMA(vma->vm_flags & VM_MAYSHARE, vma);
0999
1000     set_vma_private_data(vma, get_vma_private_data(vma) | flags);
1001 }
1002
1003 static int is_vma_resv_set(struct vm_area_struct *vma, unsigned long flag)
1004 {
1005     VM_BUG_ON_VMA(!is_vm_hugetlb_page(vma), vma);
1006
1007     return (get_vma_private_data(vma) & flag) != 0;
1008 }
1009
1010 /* Reset counters to 0 and clear all HPAGE_RESV_* flags */
1011 void reset_vma_resv_huge_pages(struct vm_area_struct *vma)
1012 {
1013     VM_BUG_ON_VMA(!is_vm_hugetlb_page(vma), vma);
1014     if (!(vma->vm_flags & VM_MAYSHARE))
1015         vma->vm_private_data = (void *)0;
1016 }
1017
1018 /*
1019  * Reset and decrement one ref on hugepage private reservation.
1020  * Called with mm->mmap_sem writer semaphore held.
1021  * This function should be only used by move_vma() and operate on
1022  * same sized vma. It should never come here with last ref on the
1023  * reservation.
1024  */
1025 void clear_vma_resv_huge_pages(struct vm_area_struct *vma)
1026 {
1027     /*
1028      * Clear the old hugetlb private page reservation.
1029      * It has already been transferred to new_vma.
1030      *
1031      * During a mremap() operation of a hugetlb vma we call move_vma()
1032      * which copies vma into new_vma and unmaps vma. After the copy
1033      * operation both new_vma and vma share a reference to the resv_map
1034      * struct, and at that point vma is about to be unmapped. We don't
1035      * want to return the reservation to the pool at unmap of vma because
1036      * the reservation still lives on in new_vma, so simply decrement the
1037      * ref here and remove the resv_map reference from this vma.
1038      */
1039     struct resv_map *reservations = vma_resv_map(vma);
1040
1041     if (reservations && is_vma_resv_set(vma, HPAGE_RESV_OWNER)) {
1042         resv_map_put_hugetlb_cgroup_uncharge_info(reservations);
1043         kref_put(&reservations->refs, resv_map_release);
1044     }
1045
1046     reset_vma_resv_huge_pages(vma);
1047 }
1048
1049 /* Returns true if the VMA has associated reserve pages */
1050 static bool vma_has_reserves(struct vm_area_struct *vma, long chg)
1051 {
1052     if (vma->vm_flags & VM_NORESERVE) {
1053         /*
1054          * This address is already reserved by other process(chg == 0),
1055          * so, we should decrement reserved count. Without decrementing,
1056          * reserve count remains after releasing inode, because this
1057          * allocated page will go into page cache and is regarded as
1058          * coming from reserved pool in releasing step.  Currently, we
1059          * don't have any other solution to deal with this situation
1060          * properly, so add work-around here.
1061          */
1062         if (vma->vm_flags & VM_MAYSHARE && chg == 0)
1063             return true;
1064         else
1065             return false;
1066     }
1067
1068     /* Shared mappings always use reserves */
1069     if (vma->vm_flags & VM_MAYSHARE) {
1070         /*
1071          * We know VM_NORESERVE is not set.  Therefore, there SHOULD
1072          * be a region map for all pages.  The only situation where
1073          * there is no region map is if a hole was punched via
1074          * fallocate.  In this case, there really are no reserves to
1075          * use.  This situation is indicated if chg != 0.
1076          */
1077         if (chg)
1078             return false;
1079         else
1080             return true;
1081     }
1082
1083     /*
1084      * Only the process that called mmap() has reserves for
1085      * private mappings.
1086      */
1087     if (is_vma_resv_set(vma, HPAGE_RESV_OWNER)) {
1088         /*
1089          * Like the shared case above, a hole punch or truncate
1090          * could have been performed on the private mapping.
1091          * Examine the value of chg to determine if reserves
1092          * actually exist or were previously consumed.
1093          * Very Subtle - The value of chg comes from a previous
1094          * call to vma_needs_reserves().  The reserve map for
1095          * private mappings has different (opposite) semantics
1096          * than that of shared mappings.  vma_needs_reserves()
1097          * has already taken this difference in semantics into
1098          * account.  Therefore, the meaning of chg is the same
1099          * as in the shared case above.  Code could easily be
1100          * combined, but keeping it separate draws attention to
1101          * subtle differences.
1102          */
1103         if (chg)
1104             return false;
1105         else
1106             return true;
1107     }
1108
1109     return false;
1110 }
1111
1112 static void enqueue_huge_page(struct hstate *h, struct page *page)
1113 {
1114     int nid = page_to_nid(page);
1115
1116     lockdep_assert_held(&hugetlb_lock);
1117     VM_BUG_ON_PAGE(page_count(page), page);
1118
1119     list_move(&page->lru, &h->hugepage_freelists[nid]);
1120     h->free_huge_pages++;
1121     h->free_huge_pages_node[nid]++;
1122     SetHPageFreed(page);
1123 }
1124
1125 static struct page *dequeue_huge_page_node_exact(struct hstate *h, int nid)
1126 {
1127     struct page *page;
1128     bool pin = !!(current->flags & PF_MEMALLOC_PIN);
1129
1130     lockdep_assert_held(&hugetlb_lock);
1131     list_for_each_entry(page, &h->hugepage_freelists[nid], lru) {
1132         if (pin && !is_longterm_pinnable_page(page))
1133             continue;
1134
1135         if (PageHWPoison(page))
1136             continue;
1137
1138         list_move(&page->lru, &h->hugepage_activelist);
1139         set_page_refcounted(page);
1140         ClearHPageFreed(page);
1141         h->free_huge_pages--;
1142         h->free_huge_pages_node[nid]--;
1143         return page;
1144     }
1145
1146     return NULL;
1147 }
1148
1149 static struct page *dequeue_huge_page_nodemask(struct hstate *h, gfp_t gfp_mask, int nid,
1150         nodemask_t *nmask)
1151 {
1152     unsigned int cpuset_mems_cookie;
1153     struct zonelist *zonelist;
1154     struct zone *zone;
1155     struct zoneref *z;
1156     int node = NUMA_NO_NODE;
1157
1158     zonelist = node_zonelist(nid, gfp_mask);
1159
1160 retry_cpuset:
1161     cpuset_mems_cookie = read_mems_allowed_begin();
1162     for_each_zone_zonelist_nodemask(zone, z, zonelist, gfp_zone(gfp_mask), nmask) {
1163         struct page *page;
1164
1165         if (!cpuset_zone_allowed(zone, gfp_mask))
1166             continue;
1167         /*
1168          * no need to ask again on the same node. Pool is node rather than
1169          * zone aware
1170          */
1171         if (zone_to_nid(zone) == node)
1172             continue;
1173         node = zone_to_nid(zone);
1174
1175         page = dequeue_huge_page_node_exact(h, node);
1176         if (page)
1177             return page;
1178     }
1179     if (unlikely(read_mems_allowed_retry(cpuset_mems_cookie)))
1180         goto retry_cpuset;
1181
1182     return NULL;
1183 }
1184
1185 static struct page *dequeue_huge_page_vma(struct hstate *h,
1186                 struct vm_area_struct *vma,
1187                 unsigned long address, int avoid_reserve,
1188                 long chg)
1189 {
1190     struct page *page = NULL;
1191     struct mempolicy *mpol;
1192     gfp_t gfp_mask;
1193     nodemask_t *nodemask;
1194     int nid;
1195
1196     /*
1197      * A child process with MAP_PRIVATE mappings created by their parent
1198      * have no page reserves. This check ensures that reservations are
1199      * not "stolen". The child may still get SIGKILLed
1200      */
1201     if (!vma_has_reserves(vma, chg) &&
1202             h->free_huge_pages - h->resv_huge_pages == 0)
1203         goto err;
1204
1205     /* If reserves cannot be used, ensure enough pages are in the pool */
1206     if (avoid_reserve && h->free_huge_pages - h->resv_huge_pages == 0)
1207         goto err;
1208
1209     gfp_mask = htlb_alloc_mask(h);
1210     nid = huge_node(vma, address, gfp_mask, &mpol, &nodemask);
1211
1212     if (mpol_is_preferred_many(mpol)) {
1213         page = dequeue_huge_page_nodemask(h, gfp_mask, nid, nodemask);
1214
1215         /* Fallback to all nodes if page==NULL */
1216         nodemask = NULL;
1217     }
1218
1219     if (!page)
1220         page = dequeue_huge_page_nodemask(h, gfp_mask, nid, nodemask);
1221
1222     if (page && !avoid_reserve && vma_has_reserves(vma, chg)) {
1223         SetHPageRestoreReserve(page);
1224         h->resv_huge_pages--;
1225     }
1226
1227     mpol_cond_put(mpol);
1228     return page;
1229
1230 err:
1231     return NULL;
1232 }
1233
1234 /*
1235  * common helper functions for hstate_next_node_to_{alloc|free}.
1236  * We may have allocated or freed a huge page based on a different
1237  * nodes_allowed previously, so h->next_node_to_{alloc|free} might
1238  * be outside of *nodes_allowed.  Ensure that we use an allowed
1239  * node for alloc or free.
1240  */
1241 static int next_node_allowed(int nid, nodemask_t *nodes_allowed)
1242 {
1243     nid = next_node_in(nid, *nodes_allowed);
1244     VM_BUG_ON(nid >= MAX_NUMNODES);
1245
1246     return nid;
1247 }
1248
1249 static int get_valid_node_allowed(int nid, nodemask_t *nodes_allowed)
1250 {
1251     if (!node_isset(nid, *nodes_allowed))
1252         nid = next_node_allowed(nid, nodes_allowed);
1253     return nid;
1254 }
1255
1256 /*
1257  * returns the previously saved node ["this node"] from which to
1258  * allocate a persistent huge page for the pool and advance the
1259  * next node from which to allocate, handling wrap at end of node
1260  * mask.
1261  */
1262 static int hstate_next_node_to_alloc(struct hstate *h,
1263                     nodemask_t *nodes_allowed)
1264 {
1265     int nid;
1266
1267     VM_BUG_ON(!nodes_allowed);
1268
1269     nid = get_valid_node_allowed(h->next_nid_to_alloc, nodes_allowed);
1270     h->next_nid_to_alloc = next_node_allowed(nid, nodes_allowed);
1271
1272     return nid;
1273 }
1274
1275 /*
1276  * helper for remove_pool_huge_page() - return the previously saved
1277  * node ["this node"] from which to free a huge page.  Advance the
1278  * next node id whether or not we find a free huge page to free so
1279  * that the next attempt to free addresses the next node.
1280  */
1281 static int hstate_next_node_to_free(struct hstate *h, nodemask_t *nodes_allowed)
1282 {
1283     int nid;
1284
1285     VM_BUG_ON(!nodes_allowed);
1286
1287     nid = get_valid_node_allowed(h->next_nid_to_free, nodes_allowed);
1288     h->next_nid_to_free = next_node_allowed(nid, nodes_allowed);
1289
1290     return nid;
1291 }
1292
1293 #define for_each_node_mask_to_alloc(hs, nr_nodes, node, mask)       \
1294     for (nr_nodes = nodes_weight(*mask);                \
1295         nr_nodes > 0 &&                     \
1296         ((node = hstate_next_node_to_alloc(hs, mask)) || 1);    \
1297         nr_nodes--)
1298
1299 #define for_each_node_mask_to_free(hs, nr_nodes, node, mask)        \
1300     for (nr_nodes = nodes_weight(*mask);                \
1301         nr_nodes > 0 &&                     \
1302         ((node = hstate_next_node_to_free(hs, mask)) || 1); \
1303         nr_nodes--)
1304
1305 /* used to demote non-gigantic_huge pages as well */
1306 static void __destroy_compound_gigantic_page(struct page *page,
1307                     unsigned int order, bool demote)
1308 {
1309     int i;
1310     int nr_pages = 1 << order;
1311     struct page *p = page + 1;
1312
1313     atomic_set(compound_mapcount_ptr(page), 0);
1314     atomic_set(compound_pincount_ptr(page), 0);
1315
1316     for (i = 1; i < nr_pages; i++, p = mem_map_next(p, page, i)) {
1317         p->mapping = NULL;
1318         clear_compound_head(p);
1319         if (!demote)
1320             set_page_refcounted(p);
1321     }
1322
1323     set_compound_order(page, 0);
1324 #ifdef CONFIG_64BIT
1325     page[1].compound_nr = 0;
1326 #endif
1327     __ClearPageHead(page);
1328 }
1329
1330 static void destroy_compound_hugetlb_page_for_demote(struct page *page,
1331                     unsigned int order)
1332 {
1333     __destroy_compound_gigantic_page(page, order, true);
1334 }
1335
1336 #ifdef CONFIG_ARCH_HAS_GIGANTIC_PAGE
1337 static void destroy_compound_gigantic_page(struct page *page,
1338                     unsigned int order)
1339 {
1340     __destroy_compound_gigantic_page(page, order, false);
1341 }
1342
1343 static void free_gigantic_page(struct page *page, unsigned int order)
1344 {
1345     /*
1346      * If the page isn't allocated using the cma allocator,
1347      * cma_release() returns false.
1348      */
1349 #ifdef CONFIG_CMA
1350     if (cma_release(hugetlb_cma[page_to_nid(page)], page, 1 << order))
1351         return;
1352 #endif
1353
1354     free_contig_range(page_to_pfn(page), 1 << order);
1355 }
1356
1357 #ifdef CONFIG_CONTIG_ALLOC
1358 static struct page *alloc_gigantic_page(struct hstate *h, gfp_t gfp_mask,
1359         int nid, nodemask_t *nodemask)
1360 {
1361     unsigned long nr_pages = pages_per_huge_page(h);
1362     if (nid == NUMA_NO_NODE)
1363         nid = numa_mem_id();
1364
1365 #ifdef CONFIG_CMA
1366     {
1367         struct page *page;
1368         int node;
1369
1370         if (hugetlb_cma[nid]) {
1371             page = cma_alloc(hugetlb_cma[nid], nr_pages,
1372                     huge_page_order(h), true);
1373             if (page)
1374                 return page;
1375         }
1376
1377         if (!(gfp_mask & __GFP_THISNODE)) {
1378             for_each_node_mask(node, *nodemask) {
1379                 if (node == nid || !hugetlb_cma[node])
1380                     continue;
1381
1382                 page = cma_alloc(hugetlb_cma[node], nr_pages,
1383                         huge_page_order(h), true);
1384                 if (page)
1385                     return page;
1386             }
1387         }
1388     }
1389 #endif
1390
1391     return alloc_contig_pages(nr_pages, gfp_mask, nid, nodemask);
1392 }
1393
1394 #else /* !CONFIG_CONTIG_ALLOC */
1395 static struct page *alloc_gigantic_page(struct hstate *h, gfp_t gfp_mask,
1396                     int nid, nodemask_t *nodemask)
1397 {
1398     return NULL;
1399 }
1400 #endif /* CONFIG_CONTIG_ALLOC */
1401
1402 #else /* !CONFIG_ARCH_HAS_GIGANTIC_PAGE */
1403 static struct page *alloc_gigantic_page(struct hstate *h, gfp_t gfp_mask,
1404                     int nid, nodemask_t *nodemask)
1405 {
1406     return NULL;
1407 }
1408 static inline void free_gigantic_page(struct page *page, unsigned int order) { }
1409 static inline void destroy_compound_gigantic_page(struct page *page,
1410                         unsigned int order) { }
1411 #endif
1412
1413 /*
1414  * Remove hugetlb page from lists, and update dtor so that page appears
1415  * as just a compound page.
1416  *
1417  * A reference is held on the page, except in the case of demote.
1418  *
1419  * Must be called with hugetlb lock held.
1420  */
1421 static void __remove_hugetlb_page(struct hstate *h, struct page *page,
1422                             bool adjust_surplus,
1423                             bool demote)
1424 {
1425     int nid = page_to_nid(page);
1426
1427     VM_BUG_ON_PAGE(hugetlb_cgroup_from_page(page), page);
1428     VM_BUG_ON_PAGE(hugetlb_cgroup_from_page_rsvd(page), page);
1429
1430     lockdep_assert_held(&hugetlb_lock);
1431     if (hstate_is_gigantic(h) && !gigantic_page_runtime_supported())
1432         return;
1433
1434     list_del(&page->lru);
1435
1436     if (HPageFreed(page)) {
1437         h->free_huge_pages--;
1438         h->free_huge_pages_node[nid]--;
1439     }
1440     if (adjust_surplus) {
1441         h->surplus_huge_pages--;
1442         h->surplus_huge_pages_node[nid]--;
1443     }
1444
1445     /*
1446      * Very subtle
1447      *
1448      * For non-gigantic pages set the destructor to the normal compound
1449      * page dtor.  This is needed in case someone takes an additional
1450      * temporary ref to the page, and freeing is delayed until they drop
1451      * their reference.
1452      *
1453      * For gigantic pages set the destructor to the null dtor.  This
1454      * destructor will never be called.  Before freeing the gigantic
1455      * page destroy_compound_gigantic_page will turn the compound page
1456      * into a simple group of pages.  After this the destructor does not
1457      * apply.
1458      *
1459      * This handles the case where more than one ref is held when and
1460      * after update_and_free_page is called.
1461      *
1462      * In the case of demote we do not ref count the page as it will soon
1463      * be turned into a page of smaller size.
1464      */
1465     if (!demote)
1466         set_page_refcounted(page);
1467     if (hstate_is_gigantic(h))
1468         set_compound_page_dtor(page, NULL_COMPOUND_DTOR);
1469     else
1470         set_compound_page_dtor(page, COMPOUND_PAGE_DTOR);
1471
1472     h->nr_huge_pages--;
1473     h->nr_huge_pages_node[nid]--;
1474 }
1475
1476 static void remove_hugetlb_page(struct hstate *h, struct page *page,
1477                             bool adjust_surplus)
1478 {
1479     __remove_hugetlb_page(h, page, adjust_surplus, false);
1480 }
1481
1482 static void remove_hugetlb_page_for_demote(struct hstate *h, struct page *page,
1483                             bool adjust_surplus)
1484 {
1485     __remove_hugetlb_page(h, page, adjust_surplus, true);
1486 }
1487
1488 static void add_hugetlb_page(struct hstate *h, struct page *page,
1489                  bool adjust_surplus)
1490 {
1491     int zeroed;
1492     int nid = page_to_nid(page);
1493
1494     VM_BUG_ON_PAGE(!HPageVmemmapOptimized(page), page);
1495
1496     lockdep_assert_held(&hugetlb_lock);
1497
1498     INIT_LIST_HEAD(&page->lru);
1499     h->nr_huge_pages++;
1500     h->nr_huge_pages_node[nid]++;
1501
1502     if (adjust_surplus) {
1503         h->surplus_huge_pages++;
1504         h->surplus_huge_pages_node[nid]++;
1505     }
1506
1507     set_compound_page_dtor(page, HUGETLB_PAGE_DTOR);
1508     set_page_private(page, 0);
1509     SetHPageVmemmapOptimized(page);
1510
1511     /*
1512      * This page is about to be managed by the hugetlb allocator and
1513      * should have no users.  Drop our reference, and check for others
1514      * just in case.
1515      */
1516     zeroed = put_page_testzero(page);
1517     if (!zeroed)
1518         /*
1519          * It is VERY unlikely soneone else has taken a ref on
1520          * the page.  In this case, we simply return as the
1521          * hugetlb destructor (free_huge_page) will be called
1522          * when this other ref is dropped.
1523          */
1524         return;
1525
1526     arch_clear_hugepage_flags(page);
1527     enqueue_huge_page(h, page);
1528 }
1529
1530 static void __update_and_free_page(struct hstate *h, struct page *page)
1531 {
1532     int i;
1533     struct page *subpage = page;
1534
1535     if (hstate_is_gigantic(h) && !gigantic_page_runtime_supported())
1536         return;
1537
1538     /*
1539      * If we don't know which subpages are hwpoisoned, we can't free
1540      * the hugepage, so it's leaked intentionally.
1541      */
1542     if (HPageRawHwpUnreliable(page))
1543         return;
1544
1545     if (hugetlb_vmemmap_restore(h, page)) {
1546         spin_lock_irq(&hugetlb_lock);
1547         /*
1548          * If we cannot allocate vmemmap pages, just refuse to free the
1549          * page and put the page back on the hugetlb free list and treat
1550          * as a surplus page.
1551          */
1552         add_hugetlb_page(h, page, true);
1553         spin_unlock_irq(&hugetlb_lock);
1554         return;
1555     }
1556
1557     /*
1558      * Move PageHWPoison flag from head page to the raw error pages,
1559      * which makes any healthy subpages reusable.
1560      */
1561     if (unlikely(PageHWPoison(page)))
1562         hugetlb_clear_page_hwpoison(page);
1563
1564     for (i = 0; i < pages_per_huge_page(h);
1565          i++, subpage = mem_map_next(subpage, page, i)) {
1566         subpage->flags &= ~(1 << PG_locked | 1 << PG_error |
1567                 1 << PG_referenced | 1 << PG_dirty |
1568                 1 << PG_active | 1 << PG_private |
1569                 1 << PG_writeback);
1570     }
1571
1572     /*
1573      * Non-gigantic pages demoted from CMA allocated gigantic pages
1574      * need to be given back to CMA in free_gigantic_page.
1575      */
1576     if (hstate_is_gigantic(h) ||
1577         hugetlb_cma_page(page, huge_page_order(h))) {
1578         destroy_compound_gigantic_page(page, huge_page_order(h));
1579         free_gigantic_page(page, huge_page_order(h));
1580     } else {
1581         __free_pages(page, huge_page_order(h));
1582     }
1583 }
1584
1585 /*
1586  * As update_and_free_page() can be called under any context, so we cannot
1587  * use GFP_KERNEL to allocate vmemmap pages. However, we can defer the
1588  * actual freeing in a workqueue to prevent from using GFP_ATOMIC to allocate
1589  * the vmemmap pages.
1590  *
1591  * free_hpage_workfn() locklessly retrieves the linked list of pages to be
1592  * freed and frees them one-by-one. As the page->mapping pointer is going
1593  * to be cleared in free_hpage_workfn() anyway, it is reused as the llist_node
1594  * structure of a lockless linked list of huge pages to be freed.
1595  */
1596 static LLIST_HEAD(hpage_freelist);
1597
1598 static void free_hpage_workfn(struct work_struct *work)
1599 {
1600     struct llist_node *node;
1601
1602     node = llist_del_all(&hpage_freelist);
1603
1604     while (node) {
1605         struct page *page;
1606         struct hstate *h;
1607
1608         page = container_of((struct address_space **)node,
1609                      struct page, mapping);
1610         node = node->next;
1611         page->mapping = NULL;
1612         /*
1613          * The VM_BUG_ON_PAGE(!PageHuge(page), page) in page_hstate()
1614          * is going to trigger because a previous call to
1615          * remove_hugetlb_page() will set_compound_page_dtor(page,
1616          * NULL_COMPOUND_DTOR), so do not use page_hstate() directly.
1617          */
1618         h = size_to_hstate(page_size(page));
1619
1620         __update_and_free_page(h, page);
1621
1622         cond_resched();
1623     }
1624 }
1625 static DECLARE_WORK(free_hpage_work, free_hpage_workfn);
1626
1627 static inline void flush_free_hpage_work(struct hstate *h)
1628 {
1629     if (hugetlb_vmemmap_optimizable(h))
1630         flush_work(&free_hpage_work);
1631 }
1632
1633 static void update_and_free_page(struct hstate *h, struct page *page,
1634                  bool atomic)
1635 {
1636     if (!HPageVmemmapOptimized(page) || !atomic) {
1637         __update_and_free_page(h, page);
1638         return;
1639     }
1640
1641     /*
1642      * Defer freeing to avoid using GFP_ATOMIC to allocate vmemmap pages.
1643      *
1644      * Only call schedule_work() if hpage_freelist is previously
1645      * empty. Otherwise, schedule_work() had been called but the workfn
1646      * hasn't retrieved the list yet.
1647      */
1648     if (llist_add((struct llist_node *)&page->mapping, &hpage_freelist))
1649         schedule_work(&free_hpage_work);
1650 }
1651
1652 static void update_and_free_pages_bulk(struct hstate *h, struct list_head *list)
1653 {
1654     struct page *page, *t_page;
1655
1656     list_for_each_entry_safe(page, t_page, list, lru) {
1657         update_and_free_page(h, page, false);
1658         cond_resched();
1659     }
1660 }
1661
1662 struct hstate *size_to_hstate(unsigned long size)
1663 {
1664     struct hstate *h;
1665
1666     for_each_hstate(h) {
1667         if (huge_page_size(h) == size)
1668             return h;
1669     }
1670     return NULL;
1671 }
1672
1673 void free_huge_page(struct page *page)
1674 {
1675     /*
1676      * Can't pass hstate in here because it is called from the
1677      * compound page destructor.
1678      */
1679     struct hstate *h = page_hstate(page);
1680     int nid = page_to_nid(page);
1681     struct hugepage_subpool *spool = hugetlb_page_subpool(page);
1682     bool restore_reserve;
1683     unsigned long flags;
1684
1685     VM_BUG_ON_PAGE(page_count(page), page);
1686     VM_BUG_ON_PAGE(page_mapcount(page), page);
1687
1688     hugetlb_set_page_subpool(page, NULL);
1689     if (PageAnon(page))
1690         __ClearPageAnonExclusive(page);
1691     page->mapping = NULL;
1692     restore_reserve = HPageRestoreReserve(page);
1693     ClearHPageRestoreReserve(page);
1694
1695     /*
1696      * If HPageRestoreReserve was set on page, page allocation consumed a
1697      * reservation.  If the page was associated with a subpool, there
1698      * would have been a page reserved in the subpool before allocation
1699      * via hugepage_subpool_get_pages().  Since we are 'restoring' the
1700      * reservation, do not call hugepage_subpool_put_pages() as this will
1701      * remove the reserved page from the subpool.
1702      */
1703     if (!restore_reserve) {
1704         /*
1705          * A return code of zero implies that the subpool will be
1706          * under its minimum size if the reservation is not restored
1707          * after page is free.  Therefore, force restore_reserve
1708          * operation.
1709          */
1710         if (hugepage_subpool_put_pages(spool, 1) == 0)
1711             restore_reserve = true;
1712     }
1713
1714     spin_lock_irqsave(&hugetlb_lock, flags);
1715     ClearHPageMigratable(page);
1716     hugetlb_cgroup_uncharge_page(hstate_index(h),
1717                      pages_per_huge_page(h), page);
1718     hugetlb_cgroup_uncharge_page_rsvd(hstate_index(h),
1719                       pages_per_huge_page(h), page);
1720     if (restore_reserve)
1721         h->resv_huge_pages++;
1722
1723     if (HPageTemporary(page)) {
1724         remove_hugetlb_page(h, page, false);
1725         spin_unlock_irqrestore(&hugetlb_lock, flags);
1726         update_and_free_page(h, page, true);
1727     } else if (h->surplus_huge_pages_node[nid]) {
1728         /* remove the page from active list */
1729         remove_hugetlb_page(h, page, true);
1730         spin_unlock_irqrestore(&hugetlb_lock, flags);
1731         update_and_free_page(h, page, true);
1732     } else {
1733         arch_clear_hugepage_flags(page);
1734         enqueue_huge_page(h, page);
1735         spin_unlock_irqrestore(&hugetlb_lock, flags);
1736     }
1737 }
1738
1739 /*
1740  * Must be called with the hugetlb lock held
1741  */
1742 static void __prep_account_new_huge_page(struct hstate *h, int nid)
1743 {
1744     lockdep_assert_held(&hugetlb_lock);
1745     h->nr_huge_pages++;
1746     h->nr_huge_pages_node[nid]++;
1747 }
1748
1749 static void __prep_new_huge_page(struct hstate *h, struct page *page)
1750 {
1751     hugetlb_vmemmap_optimize(h, page);
1752     INIT_LIST_HEAD(&page->lru);
1753     set_compound_page_dtor(page, HUGETLB_PAGE_DTOR);
1754     hugetlb_set_page_subpool(page, NULL);
1755     set_hugetlb_cgroup(page, NULL);
1756     set_hugetlb_cgroup_rsvd(page, NULL);
1757 }
1758
1759 static void prep_new_huge_page(struct hstate *h, struct page *page, int nid)
1760 {
1761     __prep_new_huge_page(h, page);
1762     spin_lock_irq(&hugetlb_lock);
1763     __prep_account_new_huge_page(h, nid);
1764     spin_unlock_irq(&hugetlb_lock);
1765 }
1766
1767 static bool __prep_compound_gigantic_page(struct page *page, unsigned int order,
1768                                 bool demote)
1769 {
1770     int i, j;
1771     int nr_pages = 1 << order;
1772     struct page *p = page + 1;
1773
1774     /* we rely on prep_new_huge_page to set the destructor */
1775     set_compound_order(page, order);
1776     __ClearPageReserved(page);
1777     __SetPageHead(page);
1778     for (i = 1; i < nr_pages; i++, p = mem_map_next(p, page, i)) {
1779         /*
1780          * For gigantic hugepages allocated through bootmem at
1781          * boot, it's safer to be consistent with the not-gigantic
1782          * hugepages and clear the PG_reserved bit from all tail pages
1783          * too.  Otherwise drivers using get_user_pages() to access tail
1784          * pages may get the reference counting wrong if they see
1785          * PG_reserved set on a tail page (despite the head page not
1786          * having PG_reserved set).  Enforcing this consistency between
1787          * head and tail pages allows drivers to optimize away a check
1788          * on the head page when they need know if put_page() is needed
1789          * after get_user_pages().
1790          */
1791         __ClearPageReserved(p);
1792         /*
1793          * Subtle and very unlikely
1794          *
1795          * Gigantic 'page allocators' such as memblock or cma will
1796          * return a set of pages with each page ref counted.  We need
1797          * to turn this set of pages into a compound page with tail
1798          * page ref counts set to zero.  Code such as speculative page
1799          * cache adding could take a ref on a 'to be' tail page.
1800          * We need to respect any increased ref count, and only set
1801          * the ref count to zero if count is currently 1.  If count
1802          * is not 1, we return an error.  An error return indicates
1803          * the set of pages can not be converted to a gigantic page.
1804          * The caller who allocated the pages should then discard the
1805          * pages using the appropriate free interface.
1806          *
1807          * In the case of demote, the ref count will be zero.
1808          */
1809         if (!demote) {
1810             if (!page_ref_freeze(p, 1)) {
1811                 pr_warn("HugeTLB page can not be used due to unexpected inflated ref count\n");
1812                 goto out_error;
1813             }
1814         } else {
1815             VM_BUG_ON_PAGE(page_count(p), p);
1816         }
1817         set_compound_head(p, page);
1818     }
1819     atomic_set(compound_mapcount_ptr(page), -1);
1820     atomic_set(compound_pincount_ptr(page), 0);
1821     return true;
1822
1823 out_error:
1824     /* undo tail page modifications made above */
1825     p = page + 1;
1826     for (j = 1; j < i; j++, p = mem_map_next(p, page, j)) {
1827         clear_compound_head(p);
1828         set_page_refcounted(p);
1829     }
1830     /* need to clear PG_reserved on remaining tail pages  */
1831     for (; j < nr_pages; j++, p = mem_map_next(p, page, j))
1832         __ClearPageReserved(p);
1833     set_compound_order(page, 0);
1834 #ifdef CONFIG_64BIT
1835     page[1].compound_nr = 0;
1836 #endif
1837     __ClearPageHead(page);
1838     return false;
1839 }
1840
1841 static bool prep_compound_gigantic_page(struct page *page, unsigned int order)
1842 {
1843     return __prep_compound_gigantic_page(page, order, false);
1844 }
1845
1846 static bool prep_compound_gigantic_page_for_demote(struct page *page,
1847                             unsigned int order)
1848 {
1849     return __prep_compound_gigantic_page(page, order, true);
1850 }
1851
1852 /*
1853  * PageHuge() only returns true for hugetlbfs pages, but not for normal or
1854  * transparent huge pages.  See the PageTransHuge() documentation for more
1855  * details.
1856  */
1857 int PageHuge(struct page *page)
1858 {
1859     if (!PageCompound(page))
1860         return 0;
1861
1862     page = compound_head(page);
1863     return page[1].compound_dtor == HUGETLB_PAGE_DTOR;
1864 }
1865 EXPORT_SYMBOL_GPL(PageHuge);
1866
1867 /*
1868  * PageHeadHuge() only returns true for hugetlbfs head page, but not for
1869  * normal or transparent huge pages.
1870  */
1871 int PageHeadHuge(struct page *page_head)
1872 {
1873     if (!PageHead(page_head))
1874         return 0;
1875
1876     return page_head[1].compound_dtor == HUGETLB_PAGE_DTOR;
1877 }
1878 EXPORT_SYMBOL_GPL(PageHeadHuge);
1879
1880 /*
1881  * Find and lock address space (mapping) in write mode.
1882  *
1883  * Upon entry, the page is locked which means that page_mapping() is
1884  * stable.  Due to locking order, we can only trylock_write.  If we can
1885  * not get the lock, simply return NULL to caller.
1886  */
1887 struct address_space *hugetlb_page_mapping_lock_write(struct page *hpage)
1888 {
1889     struct address_space *mapping = page_mapping(hpage);
1890
1891     if (!mapping)
1892         return mapping;
1893
1894     if (i_mmap_trylock_write(mapping))
1895         return mapping;
1896
1897     return NULL;
1898 }
1899
1900 pgoff_t hugetlb_basepage_index(struct page *page)
1901 {
1902     struct page *page_head = compound_head(page);
1903     pgoff_t index = page_index(page_head);
1904     unsigned long compound_idx;
1905
1906     if (compound_order(page_head) >= MAX_ORDER)
1907         compound_idx = page_to_pfn(page) - page_to_pfn(page_head);
1908     else
1909         compound_idx = page - page_head;
1910
1911     return (index << compound_order(page_head)) + compound_idx;
1912 }
1913
1914 static struct page *alloc_buddy_huge_page(struct hstate *h,
1915         gfp_t gfp_mask, int nid, nodemask_t *nmask,
1916         nodemask_t *node_alloc_noretry)
1917 {
1918     int order = huge_page_order(h);
1919     struct page *page;
1920     bool alloc_try_hard = true;
1921
1922     /*
1923      * By default we always try hard to allocate the page with
1924      * __GFP_RETRY_MAYFAIL flag.  However, if we are allocating pages in
1925      * a loop (to adjust global huge page counts) and previous allocation
1926      * failed, do not continue to try hard on the same node.  Use the
1927      * node_alloc_noretry bitmap to manage this state information.
1928      */
1929     if (node_alloc_noretry && node_isset(nid, *node_alloc_noretry))
1930         alloc_try_hard = false;
1931     gfp_mask |= __GFP_COMP|__GFP_NOWARN;
1932     if (alloc_try_hard)
1933         gfp_mask |= __GFP_RETRY_MAYFAIL;
1934     if (nid == NUMA_NO_NODE)
1935         nid = numa_mem_id();
1936     page = __alloc_pages(gfp_mask, order, nid, nmask);
1937     if (page)
1938         __count_vm_event(HTLB_BUDDY_PGALLOC);
1939     else
1940         __count_vm_event(HTLB_BUDDY_PGALLOC_FAIL);
1941
1942     /*
1943      * If we did not specify __GFP_RETRY_MAYFAIL, but still got a page this
1944      * indicates an overall state change.  Clear bit so that we resume
1945      * normal 'try hard' allocations.
1946      */
1947     if (node_alloc_noretry && page && !alloc_try_hard)
1948         node_clear(nid, *node_alloc_noretry);
1949
1950     /*
1951      * If we tried hard to get a page but failed, set bit so that
1952      * subsequent attempts will not try as hard until there is an
1953      * overall state change.
1954      */
1955     if (node_alloc_noretry && !page && alloc_try_hard)
1956         node_set(nid, *node_alloc_noretry);
1957
1958     return page;
1959 }
1960
1961 /*
1962  * Common helper to allocate a fresh hugetlb page. All specific allocators
1963  * should use this function to get new hugetlb pages
1964  */
1965 static struct page *alloc_fresh_huge_page(struct hstate *h,
1966         gfp_t gfp_mask, int nid, nodemask_t *nmask,
1967         nodemask_t *node_alloc_noretry)
1968 {
1969     struct page *page;
1970     bool retry = false;
1971
1972 retry:
1973     if (hstate_is_gigantic(h))
1974         page = alloc_gigantic_page(h, gfp_mask, nid, nmask);
1975     else
1976         page = alloc_buddy_huge_page(h, gfp_mask,
1977                 nid, nmask, node_alloc_noretry);
1978     if (!page)
1979         return NULL;
1980
1981     if (hstate_is_gigantic(h)) {
1982         if (!prep_compound_gigantic_page(page, huge_page_order(h))) {
1983             /*
1984              * Rare failure to convert pages to compound page.
1985              * Free pages and try again - ONCE!
1986              */
1987             free_gigantic_page(page, huge_page_order(h));
1988             if (!retry) {
1989                 retry = true;
1990                 goto retry;
1991             }
1992             return NULL;
1993         }
1994     }
1995     prep_new_huge_page(h, page, page_to_nid(page));
1996
1997     return page;
1998 }
1999
2000 /*
2001  * Allocates a fresh page to the hugetlb allocator pool in the node interleaved
2002  * manner.
2003  */
2004 static int alloc_pool_huge_page(struct hstate *h, nodemask_t *nodes_allowed,
2005                 nodemask_t *node_alloc_noretry)
2006 {
2007     struct page *page;
2008     int nr_nodes, node;
2009     gfp_t gfp_mask = htlb_alloc_mask(h) | __GFP_THISNODE;
2010
2011     for_each_node_mask_to_alloc(h, nr_nodes, node, nodes_allowed) {
2012         page = alloc_fresh_huge_page(h, gfp_mask, node, nodes_allowed,
2013                         node_alloc_noretry);
2014         if (page)
2015             break;
2016     }
2017
2018     if (!page)
2019         return 0;
2020
2021     put_page(page); /* free it into the hugepage allocator */
2022
2023     return 1;
2024 }
2025
2026 /*
2027  * Remove huge page from pool from next node to free.  Attempt to keep
2028  * persistent huge pages more or less balanced over allowed nodes.
2029  * This routine only 'removes' the hugetlb page.  The caller must make
2030  * an additional call to free the page to low level allocators.
2031  * Called with hugetlb_lock locked.
2032  */
2033 static struct page *remove_pool_huge_page(struct hstate *h,
2034                         nodemask_t *nodes_allowed,
2035                          bool acct_surplus)
2036 {
2037     int nr_nodes, node;
2038     struct page *page = NULL;
2039
2040     lockdep_assert_held(&hugetlb_lock);
2041     for_each_node_mask_to_free(h, nr_nodes, node, nodes_allowed) {
2042         /*
2043          * If we're returning unused surplus pages, only examine
2044          * nodes with surplus pages.
2045          */
2046         if ((!acct_surplus || h->surplus_huge_pages_node[node]) &&
2047             !list_empty(&h->hugepage_freelists[node])) {
2048             page = list_entry(h->hugepage_freelists[node].next,
2049                       struct page, lru);
2050             remove_hugetlb_page(h, page, acct_surplus);
2051             break;
2052         }
2053     }
2054
2055     return page;
2056 }
2057
2058 /*
2059  * Dissolve a given free hugepage into free buddy pages. This function does
2060  * nothing for in-use hugepages and non-hugepages.
2061  * This function returns values like below:
2062  *
2063  *  -ENOMEM: failed to allocate vmemmap pages to free the freed hugepages
2064  *           when the system is under memory pressure and the feature of
2065  *           freeing unused vmemmap pages associated with each hugetlb page
2066  *           is enabled.
2067  *  -EBUSY:  failed to dissolved free hugepages or the hugepage is in-use
2068  *           (allocated or reserved.)
2069  *       0:  successfully dissolved free hugepages or the page is not a
2070  *           hugepage (considered as already dissolved)
2071  */
2072 int dissolve_free_huge_page(struct page *page)
2073 {
2074     int rc = -EBUSY;
2075
2076 retry:
2077     /* Not to disrupt normal path by vainly holding hugetlb_lock */
2078     if (!PageHuge(page))
2079         return 0;
2080
2081     spin_lock_irq(&hugetlb_lock);
2082     if (!PageHuge(page)) {
2083         rc = 0;
2084         goto out;
2085     }
2086
2087     if (!page_count(page)) {
2088         struct page *head = compound_head(page);
2089         struct hstate *h = page_hstate(head);
2090         if (h->free_huge_pages - h->resv_huge_pages == 0)
2091             goto out;
2092
2093         /*
2094          * We should make sure that the page is already on the free list
2095          * when it is dissolved.
2096          */
2097         if (unlikely(!HPageFreed(head))) {
2098             spin_unlock_irq(&hugetlb_lock);
2099             cond_resched();
2100
2101             /*
2102              * Theoretically, we should return -EBUSY when we
2103              * encounter this race. In fact, we have a chance
2104              * to successfully dissolve the page if we do a
2105              * retry. Because the race window is quite small.
2106              * If we seize this opportunity, it is an optimization
2107              * for increasing the success rate of dissolving page.
2108              */
2109             goto retry;
2110         }
2111
2112         remove_hugetlb_page(h, head, false);
2113         h->max_huge_pages--;
2114         spin_unlock_irq(&hugetlb_lock);
2115
2116         /*
2117          * Normally update_and_free_page will allocate required vmemmmap
2118          * before freeing the page.  update_and_free_page will fail to
2119          * free the page if it can not allocate required vmemmap.  We
2120          * need to adjust max_huge_pages if the page is not freed.
2121          * Attempt to allocate vmemmmap here so that we can take
2122          * appropriate action on failure.
2123          */
2124         rc = hugetlb_vmemmap_restore(h, head);
2125         if (!rc) {
2126             update_and_free_page(h, head, false);
2127         } else {
2128             spin_lock_irq(&hugetlb_lock);
2129             add_hugetlb_page(h, head, false);
2130             h->max_huge_pages++;
2131             spin_unlock_irq(&hugetlb_lock);
2132         }
2133
2134         return rc;
2135     }
2136 out:
2137     spin_unlock_irq(&hugetlb_lock);
2138     return rc;
2139 }
2140
2141 /*
2142  * Dissolve free hugepages in a given pfn range. Used by memory hotplug to
2143  * make specified memory blocks removable from the system.
2144  * Note that this will dissolve a free gigantic hugepage completely, if any
2145  * part of it lies within the given range.
2146  * Also note that if dissolve_free_huge_page() returns with an error, all
2147  * free hugepages that were dissolved before that error are lost.
2148  */
2149 int dissolve_free_huge_pages(unsigned long start_pfn, unsigned long end_pfn)
2150 {
2151     unsigned long pfn;
2152     struct page *page;
2153     int rc = 0;
2154     unsigned int order;
2155     struct hstate *h;
2156
2157     if (!hugepages_supported())
2158         return rc;
2159
2160     order = huge_page_order(&default_hstate);
2161     for_each_hstate(h)
2162         order = min(order, huge_page_order(h));
2163
2164     for (pfn = start_pfn; pfn < end_pfn; pfn += 1 << order) {
2165         page = pfn_to_page(pfn);
2166         rc = dissolve_free_huge_page(page);
2167         if (rc)
2168             break;
2169     }
2170
2171     return rc;
2172 }
2173
2174 /*
2175  * Allocates a fresh surplus page from the page allocator.
2176  */
2177 static struct page *alloc_surplus_huge_page(struct hstate *h, gfp_t gfp_mask,
2178         int nid, nodemask_t *nmask, bool zero_ref)
2179 {
2180     struct page *page = NULL;
2181     bool retry = false;
2182
2183     if (hstate_is_gigantic(h))
2184         return NULL;
2185
2186     spin_lock_irq(&hugetlb_lock);
2187     if (h->surplus_huge_pages >= h->nr_overcommit_huge_pages)
2188         goto out_unlock;
2189     spin_unlock_irq(&hugetlb_lock);
2190
2191 retry:
2192     page = alloc_fresh_huge_page(h, gfp_mask, nid, nmask, NULL);
2193     if (!page)
2194         return NULL;
2195
2196     spin_lock_irq(&hugetlb_lock);
2197     /*
2198      * We could have raced with the pool size change.
2199      * Double check that and simply deallocate the new page
2200      * if we would end up overcommiting the surpluses. Abuse
2201      * temporary page to workaround the nasty free_huge_page
2202      * codeflow
2203      */
2204     if (h->surplus_huge_pages >= h->nr_overcommit_huge_pages) {
2205         SetHPageTemporary(page);
2206         spin_unlock_irq(&hugetlb_lock);
2207         put_page(page);
2208         return NULL;
2209     }
2210
2211     if (zero_ref) {
2212         /*
2213          * Caller requires a page with zero ref count.
2214          * We will drop ref count here.  If someone else is holding
2215          * a ref, the page will be freed when they drop it.  Abuse
2216          * temporary page flag to accomplish this.
2217          */
2218         SetHPageTemporary(page);
2219         if (!put_page_testzero(page)) {
2220             /*
2221              * Unexpected inflated ref count on freshly allocated
2222              * huge.  Retry once.
2223              */
2224             pr_info("HugeTLB unexpected inflated ref count on freshly allocated page\n");
2225             spin_unlock_irq(&hugetlb_lock);
2226             if (retry)
2227                 return NULL;
2228
2229             retry = true;
2230             goto retry;
2231         }
2232         ClearHPageTemporary(page);
2233     }
2234
2235     h->surplus_huge_pages++;
2236     h->surplus_huge_pages_node[page_to_nid(page)]++;
2237
2238 out_unlock:
2239     spin_unlock_irq(&hugetlb_lock);
2240
2241     return page;
2242 }
2243
2244 static struct page *alloc_migrate_huge_page(struct hstate *h, gfp_t gfp_mask,
2245                      int nid, nodemask_t *nmask)
2246 {
2247     struct page *page;
2248
2249     if (hstate_is_gigantic(h))
2250         return NULL;
2251
2252     page = alloc_fresh_huge_page(h, gfp_mask, nid, nmask, NULL);
2253     if (!page)
2254         return NULL;
2255
2256     /*
2257      * We do not account these pages as surplus because they are only
2258      * temporary and will be released properly on the last reference
2259      */
2260     SetHPageTemporary(page);
2261
2262     return page;
2263 }
2264
2265 /*
2266  * Use the VMA's mpolicy to allocate a huge page from the buddy.
2267  */
2268 static
2269 struct page *alloc_buddy_huge_page_with_mpol(struct hstate *h,
2270         struct vm_area_struct *vma, unsigned long addr)
2271 {
2272     struct page *page = NULL;
2273     struct mempolicy *mpol;
2274     gfp_t gfp_mask = htlb_alloc_mask(h);
2275     int nid;
2276     nodemask_t *nodemask;
2277
2278     nid = huge_node(vma, addr, gfp_mask, &mpol, &nodemask);
2279     if (mpol_is_preferred_many(mpol)) {
2280         gfp_t gfp = gfp_mask | __GFP_NOWARN;
2281
2282         gfp &=  ~(__GFP_DIRECT_RECLAIM | __GFP_NOFAIL);
2283         page = alloc_surplus_huge_page(h, gfp, nid, nodemask, false);
2284
2285         /* Fallback to all nodes if page==NULL */
2286         nodemask = NULL;
2287     }
2288
2289     if (!page)
2290         page = alloc_surplus_huge_page(h, gfp_mask, nid, nodemask, false);
2291     mpol_cond_put(mpol);
2292     return page;
2293 }
2294
2295 /* page migration callback function */
2296 struct page *alloc_huge_page_nodemask(struct hstate *h, int preferred_nid,
2297         nodemask_t *nmask, gfp_t gfp_mask)
2298 {
2299     spin_lock_irq(&hugetlb_lock);
2300     if (h->free_huge_pages - h->resv_huge_pages > 0) {
2301         struct page *page;
2302
2303         page = dequeue_huge_page_nodemask(h, gfp_mask, preferred_nid, nmask);
2304         if (page) {
2305             spin_unlock_irq(&hugetlb_lock);
2306             return page;
2307         }
2308     }
2309     spin_unlock_irq(&hugetlb_lock);
2310
2311     return alloc_migrate_huge_page(h, gfp_mask, preferred_nid, nmask);
2312 }
2313
2314 /* mempolicy aware migration callback */
2315 struct page *alloc_huge_page_vma(struct hstate *h, struct vm_area_struct *vma,
2316         unsigned long address)
2317 {
2318     struct mempolicy *mpol;
2319     nodemask_t *nodemask;
2320     struct page *page;
2321     gfp_t gfp_mask;
2322     int node;
2323
2324     gfp_mask = htlb_alloc_mask(h);
2325     node = huge_node(vma, address, gfp_mask, &mpol, &nodemask);
2326     page = alloc_huge_page_nodemask(h, node, nodemask, gfp_mask);
2327     mpol_cond_put(mpol);
2328
2329     return page;
2330 }
2331
2332 /*
2333  * Increase the hugetlb pool such that it can accommodate a reservation
2334  * of size 'delta'.
2335  */
2336 static int gather_surplus_pages(struct hstate *h, long delta)
2337     __must_hold(&hugetlb_lock)
2338 {
2339     struct list_head surplus_list;
2340     struct page *page, *tmp;
2341     int ret;
2342     long i;
2343     long needed, allocated;
2344     bool alloc_ok = true;
2345
2346     lockdep_assert_held(&hugetlb_lock);
2347     needed = (h->resv_huge_pages + delta) - h->free_huge_pages;
2348     if (needed <= 0) {
2349         h->resv_huge_pages += delta;
2350         return 0;
2351     }
2352
2353     allocated = 0;
2354     INIT_LIST_HEAD(&surplus_list);
2355
2356     ret = -ENOMEM;
2357 retry:
2358     spin_unlock_irq(&hugetlb_lock);
2359     for (i = 0; i < needed; i++) {
2360         page = alloc_surplus_huge_page(h, htlb_alloc_mask(h),
2361                 NUMA_NO_NODE, NULL, true);
2362         if (!page) {
2363             alloc_ok = false;
2364             break;
2365         }
2366         list_add(&page->lru, &surplus_list);
2367         cond_resched();
2368     }
2369     allocated += i;
2370
2371     /*
2372      * After retaking hugetlb_lock, we need to recalculate 'needed'
2373      * because either resv_huge_pages or free_huge_pages may have changed.
2374      */
2375     spin_lock_irq(&hugetlb_lock);
2376     needed = (h->resv_huge_pages + delta) -
2377             (h->free_huge_pages + allocated);
2378     if (needed > 0) {
2379         if (alloc_ok)
2380             goto retry;
2381         /*
2382          * We were not able to allocate enough pages to
2383          * satisfy the entire reservation so we free what
2384          * we've allocated so far.
2385          */
2386         goto free;
2387     }
2388     /*
2389      * The surplus_list now contains _at_least_ the number of extra pages
2390      * needed to accommodate the reservation.  Add the appropriate number
2391      * of pages to the hugetlb pool and free the extras back to the buddy
2392      * allocator.  Commit the entire reservation here to prevent another
2393      * process from stealing the pages as they are added to the pool but
2394      * before they are reserved.
2395      */
2396     needed += allocated;
2397     h->resv_huge_pages += delta;
2398     ret = 0;
2399
2400     /* Free the needed pages to the hugetlb pool */
2401     list_for_each_entry_safe(page, tmp, &surplus_list, lru) {
2402         if ((--needed) < 0)
2403             break;
2404         /* Add the page to the hugetlb allocator */
2405         enqueue_huge_page(h, page);
2406     }
2407 free:
2408     spin_unlock_irq(&hugetlb_lock);
2409
2410     /*
2411      * Free unnecessary surplus pages to the buddy allocator.
2412      * Pages have no ref count, call free_huge_page directly.
2413      */
2414     list_for_each_entry_safe(page, tmp, &surplus_list, lru)
2415         free_huge_page(page);
2416     spin_lock_irq(&hugetlb_lock);
2417
2418     return ret;
2419 }
2420
2421 /*
2422  * This routine has two main purposes:
2423  * 1) Decrement the reservation count (resv_huge_pages) by the value passed
2424  *    in unused_resv_pages.  This corresponds to the prior adjustments made
2425  *    to the associated reservation map.
2426  * 2) Free any unused surplus pages that may have been allocated to satisfy
2427  *    the reservation.  As many as unused_resv_pages may be freed.
2428  */
2429 static void return_unused_surplus_pages(struct hstate *h,
2430                     unsigned long unused_resv_pages)
2431 {
2432     unsigned long nr_pages;
2433     struct page *page;
2434     LIST_HEAD(page_list);
2435
2436     lockdep_assert_held(&hugetlb_lock);
2437     /* Uncommit the reservation */
2438     h->resv_huge_pages -= unused_resv_pages;
2439
2440     if (hstate_is_gigantic(h) && !gigantic_page_runtime_supported())
2441         goto out;
2442
2443     /*
2444      * Part (or even all) of the reservation could have been backed
2445      * by pre-allocated pages. Only free surplus pages.
2446      */
2447     nr_pages = min(unused_resv_pages, h->surplus_huge_pages);
2448
2449     /*
2450      * We want to release as many surplus pages as possible, spread
2451      * evenly across all nodes with memory. Iterate across these nodes
2452      * until we can no longer free unreserved surplus pages. This occurs
2453      * when the nodes with surplus pages have no free pages.
2454      * remove_pool_huge_page() will balance the freed pages across the
2455      * on-line nodes with memory and will handle the hstate accounting.
2456      */
2457     while (nr_pages--) {
2458         page = remove_pool_huge_page(h, &node_states[N_MEMORY], 1);
2459         if (!page)
2460             goto out;
2461
2462         list_add(&page->lru, &page_list);
2463     }
2464
2465 out:
2466     spin_unlock_irq(&hugetlb_lock);
2467     update_and_free_pages_bulk(h, &page_list);
2468     spin_lock_irq(&hugetlb_lock);
2469 }
2470
2471
2472 /*
2473  * vma_needs_reservation, vma_commit_reservation and vma_end_reservation
2474  * are used by the huge page allocation routines to manage reservations.
2475  *
2476  * vma_needs_reservation is called to determine if the huge page at addr
2477  * within the vma has an associated reservation.  If a reservation is
2478  * needed, the value 1 is returned.  The caller is then responsible for
2479  * managing the global reservation and subpool usage counts.  After
2480  * the huge page has been allocated, vma_commit_reservation is called
2481  * to add the page to the reservation map.  If the page allocation fails,
2482  * the reservation must be ended instead of committed.  vma_end_reservation
2483  * is called in such cases.
2484  *
2485  * In the normal case, vma_commit_reservation returns the same value
2486  * as the preceding vma_needs_reservation call.  The only time this
2487  * is not the case is if a reserve map was changed between calls.  It
2488  * is the responsibility of the caller to notice the difference and
2489  * take appropriate action.
2490  *
2491  * vma_add_reservation is used in error paths where a reservation must
2492  * be restored when a newly allocated huge page must be freed.  It is
2493  * to be called after calling vma_needs_reservation to determine if a
2494  * reservation exists.
2495  *
2496  * vma_del_reservation is used in error paths where an entry in the reserve
2497  * map was created during huge page allocation and must be removed.  It is to
2498  * be called after calling vma_needs_reservation to determine if a reservation
2499  * exists.
2500  */
2501 enum vma_resv_mode {
2502     VMA_NEEDS_RESV,
2503     VMA_COMMIT_RESV,
2504     VMA_END_RESV,
2505     VMA_ADD_RESV,
2506     VMA_DEL_RESV,
2507 };
2508 static long __vma_reservation_common(struct hstate *h,
2509                 struct vm_area_struct *vma, unsigned long addr,
2510                 enum vma_resv_mode mode)
2511 {
2512     struct resv_map *resv;
2513     pgoff_t idx;
2514     long ret;
2515     long dummy_out_regions_needed;
2516
2517     resv = vma_resv_map(vma);
2518     if (!resv)
2519         return 1;
2520
2521     idx = vma_hugecache_offset(h, vma, addr);
2522     switch (mode) {
2523     case VMA_NEEDS_RESV:
2524         ret = region_chg(resv, idx, idx + 1, &dummy_out_regions_needed);
2525         /* We assume that vma_reservation_* routines always operate on
2526          * 1 page, and that adding to resv map a 1 page entry can only
2527          * ever require 1 region.
2528          */
2529         VM_BUG_ON(dummy_out_regions_needed != 1);
2530         break;
2531     case VMA_COMMIT_RESV:
2532         ret = region_add(resv, idx, idx + 1, 1, NULL, NULL);
2533         /* region_add calls of range 1 should never fail. */
2534         VM_BUG_ON(ret < 0);
2535         break;
2536     case VMA_END_RESV:
2537         region_abort(resv, idx, idx + 1, 1);
2538         ret = 0;
2539         break;
2540     case VMA_ADD_RESV:
2541         if (vma->vm_flags & VM_MAYSHARE) {
2542             ret = region_add(resv, idx, idx + 1, 1, NULL, NULL);
2543             /* region_add calls of range 1 should never fail. */
2544             VM_BUG_ON(ret < 0);
2545         } else {
2546             region_abort(resv, idx, idx + 1, 1);
2547             ret = region_del(resv, idx, idx + 1);
2548         }
2549         break;
2550     case VMA_DEL_RESV:
2551         if (vma->vm_flags & VM_MAYSHARE) {
2552             region_abort(resv, idx, idx + 1, 1);
2553             ret = region_del(resv, idx, idx + 1);
2554         } else {
2555             ret = region_add(resv, idx, idx + 1, 1, NULL, NULL);
2556             /* region_add calls of range 1 should never fail. */
2557             VM_BUG_ON(ret < 0);
2558         }
2559         break;
2560     default:
2561         BUG();
2562     }
2563
2564     if (vma->vm_flags & VM_MAYSHARE || mode == VMA_DEL_RESV)
2565         return ret;
2566     /*
2567      * We know private mapping must have HPAGE_RESV_OWNER set.
2568      *
2569      * In most cases, reserves always exist for private mappings.
2570      * However, a file associated with mapping could have been
2571      * hole punched or truncated after reserves were consumed.
2572      * As subsequent fault on such a range will not use reserves.
2573      * Subtle - The reserve map for private mappings has the
2574      * opposite meaning than that of shared mappings.  If NO
2575      * entry is in the reserve map, it means a reservation exists.
2576      * If an entry exists in the reserve map, it means the
2577      * reservation has already been consumed.  As a result, the
2578      * return value of this routine is the opposite of the
2579      * value returned from reserve map manipulation routines above.
2580      */
2581     if (ret > 0)
2582         return 0;
2583     if (ret == 0)
2584         return 1;
2585     return ret;
2586 }
2587
2588 static long vma_needs_reservation(struct hstate *h,
2589             struct vm_area_struct *vma, unsigned long addr)
2590 {
2591     return __vma_reservation_common(h, vma, addr, VMA_NEEDS_RESV);
2592 }
2593
2594 static long vma_commit_reservation(struct hstate *h,
2595             struct vm_area_struct *vma, unsigned long addr)
2596 {
2597     return __vma_reservation_common(h, vma, addr, VMA_COMMIT_RESV);
2598 }
2599
2600 static void vma_end_reservation(struct hstate *h,
2601             struct vm_area_struct *vma, unsigned long addr)
2602 {
2603     (void)__vma_reservation_common(h, vma, addr, VMA_END_RESV);
2604 }
2605
2606 static long vma_add_reservation(struct hstate *h,
2607             struct vm_area_struct *vma, unsigned long addr)
2608 {
2609     return __vma_reservation_common(h, vma, addr, VMA_ADD_RESV);
2610 }
2611
2612 static long vma_del_reservation(struct hstate *h,
2613             struct vm_area_struct *vma, unsigned long addr)
2614 {
2615     return __vma_reservation_common(h, vma, addr, VMA_DEL_RESV);
2616 }
2617
2618 /*
2619  * This routine is called to restore reservation information on error paths.
2620  * It should ONLY be called for pages allocated via alloc_huge_page(), and
2621  * the hugetlb mutex should remain held when calling this routine.
2622  *
2623  * It handles two specific cases:
2624  * 1) A reservation was in place and the page consumed the reservation.
2625  *    HPageRestoreReserve is set in the page.
2626  * 2) No reservation was in place for the page, so HPageRestoreReserve is
2627  *    not set.  However, alloc_huge_page always updates the reserve map.
2628  *
2629  * In case 1, free_huge_page later in the error path will increment the
2630  * global reserve count.  But, free_huge_page does not have enough context
2631  * to adjust the reservation map.  This case deals primarily with private
2632  * mappings.  Adjust the reserve map here to be consistent with global
2633  * reserve count adjustments to be made by free_huge_page.  Make sure the
2634  * reserve map indicates there is a reservation present.
2635  *
2636  * In case 2, simply undo reserve map modifications done by alloc_huge_page.
2637  */
2638 void restore_reserve_on_error(struct hstate *h, struct vm_area_struct *vma,
2639             unsigned long address, struct page *page)
2640 {
2641     long rc = vma_needs_reservation(h, vma, address);
2642
2643     if (HPageRestoreReserve(page)) {
2644         if (unlikely(rc < 0))
2645             /*
2646              * Rare out of memory condition in reserve map
2647              * manipulation.  Clear HPageRestoreReserve so that
2648              * global reserve count will not be incremented
2649              * by free_huge_page.  This will make it appear
2650              * as though the reservation for this page was
2651              * consumed.  This may prevent the task from
2652              * faulting in the page at a later time.  This
2653              * is better than inconsistent global huge page
2654              * accounting of reserve counts.
2655              */
2656             ClearHPageRestoreReserve(page);
2657         else if (rc)
2658             (void)vma_add_reservation(h, vma, address);
2659         else
2660             vma_end_reservation(h, vma, address);
2661     } else {
2662         if (!rc) {
2663             /*
2664              * This indicates there is an entry in the reserve map
2665              * not added by alloc_huge_page.  We know it was added
2666              * before the alloc_huge_page call, otherwise
2667              * HPageRestoreReserve would be set on the page.
2668              * Remove the entry so that a subsequent allocation
2669              * does not consume a reservation.
2670              */
2671             rc = vma_del_reservation(h, vma, address);
2672             if (rc < 0)
2673                 /*
2674                  * VERY rare out of memory condition.  Since
2675                  * we can not delete the entry, set
2676                  * HPageRestoreReserve so that the reserve
2677                  * count will be incremented when the page
2678                  * is freed.  This reserve will be consumed
2679                  * on a subsequent allocation.
2680                  */
2681                 SetHPageRestoreReserve(page);
2682         } else if (rc < 0) {
2683             /*
2684              * Rare out of memory condition from
2685              * vma_needs_reservation call.  Memory allocation is
2686              * only attempted if a new entry is needed.  Therefore,
2687              * this implies there is not an entry in the
2688              * reserve map.
2689              *
2690              * For shared mappings, no entry in the map indicates
2691              * no reservation.  We are done.
2692              */
2693             if (!(vma->vm_flags & VM_MAYSHARE))
2694                 /*
2695                  * For private mappings, no entry indicates
2696                  * a reservation is present.  Since we can
2697                  * not add an entry, set SetHPageRestoreReserve
2698                  * on the page so reserve count will be
2699                  * incremented when freed.  This reserve will
2700                  * be consumed on a subsequent allocation.
2701                  */
2702                 SetHPageRestoreReserve(page);
2703         } else
2704             /*
2705              * No reservation present, do nothing
2706              */
2707              vma_end_reservation(h, vma, address);
2708     }
2709 }
2710
2711 /*
2712  * alloc_and_dissolve_huge_page - Allocate a new page and dissolve the old one
2713  * @h: struct hstate old page belongs to
2714  * @old_page: Old page to dissolve
2715  * @list: List to isolate the page in case we need to
2716  * Returns 0 on success, otherwise negated error.
2717  */
2718 static int alloc_and_dissolve_huge_page(struct hstate *h, struct page *old_page,
2719                     struct list_head *list)
2720 {
2721     gfp_t gfp_mask = htlb_alloc_mask(h) | __GFP_THISNODE;
2722     int nid = page_to_nid(old_page);
2723     bool alloc_retry = false;
2724     struct page *new_page;
2725     int ret = 0;
2726
2727     /*
2728      * Before dissolving the page, we need to allocate a new one for the
2729      * pool to remain stable.  Here, we allocate the page and 'prep' it
2730      * by doing everything but actually updating counters and adding to
2731      * the pool.  This simplifies and let us do most of the processing
2732      * under the lock.
2733      */
2734 alloc_retry:
2735     new_page = alloc_buddy_huge_page(h, gfp_mask, nid, NULL, NULL);
2736     if (!new_page)
2737         return -ENOMEM;
2738     /*
2739      * If all goes well, this page will be directly added to the free
2740      * list in the pool.  For this the ref count needs to be zero.
2741      * Attempt to drop now, and retry once if needed.  It is VERY
2742      * unlikely there is another ref on the page.
2743      *
2744      * If someone else has a reference to the page, it will be freed
2745      * when they drop their ref.  Abuse temporary page flag to accomplish
2746      * this.  Retry once if there is an inflated ref count.
2747      */
2748     SetHPageTemporary(new_page);
2749     if (!put_page_testzero(new_page)) {
2750         if (alloc_retry)
2751             return -EBUSY;
2752
2753         alloc_retry = true;
2754         goto alloc_retry;
2755     }
2756     ClearHPageTemporary(new_page);
2757
2758     __prep_new_huge_page(h, new_page);
2759
2760 retry:
2761     spin_lock_irq(&hugetlb_lock);
2762     if (!PageHuge(old_page)) {
2763         /*
2764          * Freed from under us. Drop new_page too.
2765          */
2766         goto free_new;
2767     } else if (page_count(old_page)) {
2768         /*
2769          * Someone has grabbed the page, try to isolate it here.
2770          * Fail with -EBUSY if not possible.
2771          */
2772         spin_unlock_irq(&hugetlb_lock);
2773         ret = isolate_hugetlb(old_page, list);
2774         spin_lock_irq(&hugetlb_lock);
2775         goto free_new;
2776     } else if (!HPageFreed(old_page)) {
2777         /*
2778          * Page's refcount is 0 but it has not been enqueued in the
2779          * freelist yet. Race window is small, so we can succeed here if
2780          * we retry.
2781          */
2782         spin_unlock_irq(&hugetlb_lock);
2783         cond_resched();
2784         goto retry;
2785     } else {
2786         /*
2787          * Ok, old_page is still a genuine free hugepage. Remove it from
2788          * the freelist and decrease the counters. These will be
2789          * incremented again when calling __prep_account_new_huge_page()
2790          * and enqueue_huge_page() for new_page. The counters will remain
2791          * stable since this happens under the lock.
2792          */
2793         remove_hugetlb_page(h, old_page, false);
2794
2795         /*
2796          * Ref count on new page is already zero as it was dropped
2797          * earlier.  It can be directly added to the pool free list.
2798          */
2799         __prep_account_new_huge_page(h, nid);
2800         enqueue_huge_page(h, new_page);
2801
2802         /*
2803          * Pages have been replaced, we can safely free the old one.
2804          */
2805         spin_unlock_irq(&hugetlb_lock);
2806         update_and_free_page(h, old_page, false);
2807     }
2808
2809     return ret;
2810
2811 free_new:
2812     spin_unlock_irq(&hugetlb_lock);
2813     /* Page has a zero ref count, but needs a ref to be freed */
2814     set_page_refcounted(new_page);
2815     update_and_free_page(h, new_page, false);
2816
2817     return ret;
2818 }
2819
2820 int isolate_or_dissolve_huge_page(struct page *page, struct list_head *list)
2821 {
2822     struct hstate *h;
2823     struct page *head;
2824     int ret = -EBUSY;
2825
2826     /*
2827      * The page might have been dissolved from under our feet, so make sure
2828      * to carefully check the state under the lock.
2829      * Return success when racing as if we dissolved the page ourselves.
2830      */
2831     spin_lock_irq(&hugetlb_lock);
2832     if (PageHuge(page)) {
2833         head = compound_head(page);
2834         h = page_hstate(head);
2835     } else {
2836         spin_unlock_irq(&hugetlb_lock);
2837         return 0;
2838     }
2839     spin_unlock_irq(&hugetlb_lock);
2840
2841     /*
2842      * Fence off gigantic pages as there is a cyclic dependency between
2843      * alloc_contig_range and them. Return -ENOMEM as this has the effect
2844      * of bailing out right away without further retrying.
2845      */
2846     if (hstate_is_gigantic(h))
2847         return -ENOMEM;
2848
2849     if (page_count(head) && !isolate_hugetlb(head, list))
2850         ret = 0;
2851     else if (!page_count(head))
2852         ret = alloc_and_dissolve_huge_page(h, head, list);
2853
2854     return ret;
2855 }
2856
2857 struct page *alloc_huge_page(struct vm_area_struct *vma,
2858                     unsigned long addr, int avoid_reserve)
2859 {
2860     struct hugepage_subpool *spool = subpool_vma(vma);
2861     struct hstate *h = hstate_vma(vma);
2862     struct page *page;
2863     long map_chg, map_commit;
2864     long gbl_chg;
2865     int ret, idx;
2866     struct hugetlb_cgroup *h_cg;
2867     bool deferred_reserve;
2868
2869     idx = hstate_index(h);
2870     /*
2871      * Examine the region/reserve map to determine if the process
2872      * has a reservation for the page to be allocated.  A return
2873      * code of zero indicates a reservation exists (no change).
2874      */
2875     map_chg = gbl_chg = vma_needs_reservation(h, vma, addr);
2876     if (map_chg < 0)
2877         return ERR_PTR(-ENOMEM);
2878
2879     /*
2880      * Processes that did not create the mapping will have no
2881      * reserves as indicated by the region/reserve map. Check
2882      * that the allocation will not exceed the subpool limit.
2883      * Allocations for MAP_NORESERVE mappings also need to be
2884      * checked against any subpool limit.
2885      */
2886     if (map_chg || avoid_reserve) {
2887         gbl_chg = hugepage_subpool_get_pages(spool, 1);
2888         if (gbl_chg < 0) {
2889             vma_end_reservation(h, vma, addr);
2890             return ERR_PTR(-ENOSPC);
2891         }
2892
2893         /*
2894          * Even though there was no reservation in the region/reserve
2895          * map, there could be reservations associated with the
2896          * subpool that can be used.  This would be indicated if the
2897          * return value of hugepage_subpool_get_pages() is zero.
2898          * However, if avoid_reserve is specified we still avoid even
2899          * the subpool reservations.
2900          */
2901         if (avoid_reserve)
2902             gbl_chg = 1;
2903     }
2904
2905     /* If this allocation is not consuming a reservation, charge it now.
2906      */
2907     deferred_reserve = map_chg || avoid_reserve;
2908     if (deferred_reserve) {
2909         ret = hugetlb_cgroup_charge_cgroup_rsvd(
2910             idx, pages_per_huge_page(h), &h_cg);
2911         if (ret)
2912             goto out_subpool_put;
2913     }
2914
2915     ret = hugetlb_cgroup_charge_cgroup(idx, pages_per_huge_page(h), &h_cg);
2916     if (ret)
2917         goto out_uncharge_cgroup_reservation;
2918
2919     spin_lock_irq(&hugetlb_lock);
2920     /*
2921      * glb_chg is passed to indicate whether or not a page must be taken
2922      * from the global free pool (global change).  gbl_chg == 0 indicates
2923      * a reservation exists for the allocation.
2924      */
2925     page = dequeue_huge_page_vma(h, vma, addr, avoid_reserve, gbl_chg);
2926     if (!page) {
2927         spin_unlock_irq(&hugetlb_lock);
2928         page = alloc_buddy_huge_page_with_mpol(h, vma, addr);
2929         if (!page)
2930             goto out_uncharge_cgroup;
2931         if (!avoid_reserve && vma_has_reserves(vma, gbl_chg)) {
2932             SetHPageRestoreReserve(page);
2933             h->resv_huge_pages--;
2934         }
2935         spin_lock_irq(&hugetlb_lock);
2936         list_add(&page->lru, &h->hugepage_activelist);
2937         /* Fall through */
2938     }
2939     hugetlb_cgroup_commit_charge(idx, pages_per_huge_page(h), h_cg, page);
2940     /* If allocation is not consuming a reservation, also store the
2941      * hugetlb_cgroup pointer on the page.
2942      */
2943     if (deferred_reserve) {
2944         hugetlb_cgroup_commit_charge_rsvd(idx, pages_per_huge_page(h),
2945                           h_cg, page);
2946     }
2947
2948     spin_unlock_irq(&hugetlb_lock);
2949
2950     hugetlb_set_page_subpool(page, spool);
2951
2952     map_commit = vma_commit_reservation(h, vma, addr);
2953     if (unlikely(map_chg > map_commit)) {
2954         /*
2955          * The page was added to the reservation map between
2956          * vma_needs_reservation and vma_commit_reservation.
2957          * This indicates a race with hugetlb_reserve_pages.
2958          * Adjust for the subpool count incremented above AND
2959          * in hugetlb_reserve_pages for the same page.  Also,
2960          * the reservation count added in hugetlb_reserve_pages
2961          * no longer applies.
2962          */
2963         long rsv_adjust;
2964
2965         rsv_adjust = hugepage_subpool_put_pages(spool, 1);
2966         hugetlb_acct_memory(h, -rsv_adjust);
2967         if (deferred_reserve)
2968             hugetlb_cgroup_uncharge_page_rsvd(hstate_index(h),
2969                     pages_per_huge_page(h), page);
2970     }
2971     return page;
2972
2973 out_uncharge_cgroup:
2974     hugetlb_cgroup_uncharge_cgroup(idx, pages_per_huge_page(h), h_cg);
2975 out_uncharge_cgroup_reservation:
2976     if (deferred_reserve)
2977         hugetlb_cgroup_uncharge_cgroup_rsvd(idx, pages_per_huge_page(h),
2978                             h_cg);
2979 out_subpool_put:
2980     if (map_chg || avoid_reserve)
2981         hugepage_subpool_put_pages(spool, 1);
2982     vma_end_reservation(h, vma, addr);
2983     return ERR_PTR(-ENOSPC);
2984 }
2985
2986 int alloc_bootmem_huge_page(struct hstate *h, int nid)
2987     __attribute__ ((weak, alias("__alloc_bootmem_huge_page")));
2988 int __alloc_bootmem_huge_page(struct hstate *h, int nid)
2989 {
2990     struct huge_bootmem_page *m = NULL; /* initialize for clang */
2991     int nr_nodes, node;
2992
2993     /* do node specific alloc */
2994     if (nid != NUMA_NO_NODE) {
2995         m = memblock_alloc_try_nid_raw(huge_page_size(h), huge_page_size(h),
2996                 0, MEMBLOCK_ALLOC_ACCESSIBLE, nid);
2997         if (!m)
2998             return 0;
2999         goto found;
3000     }
3001     /* allocate from next node when distributing huge pages */
3002     for_each_node_mask_to_alloc(h, nr_nodes, node, &node_states[N_MEMORY]) {
3003         m = memblock_alloc_try_nid_raw(
3004                 huge_page_size(h), huge_page_size(h),
3005                 0, MEMBLOCK_ALLOC_ACCESSIBLE, node);
3006         /*
3007          * Use the beginning of the huge page to store the
3008          * huge_bootmem_page struct (until gather_bootmem
3009          * puts them into the mem_map).
3010          */
3011         if (!m)
3012             return 0;
3013         goto found;
3014     }
3015
3016 found:
3017     /* Put them into a private list first because mem_map is not up yet */
3018     INIT_LIST_HEAD(&m->list);
3019     list_add(&m->list, &huge_boot_pages);
3020     m->hstate = h;
3021     return 1;
3022 }
3023
3024 /*
3025  * Put bootmem huge pages into the standard lists after mem_map is up.
3026  * Note: This only applies to gigantic (order > MAX_ORDER) pages.
3027  */
3028 static void __init gather_bootmem_prealloc(void)
3029 {
3030     struct huge_bootmem_page *m;
3031
3032     list_for_each_entry(m, &huge_boot_pages, list) {
3033         struct page *page = virt_to_page(m);
3034         struct hstate *h = m->hstate;
3035
3036         VM_BUG_ON(!hstate_is_gigantic(h));
3037         WARN_ON(page_count(page) != 1);
3038         if (prep_compound_gigantic_page(page, huge_page_order(h))) {
3039             WARN_ON(PageReserved(page));
3040             prep_new_huge_page(h, page, page_to_nid(page));
3041             put_page(page); /* add to the hugepage allocator */
3042         } else {
3043             /* VERY unlikely inflated ref count on a tail page */
3044             free_gigantic_page(page, huge_page_order(h));
3045         }
3046
3047         /*
3048          * We need to restore the 'stolen' pages to totalram_pages
3049          * in order to fix confusing memory reports from free(1) and
3050          * other side-effects, like CommitLimit going negative.
3051          */
3052         adjust_managed_page_count(page, pages_per_huge_page(h));
3053         cond_resched();
3054     }
3055 }
3056 static void __init hugetlb_hstate_alloc_pages_onenode(struct hstate *h, int nid)
3057 {
3058     unsigned long i;
3059     char buf[32];
3060
3061     for (i = 0; i < h->max_huge_pages_node[nid]; ++i) {
3062         if (hstate_is_gigantic(h)) {
3063             if (!alloc_bootmem_huge_page(h, nid))
3064                 break;
3065         } else {
3066             struct page *page;
3067             gfp_t gfp_mask = htlb_alloc_mask(h) | __GFP_THISNODE;
3068
3069             page = alloc_fresh_huge_page(h, gfp_mask, nid,
3070                     &node_states[N_MEMORY], NULL);
3071             if (!page)
3072                 break;
3073             put_page(page); /* free it into the hugepage allocator */
3074         }
3075         cond_resched();
3076     }
3077     if (i == h->max_huge_pages_node[nid])
3078         return;
3079
3080     string_get_size(huge_page_size(h), 1, STRING_UNITS_2, buf, 32);
3081     pr_warn("HugeTLB: allocating %u of page size %s failed node%d.  Only allocated %lu hugepages.\n",
3082         h->max_huge_pages_node[nid], buf, nid, i);
3083     h->max_huge_pages -= (h->max_huge_pages_node[nid] - i);
3084     h->max_huge_pages_node[nid] = i;
3085 }
3086
3087 static void __init hugetlb_hstate_alloc_pages(struct hstate *h)
3088 {
3089     unsigned long i;
3090     nodemask_t *node_alloc_noretry;
3091     bool node_specific_alloc = false;
3092
3093     /* skip gigantic hugepages allocation if hugetlb_cma enabled */
3094     if (hstate_is_gigantic(h) && hugetlb_cma_size) {
3095         pr_warn_once("HugeTLB: hugetlb_cma is enabled, skip boot time allocation\n");
3096         return;
3097     }
3098
3099     /* do node specific alloc */
3100     for_each_online_node(i) {
3101         if (h->max_huge_pages_node[i] > 0) {
3102             hugetlb_hstate_alloc_pages_onenode(h, i);
3103             node_specific_alloc = true;
3104         }
3105     }
3106
3107     if (node_specific_alloc)
3108         return;
3109
3110     /* below will do all node balanced alloc */
3111     if (!hstate_is_gigantic(h)) {
3112         /*
3113          * Bit mask controlling how hard we retry per-node allocations.
3114          * Ignore errors as lower level routines can deal with
3115          * node_alloc_noretry == NULL.  If this kmalloc fails at boot
3116          * time, we are likely in bigger trouble.
3117          */
3118         node_alloc_noretry = kmalloc(sizeof(*node_alloc_noretry),
3119                         GFP_KERNEL);
3120     } else {
3121         /* allocations done at boot time */
3122         node_alloc_noretry = NULL;
3123     }
3124
3125     /* bit mask controlling how hard we retry per-node allocations */
3126     if (node_alloc_noretry)
3127         nodes_clear(*node_alloc_noretry);
3128
3129     for (i = 0; i < h->max_huge_pages; ++i) {
3130         if (hstate_is_gigantic(h)) {
3131             if (!alloc_bootmem_huge_page(h, NUMA_NO_NODE))
3132                 break;
3133         } else if (!alloc_pool_huge_page(h,
3134                      &node_states[N_MEMORY],
3135                      node_alloc_noretry))
3136             break;
3137         cond_resched();
3138     }
3139     if (i < h->max_huge_pages) {
3140         char buf[32];
3141
3142         string_get_size(huge_page_size(h), 1, STRING_UNITS_2, buf, 32);
3143         pr_warn("HugeTLB: allocating %lu of page size %s failed.  Only allocated %lu hugepages.\n",
3144             h->max_huge_pages, buf, i);
3145         h->max_huge_pages = i;
3146     }
3147     kfree(node_alloc_noretry);
3148 }
3149
3150 static void __init hugetlb_init_hstates(void)
3151 {
3152     struct hstate *h, *h2;
3153
3154     for_each_hstate(h) {
3155         /* oversize hugepages were init'ed in early boot */
3156         if (!hstate_is_gigantic(h))
3157             hugetlb_hstate_alloc_pages(h);
3158
3159         /*
3160          * Set demote order for each hstate.  Note that
3161          * h->demote_order is initially 0.
3162          * - We can not demote gigantic pages if runtime freeing
3163          *   is not supported, so skip this.
3164          * - If CMA allocation is possible, we can not demote
3165          *   HUGETLB_PAGE_ORDER or smaller size pages.
3166          */
3167         if (hstate_is_gigantic(h) && !gigantic_page_runtime_supported())
3168             continue;
3169         if (hugetlb_cma_size && h->order <= HUGETLB_PAGE_ORDER)
3170             continue;
3171         for_each_hstate(h2) {
3172             if (h2 == h)
3173                 continue;
3174             if (h2->order < h->order &&
3175                 h2->order > h->demote_order)
3176                 h->demote_order = h2->order;
3177         }
3178     }
3179 }
3180
3181 static void __init report_hugepages(void)
3182 {
3183     struct hstate *h;
3184
3185     for_each_hstate(h) {
3186         char buf[32];
3187
3188         string_get_size(huge_page_size(h), 1, STRING_UNITS_2, buf, 32);
3189         pr_info("HugeTLB: registered %s page size, pre-allocated %ld pages\n",
3190             buf, h->free_huge_pages);
3191         pr_info("HugeTLB: %d KiB vmemmap can be freed for a %s page\n",
3192             hugetlb_vmemmap_optimizable_size(h) / SZ_1K, buf);
3193     }
3194 }
3195
3196 #ifdef CONFIG_HIGHMEM
3197 static void try_to_free_low(struct hstate *h, unsigned long count,
3198                         nodemask_t *nodes_allowed)
3199 {
3200     int i;
3201     LIST_HEAD(page_list);
3202
3203     lockdep_assert_held(&hugetlb_lock);
3204     if (hstate_is_gigantic(h))
3205         return;
3206
3207     /*
3208      * Collect pages to be freed on a list, and free after dropping lock
3209      */
3210     for_each_node_mask(i, *nodes_allowed) {
3211         struct page *page, *next;
3212         struct list_head *freel = &h->hugepage_freelists[i];
3213         list_for_each_entry_safe(page, next, freel, lru) {
3214             if (count >= h->nr_huge_pages)
3215                 goto out;
3216             if (PageHighMem(page))
3217                 continue;
3218             remove_hugetlb_page(h, page, false);
3219             list_add(&page->lru, &page_list);
3220         }
3221     }
3222
3223 out:
3224     spin_unlock_irq(&hugetlb_lock);
3225     update_and_free_pages_bulk(h, &page_list);
3226     spin_lock_irq(&hugetlb_lock);
3227 }
3228 #else
3229 static inline void try_to_free_low(struct hstate *h, unsigned long count,
3230                         nodemask_t *nodes_allowed)
3231 {
3232 }
3233 #endif
3234
3235 /*
3236  * Increment or decrement surplus_huge_pages.  Keep node-specific counters
3237  * balanced by operating on them in a round-robin fashion.
3238  * Returns 1 if an adjustment was made.
3239  */
3240 static int adjust_pool_surplus(struct hstate *h, nodemask_t *nodes_allowed,
3241                 int delta)
3242 {
3243     int nr_nodes, node;
3244
3245     lockdep_assert_held(&hugetlb_lock);
3246     VM_BUG_ON(delta != -1 && delta != 1);
3247
3248     if (delta < 0) {
3249         for_each_node_mask_to_alloc(h, nr_nodes, node, nodes_allowed) {
3250             if (h->surplus_huge_pages_node[node])
3251                 goto found;
3252         }
3253     } else {
3254         for_each_node_mask_to_free(h, nr_nodes, node, nodes_allowed) {
3255             if (h->surplus_huge_pages_node[node] <
3256                     h->nr_huge_pages_node[node])
3257                 goto found;
3258         }
3259     }
3260     return 0;
3261
3262 found:
3263     h->surplus_huge_pages += delta;
3264     h->surplus_huge_pages_node[node] += delta;
3265     return 1;
3266 }
3267
3268 #define persistent_huge_pages(h) (h->nr_huge_pages - h->surplus_huge_pages)
3269 static int set_max_huge_pages(struct hstate *h, unsigned long count, int nid,
3270                   nodemask_t *nodes_allowed)
3271 {
3272     unsigned long min_count, ret;
3273     struct page *page;
3274     LIST_HEAD(page_list);
3275     NODEMASK_ALLOC(nodemask_t, node_alloc_noretry, GFP_KERNEL);
3276
3277     /*
3278      * Bit mask controlling how hard we retry per-node allocations.
3279      * If we can not allocate the bit mask, do not attempt to allocate
3280      * the requested huge pages.
3281      */
3282     if (node_alloc_noretry)
3283         nodes_clear(*node_alloc_noretry);
3284     else
3285         return -ENOMEM;
3286
3287     /*
3288      * resize_lock mutex prevents concurrent adjustments to number of
3289      * pages in hstate via the proc/sysfs interfaces.
3290      */
3291     mutex_lock(&h->resize_lock);
3292     flush_free_hpage_work(h);
3293     spin_lock_irq(&hugetlb_lock);
3294
3295     /*
3296      * Check for a node specific request.
3297      * Changing node specific huge page count may require a corresponding
3298      * change to the global count.  In any case, the passed node mask
3299      * (nodes_allowed) will restrict alloc/free to the specified node.
3300      */
3301     if (nid != NUMA_NO_NODE) {
3302         unsigned long old_count = count;
3303
3304         count += h->nr_huge_pages - h->nr_huge_pages_node[nid];
3305         /*
3306          * User may have specified a large count value which caused the
3307          * above calculation to overflow.  In this case, they wanted
3308          * to allocate as many huge pages as possible.  Set count to
3309          * largest possible value to align with their intention.
3310          */
3311         if (count < old_count)
3312             count = ULONG_MAX;
3313     }
3314
3315     /*
3316      * Gigantic pages runtime allocation depend on the capability for large
3317      * page range allocation.
3318      * If the system does not provide this feature, return an error when
3319      * the user tries to allocate gigantic pages but let the user free the
3320      * boottime allocated gigantic pages.
3321      */
3322     if (hstate_is_gigantic(h) && !IS_ENABLED(CONFIG_CONTIG_ALLOC)) {
3323         if (count > persistent_huge_pages(h)) {
3324             spin_unlock_irq(&hugetlb_lock);
3325             mutex_unlock(&h->resize_lock);
3326             NODEMASK_FREE(node_alloc_noretry);
3327             return -EINVAL;
3328         }
3329         /* Fall through to decrease pool */
3330     }
3331
3332     /*
3333      * Increase the pool size
3334      * First take pages out of surplus state.  Then make up the
3335      * remaining difference by allocating fresh huge pages.
3336      *
3337      * We might race with alloc_surplus_huge_page() here and be unable
3338      * to convert a surplus huge page to a normal huge page. That is
3339      * not critical, though, it just means the overall size of the
3340      * pool might be one hugepage larger than it needs to be, but
3341      * within all the constraints specified by the sysctls.
3342      */
3343     while (h->surplus_huge_pages && count > persistent_huge_pages(h)) {
3344         if (!adjust_pool_surplus(h, nodes_allowed, -1))
3345             break;
3346     }
3347
3348     while (count > persistent_huge_pages(h)) {
3349         /*
3350          * If this allocation races such that we no longer need the
3351          * page, free_huge_page will handle it by freeing the page
3352          * and reducing the surplus.
3353          */
3354         spin_unlock_irq(&hugetlb_lock);
3355
3356         /* yield cpu to avoid soft lockup */
3357         cond_resched();
3358
3359         ret = alloc_pool_huge_page(h, nodes_allowed,
3360                         node_alloc_noretry);
3361         spin_lock_irq(&hugetlb_lock);
3362         if (!ret)
3363             goto out;
3364
3365         /* Bail for signals. Probably ctrl-c from user */
3366         if (signal_pending(current))
3367             goto out;
3368     }
3369
3370     /*
3371      * Decrease the pool size
3372      * First return free pages to the buddy allocator (being careful
3373      * to keep enough around to satisfy reservations).  Then place
3374      * pages into surplus state as needed so the pool will shrink
3375      * to the desired size as pages become free.
3376      *
3377      * By placing pages into the surplus state independent of the
3378      * overcommit value, we are allowing the surplus pool size to
3379      * exceed overcommit. There are few sane options here. Since
3380      * alloc_surplus_huge_page() is checking the global counter,
3381      * though, we'll note that we're not allowed to exceed surplus
3382      * and won't grow the pool anywhere else. Not until one of the
3383      * sysctls are changed, or the surplus pages go out of use.
3384      */
3385     min_count = h->resv_huge_pages + h->nr_huge_pages - h->free_huge_pages;
3386     min_count = max(count, min_count);
3387     try_to_free_low(h, min_count, nodes_allowed);
3388
3389     /*
3390      * Collect pages to be removed on list without dropping lock
3391      */
3392     while (min_count < persistent_huge_pages(h)) {
3393         page = remove_pool_huge_page(h, nodes_allowed, 0);
3394         if (!page)
3395             break;
3396
3397         list_add(&page->lru, &page_list);
3398     }
3399     /* free the pages after dropping lock */
3400     spin_unlock_irq(&hugetlb_lock);
3401     update_and_free_pages_bulk(h, &page_list);
3402     flush_free_hpage_work(h);
3403     spin_lock_irq(&hugetlb_lock);
3404
3405     while (count < persistent_huge_pages(h)) {
3406         if (!adjust_pool_surplus(h, nodes_allowed, 1))
3407             break;
3408     }
3409 out:
3410     h->max_huge_pages = persistent_huge_pages(h);
3411     spin_unlock_irq(&hugetlb_lock);
3412     mutex_unlock(&h->resize_lock);
3413
3414     NODEMASK_FREE(node_alloc_noretry);
3415
3416     return 0;
3417 }
3418
3419 static int demote_free_huge_page(struct hstate *h, struct page *page)
3420 {
3421     int i, nid = page_to_nid(page);
3422     struct hstate *target_hstate;
3423     struct page *subpage;
3424     int rc = 0;
3425
3426     target_hstate = size_to_hstate(PAGE_SIZE << h->demote_order);
3427
3428     remove_hugetlb_page_for_demote(h, page, false);
3429     spin_unlock_irq(&hugetlb_lock);
3430
3431     rc = hugetlb_vmemmap_restore(h, page);
3432     if (rc) {
3433         /* Allocation of vmemmmap failed, we can not demote page */
3434         spin_lock_irq(&hugetlb_lock);
3435         set_page_refcounted(page);
3436         add_hugetlb_page(h, page, false);
3437         return rc;
3438     }
3439
3440     /*
3441      * Use destroy_compound_hugetlb_page_for_demote for all huge page
3442      * sizes as it will not ref count pages.
3443      */
3444     destroy_compound_hugetlb_page_for_demote(page, huge_page_order(h));
3445
3446     /*
3447      * Taking target hstate mutex synchronizes with set_max_huge_pages.
3448      * Without the mutex, pages added to target hstate could be marked
3449      * as surplus.
3450      *
3451      * Note that we already hold h->resize_lock.  To prevent deadlock,
3452      * use the convention of always taking larger size hstate mutex first.
3453      */
3454     mutex_lock(&target_hstate->resize_lock);
3455     for (i = 0; i < pages_per_huge_page(h);
3456                 i += pages_per_huge_page(target_hstate)) {
3457         subpage = nth_page(page, i);
3458         if (hstate_is_gigantic(target_hstate))
3459             prep_compound_gigantic_page_for_demote(subpage,
3460                             target_hstate->order);
3461         else
3462             prep_compound_page(subpage, target_hstate->order);
3463         set_page_private(subpage, 0);
3464         set_page_refcounted(subpage);
3465         prep_new_huge_page(target_hstate, subpage, nid);
3466         put_page(subpage);
3467     }
3468     mutex_unlock(&target_hstate->resize_lock);
3469
3470     spin_lock_irq(&hugetlb_lock);
3471
3472     /*
3473      * Not absolutely necessary, but for consistency update max_huge_pages
3474      * based on pool changes for the demoted page.
3475      */
3476     h->max_huge_pages--;
3477     target_hstate->max_huge_pages += pages_per_huge_page(h);
3478
3479     return rc;
3480 }
3481
3482 static int demote_pool_huge_page(struct hstate *h, nodemask_t *nodes_allowed)
3483     __must_hold(&hugetlb_lock)
3484 {
3485     int nr_nodes, node;
3486     struct page *page;
3487
3488     lockdep_assert_held(&hugetlb_lock);
3489
3490     /* We should never get here if no demote order */
3491     if (!h->demote_order) {
3492         pr_warn("HugeTLB: NULL demote order passed to demote_pool_huge_page.\n");
3493         return -EINVAL;     /* internal error */
3494     }
3495
3496     for_each_node_mask_to_free(h, nr_nodes, node, nodes_allowed) {
3497         list_for_each_entry(page, &h->hugepage_freelists[node], lru) {
3498             if (PageHWPoison(page))
3499                 continue;
3500
3501             return demote_free_huge_page(h, page);
3502         }
3503     }
3504
3505     /*
3506      * Only way to get here is if all pages on free lists are poisoned.
3507      * Return -EBUSY so that caller will not retry.
3508      */
3509     return -EBUSY;
3510 }
3511
3512 #define HSTATE_ATTR_RO(_name) \
3513     static struct kobj_attribute _name##_attr = __ATTR_RO(_name)
3514
3515 #define HSTATE_ATTR_WO(_name) \
3516     static struct kobj_attribute _name##_attr = __ATTR_WO(_name)
3517
3518 #define HSTATE_ATTR(_name) \
3519     static struct kobj_attribute _name##_attr = __ATTR_RW(_name)
3520
3521 static struct kobject *hugepages_kobj;
3522 static struct kobject *hstate_kobjs[HUGE_MAX_HSTATE];
3523
3524 static struct hstate *kobj_to_node_hstate(struct kobject *kobj, int *nidp);
3525
3526 static struct hstate *kobj_to_hstate(struct kobject *kobj, int *nidp)
3527 {
3528     int i;
3529
3530     for (i = 0; i < HUGE_MAX_HSTATE; i++)
3531         if (hstate_kobjs[i] == kobj) {
3532             if (nidp)
3533                 *nidp = NUMA_NO_NODE;
3534             return &hstates[i];
3535         }
3536
3537     return kobj_to_node_hstate(kobj, nidp);
3538 }
3539
3540 static ssize_t nr_hugepages_show_common(struct kobject *kobj,
3541                     struct kobj_attribute *attr, char *buf)
3542 {
3543     struct hstate *h;
3544     unsigned long nr_huge_pages;
3545     int nid;
3546
3547     h = kobj_to_hstate(kobj, &nid);
3548     if (nid == NUMA_NO_NODE)
3549         nr_huge_pages = h->nr_huge_pages;
3550     else
3551         nr_huge_pages = h->nr_huge_pages_node[nid];
3552
3553     return sysfs_emit(buf, "%lu\n", nr_huge_pages);
3554 }
3555
3556 static ssize_t __nr_hugepages_store_common(bool obey_mempolicy,
3557                        struct hstate *h, int nid,
3558                        unsigned long count, size_t len)
3559 {
3560     int err;
3561     nodemask_t nodes_allowed, *n_mask;
3562
3563     if (hstate_is_gigantic(h) && !gigantic_page_runtime_supported())
3564         return -EINVAL;
3565
3566     if (nid == NUMA_NO_NODE) {
3567         /*
3568          * global hstate attribute
3569          */
3570         if (!(obey_mempolicy &&
3571                 init_nodemask_of_mempolicy(&nodes_allowed)))
3572             n_mask = &node_states[N_MEMORY];
3573         else
3574             n_mask = &nodes_allowed;
3575     } else {
3576         /*
3577          * Node specific request.  count adjustment happens in
3578          * set_max_huge_pages() after acquiring hugetlb_lock.
3579          */
3580         init_nodemask_of_node(&nodes_allowed, nid);
3581         n_mask = &nodes_allowed;
3582     }
3583
3584     err = set_max_huge_pages(h, count, nid, n_mask);
3585
3586     return err ? err : len;
3587 }
3588
3589 static ssize_t nr_hugepages_store_common(bool obey_mempolicy,
3590                      struct kobject *kobj, const char *buf,
3591                      size_t len)
3592 {
3593     struct hstate *h;
3594     unsigned long count;
3595     int nid;
3596     int err;
3597
3598     err = kstrtoul(buf, 10, &count);
3599     if (err)
3600         return err;
3601
3602     h = kobj_to_hstate(kobj, &nid);
3603     return __nr_hugepages_store_common(obey_mempolicy, h, nid, count, len);
3604 }
3605
3606 static ssize_t nr_hugepages_show(struct kobject *kobj,
3607                        struct kobj_attribute *attr, char *buf)
3608 {
3609     return nr_hugepages_show_common(kobj, attr, buf);
3610 }
3611
3612 static ssize_t nr_hugepages_store(struct kobject *kobj,
3613            struct kobj_attribute *attr, const char *buf, size_t len)
3614 {
3615     return nr_hugepages_store_common(false, kobj, buf, len);
3616 }
3617 HSTATE_ATTR(nr_hugepages);
3618
3619 #ifdef CONFIG_NUMA
3620
3621 /*
3622  * hstate attribute for optionally mempolicy-based constraint on persistent
3623  * huge page alloc/free.
3624  */
3625 static ssize_t nr_hugepages_mempolicy_show(struct kobject *kobj,
3626                        struct kobj_attribute *attr,
3627                        char *buf)
3628 {
3629     return nr_hugepages_show_common(kobj, attr, buf);
3630 }
3631
3632 static ssize_t nr_hugepages_mempolicy_store(struct kobject *kobj,
3633            struct kobj_attribute *attr, const char *buf, size_t len)
3634 {
3635     return nr_hugepages_store_common(true, kobj, buf, len);
3636 }
3637 HSTATE_ATTR(nr_hugepages_mempolicy);
3638 #endif
3639
3640
3641 static ssize_t nr_overcommit_hugepages_show(struct kobject *kobj,
3642                     struct kobj_attribute *attr, char *buf)
3643 {
3644     struct hstate *h = kobj_to_hstate(kobj, NULL);
3645     return sysfs_emit(buf, "%lu\n", h->nr_overcommit_huge_pages);
3646 }
3647
3648 static ssize_t nr_overcommit_hugepages_store(struct kobject *kobj,
3649         struct kobj_attribute *attr, const char *buf, size_t count)
3650 {
3651     int err;
3652     unsigned long input;
3653     struct hstate *h = kobj_to_hstate(kobj, NULL);
3654
3655     if (hstate_is_gigantic(h))
3656         return -EINVAL;
3657
3658     err = kstrtoul(buf, 10, &input);
3659     if (err)
3660         return err;
3661
3662     spin_lock_irq(&hugetlb_lock);
3663     h->nr_overcommit_huge_pages = input;
3664     spin_unlock_irq(&hugetlb_lock);
3665
3666     return count;
3667 }
3668 HSTATE_ATTR(nr_overcommit_hugepages);
3669
3670 static ssize_t free_hugepages_show(struct kobject *kobj,
3671                     struct kobj_attribute *attr, char *buf)
3672 {
3673     struct hstate *h;
3674     unsigned long free_huge_pages;
3675     int nid;
3676
3677     h = kobj_to_hstate(kobj, &nid);
3678     if (nid == NUMA_NO_NODE)
3679         free_huge_pages = h->free_huge_pages;
3680     else
3681         free_huge_pages = h->free_huge_pages_node[nid];
3682
3683     return sysfs_emit(buf, "%lu\n", free_huge_pages);
3684 }
3685 HSTATE_ATTR_RO(free_hugepages);
3686
3687 static ssize_t resv_hugepages_show(struct kobject *kobj,
3688                     struct kobj_attribute *attr, char *buf)
3689 {
3690     struct hstate *h = kobj_to_hstate(kobj, NULL);
3691     return sysfs_emit(buf, "%lu\n", h->resv_huge_pages);
3692 }
3693 HSTATE_ATTR_RO(resv_hugepages);
3694
3695 static ssize_t surplus_hugepages_show(struct kobject *kobj,
3696                     struct kobj_attribute *attr, char *buf)
3697 {
3698     struct hstate *h;
3699     unsigned long surplus_huge_pages;
3700     int nid;
3701
3702     h = kobj_to_hstate(kobj, &nid);
3703     if (nid == NUMA_NO_NODE)
3704         surplus_huge_pages = h->surplus_huge_pages;
3705     else
3706         surplus_huge_pages = h->surplus_huge_pages_node[nid];
3707
3708     return sysfs_emit(buf, "%lu\n", surplus_huge_pages);
3709 }
3710 HSTATE_ATTR_RO(surplus_hugepages);
3711
3712 static ssize_t demote_store(struct kobject *kobj,
3713            struct kobj_attribute *attr, const char *buf, size_t len)
3714 {
3715     unsigned long nr_demote;
3716     unsigned long nr_available;
3717     nodemask_t nodes_allowed, *n_mask;
3718     struct hstate *h;
3719     int err = 0;
3720     int nid;
3721
3722     err = kstrtoul(buf, 10, &nr_demote);
3723     if (err)
3724         return err;
3725     h = kobj_to_hstate(kobj, &nid);
3726
3727     if (nid != NUMA_NO_NODE) {
3728         init_nodemask_of_node(&nodes_allowed, nid);
3729         n_mask = &nodes_allowed;
3730     } else {
3731         n_mask = &node_states[N_MEMORY];
3732     }
3733
3734     /* Synchronize with other sysfs operations modifying huge pages */
3735     mutex_lock(&h->resize_lock);
3736     spin_lock_irq(&hugetlb_lock);
3737
3738     while (nr_demote) {
3739         /*
3740          * Check for available pages to demote each time thorough the
3741          * loop as demote_pool_huge_page will drop hugetlb_lock.
3742          */
3743         if (nid != NUMA_NO_NODE)
3744             nr_available = h->free_huge_pages_node[nid];
3745         else
3746             nr_available = h->free_huge_pages;
3747         nr_available -= h->resv_huge_pages;
3748         if (!nr_available)
3749             break;
3750
3751         err = demote_pool_huge_page(h, n_mask);
3752         if (err)
3753             break;
3754
3755         nr_demote--;
3756     }
3757
3758     spin_unlock_irq(&hugetlb_lock);
3759     mutex_unlock(&h->resize_lock);
3760
3761     if (err)
3762         return err;
3763     return len;
3764 }
3765 HSTATE_ATTR_WO(demote);
3766
3767 static ssize_t demote_size_show(struct kobject *kobj,
3768                     struct kobj_attribute *attr, char *buf)
3769 {
3770     int nid;
3771     struct hstate *h = kobj_to_hstate(kobj, &nid);
3772     unsigned long demote_size = (PAGE_SIZE << h->demote_order) / SZ_1K;
3773
3774     return sysfs_emit(buf, "%lukB\n", demote_size);
3775 }
3776
3777 static ssize_t demote_size_store(struct kobject *kobj,
3778                     struct kobj_attribute *attr,
3779                     const char *buf, size_t count)
3780 {
3781     struct hstate *h, *demote_hstate;
3782     unsigned long demote_size;
3783     unsigned int demote_order;
3784     int nid;
3785
3786     demote_size = (unsigned long)memparse(buf, NULL);
3787
3788     demote_hstate = size_to_hstate(demote_size);
3789     if (!demote_hstate)
3790         return -EINVAL;
3791     demote_order = demote_hstate->order;
3792     if (demote_order < HUGETLB_PAGE_ORDER)
3793         return -EINVAL;
3794
3795     /* demote order must be smaller than hstate order */
3796     h = kobj_to_hstate(kobj, &nid);
3797     if (demote_order >= h->order)
3798         return -EINVAL;
3799
3800     /* resize_lock synchronizes access to demote size and writes */
3801     mutex_lock(&h->resize_lock);
3802     h->demote_order = demote_order;
3803     mutex_unlock(&h->resize_lock);
3804
3805     return count;
3806 }
3807 HSTATE_ATTR(demote_size);
3808
3809 static struct attribute *hstate_attrs[] = {
3810     &nr_hugepages_attr.attr,
3811     &nr_overcommit_hugepages_attr.attr,
3812     &free_hugepages_attr.attr,
3813     &resv_hugepages_attr.attr,
3814     &surplus_hugepages_attr.attr,
3815 #ifdef CONFIG_NUMA
3816     &nr_hugepages_mempolicy_attr.attr,
3817 #endif
3818     NULL,
3819 };
3820
3821 static const struct attribute_group hstate_attr_group = {
3822     .attrs = hstate_attrs,
3823 };
3824
3825 static struct attribute *hstate_demote_attrs[] = {
3826     &demote_size_attr.attr,
3827     &demote_attr.attr,
3828     NULL,
3829 };
3830
3831 static const struct attribute_group hstate_demote_attr_group = {
3832     .attrs = hstate_demote_attrs,
3833 };
3834
3835 static int hugetlb_sysfs_add_hstate(struct hstate *h, struct kobject *parent,
3836                     struct kobject **hstate_kobjs,
3837                     const struct attribute_group *hstate_attr_group)
3838 {
3839     int retval;
3840     int hi = hstate_index(h);
3841
3842     hstate_kobjs[hi] = kobject_create_and_add(h->name, parent);
3843     if (!hstate_kobjs[hi])
3844         return -ENOMEM;
3845
3846     retval = sysfs_create_group(hstate_kobjs[hi], hstate_attr_group);
3847     if (retval) {
3848         kobject_put(hstate_kobjs[hi]);
3849         hstate_kobjs[hi] = NULL;
3850     }
3851
3852     if (h->demote_order) {
3853         if (sysfs_create_group(hstate_kobjs[hi],
3854                     &hstate_demote_attr_group))
3855             pr_warn("HugeTLB unable to create demote interfaces for %s\n", h->name);
3856     }
3857
3858     return retval;
3859 }
3860
3861 static void __init hugetlb_sysfs_init(void)
3862 {
3863     struct hstate *h;
3864     int err;
3865
3866     hugepages_kobj = kobject_create_and_add("hugepages", mm_kobj);
3867     if (!hugepages_kobj)
3868         return;
3869
3870     for_each_hstate(h) {
3871         err = hugetlb_sysfs_add_hstate(h, hugepages_kobj,
3872                      hstate_kobjs, &hstate_attr_group);
3873         if (err)
3874             pr_err("HugeTLB: Unable to add hstate %s", h->name);
3875     }
3876 }
3877
3878 #ifdef CONFIG_NUMA
3879
3880 /*
3881  * node_hstate/s - associate per node hstate attributes, via their kobjects,
3882  * with node devices in node_devices[] using a parallel array.  The array
3883  * index of a node device or _hstate == node id.
3884  * This is here to avoid any static dependency of the node device driver, in
3885  * the base kernel, on the hugetlb module.
3886  */
3887 struct node_hstate {
3888     struct kobject      *hugepages_kobj;
3889     struct kobject      *hstate_kobjs[HUGE_MAX_HSTATE];
3890 };
3891 static struct node_hstate node_hstates[MAX_NUMNODES];
3892
3893 /*
3894  * A subset of global hstate attributes for node devices
3895  */
3896 static struct attribute *per_node_hstate_attrs[] = {
3897     &nr_hugepages_attr.attr,
3898     &free_hugepages_attr.attr,
3899     &surplus_hugepages_attr.attr,
3900     NULL,
3901 };
3902
3903 static const struct attribute_group per_node_hstate_attr_group = {
3904     .attrs = per_node_hstate_attrs,
3905 };
3906
3907 /*
3908  * kobj_to_node_hstate - lookup global hstate for node device hstate attr kobj.
3909  * Returns node id via non-NULL nidp.
3910  */
3911 static struct hstate *kobj_to_node_hstate(struct kobject *kobj, int *nidp)
3912 {
3913     int nid;
3914
3915     for (nid = 0; nid < nr_node_ids; nid++) {
3916         struct node_hstate *nhs = &node_hstates[nid];
3917         int i;
3918         for (i = 0; i < HUGE_MAX_HSTATE; i++)
3919             if (nhs->hstate_kobjs[i] == kobj) {
3920                 if (nidp)
3921                     *nidp = nid;
3922                 return &hstates[i];
3923             }
3924     }
3925
3926     BUG();
3927     return NULL;
3928 }
3929
3930 /*
3931  * Unregister hstate attributes from a single node device.
3932  * No-op if no hstate attributes attached.
3933  */
3934 static void hugetlb_unregister_node(struct node *node)
3935 {
3936     struct hstate *h;
3937     struct node_hstate *nhs = &node_hstates[node->dev.id];
3938
3939     if (!nhs->hugepages_kobj)
3940         return;     /* no hstate attributes */
3941
3942     for_each_hstate(h) {
3943         int idx = hstate_index(h);
3944         if (nhs->hstate_kobjs[idx]) {
3945             kobject_put(nhs->hstate_kobjs[idx]);
3946             nhs->hstate_kobjs[idx] = NULL;
3947         }
3948     }
3949
3950     kobject_put(nhs->hugepages_kobj);
3951     nhs->hugepages_kobj = NULL;
3952 }
3953
3954
3955 /*
3956  * Register hstate attributes for a single node device.
3957  * No-op if attributes already registered.
3958  */
3959 static void hugetlb_register_node(struct node *node)
3960 {
3961     struct hstate *h;
3962     struct node_hstate *nhs = &node_hstates[node->dev.id];
3963     int err;
3964
3965     if (nhs->hugepages_kobj)
3966         return;     /* already allocated */
3967
3968     nhs->hugepages_kobj = kobject_create_and_add("hugepages",
3969                             &node->dev.kobj);
3970     if (!nhs->hugepages_kobj)
3971         return;
3972
3973     for_each_hstate(h) {
3974         err = hugetlb_sysfs_add_hstate(h, nhs->hugepages_kobj,
3975                         nhs->hstate_kobjs,
3976                         &per_node_hstate_attr_group);
3977         if (err) {
3978             pr_err("HugeTLB: Unable to add hstate %s for node %d\n",
3979                 h->name, node->dev.id);
3980             hugetlb_unregister_node(node);
3981             break;
3982         }
3983     }
3984 }
3985
3986 /*
3987  * hugetlb init time:  register hstate attributes for all registered node
3988  * devices of nodes that have memory.  All on-line nodes should have
3989  * registered their associated device by this time.
3990  */
3991 static void __init hugetlb_register_all_nodes(void)
3992 {
3993     int nid;
3994
3995     for_each_node_state(nid, N_MEMORY) {
3996         struct node *node = node_devices[nid];
3997         if (node->dev.id == nid)
3998             hugetlb_register_node(node);
3999     }
4000
4001     /*
4002      * Let the node device driver know we're here so it can
4003      * [un]register hstate attributes on node hotplug.
4004      */
4005     register_hugetlbfs_with_node(hugetlb_register_node,
4006                      hugetlb_unregister_node);
4007 }
4008 #else   /* !CONFIG_NUMA */
4009
4010 static struct hstate *kobj_to_node_hstate(struct kobject *kobj, int *nidp)
4011 {
4012     BUG();
4013     if (nidp)
4014         *nidp = -1;
4015     return NULL;
4016 }
4017
4018 static void hugetlb_register_all_nodes(void) { }
4019
4020 #endif
4021
4022 static int __init hugetlb_init(void)
4023 {
4024     int i;
4025
4026     BUILD_BUG_ON(sizeof_field(struct page, private) * BITS_PER_BYTE <
4027             __NR_HPAGEFLAGS);
4028
4029     if (!hugepages_supported()) {
4030         if (hugetlb_max_hstate || default_hstate_max_huge_pages)
4031             pr_warn("HugeTLB: huge pages not supported, ignoring associated command-line parameters\n");
4032         return 0;
4033     }
4034
4035     /*
4036      * Make sure HPAGE_SIZE (HUGETLB_PAGE_ORDER) hstate exists.  Some
4037      * architectures depend on setup being done here.
4038      */
4039     hugetlb_add_hstate(HUGETLB_PAGE_ORDER);
4040     if (!parsed_default_hugepagesz) {
4041         /*
4042          * If we did not parse a default huge page size, set
4043          * default_hstate_idx to HPAGE_SIZE hstate. And, if the
4044          * number of huge pages for this default size was implicitly
4045          * specified, set that here as well.
4046          * Note that the implicit setting will overwrite an explicit
4047          * setting.  A warning will be printed in this case.
4048          */
4049         default_hstate_idx = hstate_index(size_to_hstate(HPAGE_SIZE));
4050         if (default_hstate_max_huge_pages) {
4051             if (default_hstate.max_huge_pages) {
4052                 char buf[32];
4053
4054                 string_get_size(huge_page_size(&default_hstate),
4055                     1, STRING_UNITS_2, buf, 32);
4056                 pr_warn("HugeTLB: Ignoring hugepages=%lu associated with %s page size\n",
4057                     default_hstate.max_huge_pages, buf);
4058                 pr_warn("HugeTLB: Using hugepages=%lu for number of default huge pages\n",
4059                     default_hstate_max_huge_pages);
4060             }
4061             default_hstate.max_huge_pages =
4062                 default_hstate_max_huge_pages;
4063
4064             for_each_online_node(i)
4065                 default_hstate.max_huge_pages_node[i] =
4066                     default_hugepages_in_node[i];
4067         }
4068     }
4069
4070     hugetlb_cma_check();
4071     hugetlb_init_hstates();
4072     gather_bootmem_prealloc();
4073     report_hugepages();
4074
4075     hugetlb_sysfs_init();
4076     hugetlb_register_all_nodes();
4077     hugetlb_cgroup_file_init();
4078
4079 #ifdef CONFIG_SMP
4080     num_fault_mutexes = roundup_pow_of_two(8 * num_possible_cpus());
4081 #else
4082     num_fault_mutexes = 1;
4083 #endif
4084     hugetlb_fault_mutex_table =
4085         kmalloc_array(num_fault_mutexes, sizeof(struct mutex),
4086                   GFP_KERNEL);
4087     BUG_ON(!hugetlb_fault_mutex_table);
4088
4089     for (i = 0; i < num_fault_mutexes; i++)
4090         mutex_init(&hugetlb_fault_mutex_table[i]);
4091     return 0;
4092 }
4093 subsys_initcall(hugetlb_init);
4094
4095 /* Overwritten by architectures with more huge page sizes */
4096 bool __init __attribute((weak)) arch_hugetlb_valid_size(unsigned long size)
4097 {
4098     return size == HPAGE_SIZE;
4099 }
4100
4101 void __init hugetlb_add_hstate(unsigned int order)
4102 {
4103     struct hstate *h;
4104     unsigned long i;
4105
4106     if (size_to_hstate(PAGE_SIZE << order)) {
4107         return;
4108     }
4109     BUG_ON(hugetlb_max_hstate >= HUGE_MAX_HSTATE);
4110     BUG_ON(order == 0);
4111     h = &hstates[hugetlb_max_hstate++];
4112     mutex_init(&h->resize_lock);
4113     h->order = order;
4114     h->mask = ~(huge_page_size(h) - 1);
4115     for (i = 0; i < MAX_NUMNODES; ++i)
4116         INIT_LIST_HEAD(&h->hugepage_freelists[i]);
4117     INIT_LIST_HEAD(&h->hugepage_activelist);
4118     h->next_nid_to_alloc = first_memory_node;
4119     h->next_nid_to_free = first_memory_node;
4120     snprintf(h->name, HSTATE_NAME_LEN, "hugepages-%lukB",
4121                     huge_page_size(h)/1024);
4122
4123     parsed_hstate = h;
4124 }
4125
4126 bool __init __weak hugetlb_node_alloc_supported(void)
4127 {
4128     return true;
4129 }
4130
4131 static void __init hugepages_clear_pages_in_node(void)
4132 {
4133     if (!hugetlb_max_hstate) {
4134         default_hstate_max_huge_pages = 0;
4135         memset(default_hugepages_in_node, 0,
4136             MAX_NUMNODES * sizeof(unsigned int));
4137     } else {
4138         parsed_hstate->max_huge_pages = 0;
4139         memset(parsed_hstate->max_huge_pages_node, 0,
4140             MAX_NUMNODES * sizeof(unsigned int));
4141     }
4142 }
4143
4144 /*
4145  * hugepages command line processing
4146  * hugepages normally follows a valid hugepagsz or default_hugepagsz
4147  * specification.  If not, ignore the hugepages value.  hugepages can also
4148  * be the first huge page command line  option in which case it implicitly
4149  * specifies the number of huge pages for the default size.
4150  */
4151 static int __init hugepages_setup(char *s)
4152 {
4153     unsigned long *mhp;
4154     static unsigned long *last_mhp;
4155     int node = NUMA_NO_NODE;
4156     int count;
4157     unsigned long tmp;
4158     char *p = s;
4159
4160     if (!parsed_valid_hugepagesz) {
4161         pr_warn("HugeTLB: hugepages=%s does not follow a valid hugepagesz, ignoring\n", s);
4162         parsed_valid_hugepagesz = true;
4163         return 1;
4164     }
4165
4166     /*
4167      * !hugetlb_max_hstate means we haven't parsed a hugepagesz= parameter
4168      * yet, so this hugepages= parameter goes to the "default hstate".
4169      * Otherwise, it goes with the previously parsed hugepagesz or
4170      * default_hugepagesz.
4171      */
4172     else if (!hugetlb_max_hstate)
4173         mhp = &default_hstate_max_huge_pages;
4174     else
4175         mhp = &parsed_hstate->max_huge_pages;
4176
4177     if (mhp == last_mhp) {
4178         pr_warn("HugeTLB: hugepages= specified twice without interleaving hugepagesz=, ignoring hugepages=%s\n", s);
4179         return 1;
4180     }
4181
4182     while (*p) {
4183         count = 0;
4184         if (sscanf(p, "%lu%n", &tmp, &count) != 1)
4185             goto invalid;
4186         /* Parameter is node format */
4187         if (p[count] == ':') {
4188             if (!hugetlb_node_alloc_supported()) {
4189                 pr_warn("HugeTLB: architecture can't support node specific alloc, ignoring!\n");
4190                 return 1;
4191             }
4192             if (tmp >= MAX_NUMNODES || !node_online(tmp))
4193                 goto invalid;
4194             node = array_index_nospec(tmp, MAX_NUMNODES);
4195             p += count + 1;
4196             /* Parse hugepages */
4197             if (sscanf(p, "%lu%n", &tmp, &count) != 1)
4198                 goto invalid;
4199             if (!hugetlb_max_hstate)
4200                 default_hugepages_in_node[node] = tmp;
4201             else
4202                 parsed_hstate->max_huge_pages_node[node] = tmp;
4203             *mhp += tmp;
4204             /* Go to parse next node*/
4205             if (p[count] == ',')
4206                 p += count + 1;
4207             else
4208                 break;
4209         } else {
4210             if (p != s)
4211                 goto invalid;
4212             *mhp = tmp;
4213             break;
4214         }
4215     }
4216
4217     /*
4218      * Global state is always initialized later in hugetlb_init.
4219      * But we need to allocate gigantic hstates here early to still
4220      * use the bootmem allocator.
4221      */
4222     if (hugetlb_max_hstate && hstate_is_gigantic(parsed_hstate))
4223         hugetlb_hstate_alloc_pages(parsed_hstate);
4224
4225     last_mhp = mhp;
4226
4227     return 1;
4228
4229 invalid:
4230     pr_warn("HugeTLB: Invalid hugepages parameter %s\n", p);
4231     hugepages_clear_pages_in_node();
4232     return 1;
4233 }
4234 __setup("hugepages=", hugepages_setup);
4235
4236 /*
4237  * hugepagesz command line processing
4238  * A specific huge page size can only be specified once with hugepagesz.
4239  * hugepagesz is followed by hugepages on the command line.  The global
4240  * variable 'parsed_valid_hugepagesz' is used to determine if prior
4241  * hugepagesz argument was valid.
4242  */
4243 static int __init hugepagesz_setup(char *s)
4244 {
4245     unsigned long size;
4246     struct hstate *h;
4247
4248     parsed_valid_hugepagesz = false;
4249     size = (unsigned long)memparse(s, NULL);
4250
4251     if (!arch_hugetlb_valid_size(size)) {
4252         pr_err("HugeTLB: unsupported hugepagesz=%s\n", s);
4253         return 1;
4254     }
4255
4256     h = size_to_hstate(size);
4257     if (h) {
4258         /*
4259          * hstate for this size already exists.  This is normally
4260          * an error, but is allowed if the existing hstate is the
4261          * default hstate.  More specifically, it is only allowed if
4262          * the number of huge pages for the default hstate was not
4263          * previously specified.
4264          */
4265         if (!parsed_default_hugepagesz ||  h != &default_hstate ||
4266             default_hstate.max_huge_pages) {
4267             pr_warn("HugeTLB: hugepagesz=%s specified twice, ignoring\n", s);
4268             return 1;
4269         }
4270
4271         /*
4272          * No need to call hugetlb_add_hstate() as hstate already
4273          * exists.  But, do set parsed_hstate so that a following
4274          * hugepages= parameter will be applied to this hstate.
4275          */
4276         parsed_hstate = h;
4277         parsed_valid_hugepagesz = true;
4278         return 1;
4279     }
4280
4281     hugetlb_add_hstate(ilog2(size) - PAGE_SHIFT);
4282     parsed_valid_hugepagesz = true;
4283     return 1;
4284 }
4285 __setup("hugepagesz=", hugepagesz_setup);
4286
4287 /*
4288  * default_hugepagesz command line input
4289  * Only one instance of default_hugepagesz allowed on command line.
4290  */
4291 static int __init default_hugepagesz_setup(char *s)
4292 {
4293     unsigned long size;
4294     int i;
4295
4296     parsed_valid_hugepagesz = false;
4297     if (parsed_default_hugepagesz) {
4298         pr_err("HugeTLB: default_hugepagesz previously specified, ignoring %s\n", s);
4299         return 1;
4300     }
4301
4302     size = (unsigned long)memparse(s, NULL);
4303
4304     if (!arch_hugetlb_valid_size(size)) {
4305         pr_err("HugeTLB: unsupported default_hugepagesz=%s\n", s);
4306         return 1;
4307     }
4308
4309     hugetlb_add_hstate(ilog2(size) - PAGE_SHIFT);
4310     parsed_valid_hugepagesz = true;
4311     parsed_default_hugepagesz = true;
4312     default_hstate_idx = hstate_index(size_to_hstate(size));
4313
4314     /*
4315      * The number of default huge pages (for this size) could have been
4316      * specified as the first hugetlb parameter: hugepages=X.  If so,
4317      * then default_hstate_max_huge_pages is set.  If the default huge
4318      * page size is gigantic (>= MAX_ORDER), then the pages must be
4319      * allocated here from bootmem allocator.
4320      */
4321     if (default_hstate_max_huge_pages) {
4322         default_hstate.max_huge_pages = default_hstate_max_huge_pages;
4323         for_each_online_node(i)
4324             default_hstate.max_huge_pages_node[i] =
4325                 default_hugepages_in_node[i];
4326         if (hstate_is_gigantic(&default_hstate))
4327             hugetlb_hstate_alloc_pages(&default_hstate);
4328         default_hstate_max_huge_pages = 0;
4329     }
4330
4331     return 1;
4332 }
4333 __setup("default_hugepagesz=", default_hugepagesz_setup);
4334
4335 static unsigned int allowed_mems_nr(struct hstate *h)
4336 {
4337     int node;
4338     unsigned int nr = 0;
4339     nodemask_t *mpol_allowed;
4340     unsigned int *array = h->free_huge_pages_node;
4341     gfp_t gfp_mask = htlb_alloc_mask(h);
4342
4343     mpol_allowed = policy_nodemask_current(gfp_mask);
4344
4345     for_each_node_mask(node, cpuset_current_mems_allowed) {
4346         if (!mpol_allowed || node_isset(node, *mpol_allowed))
4347             nr += array[node];
4348     }
4349
4350     return nr;
4351 }
4352
4353 #ifdef CONFIG_SYSCTL
4354 static int proc_hugetlb_doulongvec_minmax(struct ctl_table *table, int write,
4355                       void *buffer, size_t *length,
4356                       loff_t *ppos, unsigned long *out)
4357 {
4358     struct ctl_table dup_table;
4359
4360     /*
4361      * In order to avoid races with __do_proc_doulongvec_minmax(), we
4362      * can duplicate the @table and alter the duplicate of it.
4363      */
4364     dup_table = *table;
4365     dup_table.data = out;
4366
4367     return proc_doulongvec_minmax(&dup_table, write, buffer, length, ppos);
4368 }
4369
4370 static int hugetlb_sysctl_handler_common(bool obey_mempolicy,
4371              struct ctl_table *table, int write,
4372              void *buffer, size_t *length, loff_t *ppos)
4373 {
4374     struct hstate *h = &default_hstate;
4375     unsigned long tmp = h->max_huge_pages;
4376     int ret;
4377
4378     if (!hugepages_supported())
4379         return -EOPNOTSUPP;
4380
4381     ret = proc_hugetlb_doulongvec_minmax(table, write, buffer, length, ppos,
4382                          &tmp);
4383     if (ret)
4384         goto out;
4385
4386     if (write)
4387         ret = __nr_hugepages_store_common(obey_mempolicy, h,
4388                           NUMA_NO_NODE, tmp, *length);
4389 out:
4390     return ret;
4391 }
4392
4393 int hugetlb_sysctl_handler(struct ctl_table *table, int write,
4394               void *buffer, size_t *length, loff_t *ppos)
4395 {
4396
4397     return hugetlb_sysctl_handler_common(false, table, write,
4398                             buffer, length, ppos);
4399 }
4400
4401 #ifdef CONFIG_NUMA
4402 int hugetlb_mempolicy_sysctl_handler(struct ctl_table *table, int write,
4403               void *buffer, size_t *length, loff_t *ppos)
4404 {
4405     return hugetlb_sysctl_handler_common(true, table, write,
4406                             buffer, length, ppos);
4407 }
4408 #endif /* CONFIG_NUMA */
4409
4410 int hugetlb_overcommit_handler(struct ctl_table *table, int write,
4411         void *buffer, size_t *length, loff_t *ppos)
4412 {
4413     struct hstate *h = &default_hstate;
4414     unsigned long tmp;
4415     int ret;
4416
4417     if (!hugepages_supported())
4418         return -EOPNOTSUPP;
4419
4420     tmp = h->nr_overcommit_huge_pages;
4421
4422     if (write && hstate_is_gigantic(h))
4423         return -EINVAL;
4424
4425     ret = proc_hugetlb_doulongvec_minmax(table, write, buffer, length, ppos,
4426                          &tmp);
4427     if (ret)
4428         goto out;
4429
4430     if (write) {
4431         spin_lock_irq(&hugetlb_lock);
4432         h->nr_overcommit_huge_pages = tmp;
4433         spin_unlock_irq(&hugetlb_lock);
4434     }
4435 out:
4436     return ret;
4437 }
4438
4439 #endif /* CONFIG_SYSCTL */
4440
4441 void hugetlb_report_meminfo(struct seq_file *m)
4442 {
4443     struct hstate *h;
4444     unsigned long total = 0;
4445
4446     if (!hugepages_supported())
4447         return;
4448
4449     for_each_hstate(h) {
4450         unsigned long count = h->nr_huge_pages;
4451
4452         total += huge_page_size(h) * count;
4453
4454         if (h == &default_hstate)
4455             seq_printf(m,
4456                    "HugePages_Total:   %5lu\n"
4457                    "HugePages_Free:    %5lu\n"
4458                    "HugePages_Rsvd:    %5lu\n"
4459                    "HugePages_Surp:    %5lu\n"
4460                    "Hugepagesize:   %8lu kB\n",
4461                    count,
4462                    h->free_huge_pages,
4463                    h->resv_huge_pages,
4464                    h->surplus_huge_pages,
4465                    huge_page_size(h) / SZ_1K);
4466     }
4467
4468     seq_printf(m, "Hugetlb:        %8lu kB\n", total / SZ_1K);
4469 }
4470
4471 int hugetlb_report_node_meminfo(char *buf, int len, int nid)
4472 {
4473     struct hstate *h = &default_hstate;
4474
4475     if (!hugepages_supported())
4476         return 0;
4477
4478     return sysfs_emit_at(buf, len,
4479                  "Node %d HugePages_Total: %5u\n"
4480                  "Node %d HugePages_Free:  %5u\n"
4481                  "Node %d HugePages_Surp:  %5u\n",
4482                  nid, h->nr_huge_pages_node[nid],
4483                  nid, h->free_huge_pages_node[nid],
4484                  nid, h->surplus_huge_pages_node[nid]);
4485 }
4486
4487 void hugetlb_show_meminfo_node(int nid)
4488 {
4489     struct hstate *h;
4490
4491     if (!hugepages_supported())
4492         return;
4493
4494     for_each_hstate(h)
4495         printk("Node %d hugepages_total=%u hugepages_free=%u hugepages_surp=%u hugepages_size=%lukB\n",
4496             nid,
4497             h->nr_huge_pages_node[nid],
4498             h->free_huge_pages_node[nid],
4499             h->surplus_huge_pages_node[nid],
4500             huge_page_size(h) / SZ_1K);
4501 }
4502
4503 void hugetlb_report_usage(struct seq_file *m, struct mm_struct *mm)
4504 {
4505     seq_printf(m, "HugetlbPages:\t%8lu kB\n",
4506            atomic_long_read(&mm->hugetlb_usage) << (PAGE_SHIFT - 10));
4507 }
4508
4509 /* Return the number pages of memory we physically have, in PAGE_SIZE units. */
4510 unsigned long hugetlb_total_pages(void)
4511 {
4512     struct hstate *h;
4513     unsigned long nr_total_pages = 0;
4514
4515     for_each_hstate(h)
4516         nr_total_pages += h->nr_huge_pages * pages_per_huge_page(h);
4517     return nr_total_pages;
4518 }
4519
4520 static int hugetlb_acct_memory(struct hstate *h, long delta)
4521 {
4522     int ret = -ENOMEM;
4523
4524     if (!delta)
4525         return 0;
4526
4527     spin_lock_irq(&hugetlb_lock);
4528     /*
4529      * When cpuset is configured, it breaks the strict hugetlb page
4530      * reservation as the accounting is done on a global variable. Such
4531      * reservation is completely rubbish in the presence of cpuset because
4532      * the reservation is not checked against page availability for the
4533      * current cpuset. Application can still potentially OOM'ed by kernel
4534      * with lack of free htlb page in cpuset that the task is in.
4535      * Attempt to enforce strict accounting with cpuset is almost
4536      * impossible (or too ugly) because cpuset is too fluid that
4537      * task or memory node can be dynamically moved between cpusets.
4538      *
4539      * The change of semantics for shared hugetlb mapping with cpuset is
4540      * undesirable. However, in order to preserve some of the semantics,
4541      * we fall back to check against current free page availability as
4542      * a best attempt and hopefully to minimize the impact of changing
4543      * semantics that cpuset has.
4544      *
4545      * Apart from cpuset, we also have memory policy mechanism that
4546      * also determines from which node the kernel will allocate memory
4547      * in a NUMA system. So similar to cpuset, we also should consider
4548      * the memory policy of the current task. Similar to the description
4549      * above.
4550      */
4551     if (delta > 0) {
4552         if (gather_surplus_pages(h, delta) < 0)
4553             goto out;
4554
4555         if (delta > allowed_mems_nr(h)) {
4556             return_unused_surplus_pages(h, delta);
4557             goto out;
4558         }
4559     }
4560
4561     ret = 0;
4562     if (delta < 0)
4563         return_unused_surplus_pages(h, (unsigned long) -delta);
4564
4565 out:
4566     spin_unlock_irq(&hugetlb_lock);
4567     return ret;
4568 }
4569
4570 static void hugetlb_vm_op_open(struct vm_area_struct *vma)
4571 {
4572     struct resv_map *resv = vma_resv_map(vma);
4573
4574     /*
4575      * This new VMA should share its siblings reservation map if present.
4576      * The VMA will only ever have a valid reservation map pointer where
4577      * it is being copied for another still existing VMA.  As that VMA
4578      * has a reference to the reservation map it cannot disappear until
4579      * after this open call completes.  It is therefore safe to take a
4580      * new reference here without additional locking.
4581      */
4582     if (resv && is_vma_resv_set(vma, HPAGE_RESV_OWNER)) {
4583         resv_map_dup_hugetlb_cgroup_uncharge_info(resv);
4584         kref_get(&resv->refs);
4585     }
4586 }
4587
4588 static void hugetlb_vm_op_close(struct vm_area_struct *vma)
4589 {
4590     struct hstate *h = hstate_vma(vma);
4591     struct resv_map *resv = vma_resv_map(vma);
4592     struct hugepage_subpool *spool = subpool_vma(vma);
4593     unsigned long reserve, start, end;
4594     long gbl_reserve;
4595
4596     if (!resv || !is_vma_resv_set(vma, HPAGE_RESV_OWNER))
4597         return;
4598
4599     start = vma_hugecache_offset(h, vma, vma->vm_start);
4600     end = vma_hugecache_offset(h, vma, vma->vm_end);
4601
4602     reserve = (end - start) - region_count(resv, start, end);
4603     hugetlb_cgroup_uncharge_counter(resv, start, end);
4604     if (reserve) {
4605         /*
4606          * Decrement reserve counts.  The global reserve count may be
4607          * adjusted if the subpool has a minimum size.
4608          */
4609         gbl_reserve = hugepage_subpool_put_pages(spool, reserve);
4610         hugetlb_acct_memory(h, -gbl_reserve);
4611     }
4612
4613     kref_put(&resv->refs, resv_map_release);
4614 }
4615
4616 static int hugetlb_vm_op_split(struct vm_area_struct *vma, unsigned long addr)
4617 {
4618     if (addr & ~(huge_page_mask(hstate_vma(vma))))
4619         return -EINVAL;
4620     return 0;
4621 }
4622
4623 static unsigned long hugetlb_vm_op_pagesize(struct vm_area_struct *vma)
4624 {
4625     return huge_page_size(hstate_vma(vma));
4626 }
4627
4628 /*
4629  * We cannot handle pagefaults against hugetlb pages at all.  They cause
4630  * handle_mm_fault() to try to instantiate regular-sized pages in the
4631  * hugepage VMA.  do_page_fault() is supposed to trap this, so BUG is we get
4632  * this far.
4633  */
4634 static vm_fault_t hugetlb_vm_op_fault(struct vm_fault *vmf)
4635 {
4636     BUG();
4637     return 0;
4638 }
4639
4640 /*
4641  * When a new function is introduced to vm_operations_struct and added
4642  * to hugetlb_vm_ops, please consider adding the function to shm_vm_ops.
4643  * This is because under System V memory model, mappings created via
4644  * shmget/shmat with "huge page" specified are backed by hugetlbfs files,
4645  * their original vm_ops are overwritten with shm_vm_ops.
4646  */
4647 const struct vm_operations_struct hugetlb_vm_ops = {
4648     .fault = hugetlb_vm_op_fault,
4649     .open = hugetlb_vm_op_open,
4650     .close = hugetlb_vm_op_close,
4651     .may_split = hugetlb_vm_op_split,
4652     .pagesize = hugetlb_vm_op_pagesize,
4653 };
4654
4655 static pte_t make_huge_pte(struct vm_area_struct *vma, struct page *page,
4656                 int writable)
4657 {
4658     pte_t entry;
4659     unsigned int shift = huge_page_shift(hstate_vma(vma));
4660
4661     if (writable) {
4662         entry = huge_pte_mkwrite(huge_pte_mkdirty(mk_huge_pte(page,
4663                      vma->vm_page_prot)));
4664     } else {
4665         entry = huge_pte_wrprotect(mk_huge_pte(page,
4666                        vma->vm_page_prot));
4667     }
4668     entry = pte_mkyoung(entry);
4669     entry = arch_make_huge_pte(entry, shift, vma->vm_flags);
4670
4671     return entry;
4672 }
4673
4674 static void set_huge_ptep_writable(struct vm_area_struct *vma,
4675                    unsigned long address, pte_t *ptep)
4676 {
4677     pte_t entry;
4678
4679     entry = huge_pte_mkwrite(huge_pte_mkdirty(huge_ptep_get(ptep)));
4680     if (huge_ptep_set_access_flags(vma, address, ptep, entry, 1))
4681         update_mmu_cache(vma, address, ptep);
4682 }
4683
4684 bool is_hugetlb_entry_migration(pte_t pte)
4685 {
4686     swp_entry_t swp;
4687
4688     if (huge_pte_none(pte) || pte_present(pte))
4689         return false;
4690     swp = pte_to_swp_entry(pte);
4691     if (is_migration_entry(swp))
4692         return true;
4693     else
4694         return false;
4695 }
4696
4697 static bool is_hugetlb_entry_hwpoisoned(pte_t pte)
4698 {
4699     swp_entry_t swp;
4700
4701     if (huge_pte_none(pte) || pte_present(pte))
4702         return false;
4703     swp = pte_to_swp_entry(pte);
4704     if (is_hwpoison_entry(swp))
4705         return true;
4706     else
4707         return false;
4708 }
4709
4710 static void
4711 hugetlb_install_page(struct vm_area_struct *vma, pte_t *ptep, unsigned long addr,
4712              struct page *new_page)
4713 {
4714     __SetPageUptodate(new_page);
4715     hugepage_add_new_anon_rmap(new_page, vma, addr);
4716     set_huge_pte_at(vma->vm_mm, addr, ptep, make_huge_pte(vma, new_page, 1));
4717     hugetlb_count_add(pages_per_huge_page(hstate_vma(vma)), vma->vm_mm);
4718     ClearHPageRestoreReserve(new_page);
4719     SetHPageMigratable(new_page);
4720 }
4721
4722 int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src,
4723                 struct vm_area_struct *dst_vma,
4724                 struct vm_area_struct *src_vma)
4725 {
4726     pte_t *src_pte, *dst_pte, entry, dst_entry;
4727     struct page *ptepage;
4728     unsigned long addr;
4729     bool cow = is_cow_mapping(src_vma->vm_flags);
4730     struct hstate *h = hstate_vma(src_vma);
4731     unsigned long sz = huge_page_size(h);
4732     unsigned long npages = pages_per_huge_page(h);
4733     struct address_space *mapping = src_vma->vm_file->f_mapping;
4734     struct mmu_notifier_range range;
4735     unsigned long last_addr_mask;
4736     int ret = 0;
4737
4738     if (cow) {
4739         mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, src_vma, src,
4740                     src_vma->vm_start,
4741                     src_vma->vm_end);
4742         mmu_notifier_invalidate_range_start(&range);
4743         mmap_assert_write_locked(src);
4744         raw_write_seqcount_begin(&src->write_protect_seq);
4745     } else {
4746         /*
4747          * For shared mappings i_mmap_rwsem must be held to call
4748          * huge_pte_alloc, otherwise the returned ptep could go
4749          * away if part of a shared pmd and another thread calls
4750          * huge_pmd_unshare.
4751          */
4752         i_mmap_lock_read(mapping);
4753     }
4754
4755     last_addr_mask = hugetlb_mask_last_page(h);
4756     for (addr = src_vma->vm_start; addr < src_vma->vm_end; addr += sz) {
4757         spinlock_t *src_ptl, *dst_ptl;
4758         src_pte = huge_pte_offset(src, addr, sz);
4759         if (!src_pte) {
4760             addr |= last_addr_mask;
4761             continue;
4762         }
4763         dst_pte = huge_pte_alloc(dst, dst_vma, addr, sz);
4764         if (!dst_pte) {
4765             ret = -ENOMEM;
4766             break;
4767         }
4768
4769         /*
4770          * If the pagetables are shared don't copy or take references.
4771          * dst_pte == src_pte is the common case of src/dest sharing.
4772          *
4773          * However, src could have 'unshared' and dst shares with
4774          * another vma.  If dst_pte !none, this implies sharing.
4775          * Check here before taking page table lock, and once again
4776          * after taking the lock below.
4777          */
4778         dst_entry = huge_ptep_get(dst_pte);
4779         if ((dst_pte == src_pte) || !huge_pte_none(dst_entry)) {
4780             addr |= last_addr_mask;
4781             continue;
4782         }
4783
4784         dst_ptl = huge_pte_lock(h, dst, dst_pte);
4785         src_ptl = huge_pte_lockptr(h, src, src_pte);
4786         spin_lock_nested(src_ptl, SINGLE_DEPTH_NESTING);
4787         entry = huge_ptep_get(src_pte);
4788         dst_entry = huge_ptep_get(dst_pte);
4789 again:
4790         if (huge_pte_none(entry) || !huge_pte_none(dst_entry)) {
4791             /*
4792              * Skip if src entry none.  Also, skip in the
4793              * unlikely case dst entry !none as this implies
4794              * sharing with another vma.
4795              */
4796             ;
4797         } else if (unlikely(is_hugetlb_entry_hwpoisoned(entry))) {
4798             bool uffd_wp = huge_pte_uffd_wp(entry);
4799
4800             if (!userfaultfd_wp(dst_vma) && uffd_wp)
4801                 entry = huge_pte_clear_uffd_wp(entry);
4802             set_huge_pte_at(dst, addr, dst_pte, entry);
4803         } else if (unlikely(is_hugetlb_entry_migration(entry))) {
4804             swp_entry_t swp_entry = pte_to_swp_entry(entry);
4805             bool uffd_wp = huge_pte_uffd_wp(entry);
4806
4807             if (!is_readable_migration_entry(swp_entry) && cow) {
4808                 /*
4809                  * COW mappings require pages in both
4810                  * parent and child to be set to read.
4811                  */
4812                 swp_entry = make_readable_migration_entry(
4813                             swp_offset(swp_entry));
4814                 entry = swp_entry_to_pte(swp_entry);
4815                 if (userfaultfd_wp(src_vma) && uffd_wp)
4816                     entry = huge_pte_mkuffd_wp(entry);
4817                 set_huge_pte_at(src, addr, src_pte, entry);
4818             }
4819             if (!userfaultfd_wp(dst_vma) && uffd_wp)
4820                 entry = huge_pte_clear_uffd_wp(entry);
4821             set_huge_pte_at(dst, addr, dst_pte, entry);
4822         } else if (unlikely(is_pte_marker(entry))) {
4823             /*
4824              * We copy the pte marker only if the dst vma has
4825              * uffd-wp enabled.
4826              */
4827             if (userfaultfd_wp(dst_vma))
4828                 set_huge_pte_at(dst, addr, dst_pte, entry);
4829         } else {
4830             entry = huge_ptep_get(src_pte);
4831             ptepage = pte_page(entry);
4832             get_page(ptepage);
4833
4834             /*
4835              * Failing to duplicate the anon rmap is a rare case
4836              * where we see pinned hugetlb pages while they're
4837              * prone to COW. We need to do the COW earlier during
4838              * fork.
4839              *
4840              * When pre-allocating the page or copying data, we
4841              * need to be without the pgtable locks since we could
4842              * sleep during the process.
4843              */
4844             if (!PageAnon(ptepage)) {
4845                 page_dup_file_rmap(ptepage, true);
4846             } else if (page_try_dup_anon_rmap(ptepage, true,
4847                               src_vma)) {
4848                 pte_t src_pte_old = entry;
4849                 struct page *new;
4850
4851                 spin_unlock(src_ptl);
4852                 spin_unlock(dst_ptl);
4853                 /* Do not use reserve as it's private owned */
4854                 new = alloc_huge_page(dst_vma, addr, 1);
4855                 if (IS_ERR(new)) {
4856                     put_page(ptepage);
4857                     ret = PTR_ERR(new);
4858                     break;
4859                 }
4860                 copy_user_huge_page(new, ptepage, addr, dst_vma,
4861                             npages);
4862                 put_page(ptepage);
4863
4864                 /* Install the new huge page if src pte stable */
4865                 dst_ptl = huge_pte_lock(h, dst, dst_pte);
4866                 src_ptl = huge_pte_lockptr(h, src, src_pte);
4867                 spin_lock_nested(src_ptl, SINGLE_DEPTH_NESTING);
4868                 entry = huge_ptep_get(src_pte);
4869                 if (!pte_same(src_pte_old, entry)) {
4870                     restore_reserve_on_error(h, dst_vma, addr,
4871                                 new);
4872                     put_page(new);
4873                     /* dst_entry won't change as in child */
4874                     goto again;
4875                 }
4876                 hugetlb_install_page(dst_vma, dst_pte, addr, new);
4877                 spin_unlock(src_ptl);
4878                 spin_unlock(dst_ptl);
4879                 continue;
4880             }
4881
4882             if (cow) {
4883                 /*
4884                  * No need to notify as we are downgrading page
4885                  * table protection not changing it to point
4886                  * to a new page.
4887                  *
4888                  * See Documentation/mm/mmu_notifier.rst
4889                  */
4890                 huge_ptep_set_wrprotect(src, addr, src_pte);
4891                 entry = huge_pte_wrprotect(entry);
4892             }
4893
4894             set_huge_pte_at(dst, addr, dst_pte, entry);
4895             hugetlb_count_add(npages, dst);
4896         }
4897         spin_unlock(src_ptl);
4898         spin_unlock(dst_ptl);
4899     }
4900
4901     if (cow) {
4902         raw_write_seqcount_end(&src->write_protect_seq);
4903         mmu_notifier_invalidate_range_end(&range);
4904     } else {
4905         i_mmap_unlock_read(mapping);
4906     }
4907
4908     return ret;
4909 }
4910
4911 static void move_huge_pte(struct vm_area_struct *vma, unsigned long old_addr,
4912               unsigned long new_addr, pte_t *src_pte, pte_t *dst_pte)
4913 {
4914     struct hstate *h = hstate_vma(vma);
4915     struct mm_struct *mm = vma->vm_mm;
4916     spinlock_t *src_ptl, *dst_ptl;
4917     pte_t pte;
4918
4919     dst_ptl = huge_pte_lock(h, mm, dst_pte);
4920     src_ptl = huge_pte_lockptr(h, mm, src_pte);
4921
4922     /*
4923      * We don't have to worry about the ordering of src and dst ptlocks
4924      * because exclusive mmap_sem (or the i_mmap_lock) prevents deadlock.
4925      */
4926     if (src_ptl != dst_ptl)
4927         spin_lock_nested(src_ptl, SINGLE_DEPTH_NESTING);
4928
4929     pte = huge_ptep_get_and_clear(mm, old_addr, src_pte);
4930     set_huge_pte_at(mm, new_addr, dst_pte, pte);
4931
4932     if (src_ptl != dst_ptl)
4933         spin_unlock(src_ptl);
4934     spin_unlock(dst_ptl);
4935 }
4936
4937 int move_hugetlb_page_tables(struct vm_area_struct *vma,
4938                  struct vm_area_struct *new_vma,
4939                  unsigned long old_addr, unsigned long new_addr,
4940                  unsigned long len)
4941 {
4942     struct hstate *h = hstate_vma(vma);
4943     struct address_space *mapping = vma->vm_file->f_mapping;
4944     unsigned long sz = huge_page_size(h);
4945     struct mm_struct *mm = vma->vm_mm;
4946     unsigned long old_end = old_addr + len;
4947     unsigned long last_addr_mask;
4948     pte_t *src_pte, *dst_pte;
4949     struct mmu_notifier_range range;
4950     bool shared_pmd = false;
4951
4952     mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, mm, old_addr,
4953                 old_end);
4954     adjust_range_if_pmd_sharing_possible(vma, &range.start, &range.end);
4955     /*
4956      * In case of shared PMDs, we should cover the maximum possible
4957      * range.
4958      */
4959     flush_cache_range(vma, range.start, range.end);
4960
4961     mmu_notifier_invalidate_range_start(&range);
4962     last_addr_mask = hugetlb_mask_last_page(h);
4963     /* Prevent race with file truncation */
4964     i_mmap_lock_write(mapping);
4965     for (; old_addr < old_end; old_addr += sz, new_addr += sz) {
4966         src_pte = huge_pte_offset(mm, old_addr, sz);
4967         if (!src_pte) {
4968             old_addr |= last_addr_mask;
4969             new_addr |= last_addr_mask;
4970             continue;
4971         }
4972         if (huge_pte_none(huge_ptep_get(src_pte)))
4973             continue;
4974
4975         if (huge_pmd_unshare(mm, vma, old_addr, src_pte)) {
4976             shared_pmd = true;
4977             old_addr |= last_addr_mask;
4978             new_addr |= last_addr_mask;
4979             continue;
4980         }
4981
4982         dst_pte = huge_pte_alloc(mm, new_vma, new_addr, sz);
4983         if (!dst_pte)
4984             break;
4985
4986         move_huge_pte(vma, old_addr, new_addr, src_pte, dst_pte);
4987     }
4988
4989     if (shared_pmd)
4990         flush_tlb_range(vma, range.start, range.end);
4991     else
4992         flush_tlb_range(vma, old_end - len, old_end);
4993     mmu_notifier_invalidate_range_end(&range);
4994     i_mmap_unlock_write(mapping);
4995
4996     return len + old_addr - old_end;
4997 }
4998
4999 static void __unmap_hugepage_range(struct mmu_gather *tlb, struct vm_area_struct *vma,
5000                    unsigned long start, unsigned long end,
5001                    struct page *ref_page, zap_flags_t zap_flags)
5002 {
5003     struct mm_struct *mm = vma->vm_mm;
5004     unsigned long address;
5005     pte_t *ptep;
5006     pte_t pte;
5007     spinlock_t *ptl;
5008     struct page *page;
5009     struct hstate *h = hstate_vma(vma);
5010     unsigned long sz = huge_page_size(h);
5011     struct mmu_notifier_range range;
5012     unsigned long last_addr_mask;
5013     bool force_flush = false;
5014
5015     WARN_ON(!is_vm_hugetlb_page(vma));
5016     BUG_ON(start & ~huge_page_mask(h));
5017     BUG_ON(end & ~huge_page_mask(h));
5018
5019     /*
5020      * This is a hugetlb vma, all the pte entries should point
5021      * to huge page.
5022      */
5023     tlb_change_page_size(tlb, sz);
5024     tlb_start_vma(tlb, vma);
5025
5026     /*
5027      * If sharing possible, alert mmu notifiers of worst case.
5028      */
5029     mmu_notifier_range_init(&range, MMU_NOTIFY_UNMAP, 0, vma, mm, start,
5030                 end);
5031     adjust_range_if_pmd_sharing_possible(vma, &range.start, &range.end);
5032     mmu_notifier_invalidate_range_start(&range);
5033     last_addr_mask = hugetlb_mask_last_page(h);
5034     address = start;
5035     for (; address < end; address += sz) {
5036         ptep = huge_pte_offset(mm, address, sz);
5037         if (!ptep) {
5038             address |= last_addr_mask;
5039             continue;
5040         }
5041
5042         ptl = huge_pte_lock(h, mm, ptep);
5043         if (huge_pmd_unshare(mm, vma, address, ptep)) {
5044             spin_unlock(ptl);
5045             tlb_flush_pmd_range(tlb, address & PUD_MASK, PUD_SIZE);
5046             force_flush = true;
5047             address |= last_addr_mask;
5048             continue;
5049         }
5050
5051         pte = huge_ptep_get(ptep);
5052         if (huge_pte_none(pte)) {
5053             spin_unlock(ptl);
5054             continue;
5055         }
5056
5057         /*
5058          * Migrating hugepage or HWPoisoned hugepage is already
5059          * unmapped and its refcount is dropped, so just clear pte here.
5060          */
5061         if (unlikely(!pte_present(pte))) {
5062             /*
5063              * If the pte was wr-protected by uffd-wp in any of the
5064              * swap forms, meanwhile the caller does not want to
5065              * drop the uffd-wp bit in this zap, then replace the
5066              * pte with a marker.
5067              */
5068             if (pte_swp_uffd_wp_any(pte) &&
5069                 !(zap_flags & ZAP_FLAG_DROP_MARKER))
5070                 set_huge_pte_at(mm, address, ptep,
5071                         make_pte_marker(PTE_MARKER_UFFD_WP));
5072             else
5073                 huge_pte_clear(mm, address, ptep, sz);
5074             spin_unlock(ptl);
5075             continue;
5076         }
5077
5078         page = pte_page(pte);
5079         /*
5080          * If a reference page is supplied, it is because a specific
5081          * page is being unmapped, not a range. Ensure the page we
5082          * are about to unmap is the actual page of interest.
5083          */
5084         if (ref_page) {
5085             if (page != ref_page) {
5086                 spin_unlock(ptl);
5087                 continue;
5088             }
5089             /*
5090              * Mark the VMA as having unmapped its page so that
5091              * future faults in this VMA will fail rather than
5092              * looking like data was lost
5093              */
5094             set_vma_resv_flags(vma, HPAGE_RESV_UNMAPPED);
5095         }
5096
5097         pte = huge_ptep_get_and_clear(mm, address, ptep);
5098         tlb_remove_huge_tlb_entry(h, tlb, ptep, address);
5099         if (huge_pte_dirty(pte))
5100             set_page_dirty(page);
5101         /* Leave a uffd-wp pte marker if needed */
5102         if (huge_pte_uffd_wp(pte) &&
5103             !(zap_flags & ZAP_FLAG_DROP_MARKER))
5104             set_huge_pte_at(mm, address, ptep,
5105                     make_pte_marker(PTE_MARKER_UFFD_WP));
5106         hugetlb_count_sub(pages_per_huge_page(h), mm);
5107         page_remove_rmap(page, vma, true);
5108
5109         spin_unlock(ptl);
5110         tlb_remove_page_size(tlb, page, huge_page_size(h));
5111         /*
5112          * Bail out after unmapping reference page if supplied
5113          */
5114         if (ref_page)
5115             break;
5116     }
5117     mmu_notifier_invalidate_range_end(&range);
5118     tlb_end_vma(tlb, vma);
5119
5120     /*
5121      * If we unshared PMDs, the TLB flush was not recorded in mmu_gather. We
5122      * could defer the flush until now, since by holding i_mmap_rwsem we
5123      * guaranteed that the last refernece would not be dropped. But we must
5124      * do the flushing before we return, as otherwise i_mmap_rwsem will be
5125      * dropped and the last reference to the shared PMDs page might be
5126      * dropped as well.
5127      *
5128      * In theory we could defer the freeing of the PMD pages as well, but
5129      * huge_pmd_unshare() relies on the exact page_count for the PMD page to
5130      * detect sharing, so we cannot defer the release of the page either.
5131      * Instead, do flush now.
5132      */
5133     if (force_flush)
5134         tlb_flush_mmu_tlbonly(tlb);
5135 }
5136
5137 void __unmap_hugepage_range_final(struct mmu_gather *tlb,
5138               struct vm_area_struct *vma, unsigned long start,
5139               unsigned long end, struct page *ref_page,
5140               zap_flags_t zap_flags)
5141 {
5142     __unmap_hugepage_range(tlb, vma, start, end, ref_page, zap_flags);
5143
5144     /*
5145      * Clear this flag so that x86's huge_pmd_share page_table_shareable
5146      * test will fail on a vma being torn down, and not grab a page table
5147      * on its way out.  We're lucky that the flag has such an appropriate
5148      * name, and can in fact be safely cleared here. We could clear it
5149      * before the __unmap_hugepage_range above, but all that's necessary
5150      * is to clear it before releasing the i_mmap_rwsem. This works
5151      * because in the context this is called, the VMA is about to be
5152      * destroyed and the i_mmap_rwsem is held.
5153      */
5154     vma->vm_flags &= ~VM_MAYSHARE;
5155 }
5156
5157 void unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start,
5158               unsigned long end, struct page *ref_page,
5159               zap_flags_t zap_flags)
5160 {
5161     struct mmu_gather tlb;
5162
5163     tlb_gather_mmu(&tlb, vma->vm_mm);
5164     __unmap_hugepage_range(&tlb, vma, start, end, ref_page, zap_flags);
5165     tlb_finish_mmu(&tlb);
5166 }
5167
5168 /*
5169  * This is called when the original mapper is failing to COW a MAP_PRIVATE
5170  * mapping it owns the reserve page for. The intention is to unmap the page
5171  * from other VMAs and let the children be SIGKILLed if they are faulting the
5172  * same region.
5173  */
5174 static void unmap_ref_private(struct mm_struct *mm, struct vm_area_struct *vma,
5175                   struct page *page, unsigned long address)
5176 {
5177     struct hstate *h = hstate_vma(vma);
5178     struct vm_area_struct *iter_vma;
5179     struct address_space *mapping;
5180     pgoff_t pgoff;
5181
5182     /*
5183      * vm_pgoff is in PAGE_SIZE units, hence the different calculation
5184      * from page cache lookup which is in HPAGE_SIZE units.
5185      */
5186     address = address & huge_page_mask(h);
5187     pgoff = ((address - vma->vm_start) >> PAGE_SHIFT) +
5188             vma->vm_pgoff;
5189     mapping = vma->vm_file->f_mapping;
5190
5191     /*
5192      * Take the mapping lock for the duration of the table walk. As
5193      * this mapping should be shared between all the VMAs,
5194      * __unmap_hugepage_range() is called as the lock is already held
5195      */
5196     i_mmap_lock_write(mapping);
5197     vma_interval_tree_foreach(iter_vma, &mapping->i_mmap, pgoff, pgoff) {
5198         /* Do not unmap the current VMA */
5199         if (iter_vma == vma)
5200             continue;
5201
5202         /*
5203          * Shared VMAs have their own reserves and do not affect
5204          * MAP_PRIVATE accounting but it is possible that a shared
5205          * VMA is using the same page so check and skip such VMAs.
5206          */
5207         if (iter_vma->vm_flags & VM_MAYSHARE)
5208             continue;
5209
5210         /*
5211          * Unmap the page from other VMAs without their own reserves.
5212          * They get marked to be SIGKILLed if they fault in these
5213          * areas. This is because a future no-page fault on this VMA
5214          * could insert a zeroed page instead of the data existing
5215          * from the time of fork. This would look like data corruption
5216          */
5217         if (!is_vma_resv_set(iter_vma, HPAGE_RESV_OWNER))
5218             unmap_hugepage_range(iter_vma, address,
5219                          address + huge_page_size(h), page, 0);
5220     }
5221     i_mmap_unlock_write(mapping);
5222 }
5223
5224 /*
5225  * hugetlb_wp() should be called with page lock of the original hugepage held.
5226  * Called with hugetlb_fault_mutex_table held and pte_page locked so we
5227  * cannot race with other handlers or page migration.
5228  * Keep the pte_same checks anyway to make transition from the mutex easier.
5229  */
5230 static vm_fault_t hugetlb_wp(struct mm_struct *mm, struct vm_area_struct *vma,
5231                unsigned long address, pte_t *ptep, unsigned int flags,
5232                struct page *pagecache_page, spinlock_t *ptl)
5233 {
5234     const bool unshare = flags & FAULT_FLAG_UNSHARE;
5235     pte_t pte;
5236     struct hstate *h = hstate_vma(vma);
5237     struct page *old_page, *new_page;
5238     int outside_reserve = 0;
5239     vm_fault_t ret = 0;
5240     unsigned long haddr = address & huge_page_mask(h);
5241     struct mmu_notifier_range range;
5242
5243     VM_BUG_ON(unshare && (flags & FOLL_WRITE));
5244     VM_BUG_ON(!unshare && !(flags & FOLL_WRITE));
5245
5246     /*
5247      * hugetlb does not support FOLL_FORCE-style write faults that keep the
5248      * PTE mapped R/O such as maybe_mkwrite() would do.
5249      */
5250     if (WARN_ON_ONCE(!unshare && !(vma->vm_flags & VM_WRITE)))
5251         return VM_FAULT_SIGSEGV;
5252
5253     /* Let's take out MAP_SHARED mappings first. */
5254     if (vma->vm_flags & VM_MAYSHARE) {
5255         if (unlikely(unshare))
5256             return 0;
5257         set_huge_ptep_writable(vma, haddr, ptep);
5258         return 0;
5259     }
5260
5261     pte = huge_ptep_get(ptep);
5262     old_page = pte_page(pte);
5263
5264     delayacct_wpcopy_start();
5265
5266 retry_avoidcopy:
5267     /*
5268      * If no-one else is actually using this page, we're the exclusive
5269      * owner and can reuse this page.
5270      */
5271     if (page_mapcount(old_page) == 1 && PageAnon(old_page)) {
5272         if (!PageAnonExclusive(old_page))
5273             page_move_anon_rmap(old_page, vma);
5274         if (likely(!unshare))
5275             set_huge_ptep_writable(vma, haddr, ptep);
5276
5277         delayacct_wpcopy_end();
5278         return 0;
5279     }
5280     VM_BUG_ON_PAGE(PageAnon(old_page) && PageAnonExclusive(old_page),
5281                old_page);
5282
5283     /*
5284      * If the process that created a MAP_PRIVATE mapping is about to
5285      * perform a COW due to a shared page count, attempt to satisfy
5286      * the allocation without using the existing reserves. The pagecache
5287      * page is used to determine if the reserve at this address was
5288      * consumed or not. If reserves were used, a partial faulted mapping
5289      * at the time of fork() could consume its reserves on COW instead
5290      * of the full address range.
5291      */
5292     if (is_vma_resv_set(vma, HPAGE_RESV_OWNER) &&
5293             old_page != pagecache_page)
5294         outside_reserve = 1;
5295
5296     get_page(old_page);
5297
5298     /*
5299      * Drop page table lock as buddy allocator may be called. It will
5300      * be acquired again before returning to the caller, as expected.
5301      */
5302     spin_unlock(ptl);
5303     new_page = alloc_huge_page(vma, haddr, outside_reserve);
5304
5305     if (IS_ERR(new_page)) {
5306         /*
5307          * If a process owning a MAP_PRIVATE mapping fails to COW,
5308          * it is due to references held by a child and an insufficient
5309          * huge page pool. To guarantee the original mappers
5310          * reliability, unmap the page from child processes. The child
5311          * may get SIGKILLed if it later faults.
5312          */
5313         if (outside_reserve) {
5314             struct address_space *mapping = vma->vm_file->f_mapping;
5315             pgoff_t idx;
5316             u32 hash;
5317
5318             put_page(old_page);
5319             BUG_ON(huge_pte_none(pte));
5320             /*
5321              * Drop hugetlb_fault_mutex and i_mmap_rwsem before
5322              * unmapping.  unmapping needs to hold i_mmap_rwsem
5323              * in write mode.  Dropping i_mmap_rwsem in read mode
5324              * here is OK as COW mappings do not interact with
5325              * PMD sharing.
5326              *
5327              * Reacquire both after unmap operation.
5328              */
5329             idx = vma_hugecache_offset(h, vma, haddr);
5330             hash = hugetlb_fault_mutex_hash(mapping, idx);
5331             mutex_unlock(&hugetlb_fault_mutex_table[hash]);
5332             i_mmap_unlock_read(mapping);
5333
5334             unmap_ref_private(mm, vma, old_page, haddr);
5335
5336             i_mmap_lock_read(mapping);
5337             mutex_lock(&hugetlb_fault_mutex_table[hash]);
5338             spin_lock(ptl);
5339             ptep = huge_pte_offset(mm, haddr, huge_page_size(h));
5340             if (likely(ptep &&
5341                    pte_same(huge_ptep_get(ptep), pte)))
5342                 goto retry_avoidcopy;
5343             /*
5344              * race occurs while re-acquiring page table
5345              * lock, and our job is done.
5346              */
5347             delayacct_wpcopy_end();
5348             return 0;
5349         }
5350
5351         ret = vmf_error(PTR_ERR(new_page));
5352         goto out_release_old;
5353     }
5354
5355     /*
5356      * When the original hugepage is shared one, it does not have
5357      * anon_vma prepared.
5358      */
5359     if (unlikely(anon_vma_prepare(vma))) {
5360         ret = VM_FAULT_OOM;
5361         goto out_release_all;
5362     }
5363
5364     copy_user_huge_page(new_page, old_page, address, vma,
5365                 pages_per_huge_page(h));
5366     __SetPageUptodate(new_page);
5367
5368     mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, mm, haddr,
5369                 haddr + huge_page_size(h));
5370     mmu_notifier_invalidate_range_start(&range);
5371
5372     /*
5373      * Retake the page table lock to check for racing updates
5374      * before the page tables are altered
5375      */
5376     spin_lock(ptl);
5377     ptep = huge_pte_offset(mm, haddr, huge_page_size(h));
5378     if (likely(ptep && pte_same(huge_ptep_get(ptep), pte))) {
5379         ClearHPageRestoreReserve(new_page);
5380
5381         /* Break COW or unshare */
5382         huge_ptep_clear_flush(vma, haddr, ptep);
5383         mmu_notifier_invalidate_range(mm, range.start, range.end);
5384         page_remove_rmap(old_page, vma, true);
5385         hugepage_add_new_anon_rmap(new_page, vma, haddr);
5386         set_huge_pte_at(mm, haddr, ptep,
5387                 make_huge_pte(vma, new_page, !unshare));
5388         SetHPageMigratable(new_page);
5389         /* Make the old page be freed below */
5390         new_page = old_page;
5391     }
5392     spin_unlock(ptl);
5393     mmu_notifier_invalidate_range_end(&range);
5394 out_release_all:
5395     /*
5396      * No restore in case of successful pagetable update (Break COW or
5397      * unshare)
5398      */
5399     if (new_page != old_page)
5400         restore_reserve_on_error(h, vma, haddr, new_page);
5401     put_page(new_page);
5402 out_release_old:
5403     put_page(old_page);
5404
5405     spin_lock(ptl); /* Caller expects lock to be held */
5406
5407     delayacct_wpcopy_end();
5408     return ret;
5409 }
5410
5411 /* Return the pagecache page at a given address within a VMA */
5412 static struct page *hugetlbfs_pagecache_page(struct hstate *h,
5413             struct vm_area_struct *vma, unsigned long address)
5414 {
5415     struct address_space *mapping;
5416     pgoff_t idx;
5417
5418     mapping = vma->vm_file->f_mapping;
5419     idx = vma_hugecache_offset(h, vma, address);
5420
5421     return find_lock_page(mapping, idx);
5422 }
5423
5424 /*
5425  * Return whether there is a pagecache page to back given address within VMA.
5426  * Caller follow_hugetlb_page() holds page_table_lock so we cannot lock_page.
5427  */
5428 static bool hugetlbfs_pagecache_present(struct hstate *h,
5429             struct vm_area_struct *vma, unsigned long address)
5430 {
5431     struct address_space *mapping;
5432     pgoff_t idx;
5433     struct page *page;
5434
5435     mapping = vma->vm_file->f_mapping;
5436     idx = vma_hugecache_offset(h, vma, address);
5437
5438     page = find_get_page(mapping, idx);
5439     if (page)
5440         put_page(page);
5441     return page != NULL;
5442 }
5443
5444 int huge_add_to_page_cache(struct page *page, struct address_space *mapping,
5445                pgoff_t idx)
5446 {
5447     struct folio *folio = page_folio(page);
5448     struct inode *inode = mapping->host;
5449     struct hstate *h = hstate_inode(inode);
5450     int err;
5451
5452     __folio_set_locked(folio);
5453     err = __filemap_add_folio(mapping, folio, idx, GFP_KERNEL, NULL);
5454
5455     if (unlikely(err)) {
5456         __folio_clear_locked(folio);
5457         return err;
5458     }
5459     ClearHPageRestoreReserve(page);
5460
5461     /*
5462      * mark folio dirty so that it will not be removed from cache/file
5463      * by non-hugetlbfs specific code paths.
5464      */
5465     folio_mark_dirty(folio);
5466
5467     spin_lock(&inode->i_lock);
5468     inode->i_blocks += blocks_per_huge_page(h);
5469     spin_unlock(&inode->i_lock);
5470     return 0;
5471 }
5472
5473 static inline vm_fault_t hugetlb_handle_userfault(struct vm_area_struct *vma,
5474                           struct address_space *mapping,
5475                           pgoff_t idx,
5476                           unsigned int flags,
5477                           unsigned long haddr,
5478                           unsigned long addr,
5479                           unsigned long reason)
5480 {
5481     vm_fault_t ret;
5482     u32 hash;
5483     struct vm_fault vmf = {
5484         .vma = vma,
5485         .address = haddr,
5486         .real_address = addr,
5487         .flags = flags,
5488
5489         /*
5490          * Hard to debug if it ends up being
5491          * used by a callee that assumes
5492          * something about the other
5493          * uninitialized fields... same as in
5494          * memory.c
5495          */
5496     };
5497
5498     /*
5499      * hugetlb_fault_mutex and i_mmap_rwsem must be
5500      * dropped before handling userfault.  Reacquire
5501      * after handling fault to make calling code simpler.
5502      */
5503     hash = hugetlb_fault_mutex_hash(mapping, idx);
5504     mutex_unlock(&hugetlb_fault_mutex_table[hash]);
5505     i_mmap_unlock_read(mapping);
5506     ret = handle_userfault(&vmf, reason);
5507     i_mmap_lock_read(mapping);
5508     mutex_lock(&hugetlb_fault_mutex_table[hash]);
5509
5510     return ret;
5511 }
5512
5513 static vm_fault_t hugetlb_no_page(struct mm_struct *mm,
5514             struct vm_area_struct *vma,
5515             struct address_space *mapping, pgoff_t idx,
5516             unsigned long address, pte_t *ptep,
5517             pte_t old_pte, unsigned int flags)
5518 {
5519     struct hstate *h = hstate_vma(vma);
5520     vm_fault_t ret = VM_FAULT_SIGBUS;
5521     int anon_rmap = 0;
5522     unsigned long size;
5523     struct page *page;
5524     pte_t new_pte;
5525     spinlock_t *ptl;
5526     unsigned long haddr = address & huge_page_mask(h);
5527     bool new_page, new_pagecache_page = false;
5528
5529     /*
5530      * Currently, we are forced to kill the process in the event the
5531      * original mapper has unmapped pages from the child due to a failed
5532      * COW/unsharing. Warn that such a situation has occurred as it may not
5533      * be obvious.
5534      */
5535     if (is_vma_resv_set(vma, HPAGE_RESV_UNMAPPED)) {
5536         pr_warn_ratelimited("PID %d killed due to inadequate hugepage pool\n",
5537                current->pid);
5538         return ret;
5539     }
5540
5541     /*
5542      * We can not race with truncation due to holding i_mmap_rwsem.
5543      * i_size is modified when holding i_mmap_rwsem, so check here
5544      * once for faults beyond end of file.
5545      */
5546     size = i_size_read(mapping->host) >> huge_page_shift(h);
5547     if (idx >= size)
5548         goto out;
5549
5550 retry:
5551     new_page = false;
5552     page = find_lock_page(mapping, idx);
5553     if (!page) {
5554         /* Check for page in userfault range */
5555         if (userfaultfd_missing(vma)) {
5556             ret = hugetlb_handle_userfault(vma, mapping, idx,
5557                                flags, haddr, address,
5558                                VM_UFFD_MISSING);
5559             goto out;
5560         }
5561
5562         page = alloc_huge_page(vma, haddr, 0);
5563         if (IS_ERR(page)) {
5564             /*
5565              * Returning error will result in faulting task being
5566              * sent SIGBUS.  The hugetlb fault mutex prevents two
5567              * tasks from racing to fault in the same page which
5568              * could result in false unable to allocate errors.
5569              * Page migration does not take the fault mutex, but
5570              * does a clear then write of pte's under page table
5571              * lock.  Page fault code could race with migration,
5572              * notice the clear pte and try to allocate a page
5573              * here.  Before returning error, get ptl and make
5574              * sure there really is no pte entry.
5575              */
5576             ptl = huge_pte_lock(h, mm, ptep);
5577             ret = 0;
5578             if (huge_pte_none(huge_ptep_get(ptep)))
5579                 ret = vmf_error(PTR_ERR(page));
5580             spin_unlock(ptl);
5581             goto out;
5582         }
5583         clear_huge_page(page, address, pages_per_huge_page(h));
5584         __SetPageUptodate(page);
5585         new_page = true;
5586
5587         if (vma->vm_flags & VM_MAYSHARE) {
5588             int err = huge_add_to_page_cache(page, mapping, idx);
5589             if (err) {
5590                 put_page(page);
5591                 if (err == -EEXIST)
5592                     goto retry;
5593                 goto out;
5594             }
5595             new_pagecache_page = true;
5596         } else {
5597             lock_page(page);
5598             if (unlikely(anon_vma_prepare(vma))) {
5599                 ret = VM_FAULT_OOM;
5600                 goto backout_unlocked;
5601             }
5602             anon_rmap = 1;
5603         }
5604     } else {
5605         /*
5606          * If memory error occurs between mmap() and fault, some process
5607          * don't have hwpoisoned swap entry for errored virtual address.
5608          * So we need to block hugepage fault by PG_hwpoison bit check.
5609          */
5610         if (unlikely(PageHWPoison(page))) {
5611             ret = VM_FAULT_HWPOISON_LARGE |
5612                 VM_FAULT_SET_HINDEX(hstate_index(h));
5613             goto backout_unlocked;
5614         }
5615
5616         /* Check for page in userfault range. */
5617         if (userfaultfd_minor(vma)) {
5618             unlock_page(page);
5619             put_page(page);
5620             ret = hugetlb_handle_userfault(vma, mapping, idx,
5621                                flags, haddr, address,
5622                                VM_UFFD_MINOR);
5623             goto out;
5624         }
5625     }
5626
5627     /*
5628      * If we are going to COW a private mapping later, we examine the
5629      * pending reservations for this page now. This will ensure that
5630      * any allocations necessary to record that reservation occur outside
5631      * the spinlock.
5632      */
5633     if ((flags & FAULT_FLAG_WRITE) && !(vma->vm_flags & VM_SHARED)) {
5634         if (vma_needs_reservation(h, vma, haddr) < 0) {
5635             ret = VM_FAULT_OOM;
5636             goto backout_unlocked;
5637         }
5638         /* Just decrements count, does not deallocate */
5639         vma_end_reservation(h, vma, haddr);
5640     }
5641
5642     ptl = huge_pte_lock(h, mm, ptep);
5643     ret = 0;
5644     /* If pte changed from under us, retry */
5645     if (!pte_same(huge_ptep_get(ptep), old_pte))
5646         goto backout;
5647
5648     if (anon_rmap) {
5649         ClearHPageRestoreReserve(page);
5650         hugepage_add_new_anon_rmap(page, vma, haddr);
5651     } else
5652         page_dup_file_rmap(page, true);
5653     new_pte = make_huge_pte(vma, page, ((vma->vm_flags & VM_WRITE)
5654                 && (vma->vm_flags & VM_SHARED)));
5655     /*
5656      * If this pte was previously wr-protected, keep it wr-protected even
5657      * if populated.
5658      */
5659     if (unlikely(pte_marker_uffd_wp(old_pte)))
5660         new_pte = huge_pte_wrprotect(huge_pte_mkuffd_wp(new_pte));
5661     set_huge_pte_at(mm, haddr, ptep, new_pte);
5662
5663     hugetlb_count_add(pages_per_huge_page(h), mm);
5664     if ((flags & FAULT_FLAG_WRITE) && !(vma->vm_flags & VM_SHARED)) {
5665         /* Optimization, do the COW without a second fault */
5666         ret = hugetlb_wp(mm, vma, address, ptep, flags, page, ptl);
5667     }
5668
5669     spin_unlock(ptl);
5670
5671     /*
5672      * Only set HPageMigratable in newly allocated pages.  Existing pages
5673      * found in the pagecache may not have HPageMigratableset if they have
5674      * been isolated for migration.
5675      */
5676     if (new_page)
5677         SetHPageMigratable(page);
5678
5679     unlock_page(page);
5680 out:
5681     return ret;
5682
5683 backout:
5684     spin_unlock(ptl);
5685 backout_unlocked:
5686     unlock_page(page);
5687     /* restore reserve for newly allocated pages not in page cache */
5688     if (new_page && !new_pagecache_page)
5689         restore_reserve_on_error(h, vma, haddr, page);
5690     put_page(page);
5691     goto out;
5692 }
5693
5694 #ifdef CONFIG_SMP
5695 u32 hugetlb_fault_mutex_hash(struct address_space *mapping, pgoff_t idx)
5696 {
5697     unsigned long key[2];
5698     u32 hash;
5699
5700     key[0] = (unsigned long) mapping;
5701     key[1] = idx;
5702
5703     hash = jhash2((u32 *)&key, sizeof(key)/(sizeof(u32)), 0);
5704
5705     return hash & (num_fault_mutexes - 1);
5706 }
5707 #else
5708 /*
5709  * For uniprocessor systems we always use a single mutex, so just
5710  * return 0 and avoid the hashing overhead.
5711  */
5712 u32 hugetlb_fault_mutex_hash(struct address_space *mapping, pgoff_t idx)
5713 {
5714     return 0;
5715 }
5716 #endif
5717
5718 vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
5719             unsigned long address, unsigned int flags)
5720 {
5721     pte_t *ptep, entry;
5722     spinlock_t *ptl;
5723     vm_fault_t ret;
5724     u32 hash;
5725     pgoff_t idx;
5726     struct page *page = NULL;
5727     struct page *pagecache_page = NULL;
5728     struct hstate *h = hstate_vma(vma);
5729     struct address_space *mapping;
5730     int need_wait_lock = 0;
5731     unsigned long haddr = address & huge_page_mask(h);
5732
5733     ptep = huge_pte_offset(mm, haddr, huge_page_size(h));
5734     if (ptep) {
5735         /*
5736          * Since we hold no locks, ptep could be stale.  That is
5737          * OK as we are only making decisions based on content and
5738          * not actually modifying content here.
5739          */
5740         entry = huge_ptep_get(ptep);
5741         if (unlikely(is_hugetlb_entry_migration(entry))) {
5742             migration_entry_wait_huge(vma, ptep);
5743             return 0;
5744         } else if (unlikely(is_hugetlb_entry_hwpoisoned(entry)))
5745             return VM_FAULT_HWPOISON_LARGE |
5746                 VM_FAULT_SET_HINDEX(hstate_index(h));
5747     }
5748
5749     /*
5750      * Acquire i_mmap_rwsem before calling huge_pte_alloc and hold
5751      * until finished with ptep.  This serves two purposes:
5752      * 1) It prevents huge_pmd_unshare from being called elsewhere
5753      *    and making the ptep no longer valid.
5754      * 2) It synchronizes us with i_size modifications during truncation.
5755      *
5756      * ptep could have already be assigned via huge_pte_offset.  That
5757      * is OK, as huge_pte_alloc will return the same value unless
5758      * something has changed.
5759      */
5760     mapping = vma->vm_file->f_mapping;
5761     i_mmap_lock_read(mapping);
5762     ptep = huge_pte_alloc(mm, vma, haddr, huge_page_size(h));
5763     if (!ptep) {
5764         i_mmap_unlock_read(mapping);
5765         return VM_FAULT_OOM;
5766     }
5767
5768     /*
5769      * Serialize hugepage allocation and instantiation, so that we don't
5770      * get spurious allocation failures if two CPUs race to instantiate
5771      * the same page in the page cache.
5772      */
5773     idx = vma_hugecache_offset(h, vma, haddr);
5774     hash = hugetlb_fault_mutex_hash(mapping, idx);
5775     mutex_lock(&hugetlb_fault_mutex_table[hash]);
5776
5777     entry = huge_ptep_get(ptep);
5778     /* PTE markers should be handled the same way as none pte */
5779     if (huge_pte_none_mostly(entry)) {
5780         ret = hugetlb_no_page(mm, vma, mapping, idx, address, ptep,
5781                       entry, flags);
5782         goto out_mutex;
5783     }
5784
5785     ret = 0;
5786
5787     /*
5788      * entry could be a migration/hwpoison entry at this point, so this
5789      * check prevents the kernel from going below assuming that we have
5790      * an active hugepage in pagecache. This goto expects the 2nd page
5791      * fault, and is_hugetlb_entry_(migration|hwpoisoned) check will
5792      * properly handle it.
5793      */
5794     if (!pte_present(entry))
5795         goto out_mutex;
5796
5797     /*
5798      * If we are going to COW/unshare the mapping later, we examine the
5799      * pending reservations for this page now. This will ensure that any
5800      * allocations necessary to record that reservation occur outside the
5801      * spinlock. Also lookup the pagecache page now as it is used to
5802      * determine if a reservation has been consumed.
5803      */
5804     if ((flags & (FAULT_FLAG_WRITE|FAULT_FLAG_UNSHARE)) &&
5805         !(vma->vm_flags & VM_MAYSHARE) && !huge_pte_write(entry)) {
5806         if (vma_needs_reservation(h, vma, haddr) < 0) {
5807             ret = VM_FAULT_OOM;
5808             goto out_mutex;
5809         }
5810         /* Just decrements count, does not deallocate */
5811         vma_end_reservation(h, vma, haddr);
5812
5813         pagecache_page = hugetlbfs_pagecache_page(h, vma, haddr);
5814     }
5815
5816     ptl = huge_pte_lock(h, mm, ptep);
5817
5818     /* Check for a racing update before calling hugetlb_wp() */
5819     if (unlikely(!pte_same(entry, huge_ptep_get(ptep))))
5820         goto out_ptl;
5821
5822     /* Handle userfault-wp first, before trying to lock more pages */
5823     if (userfaultfd_wp(vma) && huge_pte_uffd_wp(huge_ptep_get(ptep)) &&
5824         (flags & FAULT_FLAG_WRITE) && !huge_pte_write(entry)) {
5825         struct vm_fault vmf = {
5826             .vma = vma,
5827             .address = haddr,
5828             .real_address = address,
5829             .flags = flags,
5830         };
5831
5832         spin_unlock(ptl);
5833         if (pagecache_page) {
5834             unlock_page(pagecache_page);
5835             put_page(pagecache_page);
5836         }
5837         mutex_unlock(&hugetlb_fault_mutex_table[hash]);
5838         i_mmap_unlock_read(mapping);
5839         return handle_userfault(&vmf, VM_UFFD_WP);
5840     }
5841
5842     /*
5843      * hugetlb_wp() requires page locks of pte_page(entry) and
5844      * pagecache_page, so here we need take the former one
5845      * when page != pagecache_page or !pagecache_page.
5846      */
5847     page = pte_page(entry);
5848     if (page != pagecache_page)
5849         if (!trylock_page(page)) {
5850             need_wait_lock = 1;
5851             goto out_ptl;
5852         }
5853
5854     get_page(page);
5855
5856     if (flags & (FAULT_FLAG_WRITE|FAULT_FLAG_UNSHARE)) {
5857         if (!huge_pte_write(entry)) {
5858             ret = hugetlb_wp(mm, vma, address, ptep, flags,
5859                      pagecache_page, ptl);
5860             goto out_put_page;
5861         } else if (likely(flags & FAULT_FLAG_WRITE)) {
5862             entry = huge_pte_mkdirty(entry);
5863         }
5864     }
5865     entry = pte_mkyoung(entry);
5866     if (huge_ptep_set_access_flags(vma, haddr, ptep, entry,
5867                         flags & FAULT_FLAG_WRITE))
5868         update_mmu_cache(vma, haddr, ptep);
5869 out_put_page:
5870     if (page != pagecache_page)
5871         unlock_page(page);
5872     put_page(page);
5873 out_ptl:
5874     spin_unlock(ptl);
5875
5876     if (pagecache_page) {
5877         unlock_page(pagecache_page);
5878         put_page(pagecache_page);
5879     }
5880 out_mutex:
5881     mutex_unlock(&hugetlb_fault_mutex_table[hash]);
5882     i_mmap_unlock_read(mapping);
5883     /*
5884      * Generally it's safe to hold refcount during waiting page lock. But
5885      * here we just wait to defer the next page fault to avoid busy loop and
5886      * the page is not used after unlocked before returning from the current
5887      * page fault. So we are safe from accessing freed page, even if we wait
5888      * here without taking refcount.
5889      */
5890     if (need_wait_lock)
5891         wait_on_page_locked(page);
5892     return ret;
5893 }
5894
5895 #ifdef CONFIG_USERFAULTFD
5896 /*
5897  * Used by userfaultfd UFFDIO_COPY.  Based on mcopy_atomic_pte with
5898  * modifications for huge pages.
5899  */
5900 int hugetlb_mcopy_atomic_pte(struct mm_struct *dst_mm,
5901                 pte_t *dst_pte,
5902                 struct vm_area_struct *dst_vma,
5903                 unsigned long dst_addr,
5904                 unsigned long src_addr,
5905                 enum mcopy_atomic_mode mode,
5906                 struct page **pagep,
5907                 bool wp_copy)
5908 {
5909     bool is_continue = (mode == MCOPY_ATOMIC_CONTINUE);
5910     struct hstate *h = hstate_vma(dst_vma);
5911     struct address_space *mapping = dst_vma->vm_file->f_mapping;
5912     pgoff_t idx = vma_hugecache_offset(h, dst_vma, dst_addr);
5913     unsigned long size;
5914     int vm_shared = dst_vma->vm_flags & VM_SHARED;
5915     pte_t _dst_pte;
5916     spinlock_t *ptl;
5917     int ret = -ENOMEM;
5918     struct page *page;
5919     int writable;
5920     bool page_in_pagecache = false;
5921
5922     if (is_continue) {
5923         ret = -EFAULT;
5924         page = find_lock_page(mapping, idx);
5925         if (!page)
5926             goto out;
5927         page_in_pagecache = true;
5928     } else if (!*pagep) {
5929         /* If a page already exists, then it's UFFDIO_COPY for
5930          * a non-missing case. Return -EEXIST.
5931          */
5932         if (vm_shared &&
5933             hugetlbfs_pagecache_present(h, dst_vma, dst_addr)) {
5934             ret = -EEXIST;
5935             goto out;
5936         }
5937
5938         page = alloc_huge_page(dst_vma, dst_addr, 0);
5939         if (IS_ERR(page)) {
5940             ret = -ENOMEM;
5941             goto out;
5942         }
5943
5944         ret = copy_huge_page_from_user(page,
5945                         (const void __user *) src_addr,
5946                         pages_per_huge_page(h), false);
5947
5948         /* fallback to copy_from_user outside mmap_lock */
5949         if (unlikely(ret)) {
5950             ret = -ENOENT;
5951             /* Free the allocated page which may have
5952              * consumed a reservation.
5953              */
5954             restore_reserve_on_error(h, dst_vma, dst_addr, page);
5955             put_page(page);
5956
5957             /* Allocate a temporary page to hold the copied
5958              * contents.
5959              */
5960             page = alloc_huge_page_vma(h, dst_vma, dst_addr);
5961             if (!page) {
5962                 ret = -ENOMEM;
5963                 goto out;
5964             }
5965             *pagep = page;
5966             /* Set the outparam pagep and return to the caller to
5967              * copy the contents outside the lock. Don't free the
5968              * page.
5969              */
5970             goto out;
5971         }
5972     } else {
5973         if (vm_shared &&
5974             hugetlbfs_pagecache_present(h, dst_vma, dst_addr)) {
5975             put_page(*pagep);
5976             ret = -EEXIST;
5977             *pagep = NULL;
5978             goto out;
5979         }
5980
5981         page = alloc_huge_page(dst_vma, dst_addr, 0);
5982         if (IS_ERR(page)) {
5983             put_page(*pagep);
5984             ret = -ENOMEM;
5985             *pagep = NULL;
5986             goto out;
5987         }
5988         copy_user_huge_page(page, *pagep, dst_addr, dst_vma,
5989                     pages_per_huge_page(h));
5990         put_page(*pagep);
5991         *pagep = NULL;
5992     }
5993
5994     /*
5995      * The memory barrier inside __SetPageUptodate makes sure that
5996      * preceding stores to the page contents become visible before
5997      * the set_pte_at() write.
5998      */
5999     __SetPageUptodate(page);
6000
6001     /* Add shared, newly allocated pages to the page cache. */
6002     if (vm_shared && !is_continue) {
6003         size = i_size_read(mapping->host) >> huge_page_shift(h);
6004         ret = -EFAULT;
6005         if (idx >= size)
6006             goto out_release_nounlock;
6007
6008         /*
6009          * Serialization between remove_inode_hugepages() and
6010          * huge_add_to_page_cache() below happens through the
6011          * hugetlb_fault_mutex_table that here must be hold by
6012          * the caller.
6013          */
6014         ret = huge_add_to_page_cache(page, mapping, idx);
6015         if (ret)
6016             goto out_release_nounlock;
6017         page_in_pagecache = true;
6018     }
6019
6020     ptl = huge_pte_lockptr(h, dst_mm, dst_pte);
6021     spin_lock(ptl);
6022
6023     /*
6024      * Recheck the i_size after holding PT lock to make sure not
6025      * to leave any page mapped (as page_mapped()) beyond the end
6026      * of the i_size (remove_inode_hugepages() is strict about
6027      * enforcing that). If we bail out here, we'll also leave a
6028      * page in the radix tree in the vm_shared case beyond the end
6029      * of the i_size, but remove_inode_hugepages() will take care
6030      * of it as soon as we drop the hugetlb_fault_mutex_table.
6031      */
6032     size = i_size_read(mapping->host) >> huge_page_shift(h);
6033     ret = -EFAULT;
6034     if (idx >= size)
6035         goto out_release_unlock;
6036
6037     ret = -EEXIST;
6038     /*
6039      * We allow to overwrite a pte marker: consider when both MISSING|WP
6040      * registered, we firstly wr-protect a none pte which has no page cache
6041      * page backing it, then access the page.
6042      */
6043     if (!huge_pte_none_mostly(huge_ptep_get(dst_pte)))
6044         goto out_release_unlock;
6045
6046     if (page_in_pagecache) {
6047         page_dup_file_rmap(page, true);
6048     } else {
6049         ClearHPageRestoreReserve(page);
6050         hugepage_add_new_anon_rmap(page, dst_vma, dst_addr);
6051     }
6052
6053     /*
6054      * For either: (1) CONTINUE on a non-shared VMA, or (2) UFFDIO_COPY
6055      * with wp flag set, don't set pte write bit.
6056      */
6057     if (wp_copy || (is_continue && !vm_shared))
6058         writable = 0;
6059     else
6060         writable = dst_vma->vm_flags & VM_WRITE;
6061
6062     _dst_pte = make_huge_pte(dst_vma, page, writable);
6063     /*
6064      * Always mark UFFDIO_COPY page dirty; note that this may not be
6065      * extremely important for hugetlbfs for now since swapping is not
6066      * supported, but we should still be clear in that this page cannot be
6067      * thrown away at will, even if write bit not set.
6068      */
6069     _dst_pte = huge_pte_mkdirty(_dst_pte);
6070     _dst_pte = pte_mkyoung(_dst_pte);
6071
6072     if (wp_copy)
6073         _dst_pte = huge_pte_mkuffd_wp(_dst_pte);
6074
6075     set_huge_pte_at(dst_mm, dst_addr, dst_pte, _dst_pte);
6076
6077     hugetlb_count_add(pages_per_huge_page(h), dst_mm);
6078
6079     /* No need to invalidate - it was non-present before */
6080     update_mmu_cache(dst_vma, dst_addr, dst_pte);
6081
6082     spin_unlock(ptl);
6083     if (!is_continue)
6084         SetHPageMigratable(page);
6085     if (vm_shared || is_continue)
6086         unlock_page(page);
6087     ret = 0;
6088 out:
6089     return ret;
6090 out_release_unlock:
6091     spin_unlock(ptl);
6092     if (vm_shared || is_continue)
6093         unlock_page(page);
6094 out_release_nounlock:
6095     if (!page_in_pagecache)
6096         restore_reserve_on_error(h, dst_vma, dst_addr, page);
6097     put_page(page);
6098     goto out;
6099 }
6100 #endif /* CONFIG_USERFAULTFD */
6101
6102 static void record_subpages_vmas(struct page *page, struct vm_area_struct *vma,
6103                  int refs, struct page **pages,
6104                  struct vm_area_struct **vmas)
6105 {
6106     int nr;
6107
6108     for (nr = 0; nr < refs; nr++) {
6109         if (likely(pages))
6110             pages[nr] = mem_map_offset(page, nr);
6111         if (vmas)
6112             vmas[nr] = vma;
6113     }
6114 }
6115
6116 static inline bool __follow_hugetlb_must_fault(unsigned int flags, pte_t *pte,
6117                            bool *unshare)
6118 {
6119     pte_t pteval = huge_ptep_get(pte);
6120
6121     *unshare = false;
6122     if (is_swap_pte(pteval))
6123         return true;
6124     if (huge_pte_write(pteval))
6125         return false;
6126     if (flags & FOLL_WRITE)
6127         return true;
6128     if (gup_must_unshare(flags, pte_page(pteval))) {
6129         *unshare = true;
6130         return true;
6131     }
6132     return false;
6133 }
6134
6135 long follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma,
6136              struct page **pages, struct vm_area_struct **vmas,
6137              unsigned long *position, unsigned long *nr_pages,
6138              long i, unsigned int flags, int *locked)
6139 {
6140     unsigned long pfn_offset;
6141     unsigned long vaddr = *position;
6142     unsigned long remainder = *nr_pages;
6143     struct hstate *h = hstate_vma(vma);
6144     int err = -EFAULT, refs;
6145
6146     while (vaddr < vma->vm_end && remainder) {
6147         pte_t *pte;
6148         spinlock_t *ptl = NULL;
6149         bool unshare = false;
6150         int absent;
6151         struct page *page;
6152
6153         /*
6154          * If we have a pending SIGKILL, don't keep faulting pages and
6155          * potentially allocating memory.
6156          */
6157         if (fatal_signal_pending(current)) {
6158             remainder = 0;
6159             break;
6160         }
6161
6162         /*
6163          * Some archs (sparc64, sh*) have multiple pte_ts to
6164          * each hugepage.  We have to make sure we get the
6165          * first, for the page indexing below to work.
6166          *
6167          * Note that page table lock is not held when pte is null.
6168          */
6169         pte = huge_pte_offset(mm, vaddr & huge_page_mask(h),
6170                       huge_page_size(h));
6171         if (pte)
6172             ptl = huge_pte_lock(h, mm, pte);
6173         absent = !pte || huge_pte_none(huge_ptep_get(pte));
6174
6175         /*
6176          * When coredumping, it suits get_dump_page if we just return
6177          * an error where there's an empty slot with no huge pagecache
6178          * to back it.  This way, we avoid allocating a hugepage, and
6179          * the sparse dumpfile avoids allocating disk blocks, but its
6180          * huge holes still show up with zeroes where they need to be.
6181          */
6182         if (absent && (flags & FOLL_DUMP) &&
6183             !hugetlbfs_pagecache_present(h, vma, vaddr)) {
6184             if (pte)
6185                 spin_unlock(ptl);
6186             remainder = 0;
6187             break;
6188         }
6189
6190         /*
6191          * We need call hugetlb_fault for both hugepages under migration
6192          * (in which case hugetlb_fault waits for the migration,) and
6193          * hwpoisoned hugepages (in which case we need to prevent the
6194          * caller from accessing to them.) In order to do this, we use
6195          * here is_swap_pte instead of is_hugetlb_entry_migration and
6196          * is_hugetlb_entry_hwpoisoned. This is because it simply covers
6197          * both cases, and because we can't follow correct pages
6198          * directly from any kind of swap entries.
6199          */
6200         if (absent ||
6201             __follow_hugetlb_must_fault(flags, pte, &unshare)) {
6202             vm_fault_t ret;
6203             unsigned int fault_flags = 0;
6204
6205             if (pte)
6206                 spin_unlock(ptl);
6207             if (flags & FOLL_WRITE)
6208                 fault_flags |= FAULT_FLAG_WRITE;
6209             else if (unshare)
6210                 fault_flags |= FAULT_FLAG_UNSHARE;
6211             if (locked)
6212                 fault_flags |= FAULT_FLAG_ALLOW_RETRY |
6213                     FAULT_FLAG_KILLABLE;
6214             if (flags & FOLL_NOWAIT)
6215                 fault_flags |= FAULT_FLAG_ALLOW_RETRY |
6216                     FAULT_FLAG_RETRY_NOWAIT;
6217             if (flags & FOLL_TRIED) {
6218                 /*
6219                  * Note: FAULT_FLAG_ALLOW_RETRY and
6220                  * FAULT_FLAG_TRIED can co-exist
6221                  */
6222                 fault_flags |= FAULT_FLAG_TRIED;
6223             }
6224             ret = hugetlb_fault(mm, vma, vaddr, fault_flags);
6225             if (ret & VM_FAULT_ERROR) {
6226                 err = vm_fault_to_errno(ret, flags);
6227                 remainder = 0;
6228                 break;
6229             }
6230             if (ret & VM_FAULT_RETRY) {
6231                 if (locked &&
6232                     !(fault_flags & FAULT_FLAG_RETRY_NOWAIT))
6233                     *locked = 0;
6234                 *nr_pages = 0;
6235                 /*
6236                  * VM_FAULT_RETRY must not return an
6237                  * error, it will return zero
6238                  * instead.
6239                  *
6240                  * No need to update "position" as the
6241                  * caller will not check it after
6242                  * *nr_pages is set to 0.
6243                  */
6244                 return i;
6245             }
6246             continue;
6247         }
6248
6249         pfn_offset = (vaddr & ~huge_page_mask(h)) >> PAGE_SHIFT;
6250         page = pte_page(huge_ptep_get(pte));
6251
6252         VM_BUG_ON_PAGE((flags & FOLL_PIN) && PageAnon(page) &&
6253                    !PageAnonExclusive(page), page);
6254
6255         /*
6256          * If subpage information not requested, update counters
6257          * and skip the same_page loop below.
6258          */
6259         if (!pages && !vmas && !pfn_offset &&
6260             (vaddr + huge_page_size(h) < vma->vm_end) &&
6261             (remainder >= pages_per_huge_page(h))) {
6262             vaddr += huge_page_size(h);
6263             remainder -= pages_per_huge_page(h);
6264             i += pages_per_huge_page(h);
6265             spin_unlock(ptl);
6266             continue;
6267         }
6268
6269         /* vaddr may not be aligned to PAGE_SIZE */
6270         refs = min3(pages_per_huge_page(h) - pfn_offset, remainder,
6271             (vma->vm_end - ALIGN_DOWN(vaddr, PAGE_SIZE)) >> PAGE_SHIFT);
6272
6273         if (pages || vmas)
6274             record_subpages_vmas(mem_map_offset(page, pfn_offset),
6275                          vma, refs,
6276                          likely(pages) ? pages + i : NULL,
6277                          vmas ? vmas + i : NULL);
6278
6279         if (pages) {
6280             /*
6281              * try_grab_folio() should always succeed here,
6282              * because: a) we hold the ptl lock, and b) we've just
6283              * checked that the huge page is present in the page
6284              * tables. If the huge page is present, then the tail
6285              * pages must also be present. The ptl prevents the
6286              * head page and tail pages from being rearranged in
6287              * any way. So this page must be available at this
6288              * point, unless the page refcount overflowed:
6289              */
6290             if (WARN_ON_ONCE(!try_grab_folio(pages[i], refs,
6291                              flags))) {
6292                 spin_unlock(ptl);
6293                 remainder = 0;
6294                 err = -ENOMEM;
6295                 break;
6296             }
6297         }
6298
6299         vaddr += (refs << PAGE_SHIFT);
6300         remainder -= refs;
6301         i += refs;
6302
6303         spin_unlock(ptl);
6304     }
6305     *nr_pages = remainder;
6306     /*
6307      * setting position is actually required only if remainder is
6308      * not zero but it's faster not to add a "if (remainder)"
6309      * branch.
6310      */
6311     *position = vaddr;
6312
6313     return i ? i : err;
6314 }
6315
6316 unsigned long hugetlb_change_protection(struct vm_area_struct *vma,
6317         unsigned long address, unsigned long end,
6318         pgprot_t newprot, unsigned long cp_flags)
6319 {
6320     struct mm_struct *mm = vma->vm_mm;
6321     unsigned long start = address;
6322     pte_t *ptep;
6323     pte_t pte;
6324     struct hstate *h = hstate_vma(vma);
6325     unsigned long pages = 0, psize = huge_page_size(h);
6326     bool shared_pmd = false;
6327     struct mmu_notifier_range range;
6328     unsigned long last_addr_mask;
6329     bool uffd_wp = cp_flags & MM_CP_UFFD_WP;
6330     bool uffd_wp_resolve = cp_flags & MM_CP_UFFD_WP_RESOLVE;
6331
6332     /*
6333      * In the case of shared PMDs, the area to flush could be beyond
6334      * start/end.  Set range.start/range.end to cover the maximum possible
6335      * range if PMD sharing is possible.
6336      */
6337     mmu_notifier_range_init(&range, MMU_NOTIFY_PROTECTION_VMA,
6338                 0, vma, mm, start, end);
6339     adjust_range_if_pmd_sharing_possible(vma, &range.start, &range.end);
6340
6341     BUG_ON(address >= end);
6342     flush_cache_range(vma, range.start, range.end);
6343
6344     mmu_notifier_invalidate_range_start(&range);
6345     last_addr_mask = hugetlb_mask_last_page(h);
6346     i_mmap_lock_write(vma->vm_file->f_mapping);
6347     for (; address < end; address += psize) {
6348         spinlock_t *ptl;
6349         ptep = huge_pte_offset(mm, address, psize);
6350         if (!ptep) {
6351             address |= last_addr_mask;
6352             continue;
6353         }
6354         ptl = huge_pte_lock(h, mm, ptep);
6355         if (huge_pmd_unshare(mm, vma, address, ptep)) {
6356             /*
6357              * When uffd-wp is enabled on the vma, unshare
6358              * shouldn't happen at all.  Warn about it if it
6359              * happened due to some reason.
6360              */
6361             WARN_ON_ONCE(uffd_wp || uffd_wp_resolve);
6362             pages++;
6363             spin_unlock(ptl);
6364             shared_pmd = true;
6365             address |= last_addr_mask;
6366             continue;
6367         }
6368         pte = huge_ptep_get(ptep);
6369         if (unlikely(is_hugetlb_entry_hwpoisoned(pte))) {
6370             spin_unlock(ptl);
6371             continue;
6372         }
6373         if (unlikely(is_hugetlb_entry_migration(pte))) {
6374             swp_entry_t entry = pte_to_swp_entry(pte);
6375             struct page *page = pfn_swap_entry_to_page(entry);
6376
6377             if (!is_readable_migration_entry(entry)) {
6378                 pte_t newpte;
6379
6380                 if (PageAnon(page))
6381                     entry = make_readable_exclusive_migration_entry(
6382                                 swp_offset(entry));
6383                 else
6384                     entry = make_readable_migration_entry(
6385                                 swp_offset(entry));
6386                 newpte = swp_entry_to_pte(entry);
6387                 if (uffd_wp)
6388                     newpte = pte_swp_mkuffd_wp(newpte);
6389                 else if (uffd_wp_resolve)
6390                     newpte = pte_swp_clear_uffd_wp(newpte);
6391                 set_huge_pte_at(mm, address, ptep, newpte);
6392                 pages++;
6393             }
6394             spin_unlock(ptl);
6395             continue;
6396         }
6397         if (unlikely(pte_marker_uffd_wp(pte))) {
6398             /*
6399              * This is changing a non-present pte into a none pte,
6400              * no need for huge_ptep_modify_prot_start/commit().
6401              */
6402             if (uffd_wp_resolve)
6403                 huge_pte_clear(mm, address, ptep, psize);
6404         }
6405         if (!huge_pte_none(pte)) {
6406             pte_t old_pte;
6407             unsigned int shift = huge_page_shift(hstate_vma(vma));
6408
6409             old_pte = huge_ptep_modify_prot_start(vma, address, ptep);
6410             pte = huge_pte_modify(old_pte, newprot);
6411             pte = arch_make_huge_pte(pte, shift, vma->vm_flags);
6412             if (uffd_wp)
6413                 pte = huge_pte_mkuffd_wp(huge_pte_wrprotect(pte));
6414             else if (uffd_wp_resolve)
6415                 pte = huge_pte_clear_uffd_wp(pte);
6416             huge_ptep_modify_prot_commit(vma, address, ptep, old_pte, pte);
6417             pages++;
6418         } else {
6419             /* None pte */
6420             if (unlikely(uffd_wp))
6421                 /* Safe to modify directly (none->non-present). */
6422                 set_huge_pte_at(mm, address, ptep,
6423                         make_pte_marker(PTE_MARKER_UFFD_WP));
6424         }
6425         spin_unlock(ptl);
6426     }
6427     /*
6428      * Must flush TLB before releasing i_mmap_rwsem: x86's huge_pmd_unshare
6429      * may have cleared our pud entry and done put_page on the page table:
6430      * once we release i_mmap_rwsem, another task can do the final put_page
6431      * and that page table be reused and filled with junk.  If we actually
6432      * did unshare a page of pmds, flush the range corresponding to the pud.
6433      */
6434     if (shared_pmd)
6435         flush_hugetlb_tlb_range(vma, range.start, range.end);
6436     else
6437         flush_hugetlb_tlb_range(vma, start, end);
6438     /*
6439      * No need to call mmu_notifier_invalidate_range() we are downgrading
6440      * page table protection not changing it to point to a new page.
6441      *
6442      * See Documentation/mm/mmu_notifier.rst
6443      */
6444     i_mmap_unlock_write(vma->vm_file->f_mapping);
6445     mmu_notifier_invalidate_range_end(&range);
6446
6447     return pages << h->order;
6448 }
6449
6450 /* Return true if reservation was successful, false otherwise.  */
6451 bool hugetlb_reserve_pages(struct inode *inode,
6452                     long from, long to,
6453                     struct vm_area_struct *vma,
6454                     vm_flags_t vm_flags)
6455 {
6456     long chg, add = -1;
6457     struct hstate *h = hstate_inode(inode);
6458     struct hugepage_subpool *spool = subpool_inode(inode);
6459     struct resv_map *resv_map;
6460     struct hugetlb_cgroup *h_cg = NULL;
6461     long gbl_reserve, regions_needed = 0;
6462
6463     /* This should never happen */
6464     if (from > to) {
6465         VM_WARN(1, "%s called with a negative range\n", __func__);
6466         return false;
6467     }
6468
6469     /*
6470      * Only apply hugepage reservation if asked. At fault time, an
6471      * attempt will be made for VM_NORESERVE to allocate a page
6472      * without using reserves
6473      */
6474     if (vm_flags & VM_NORESERVE)
6475         return true;
6476
6477     /*
6478      * Shared mappings base their reservation on the number of pages that
6479      * are already allocated on behalf of the file. Private mappings need
6480      * to reserve the full area even if read-only as mprotect() may be
6481      * called to make the mapping read-write. Assume !vma is a shm mapping
6482      */
6483     if (!vma || vma->vm_flags & VM_MAYSHARE) {
6484         /*
6485          * resv_map can not be NULL as hugetlb_reserve_pages is only
6486          * called for inodes for which resv_maps were created (see
6487          * hugetlbfs_get_inode).
6488          */
6489         resv_map = inode_resv_map(inode);
6490
6491         chg = region_chg(resv_map, from, to, &regions_needed);
6492
6493     } else {
6494         /* Private mapping. */
6495         resv_map = resv_map_alloc();
6496         if (!resv_map)
6497             return false;
6498
6499         chg = to - from;
6500
6501         set_vma_resv_map(vma, resv_map);
6502         set_vma_resv_flags(vma, HPAGE_RESV_OWNER);
6503     }
6504
6505     if (chg < 0)
6506         goto out_err;
6507
6508     if (hugetlb_cgroup_charge_cgroup_rsvd(hstate_index(h),
6509                 chg * pages_per_huge_page(h), &h_cg) < 0)
6510         goto out_err;
6511
6512     if (vma && !(vma->vm_flags & VM_MAYSHARE) && h_cg) {
6513         /* For private mappings, the hugetlb_cgroup uncharge info hangs
6514          * of the resv_map.
6515          */
6516         resv_map_set_hugetlb_cgroup_uncharge_info(resv_map, h_cg, h);
6517     }
6518
6519     /*
6520      * There must be enough pages in the subpool for the mapping. If
6521      * the subpool has a minimum size, there may be some global
6522      * reservations already in place (gbl_reserve).
6523      */
6524     gbl_reserve = hugepage_subpool_get_pages(spool, chg);
6525     if (gbl_reserve < 0)
6526         goto out_uncharge_cgroup;
6527
6528     /*
6529      * Check enough hugepages are available for the reservation.
6530      * Hand the pages back to the subpool if there are not
6531      */
6532     if (hugetlb_acct_memory(h, gbl_reserve) < 0)
6533         goto out_put_pages;
6534
6535     /*
6536      * Account for the reservations made. Shared mappings record regions
6537      * that have reservations as they are shared by multiple VMAs.
6538      * When the last VMA disappears, the region map says how much
6539      * the reservation was and the page cache tells how much of
6540      * the reservation was consumed. Private mappings are per-VMA and
6541      * only the consumed reservations are tracked. When the VMA
6542      * disappears, the original reservation is the VMA size and the
6543      * consumed reservations are stored in the map. Hence, nothing
6544      * else has to be done for private mappings here
6545      */
6546     if (!vma || vma->vm_flags & VM_MAYSHARE) {
6547         add = region_add(resv_map, from, to, regions_needed, h, h_cg);
6548
6549         if (unlikely(add < 0)) {
6550             hugetlb_acct_memory(h, -gbl_reserve);
6551             goto out_put_pages;
6552         } else if (unlikely(chg > add)) {
6553             /*
6554              * pages in this range were added to the reserve
6555              * map between region_chg and region_add.  This
6556              * indicates a race with alloc_huge_page.  Adjust
6557              * the subpool and reserve counts modified above
6558              * based on the difference.
6559              */
6560             long rsv_adjust;
6561
6562             /*
6563              * hugetlb_cgroup_uncharge_cgroup_rsvd() will put the
6564              * reference to h_cg->css. See comment below for detail.
6565              */
6566             hugetlb_cgroup_uncharge_cgroup_rsvd(
6567                 hstate_index(h),
6568                 (chg - add) * pages_per_huge_page(h), h_cg);
6569
6570             rsv_adjust = hugepage_subpool_put_pages(spool,
6571                                 chg - add);
6572             hugetlb_acct_memory(h, -rsv_adjust);
6573         } else if (h_cg) {
6574             /*
6575              * The file_regions will hold their own reference to
6576              * h_cg->css. So we should release the reference held
6577              * via hugetlb_cgroup_charge_cgroup_rsvd() when we are
6578              * done.
6579              */
6580             hugetlb_cgroup_put_rsvd_cgroup(h_cg);
6581         }
6582     }
6583     return true;
6584
6585 out_put_pages:
6586     /* put back original number of pages, chg */
6587     (void)hugepage_subpool_put_pages(spool, chg);
6588 out_uncharge_cgroup:
6589     hugetlb_cgroup_uncharge_cgroup_rsvd(hstate_index(h),
6590                         chg * pages_per_huge_page(h), h_cg);
6591 out_err:
6592     if (!vma || vma->vm_flags & VM_MAYSHARE)
6593         /* Only call region_abort if the region_chg succeeded but the
6594          * region_add failed or didn't run.
6595          */
6596         if (chg >= 0 && add < 0)
6597             region_abort(resv_map, from, to, regions_needed);
6598     if (vma && is_vma_resv_set(vma, HPAGE_RESV_OWNER))
6599         kref_put(&resv_map->refs, resv_map_release);
6600     return false;
6601 }
6602
6603 long hugetlb_unreserve_pages(struct inode *inode, long start, long end,
6604                                 long freed)
6605 {
6606     struct hstate *h = hstate_inode(inode);
6607     struct resv_map *resv_map = inode_resv_map(inode);
6608     long chg = 0;
6609     struct hugepage_subpool *spool = subpool_inode(inode);
6610     long gbl_reserve;
6611
6612     /*
6613      * Since this routine can be called in the evict inode path for all
6614      * hugetlbfs inodes, resv_map could be NULL.
6615      */
6616     if (resv_map) {
6617         chg = region_del(resv_map, start, end);
6618         /*
6619          * region_del() can fail in the rare case where a region
6620          * must be split and another region descriptor can not be
6621          * allocated.  If end == LONG_MAX, it will not fail.
6622          */
6623         if (chg < 0)
6624             return chg;
6625     }
6626
6627     spin_lock(&inode->i_lock);
6628     inode->i_blocks -= (blocks_per_huge_page(h) * freed);
6629     spin_unlock(&inode->i_lock);
6630
6631     /*
6632      * If the subpool has a minimum size, the number of global
6633      * reservations to be released may be adjusted.
6634      *
6635      * Note that !resv_map implies freed == 0. So (chg - freed)
6636      * won't go negative.
6637      */
6638     gbl_reserve = hugepage_subpool_put_pages(spool, (chg - freed));
6639     hugetlb_acct_memory(h, -gbl_reserve);
6640
6641     return 0;
6642 }
6643
6644 #ifdef CONFIG_ARCH_WANT_HUGE_PMD_SHARE
6645 static unsigned long page_table_shareable(struct vm_area_struct *svma,
6646                 struct vm_area_struct *vma,
6647                 unsigned long addr, pgoff_t idx)
6648 {
6649     unsigned long saddr = ((idx - svma->vm_pgoff) << PAGE_SHIFT) +
6650                 svma->vm_start;
6651     unsigned long sbase = saddr & PUD_MASK;
6652     unsigned long s_end = sbase + PUD_SIZE;
6653
6654     /* Allow segments to share if only one is marked locked */
6655     unsigned long vm_flags = vma->vm_flags & VM_LOCKED_CLEAR_MASK;
6656     unsigned long svm_flags = svma->vm_flags & VM_LOCKED_CLEAR_MASK;
6657
6658     /*
6659      * match the virtual addresses, permission and the alignment of the
6660      * page table page.
6661      */
6662     if (pmd_index(addr) != pmd_index(saddr) ||
6663         vm_flags != svm_flags ||
6664         !range_in_vma(svma, sbase, s_end))
6665         return 0;
6666
6667     return saddr;
6668 }
6669
6670 static bool vma_shareable(struct vm_area_struct *vma, unsigned long addr)
6671 {
6672     unsigned long base = addr & PUD_MASK;
6673     unsigned long end = base + PUD_SIZE;
6674
6675     /*
6676      * check on proper vm_flags and page table alignment
6677      */
6678     if (vma->vm_flags & VM_MAYSHARE && range_in_vma(vma, base, end))
6679         return true;
6680     return false;
6681 }
6682
6683 bool want_pmd_share(struct vm_area_struct *vma, unsigned long addr)
6684 {
6685 #ifdef CONFIG_USERFAULTFD
6686     if (uffd_disable_huge_pmd_share(vma))
6687         return false;
6688 #endif
6689     return vma_shareable(vma, addr);
6690 }
6691
6692 /*
6693  * Determine if start,end range within vma could be mapped by shared pmd.
6694  * If yes, adjust start and end to cover range associated with possible
6695  * shared pmd mappings.
6696  */
6697 void adjust_range_if_pmd_sharing_possible(struct vm_area_struct *vma,
6698                 unsigned long *start, unsigned long *end)
6699 {
6700     unsigned long v_start = ALIGN(vma->vm_start, PUD_SIZE),
6701         v_end = ALIGN_DOWN(vma->vm_end, PUD_SIZE);
6702
6703     /*
6704      * vma needs to span at least one aligned PUD size, and the range
6705      * must be at least partially within in.
6706      */
6707     if (!(vma->vm_flags & VM_MAYSHARE) || !(v_end > v_start) ||
6708         (*end <= v_start) || (*start >= v_end))
6709         return;
6710
6711     /* Extend the range to be PUD aligned for a worst case scenario */
6712     if (*start > v_start)
6713         *start = ALIGN_DOWN(*start, PUD_SIZE);
6714
6715     if (*end < v_end)
6716         *end = ALIGN(*end, PUD_SIZE);
6717 }
6718
6719 /*
6720  * Search for a shareable pmd page for hugetlb. In any case calls pmd_alloc()
6721  * and returns the corresponding pte. While this is not necessary for the
6722  * !shared pmd case because we can allocate the pmd later as well, it makes the
6723  * code much cleaner.
6724  *
6725  * This routine must be called with i_mmap_rwsem held in at least read mode if
6726  * sharing is possible.  For hugetlbfs, this prevents removal of any page
6727  * table entries associated with the address space.  This is important as we
6728  * are setting up sharing based on existing page table entries (mappings).
6729  */
6730 pte_t *huge_pmd_share(struct mm_struct *mm, struct vm_area_struct *vma,
6731               unsigned long addr, pud_t *pud)
6732 {
6733     struct address_space *mapping = vma->vm_file->f_mapping;
6734     pgoff_t idx = ((addr - vma->vm_start) >> PAGE_SHIFT) +
6735             vma->vm_pgoff;
6736     struct vm_area_struct *svma;
6737     unsigned long saddr;
6738     pte_t *spte = NULL;
6739     pte_t *pte;
6740     spinlock_t *ptl;
6741
6742     i_mmap_assert_locked(mapping);
6743     vma_interval_tree_foreach(svma, &mapping->i_mmap, idx, idx) {
6744         if (svma == vma)
6745             continue;
6746
6747         saddr = page_table_shareable(svma, vma, addr, idx);
6748         if (saddr) {
6749             spte = huge_pte_offset(svma->vm_mm, saddr,
6750                            vma_mmu_pagesize(svma));
6751             if (spte) {
6752                 get_page(virt_to_page(spte));
6753                 break;
6754             }
6755         }
6756     }
6757
6758     if (!spte)
6759         goto out;
6760
6761     ptl = huge_pte_lock(hstate_vma(vma), mm, spte);
6762     if (pud_none(*pud)) {
6763         pud_populate(mm, pud,
6764                 (pmd_t *)((unsigned long)spte & PAGE_MASK));
6765         mm_inc_nr_pmds(mm);
6766     } else {
6767         put_page(virt_to_page(spte));
6768     }
6769     spin_unlock(ptl);
6770 out:
6771     pte = (pte_t *)pmd_alloc(mm, pud, addr);
6772     return pte;
6773 }
6774
6775 /*
6776  * unmap huge page backed by shared pte.
6777  *
6778  * Hugetlb pte page is ref counted at the time of mapping.  If pte is shared
6779  * indicated by page_count > 1, unmap is achieved by clearing pud and
6780  * decrementing the ref count. If count == 1, the pte page is not shared.
6781  *
6782  * Called with page table lock held and i_mmap_rwsem held in write mode.
6783  *
6784  * returns: 1 successfully unmapped a shared pte page
6785  *      0 the underlying pte page is not shared, or it is the last user
6786  */
6787 int huge_pmd_unshare(struct mm_struct *mm, struct vm_area_struct *vma,
6788                     unsigned long addr, pte_t *ptep)
6789 {
6790     pgd_t *pgd = pgd_offset(mm, addr);
6791     p4d_t *p4d = p4d_offset(pgd, addr);
6792     pud_t *pud = pud_offset(p4d, addr);
6793
6794     i_mmap_assert_write_locked(vma->vm_file->f_mapping);
6795     BUG_ON(page_count(virt_to_page(ptep)) == 0);
6796     if (page_count(virt_to_page(ptep)) == 1)
6797         return 0;
6798
6799     pud_clear(pud);
6800     put_page(virt_to_page(ptep));
6801     mm_dec_nr_pmds(mm);
6802     return 1;
6803 }
6804
6805 #else /* !CONFIG_ARCH_WANT_HUGE_PMD_SHARE */
6806 pte_t *huge_pmd_share(struct mm_struct *mm, struct vm_area_struct *vma,
6807               unsigned long addr, pud_t *pud)
6808 {
6809     return NULL;
6810 }
6811
6812 int huge_pmd_unshare(struct mm_struct *mm, struct vm_area_struct *vma,
6813                 unsigned long addr, pte_t *ptep)
6814 {
6815     return 0;
6816 }
6817
6818 void adjust_range_if_pmd_sharing_possible(struct vm_area_struct *vma,
6819                 unsigned long *start, unsigned long *end)
6820 {
6821 }
6822
6823 bool want_pmd_share(struct vm_area_struct *vma, unsigned long addr)
6824 {
6825     return false;
6826 }
6827 #endif /* CONFIG_ARCH_WANT_HUGE_PMD_SHARE */
6828
6829 #ifdef CONFIG_ARCH_WANT_GENERAL_HUGETLB
6830 pte_t *huge_pte_alloc(struct mm_struct *mm, struct vm_area_struct *vma,
6831             unsigned long addr, unsigned long sz)
6832 {
6833     pgd_t *pgd;
6834     p4d_t *p4d;
6835     pud_t *pud;
6836     pte_t *pte = NULL;
6837
6838     pgd = pgd_offset(mm, addr);
6839     p4d = p4d_alloc(mm, pgd, addr);
6840     if (!p4d)
6841         return NULL;
6842     pud = pud_alloc(mm, p4d, addr);
6843     if (pud) {
6844         if (sz == PUD_SIZE) {
6845             pte = (pte_t *)pud;
6846         } else {
6847             BUG_ON(sz != PMD_SIZE);
6848             if (want_pmd_share(vma, addr) && pud_none(*pud))
6849                 pte = huge_pmd_share(mm, vma, addr, pud);
6850             else
6851                 pte = (pte_t *)pmd_alloc(mm, pud, addr);
6852         }
6853     }
6854     BUG_ON(pte && pte_present(*pte) && !pte_huge(*pte));
6855
6856     return pte;
6857 }
6858
6859 /*
6860  * huge_pte_offset() - Walk the page table to resolve the hugepage
6861  * entry at address @addr
6862  *
6863  * Return: Pointer to page table entry (PUD or PMD) for
6864  * address @addr, or NULL if a !p*d_present() entry is encountered and the
6865  * size @sz doesn't match the hugepage size at this level of the page
6866  * table.
6867  */
6868 pte_t *huge_pte_offset(struct mm_struct *mm,
6869                unsigned long addr, unsigned long sz)
6870 {
6871     pgd_t *pgd;
6872     p4d_t *p4d;
6873     pud_t *pud;
6874     pmd_t *pmd;
6875
6876     pgd = pgd_offset(mm, addr);
6877     if (!pgd_present(*pgd))
6878         return NULL;
6879     p4d = p4d_offset(pgd, addr);
6880     if (!p4d_present(*p4d))
6881         return NULL;
6882
6883     pud = pud_offset(p4d, addr);
6884     if (sz == PUD_SIZE)
6885         /* must be pud huge, non-present or none */
6886         return (pte_t *)pud;
6887     if (!pud_present(*pud))
6888         return NULL;
6889     /* must have a valid entry and size to go further */
6890
6891     pmd = pmd_offset(pud, addr);
6892     /* must be pmd huge, non-present or none */
6893     return (pte_t *)pmd;
6894 }
6895
6896 /*
6897  * Return a mask that can be used to update an address to the last huge
6898  * page in a page table page mapping size.  Used to skip non-present
6899  * page table entries when linearly scanning address ranges.  Architectures
6900  * with unique huge page to page table relationships can define their own
6901  * version of this routine.
6902  */
6903 unsigned long hugetlb_mask_last_page(struct hstate *h)
6904 {
6905     unsigned long hp_size = huge_page_size(h);
6906
6907     if (hp_size == PUD_SIZE)
6908         return P4D_SIZE - PUD_SIZE;
6909     else if (hp_size == PMD_SIZE)
6910         return PUD_SIZE - PMD_SIZE;
6911     else
6912         return 0UL;
6913 }
6914
6915 #else
6916
6917 /* See description above.  Architectures can provide their own version. */
6918 __weak unsigned long hugetlb_mask_last_page(struct hstate *h)
6919 {
6920 #ifdef CONFIG_ARCH_WANT_HUGE_PMD_SHARE
6921     if (huge_page_size(h) == PMD_SIZE)
6922         return PUD_SIZE - PMD_SIZE;
6923 #endif
6924     return 0UL;
6925 }
6926
6927 #endif /* CONFIG_ARCH_WANT_GENERAL_HUGETLB */
6928
6929 /*
6930  * These functions are overwritable if your architecture needs its own
6931  * behavior.
6932  */
6933 struct page * __weak
6934 follow_huge_addr(struct mm_struct *mm, unsigned long address,
6935                   int write)
6936 {
6937     return ERR_PTR(-EINVAL);
6938 }
6939
6940 struct page * __weak
6941 follow_huge_pd(struct vm_area_struct *vma,
6942            unsigned long address, hugepd_t hpd, int flags, int pdshift)
6943 {
6944     WARN(1, "hugepd follow called with no support for hugepage directory format\n");
6945     return NULL;
6946 }
6947
6948 struct page * __weak
6949 follow_huge_pmd(struct mm_struct *mm, unsigned long address,
6950         pmd_t *pmd, int flags)
6951 {
6952     struct page *page = NULL;
6953     spinlock_t *ptl;
6954     pte_t pte;
6955
6956     /*
6957      * FOLL_PIN is not supported for follow_page(). Ordinary GUP goes via
6958      * follow_hugetlb_page().
6959      */
6960     if (WARN_ON_ONCE(flags & FOLL_PIN))
6961         return NULL;
6962
6963 retry:
6964     ptl = pmd_lockptr(mm, pmd);
6965     spin_lock(ptl);
6966     /*
6967      * make sure that the address range covered by this pmd is not
6968      * unmapped from other threads.
6969      */
6970     if (!pmd_huge(*pmd))
6971         goto out;
6972     pte = huge_ptep_get((pte_t *)pmd);
6973     if (pte_present(pte)) {
6974         page = pmd_page(*pmd) + ((address & ~PMD_MASK) >> PAGE_SHIFT);
6975         /*
6976          * try_grab_page() should always succeed here, because: a) we
6977          * hold the pmd (ptl) lock, and b) we've just checked that the
6978          * huge pmd (head) page is present in the page tables. The ptl
6979          * prevents the head page and tail pages from being rearranged
6980          * in any way. So this page must be available at this point,
6981          * unless the page refcount overflowed:
6982          */
6983         if (WARN_ON_ONCE(!try_grab_page(page, flags))) {
6984             page = NULL;
6985             goto out;
6986         }
6987     } else {
6988         if (is_hugetlb_entry_migration(pte)) {
6989             spin_unlock(ptl);
6990             __migration_entry_wait_huge((pte_t *)pmd, ptl);
6991             goto retry;
6992         }
6993         /*
6994          * hwpoisoned entry is treated as no_page_table in
6995          * follow_page_mask().
6996          */
6997     }
6998 out:
6999     spin_unlock(ptl);
7000     return page;
7001 }
7002
7003 struct page * __weak
7004 follow_huge_pud(struct mm_struct *mm, unsigned long address,
7005         pud_t *pud, int flags)
7006 {
7007     struct page *page = NULL;
7008     spinlock_t *ptl;
7009     pte_t pte;
7010
7011     if (WARN_ON_ONCE(flags & FOLL_PIN))
7012         return NULL;
7013
7014 retry:
7015     ptl = huge_pte_lock(hstate_sizelog(PUD_SHIFT), mm, (pte_t *)pud);
7016     if (!pud_huge(*pud))
7017         goto out;
7018     pte = huge_ptep_get((pte_t *)pud);
7019     if (pte_present(pte)) {
7020         page = pud_page(*pud) + ((address & ~PUD_MASK) >> PAGE_SHIFT);
7021         if (WARN_ON_ONCE(!try_grab_page(page, flags))) {
7022             page = NULL;
7023             goto out;
7024         }
7025     } else {
7026         if (is_hugetlb_entry_migration(pte)) {
7027             spin_unlock(ptl);
7028             __migration_entry_wait(mm, (pte_t *)pud, ptl);
7029             goto retry;
7030         }
7031         /*
7032          * hwpoisoned entry is treated as no_page_table in
7033          * follow_page_mask().
7034          */
7035     }
7036 out:
7037     spin_unlock(ptl);
7038     return page;
7039 }
7040
7041 struct page * __weak
7042 follow_huge_pgd(struct mm_struct *mm, unsigned long address, pgd_t *pgd, int flags)
7043 {
7044     if (flags & (FOLL_GET | FOLL_PIN))
7045         return NULL;
7046
7047     return pte_page(*(pte_t *)pgd) + ((address & ~PGDIR_MASK) >> PAGE_SHIFT);
7048 }
7049
7050 int isolate_hugetlb(struct page *page, struct list_head *list)
7051 {
7052     int ret = 0;
7053
7054     spin_lock_irq(&hugetlb_lock);
7055     if (!PageHeadHuge(page) ||
7056         !HPageMigratable(page) ||
7057         !get_page_unless_zero(page)) {
7058         ret = -EBUSY;
7059         goto unlock;
7060     }
7061     ClearHPageMigratable(page);
7062     list_move_tail(&page->lru, list);
7063 unlock:
7064     spin_unlock_irq(&hugetlb_lock);
7065     return ret;
7066 }
7067
7068 int get_hwpoison_huge_page(struct page *page, bool *hugetlb)
7069 {
7070     int ret = 0;
7071
7072     *hugetlb = false;
7073     spin_lock_irq(&hugetlb_lock);
7074     if (PageHeadHuge(page)) {
7075         *hugetlb = true;
7076         if (HPageFreed(page))
7077             ret = 0;
7078         else if (HPageMigratable(page))
7079             ret = get_page_unless_zero(page);
7080         else
7081             ret = -EBUSY;
7082     }
7083     spin_unlock_irq(&hugetlb_lock);
7084     return ret;
7085 }
7086
7087 int get_huge_page_for_hwpoison(unsigned long pfn, int flags)
7088 {
7089     int ret;
7090
7091     spin_lock_irq(&hugetlb_lock);
7092     ret = __get_huge_page_for_hwpoison(pfn, flags);
7093     spin_unlock_irq(&hugetlb_lock);
7094     return ret;
7095 }
7096
7097 void putback_active_hugepage(struct page *page)
7098 {
7099     spin_lock_irq(&hugetlb_lock);
7100     SetHPageMigratable(page);
7101     list_move_tail(&page->lru, &(page_hstate(page))->hugepage_activelist);
7102     spin_unlock_irq(&hugetlb_lock);
7103     put_page(page);
7104 }
7105
7106 void move_hugetlb_state(struct page *oldpage, struct page *newpage, int reason)
7107 {
7108     struct hstate *h = page_hstate(oldpage);
7109
7110     hugetlb_cgroup_migrate(oldpage, newpage);
7111     set_page_owner_migrate_reason(newpage, reason);
7112
7113     /*
7114      * transfer temporary state of the new huge page. This is
7115      * reverse to other transitions because the newpage is going to
7116      * be final while the old one will be freed so it takes over
7117      * the temporary status.
7118      *
7119      * Also note that we have to transfer the per-node surplus state
7120      * here as well otherwise the global surplus count will not match
7121      * the per-node's.
7122      */
7123     if (HPageTemporary(newpage)) {
7124         int old_nid = page_to_nid(oldpage);
7125         int new_nid = page_to_nid(newpage);
7126
7127         SetHPageTemporary(oldpage);
7128         ClearHPageTemporary(newpage);
7129
7130         /*
7131          * There is no need to transfer the per-node surplus state
7132          * when we do not cross the node.
7133          */
7134         if (new_nid == old_nid)
7135             return;
7136         spin_lock_irq(&hugetlb_lock);
7137         if (h->surplus_huge_pages_node[old_nid]) {
7138             h->surplus_huge_pages_node[old_nid]--;
7139             h->surplus_huge_pages_node[new_nid]++;
7140         }
7141         spin_unlock_irq(&hugetlb_lock);
7142     }
7143 }
7144
7145 /*
7146  * This function will unconditionally remove all the shared pmd pgtable entries
7147  * within the specific vma for a hugetlbfs memory range.
7148  */
7149 void hugetlb_unshare_all_pmds(struct vm_area_struct *vma)
7150 {
7151     struct hstate *h = hstate_vma(vma);
7152     unsigned long sz = huge_page_size(h);
7153     struct mm_struct *mm = vma->vm_mm;
7154     struct mmu_notifier_range range;
7155     unsigned long address, start, end;
7156     spinlock_t *ptl;
7157     pte_t *ptep;
7158
7159     if (!(vma->vm_flags & VM_MAYSHARE))
7160         return;
7161
7162     start = ALIGN(vma->vm_start, PUD_SIZE);
7163     end = ALIGN_DOWN(vma->vm_end, PUD_SIZE);
7164
7165     if (start >= end)
7166         return;
7167
7168     flush_cache_range(vma, start, end);
7169     /*
7170      * No need to call adjust_range_if_pmd_sharing_possible(), because
7171      * we have already done the PUD_SIZE alignment.
7172      */
7173     mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, mm,
7174                 start, end);
7175     mmu_notifier_invalidate_range_start(&range);
7176     i_mmap_lock_write(vma->vm_file->f_mapping);
7177     for (address = start; address < end; address += PUD_SIZE) {
7178         ptep = huge_pte_offset(mm, address, sz);
7179         if (!ptep)
7180             continue;
7181         ptl = huge_pte_lock(h, mm, ptep);
7182         huge_pmd_unshare(mm, vma, address, ptep);
7183         spin_unlock(ptl);
7184     }
7185     flush_hugetlb_tlb_range(vma, start, end);
7186     i_mmap_unlock_write(vma->vm_file->f_mapping);
7187     /*
7188      * No need to call mmu_notifier_invalidate_range(), see
7189      * Documentation/mm/mmu_notifier.rst.
7190      */
7191     mmu_notifier_invalidate_range_end(&range);
7192 }
7193
7194 #ifdef CONFIG_CMA
7195 static bool cma_reserve_called __initdata;
7196
7197 static int __init cmdline_parse_hugetlb_cma(char *p)
7198 {
7199     int nid, count = 0;
7200     unsigned long tmp;
7201     char *s = p;
7202
7203     while (*s) {
7204         if (sscanf(s, "%lu%n", &tmp, &count) != 1)
7205             break;
7206
7207         if (s[count] == ':') {
7208             if (tmp >= MAX_NUMNODES)
7209                 break;
7210             nid = array_index_nospec(tmp, MAX_NUMNODES);
7211
7212             s += count + 1;
7213             tmp = memparse(s, &s);
7214             hugetlb_cma_size_in_node[nid] = tmp;
7215             hugetlb_cma_size += tmp;
7216
7217             /*
7218              * Skip the separator if have one, otherwise
7219              * break the parsing.
7220              */
7221             if (*s == ',')
7222                 s++;
7223             else
7224                 break;
7225         } else {
7226             hugetlb_cma_size = memparse(p, &p);
7227             break;
7228         }
7229     }
7230
7231     return 0;
7232 }
7233
7234 early_param("hugetlb_cma", cmdline_parse_hugetlb_cma);
7235
7236 void __init hugetlb_cma_reserve(int order)
7237 {
7238     unsigned long size, reserved, per_node;
7239     bool node_specific_cma_alloc = false;
7240     int nid;
7241
7242     cma_reserve_called = true;
7243
7244     if (!hugetlb_cma_size)
7245         return;
7246
7247     for (nid = 0; nid < MAX_NUMNODES; nid++) {
7248         if (hugetlb_cma_size_in_node[nid] == 0)
7249             continue;
7250
7251         if (!node_online(nid)) {
7252             pr_warn("hugetlb_cma: invalid node %d specified\n", nid);
7253             hugetlb_cma_size -= hugetlb_cma_size_in_node[nid];
7254             hugetlb_cma_size_in_node[nid] = 0;
7255             continue;
7256         }
7257
7258         if (hugetlb_cma_size_in_node[nid] < (PAGE_SIZE << order)) {
7259             pr_warn("hugetlb_cma: cma area of node %d should be at least %lu MiB\n",
7260                 nid, (PAGE_SIZE << order) / SZ_1M);
7261             hugetlb_cma_size -= hugetlb_cma_size_in_node[nid];
7262             hugetlb_cma_size_in_node[nid] = 0;
7263         } else {
7264             node_specific_cma_alloc = true;
7265         }
7266     }
7267
7268     /* Validate the CMA size again in case some invalid nodes specified. */
7269     if (!hugetlb_cma_size)
7270         return;
7271
7272     if (hugetlb_cma_size < (PAGE_SIZE << order)) {
7273         pr_warn("hugetlb_cma: cma area should be at least %lu MiB\n",
7274             (PAGE_SIZE << order) / SZ_1M);
7275         hugetlb_cma_size = 0;
7276         return;
7277     }
7278
7279     if (!node_specific_cma_alloc) {
7280         /*
7281          * If 3 GB area is requested on a machine with 4 numa nodes,
7282          * let's allocate 1 GB on first three nodes and ignore the last one.
7283          */
7284         per_node = DIV_ROUND_UP(hugetlb_cma_size, nr_online_nodes);
7285         pr_info("hugetlb_cma: reserve %lu MiB, up to %lu MiB per node\n",
7286             hugetlb_cma_size / SZ_1M, per_node / SZ_1M);
7287     }
7288
7289     reserved = 0;
7290     for_each_online_node(nid) {
7291         int res;
7292         char name[CMA_MAX_NAME];
7293
7294         if (node_specific_cma_alloc) {
7295             if (hugetlb_cma_size_in_node[nid] == 0)
7296                 continue;
7297
7298             size = hugetlb_cma_size_in_node[nid];
7299         } else {
7300             size = min(per_node, hugetlb_cma_size - reserved);
7301         }
7302
7303         size = round_up(size, PAGE_SIZE << order);
7304
7305         snprintf(name, sizeof(name), "hugetlb%d", nid);
7306         /*
7307          * Note that 'order per bit' is based on smallest size that
7308          * may be returned to CMA allocator in the case of
7309          * huge page demotion.
7310          */
7311         res = cma_declare_contiguous_nid(0, size, 0,
7312                         PAGE_SIZE << HUGETLB_PAGE_ORDER,
7313                          0, false, name,
7314                          &hugetlb_cma[nid], nid);
7315         if (res) {
7316             pr_warn("hugetlb_cma: reservation failed: err %d, node %d",
7317                 res, nid);
7318             continue;
7319         }
7320
7321         reserved += size;
7322         pr_info("hugetlb_cma: reserved %lu MiB on node %d\n",
7323             size / SZ_1M, nid);
7324
7325         if (reserved >= hugetlb_cma_size)
7326             break;
7327     }
7328
7329     if (!reserved)
7330         /*
7331          * hugetlb_cma_size is used to determine if allocations from
7332          * cma are possible.  Set to zero if no cma regions are set up.
7333          */
7334         hugetlb_cma_size = 0;
7335 }
7336
7337 void __init hugetlb_cma_check(void)
7338 {
7339     if (!hugetlb_cma_size || cma_reserve_called)
7340         return;
7341
7342     pr_warn("hugetlb_cma: the option isn't supported by current arch\n");
7343 }
7344
7345 #endif /* CONFIG_CMA */