0001 /*
0002  *  linux/mm/page_alloc.c
0003  *
0004  *  Manages the free list; the system allocates free pages here.
0005  *  Note that kmalloc() lives in slab.c
0006  *
0007  *  Copyright (C) 1991, 1992, 1993, 1994  Linus Torvalds
0008  *  Swap reorganised 29.12.95, Stephen Tweedie
0009  *  Support of BIGMEM added by Gerhard Wichert, Siemens AG, July 1999
0010  *  Reshaped it to be a zoned allocator, Ingo Molnar, Red Hat, 1999
0011  *  Discontiguous memory support, Kanoj Sarcar, SGI, Nov 1999
0012  *  Zone balancing, Kanoj Sarcar, SGI, Jan 2000
0013  *  Per cpu hot/cold page lists, bulk allocation, Martin J. Bligh, Sept 2002
0014  *          (lots of bits borrowed from Ingo Molnar & Andrew Morton)
0015  */
0016 
0017 #include <linux/stddef.h>
0018 #include <linux/mm.h>
0019 #include <linux/swap.h>
0020 #include <linux/interrupt.h>
0021 #include <linux/pagemap.h>
0022 #include <linux/jiffies.h>
0023 #include <linux/bootmem.h>
0024 #include <linux/memblock.h>
0025 #include <linux/compiler.h>
0026 #include <linux/kernel.h>
0027 #include <linux/kmemcheck.h>
0028 #include <linux/kasan.h>
0029 #include <linux/module.h>
0030 #include <linux/suspend.h>
0031 #include <linux/pagevec.h>
0032 #include <linux/blkdev.h>
0033 #include <linux/slab.h>
0034 #include <linux/ratelimit.h>
0035 #include <linux/oom.h>
0036 #include <linux/notifier.h>
0037 #include <linux/topology.h>
0038 #include <linux/sysctl.h>
0039 #include <linux/cpu.h>
0040 #include <linux/cpuset.h>
0041 #include <linux/memory_hotplug.h>
0042 #include <linux/nodemask.h>
0043 #include <linux/vmalloc.h>
0044 #include <linux/vmstat.h>
0045 #include <linux/mempolicy.h>
0046 #include <linux/memremap.h>
0047 #include <linux/stop_machine.h>
0048 #include <linux/sort.h>
0049 #include <linux/pfn.h>
0050 #include <linux/backing-dev.h>
0051 #include <linux/fault-inject.h>
0052 #include <linux/page-isolation.h>
0053 #include <linux/page_ext.h>
0054 #include <linux/debugobjects.h>
0055 #include <linux/kmemleak.h>
0056 #include <linux/compaction.h>
0057 #include <trace/events/kmem.h>
0058 #include <linux/prefetch.h>
0059 #include <linux/mm_inline.h>
0060 #include <linux/migrate.h>
0062 #include <linux/hugetlb.h>
0063 #include <linux/sched/rt.h>
0064 #include <linux/page_owner.h>
0065 #include <linux/kthread.h>
0066 #include <linux/memcontrol.h>
0067 
0068 #include <asm/sections.h>
0069 #include <asm/tlbflush.h>
0070 #include <asm/div64.h>
0071 #include "internal.h"
0072 
0073 /* prevent >1 _updater_ of zone percpu pageset ->high and ->batch fields */
0074 static DEFINE_MUTEX(pcp_batch_high_lock);
0075 #define MIN_PERCPU_PAGELIST_FRACTION    (8)
0076 
0077 #ifdef CONFIG_USE_PERCPU_NUMA_NODE_ID
0078 DEFINE_PER_CPU(int, numa_node);
0079 EXPORT_PER_CPU_SYMBOL(numa_node);
0080 #endif
0081 
0082 #ifdef CONFIG_HAVE_MEMORYLESS_NODES
0083 /*
0084  * N.B., Do NOT reference the '_numa_mem_' per cpu variable directly.
0085  * It will not be defined when CONFIG_HAVE_MEMORYLESS_NODES is not defined.
0086  * Use the accessor functions set_numa_mem(), numa_mem_id() and cpu_to_mem()
0087  * defined in <linux/topology.h>.
0088  */
0089 DEFINE_PER_CPU(int, _numa_mem_);        /* Kernel "local memory" node */
0090 EXPORT_PER_CPU_SYMBOL(_numa_mem_);
0091 int _node_numa_mem_[MAX_NUMNODES];
0092 #endif
0093 
0094 #ifdef CONFIG_GCC_PLUGIN_LATENT_ENTROPY
0095 volatile unsigned long latent_entropy __latent_entropy;
0096 EXPORT_SYMBOL(latent_entropy);
0097 #endif
0098 
0099 /*
0100  * Array of node states.
0101  */
0102 nodemask_t node_states[NR_NODE_STATES] __read_mostly = {
0103     [N_POSSIBLE] = NODE_MASK_ALL,
0104     [N_ONLINE] = { { [0] = 1UL } },
0105 #ifndef CONFIG_NUMA
0106     [N_NORMAL_MEMORY] = { { [0] = 1UL } },
0107 #ifdef CONFIG_HIGHMEM
0108     [N_HIGH_MEMORY] = { { [0] = 1UL } },
0109 #endif
0110 #ifdef CONFIG_MOVABLE_NODE
0111     [N_MEMORY] = { { [0] = 1UL } },
0112 #endif
0113     [N_CPU] = { { [0] = 1UL } },
0114 #endif  /* NUMA */
0115 };
0116 EXPORT_SYMBOL(node_states);
0117 
0118 /* Protect totalram_pages and zone->managed_pages */
0119 static DEFINE_SPINLOCK(managed_page_count_lock);
0120 
0121 unsigned long totalram_pages __read_mostly;
0122 unsigned long totalreserve_pages __read_mostly;
0123 unsigned long totalcma_pages __read_mostly;
0124 
0125 int percpu_pagelist_fraction;
0126 gfp_t gfp_allowed_mask __read_mostly = GFP_BOOT_MASK;
0127 
0128 /*
0129  * A cached value of the page's pageblock's migratetype, used when the page is
0130  * put on a pcplist. Used to avoid the pageblock migratetype lookup when
0131  * freeing from pcplists in most cases, at the cost of possibly becoming stale.
0132  * Also the migratetype set in the page does not necessarily match the pcplist
0133  * index, e.g. page might have MIGRATE_CMA set but be on a pcplist with any
0134  * other index - this ensures that it will be put on the correct CMA freelist.
0135  */
0136 static inline int get_pcppage_migratetype(struct page *page)
0137 {
0138     return page->index;
0139 }
0140 
0141 static inline void set_pcppage_migratetype(struct page *page, int migratetype)
0142 {
0143     page->index = migratetype;
0144 }
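     /*
      * Example (illustrative): when a page from a MIGRATE_CMA pageblock is
      * freed to a pcplist, the free path caches MIGRATE_CMA here via
      * set_pcppage_migratetype(), so free_pcppages_bulk() can later return
      * the page to the CMA free list without re-reading the pageblock
      * bitmap. As noted above, the cached value can become stale if the
      * pageblock's migratetype changes in the meantime.
      */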
0145 
0146 #ifdef CONFIG_PM_SLEEP
0147 /*
0148  * The following functions are used by the suspend/hibernate code to temporarily
0149  * change gfp_allowed_mask in order to avoid using I/O during memory allocations
0150  * while devices are suspended.  To avoid races with the suspend/hibernate code,
0151  * they should always be called with pm_mutex held (gfp_allowed_mask also should
0152  * only be modified with pm_mutex held, unless the suspend/hibernate code is
0153  * guaranteed not to run in parallel with that modification).
0154  */
0155 
0156 static gfp_t saved_gfp_mask;
0157 
0158 void pm_restore_gfp_mask(void)
0159 {
0160     WARN_ON(!mutex_is_locked(&pm_mutex));
0161     if (saved_gfp_mask) {
0162         gfp_allowed_mask = saved_gfp_mask;
0163         saved_gfp_mask = 0;
0164     }
0165 }
0166 
0167 void pm_restrict_gfp_mask(void)
0168 {
0169     WARN_ON(!mutex_is_locked(&pm_mutex));
0170     WARN_ON(saved_gfp_mask);
0171     saved_gfp_mask = gfp_allowed_mask;
0172     gfp_allowed_mask &= ~(__GFP_IO | __GFP_FS);
0173 }
0174 
0175 bool pm_suspended_storage(void)
0176 {
0177     if ((gfp_allowed_mask & (__GFP_IO | __GFP_FS)) == (__GFP_IO | __GFP_FS))
0178         return false;
0179     return true;
0180 }
0181 #endif /* CONFIG_PM_SLEEP */
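     /*
      * Example (illustrative, assuming GFP_KERNEL includes __GFP_IO and
      * __GFP_FS as in this tree): after pm_restrict_gfp_mask(), the
      * allocator entry point applies "gfp_mask &= gfp_allowed_mask", so a
      * GFP_KERNEL allocation behaves as if __GFP_IO and __GFP_FS were
      * clear and cannot start I/O against suspended devices;
      * pm_suspended_storage() then reports true until
      * pm_restore_gfp_mask() runs.
      */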
0182 
0183 #ifdef CONFIG_HUGETLB_PAGE_SIZE_VARIABLE
0184 unsigned int pageblock_order __read_mostly;
0185 #endif
0186 
0187 static void __free_pages_ok(struct page *page, unsigned int order);
0188 
0189 /*
0190  * results with 256, 32 in the lowmem_reserve sysctl:
0191  *  1G machine -> (16M dma, 800M-16M normal, 1G-800M high)
0192  *  1G machine -> (16M dma, 784M normal, 224M high)
0193  *  NORMAL allocation will leave 784M/256 of ram reserved in the ZONE_DMA
0194  *  HIGHMEM allocation will leave 224M/32 of ram reserved in ZONE_NORMAL
0195  *  HIGHMEM allocation will leave (224M+784M)/256 of ram reserved in ZONE_DMA
0196  *
0197  * TBD: should special case ZONE_DMA32 machines here - in those we normally
0198  * don't need any ZONE_NORMAL reservation
0199  */
0200 int sysctl_lowmem_reserve_ratio[MAX_NR_ZONES-1] = {
0201 #ifdef CONFIG_ZONE_DMA
0202      256,
0203 #endif
0204 #ifdef CONFIG_ZONE_DMA32
0205      256,
0206 #endif
0207 #ifdef CONFIG_HIGHMEM
0208      32,
0209 #endif
0210      32,
0211 };
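     /*
      * Example (illustrative): setup_per_zone_lowmem_reserve() turns these
      * ratios into per-zone reserves, roughly
      *   lowmem_reserve = managed pages of the higher zone(s) / ratio,
      * so with ratio 256 a 784M ZONE_NORMAL above ZONE_DMA reserves about
      * 784M/256 ~= 3M of DMA memory against NORMAL allocations, matching
      * the figures in the comment above.
      */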
0212 
0213 EXPORT_SYMBOL(totalram_pages);
0214 
0215 static char * const zone_names[MAX_NR_ZONES] = {
0216 #ifdef CONFIG_ZONE_DMA
0217      "DMA",
0218 #endif
0219 #ifdef CONFIG_ZONE_DMA32
0220      "DMA32",
0221 #endif
0222      "Normal",
0223 #ifdef CONFIG_HIGHMEM
0224      "HighMem",
0225 #endif
0226      "Movable",
0227 #ifdef CONFIG_ZONE_DEVICE
0228      "Device",
0229 #endif
0230 };
0231 
0232 char * const migratetype_names[MIGRATE_TYPES] = {
0233     "Unmovable",
0234     "Movable",
0235     "Reclaimable",
0236     "HighAtomic",
0237 #ifdef CONFIG_CMA
0238     "CMA",
0239 #endif
0240 #ifdef CONFIG_MEMORY_ISOLATION
0241     "Isolate",
0242 #endif
0243 };
0244 
0245 compound_page_dtor * const compound_page_dtors[] = {
0246     NULL,
0247     free_compound_page,
0248 #ifdef CONFIG_HUGETLB_PAGE
0249     free_huge_page,
0250 #endif
0251 #ifdef CONFIG_TRANSPARENT_HUGEPAGE
0252     free_transhuge_page,
0253 #endif
0254 };
0255 
0256 int min_free_kbytes = 1024;
0257 int user_min_free_kbytes = -1;
0258 int watermark_scale_factor = 10;
0259 
0260 static unsigned long __meminitdata nr_kernel_pages;
0261 static unsigned long __meminitdata nr_all_pages;
0262 static unsigned long __meminitdata dma_reserve;
0263 
0264 #ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP
0265 static unsigned long __meminitdata arch_zone_lowest_possible_pfn[MAX_NR_ZONES];
0266 static unsigned long __meminitdata arch_zone_highest_possible_pfn[MAX_NR_ZONES];
0267 static unsigned long __initdata required_kernelcore;
0268 static unsigned long __initdata required_movablecore;
0269 static unsigned long __meminitdata zone_movable_pfn[MAX_NUMNODES];
0270 static bool mirrored_kernelcore;
0271 
0272 /* movable_zone is the "real" zone pages in ZONE_MOVABLE are taken from */
0273 int movable_zone;
0274 EXPORT_SYMBOL(movable_zone);
0275 #endif /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */
0276 
0277 #if MAX_NUMNODES > 1
0278 int nr_node_ids __read_mostly = MAX_NUMNODES;
0279 int nr_online_nodes __read_mostly = 1;
0280 EXPORT_SYMBOL(nr_node_ids);
0281 EXPORT_SYMBOL(nr_online_nodes);
0282 #endif
0283 
0284 int page_group_by_mobility_disabled __read_mostly;
0285 
0286 #ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT
0287 static inline void reset_deferred_meminit(pg_data_t *pgdat)
0288 {
0289     pgdat->first_deferred_pfn = ULONG_MAX;
0290 }
0291 
0292 /* Returns true if the struct page for the pfn is uninitialised */
0293 static inline bool __meminit early_page_uninitialised(unsigned long pfn)
0294 {
0295     int nid = early_pfn_to_nid(pfn);
0296 
0297     if (node_online(nid) && pfn >= NODE_DATA(nid)->first_deferred_pfn)
0298         return true;
0299 
0300     return false;
0301 }
0302 
0303 /*
0304  * Returns false when the remaining initialisation should be deferred until
0305  * later in the boot cycle when it can be parallelised.
0306  */
0307 static inline bool update_defer_init(pg_data_t *pgdat,
0308                 unsigned long pfn, unsigned long zone_end,
0309                 unsigned long *nr_initialised)
0310 {
0311     unsigned long max_initialise;
0312 
0313     /* Always populate low zones for address-constrained allocations */
0314     if (zone_end < pgdat_end_pfn(pgdat))
0315         return true;
0316     /*
0317      * Initialise at least 2G of a node, but also take into account
0318      * two large system hashes that can take up 1GB for 0.25TB/node.
0319      */
0320     max_initialise = max(2UL << (30 - PAGE_SHIFT),
0321         (pgdat->node_spanned_pages >> 8));
0322 
0323     (*nr_initialised)++;
0324     if ((*nr_initialised > max_initialise) &&
0325         (pfn & (PAGES_PER_SECTION - 1)) == 0) {
0326         pgdat->first_deferred_pfn = pfn;
0327         return false;
0328     }
0329 
0330     return true;
0331 }
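     /*
      * Worked example (illustrative, assuming 4K pages, i.e. PAGE_SHIFT == 12):
      * 2UL << (30 - PAGE_SHIFT) is 2 << 18 = 524288 pages = 2G, while a
      * 0.25TB node spans roughly 2^26 pages and 2^26 >> 8 = 262144 pages
      * = 1G, which is where the "1GB for 0.25TB/node" figure in the
      * comment above comes from.
      */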
0332 #else
0333 static inline void reset_deferred_meminit(pg_data_t *pgdat)
0334 {
0335 }
0336 
0337 static inline bool early_page_uninitialised(unsigned long pfn)
0338 {
0339     return false;
0340 }
0341 
0342 static inline bool update_defer_init(pg_data_t *pgdat,
0343                 unsigned long pfn, unsigned long zone_end,
0344                 unsigned long *nr_initialised)
0345 {
0346     return true;
0347 }
0348 #endif
0349 
0350 /* Return a pointer to the bitmap storing bits affecting a block of pages */
0351 static inline unsigned long *get_pageblock_bitmap(struct page *page,
0352                             unsigned long pfn)
0353 {
0354 #ifdef CONFIG_SPARSEMEM
0355     return __pfn_to_section(pfn)->pageblock_flags;
0356 #else
0357     return page_zone(page)->pageblock_flags;
0358 #endif /* CONFIG_SPARSEMEM */
0359 }
0360 
0361 static inline int pfn_to_bitidx(struct page *page, unsigned long pfn)
0362 {
0363 #ifdef CONFIG_SPARSEMEM
0364     pfn &= (PAGES_PER_SECTION-1);
0365     return (pfn >> pageblock_order) * NR_PAGEBLOCK_BITS;
0366 #else
0367     pfn = pfn - round_down(page_zone(page)->zone_start_pfn, pageblock_nr_pages);
0368     return (pfn >> pageblock_order) * NR_PAGEBLOCK_BITS;
0369 #endif /* CONFIG_SPARSEMEM */
0370 }
0371 
0372 /**
0373  * get_pfnblock_flags_mask - Return the requested group of flags for the pageblock_nr_pages block of pages
0374  * @page: The page within the block of interest
0375  * @pfn: The target page frame number
0376  * @end_bitidx: The last bit of interest to retrieve
0377  * @mask: mask of bits that the caller is interested in
0378  *
0379  * Return: pageblock_bits flags
0380  */
0381 static __always_inline unsigned long __get_pfnblock_flags_mask(struct page *page,
0382                     unsigned long pfn,
0383                     unsigned long end_bitidx,
0384                     unsigned long mask)
0385 {
0386     unsigned long *bitmap;
0387     unsigned long bitidx, word_bitidx;
0388     unsigned long word;
0389 
0390     bitmap = get_pageblock_bitmap(page, pfn);
0391     bitidx = pfn_to_bitidx(page, pfn);
0392     word_bitidx = bitidx / BITS_PER_LONG;
0393     bitidx &= (BITS_PER_LONG-1);
0394 
0395     word = bitmap[word_bitidx];
0396     bitidx += end_bitidx;
0397     return (word >> (BITS_PER_LONG - bitidx - 1)) & mask;
0398 }
0399 
0400 unsigned long get_pfnblock_flags_mask(struct page *page, unsigned long pfn,
0401                     unsigned long end_bitidx,
0402                     unsigned long mask)
0403 {
0404     return __get_pfnblock_flags_mask(page, pfn, end_bitidx, mask);
0405 }
0406 
0407 static __always_inline int get_pfnblock_migratetype(struct page *page, unsigned long pfn)
0408 {
0409     return __get_pfnblock_flags_mask(page, pfn, PB_migrate_end, MIGRATETYPE_MASK);
0410 }
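     /*
      * Example (illustrative, assuming the usual layout of NR_PAGEBLOCK_BITS
      * == 4 with PB_migrate == 0 and PB_migrate_end == 2 on a 64-bit
      * kernel): each pageblock owns four adjacent bits, so one word covers
      * 16 pageblocks. For the first pageblock in a word, bitidx == 0;
      * reading the migratetype uses end_bitidx == PB_migrate_end, giving a
      * shift of 64 - 2 - 1 == 61, i.e. the three migratetype bits sit in
      * bits 61..63 of that word and are extracted with MIGRATETYPE_MASK.
      */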
0411 
0412 /**
0413  * set_pfnblock_flags_mask - Set the requested group of flags for a pageblock_nr_pages block of pages
0414  * @page: The page within the block of interest
0415  * @flags: The flags to set
0416  * @pfn: The target page frame number
0417  * @end_bitidx: The last bit of interest
0418  * @mask: mask of bits that the caller is interested in
0419  */
0420 void set_pfnblock_flags_mask(struct page *page, unsigned long flags,
0421                     unsigned long pfn,
0422                     unsigned long end_bitidx,
0423                     unsigned long mask)
0424 {
0425     unsigned long *bitmap;
0426     unsigned long bitidx, word_bitidx;
0427     unsigned long old_word, word;
0428 
0429     BUILD_BUG_ON(NR_PAGEBLOCK_BITS != 4);
0430 
0431     bitmap = get_pageblock_bitmap(page, pfn);
0432     bitidx = pfn_to_bitidx(page, pfn);
0433     word_bitidx = bitidx / BITS_PER_LONG;
0434     bitidx &= (BITS_PER_LONG-1);
0435 
0436     VM_BUG_ON_PAGE(!zone_spans_pfn(page_zone(page), pfn), page);
0437 
0438     bitidx += end_bitidx;
0439     mask <<= (BITS_PER_LONG - bitidx - 1);
0440     flags <<= (BITS_PER_LONG - bitidx - 1);
0441 
0442     word = READ_ONCE(bitmap[word_bitidx]);
0443     for (;;) {
0444         old_word = cmpxchg(&bitmap[word_bitidx], word, (word & ~mask) | flags);
0445         if (word == old_word)
0446             break;
0447         word = old_word;
0448     }
0449 }
0450 
0451 void set_pageblock_migratetype(struct page *page, int migratetype)
0452 {
0453     if (unlikely(page_group_by_mobility_disabled &&
0454              migratetype < MIGRATE_PCPTYPES))
0455         migratetype = MIGRATE_UNMOVABLE;
0456 
0457     set_pageblock_flags_group(page, (unsigned long)migratetype,
0458                     PB_migrate, PB_migrate_end);
0459 }
0460 
0461 #ifdef CONFIG_DEBUG_VM
0462 static int page_outside_zone_boundaries(struct zone *zone, struct page *page)
0463 {
0464     int ret = 0;
0465     unsigned seq;
0466     unsigned long pfn = page_to_pfn(page);
0467     unsigned long sp, start_pfn;
0468 
0469     do {
0470         seq = zone_span_seqbegin(zone);
0471         start_pfn = zone->zone_start_pfn;
0472         sp = zone->spanned_pages;
0473         if (!zone_spans_pfn(zone, pfn))
0474             ret = 1;
0475     } while (zone_span_seqretry(zone, seq));
0476 
0477     if (ret)
0478         pr_err("page 0x%lx outside node %d zone %s [ 0x%lx - 0x%lx ]\n",
0479             pfn, zone_to_nid(zone), zone->name,
0480             start_pfn, start_pfn + sp);
0481 
0482     return ret;
0483 }
0484 
0485 static int page_is_consistent(struct zone *zone, struct page *page)
0486 {
0487     if (!pfn_valid_within(page_to_pfn(page)))
0488         return 0;
0489     if (zone != page_zone(page))
0490         return 0;
0491 
0492     return 1;
0493 }
0494 /*
0495  * Temporary debugging check for pages not lying within a given zone.
0496  */
0497 static int bad_range(struct zone *zone, struct page *page)
0498 {
0499     if (page_outside_zone_boundaries(zone, page))
0500         return 1;
0501     if (!page_is_consistent(zone, page))
0502         return 1;
0503 
0504     return 0;
0505 }
0506 #else
0507 static inline int bad_range(struct zone *zone, struct page *page)
0508 {
0509     return 0;
0510 }
0511 #endif
0512 
0513 static void bad_page(struct page *page, const char *reason,
0514         unsigned long bad_flags)
0515 {
0516     static unsigned long resume;
0517     static unsigned long nr_shown;
0518     static unsigned long nr_unshown;
0519 
0520     /*
0521      * Allow a burst of 60 reports, then keep quiet for that minute;
0522      * or allow a steady drip of one report per second.
0523      */
0524     if (nr_shown == 60) {
0525         if (time_before(jiffies, resume)) {
0526             nr_unshown++;
0527             goto out;
0528         }
0529         if (nr_unshown) {
0530             pr_alert(
0531                   "BUG: Bad page state: %lu messages suppressed\n",
0532                 nr_unshown);
0533             nr_unshown = 0;
0534         }
0535         nr_shown = 0;
0536     }
0537     if (nr_shown++ == 0)
0538         resume = jiffies + 60 * HZ;
0539 
0540     pr_alert("BUG: Bad page state in process %s  pfn:%05lx\n",
0541         current->comm, page_to_pfn(page));
0542     __dump_page(page, reason);
0543     bad_flags &= page->flags;
0544     if (bad_flags)
0545         pr_alert("bad because of flags: %#lx(%pGp)\n",
0546                         bad_flags, &bad_flags);
0547     dump_page_owner(page);
0548 
0549     print_modules();
0550     dump_stack();
0551 out:
0552     /* Leave bad fields for debug, except PageBuddy could make trouble */
0553     page_mapcount_reset(page); /* remove PageBuddy */
0554     add_taint(TAINT_BAD_PAGE, LOCKDEP_NOW_UNRELIABLE);
0555 }
0556 
0557 /*
0558  * Higher-order pages are called "compound pages".  They are structured thusly:
0559  *
0560  * The first PAGE_SIZE page is called the "head page" and has PG_head set.
0561  *
0562  * The remaining PAGE_SIZE pages are called "tail pages". PageTail() is encoded
0563  * in bit 0 of page->compound_head. The rest of the bits point to the head page.
0564  *
0565  * The first tail page's ->compound_dtor holds the offset into the array of compound
0566  * page destructors. See compound_page_dtors.
0567  *
0568  * The first tail page's ->compound_order holds the order of allocation.
0569  * This usage means that zero-order pages may not be compound.
0570  */
0571 
0572 void free_compound_page(struct page *page)
0573 {
0574     __free_pages_ok(page, compound_order(page));
0575 }
0576 
0577 void prep_compound_page(struct page *page, unsigned int order)
0578 {
0579     int i;
0580     int nr_pages = 1 << order;
0581 
0582     set_compound_page_dtor(page, COMPOUND_PAGE_DTOR);
0583     set_compound_order(page, order);
0584     __SetPageHead(page);
0585     for (i = 1; i < nr_pages; i++) {
0586         struct page *p = page + i;
0587         set_page_count(p, 0);
0588         p->mapping = TAIL_MAPPING;
0589         set_compound_head(p, page);
0590     }
0591     atomic_set(compound_mapcount_ptr(page), -1);
0592 }
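     /*
      * Example (illustrative): for an order-2 compound page,
      * prep_compound_page() leaves four struct pages behind: page[0] is the
      * head with PG_head set, page[1]..page[3] are tails whose
      * compound_head points back at page[0] with bit 0 set to mark
      * PageTail(), and page[1] additionally records the destructor index
      * (COMPOUND_PAGE_DTOR here) and the order (2).
      */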
0593 
0594 #ifdef CONFIG_DEBUG_PAGEALLOC
0595 unsigned int _debug_guardpage_minorder;
0596 bool _debug_pagealloc_enabled __read_mostly
0597             = IS_ENABLED(CONFIG_DEBUG_PAGEALLOC_ENABLE_DEFAULT);
0598 EXPORT_SYMBOL(_debug_pagealloc_enabled);
0599 bool _debug_guardpage_enabled __read_mostly;
0600 
0601 static int __init early_debug_pagealloc(char *buf)
0602 {
0603     if (!buf)
0604         return -EINVAL;
0605     return kstrtobool(buf, &_debug_pagealloc_enabled);
0606 }
0607 early_param("debug_pagealloc", early_debug_pagealloc);
0608 
0609 static bool need_debug_guardpage(void)
0610 {
0611     /* If we don't use debug_pagealloc, we don't need guard page */
0612     if (!debug_pagealloc_enabled())
0613         return false;
0614 
0615     if (!debug_guardpage_minorder())
0616         return false;
0617 
0618     return true;
0619 }
0620 
0621 static void init_debug_guardpage(void)
0622 {
0623     if (!debug_pagealloc_enabled())
0624         return;
0625 
0626     if (!debug_guardpage_minorder())
0627         return;
0628 
0629     _debug_guardpage_enabled = true;
0630 }
0631 
0632 struct page_ext_operations debug_guardpage_ops = {
0633     .need = need_debug_guardpage,
0634     .init = init_debug_guardpage,
0635 };
0636 
0637 static int __init debug_guardpage_minorder_setup(char *buf)
0638 {
0639     unsigned long res;
0640 
0641     if (kstrtoul(buf, 10, &res) < 0 ||  res > MAX_ORDER / 2) {
0642         pr_err("Bad debug_guardpage_minorder value\n");
0643         return 0;
0644     }
0645     _debug_guardpage_minorder = res;
0646     pr_info("Setting debug_guardpage_minorder to %lu\n", res);
0647     return 0;
0648 }
0649 early_param("debug_guardpage_minorder", debug_guardpage_minorder_setup);
0650 
0651 static inline bool set_page_guard(struct zone *zone, struct page *page,
0652                 unsigned int order, int migratetype)
0653 {
0654     struct page_ext *page_ext;
0655 
0656     if (!debug_guardpage_enabled())
0657         return false;
0658 
0659     if (order >= debug_guardpage_minorder())
0660         return false;
0661 
0662     page_ext = lookup_page_ext(page);
0663     if (unlikely(!page_ext))
0664         return false;
0665 
0666     __set_bit(PAGE_EXT_DEBUG_GUARD, &page_ext->flags);
0667 
0668     INIT_LIST_HEAD(&page->lru);
0669     set_page_private(page, order);
0670     /* Guard pages are not available for any usage */
0671     __mod_zone_freepage_state(zone, -(1 << order), migratetype);
0672 
0673     return true;
0674 }
0675 
0676 static inline void clear_page_guard(struct zone *zone, struct page *page,
0677                 unsigned int order, int migratetype)
0678 {
0679     struct page_ext *page_ext;
0680 
0681     if (!debug_guardpage_enabled())
0682         return;
0683 
0684     page_ext = lookup_page_ext(page);
0685     if (unlikely(!page_ext))
0686         return;
0687 
0688     __clear_bit(PAGE_EXT_DEBUG_GUARD, &page_ext->flags);
0689 
0690     set_page_private(page, 0);
0691     if (!is_migrate_isolate(migratetype))
0692         __mod_zone_freepage_state(zone, (1 << order), migratetype);
0693 }
0694 #else
0695 struct page_ext_operations debug_guardpage_ops;
0696 static inline bool set_page_guard(struct zone *zone, struct page *page,
0697             unsigned int order, int migratetype) { return false; }
0698 static inline void clear_page_guard(struct zone *zone, struct page *page,
0699                 unsigned int order, int migratetype) {}
0700 #endif
0701 
0702 static inline void set_page_order(struct page *page, unsigned int order)
0703 {
0704     set_page_private(page, order);
0705     __SetPageBuddy(page);
0706 }
0707 
0708 static inline void rmv_page_order(struct page *page)
0709 {
0710     __ClearPageBuddy(page);
0711     set_page_private(page, 0);
0712 }
0713 
0714 /*
0715  * This function checks whether a page is free && is the buddy.
0716  * We can coalesce a page and its buddy if
0717  * (a) the buddy is not in a hole &&
0718  * (b) the buddy is in the buddy system &&
0719  * (c) a page and its buddy have the same order &&
0720  * (d) a page and its buddy are in the same zone.
0721  *
0722  * For recording whether a page is in the buddy system, we set ->_mapcount
0723  * to PAGE_BUDDY_MAPCOUNT_VALUE.
0724  * Setting, clearing, and testing _mapcount PAGE_BUDDY_MAPCOUNT_VALUE is
0725  * serialized by zone->lock.
0726  *
0727  * For recording a page's order, we use page_private(page).
0728  */
0729 static inline int page_is_buddy(struct page *page, struct page *buddy,
0730                             unsigned int order)
0731 {
0732     if (!pfn_valid_within(page_to_pfn(buddy)))
0733         return 0;
0734 
0735     if (page_is_guard(buddy) && page_order(buddy) == order) {
0736         if (page_zone_id(page) != page_zone_id(buddy))
0737             return 0;
0738 
0739         VM_BUG_ON_PAGE(page_count(buddy) != 0, buddy);
0740 
0741         return 1;
0742     }
0743 
0744     if (PageBuddy(buddy) && page_order(buddy) == order) {
0745         /*
0746          * zone check is done late to avoid uselessly
0747          * calculating zone/node ids for pages that could
0748          * never merge.
0749          */
0750         if (page_zone_id(page) != page_zone_id(buddy))
0751             return 0;
0752 
0753         VM_BUG_ON_PAGE(page_count(buddy) != 0, buddy);
0754 
0755         return 1;
0756     }
0757     return 0;
0758 }
0759 
0760 /*
0761  * Freeing function for a buddy system allocator.
0762  *
0763  * The concept of a buddy system is to maintain a direct-mapped table
0764  * (containing bit values) for memory blocks of various "orders".
0765  * The bottom level table contains the map for the smallest allocatable
0766  * units of memory (here, pages), and each level above it describes
0767  * pairs of units from the levels below, hence, "buddies".
0768  * At a high level, all that happens here is marking the table entry
0769  * at the bottom level available, and propagating the changes upward
0770  * as necessary, plus some accounting needed to play nicely with other
0771  * parts of the VM system.
0772  * At each level, we keep a list of pages, which are heads of contiguous
0773  * free pages of length (1 << order) and marked with _mapcount
0774  * PAGE_BUDDY_MAPCOUNT_VALUE. A page's order is recorded in the
0775  * page_private(page) field.
0776  * So when we are allocating or freeing one, we can derive the state of the
0777  * other.  That is, if we allocate a small block, and both were
0778  * free, the remainder of the region must be split into blocks.
0779  * If a block is freed, and its buddy is also free, then this
0780  * triggers coalescing into a block of larger size.
0781  *
0782  * -- nyc
0783  */
0784 
0785 static inline void __free_one_page(struct page *page,
0786         unsigned long pfn,
0787         struct zone *zone, unsigned int order,
0788         int migratetype)
0789 {
0790     unsigned long page_idx;
0791     unsigned long combined_idx;
0792     unsigned long uninitialized_var(buddy_idx);
0793     struct page *buddy;
0794     unsigned int max_order;
0795 
0796     max_order = min_t(unsigned int, MAX_ORDER, pageblock_order + 1);
0797 
0798     VM_BUG_ON(!zone_is_initialized(zone));
0799     VM_BUG_ON_PAGE(page->flags & PAGE_FLAGS_CHECK_AT_PREP, page);
0800 
0801     VM_BUG_ON(migratetype == -1);
0802     if (likely(!is_migrate_isolate(migratetype)))
0803         __mod_zone_freepage_state(zone, 1 << order, migratetype);
0804 
0805     page_idx = pfn & ((1 << MAX_ORDER) - 1);
0806 
0807     VM_BUG_ON_PAGE(page_idx & ((1 << order) - 1), page);
0808     VM_BUG_ON_PAGE(bad_range(zone, page), page);
0809 
0810 continue_merging:
0811     while (order < max_order - 1) {
0812         buddy_idx = __find_buddy_index(page_idx, order);
0813         buddy = page + (buddy_idx - page_idx);
0814         if (!page_is_buddy(page, buddy, order))
0815             goto done_merging;
0816         /*
0817          * Our buddy is free or it is a CONFIG_DEBUG_PAGEALLOC guard page,
0818          * merge with it and move up one order.
0819          */
0820         if (page_is_guard(buddy)) {
0821             clear_page_guard(zone, buddy, order, migratetype);
0822         } else {
0823             list_del(&buddy->lru);
0824             zone->free_area[order].nr_free--;
0825             rmv_page_order(buddy);
0826         }
0827         combined_idx = buddy_idx & page_idx;
0828         page = page + (combined_idx - page_idx);
0829         page_idx = combined_idx;
0830         order++;
0831     }
0832     if (max_order < MAX_ORDER) {
0833         /* If we are here, it means order is >= pageblock_order.
0834          * We want to prevent merging between free pages on an isolated
0835          * pageblock and a normal pageblock. Without this, pageblock
0836          * isolation could cause incorrect freepage or CMA accounting.
0837          *
0838          * We don't want to hit this code for the more frequent
0839          * low-order merging.
0840          */
0841         if (unlikely(has_isolate_pageblock(zone))) {
0842             int buddy_mt;
0843 
0844             buddy_idx = __find_buddy_index(page_idx, order);
0845             buddy = page + (buddy_idx - page_idx);
0846             buddy_mt = get_pageblock_migratetype(buddy);
0847 
0848             if (migratetype != buddy_mt
0849                     && (is_migrate_isolate(migratetype) ||
0850                         is_migrate_isolate(buddy_mt)))
0851                 goto done_merging;
0852         }
0853         max_order++;
0854         goto continue_merging;
0855     }
0856 
0857 done_merging:
0858     set_page_order(page, order);
0859 
0860     /*
0861      * If this is not the largest possible page, check if the buddy
0862      * of the next-highest order is free. If it is, it's possible
0863      * that pages are being freed that will coalesce soon. In case
0864      * that is happening, add the free page to the tail of the list
0865      * so it's less likely to be used soon and more likely to be merged
0866      * as a higher order page
0867      */
0868     if ((order < MAX_ORDER-2) && pfn_valid_within(page_to_pfn(buddy))) {
0869         struct page *higher_page, *higher_buddy;
0870         combined_idx = buddy_idx & page_idx;
0871         higher_page = page + (combined_idx - page_idx);
0872         buddy_idx = __find_buddy_index(combined_idx, order + 1);
0873         higher_buddy = higher_page + (buddy_idx - combined_idx);
0874         if (page_is_buddy(higher_page, higher_buddy, order + 1)) {
0875             list_add_tail(&page->lru,
0876                 &zone->free_area[order].free_list[migratetype]);
0877             goto out;
0878         }
0879     }
0880 
0881     list_add(&page->lru, &zone->free_area[order].free_list[migratetype]);
0882 out:
0883     zone->free_area[order].nr_free++;
0884 }
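     /*
      * Worked example (illustrative, assuming __find_buddy_index() is the
      * usual "page_idx ^ (1 << order)"): freeing an order-0 page with
      * page_idx 12 probes buddy_idx 12 ^ 1 = 13; if that page is a free
      * order-0 buddy, the pair merges at combined_idx 13 & 12 = 12 and
      * order 1, the next probe is 12 ^ 2 = 14, and so on until
      * page_is_buddy() fails, at which point the merged block goes onto
      * the matching free list in done_merging.
      */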
0885 
0886 /*
0887  * A bad page could be due to a number of fields. Instead of multiple branches,
0888  * try and check multiple fields with one check. The caller must do a detailed
0889  * check if necessary.
0890  */
0891 static inline bool page_expected_state(struct page *page,
0892                     unsigned long check_flags)
0893 {
0894     if (unlikely(atomic_read(&page->_mapcount) != -1))
0895         return false;
0896 
0897     if (unlikely((unsigned long)page->mapping |
0898             page_ref_count(page) |
0899 #ifdef CONFIG_MEMCG
0900             (unsigned long)page->mem_cgroup |
0901 #endif
0902             (page->flags & check_flags)))
0903         return false;
0904 
0905     return true;
0906 }
0907 
0908 static void free_pages_check_bad(struct page *page)
0909 {
0910     const char *bad_reason;
0911     unsigned long bad_flags;
0912 
0913     bad_reason = NULL;
0914     bad_flags = 0;
0915 
0916     if (unlikely(atomic_read(&page->_mapcount) != -1))
0917         bad_reason = "nonzero mapcount";
0918     if (unlikely(page->mapping != NULL))
0919         bad_reason = "non-NULL mapping";
0920     if (unlikely(page_ref_count(page) != 0))
0921         bad_reason = "nonzero _refcount";
0922     if (unlikely(page->flags & PAGE_FLAGS_CHECK_AT_FREE)) {
0923         bad_reason = "PAGE_FLAGS_CHECK_AT_FREE flag(s) set";
0924         bad_flags = PAGE_FLAGS_CHECK_AT_FREE;
0925     }
0926 #ifdef CONFIG_MEMCG
0927     if (unlikely(page->mem_cgroup))
0928         bad_reason = "page still charged to cgroup";
0929 #endif
0930     bad_page(page, bad_reason, bad_flags);
0931 }
0932 
0933 static inline int free_pages_check(struct page *page)
0934 {
0935     if (likely(page_expected_state(page, PAGE_FLAGS_CHECK_AT_FREE)))
0936         return 0;
0937 
0938     /* Something has gone sideways, find it */
0939     free_pages_check_bad(page);
0940     return 1;
0941 }
0942 
0943 static int free_tail_pages_check(struct page *head_page, struct page *page)
0944 {
0945     int ret = 1;
0946 
0947     /*
0948      * We rely on page->lru.next never having bit 0 set, unless the page
0949      * is PageTail(). Let's make sure that's true even for poisoned ->lru.
0950      */
0951     BUILD_BUG_ON((unsigned long)LIST_POISON1 & 1);
0952 
0953     if (!IS_ENABLED(CONFIG_DEBUG_VM)) {
0954         ret = 0;
0955         goto out;
0956     }
0957     switch (page - head_page) {
0958     case 1:
0959         /* the first tail page: ->mapping is compound_mapcount() */
0960         if (unlikely(compound_mapcount(page))) {
0961             bad_page(page, "nonzero compound_mapcount", 0);
0962             goto out;
0963         }
0964         break;
0965     case 2:
0966         /*
0967          * the second tail page: ->mapping is
0968          * page_deferred_list().next -- ignore value.
0969          */
0970         break;
0971     default:
0972         if (page->mapping != TAIL_MAPPING) {
0973             bad_page(page, "corrupted mapping in tail page", 0);
0974             goto out;
0975         }
0976         break;
0977     }
0978     if (unlikely(!PageTail(page))) {
0979         bad_page(page, "PageTail not set", 0);
0980         goto out;
0981     }
0982     if (unlikely(compound_head(page) != head_page)) {
0983         bad_page(page, "compound_head not consistent", 0);
0984         goto out;
0985     }
0986     ret = 0;
0987 out:
0988     page->mapping = NULL;
0989     clear_compound_head(page);
0990     return ret;
0991 }
0992 
0993 static __always_inline bool free_pages_prepare(struct page *page,
0994                     unsigned int order, bool check_free)
0995 {
0996     int bad = 0;
0997 
0998     VM_BUG_ON_PAGE(PageTail(page), page);
0999 
1000     trace_mm_page_free(page, order);
1001     kmemcheck_free_shadow(page, order);
1002 
1003     /*
1004      * Check tail pages before head page information is cleared to
1005      * avoid checking PageCompound for order-0 pages.
1006      */
1007     if (unlikely(order)) {
1008         bool compound = PageCompound(page);
1009         int i;
1010 
1011         VM_BUG_ON_PAGE(compound && compound_order(page) != order, page);
1012 
1013         if (compound)
1014             ClearPageDoubleMap(page);
1015         for (i = 1; i < (1 << order); i++) {
1016             if (compound)
1017                 bad += free_tail_pages_check(page, page + i);
1018             if (unlikely(free_pages_check(page + i))) {
1019                 bad++;
1020                 continue;
1021             }
1022             (page + i)->flags &= ~PAGE_FLAGS_CHECK_AT_PREP;
1023         }
1024     }
1025     if (PageMappingFlags(page))
1026         page->mapping = NULL;
1027     if (memcg_kmem_enabled() && PageKmemcg(page))
1028         memcg_kmem_uncharge(page, order);
1029     if (check_free)
1030         bad += free_pages_check(page);
1031     if (bad)
1032         return false;
1033 
1034     page_cpupid_reset_last(page);
1035     page->flags &= ~PAGE_FLAGS_CHECK_AT_PREP;
1036     reset_page_owner(page, order);
1037 
1038     if (!PageHighMem(page)) {
1039         debug_check_no_locks_freed(page_address(page),
1040                        PAGE_SIZE << order);
1041         debug_check_no_obj_freed(page_address(page),
1042                        PAGE_SIZE << order);
1043     }
1044     arch_free_page(page, order);
1045     kernel_poison_pages(page, 1 << order, 0);
1046     kernel_map_pages(page, 1 << order, 0);
1047     kasan_free_pages(page, order);
1048 
1049     return true;
1050 }
1051 
1052 #ifdef CONFIG_DEBUG_VM
1053 static inline bool free_pcp_prepare(struct page *page)
1054 {
1055     return free_pages_prepare(page, 0, true);
1056 }
1057 
1058 static inline bool bulkfree_pcp_prepare(struct page *page)
1059 {
1060     return false;
1061 }
1062 #else
1063 static bool free_pcp_prepare(struct page *page)
1064 {
1065     return free_pages_prepare(page, 0, false);
1066 }
1067 
1068 static bool bulkfree_pcp_prepare(struct page *page)
1069 {
1070     return free_pages_check(page);
1071 }
1072 #endif /* CONFIG_DEBUG_VM */
1073 
1074 /*
1075  * Frees a number of pages from the PCP lists
1076  * Assumes all pages on list are in same zone, and of same order.
1077  * count is the number of pages to free.
1078  *
1079  * If the zone was previously in an "all pages pinned" state then look to
1080  * see if this freeing clears that state.
1081  *
1082  * And clear the zone's pages_scanned counter, to hold off the "all pages are
1083  * pinned" detection logic.
1084  */
1085 static void free_pcppages_bulk(struct zone *zone, int count,
1086                     struct per_cpu_pages *pcp)
1087 {
1088     int migratetype = 0;
1089     int batch_free = 0;
1090     unsigned long nr_scanned;
1091     bool isolated_pageblocks;
1092 
1093     spin_lock(&zone->lock);
1094     isolated_pageblocks = has_isolate_pageblock(zone);
1095     nr_scanned = node_page_state(zone->zone_pgdat, NR_PAGES_SCANNED);
1096     if (nr_scanned)
1097         __mod_node_page_state(zone->zone_pgdat, NR_PAGES_SCANNED, -nr_scanned);
1098 
1099     while (count) {
1100         struct page *page;
1101         struct list_head *list;
1102 
1103         /*
1104          * Remove pages from lists in a round-robin fashion. A
1105          * batch_free count is maintained that is incremented when an
1106          * empty list is encountered.  This is so more pages are freed
1107          * off fuller lists instead of spinning excessively around empty
1108          * lists
1109          */
1110         do {
1111             batch_free++;
1112             if (++migratetype == MIGRATE_PCPTYPES)
1113                 migratetype = 0;
1114             list = &pcp->lists[migratetype];
1115         } while (list_empty(list));
1116 
1117         /* This is the only non-empty list. Free them all. */
1118         if (batch_free == MIGRATE_PCPTYPES)
1119             batch_free = count;
1120 
1121         do {
1122             int mt; /* migratetype of the to-be-freed page */
1123 
1124             page = list_last_entry(list, struct page, lru);
1125             /* must delete first, as __free_one_page() manipulates the lru list */
1126             list_del(&page->lru);
1127 
1128             mt = get_pcppage_migratetype(page);
1129             /* MIGRATE_ISOLATE page should not go to pcplists */
1130             VM_BUG_ON_PAGE(is_migrate_isolate(mt), page);
1131             /* Pageblock could have been isolated meanwhile */
1132             if (unlikely(isolated_pageblocks))
1133                 mt = get_pageblock_migratetype(page);
1134 
1135             if (bulkfree_pcp_prepare(page))
1136                 continue;
1137 
1138             __free_one_page(page, page_to_pfn(page), zone, 0, mt);
1139             trace_mm_page_pcpu_drain(page, 0, mt);
1140         } while (--count && --batch_free && !list_empty(list));
1141     }
1142     spin_unlock(&zone->lock);
1143 }
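     /*
      * Example (illustrative): if only one of the MIGRATE_PCPTYPES lists
      * still holds pages, the selection loop walks past the empty lists,
      * batch_free reaches MIGRATE_PCPTYPES and is bumped to count, so the
      * remaining pages are drained from that single list in one inner loop
      * rather than one page per round-robin pass.
      */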
1144 
1145 static void free_one_page(struct zone *zone,
1146                 struct page *page, unsigned long pfn,
1147                 unsigned int order,
1148                 int migratetype)
1149 {
1150     unsigned long nr_scanned;
1151     spin_lock(&zone->lock);
1152     nr_scanned = node_page_state(zone->zone_pgdat, NR_PAGES_SCANNED);
1153     if (nr_scanned)
1154         __mod_node_page_state(zone->zone_pgdat, NR_PAGES_SCANNED, -nr_scanned);
1155 
1156     if (unlikely(has_isolate_pageblock(zone) ||
1157         is_migrate_isolate(migratetype))) {
1158         migratetype = get_pfnblock_migratetype(page, pfn);
1159     }
1160     __free_one_page(page, pfn, zone, order, migratetype);
1161     spin_unlock(&zone->lock);
1162 }
1163 
1164 static void __meminit __init_single_page(struct page *page, unsigned long pfn,
1165                 unsigned long zone, int nid)
1166 {
1167     set_page_links(page, zone, nid, pfn);
1168     init_page_count(page);
1169     page_mapcount_reset(page);
1170     page_cpupid_reset_last(page);
1171 
1172     INIT_LIST_HEAD(&page->lru);
1173 #ifdef WANT_PAGE_VIRTUAL
1174     /* The shift won't overflow because ZONE_NORMAL is below 4G. */
1175     if (!is_highmem_idx(zone))
1176         set_page_address(page, __va(pfn << PAGE_SHIFT));
1177 #endif
1178 }
1179 
1180 static void __meminit __init_single_pfn(unsigned long pfn, unsigned long zone,
1181                     int nid)
1182 {
1183     return __init_single_page(pfn_to_page(pfn), pfn, zone, nid);
1184 }
1185 
1186 #ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT
1187 static void init_reserved_page(unsigned long pfn)
1188 {
1189     pg_data_t *pgdat;
1190     int nid, zid;
1191 
1192     if (!early_page_uninitialised(pfn))
1193         return;
1194 
1195     nid = early_pfn_to_nid(pfn);
1196     pgdat = NODE_DATA(nid);
1197 
1198     for (zid = 0; zid < MAX_NR_ZONES; zid++) {
1199         struct zone *zone = &pgdat->node_zones[zid];
1200 
1201         if (pfn >= zone->zone_start_pfn && pfn < zone_end_pfn(zone))
1202             break;
1203     }
1204     __init_single_pfn(pfn, zid, nid);
1205 }
1206 #else
1207 static inline void init_reserved_page(unsigned long pfn)
1208 {
1209 }
1210 #endif /* CONFIG_DEFERRED_STRUCT_PAGE_INIT */
1211 
1212 /*
1213  * Initialised pages do not have PageReserved set. This function is
1214  * called for each range allocated by the bootmem allocator and
1215  * marks the pages PageReserved. The remaining valid pages are later
1216  * sent to the buddy page allocator.
1217  */
1218 void __meminit reserve_bootmem_region(phys_addr_t start, phys_addr_t end)
1219 {
1220     unsigned long start_pfn = PFN_DOWN(start);
1221     unsigned long end_pfn = PFN_UP(end);
1222 
1223     for (; start_pfn < end_pfn; start_pfn++) {
1224         if (pfn_valid(start_pfn)) {
1225             struct page *page = pfn_to_page(start_pfn);
1226 
1227             init_reserved_page(start_pfn);
1228 
1229             /* Avoid false-positive PageTail() */
1230             INIT_LIST_HEAD(&page->lru);
1231 
1232             SetPageReserved(page);
1233         }
1234     }
1235 }
1236 
1237 static void __free_pages_ok(struct page *page, unsigned int order)
1238 {
1239     unsigned long flags;
1240     int migratetype;
1241     unsigned long pfn = page_to_pfn(page);
1242 
1243     if (!free_pages_prepare(page, order, true))
1244         return;
1245 
1246     migratetype = get_pfnblock_migratetype(page, pfn);
1247     local_irq_save(flags);
1248     __count_vm_events(PGFREE, 1 << order);
1249     free_one_page(page_zone(page), page, pfn, order, migratetype);
1250     local_irq_restore(flags);
1251 }
1252 
1253 static void __init __free_pages_boot_core(struct page *page, unsigned int order)
1254 {
1255     unsigned int nr_pages = 1 << order;
1256     struct page *p = page;
1257     unsigned int loop;
1258 
1259     prefetchw(p);
1260     for (loop = 0; loop < (nr_pages - 1); loop++, p++) {
1261         prefetchw(p + 1);
1262         __ClearPageReserved(p);
1263         set_page_count(p, 0);
1264     }
1265     __ClearPageReserved(p);
1266     set_page_count(p, 0);
1267 
1268     page_zone(page)->managed_pages += nr_pages;
1269     set_page_refcounted(page);
1270     __free_pages(page, order);
1271 }
1272 
1273 #if defined(CONFIG_HAVE_ARCH_EARLY_PFN_TO_NID) || \
1274     defined(CONFIG_HAVE_MEMBLOCK_NODE_MAP)
1275 
1276 static struct mminit_pfnnid_cache early_pfnnid_cache __meminitdata;
1277 
1278 int __meminit early_pfn_to_nid(unsigned long pfn)
1279 {
1280     static DEFINE_SPINLOCK(early_pfn_lock);
1281     int nid;
1282 
1283     spin_lock(&early_pfn_lock);
1284     nid = __early_pfn_to_nid(pfn, &early_pfnnid_cache);
1285     if (nid < 0)
1286         nid = first_online_node;
1287     spin_unlock(&early_pfn_lock);
1288 
1289     return nid;
1290 }
1291 #endif
1292 
1293 #ifdef CONFIG_NODES_SPAN_OTHER_NODES
1294 static inline bool __meminit meminit_pfn_in_nid(unsigned long pfn, int node,
1295                     struct mminit_pfnnid_cache *state)
1296 {
1297     int nid;
1298 
1299     nid = __early_pfn_to_nid(pfn, state);
1300     if (nid >= 0 && nid != node)
1301         return false;
1302     return true;
1303 }
1304 
1305 /* Only safe to use early in boot when initialisation is single-threaded */
1306 static inline bool __meminit early_pfn_in_nid(unsigned long pfn, int node)
1307 {
1308     return meminit_pfn_in_nid(pfn, node, &early_pfnnid_cache);
1309 }
1310 
1311 #else
1312 
1313 static inline bool __meminit early_pfn_in_nid(unsigned long pfn, int node)
1314 {
1315     return true;
1316 }
1317 static inline bool __meminit meminit_pfn_in_nid(unsigned long pfn, int node,
1318                     struct mminit_pfnnid_cache *state)
1319 {
1320     return true;
1321 }
1322 #endif
1323 
1324 
1325 void __init __free_pages_bootmem(struct page *page, unsigned long pfn,
1326                             unsigned int order)
1327 {
1328     if (early_page_uninitialised(pfn))
1329         return;
1330     return __free_pages_boot_core(page, order);
1331 }
1332 
1333 /*
1334  * Check that the whole (or subset of) a pageblock given by the interval of
1335  * [start_pfn, end_pfn) is valid and within the same zone, before scanning it
1336  * with the migration or free compaction scanner. The scanners then need to
1337  * use only pfn_valid_within() check for arches that allow holes within
1338  * pageblocks.
1339  *
1340  * Return struct page pointer of start_pfn, or NULL if checks were not passed.
1341  *
1342  * It's possible on some configurations to have a setup like node0 node1 node0
1343  * i.e. it's possible that all pages within a zone's range of pages do not
1344  * belong to a single zone. We assume that a border between node0 and node1
1345  * can occur within a single pageblock, but not a node0 node1 node0
1346  * interleaving within a single pageblock. It is therefore sufficient to check
1347  * the first and last page of a pageblock and avoid checking each individual
1348  * page in a pageblock.
1349  */
1350 struct page *__pageblock_pfn_to_page(unsigned long start_pfn,
1351                      unsigned long end_pfn, struct zone *zone)
1352 {
1353     struct page *start_page;
1354     struct page *end_page;
1355 
1356     /* end_pfn is one past the range we are checking */
1357     end_pfn--;
1358 
1359     if (!pfn_valid(start_pfn) || !pfn_valid(end_pfn))
1360         return NULL;
1361 
1362     start_page = pfn_to_page(start_pfn);
1363 
1364     if (page_zone(start_page) != zone)
1365         return NULL;
1366 
1367     end_page = pfn_to_page(end_pfn);
1368 
1369     /* This gives shorter code than deriving page_zone(end_page) */
1370     if (page_zone_id(start_page) != page_zone_id(end_page))
1371         return NULL;
1372 
1373     return start_page;
1374 }
1375 
1376 void set_zone_contiguous(struct zone *zone)
1377 {
1378     unsigned long block_start_pfn = zone->zone_start_pfn;
1379     unsigned long block_end_pfn;
1380 
1381     block_end_pfn = ALIGN(block_start_pfn + 1, pageblock_nr_pages);
1382     for (; block_start_pfn < zone_end_pfn(zone);
1383             block_start_pfn = block_end_pfn,
1384              block_end_pfn += pageblock_nr_pages) {
1385 
1386         block_end_pfn = min(block_end_pfn, zone_end_pfn(zone));
1387 
1388         if (!__pageblock_pfn_to_page(block_start_pfn,
1389                          block_end_pfn, zone))
1390             return;
1391     }
1392 
1393     /* We confirm that there is no hole */
1394     zone->contiguous = true;
1395 }
1396 
1397 void clear_zone_contiguous(struct zone *zone)
1398 {
1399     zone->contiguous = false;
1400 }
1401 
1402 #ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT
1403 static void __init deferred_free_range(struct page *page,
1404                     unsigned long pfn, int nr_pages)
1405 {
1406     int i;
1407 
1408     if (!page)
1409         return;
1410 
1411     /* Free a large naturally-aligned chunk if possible */
1412     if (nr_pages == pageblock_nr_pages &&
1413         (pfn & (pageblock_nr_pages - 1)) == 0) {
1414         set_pageblock_migratetype(page, MIGRATE_MOVABLE);
1415         __free_pages_boot_core(page, pageblock_order);
1416         return;
1417     }
1418 
1419     for (i = 0; i < nr_pages; i++, page++, pfn++) {
1420         if ((pfn & (pageblock_nr_pages - 1)) == 0)
1421             set_pageblock_migratetype(page, MIGRATE_MOVABLE);
1422         __free_pages_boot_core(page, 0);
1423     }
1424 }
1425 
1426 /* Completion tracking for deferred_init_memmap() threads */
1427 static atomic_t pgdat_init_n_undone __initdata;
1428 static __initdata DECLARE_COMPLETION(pgdat_init_all_done_comp);
1429 
1430 static inline void __init pgdat_init_report_one_done(void)
1431 {
1432     if (atomic_dec_and_test(&pgdat_init_n_undone))
1433         complete(&pgdat_init_all_done_comp);
1434 }
1435 
1436 /* Initialise remaining memory on a node */
1437 static int __init deferred_init_memmap(void *data)
1438 {
1439     pg_data_t *pgdat = data;
1440     int nid = pgdat->node_id;
1441     struct mminit_pfnnid_cache nid_init_state = { };
1442     unsigned long start = jiffies;
1443     unsigned long nr_pages = 0;
1444     unsigned long walk_start, walk_end;
1445     int i, zid;
1446     struct zone *zone;
1447     unsigned long first_init_pfn = pgdat->first_deferred_pfn;
1448     const struct cpumask *cpumask = cpumask_of_node(pgdat->node_id);
1449 
1450     if (first_init_pfn == ULONG_MAX) {
1451         pgdat_init_report_one_done();
1452         return 0;
1453     }
1454 
1455     /* Bind memory initialisation thread to a local node if possible */
1456     if (!cpumask_empty(cpumask))
1457         set_cpus_allowed_ptr(current, cpumask);
1458 
1459     /* Sanity check boundaries */
1460     BUG_ON(pgdat->first_deferred_pfn < pgdat->node_start_pfn);
1461     BUG_ON(pgdat->first_deferred_pfn > pgdat_end_pfn(pgdat));
1462     pgdat->first_deferred_pfn = ULONG_MAX;
1463 
1464     /* Only the highest zone is deferred so find it */
1465     for (zid = 0; zid < MAX_NR_ZONES; zid++) {
1466         zone = pgdat->node_zones + zid;
1467         if (first_init_pfn < zone_end_pfn(zone))
1468             break;
1469     }
1470 
1471     for_each_mem_pfn_range(i, nid, &walk_start, &walk_end, NULL) {
1472         unsigned long pfn, end_pfn;
1473         struct page *page = NULL;
1474         struct page *free_base_page = NULL;
1475         unsigned long free_base_pfn = 0;
1476         int nr_to_free = 0;
1477 
1478         end_pfn = min(walk_end, zone_end_pfn(zone));
1479         pfn = first_init_pfn;
1480         if (pfn < walk_start)
1481             pfn = walk_start;
1482         if (pfn < zone->zone_start_pfn)
1483             pfn = zone->zone_start_pfn;
1484 
1485         for (; pfn < end_pfn; pfn++) {
1486             if (!pfn_valid_within(pfn))
1487                 goto free_range;
1488 
1489             /*
1490              * Ensure pfn_valid is checked every
1491              * pageblock_nr_pages for memory holes
1492              */
1493             if ((pfn & (pageblock_nr_pages - 1)) == 0) {
1494                 if (!pfn_valid(pfn)) {
1495                     page = NULL;
1496                     goto free_range;
1497                 }
1498             }
1499 
1500             if (!meminit_pfn_in_nid(pfn, nid, &nid_init_state)) {
1501                 page = NULL;
1502                 goto free_range;
1503             }
1504 
1505             /* Minimise pfn page lookups and scheduler checks */
1506             if (page && (pfn & (pageblock_nr_pages - 1)) != 0) {
1507                 page++;
1508             } else {
1509                 nr_pages += nr_to_free;
1510                 deferred_free_range(free_base_page,
1511                         free_base_pfn, nr_to_free);
1512                 free_base_page = NULL;
1513                 free_base_pfn = nr_to_free = 0;
1514 
1515                 page = pfn_to_page(pfn);
1516                 cond_resched();
1517             }
1518 
1519             if (page->flags) {
1520                 VM_BUG_ON(page_zone(page) != zone);
1521                 goto free_range;
1522             }
1523 
1524             __init_single_page(page, pfn, zid, nid);
1525             if (!free_base_page) {
1526                 free_base_page = page;
1527                 free_base_pfn = pfn;
1528                 nr_to_free = 0;
1529             }
1530             nr_to_free++;
1531 
1532             /* Where possible, batch up pages for a single free */
1533             continue;
1534 free_range:
1535             /* Free the current block of pages to allocator */
1536             nr_pages += nr_to_free;
1537             deferred_free_range(free_base_page, free_base_pfn,
1538                                 nr_to_free);
1539             free_base_page = NULL;
1540             free_base_pfn = nr_to_free = 0;
1541         }
1542         /* Free the last block of pages to allocator */
1543         nr_pages += nr_to_free;
1544         deferred_free_range(free_base_page, free_base_pfn, nr_to_free);
1545 
1546         first_init_pfn = max(end_pfn, first_init_pfn);
1547     }
1548 
1549     /* Sanity check that the next zone really is unpopulated */
1550     WARN_ON(++zid < MAX_NR_ZONES && populated_zone(++zone));
1551 
1552     pr_info("node %d initialised, %lu pages in %ums\n", nid, nr_pages,
1553                     jiffies_to_msecs(jiffies - start));
1554 
1555     pgdat_init_report_one_done();
1556     return 0;
1557 }
1558 #endif /* CONFIG_DEFERRED_STRUCT_PAGE_INIT */
1559 
1560 void __init page_alloc_init_late(void)
1561 {
1562     struct zone *zone;
1563 
1564 #ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT
1565     int nid;
1566 
1567     /* There will be num_node_state(N_MEMORY) threads */
1568     atomic_set(&pgdat_init_n_undone, num_node_state(N_MEMORY));
1569     for_each_node_state(nid, N_MEMORY) {
1570         kthread_run(deferred_init_memmap, NODE_DATA(nid), "pgdatinit%d", nid);
1571     }
1572 
1573     /* Block until all are initialised */
1574     wait_for_completion(&pgdat_init_all_done_comp);
1575 
1576     /* Reinit limits that are based on free pages after the kernel is up */
1577     files_maxfiles_init();
1578 #endif
1579 
1580     for_each_populated_zone(zone)
1581         set_zone_contiguous(zone);
1582 }
1583 
1584 #ifdef CONFIG_CMA
1585 /* Free whole pageblock and set its migration type to MIGRATE_CMA. */
1586 void __init init_cma_reserved_pageblock(struct page *page)
1587 {
1588     unsigned i = pageblock_nr_pages;
1589     struct page *p = page;
1590 
1591     do {
1592         __ClearPageReserved(p);
1593         set_page_count(p, 0);
1594     } while (++p, --i);
1595 
1596     set_pageblock_migratetype(page, MIGRATE_CMA);
1597 
1598     if (pageblock_order >= MAX_ORDER) {
1599         i = pageblock_nr_pages;
1600         p = page;
1601         do {
1602             set_page_refcounted(p);
1603             __free_pages(p, MAX_ORDER - 1);
1604             p += MAX_ORDER_NR_PAGES;
1605         } while (i -= MAX_ORDER_NR_PAGES);
1606     } else {
1607         set_page_refcounted(page);
1608         __free_pages(page, pageblock_order);
1609     }
1610 
1611     adjust_managed_page_count(page, pageblock_nr_pages);
1612 }
1613 #endif
1614 
1615 /*
1616  * The order of subdivision here is critical for the IO subsystem.
1617  * Please do not alter this order without good reasons and regression
1618  * testing. Specifically, as large blocks of memory are subdivided,
1619  * the order in which smaller blocks are delivered depends on the order
1620  * they're subdivided in this function. This is the primary factor
1621  * influencing the order in which pages are delivered to the IO
1622  * subsystem according to empirical testing, and this is also justified
1623  * by considering the behavior of a buddy system containing a single
1624  * large block of memory acted on by a series of small allocations.
1625  * This behavior is a critical factor in sglist merging's success.
1626  *
1627  * -- nyc
1628  */
1629 static inline void expand(struct zone *zone, struct page *page,
1630     int low, int high, struct free_area *area,
1631     int migratetype)
1632 {
1633     unsigned long size = 1 << high;
1634 
1635     while (high > low) {
1636         area--;
1637         high--;
1638         size >>= 1;
1639         VM_BUG_ON_PAGE(bad_range(zone, &page[size]), &page[size]);
1640 
1641         /*
1642          * Mark as guard pages (or a single page) so they can be
1643          * merged back into the allocator when the buddy is freed.
1644          * The corresponding page table entries are not touched;
1645          * the pages stay not present in the virtual address space.
1646          */
1647         if (set_page_guard(zone, &page[size], high, migratetype))
1648             continue;
1649 
1650         list_add(&page[size].lru, &area->free_list[migratetype]);
1651         area->nr_free++;
1652         set_page_order(&page[size], high);
1653     }
1654 }
1655 
1656 static void check_new_page_bad(struct page *page)
1657 {
1658     const char *bad_reason = NULL;
1659     unsigned long bad_flags = 0;
1660 
1661     if (unlikely(atomic_read(&page->_mapcount) != -1))
1662         bad_reason = "nonzero mapcount";
1663     if (unlikely(page->mapping != NULL))
1664         bad_reason = "non-NULL mapping";
1665     if (unlikely(page_ref_count(page) != 0))
1666         bad_reason = "nonzero _count";
1667     if (unlikely(page->flags & __PG_HWPOISON)) {
1668         bad_reason = "HWPoisoned (hardware-corrupted)";
1669         bad_flags = __PG_HWPOISON;
1670         /* Don't complain about hwpoisoned pages */
1671         page_mapcount_reset(page); /* remove PageBuddy */
1672         return;
1673     }
1674     if (unlikely(page->flags & PAGE_FLAGS_CHECK_AT_PREP)) {
1675         bad_reason = "PAGE_FLAGS_CHECK_AT_PREP flag set";
1676         bad_flags = PAGE_FLAGS_CHECK_AT_PREP;
1677     }
1678 #ifdef CONFIG_MEMCG
1679     if (unlikely(page->mem_cgroup))
1680         bad_reason = "page still charged to cgroup";
1681 #endif
1682     bad_page(page, bad_reason, bad_flags);
1683 }
1684 
1685 /*
1686  * This page is about to be returned from the page allocator
1687  */
1688 static inline int check_new_page(struct page *page)
1689 {
1690     if (likely(page_expected_state(page,
1691                 PAGE_FLAGS_CHECK_AT_PREP|__PG_HWPOISON)))
1692         return 0;
1693 
1694     check_new_page_bad(page);
1695     return 1;
1696 }
1697 
1698 static inline bool free_pages_prezeroed(bool poisoned)
1699 {
1700     return IS_ENABLED(CONFIG_PAGE_POISONING_ZERO) &&
1701         page_poisoning_enabled() && poisoned;
1702 }
1703 
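/*
 * Order-0 pages taken from the pcp lists are sanity checked in exactly one
 * of two places: with CONFIG_DEBUG_VM the check runs every time a page is
 * allocated from a pcp list (check_new_pcp), otherwise it only runs when
 * the pcp list is refilled from the buddy lists (check_pcp_refill).
 */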
1704 #ifdef CONFIG_DEBUG_VM
1705 static bool check_pcp_refill(struct page *page)
1706 {
1707     return false;
1708 }
1709 
1710 static bool check_new_pcp(struct page *page)
1711 {
1712     return check_new_page(page);
1713 }
1714 #else
1715 static bool check_pcp_refill(struct page *page)
1716 {
1717     return check_new_page(page);
1718 }
1719 static bool check_new_pcp(struct page *page)
1720 {
1721     return false;
1722 }
1723 #endif /* CONFIG_DEBUG_VM */
1724 
1725 static bool check_new_pages(struct page *page, unsigned int order)
1726 {
1727     int i;
1728     for (i = 0; i < (1 << order); i++) {
1729         struct page *p = page + i;
1730 
1731         if (unlikely(check_new_page(p)))
1732             return true;
1733     }
1734 
1735     return false;
1736 }
1737 
1738 inline void post_alloc_hook(struct page *page, unsigned int order,
1739                 gfp_t gfp_flags)
1740 {
1741     set_page_private(page, 0);
1742     set_page_refcounted(page);
1743 
1744     arch_alloc_page(page, order);
1745     kernel_map_pages(page, 1 << order, 1);
1746     kernel_poison_pages(page, 1 << order, 1);
1747     kasan_alloc_pages(page, order);
1748     set_page_owner(page, order, gfp_flags);
1749 }
1750 
1751 static void prep_new_page(struct page *page, unsigned int order, gfp_t gfp_flags,
1752                             unsigned int alloc_flags)
1753 {
1754     int i;
1755     bool poisoned = true;
1756 
1757     for (i = 0; i < (1 << order); i++) {
1758         struct page *p = page + i;
1759         if (poisoned)
1760             poisoned &= page_is_poisoned(p);
1761     }
1762 
1763     post_alloc_hook(page, order, gfp_flags);
1764 
1765     if (!free_pages_prezeroed(poisoned) && (gfp_flags & __GFP_ZERO))
1766         for (i = 0; i < (1 << order); i++)
1767             clear_highpage(page + i);
1768 
1769     if (order && (gfp_flags & __GFP_COMP))
1770         prep_compound_page(page, order);
1771 
1772     /*
1773      * page is set pfmemalloc when ALLOC_NO_WATERMARKS was necessary to
1774      * allocate the page. The expectation is that the caller is taking
1775      * steps that will free more memory. The caller should avoid the page
1776      * being used for !PFMEMALLOC purposes.
1777      */
1778     if (alloc_flags & ALLOC_NO_WATERMARKS)
1779         set_page_pfmemalloc(page);
1780     else
1781         clear_page_pfmemalloc(page);
1782 }
1783 
1784 /*
1785  * Go through the free lists for the given migratetype and remove
1786  * the smallest available page from the freelists
1787  */
1788 static inline
1789 struct page *__rmqueue_smallest(struct zone *zone, unsigned int order,
1790                         int migratetype)
1791 {
1792     unsigned int current_order;
1793     struct free_area *area;
1794     struct page *page;
1795 
1796     /* Find a page of the appropriate size in the preferred list */
1797     for (current_order = order; current_order < MAX_ORDER; ++current_order) {
1798         area = &(zone->free_area[current_order]);
1799         page = list_first_entry_or_null(&area->free_list[migratetype],
1800                             struct page, lru);
1801         if (!page)
1802             continue;
1803         list_del(&page->lru);
1804         rmv_page_order(page);
1805         area->nr_free--;
1806         expand(zone, page, order, current_order, area, migratetype);
1807         set_pcppage_migratetype(page, migratetype);
1808         return page;
1809     }
1810 
1811     return NULL;
1812 }
1813 
1814 
1815 /*
1816  * This array describes the order in which free lists are fallen back on
1817  * when the free lists for the desired migratetype are depleted.
1818  */
1819 static int fallbacks[MIGRATE_TYPES][4] = {
1820     [MIGRATE_UNMOVABLE]   = { MIGRATE_RECLAIMABLE, MIGRATE_MOVABLE,   MIGRATE_TYPES },
1821     [MIGRATE_RECLAIMABLE] = { MIGRATE_UNMOVABLE,   MIGRATE_MOVABLE,   MIGRATE_TYPES },
1822     [MIGRATE_MOVABLE]     = { MIGRATE_RECLAIMABLE, MIGRATE_UNMOVABLE, MIGRATE_TYPES },
1823 #ifdef CONFIG_CMA
1824     [MIGRATE_CMA]         = { MIGRATE_TYPES }, /* Never used */
1825 #endif
1826 #ifdef CONFIG_MEMORY_ISOLATION
1827     [MIGRATE_ISOLATE]     = { MIGRATE_TYPES }, /* Never used */
1828 #endif
1829 };
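/*
 * Read as: a MIGRATE_UNMOVABLE request whose own free lists are empty tries
 * MIGRATE_RECLAIMABLE first and MIGRATE_MOVABLE second; the trailing
 * MIGRATE_TYPES entry terminates the walk in find_suitable_fallback().
 */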
1830 
1831 #ifdef CONFIG_CMA
1832 static struct page *__rmqueue_cma_fallback(struct zone *zone,
1833                     unsigned int order)
1834 {
1835     return __rmqueue_smallest(zone, order, MIGRATE_CMA);
1836 }
1837 #else
1838 static inline struct page *__rmqueue_cma_fallback(struct zone *zone,
1839                     unsigned int order) { return NULL; }
1840 #endif
1841 
1842 /*
1843  * Move the free pages in a range to the free lists of the requested type.
1844  * Note that start_page and end_page are not aligned on a pageblock
1845  * boundary. If alignment is required, use move_freepages_block().
1846  */
1847 int move_freepages(struct zone *zone,
1848               struct page *start_page, struct page *end_page,
1849               int migratetype)
1850 {
1851     struct page *page;
1852     unsigned int order;
1853     int pages_moved = 0;
1854 
1855 #ifndef CONFIG_HOLES_IN_ZONE
1856     /*
1857      * page_zone is not safe to call in this context when
1858      * CONFIG_HOLES_IN_ZONE is set. This bug check is probably redundant
1859      * anyway as we check zone boundaries in move_freepages_block().
1860      * Remove at a later date when no bug reports exist related to
1861      * grouping pages by mobility
1862      */
1863     VM_BUG_ON(page_zone(start_page) != page_zone(end_page));
1864 #endif
1865 
1866     for (page = start_page; page <= end_page;) {
1867         if (!pfn_valid_within(page_to_pfn(page))) {
1868             page++;
1869             continue;
1870         }
1871 
1872         /* Make sure we are not inadvertently changing nodes */
1873         VM_BUG_ON_PAGE(page_to_nid(page) != zone_to_nid(zone), page);
1874 
1875         if (!PageBuddy(page)) {
1876             page++;
1877             continue;
1878         }
1879 
1880         order = page_order(page);
1881         list_move(&page->lru,
1882               &zone->free_area[order].free_list[migratetype]);
1883         page += 1 << order;
1884         pages_moved += 1 << order;
1885     }
1886 
1887     return pages_moved;
1888 }
1889 
1890 int move_freepages_block(struct zone *zone, struct page *page,
1891                 int migratetype)
1892 {
1893     unsigned long start_pfn, end_pfn;
1894     struct page *start_page, *end_page;
1895 
1896     start_pfn = page_to_pfn(page);
1897     start_pfn = start_pfn & ~(pageblock_nr_pages-1);
1898     start_page = pfn_to_page(start_pfn);
1899     end_page = start_page + pageblock_nr_pages - 1;
1900     end_pfn = start_pfn + pageblock_nr_pages - 1;
1901 
1902     /* Do not cross zone boundaries */
1903     if (!zone_spans_pfn(zone, start_pfn))
1904         start_page = page;
1905     if (!zone_spans_pfn(zone, end_pfn))
1906         return 0;
1907 
1908     return move_freepages(zone, start_page, end_page, migratetype);
1909 }
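/*
 * Example of the alignment above, assuming pageblock_nr_pages == 512
 * (e.g. x86-64 with 4KiB base pages and 2MiB pageblocks): a page at
 * pfn 1000 yields start_pfn = 1000 & ~511 = 512 and end_pfn = 1023,
 * so the whole pageblock containing the page is scanned.
 */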
1910 
1911 static void change_pageblock_range(struct page *pageblock_page,
1912                     int start_order, int migratetype)
1913 {
1914     int nr_pageblocks = 1 << (start_order - pageblock_order);
1915 
1916     while (nr_pageblocks--) {
1917         set_pageblock_migratetype(pageblock_page, migratetype);
1918         pageblock_page += pageblock_nr_pages;
1919     }
1920 }
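/*
 * Worked example for change_pageblock_range(): a buddy of start_order 10
 * with pageblock_order == 9 (an assumed, typical value) spans
 * 1 << (10 - 9) = 2 pageblocks, so the migratetype is set on two
 * consecutive pageblocks of pageblock_nr_pages pages each.
 */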
1921 
1922 /*
1923  * When we are falling back to another migratetype during allocation, try to
1924  * steal extra free pages from the same pageblocks to satisfy further
1925  * allocations, instead of polluting multiple pageblocks.
1926  *
1927  * If we are stealing a relatively large buddy page, it is likely there will
1928  * be more free pages in the pageblock, so try to steal them all. For
1929  * reclaimable and unmovable allocations, we steal regardless of page size,
1930  * as fragmentation caused by those allocations polluting movable pageblocks
1931  * is worse than movable allocations stealing from unmovable and reclaimable
1932  * pageblocks.
1933  */
1934 static bool can_steal_fallback(unsigned int order, int start_mt)
1935 {
1936     /*
1937      * This order check is intentionally kept even though the check
1938      * below is more relaxed. The reason is that we can actually steal
1939      * the whole pageblock if this condition is met, whereas the check
1940      * below does not guarantee it and is just a heuristic that could
1941      * be changed at any time.
1942      */
1943     if (order >= pageblock_order)
1944         return true;
1945 
1946     if (order >= pageblock_order / 2 ||
1947         start_mt == MIGRATE_RECLAIMABLE ||
1948         start_mt == MIGRATE_UNMOVABLE ||
1949         page_group_by_mobility_disabled)
1950         return true;
1951 
1952     return false;
1953 }
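/*
 * With pageblock_order == 9 (an assumed, typical value) and grouping by
 * mobility enabled, the checks above let a movable request steal from a
 * fallback pageblock only when the buddy being taken is of order
 * pageblock_order / 2 == 4 or larger, whereas unmovable and reclaimable
 * requests may always steal.
 */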
1954 
1955 /*
1956  * This function implements the actual steal behaviour. If the order is large
1957  * enough, we can steal the whole pageblock. If not, we first move the free
1958  * pages in this pageblock and check whether at least half of the pages were
1959  * moved. If so, we can change the migratetype of the pageblock and permanently
1960  * use its pages as the requested migratetype in the future.
1961  */
1962 static void steal_suitable_fallback(struct zone *zone, struct page *page,
1963                               int start_type)
1964 {
1965     unsigned int current_order = page_order(page);
1966     int pages;
1967 
1968     /* Take ownership for orders >= pageblock_order */
1969     if (current_order >= pageblock_order) {
1970         change_pageblock_range(page, current_order, start_type);
1971         return;
1972     }
1973 
1974     pages = move_freepages_block(zone, page, start_type);
1975 
1976     /* Claim the whole block if over half of it is free */
1977     if (pages >= (1 << (pageblock_order-1)) ||
1978             page_group_by_mobility_disabled)
1979         set_pageblock_migratetype(page, start_type);
1980 }
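/*
 * Worked example of the claim threshold above: with pageblock_order == 9
 * (an assumed, typical value) a pageblock holds 512 pages, so the block's
 * migratetype is changed once move_freepages_block() reports at least
 * 1 << (9 - 1) == 256 pages moved, i.e. at least half the block is free.
 */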
1981 
1982 /*
1983  * Check whether there is a suitable fallback freepage of the requested order.
1984  * If only_stealable is true, this function returns fallback_mt only if
1985  * we can steal the other freepages all together. This helps to reduce
1986  * fragmentation due to mixed migratetype pages in one pageblock.
1987  */
1988 int find_suitable_fallback(struct free_area *area, unsigned int order,
1989             int migratetype, bool only_stealable, bool *can_steal)
1990 {
1991     int i;
1992     int fallback_mt;
1993 
1994     if (area->nr_free == 0)
1995         return -1;
1996 
1997     *can_steal = false;
1998     for (i = 0;; i++) {
1999         fallback_mt = fallbacks[migratetype][i];
2000         if (fallback_mt == MIGRATE_TYPES)
2001             break;
2002 
2003         if (list_empty(&area->free_list[fallback_mt]))
2004             continue;
2005 
2006         if (can_steal_fallback(order, migratetype))
2007             *can_steal = true;
2008 
2009         if (!only_stealable)
2010             return fallback_mt;
2011 
2012         if (*can_steal)
2013             return fallback_mt;
2014     }
2015 
2016     return -1;
2017 }
2018 
2019 /*
2020  * Reserve a pageblock for exclusive use of high-order atomic allocations if
2021  * there are no empty page blocks that contain a page with a suitable order
2022  */
2023 static void reserve_highatomic_pageblock(struct page *page, struct zone *zone,
2024                 unsigned int alloc_order)
2025 {
2026     int mt;
2027     unsigned long max_managed, flags;
2028 
2029     /*
2030      * Limit the number reserved to 1 pageblock or roughly 1% of a zone.
2031      * Check is race-prone but harmless.
2032      */
2033     max_managed = (zone->managed_pages / 100) + pageblock_nr_pages;
2034     if (zone->nr_reserved_highatomic >= max_managed)
2035         return;
2036 
2037     spin_lock_irqsave(&zone->lock, flags);
2038 
2039     /* Recheck the nr_reserved_highatomic limit under the lock */
2040     if (zone->nr_reserved_highatomic >= max_managed)
2041         goto out_unlock;
2042 
2043     /* Yoink! */
2044     mt = get_pageblock_migratetype(page);
2045     if (mt != MIGRATE_HIGHATOMIC &&
2046             !is_migrate_isolate(mt) && !is_migrate_cma(mt)) {
2047         zone->nr_reserved_highatomic += pageblock_nr_pages;
2048         set_pageblock_migratetype(page, MIGRATE_HIGHATOMIC);
2049         move_freepages_block(zone, page, MIGRATE_HIGHATOMIC);
2050     }
2051 
2052 out_unlock:
2053     spin_unlock_irqrestore(&zone->lock, flags);
2054 }
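/*
 * Example of the limit above with hypothetical numbers: a zone managing
 * 1,000,000 pages and pageblock_nr_pages == 512 gives
 * max_managed = 10,000 + 512 = 10,512 pages, i.e. at most about
 * 20 pageblocks are ever reserved for high-order atomic allocations.
 */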
2055 
2056 /*
2057  * Used when an allocation is about to fail under memory pressure. This
2058  * potentially hurts the reliability of high-order allocations when under
2059  * intense memory pressure but failed atomic allocations should be easier
2060  * to recover from than an OOM.
2061  *
2062  * If @force is true, try to unreserve a pageblock even though the highatomic
2063  * pageblock reserve is exhausted.
2064  */
2065 static bool unreserve_highatomic_pageblock(const struct alloc_context *ac,
2066                         bool force)
2067 {
2068     struct zonelist *zonelist = ac->zonelist;
2069     unsigned long flags;
2070     struct zoneref *z;
2071     struct zone *zone;
2072     struct page *page;
2073     int order;
2074     bool ret;
2075 
2076     for_each_zone_zonelist_nodemask(zone, z, zonelist, ac->high_zoneidx,
2077                                 ac->nodemask) {
2078         /*
2079          * Preserve at least one pageblock unless memory pressure
2080          * is really high.
2081          */
2082         if (!force && zone->nr_reserved_highatomic <=
2083                     pageblock_nr_pages)
2084             continue;
2085 
2086         spin_lock_irqsave(&zone->lock, flags);
2087         for (order = 0; order < MAX_ORDER; order++) {
2088             struct free_area *area = &(zone->free_area[order]);
2089 
2090             page = list_first_entry_or_null(
2091                     &area->free_list[MIGRATE_HIGHATOMIC],
2092                     struct page, lru);
2093             if (!page)
2094                 continue;
2095 
2096             /*
2097              * In the page freeing path, the migratetype change is racy,
2098              * so we can encounter several free pages in a pageblock
2099              * in this loop although we changed the pageblock type
2100              * from highatomic to ac->migratetype. So we should only
2101              * adjust the count once.
2102              */
2103             if (get_pageblock_migratetype(page) ==
2104                             MIGRATE_HIGHATOMIC) {
2105                 /*
2106                  * It should never happen but changes to
2107                  * locking could inadvertently allow a per-cpu
2108                  * drain to add pages to MIGRATE_HIGHATOMIC
2109                  * while unreserving so be safe and watch for
2110                  * underflows.
2111                  */
2112                 zone->nr_reserved_highatomic -= min(
2113                         pageblock_nr_pages,
2114                         zone->nr_reserved_highatomic);
2115             }
2116 
2117             /*
2118              * Convert to ac->migratetype and avoid the normal
2119              * pageblock stealing heuristics. Minimally, the caller
2120              * is doing the work and needs the pages. More
2121              * importantly, if the block was always converted to
2122              * MIGRATE_UNMOVABLE or another type then the number
2123              * of pageblocks that cannot be completely freed
2124              * may increase.
2125              */
2126             set_pageblock_migratetype(page, ac->migratetype);
2127             ret = move_freepages_block(zone, page, ac->migratetype);
2128             if (ret) {
2129                 spin_unlock_irqrestore(&zone->lock, flags);
2130                 return ret;
2131             }
2132         }
2133         spin_unlock_irqrestore(&zone->lock, flags);
2134     }
2135 
2136     return false;
2137 }
2138 
2139 /* Remove an element from the buddy allocator from the fallback list */
2140 static inline struct page *
2141 __rmqueue_fallback(struct zone *zone, unsigned int order, int start_migratetype)
2142 {
2143     struct free_area *area;
2144     unsigned int current_order;
2145     struct page *page;
2146     int fallback_mt;
2147     bool can_steal;
2148 
2149     /* Find the largest possible block of pages in the other list */
2150     for (current_order = MAX_ORDER-1;
2151                 current_order >= order && current_order <= MAX_ORDER-1;
2152                 --current_order) {
2153         area = &(zone->free_area[current_order]);
2154         fallback_mt = find_suitable_fallback(area, current_order,
2155                 start_migratetype, false, &can_steal);
2156         if (fallback_mt == -1)
2157             continue;
2158 
2159         page = list_first_entry(&area->free_list[fallback_mt],
2160                         struct page, lru);
2161         if (can_steal &&
2162             get_pageblock_migratetype(page) != MIGRATE_HIGHATOMIC)
2163             steal_suitable_fallback(zone, page, start_migratetype);
2164 
2165         /* Remove the page from the freelists */
2166         area->nr_free--;
2167         list_del(&page->lru);
2168         rmv_page_order(page);
2169 
2170         expand(zone, page, order, current_order, area,
2171                     start_migratetype);
2172         /*
2173          * The pcppage_migratetype may differ from pageblock's
2174          * migratetype depending on the decisions in
2175          * find_suitable_fallback(). This is OK as long as it does not
2176          * differ for MIGRATE_CMA pageblocks. Those can be used as
2177          * fallback only via the special __rmqueue_cma_fallback() function.
2178          */
2179         set_pcppage_migratetype(page, start_migratetype);
2180 
2181         trace_mm_page_alloc_extfrag(page, order, current_order,
2182             start_migratetype, fallback_mt);
2183 
2184         return page;
2185     }
2186 
2187     return NULL;
2188 }
2189 
2190 /*
2191  * Do the hard work of removing an element from the buddy allocator.
2192  * Call me with the zone->lock already held.
2193  */
2194 static struct page *__rmqueue(struct zone *zone, unsigned int order,
2195                 int migratetype)
2196 {
2197     struct page *page;
2198 
2199     page = __rmqueue_smallest(zone, order, migratetype);
2200     if (unlikely(!page)) {
2201         if (migratetype == MIGRATE_MOVABLE)
2202             page = __rmqueue_cma_fallback(zone, order);
2203 
2204         if (!page)
2205             page = __rmqueue_fallback(zone, order, migratetype);
2206     }
2207 
2208     trace_mm_page_alloc_zone_locked(page, order, migratetype);
2209     return page;
2210 }
2211 
2212 /*
2213  * Obtain a specified number of elements from the buddy allocator, all under
2214  * a single hold of the lock, for efficiency.  Add them to the supplied list.
2215  * Returns the number of new pages which were placed at *list.
2216  */
2217 static int rmqueue_bulk(struct zone *zone, unsigned int order,
2218             unsigned long count, struct list_head *list,
2219             int migratetype, bool cold)
2220 {
2221     int i, alloced = 0;
2222 
2223     spin_lock(&zone->lock);
2224     for (i = 0; i < count; ++i) {
2225         struct page *page = __rmqueue(zone, order, migratetype);
2226         if (unlikely(page == NULL))
2227             break;
2228 
2229         if (unlikely(check_pcp_refill(page)))
2230             continue;
2231 
2232         /*
2233          * Split buddy pages returned by expand() are received here
2234          * in physical page order. The page is added to the caller's
2235          * list and the list head then moves forward. From the caller's
2236          * perspective, the linked list is ordered by page number under
2237          * some conditions. This is useful for IO devices that can
2238          * merge IO requests if the physical pages are ordered
2239          * properly.
2240          */
2241         if (likely(!cold))
2242             list_add(&page->lru, list);
2243         else
2244             list_add_tail(&page->lru, list);
2245         list = &page->lru;
2246         alloced++;
2247         if (is_migrate_cma(get_pcppage_migratetype(page)))
2248             __mod_zone_page_state(zone, NR_FREE_CMA_PAGES,
2249                           -(1 << order));
2250     }
2251 
2252     /*
2253      * i pages were removed from the buddy list even if some leaked due
2254      * to check_pcp_refill() failing, so adjust NR_FREE_PAGES based
2255      * on i. Do not confuse this with 'alloced', which is the number of
2256      * pages added to the pcp list.
2257      */
2258     __mod_zone_page_state(zone, NR_FREE_PAGES, -(i << order));
2259     spin_unlock(&zone->lock);
2260     return alloced;
2261 }
2262 
2263 #ifdef CONFIG_NUMA
2264 /*
2265  * Called from the vmstat counter updater to drain pagesets of this
2266  * currently executing processor on remote nodes after they have
2267  * expired.
2268  *
2269  * Note that this function must be called with the thread pinned to
2270  * a single processor.
2271  */
2272 void drain_zone_pages(struct zone *zone, struct per_cpu_pages *pcp)
2273 {
2274     unsigned long flags;
2275     int to_drain, batch;
2276 
2277     local_irq_save(flags);
2278     batch = READ_ONCE(pcp->batch);
2279     to_drain = min(pcp->count, batch);
2280     if (to_drain > 0) {
2281         free_pcppages_bulk(zone, to_drain, pcp);
2282         pcp->count -= to_drain;
2283     }
2284     local_irq_restore(flags);
2285 }
2286 #endif
2287 
2288 /*
2289  * Drain pcplists of the indicated processor and zone.
2290  *
2291  * The processor must either be the current processor and the
2292  * thread pinned to the current processor or a processor that
2293  * is not online.
2294  */
2295 static void drain_pages_zone(unsigned int cpu, struct zone *zone)
2296 {
2297     unsigned long flags;
2298     struct per_cpu_pageset *pset;
2299     struct per_cpu_pages *pcp;
2300 
2301     local_irq_save(flags);
2302     pset = per_cpu_ptr(zone->pageset, cpu);
2303 
2304     pcp = &pset->pcp;
2305     if (pcp->count) {
2306         free_pcppages_bulk(zone, pcp->count, pcp);
2307         pcp->count = 0;
2308     }
2309     local_irq_restore(flags);
2310 }
2311 
2312 /*
2313  * Drain pcplists of all zones on the indicated processor.
2314  *
2315  * The processor must either be the current processor and the
2316  * thread pinned to the current processor or a processor that
2317  * is not online.
2318  */
2319 static void drain_pages(unsigned int cpu)
2320 {
2321     struct zone *zone;
2322 
2323     for_each_populated_zone(zone) {
2324         drain_pages_zone(cpu, zone);
2325     }
2326 }
2327 
2328 /*
2329  * Spill all of this CPU's per-cpu pages back into the buddy allocator.
2330  *
2331  * The CPU has to be pinned. When zone parameter is non-NULL, spill just
2332  * the single zone's pages.
2333  */
2334 void drain_local_pages(struct zone *zone)
2335 {
2336     int cpu = smp_processor_id();
2337 
2338     if (zone)
2339         drain_pages_zone(cpu, zone);
2340     else
2341         drain_pages(cpu);
2342 }
2343 
2344 /*
2345  * Spill all the per-cpu pages from all CPUs back into the buddy allocator.
2346  *
2347  * When zone parameter is non-NULL, spill just the single zone's pages.
2348  *
2349  * Note that this code is protected against sending an IPI to an offline
2350  * CPU but does not guarantee sending an IPI to newly hotplugged CPUs:
2351  * on_each_cpu_mask() blocks hotplug and won't talk to offlined CPUs but
2352  * nothing keeps CPUs from showing up after we populated the cpumask and
2353  * before the call to on_each_cpu_mask().
2354  */
2355 void drain_all_pages(struct zone *zone)
2356 {
2357     int cpu;
2358 
2359     /*
2360      * Allocate in the BSS so we won't require allocation in the
2361      * direct reclaim path for CONFIG_CPUMASK_OFFSTACK=y.
2362      */
2363     static cpumask_t cpus_with_pcps;
2364 
2365     /*
2366      * We don't care about racing with CPU hotplug events,
2367      * as the offline notification will cause the notified
2368      * CPU to drain its own pcps, and on_each_cpu_mask()
2369      * disables preemption as part of its processing.
2370      */
2371     for_each_online_cpu(cpu) {
2372         struct per_cpu_pageset *pcp;
2373         struct zone *z;
2374         bool has_pcps = false;
2375 
2376         if (zone) {
2377             pcp = per_cpu_ptr(zone->pageset, cpu);
2378             if (pcp->pcp.count)
2379                 has_pcps = true;
2380         } else {
2381             for_each_populated_zone(z) {
2382                 pcp = per_cpu_ptr(z->pageset, cpu);
2383                 if (pcp->pcp.count) {
2384                     has_pcps = true;
2385                     break;
2386                 }
2387             }
2388         }
2389 
2390         if (has_pcps)
2391             cpumask_set_cpu(cpu, &cpus_with_pcps);
2392         else
2393             cpumask_clear_cpu(cpu, &cpus_with_pcps);
2394     }
2395     on_each_cpu_mask(&cpus_with_pcps, (smp_call_func_t) drain_local_pages,
2396                                 zone, 1);
2397 }
2398 
2399 #ifdef CONFIG_HIBERNATION
2400 
2401 void mark_free_pages(struct zone *zone)
2402 {
2403     unsigned long pfn, max_zone_pfn;
2404     unsigned long flags;
2405     unsigned int order, t;
2406     struct page *page;
2407 
2408     if (zone_is_empty(zone))
2409         return;
2410 
2411     spin_lock_irqsave(&zone->lock, flags);
2412 
2413     max_zone_pfn = zone_end_pfn(zone);
2414     for (pfn = zone->zone_start_pfn; pfn < max_zone_pfn; pfn++)
2415         if (pfn_valid(pfn)) {
2416             page = pfn_to_page(pfn);
2417 
2418             if (page_zone(page) != zone)
2419                 continue;
2420 
2421             if (!swsusp_page_is_forbidden(page))
2422                 swsusp_unset_page_free(page);
2423         }
2424 
2425     for_each_migratetype_order(order, t) {
2426         list_for_each_entry(page,
2427                 &zone->free_area[order].free_list[t], lru) {
2428             unsigned long i;
2429 
2430             pfn = page_to_pfn(page);
2431             for (i = 0; i < (1UL << order); i++)
2432                 swsusp_set_page_free(pfn_to_page(pfn + i));
2433         }
2434     }
2435     spin_unlock_irqrestore(&zone->lock, flags);
2436 }
2437 #endif /* CONFIG_HIBERNATION */
2438 
2439 /*
2440  * Free a 0-order page
2441  * cold == true ? free a cold page : free a hot page
2442  */
2443 void free_hot_cold_page(struct page *page, bool cold)
2444 {
2445     struct zone *zone = page_zone(page);
2446     struct per_cpu_pages *pcp;
2447     unsigned long flags;
2448     unsigned long pfn = page_to_pfn(page);
2449     int migratetype;
2450 
2451     if (!free_pcp_prepare(page))
2452         return;
2453 
2454     migratetype = get_pfnblock_migratetype(page, pfn);
2455     set_pcppage_migratetype(page, migratetype);
2456     local_irq_save(flags);
2457     __count_vm_event(PGFREE);
2458 
2459     /*
2460      * We only track unmovable, reclaimable and movable on pcp lists.
2461      * Free ISOLATE pages back to the allocator because they are being
2462      * offlined, but treat HIGHATOMIC and CMA as movable pages so we can
2463      * get those areas back if necessary. Otherwise, we may have to free
2464      * excessively into the page allocator.
2465      */
2466     if (migratetype >= MIGRATE_PCPTYPES) {
2467         if (unlikely(is_migrate_isolate(migratetype))) {
2468             free_one_page(zone, page, pfn, 0, migratetype);
2469             goto out;
2470         }
2471         migratetype = MIGRATE_MOVABLE;
2472     }
2473 
2474     pcp = &this_cpu_ptr(zone->pageset)->pcp;
2475     if (!cold)
2476         list_add(&page->lru, &pcp->lists[migratetype]);
2477     else
2478         list_add_tail(&page->lru, &pcp->lists[migratetype]);
2479     pcp->count++;
2480     if (pcp->count >= pcp->high) {
2481         unsigned long batch = READ_ONCE(pcp->batch);
2482         free_pcppages_bulk(zone, batch, pcp);
2483         pcp->count -= batch;
2484     }
2485 
2486 out:
2487     local_irq_restore(flags);
2488 }
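/*
 * Example of the pcp trimming above with hypothetical values: if
 * pcp->high is 186 and pcp->batch is 31 (both are derived from the zone
 * size at boot), the 186th page freed to this CPU's list triggers
 * free_pcppages_bulk() for 31 pages, dropping pcp->count back to 155.
 */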
2489 
2490 /*
2491  * Free a list of 0-order pages
2492  */
2493 void free_hot_cold_page_list(struct list_head *list, bool cold)
2494 {
2495     struct page *page, *next;
2496 
2497     list_for_each_entry_safe(page, next, list, lru) {
2498         trace_mm_page_free_batched(page, cold);
2499         free_hot_cold_page(page, cold);
2500     }
2501 }
2502 
2503 /*
2504  * split_page takes a non-compound higher-order page, and splits it into
2505  * n (1 << order) sub-pages: page[0..n-1].
2506  * Each sub-page must be freed individually.
2507  *
2508  * Note: this is probably too low level an operation for use in drivers.
2509  * Please consult with lkml before using this in your driver.
2510  */
2511 void split_page(struct page *page, unsigned int order)
2512 {
2513     int i;
2514 
2515     VM_BUG_ON_PAGE(PageCompound(page), page);
2516     VM_BUG_ON_PAGE(!page_count(page), page);
2517 
2518 #ifdef CONFIG_KMEMCHECK
2519     /*
2520      * Split shadow pages too, because free(page[0]) would
2521      * otherwise free the whole shadow.
2522      */
2523     if (kmemcheck_page_is_tracked(page))
2524         split_page(virt_to_page(page[0].shadow), order);
2525 #endif
2526 
2527     for (i = 1; i < (1 << order); i++)
2528         set_page_refcounted(page + i);
2529     split_page_owner(page, order);
2530 }
2531 EXPORT_SYMBOL_GPL(split_page);
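/*
 * Usage sketch: after split_page(page, 2), the order-2 allocation becomes
 * four independent order-0 pages (page, page + 1, page + 2, page + 3),
 * each with its own reference count, and each must eventually be freed
 * with __free_page() on its own.
 */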
2532 
2533 int __isolate_free_page(struct page *page, unsigned int order)
2534 {
2535     unsigned long watermark;
2536     struct zone *zone;
2537     int mt;
2538 
2539     BUG_ON(!PageBuddy(page));
2540 
2541     zone = page_zone(page);
2542     mt = get_pageblock_migratetype(page);
2543 
2544     if (!is_migrate_isolate(mt)) {
2545         /*
2546          * Obey watermarks as if the page was being allocated. We can
2547          * emulate a high-order watermark check with a raised order-0
2548          * watermark, because we already know our high-order page
2549          * exists.
2550          */
2551         watermark = min_wmark_pages(zone) + (1UL << order);
2552         if (!zone_watermark_ok(zone, 0, watermark, 0, ALLOC_CMA))
2553             return 0;
2554 
2555         __mod_zone_freepage_state(zone, -(1UL << order), mt);
2556     }
2557 
2558     /* Remove page from free list */
2559     list_del(&page->lru);
2560     zone->free_area[order].nr_free--;
2561     rmv_page_order(page);
2562 
2563     /*
2564      * Set the pageblock migratetype to MIGRATE_MOVABLE if the isolated
2565      * page covers at least half of a pageblock.
2566      */
2567     if (order >= pageblock_order - 1) {
2568         struct page *endpage = page + (1 << order) - 1;
2569         for (; page < endpage; page += pageblock_nr_pages) {
2570             int mt = get_pageblock_migratetype(page);
2571             if (!is_migrate_isolate(mt) && !is_migrate_cma(mt)
2572                 && mt != MIGRATE_HIGHATOMIC)
2573                 set_pageblock_migratetype(page,
2574                               MIGRATE_MOVABLE);
2575         }
2576     }
2577 
2578 
2579     return 1UL << order;
2580 }
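/*
 * Worked example of the emulated watermark check above, with hypothetical
 * numbers: isolating an order-9 buddy (512 pages) from a zone whose
 * min_wmark is 1000 requires zone_watermark_ok() to pass for an order-0
 * request against a raised mark of 1000 + 512 = 1512 free pages.
 */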
2581 
2582 /*
2583  * Update NUMA hit/miss statistics
2584  *
2585  * Must be called with interrupts disabled.
2586  */
2587 static inline void zone_statistics(struct zone *preferred_zone, struct zone *z)
2588 {
2589 #ifdef CONFIG_NUMA
2590     enum zone_stat_item local_stat = NUMA_LOCAL;
2591 
2592     if (z->node != numa_node_id())
2593         local_stat = NUMA_OTHER;
2594 
2595     if (z->node == preferred_zone->node)
2596         __inc_zone_state(z, NUMA_HIT);
2597     else {
2598         __inc_zone_state(z, NUMA_MISS);
2599         __inc_zone_state(preferred_zone, NUMA_FOREIGN);
2600     }
2601     __inc_zone_state(z, local_stat);
2602 #endif
2603 }
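/*
 * Example of the accounting above: a task running on node 0 that prefers
 * node 0 but ends up with a page from a node 1 zone increments NUMA_MISS
 * and NUMA_OTHER on the node 1 zone and NUMA_FOREIGN on the preferred
 * node 0 zone; had the page come from node 0, NUMA_HIT and NUMA_LOCAL
 * would have been incremented there instead.
 */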
2604 
2605 /*
2606  * Allocate a page from the given zone. Use pcplists for order-0 allocations.
2607  */
2608 static inline
2609 struct page *buffered_rmqueue(struct zone *preferred_zone,
2610             struct zone *zone, unsigned int order,
2611             gfp_t gfp_flags, unsigned int alloc_flags,
2612             int migratetype)
2613 {
2614     unsigned long flags;
2615     struct page *page;
2616     bool cold = ((gfp_flags & __GFP_COLD) != 0);
2617 
2618     if (likely(order == 0)) {
2619         struct per_cpu_pages *pcp;
2620         struct list_head *list;
2621 
2622         local_irq_save(flags);
2623         do {
2624             pcp = &this_cpu_ptr(zone->pageset)->pcp;
2625             list = &pcp->lists[migratetype];
2626             if (list_empty(list)) {
2627                 pcp->count += rmqueue_bulk(zone, 0,
2628                         pcp->batch, list,
2629                         migratetype, cold);
2630                 if (unlikely(list_empty(list)))
2631                     goto failed;
2632             }
2633 
2634             if (cold)
2635                 page = list_last_entry(list, struct page, lru);
2636             else
2637                 page = list_first_entry(list, struct page, lru);
2638 
2639             list_del(&page->lru);
2640             pcp->count--;
2641 
2642         } while (check_new_pcp(page));
2643     } else {
2644         /*
2645          * We most definitely don't want callers attempting to
2646          * allocate greater than order-1 page units with __GFP_NOFAIL.
2647          */
2648         WARN_ON_ONCE((gfp_flags & __GFP_NOFAIL) && (order > 1));
2649         spin_lock_irqsave(&zone->lock, flags);
2650 
2651         do {
2652             page = NULL;
2653             if (alloc_flags & ALLOC_HARDER) {
2654                 page = __rmqueue_smallest(zone, order, MIGRATE_HIGHATOMIC);
2655                 if (page)
2656                     trace_mm_page_alloc_zone_locked(page, order, migratetype);
2657             }
2658             if (!page)
2659                 page = __rmqueue(zone, order, migratetype);
2660         } while (page && check_new_pages(page, order));
2661         spin_unlock(&zone->lock);
2662         if (!page)
2663             goto failed;
2664         __mod_zone_freepage_state(zone, -(1 << order),
2665                       get_pcppage_migratetype(page));
2666     }
2667 
2668     __count_zid_vm_events(PGALLOC, page_zonenum(page), 1 << order);
2669     zone_statistics(preferred_zone, zone);
2670     local_irq_restore(flags);
2671 
2672     VM_BUG_ON_PAGE(bad_range(zone, page), page);
2673     return page;
2674 
2675 failed:
2676     local_irq_restore(flags);
2677     return NULL;
2678 }
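/*
 * Summary of the two paths above: an order-0 request is served from this
 * CPU's pcp list without taking zone->lock (the list is refilled in
 * batches of pcp->batch under the lock when it runs empty), while any
 * higher-order request always takes zone->lock and goes through
 * __rmqueue_smallest()/__rmqueue() directly.
 */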
2679 
2680 #ifdef CONFIG_FAIL_PAGE_ALLOC
2681 
2682 static struct {
2683     struct fault_attr attr;
2684 
2685     bool ignore_gfp_highmem;
2686     bool ignore_gfp_reclaim;
2687     u32 min_order;
2688 } fail_page_alloc = {
2689     .attr = FAULT_ATTR_INITIALIZER,
2690     .ignore_gfp_reclaim = true,
2691     .ignore_gfp_highmem = true,
2692     .min_order = 1,
2693 };
2694 
2695 static int __init setup_fail_page_alloc(char *str)
2696 {
2697     return setup_fault_attr(&fail_page_alloc.attr, str);
2698 }
2699 __setup("fail_page_alloc=", setup_fail_page_alloc);
2700 
2701 static bool should_fail_alloc_page(gfp_t gfp_mask, unsigned int order)
2702 {
2703     if (order < fail_page_alloc.min_order)
2704         return false;
2705     if (gfp_mask & __GFP_NOFAIL)
2706         return false;
2707     if (fail_page_alloc.ignore_gfp_highmem && (gfp_mask & __GFP_HIGHMEM))
2708         return false;
2709     if (fail_page_alloc.ignore_gfp_reclaim &&
2710             (gfp_mask & __GFP_DIRECT_RECLAIM))
2711         return false;
2712 
2713     return should_fail(&fail_page_alloc.attr, 1 << order);
2714 }
2715 
2716 #ifdef CONFIG_FAULT_INJECTION_DEBUG_FS
2717 
2718 static int __init fail_page_alloc_debugfs(void)
2719 {
2720     umode_t mode = S_IFREG | S_IRUSR | S_IWUSR;
2721     struct dentry *dir;
2722 
2723     dir = fault_create_debugfs_attr("fail_page_alloc", NULL,
2724                     &fail_page_alloc.attr);
2725     if (IS_ERR(dir))
2726         return PTR_ERR(dir);
2727 
2728     if (!debugfs_create_bool("ignore-gfp-wait", mode, dir,
2729                 &fail_page_alloc.ignore_gfp_reclaim))
2730         goto fail;
2731     if (!debugfs_create_bool("ignore-gfp-highmem", mode, dir,
2732                 &fail_page_alloc.ignore_gfp_highmem))
2733         goto fail;
2734     if (!debugfs_create_u32("min-order", mode, dir,
2735                 &fail_page_alloc.min_order))
2736         goto fail;
2737 
2738     return 0;
2739 fail:
2740     debugfs_remove_recursive(dir);
2741 
2742     return -ENOMEM;
2743 }
2744 
2745 late_initcall(fail_page_alloc_debugfs);
2746 
2747 #endif /* CONFIG_FAULT_INJECTION_DEBUG_FS */
2748 
2749 #else /* CONFIG_FAIL_PAGE_ALLOC */
2750 
2751 static inline bool should_fail_alloc_page(gfp_t gfp_mask, unsigned int order)
2752 {
2753     return false;
2754 }
2755 
2756 #endif /* CONFIG_FAIL_PAGE_ALLOC */
2757 
2758 /*
2759  * Return true if free base pages are above 'mark'. For high-order checks it
2760  * will return true if the order-0 watermark is reached and there is at least
2761  * one free page of a suitable size. Checking now avoids taking the zone lock
2762  * to check in the allocation paths if no pages are free.
2763  */
2764 bool __zone_watermark_ok(struct zone *z, unsigned int order, unsigned long mark,
2765              int classzone_idx, unsigned int alloc_flags,
2766              long free_pages)
2767 {
2768     long min = mark;
2769     int o;
2770     const bool alloc_harder = (alloc_flags & ALLOC_HARDER);
2771 
2772     /* free_pages may go negative - that's OK */
2773     free_pages -= (1 << order) - 1;
2774 
2775     if (alloc_flags & ALLOC_HIGH)
2776         min -= min / 2;
2777 
2778     /*
2779      * If the caller does not have rights to ALLOC_HARDER then subtract
2780      * the high-atomic reserves. This will over-estimate the size of the
2781      * atomic reserve but it avoids a search.
2782      */
2783     if (likely(!alloc_harder))
2784         free_pages -= z->nr_reserved_highatomic;
2785     else
2786         min -= min / 4;
2787 
2788 #ifdef CONFIG_CMA
2789     /* If allocation can't use CMA areas don't use free CMA pages */
2790     if (!(alloc_flags & ALLOC_CMA))
2791         free_pages -= zone_page_state(z, NR_FREE_CMA_PAGES);
2792 #endif
2793 
2794     /*
2795      * Check watermarks for an order-0 allocation request. If these
2796      * are not met, then a high-order request also cannot go ahead
2797      * even if a suitable page happened to be free.
2798      */
2799     if (free_pages <= min + z->lowmem_reserve[classzone_idx])
2800         return false;
2801 
2802     /* If this is an order-0 request then the watermark is fine */
2803     if (!order)
2804         return true;
2805 
2806     /* For a high-order request, check at least one suitable page is free */
2807     for (o = order; o < MAX_ORDER; o++) {
2808         struct free_area *area = &z->free_area[o];
2809         int mt;
2810 
2811         if (!area->nr_free)
2812             continue;
2813 
2814         if (alloc_harder)
2815             return true;
2816 
2817         for (mt = 0; mt < MIGRATE_PCPTYPES; mt++) {
2818             if (!list_empty(&area->free_list[mt]))
2819                 return true;
2820         }
2821 
2822 #ifdef CONFIG_CMA
2823         if ((alloc_flags & ALLOC_CMA) &&
2824             !list_empty(&area->free_list[MIGRATE_CMA])) {
2825             return true;
2826         }
2827 #endif
2828     }
2829     return false;
2830 }
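/*
 * Worked example with hypothetical numbers: an order-2 request against a
 * mark of 1000, with 1200 free pages, no ALLOC_HIGH/ALLOC_HARDER, no
 * highatomic reserve and a zero lowmem reserve, first charges
 * (1 << 2) - 1 = 3 pages, leaving 1197. Since 1197 > 1000 the order-0
 * check passes, and the loop above then only needs to find one free
 * block of order 2 or higher on a suitable free list.
 */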
2831 
2832 bool zone_watermark_ok(struct zone *z, unsigned int order, unsigned long mark,
2833               int classzone_idx, unsigned int alloc_flags)
2834 {
2835     return __zone_watermark_ok(z, order, mark, classzone_idx, alloc_flags,
2836                     zone_page_state(z, NR_FREE_PAGES));
2837 }
2838 
2839 static inline bool zone_watermark_fast(struct zone *z, unsigned int order,
2840         unsigned long mark, int classzone_idx, unsigned int alloc_flags)
2841 {
2842     long free_pages = zone_page_state(z, NR_FREE_PAGES);
2843     long cma_pages = 0;
2844 
2845 #ifdef CONFIG_CMA
2846     /* If allocation can't use CMA areas don't use free CMA pages */
2847     if (!(alloc_flags & ALLOC_CMA))
2848         cma_pages = zone_page_state(z, NR_FREE_CMA_PAGES);
2849 #endif
2850 
2851     /*
2852      * Fast check for order-0 only. If this fails then the reserves
2853      * need to be calculated. There is a corner case where the check
2854      * passes but only the high-order atomic reserves are free. If
2855      * the caller is !atomic then it'll uselessly search the free
2856      * list. That corner case is then slower but it is harmless.
2857      */
2858     if (!order && (free_pages - cma_pages) > mark + z->lowmem_reserve[classzone_idx])
2859         return true;
2860 
2861     return __zone_watermark_ok(z, order, mark, classzone_idx, alloc_flags,
2862                     free_pages);
2863 }
2864 
2865 bool zone_watermark_ok_safe(struct zone *z, unsigned int order,
2866             unsigned long mark, int classzone_idx)
2867 {
2868     long free_pages = zone_page_state(z, NR_FREE_PAGES);
2869 
2870     if (z->percpu_drift_mark && free_pages < z->percpu_drift_mark)
2871         free_pages = zone_page_state_snapshot(z, NR_FREE_PAGES);
2872 
2873     return __zone_watermark_ok(z, order, mark, classzone_idx, 0,
2874                                 free_pages);
2875 }
2876 
2877 #ifdef CONFIG_NUMA
2878 static bool zone_allows_reclaim(struct zone *local_zone, struct zone *zone)
2879 {
2880     return node_distance(zone_to_nid(local_zone), zone_to_nid(zone)) <
2881                 RECLAIM_DISTANCE;
2882 }
2883 #else   /* CONFIG_NUMA */
2884 static bool zone_allows_reclaim(struct zone *local_zone, struct zone *zone)
2885 {
2886     return true;
2887 }
2888 #endif  /* CONFIG_NUMA */
2889 
2890 /*
2891  * get_page_from_freelist goes through the zonelist trying to allocate
2892  * a page.
2893  */
2894 static struct page *
2895 get_page_from_freelist(gfp_t gfp_mask, unsigned int order, int alloc_flags,
2896                         const struct alloc_context *ac)
2897 {
2898     struct zoneref *z = ac->preferred_zoneref;
2899     struct zone *zone;
2900     struct pglist_data *last_pgdat_dirty_limit = NULL;
2901 
2902     /*
2903      * Scan zonelist, looking for a zone with enough free.
2904      * See also __cpuset_node_allowed() comment in kernel/cpuset.c.
2905      */
2906     for_next_zone_zonelist_nodemask(zone, z, ac->zonelist, ac->high_zoneidx,
2907                                 ac->nodemask) {
2908         struct page *page;
2909         unsigned long mark;
2910 
2911         if (cpusets_enabled() &&
2912             (alloc_flags & ALLOC_CPUSET) &&
2913             !__cpuset_zone_allowed(zone, gfp_mask))
2914                 continue;
2915         /*
2916          * When allocating a page cache page for writing, we
2917          * want to get it from a node that is within its dirty
2918          * limit, such that no single node holds more than its
2919          * proportional share of globally allowed dirty pages.
2920          * The dirty limits take into account the node's
2921          * lowmem reserves and high watermark so that kswapd
2922          * should be able to balance it without having to
2923          * write pages from its LRU list.
2924          *
2925          * XXX: For now, allow allocations to potentially
2926          * exceed the per-node dirty limit in the slowpath
2927          * (spread_dirty_pages unset) before going into reclaim,
2928          * which is important when on a NUMA setup the allowed
2929          * nodes are together not big enough to reach the
2930          * global limit.  The proper fix for these situations
2931          * will require awareness of nodes in the
2932          * dirty-throttling and the flusher threads.
2933          */
2934         if (ac->spread_dirty_pages) {
2935             if (last_pgdat_dirty_limit == zone->zone_pgdat)
2936                 continue;
2937 
2938             if (!node_dirty_ok(zone->zone_pgdat)) {
2939                 last_pgdat_dirty_limit = zone->zone_pgdat;
2940                 continue;
2941             }
2942         }
2943 
2944         mark = zone->watermark[alloc_flags & ALLOC_WMARK_MASK];
2945         if (!zone_watermark_fast(zone, order, mark,
2946                        ac_classzone_idx(ac), alloc_flags)) {
2947             int ret;
2948 
2949             /* Checked here to keep the fast path fast */
2950             BUILD_BUG_ON(ALLOC_NO_WATERMARKS < NR_WMARK);
2951             if (alloc_flags & ALLOC_NO_WATERMARKS)
2952                 goto try_this_zone;
2953 
2954             if (node_reclaim_mode == 0 ||
2955                 !zone_allows_reclaim(ac->preferred_zoneref->zone, zone))
2956                 continue;
2957 
2958             ret = node_reclaim(zone->zone_pgdat, gfp_mask, order);
2959             switch (ret) {
2960             case NODE_RECLAIM_NOSCAN:
2961                 /* did not scan */
2962                 continue;
2963             case NODE_RECLAIM_FULL:
2964                 /* scanned but unreclaimable */
2965                 continue;
2966             default:
2967                 /* did we reclaim enough */
2968                 if (zone_watermark_ok(zone, order, mark,
2969                         ac_classzone_idx(ac), alloc_flags))
2970                     goto try_this_zone;
2971 
2972                 continue;
2973             }
2974         }
2975 
2976 try_this_zone:
2977         page = buffered_rmqueue(ac->preferred_zoneref->zone, zone, order,
2978                 gfp_mask, alloc_flags, ac->migratetype);
2979         if (page) {
2980             prep_new_page(page, order, gfp_mask, alloc_flags);
2981 
2982             /*
2983              * If this is a high-order atomic allocation then check
2984              * if the pageblock should be reserved for the future
2985              */
2986             if (unlikely(order && (alloc_flags & ALLOC_HARDER)))
2987                 reserve_highatomic_pageblock(page, zone, order);
2988 
2989             return page;
2990         }
2991     }
2992 
2993     return NULL;
2994 }
2995 
2996 /*
2997  * Large machines with many possible nodes should not always dump per-node
2998  * meminfo in irq context.
2999  */
3000 static inline bool should_suppress_show_mem(void)
3001 {
3002     bool ret = false;
3003 
3004 #if NODES_SHIFT > 8
3005     ret = in_interrupt();
3006 #endif
3007     return ret;
3008 }
3009 
3010 static DEFINE_RATELIMIT_STATE(nopage_rs,
3011         DEFAULT_RATELIMIT_INTERVAL,
3012         DEFAULT_RATELIMIT_BURST);
3013 
3014 void warn_alloc(gfp_t gfp_mask, const char *fmt, ...)
3015 {
3016     unsigned int filter = SHOW_MEM_FILTER_NODES;
3017     struct va_format vaf;
3018     va_list args;
3019 
3020     if ((gfp_mask & __GFP_NOWARN) || !__ratelimit(&nopage_rs) ||
3021         debug_guardpage_minorder() > 0)
3022         return;
3023 
3024     /*
3025      * This documents exceptions given to allocations in certain
3026      * contexts that are allowed to allocate outside current's set
3027      * of allowed nodes.
3028      */
3029     if (!(gfp_mask & __GFP_NOMEMALLOC))
3030         if (test_thread_flag(TIF_MEMDIE) ||
3031             (current->flags & (PF_MEMALLOC | PF_EXITING)))
3032             filter &= ~SHOW_MEM_FILTER_NODES;
3033     if (in_interrupt() || !(gfp_mask & __GFP_DIRECT_RECLAIM))
3034         filter &= ~SHOW_MEM_FILTER_NODES;
3035 
3036     pr_warn("%s: ", current->comm);
3037 
3038     va_start(args, fmt);
3039     vaf.fmt = fmt;
3040     vaf.va = &args;
3041     pr_cont("%pV", &vaf);
3042     va_end(args);
3043 
3044     pr_cont(", mode:%#x(%pGg)\n", gfp_mask, &gfp_mask);
3045 
3046     dump_stack();
3047     if (!should_suppress_show_mem())
3048         show_mem(filter);
3049 }
3050 
3051 static inline struct page *
3052 __alloc_pages_may_oom(gfp_t gfp_mask, unsigned int order,
3053     const struct alloc_context *ac, unsigned long *did_some_progress)
3054 {
3055     struct oom_control oc = {
3056         .zonelist = ac->zonelist,
3057         .nodemask = ac->nodemask,
3058         .memcg = NULL,
3059         .gfp_mask = gfp_mask,
3060         .order = order,
3061     };
3062     struct page *page;
3063 
3064     *did_some_progress = 0;
3065 
3066     /*
3067      * Acquire the oom lock.  If that fails, somebody else is
3068      * making progress for us.
3069      */
3070     if (!mutex_trylock(&oom_lock)) {
3071         *did_some_progress = 1;
3072         schedule_timeout_uninterruptible(1);
3073         return NULL;
3074     }
3075 
3076     /*
3077      * Go through the zonelist yet one more time, keeping a very high
3078      * watermark here. This is only to catch a parallel oom killing; we
3079      * must fail if we're still under heavy pressure.
3080      */
3081     page = get_page_from_freelist(gfp_mask | __GFP_HARDWALL, order,
3082                     ALLOC_WMARK_HIGH|ALLOC_CPUSET, ac);
3083     if (page)
3084         goto out;
3085 
3086     if (!(gfp_mask & __GFP_NOFAIL)) {
3087         /* Coredumps can quickly deplete all memory reserves */
3088         if (current->flags & PF_DUMPCORE)
3089             goto out;
3090         /* The OOM killer will not help higher order allocs */
3091         if (order > PAGE_ALLOC_COSTLY_ORDER)
3092             goto out;
3093         /* The OOM killer does not needlessly kill tasks for lowmem */
3094         if (ac->high_zoneidx < ZONE_NORMAL)
3095             goto out;
3096         if (pm_suspended_storage())
3097             goto out;
3098         /*
3099          * XXX: GFP_NOFS allocations should rather fail than rely on
3100          * other requests to make forward progress.
3101          * We are in an unfortunate situation where out_of_memory cannot
3102          * do much for this context but let's try it to at least get
3103          * access to memory reserved if the current task is killed (see
3104          * out_of_memory). Once filesystems are ready to handle allocation
3105          * failures more gracefully we should just bail out here.
3106          */
3107 
3108         /* The OOM killer may not free memory on a specific node */
3109         if (gfp_mask & __GFP_THISNODE)
3110             goto out;
3111     }
3112     /* Exhausted what can be done so it's blamo time */
3113     if (out_of_memory(&oc) || WARN_ON_ONCE(gfp_mask & __GFP_NOFAIL)) {
3114         *did_some_progress = 1;
3115 
3116         if (gfp_mask & __GFP_NOFAIL) {
3117             page = get_page_from_freelist(gfp_mask, order,
3118                     ALLOC_NO_WATERMARKS|ALLOC_CPUSET, ac);
3119             /*
3120              * fallback to ignore cpuset restriction if our nodes
3121              * are depleted
3122              */
3123             if (!page)
3124                 page = get_page_from_freelist(gfp_mask, order,
3125                     ALLOC_NO_WATERMARKS, ac);
3126         }
3127     }
3128 out:
3129     mutex_unlock(&oom_lock);
3130     return page;
3131 }
3132 
3133 /*
3134  * Maximum number of compaction retries with progress before the OOM
3135  * killer is considered the only way to move forward.
3136  */
3137 #define MAX_COMPACT_RETRIES 16
3138 
3139 #ifdef CONFIG_COMPACTION
3140 /* Try memory compaction for high-order allocations before reclaim */
3141 static struct page *
3142 __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,
3143         unsigned int alloc_flags, const struct alloc_context *ac,
3144         enum compact_priority prio, enum compact_result *compact_result)
3145 {
3146     struct page *page;
3147 
3148     if (!order)
3149         return NULL;
3150 
3151     current->flags |= PF_MEMALLOC;
3152     *compact_result = try_to_compact_pages(gfp_mask, order, alloc_flags, ac,
3153                                     prio);
3154     current->flags &= ~PF_MEMALLOC;
3155 
3156     if (*compact_result <= COMPACT_INACTIVE)
3157         return NULL;
3158 
3159     /*
3160      * At least in one zone compaction wasn't deferred or skipped, so let's
3161      * count a compaction stall
3162      */
3163     count_vm_event(COMPACTSTALL);
3164 
3165     page = get_page_from_freelist(gfp_mask, order, alloc_flags, ac);
3166 
3167     if (page) {
3168         struct zone *zone = page_zone(page);
3169 
3170         zone->compact_blockskip_flush = false;
3171         compaction_defer_reset(zone, order, true);
3172         count_vm_event(COMPACTSUCCESS);
3173         return page;
3174     }
3175 
3176     /*
3177      * It's bad if a compaction run occurs and fails. The most likely reason
3178      * is that pages exist, but not enough to satisfy watermarks.
3179      */
3180     count_vm_event(COMPACTFAIL);
3181 
3182     cond_resched();
3183 
3184     return NULL;
3185 }
3186 
3187 static inline bool
3188 should_compact_retry(struct alloc_context *ac, int order, int alloc_flags,
3189              enum compact_result compact_result,
3190              enum compact_priority *compact_priority,
3191              int *compaction_retries)
3192 {
3193     int max_retries = MAX_COMPACT_RETRIES;
3194     int min_priority;
3195 
3196     if (!order)
3197         return false;
3198 
3199     if (compaction_made_progress(compact_result))
3200         (*compaction_retries)++;
3201 
3202     /*
3203      * Compaction considers all the zones as desperately out of memory,
3204      * so it doesn't really make much sense to retry except when the
3205      * failure could be caused by insufficient priority.
3206      */
3207     if (compaction_failed(compact_result))
3208         goto check_priority;
3209 
3210     /*
3211      * Make sure the compaction wasn't deferred and didn't bail out early
3212      * due to lock contention before we declare that we should give up.
3213      * But do not retry if the given zonelist is not suitable for
3214      * compaction.
3215      */
3216     if (compaction_withdrawn(compact_result))
3217         return compaction_zonelist_suitable(ac, order, alloc_flags);
3218 
3219     /*
3220      * !costly requests are much more important than __GFP_REPEAT
3221      * costly ones because they are de facto nofail and invoke the OOM
3222      * killer to move on, while costly ones can fail and users are ready
3223      * to cope with that. 1/4 of the retries is rather arbitrary but we
3224      * would need much more detailed feedback from compaction to
3225      * make a better decision.
3226      */
3227     if (order > PAGE_ALLOC_COSTLY_ORDER)
3228         max_retries /= 4;
3229     if (*compaction_retries <= max_retries)
3230         return true;
3231 
3232     /*
3233      * Make sure there are attempts at the highest priority if we exhausted
3234      * all retries or failed at the lower priorities.
3235      */
3236 check_priority:
3237     min_priority = (order > PAGE_ALLOC_COSTLY_ORDER) ?
3238             MIN_COMPACT_COSTLY_PRIORITY : MIN_COMPACT_PRIORITY;
3239     if (*compact_priority > min_priority) {
3240         (*compact_priority)--;
3241         *compaction_retries = 0;
3242         return true;
3243     }
3244     return false;
3245 }
3246 #else
3247 static inline struct page *
3248 __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,
3249         unsigned int alloc_flags, const struct alloc_context *ac,
3250         enum compact_priority prio, enum compact_result *compact_result)
3251 {
3252     *compact_result = COMPACT_SKIPPED;
3253     return NULL;
3254 }
3255 
3256 static inline bool
3257 should_compact_retry(struct alloc_context *ac, unsigned int order, int alloc_flags,
3258              enum compact_result compact_result,
3259              enum compact_priority *compact_priority,
3260              int *compaction_retries)
3261 {
3262     struct zone *zone;
3263     struct zoneref *z;
3264 
3265     if (!order || order > PAGE_ALLOC_COSTLY_ORDER)
3266         return false;
3267 
3268     /*
3269      * There are setups with compaction disabled which would prefer to loop
3270      * inside the allocator rather than hit the oom killer prematurely.
3271      * Let's give them some hope and keep retrying while the order-0
3272      * watermarks are OK.
3273      */
3274     for_each_zone_zonelist_nodemask(zone, z, ac->zonelist, ac->high_zoneidx,
3275                     ac->nodemask) {
3276         if (zone_watermark_ok(zone, 0, min_wmark_pages(zone),
3277                     ac_classzone_idx(ac), alloc_flags))
3278             return true;
3279     }
3280     return false;
3281 }
3282 #endif /* CONFIG_COMPACTION */
3283 
3284 /* Perform direct synchronous page reclaim */
3285 static int
3286 __perform_reclaim(gfp_t gfp_mask, unsigned int order,
3287                     const struct alloc_context *ac)
3288 {
3289     struct reclaim_state reclaim_state;
3290     int progress;
3291 
3292     cond_resched();
3293 
3294     /* We now go into synchronous reclaim */
3295     cpuset_memory_pressure_bump();
3296     current->flags |= PF_MEMALLOC;
3297     lockdep_set_current_reclaim_state(gfp_mask);
3298     reclaim_state.reclaimed_slab = 0;
3299     current->reclaim_state = &reclaim_state;
3300 
3301     progress = try_to_free_pages(ac->zonelist, order, gfp_mask,
3302                                 ac->nodemask);
3303 
3304     current->reclaim_state = NULL;
3305     lockdep_clear_current_reclaim_state();
3306     current->flags &= ~PF_MEMALLOC;
3307 
3308     cond_resched();
3309 
3310     return progress;
3311 }
3312 
3313 /* The really slow allocator path where we enter direct reclaim */
3314 static inline struct page *
3315 __alloc_pages_direct_reclaim(gfp_t gfp_mask, unsigned int order,
3316         unsigned int alloc_flags, const struct alloc_context *ac,
3317         unsigned long *did_some_progress)
3318 {
3319     struct page *page = NULL;
3320     bool drained = false;
3321 
3322     *did_some_progress = __perform_reclaim(gfp_mask, order, ac);
3323     if (unlikely(!(*did_some_progress)))
3324         return NULL;
3325 
3326 retry:
3327     page = get_page_from_freelist(gfp_mask, order, alloc_flags, ac);
3328 
3329     /*
3330      * If an allocation failed after direct reclaim, it could be because
3331      * pages are pinned on the per-cpu lists or in high alloc reserves.
3332      * Shrink them and try again.
3333      */
3334     if (!page && !drained) {
3335         unreserve_highatomic_pageblock(ac, false);
3336         drain_all_pages(NULL);
3337         drained = true;
3338         goto retry;
3339     }
3340 
3341     return page;
3342 }
3343 
3344 static void wake_all_kswapds(unsigned int order, const struct alloc_context *ac)
3345 {
3346     struct zoneref *z;
3347     struct zone *zone;
3348     pg_data_t *last_pgdat = NULL;
3349 
3350     for_each_zone_zonelist_nodemask(zone, z, ac->zonelist,
3351                     ac->high_zoneidx, ac->nodemask) {
3352         if (last_pgdat != zone->zone_pgdat)
3353             wakeup_kswapd(zone, order, ac->high_zoneidx);
3354         last_pgdat = zone->zone_pgdat;
3355     }
3356 }
3357 
3358 static inline unsigned int
3359 gfp_to_alloc_flags(gfp_t gfp_mask)
3360 {
3361     unsigned int alloc_flags = ALLOC_WMARK_MIN | ALLOC_CPUSET;
3362 
3363     /* __GFP_HIGH is assumed to be the same as ALLOC_HIGH to save a branch. */
3364     BUILD_BUG_ON(__GFP_HIGH != (__force gfp_t) ALLOC_HIGH);
3365 
3366     /*
3367      * The caller may dip into page reserves a bit more if the caller
3368      * cannot run direct reclaim, or if the caller has realtime scheduling
3369      * policy or is asking for __GFP_HIGH memory.  GFP_ATOMIC requests will
3370      * set both ALLOC_HARDER (__GFP_ATOMIC) and ALLOC_HIGH (__GFP_HIGH).
3371      */
3372     alloc_flags |= (__force int) (gfp_mask & __GFP_HIGH);
3373 
3374     if (gfp_mask & __GFP_ATOMIC) {
3375         /*
3376          * Not worth trying to allocate harder for __GFP_NOMEMALLOC even
3377          * if it can't schedule.
3378          */
3379         if (!(gfp_mask & __GFP_NOMEMALLOC))
3380             alloc_flags |= ALLOC_HARDER;
3381         /*
3382          * Ignore cpuset mems for GFP_ATOMIC rather than fail, see the
3383          * comment for __cpuset_node_allowed().
3384          */
3385         alloc_flags &= ~ALLOC_CPUSET;
3386     } else if (unlikely(rt_task(current)) && !in_interrupt())
3387         alloc_flags |= ALLOC_HARDER;
3388 
3389 #ifdef CONFIG_CMA
3390     if (gfpflags_to_migratetype(gfp_mask) == MIGRATE_MOVABLE)
3391         alloc_flags |= ALLOC_CMA;
3392 #endif
3393     return alloc_flags;
3394 }
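
/*
 * Editorial worked example, not part of the original file: GFP_ATOMIC sets
 * __GFP_HIGH and __GFP_ATOMIC (and not __GFP_NOMEMALLOC), so for such a
 * request gfp_to_alloc_flags() above returns
 * ALLOC_WMARK_MIN | ALLOC_HIGH | ALLOC_HARDER with ALLOC_CPUSET cleared:
 * the atomic request may dip further into the reserves and is not
 * restricted to the current cpuset.
 */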
3395 
3396 bool gfp_pfmemalloc_allowed(gfp_t gfp_mask)
3397 {
3398     if (unlikely(gfp_mask & __GFP_NOMEMALLOC))
3399         return false;
3400 
3401     if (gfp_mask & __GFP_MEMALLOC)
3402         return true;
3403     if (in_serving_softirq() && (current->flags & PF_MEMALLOC))
3404         return true;
3405     if (!in_interrupt() &&
3406             ((current->flags & PF_MEMALLOC) ||
3407              unlikely(test_thread_flag(TIF_MEMDIE))))
3408         return true;
3409 
3410     return false;
3411 }
3412 
3413 /*
3414  * Maximum number of reclaim retries without any progress before the OOM
3415  * killer is considered the only way to move forward.
3416  */
3417 #define MAX_RECLAIM_RETRIES 16
3418 
3419 /*
3420  * Checks whether it makes sense to retry the reclaim to make forward progress
3421  * for the given allocation request.
3422  * The reclaim feedback represented by did_some_progress (any progress during
3423  * the last reclaim round) and no_progress_loops (number of reclaim rounds without
3424  * any progress in a row) is considered as well as the reclaimable pages on the
3425  * applicable zone list (with a backoff mechanism which is a function of
3426  * no_progress_loops).
3427  *
3428  * Returns true if a retry is viable or false to enter the oom path.
3429  */
3430 static inline bool
3431 should_reclaim_retry(gfp_t gfp_mask, unsigned order,
3432              struct alloc_context *ac, int alloc_flags,
3433              bool did_some_progress, int *no_progress_loops)
3434 {
3435     struct zone *zone;
3436     struct zoneref *z;
3437 
3438     /*
3439      * Costly allocations might have made progress, but this doesn't mean
3440      * their order will become available due to high fragmentation, so
3441      * always increment the no-progress counter for them.
3442      */
3443     if (did_some_progress && order <= PAGE_ALLOC_COSTLY_ORDER)
3444         *no_progress_loops = 0;
3445     else
3446         (*no_progress_loops)++;
3447 
3448     /*
3449      * Make sure we converge to OOM if we cannot make any progress
3450      * several times in a row.
3451      */
3452     if (*no_progress_loops > MAX_RECLAIM_RETRIES) {
3453         /* Before OOM, exhaust highatomic_reserve */
3454         return unreserve_highatomic_pageblock(ac, true);
3455     }
3456 
3457     /*
3458      * Keep reclaiming pages while there is a chance this will lead
3459      * somewhere.  If none of the target zones can satisfy our allocation
3460      * request even if all reclaimable pages are considered then we are
3461      * screwed and have to go OOM.
3462      */
3463     for_each_zone_zonelist_nodemask(zone, z, ac->zonelist, ac->high_zoneidx,
3464                     ac->nodemask) {
3465         unsigned long available;
3466         unsigned long reclaimable;
3467 
3468         available = reclaimable = zone_reclaimable_pages(zone);
3469         available -= DIV_ROUND_UP((*no_progress_loops) * available,
3470                       MAX_RECLAIM_RETRIES);
3471         available += zone_page_state_snapshot(zone, NR_FREE_PAGES);
3472 
3473         /*
3474          * Would the allocation succeed if we reclaimed the whole
3475          * available?
3476          */
3477         if (__zone_watermark_ok(zone, order, min_wmark_pages(zone),
3478                 ac_classzone_idx(ac), alloc_flags, available)) {
3479             /*
3480              * If we didn't make any progress and have a lot of
3481              * dirty + writeback pages then we should wait for
3482              * an IO to complete to slow down the reclaim and
3483              * prevent a premature OOM.
3484              */
3485             if (!did_some_progress) {
3486                 unsigned long write_pending;
3487 
3488                 write_pending = zone_page_state_snapshot(zone,
3489                             NR_ZONE_WRITE_PENDING);
3490 
3491                 if (2 * write_pending > reclaimable) {
3492                     congestion_wait(BLK_RW_ASYNC, HZ/10);
3493                     return true;
3494                 }
3495             }
3496 
3497             /*
3498              * Memory allocation/reclaim might be called from a WQ
3499              * context and the current implementation of the WQ
3500              * concurrency control doesn't recognize that
3501              * a particular WQ is congested if the worker thread is
3502              * looping without ever sleeping. Therefore we have to
3503              * do a short sleep here rather than calling
3504              * cond_resched().
3505              */
3506             if (current->flags & PF_WQ_WORKER)
3507                 schedule_timeout_uninterruptible(1);
3508             else
3509                 cond_resched();
3510 
3511             return true;
3512         }
3513     }
3514 
3515     return false;
3516 }
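
/*
 * Editorial worked example, not part of the original file: the backoff in
 * should_reclaim_retry() discounts
 * DIV_ROUND_UP(no_progress_loops * available, MAX_RECLAIM_RETRIES) pages.
 * With a hypothetical 1600 reclaimable pages, the 8th loop without progress
 * discounts DIV_ROUND_UP(8 * 1600, 16) == 800 pages, and by loop 16 the whole
 * reclaimable estimate is gone, leaving only NR_FREE_PAGES to satisfy the
 * watermark check.
 */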
3517 
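/*
 * Editorial summary, not part of the original file: the slow path below wakes
 * kswapd, retries the freelists with adjusted alloc_flags, and then falls
 * through direct compaction, direct reclaim and finally the OOM killer,
 * looping back via the retry/retry_cpuset labels for as long as
 * should_reclaim_retry() or should_compact_retry() report that forward
 * progress is still plausible.
 */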
3518 static inline struct page *
3519 __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order,
3520                         struct alloc_context *ac)
3521 {
3522     bool can_direct_reclaim = gfp_mask & __GFP_DIRECT_RECLAIM;
3523     struct page *page = NULL;
3524     unsigned int alloc_flags;
3525     unsigned long did_some_progress;
3526     enum compact_priority compact_priority;
3527     enum compact_result compact_result;
3528     int compaction_retries;
3529     int no_progress_loops;
3530     unsigned long alloc_start = jiffies;
3531     unsigned int stall_timeout = 10 * HZ;
3532     unsigned int cpuset_mems_cookie;
3533 
3534     /*
3535      * In the slowpath, we sanity check order to avoid ever trying to
3536      * reclaim >= MAX_ORDER areas which will never succeed. Callers may
3537      * be using allocators in order of preference for an area that is
3538      * too large.
3539      */
3540     if (order >= MAX_ORDER) {
3541         WARN_ON_ONCE(!(gfp_mask & __GFP_NOWARN));
3542         return NULL;
3543     }
3544 
3545     /*
3546      * We also sanity check to catch abuse of atomic reserves being used by
3547      * callers that are not in atomic context.
3548      */
3549     if (WARN_ON_ONCE((gfp_mask & (__GFP_ATOMIC|__GFP_DIRECT_RECLAIM)) ==
3550                 (__GFP_ATOMIC|__GFP_DIRECT_RECLAIM)))
3551         gfp_mask &= ~__GFP_ATOMIC;
3552 
3553 retry_cpuset:
3554     compaction_retries = 0;
3555     no_progress_loops = 0;
3556     compact_priority = DEF_COMPACT_PRIORITY;
3557     cpuset_mems_cookie = read_mems_allowed_begin();
3558     /*
3559      * We need to recalculate the starting point for the zonelist iterator
3560      * because we might have used a different nodemask in the fast path, or
3561      * there was a cpuset modification and we are retrying - otherwise we
3562      * could end up iterating over non-eligible zones endlessly.
3563      */
3564     ac->preferred_zoneref = first_zones_zonelist(ac->zonelist,
3565                     ac->high_zoneidx, ac->nodemask);
3566     if (!ac->preferred_zoneref->zone)
3567         goto nopage;
3568 
3569 
3570     /*
3571      * The fast path uses conservative alloc_flags to succeed only until
3572      * kswapd needs to be woken up, and to avoid the cost of setting up
3573      * alloc_flags precisely. So we do that now.
3574      */
3575     alloc_flags = gfp_to_alloc_flags(gfp_mask);
3576 
3577     if (gfp_mask & __GFP_KSWAPD_RECLAIM)
3578         wake_all_kswapds(order, ac);
3579 
3580     /*
3581      * The adjusted alloc_flags might result in immediate success, so try
3582      * that first
3583      */
3584     page = get_page_from_freelist(gfp_mask, order, alloc_flags, ac);
3585     if (page)
3586         goto got_pg;
3587 
3588     /*
3589      * For costly allocations, try direct compaction first, as it's likely
3590      * that we have enough base pages and don't need to reclaim. Don't try
3591      * that for allocations that are allowed to ignore watermarks, as the
3592      * ALLOC_NO_WATERMARKS attempt didn't yet happen.
3593      */
3594     if (can_direct_reclaim && order > PAGE_ALLOC_COSTLY_ORDER &&
3595         !gfp_pfmemalloc_allowed(gfp_mask)) {
3596         page = __alloc_pages_direct_compact(gfp_mask, order,
3597                         alloc_flags, ac,
3598                         INIT_COMPACT_PRIORITY,
3599                         &compact_result);
3600         if (page)
3601             goto got_pg;
3602 
3603         /*
3604          * Checks for costly allocations with __GFP_NORETRY, which
3605          * includes THP page fault allocations
3606          */
3607         if (gfp_mask & __GFP_NORETRY) {
3608             /*
3609              * If compaction is deferred for high-order allocations,
3610              * it is because sync compaction recently failed. If
3611              * this is the case and the caller requested a THP
3612              * allocation, we do not want to heavily disrupt the
3613              * system, so we fail the allocation instead of entering
3614              * direct reclaim.
3615              */
3616             if (compact_result == COMPACT_DEFERRED)
3617                 goto nopage;
3618 
3619             /*
3620              * Looks like reclaim/compaction is worth trying, but
3621              * sync compaction could be very expensive, so keep
3622              * using async compaction.
3623              */
3624             compact_priority = INIT_COMPACT_PRIORITY;
3625         }
3626     }
3627 
3628 retry:
3629     /* Ensure kswapd doesn't accidentally go to sleep as long as we loop */
3630     if (gfp_mask & __GFP_KSWAPD_RECLAIM)
3631         wake_all_kswapds(order, ac);
3632 
3633     if (gfp_pfmemalloc_allowed(gfp_mask))
3634         alloc_flags = ALLOC_NO_WATERMARKS;
3635 
3636     /*
3637      * Reset the zonelist iterators if memory policies can be ignored.
3638      * These allocations are high priority and system rather than user
3639      * oriented.
3640      */
3641     if (!(alloc_flags & ALLOC_CPUSET) || (alloc_flags & ALLOC_NO_WATERMARKS)) {
3642         ac->zonelist = node_zonelist(numa_node_id(), gfp_mask);
3643         ac->preferred_zoneref = first_zones_zonelist(ac->zonelist,
3644                     ac->high_zoneidx, ac->nodemask);
3645     }
3646 
3647     /* Attempt with potentially adjusted zonelist and alloc_flags */
3648     page = get_page_from_freelist(gfp_mask, order, alloc_flags, ac);
3649     if (page)
3650         goto got_pg;
3651 
3652     /* Caller is not willing to reclaim, we can't balance anything */
3653     if (!can_direct_reclaim) {
3654         /*
3655          * All existing users of __GFP_NOFAIL are blockable, so warn
3656          * of any new users that actually allow this type of allocation
3657          * to fail.
3658          */
3659         WARN_ON_ONCE(gfp_mask & __GFP_NOFAIL);
3660         goto nopage;
3661     }
3662 
3663     /* Avoid recursion of direct reclaim */
3664     if (current->flags & PF_MEMALLOC) {
3665         /*
3666          * A __GFP_NOFAIL request from this context is rather bizarre
3667          * because we cannot reclaim anything and can only loop waiting
3668          * for somebody to do some work for us.
3669          */
3670         if (WARN_ON_ONCE(gfp_mask & __GFP_NOFAIL)) {
3671             cond_resched();
3672             goto retry;
3673         }
3674         goto nopage;
3675     }
3676 
3677     /* Avoid allocations with no watermarks from looping endlessly */
3678     if (test_thread_flag(TIF_MEMDIE) && !(gfp_mask & __GFP_NOFAIL))
3679         goto nopage;
3680 
3681 
3682     /* Try direct reclaim and then allocating */
3683     page = __alloc_pages_direct_reclaim(gfp_mask, order, alloc_flags, ac,
3684                             &did_some_progress);
3685     if (page)
3686         goto got_pg;
3687 
3688     /* Try direct compaction and then allocating */
3689     page = __alloc_pages_direct_compact(gfp_mask, order, alloc_flags, ac,
3690                     compact_priority, &compact_result);
3691     if (page)
3692         goto got_pg;
3693 
3694     /* Do not loop if specifically requested */
3695     if (gfp_mask & __GFP_NORETRY)
3696         goto nopage;
3697 
3698     /*
3699      * Do not retry costly high order allocations unless they are
3700      * __GFP_REPEAT
3701      */
3702     if (order > PAGE_ALLOC_COSTLY_ORDER && !(gfp_mask & __GFP_REPEAT))
3703         goto nopage;
3704 
3705     /* Make sure we know about allocations which stall for too long */
3706     if (time_after(jiffies, alloc_start + stall_timeout)) {
3707         warn_alloc(gfp_mask,
3708             "page allocation stalls for %ums, order:%u",
3709             jiffies_to_msecs(jiffies-alloc_start), order);
3710         stall_timeout += 10 * HZ;
3711     }
3712 
3713     if (should_reclaim_retry(gfp_mask, order, ac, alloc_flags,
3714                  did_some_progress > 0, &no_progress_loops))
3715         goto retry;
3716 
3717     /*
3718      * It doesn't make any sense to retry for the compaction if the order-0
3719      * reclaim is not able to make any progress because the current
3720      * implementation of the compaction depends on the sufficient amount
3721      * of free memory (see __compaction_suitable)
3722      */
3723     if (did_some_progress > 0 &&
3724             should_compact_retry(ac, order, alloc_flags,
3725                 compact_result, &compact_priority,
3726                 &compaction_retries))
3727         goto retry;
3728 
3729     /*
3730      * It's possible we raced with cpuset update so the OOM would be
3731      * premature (see below the nopage: label for full explanation).
3732      */
3733     if (read_mems_allowed_retry(cpuset_mems_cookie))
3734         goto retry_cpuset;
3735 
3736     /* Reclaim has failed us, start killing things */
3737     page = __alloc_pages_may_oom(gfp_mask, order, ac, &did_some_progress);
3738     if (page)
3739         goto got_pg;
3740 
3741     /* Retry as long as the OOM killer is making progress */
3742     if (did_some_progress) {
3743         no_progress_loops = 0;
3744         goto retry;
3745     }
3746 
3747 nopage:
3748     /*
3749      * When updating a task's mems_allowed or mempolicy nodemask, it is
3750      * possible to race with parallel threads in such a way that our
3751      * allocation can fail while the mask is being updated. If we are about
3752      * to fail, check if the cpuset changed during allocation and if so,
3753      * retry.
3754      */
3755     if (read_mems_allowed_retry(cpuset_mems_cookie))
3756         goto retry_cpuset;
3757 
3758     warn_alloc(gfp_mask,
3759             "page allocation failure: order:%u", order);
3760 got_pg:
3761     return page;
3762 }
3763 
3764 /*
3765  * This is the 'heart' of the zoned buddy allocator.
3766  */
3767 struct page *
3768 __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order,
3769             struct zonelist *zonelist, nodemask_t *nodemask)
3770 {
3771     struct page *page;
3772     unsigned int alloc_flags = ALLOC_WMARK_LOW;
3773     gfp_t alloc_mask = gfp_mask; /* The gfp_t that was actually used for allocation */
3774     struct alloc_context ac = {
3775         .high_zoneidx = gfp_zone(gfp_mask),
3776         .zonelist = zonelist,
3777         .nodemask = nodemask,
3778         .migratetype = gfpflags_to_migratetype(gfp_mask),
3779     };
3780 
3781     if (cpusets_enabled()) {
3782         alloc_mask |= __GFP_HARDWALL;
3783         alloc_flags |= ALLOC_CPUSET;
3784         if (!ac.nodemask)
3785             ac.nodemask = &cpuset_current_mems_allowed;
3786     }
3787 
3788     gfp_mask &= gfp_allowed_mask;
3789 
3790     lockdep_trace_alloc(gfp_mask);
3791 
3792     might_sleep_if(gfp_mask & __GFP_DIRECT_RECLAIM);
3793 
3794     if (should_fail_alloc_page(gfp_mask, order))
3795         return NULL;
3796 
3797     /*
3798      * Check the zones suitable for the gfp_mask contain at least one
3799      * valid zone. It's possible to have an empty zonelist as a result
3800      * of __GFP_THISNODE and a memoryless node
3801      */
3802     if (unlikely(!zonelist->_zonerefs->zone))
3803         return NULL;
3804 
3805     if (IS_ENABLED(CONFIG_CMA) && ac.migratetype == MIGRATE_MOVABLE)
3806         alloc_flags |= ALLOC_CMA;
3807 
3808     /* Dirty zone balancing only done in the fast path */
3809     ac.spread_dirty_pages = (gfp_mask & __GFP_WRITE);
3810 
3811     /*
3812      * The preferred zone is used for statistics but crucially it is
3813      * also used as the starting point for the zonelist iterator. It
3814      * may get reset for allocations that ignore memory policies.
3815      */
3816     ac.preferred_zoneref = first_zones_zonelist(ac.zonelist,
3817                     ac.high_zoneidx, ac.nodemask);
3818     if (!ac.preferred_zoneref->zone) {
3819         page = NULL;
3820         /*
3821          * This might be due to race with cpuset_current_mems_allowed
3822          * update, so make sure we retry with original nodemask in the
3823          * slow path.
3824          */
3825         goto no_zone;
3826     }
3827 
3828     /* First allocation attempt */
3829     page = get_page_from_freelist(alloc_mask, order, alloc_flags, &ac);
3830     if (likely(page))
3831         goto out;
3832 
3833 no_zone:
3834     /*
3835      * Runtime PM, block IO and its error handling path can deadlock
3836      * because I/O on the device might not complete.
3837      */
3838     alloc_mask = memalloc_noio_flags(gfp_mask);
3839     ac.spread_dirty_pages = false;
3840 
3841     /*
3842      * Restore the original nodemask if it was potentially replaced with
3843      * &cpuset_current_mems_allowed to optimize the fast-path attempt.
3844      */
3845     if (unlikely(ac.nodemask != nodemask))
3846         ac.nodemask = nodemask;
3847 
3848     page = __alloc_pages_slowpath(alloc_mask, order, &ac);
3849 
3850 out:
3851     if (memcg_kmem_enabled() && (gfp_mask & __GFP_ACCOUNT) && page &&
3852         unlikely(memcg_kmem_charge(page, gfp_mask, order) != 0)) {
3853         __free_pages(page, order);
3854         page = NULL;
3855     }
3856 
3857     if (kmemcheck_enabled && page)
3858         kmemcheck_pagealloc_alloc(page, order, gfp_mask);
3859 
3860     trace_mm_page_alloc(page, order, alloc_mask, ac.migratetype);
3861 
3862     return page;
3863 }
3864 EXPORT_SYMBOL(__alloc_pages_nodemask);
3865 
3866 /*
3867  * Common helper functions.
3868  */
3869 unsigned long __get_free_pages(gfp_t gfp_mask, unsigned int order)
3870 {
3871     struct page *page;
3872 
3873     /*
3874      * __get_free_pages() returns a directly mapped kernel virtual address,
3875      * which cannot represent a highmem page.
3876      */
3877     VM_BUG_ON((gfp_mask & __GFP_HIGHMEM) != 0);
3878 
3879     page = alloc_pages(gfp_mask, order);
3880     if (!page)
3881         return 0;
3882     return (unsigned long) page_address(page);
3883 }
3884 EXPORT_SYMBOL(__get_free_pages);
3885 
3886 unsigned long get_zeroed_page(gfp_t gfp_mask)
3887 {
3888     return __get_free_pages(gfp_mask | __GFP_ZERO, 0);
3889 }
3890 EXPORT_SYMBOL(get_zeroed_page);
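
/*
 * Illustrative sketch, not part of the original file: a typical caller of the
 * helpers above. The function name and the order used here are hypothetical;
 * the point is that __get_free_pages() hands back a kernel virtual address
 * which must be released with free_pages() using the same order.
 */
static __maybe_unused void example_free_pages_usage(void)
{
	/* one order-1 (two page, physically contiguous) buffer */
	unsigned long buf = __get_free_pages(GFP_KERNEL, 1);

	if (!buf)
		return;

	/* ... use the 2 * PAGE_SIZE bytes at (void *)buf ... */

	free_pages(buf, 1);
}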
3891 
3892 void __free_pages(struct page *page, unsigned int order)
3893 {
3894     if (put_page_testzero(page)) {
3895         if (order == 0)
3896             free_hot_cold_page(page, false);
3897         else
3898             __free_pages_ok(page, order);
3899     }
3900 }
3901 
3902 EXPORT_SYMBOL(__free_pages);
3903 
3904 void free_pages(unsigned long addr, unsigned int order)
3905 {
3906     if (addr != 0) {
3907         VM_BUG_ON(!virt_addr_valid((void *)addr));
3908         __free_pages(virt_to_page((void *)addr), order);
3909     }
3910 }
3911 
3912 EXPORT_SYMBOL(free_pages);
3913 
3914 /*
3915  * Page Fragment:
3916  *  An arbitrary-length arbitrary-offset area of memory which resides
3917  *  within a 0 or higher order page.  Multiple fragments within that page
3918  *  are individually refcounted, in the page's reference counter.
3919  *
3920  * The page_frag functions below provide a simple allocation framework for
3921  * page fragments.  This is used by the network stack and network device
3922  * drivers to provide a backing region of memory for use as either an
3923  * sk_buff->head, or to be used in the "frags" portion of skb_shared_info.
3924  */
3925 static struct page *__page_frag_cache_refill(struct page_frag_cache *nc,
3926                          gfp_t gfp_mask)
3927 {
3928     struct page *page = NULL;
3929     gfp_t gfp = gfp_mask;
3930 
3931 #if (PAGE_SIZE < PAGE_FRAG_CACHE_MAX_SIZE)
3932     gfp_mask |= __GFP_COMP | __GFP_NOWARN | __GFP_NORETRY |
3933             __GFP_NOMEMALLOC;
3934     page = alloc_pages_node(NUMA_NO_NODE, gfp_mask,
3935                 PAGE_FRAG_CACHE_MAX_ORDER);
3936     nc->size = page ? PAGE_FRAG_CACHE_MAX_SIZE : PAGE_SIZE;
3937 #endif
3938     if (unlikely(!page))
3939         page = alloc_pages_node(NUMA_NO_NODE, gfp, 0);
3940 
3941     nc->va = page ? page_address(page) : NULL;
3942 
3943     return page;
3944 }
3945 
3946 void __page_frag_cache_drain(struct page *page, unsigned int count)
3947 {
3948     VM_BUG_ON_PAGE(page_ref_count(page) == 0, page);
3949 
3950     if (page_ref_sub_and_test(page, count)) {
3951         unsigned int order = compound_order(page);
3952 
3953         if (order == 0)
3954             free_hot_cold_page(page, false);
3955         else
3956             __free_pages_ok(page, order);
3957     }
3958 }
3959 EXPORT_SYMBOL(__page_frag_cache_drain);
3960 
3961 void *page_frag_alloc(struct page_frag_cache *nc,
3962               unsigned int fragsz, gfp_t gfp_mask)
3963 {
3964     unsigned int size = PAGE_SIZE;
3965     struct page *page;
3966     int offset;
3967 
3968     if (unlikely(!nc->va)) {
3969 refill:
3970         page = __page_frag_cache_refill(nc, gfp_mask);
3971         if (!page)
3972             return NULL;
3973 
3974 #if (PAGE_SIZE < PAGE_FRAG_CACHE_MAX_SIZE)
3975         /* if size can vary use size else just use PAGE_SIZE */
3976         size = nc->size;
3977 #endif
3978         /* Even if we own the page, we do not use atomic_set().
3979          * This would break get_page_unless_zero() users.
3980          */
3981         page_ref_add(page, size - 1);
3982 
3983         /* reset page count bias and offset to start of new frag */
3984         nc->pfmemalloc = page_is_pfmemalloc(page);
3985         nc->pagecnt_bias = size;
3986         nc->offset = size;
3987     }
3988 
3989     offset = nc->offset - fragsz;
3990     if (unlikely(offset < 0)) {
3991         page = virt_to_page(nc->va);
3992 
3993         if (!page_ref_sub_and_test(page, nc->pagecnt_bias))
3994             goto refill;
3995 
3996 #if (PAGE_SIZE < PAGE_FRAG_CACHE_MAX_SIZE)
3997         /* if size can vary use size else just use PAGE_SIZE */
3998         size = nc->size;
3999 #endif
4000         /* OK, page count is 0, we can safely set it */
4001         set_page_count(page, size);
4002 
4003         /* reset page count bias and offset to start of new frag */
4004         nc->pagecnt_bias = size;
4005         offset = size - fragsz;
4006     }
4007 
4008     nc->pagecnt_bias--;
4009     nc->offset = offset;
4010 
4011     return nc->va + offset;
4012 }
4013 EXPORT_SYMBOL(page_frag_alloc);
4014 
4015 /*
4016  * Frees a page fragment allocated out of either a compound or order 0 page.
4017  */
4018 void page_frag_free(void *addr)
4019 {
4020     struct page *page = virt_to_head_page(addr);
4021 
4022     if (unlikely(put_page_testzero(page)))
4023         __free_pages_ok(page, compound_order(page));
4024 }
4025 EXPORT_SYMBOL(page_frag_free);
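
/*
 * Illustrative sketch, not part of the original file: how a caller might carve
 * fragments out of a page_frag_cache. The cache pointer, fragment size and
 * function name are hypothetical.
 */
static __maybe_unused void example_page_frag_usage(struct page_frag_cache *nc)
{
	/* grab a 256 byte fragment; the cache refills its backing page as needed */
	void *frag = page_frag_alloc(nc, 256, GFP_ATOMIC);

	if (!frag)
		return;

	/* ... fill the fragment, e.g. hand it to an skb ... */

	/* drop one reference on the backing (possibly compound) page */
	page_frag_free(frag);
}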
4026 
4027 static void *make_alloc_exact(unsigned long addr, unsigned int order,
4028         size_t size)
4029 {
4030     if (addr) {
4031         unsigned long alloc_end = addr + (PAGE_SIZE << order);
4032         unsigned long used = addr + PAGE_ALIGN(size);
4033 
4034         split_page(virt_to_page((void *)addr), order);
4035         while (used < alloc_end) {
4036             free_page(used);
4037             used += PAGE_SIZE;
4038         }
4039     }
4040     return (void *)addr;
4041 }
4042 
4043 /**
4044  * alloc_pages_exact - allocate an exact number of physically-contiguous pages.
4045  * @size: the number of bytes to allocate
4046  * @gfp_mask: GFP flags for the allocation
4047  *
4048  * This function is similar to alloc_pages(), except that it allocates the
4049  * minimum number of pages to satisfy the request.  alloc_pages() can only
4050  * allocate memory in power-of-two pages.
4051  *
4052  * This function is also limited by MAX_ORDER.
4053  *
4054  * Memory allocated by this function must be released by free_pages_exact().
4055  */
4056 void *alloc_pages_exact(size_t size, gfp_t gfp_mask)
4057 {
4058     unsigned int order = get_order(size);
4059     unsigned long addr;
4060 
4061     addr = __get_free_pages(gfp_mask, order);
4062     return make_alloc_exact(addr, order, size);
4063 }
4064 EXPORT_SYMBOL(alloc_pages_exact);
4065 
4066 /**
4067  * alloc_pages_exact_nid - allocate an exact number of physically-contiguous
4068  *             pages on a node.
4069  * @nid: the preferred node ID where memory should be allocated
4070  * @size: the number of bytes to allocate
4071  * @gfp_mask: GFP flags for the allocation
4072  *
4073  * Like alloc_pages_exact(), but try to allocate on node nid first before falling
4074  * back.
4075  */
4076 void * __meminit alloc_pages_exact_nid(int nid, size_t size, gfp_t gfp_mask)
4077 {
4078     unsigned int order = get_order(size);
4079     struct page *p = alloc_pages_node(nid, gfp_mask, order);
4080     if (!p)
4081         return NULL;
4082     return make_alloc_exact((unsigned long)page_address(p), order, size);
4083 }
4084 
4085 /**
4086  * free_pages_exact - release memory allocated via alloc_pages_exact()
4087  * @virt: the value returned by alloc_pages_exact().
4088  * @size: size of allocation, same value as passed to alloc_pages_exact().
4089  *
4090  * Release the memory allocated by a previous call to alloc_pages_exact.
4091  */
4092 void free_pages_exact(void *virt, size_t size)
4093 {
4094     unsigned long addr = (unsigned long)virt;
4095     unsigned long end = addr + PAGE_ALIGN(size);
4096 
4097     while (addr < end) {
4098         free_page(addr);
4099         addr += PAGE_SIZE;
4100     }
4101 }
4102 EXPORT_SYMBOL(free_pages_exact);
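
/*
 * Illustrative sketch, not part of the original file: alloc_pages_exact()
 * rounds the request up to whole pages but gives back the unused tail of the
 * underlying power-of-two allocation, so the hypothetical 12 KiB request
 * below pins three pages rather than four (assuming 4 KiB pages).
 */
static __maybe_unused void example_alloc_pages_exact(void)
{
	void *buf = alloc_pages_exact(12 * 1024, GFP_KERNEL);

	if (!buf)
		return;

	/* ... use the physically contiguous 12 KiB at buf ... */

	free_pages_exact(buf, 12 * 1024);
}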
4103 
4104 /**
4105  * nr_free_zone_pages - count number of pages beyond high watermark
4106  * @offset: The zone index of the highest zone
4107  *
4108  * nr_free_zone_pages() counts the number of pages which are beyond the
4109  * high watermark within all zones at or below a given zone index.  For each
4110  * zone, the number of pages is calculated as:
4111  *     managed_pages - high_pages
4112  */
4113 static unsigned long nr_free_zone_pages(int offset)
4114 {
4115     struct zoneref *z;
4116     struct zone *zone;
4117 
4118     /* Just pick one node, since fallback list is circular */
4119     unsigned long sum = 0;
4120 
4121     struct zonelist *zonelist = node_zonelist(numa_node_id(), GFP_KERNEL);
4122 
4123     for_each_zone_zonelist(zone, z, zonelist, offset) {
4124         unsigned long size = zone->managed_pages;
4125         unsigned long high = high_wmark_pages(zone);
4126         if (size > high)
4127             sum += size - high;
4128     }
4129 
4130     return sum;
4131 }
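
/*
 * Editorial worked example, not part of the original file: for a single zone
 * with 1,000,000 managed pages and a high watermark of 20,000 pages,
 * nr_free_zone_pages() adds 1,000,000 - 20,000 = 980,000 pages; a zone whose
 * high watermark is at or above its managed size contributes nothing.
 */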
4132 
4133 /**
4134  * nr_free_buffer_pages - count number of pages beyond high watermark
4135  *
4136  * nr_free_buffer_pages() counts the number of pages which are beyond the high
4137  * watermark within ZONE_DMA and ZONE_NORMAL.
4138  */
4139 unsigned long nr_free_buffer_pages(void)
4140 {
4141     return nr_free_zone_pages(gfp_zone(GFP_USER));
4142 }
4143 EXPORT_SYMBOL_GPL(nr_free_buffer_pages);
4144 
4145 /**
4146  * nr_free_pagecache_pages - count number of pages beyond high watermark
4147  *
4148  * nr_free_pagecache_pages() counts the number of pages which are beyond the
4149  * high watermark within all zones.
4150  */
4151 unsigned long nr_free_pagecache_pages(void)
4152 {
4153     return nr_free_zone_pages(gfp_zone(GFP_HIGHUSER_MOVABLE));
4154 }
4155 
4156 static inline void show_node(struct zone *zone)
4157 {
4158     if (IS_ENABLED(CONFIG_NUMA))
4159         printk("Node %d ", zone_to_nid(zone));
4160 }
4161 
4162 long si_mem_available(void)
4163 {
4164     long available;
4165     unsigned long pagecache;
4166     unsigned long wmark_low = 0;
4167     unsigned long pages[NR_LRU_LISTS];
4168     struct zone *zone;
4169     int lru;
4170 
4171     for (lru = LRU_BASE; lru < NR_LRU_LISTS; lru++)
4172         pages[lru] = global_node_page_state(NR_LRU_BASE + lru);
4173 
4174     for_each_zone(zone)
4175         wmark_low += zone->watermark[WMARK_LOW];
4176 
4177     /*
4178      * Estimate the amount of memory available for userspace allocations,
4179      * without causing swapping.
4180      */
4181     available = global_page_state(NR_FREE_PAGES) - totalreserve_pages;
4182 
4183     /*
4184      * Not all the page cache can be freed, otherwise the system will
4185      * start swapping. Assume at least half of the page cache, or the
4186      * low watermark worth of cache, needs to stay.
4187      */
4188     pagecache = pages[LRU_ACTIVE_FILE] + pages[LRU_INACTIVE_FILE];
4189     pagecache -= min(pagecache / 2, wmark_low);
4190     available += pagecache;
4191 
4192     /*
4193      * Part of the reclaimable slab consists of items that are in use,
4194      * and cannot be freed. Cap this estimate at the low watermark.
4195      */
4196     available += global_page_state(NR_SLAB_RECLAIMABLE) -
4197              min(global_page_state(NR_SLAB_RECLAIMABLE) / 2, wmark_low);
4198 
4199     if (available < 0)
4200         available = 0;
4201     return available;
4202 }
4203 EXPORT_SYMBOL_GPL(si_mem_available);
4204 
4205 void si_meminfo(struct sysinfo *val)
4206 {
4207     val->totalram = totalram_pages;
4208     val->sharedram = global_node_page_state(NR_SHMEM);
4209     val->freeram = global_page_state(NR_FREE_PAGES);
4210     val->bufferram = nr_blockdev_pages();
4211     val->totalhigh = totalhigh_pages;
4212     val->freehigh = nr_free_highpages();
4213     val->mem_unit = PAGE_SIZE;
4214 }
4215 
4216 EXPORT_SYMBOL(si_meminfo);
4217 
4218 #ifdef CONFIG_NUMA
4219 void si_meminfo_node(struct sysinfo *val, int nid)
4220 {
4221     int zone_type;      /* needs to be signed */
4222     unsigned long managed_pages = 0;
4223     unsigned long managed_highpages = 0;
4224     unsigned long free_highpages = 0;
4225     pg_data_t *pgdat = NODE_DATA(nid);
4226 
4227     for (zone_type = 0; zone_type < MAX_NR_ZONES; zone_type++)
4228         managed_pages += pgdat->node_zones[zone_type].managed_pages;
4229     val->totalram = managed_pages;
4230     val->sharedram = node_page_state(pgdat, NR_SHMEM);
4231     val->freeram = sum_zone_node_page_state(nid, NR_FREE_PAGES);
4232 #ifdef CONFIG_HIGHMEM
4233     for (zone_type = 0; zone_type < MAX_NR_ZONES; zone_type++) {
4234         struct zone *zone = &pgdat->node_zones[zone_type];
4235 
4236         if (is_highmem(zone)) {
4237             managed_highpages += zone->managed_pages;
4238             free_highpages += zone_page_state(zone, NR_FREE_PAGES);
4239         }
4240     }
4241     val->totalhigh = managed_highpages;
4242     val->freehigh = free_highpages;
4243 #else
4244     val->totalhigh = managed_highpages;
4245     val->freehigh = free_highpages;
4246 #endif
4247     val->mem_unit = PAGE_SIZE;
4248 }
4249 #endif
4250 
4251 /*
4252  * Determine whether the node should be skipped or not, depending on whether
4253  * SHOW_MEM_FILTER_NODES was passed to show_free_areas().
4254  */
4255 bool skip_free_areas_node(unsigned int flags, int nid)
4256 {
4257     bool ret = false;
4258     unsigned int cpuset_mems_cookie;
4259 
4260     if (!(flags & SHOW_MEM_FILTER_NODES))
4261         goto out;
4262 
4263     do {
4264         cpuset_mems_cookie = read_mems_allowed_begin();
4265         ret = !node_isset(nid, cpuset_current_mems_allowed);
4266     } while (read_mems_allowed_retry(cpuset_mems_cookie));
4267 out:
4268     return ret;
4269 }
4270 
4271 #define K(x) ((x) << (PAGE_SHIFT-10))
4272 
4273 static void show_migration_types(unsigned char type)
4274 {
4275     static const char types[MIGRATE_TYPES] = {
4276         [MIGRATE_UNMOVABLE] = 'U',
4277         [MIGRATE_MOVABLE]   = 'M',
4278         [MIGRATE_RECLAIMABLE]   = 'E',
4279         [MIGRATE_HIGHATOMIC]    = 'H',
4280 #ifdef CONFIG_CMA
4281         [MIGRATE_CMA]       = 'C',
4282 #endif
4283 #ifdef CONFIG_MEMORY_ISOLATION
4284         [MIGRATE_ISOLATE]   = 'I',
4285 #endif
4286     };
4287     char tmp[MIGRATE_TYPES + 1];
4288     char *p = tmp;
4289     int i;
4290 
4291     for (i = 0; i < MIGRATE_TYPES; i++) {
4292         if (type & (1 << i))
4293             *p++ = types[i];
4294     }
4295 
4296     *p = '\0';
4297     printk(KERN_CONT "(%s) ", tmp);
4298 }
4299 
4300 /*
4301  * Show free area list (used inside shift_scroll-lock stuff)
4302  * We also calculate the percentage fragmentation. We do this by counting the
4303  * memory on each free list with the exception of the first item on the list.
4304  *
4305  * Bits in @filter:
4306  * SHOW_MEM_FILTER_NODES: suppress nodes that are not allowed by current's
4307  *   cpuset.
4308  */
4309 void show_free_areas(unsigned int filter)
4310 {
4311     unsigned long free_pcp = 0;
4312     int cpu;
4313     struct zone *zone;
4314     pg_data_t *pgdat;
4315 
4316     for_each_populated_zone(zone) {
4317         if (skip_free_areas_node(filter, zone_to_nid(zone)))
4318             continue;
4319 
4320         for_each_online_cpu(cpu)
4321             free_pcp += per_cpu_ptr(zone->pageset, cpu)->pcp.count;
4322     }
4323 
4324     printk("active_anon:%lu inactive_anon:%lu isolated_anon:%lu\n"
4325         " active_file:%lu inactive_file:%lu isolated_file:%lu\n"
4326         " unevictable:%lu dirty:%lu writeback:%lu unstable:%lu\n"
4327         " slab_reclaimable:%lu slab_unreclaimable:%lu\n"
4328         " mapped:%lu shmem:%lu pagetables:%lu bounce:%lu\n"
4329         " free:%lu free_pcp:%lu free_cma:%lu\n",
4330         global_node_page_state(NR_ACTIVE_ANON),
4331         global_node_page_state(NR_INACTIVE_ANON),
4332         global_node_page_state(NR_ISOLATED_ANON),
4333         global_node_page_state(NR_ACTIVE_FILE),
4334         global_node_page_state(NR_INACTIVE_FILE),
4335         global_node_page_state(NR_ISOLATED_FILE),
4336         global_node_page_state(NR_UNEVICTABLE),
4337         global_node_page_state(NR_FILE_DIRTY),
4338         global_node_page_state(NR_WRITEBACK),
4339         global_node_page_state(NR_UNSTABLE_NFS),
4340         global_page_state(NR_SLAB_RECLAIMABLE),
4341         global_page_state(NR_SLAB_UNRECLAIMABLE),
4342         global_node_page_state(NR_FILE_MAPPED),
4343         global_node_page_state(NR_SHMEM),
4344         global_page_state(NR_PAGETABLE),
4345         global_page_state(NR_BOUNCE),
4346         global_page_state(NR_FREE_PAGES),
4347         free_pcp,
4348         global_page_state(NR_FREE_CMA_PAGES));
4349 
4350     for_each_online_pgdat(pgdat) {
4351         printk("Node %d"
4352             " active_anon:%lukB"
4353             " inactive_anon:%lukB"
4354             " active_file:%lukB"
4355             " inactive_file:%lukB"
4356             " unevictable:%lukB"
4357             " isolated(anon):%lukB"
4358             " isolated(file):%lukB"
4359             " mapped:%lukB"
4360             " dirty:%lukB"
4361             " writeback:%lukB"
4362             " shmem:%lukB"
4363 #ifdef CONFIG_TRANSPARENT_HUGEPAGE
4364             " shmem_thp: %lukB"
4365             " shmem_pmdmapped: %lukB"
4366             " anon_thp: %lukB"
4367 #endif
4368             " writeback_tmp:%lukB"
4369             " unstable:%lukB"
4370             " pages_scanned:%lu"
4371             " all_unreclaimable? %s"
4372             "\n",
4373             pgdat->node_id,
4374             K(node_page_state(pgdat, NR_ACTIVE_ANON)),
4375             K(node_page_state(pgdat, NR_INACTIVE_ANON)),
4376             K(node_page_state(pgdat, NR_ACTIVE_FILE)),
4377             K(node_page_state(pgdat, NR_INACTIVE_FILE)),
4378             K(node_page_state(pgdat, NR_UNEVICTABLE)),
4379             K(node_page_state(pgdat, NR_ISOLATED_ANON)),
4380             K(node_page_state(pgdat, NR_ISOLATED_FILE)),
4381             K(node_page_state(pgdat, NR_FILE_MAPPED)),
4382             K(node_page_state(pgdat, NR_FILE_DIRTY)),
4383             K(node_page_state(pgdat, NR_WRITEBACK)),
4384 #ifdef CONFIG_TRANSPARENT_HUGEPAGE
4385             K(node_page_state(pgdat, NR_SHMEM_THPS) * HPAGE_PMD_NR),
4386             K(node_page_state(pgdat, NR_SHMEM_PMDMAPPED)
4387                     * HPAGE_PMD_NR),
4388             K(node_page_state(pgdat, NR_ANON_THPS) * HPAGE_PMD_NR),
4389 #endif
4390             K(node_page_state(pgdat, NR_SHMEM)),
4391             K(node_page_state(pgdat, NR_WRITEBACK_TEMP)),
4392             K(node_page_state(pgdat, NR_UNSTABLE_NFS)),
4393             node_page_state(pgdat, NR_PAGES_SCANNED),
4394             !pgdat_reclaimable(pgdat) ? "yes" : "no");
4395     }
4396 
4397     for_each_populated_zone(zone) {
4398         int i;
4399 
4400         if (skip_free_areas_node(filter, zone_to_nid(zone)))
4401             continue;
4402 
4403         free_pcp = 0;
4404         for_each_online_cpu(cpu)
4405             free_pcp += per_cpu_ptr(zone->pageset, cpu)->pcp.count;
4406 
4407         show_node(zone);
4408         printk(KERN_CONT
4409             "%s"
4410             " free:%lukB"
4411             " min:%lukB"
4412             " low:%lukB"
4413             " high:%lukB"
4414             " active_anon:%lukB"
4415             " inactive_anon:%lukB"
4416             " active_file:%lukB"
4417             " inactive_file:%lukB"
4418             " unevictable:%lukB"
4419             " writepending:%lukB"
4420             " present:%lukB"
4421             " managed:%lukB"
4422             " mlocked:%lukB"
4423             " slab_reclaimable:%lukB"
4424             " slab_unreclaimable:%lukB"
4425             " kernel_stack:%lukB"
4426             " pagetables:%lukB"
4427             " bounce:%lukB"
4428             " free_pcp:%lukB"
4429             " local_pcp:%ukB"
4430             " free_cma:%lukB"
4431             "\n",
4432             zone->name,
4433             K(zone_page_state(zone, NR_FREE_PAGES)),
4434             K(min_wmark_pages(zone)),
4435             K(low_wmark_pages(zone)),
4436             K(high_wmark_pages(zone)),
4437             K(zone_page_state(zone, NR_ZONE_ACTIVE_ANON)),
4438             K(zone_page_state(zone, NR_ZONE_INACTIVE_ANON)),
4439             K(zone_page_state(zone, NR_ZONE_ACTIVE_FILE)),
4440             K(zone_page_state(zone, NR_ZONE_INACTIVE_FILE)),
4441             K(zone_page_state(zone, NR_ZONE_UNEVICTABLE)),
4442             K(zone_page_state(zone, NR_ZONE_WRITE_PENDING)),
4443             K(zone->present_pages),
4444             K(zone->managed_pages),
4445             K(zone_page_state(zone, NR_MLOCK)),
4446             K(zone_page_state(zone, NR_SLAB_RECLAIMABLE)),
4447             K(zone_page_state(zone, NR_SLAB_UNRECLAIMABLE)),
4448             zone_page_state(zone, NR_KERNEL_STACK_KB),
4449             K(zone_page_state(zone, NR_PAGETABLE)),
4450             K(zone_page_state(zone, NR_BOUNCE)),
4451             K(free_pcp),
4452             K(this_cpu_read(zone->pageset->pcp.count)),
4453             K(zone_page_state(zone, NR_FREE_CMA_PAGES)));
4454         printk("lowmem_reserve[]:");
4455         for (i = 0; i < MAX_NR_ZONES; i++)
4456             printk(KERN_CONT " %ld", zone->lowmem_reserve[i]);
4457         printk(KERN_CONT "\n");
4458     }
4459 
4460     for_each_populated_zone(zone) {
4461         unsigned int order;
4462         unsigned long nr[MAX_ORDER], flags, total = 0;
4463         unsigned char types[MAX_ORDER];
4464 
4465         if (skip_free_areas_node(filter, zone_to_nid(zone)))
4466             continue;
4467         show_node(zone);
4468         printk(KERN_CONT "%s: ", zone->name);
4469 
4470         spin_lock_irqsave(&zone->lock, flags);
4471         for (order = 0; order < MAX_ORDER; order++) {
4472             struct free_area *area = &zone->free_area[order];
4473             int type;
4474 
4475             nr[order] = area->nr_free;
4476             total += nr[order] << order;
4477 
4478             types[order] = 0;
4479             for (type = 0; type < MIGRATE_TYPES; type++) {
4480                 if (!list_empty(&area->free_list[type]))
4481                     types[order] |= 1 << type;
4482             }
4483         }
4484         spin_unlock_irqrestore(&zone->lock, flags);
4485         for (order = 0; order < MAX_ORDER; order++) {
4486             printk(KERN_CONT "%lu*%lukB ",
4487                    nr[order], K(1UL) << order);
4488             if (nr[order])
4489                 show_migration_types(types[order]);
4490         }
4491         printk(KERN_CONT "= %lukB\n", K(total));
4492     }
4493 
4494     hugetlb_show_meminfo();
4495 
4496     printk("%ld total pagecache pages\n", global_node_page_state(NR_FILE_PAGES));
4497 
4498     show_swap_cache_info();
4499 }
4500 
4501 static void zoneref_set_zone(struct zone *zone, struct zoneref *zoneref)
4502 {
4503     zoneref->zone = zone;
4504     zoneref->zone_idx = zone_idx(zone);
4505 }
4506 
4507 /*
4508  * Builds allocation fallback zone lists.
4509  *
4510  * Add all populated zones of a node to the zonelist.
4511  */
4512 static int build_zonelists_node(pg_data_t *pgdat, struct zonelist *zonelist,
4513                 int nr_zones)
4514 {
4515     struct zone *zone;
4516     enum zone_type zone_type = MAX_NR_ZONES;
4517 
4518     do {
4519         zone_type--;
4520         zone = pgdat->node_zones + zone_type;
4521         if (managed_zone(zone)) {
4522             zoneref_set_zone(zone,
4523                 &zonelist->_zonerefs[nr_zones++]);
4524             check_highest_zone(zone_type);
4525         }
4526     } while (zone_type);
4527 
4528     return nr_zones;
4529 }
4530 
4531 
4532 /*
4533  *  zonelist_order:
4534  *  0 = automatic detection of better ordering.
4535  *  1 = order by ([node] distance, -zonetype)
4536  *  2 = order by (-zonetype, [node] distance)
4537  *
4538  *  If not NUMA, ZONELIST_ORDER_ZONE and ZONELIST_ORDER_NODE will create
4539  *  the same zonelist. So only NUMA can configure this param.
4540  */
4541 #define ZONELIST_ORDER_DEFAULT  0
4542 #define ZONELIST_ORDER_NODE     1
4543 #define ZONELIST_ORDER_ZONE     2
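
/*
 * Editorial example, not part of the original file: on a hypothetical two-node
 * machine with Normal and DMA32 zones, node ordering builds node 0's fallback
 * list as Normal0, DMA32_0, Normal1, DMA32_1, while zone ordering builds it as
 * Normal0, Normal1, DMA32_0, DMA32_1, i.e. every Normal zone is exhausted
 * before any DMA32 zone is touched.
 */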
4544 
4545 /* zonelist order in the kernel.
4546  * set_zonelist_order() will set this to NODE or ZONE.
4547  */
4548 static int current_zonelist_order = ZONELIST_ORDER_DEFAULT;
4549 static char zonelist_order_name[3][8] = {"Default", "Node", "Zone"};
4550 
4551 
4552 #ifdef CONFIG_NUMA
4553 /* The zonelist order the user specified; may be changed by boot option or sysctl */
4554 static int user_zonelist_order = ZONELIST_ORDER_DEFAULT;
4555 /* string for sysctl */
4556 #define NUMA_ZONELIST_ORDER_LEN 16
4557 char numa_zonelist_order[16] = "default";
4558 
4559 /*
4560  * interface to configure zonelist ordering.
4561  * command line option "numa_zonelist_order"
4562  *  = "[dD]efault"  - default, automatic configuration
4563  *  = "[nN]ode"     - order by node locality, then by zone within node
4564  *  = "[zZ]one"     - order by zone, then by locality within zone
4565  */
4566 
4567 static int __parse_numa_zonelist_order(char *s)
4568 {
4569     if (*s == 'd' || *s == 'D') {
4570         user_zonelist_order = ZONELIST_ORDER_DEFAULT;
4571     } else if (*s == 'n' || *s == 'N') {
4572         user_zonelist_order = ZONELIST_ORDER_NODE;
4573     } else if (*s == 'z' || *s == 'Z') {
4574         user_zonelist_order = ZONELIST_ORDER_ZONE;
4575     } else {
4576         pr_warn("Ignoring invalid numa_zonelist_order value:  %s\n", s);
4577         return -EINVAL;
4578     }
4579     return 0;
4580 }
4581 
4582 static __init int setup_numa_zonelist_order(char *s)
4583 {
4584     int ret;
4585 
4586     if (!s)
4587         return 0;
4588 
4589     ret = __parse_numa_zonelist_order(s);
4590     if (ret == 0)
4591         strlcpy(numa_zonelist_order, s, NUMA_ZONELIST_ORDER_LEN);
4592 
4593     return ret;
4594 }
4595 early_param("numa_zonelist_order", setup_numa_zonelist_order);
4596 
4597 /*
4598  * sysctl handler for numa_zonelist_order
4599  */
4600 int numa_zonelist_order_handler(struct ctl_table *table, int write,
4601         void __user *buffer, size_t *length,
4602         loff_t *ppos)
4603 {
4604     char saved_string[NUMA_ZONELIST_ORDER_LEN];
4605     int ret;
4606     static DEFINE_MUTEX(zl_order_mutex);
4607 
4608     mutex_lock(&zl_order_mutex);
4609     if (write) {
4610         if (strlen((char *)table->data) >= NUMA_ZONELIST_ORDER_LEN) {
4611             ret = -EINVAL;
4612             goto out;
4613         }
4614         strcpy(saved_string, (char *)table->data);
4615     }
4616     ret = proc_dostring(table, write, buffer, length, ppos);
4617     if (ret)
4618         goto out;
4619     if (write) {
4620         int oldval = user_zonelist_order;
4621 
4622         ret = __parse_numa_zonelist_order((char *)table->data);
4623         if (ret) {
4624             /*
4625              * bogus value.  restore saved string
4626              */
4627             strncpy((char *)table->data, saved_string,
4628                 NUMA_ZONELIST_ORDER_LEN);
4629             user_zonelist_order = oldval;
4630         } else if (oldval != user_zonelist_order) {
4631             mutex_lock(&zonelists_mutex);
4632             build_all_zonelists(NULL, NULL);
4633             mutex_unlock(&zonelists_mutex);
4634         }
4635     }
4636 out:
4637     mutex_unlock(&zl_order_mutex);
4638     return ret;
4639 }
4640 
4641 
4642 #define MAX_NODE_LOAD (nr_online_nodes)
4643 static int node_load[MAX_NUMNODES];
4644 
4645 /**
4646  * find_next_best_node - find the next node that should appear in a given node's fallback list
4647  * @node: node whose fallback list we're appending
4648  * @used_node_mask: nodemask_t of already used nodes
4649  *
4650  * We use a number of factors to determine which is the next node that should
4651  * appear on a given node's fallback list.  The node should not have appeared
4652  * already in @node's fallback list, and it should be the next closest node
4653  * according to the distance array (which contains arbitrary distance values
4654  * from each node to each node in the system). We also prefer nodes with
4655  * no CPUs, since presumably they will have very little allocation pressure
4656  * on them otherwise.
4657  * It returns -1 if no node is found.
4658  */
4659 static int find_next_best_node(int node, nodemask_t *used_node_mask)
4660 {
4661     int n, val;
4662     int min_val = INT_MAX;
4663     int best_node = NUMA_NO_NODE;
4664     const struct cpumask *tmp = cpumask_of_node(0);
4665 
4666     /* Use the local node if we haven't already */
4667     if (!node_isset(node, *used_node_mask)) {
4668         node_set(node, *used_node_mask);
4669         return node;
4670     }
4671 
4672     for_each_node_state(n, N_MEMORY) {
4673 
4674         /* Don't want a node to appear more than once */
4675         if (node_isset(n, *used_node_mask))
4676             continue;
4677 
4678         /* Use the distance array to find the distance */
4679         val = node_distance(node, n);
4680 
4681         /* Penalize nodes under us ("prefer the next node") */
4682         val += (n < node);
4683 
4684         /* Give preference to headless and unused nodes */
4685         tmp = cpumask_of_node(n);
4686         if (!cpumask_empty(tmp))
4687             val += PENALTY_FOR_NODE_WITH_CPUS;
4688 
4689         /* Slight preference for less loaded node */
4690         val *= (MAX_NODE_LOAD*MAX_NUMNODES);
4691         val += node_load[n];
4692 
4693         if (val < min_val) {
4694             min_val = val;
4695             best_node = n;
4696         }
4697     }
4698 
4699     if (best_node >= 0)
4700         node_set(best_node, *used_node_mask);
4701 
4702     return best_node;
4703 }
4704 
4705 
4706 /*
4707  * Build zonelists ordered by node and zones within node.
4708  * This results in maximum locality--normal zone overflows into local
4709  * DMA zone, if any--but risks exhausting DMA zone.
4710  */
4711 static void build_zonelists_in_node_order(pg_data_t *pgdat, int node)
4712 {
4713     int j;
4714     struct zonelist *zonelist;
4715 
4716     zonelist = &pgdat->node_zonelists[ZONELIST_FALLBACK];
4717     for (j = 0; zonelist->_zonerefs[j].zone != NULL; j++)
4718         ;
4719     j = build_zonelists_node(NODE_DATA(node), zonelist, j);
4720     zonelist->_zonerefs[j].zone = NULL;
4721     zonelist->_zonerefs[j].zone_idx = 0;
4722 }
4723 
4724 /*
4725  * Build gfp_thisnode zonelists
4726  */
4727 static void build_thisnode_zonelists(pg_data_t *pgdat)
4728 {
4729     int j;
4730     struct zonelist *zonelist;
4731 
4732     zonelist = &pgdat->node_zonelists[ZONELIST_NOFALLBACK];
4733     j = build_zonelists_node(pgdat, zonelist, 0);
4734     zonelist->_zonerefs[j].zone = NULL;
4735     zonelist->_zonerefs[j].zone_idx = 0;
4736 }
4737 
4738 /*
4739  * Build zonelists ordered by zone and nodes within zones.
4740  * This results in conserving DMA zone[s] until all Normal memory is
4741  * exhausted, but results in overflowing to a remote node while memory
4742  * may still exist in the local DMA zone.
4743  */
4744 static int node_order[MAX_NUMNODES];
4745 
4746 static void build_zonelists_in_zone_order(pg_data_t *pgdat, int nr_nodes)
4747 {
4748     int pos, j, node;
4749     int zone_type;      /* needs to be signed */
4750     struct zone *z;
4751     struct zonelist *zonelist;
4752 
4753     zonelist = &pgdat->node_zonelists[ZONELIST_FALLBACK];
4754     pos = 0;
4755     for (zone_type = MAX_NR_ZONES - 1; zone_type >= 0; zone_type--) {
4756         for (j = 0; j < nr_nodes; j++) {
4757             node = node_order[j];
4758             z = &NODE_DATA(node)->node_zones[zone_type];
4759             if (managed_zone(z)) {
4760                 zoneref_set_zone(z,
4761                     &zonelist->_zonerefs[pos++]);
4762                 check_highest_zone(zone_type);
4763             }
4764         }
4765     }
4766     zonelist->_zonerefs[pos].zone = NULL;
4767     zonelist->_zonerefs[pos].zone_idx = 0;
4768 }
4769 
4770 #if defined(CONFIG_64BIT)
4771 /*
4772  * Devices that require DMA32/DMA are relatively rare and do not justify a
4773  * penalty to every machine just in case the specialised case applies.
4774  * Default to node ordering on 64-bit NUMA machines.
4775  */
4776 static int default_zonelist_order(void)
4777 {
4778     return ZONELIST_ORDER_NODE;
4779 }
4780 #else
4781 /*
4782  * On 32-bit, the Normal zone needs to be preserved for allocations accessible
4783  * by the kernel. If processes running on node 0 deplete the low memory zone
4784  * then reclaim will occur more frequently, increasing stalls, and an OOM is
4785  * more likely if a large percentage of the zone is under writeback or
4786  * dirty. The problem is significantly worse if CONFIG_HIGHPTE is not set.
4787  * Hence, default to zone ordering on 32-bit.
4788  */
4789 static int default_zonelist_order(void)
4790 {
4791     return ZONELIST_ORDER_ZONE;
4792 }
4793 #endif /* CONFIG_64BIT */
4794 
4795 static void set_zonelist_order(void)
4796 {
4797     if (user_zonelist_order == ZONELIST_ORDER_DEFAULT)
4798         current_zonelist_order = default_zonelist_order();
4799     else
4800         current_zonelist_order = user_zonelist_order;
4801 }
4802 
4803 static void build_zonelists(pg_data_t *pgdat)
4804 {
4805     int i, node, load;
4806     nodemask_t used_mask;
4807     int local_node, prev_node;
4808     struct zonelist *zonelist;
4809     unsigned int order = current_zonelist_order;
4810 
4811     /* initialize zonelists */
4812     for (i = 0; i < MAX_ZONELISTS; i++) {
4813         zonelist = pgdat->node_zonelists + i;
4814         zonelist->_zonerefs[0].zone = NULL;
4815         zonelist->_zonerefs[0].zone_idx = 0;
4816     }
4817 
4818     /* NUMA-aware ordering of nodes */
4819     local_node = pgdat->node_id;
4820     load = nr_online_nodes;
4821     prev_node = local_node;
4822     nodes_clear(used_mask);
4823 
4824     memset(node_order, 0, sizeof(node_order));
4825     i = 0;
4826 
4827     while ((node = find_next_best_node(local_node, &used_mask)) >= 0) {
4828         /*
4829          * We don't want to pressure a particular node.
4830          * So we add a penalty to the first node in the same
4831          * distance group to make the selection round-robin.
4832          */
4833         if (node_distance(local_node, node) !=
4834             node_distance(local_node, prev_node))
4835             node_load[node] = load;
4836 
4837         prev_node = node;
4838         load--;
4839         if (order == ZONELIST_ORDER_NODE)
4840             build_zonelists_in_node_order(pgdat, node);
4841         else
4842             node_order[i++] = node; /* remember order */
4843     }
4844 
4845     if (order == ZONELIST_ORDER_ZONE) {
4846         /* calculate node order -- i.e., DMA last! */
4847         build_zonelists_in_zone_order(pgdat, i);
4848     }
4849 
4850     build_thisnode_zonelists(pgdat);
4851 }
4852 
4853 #ifdef CONFIG_HAVE_MEMORYLESS_NODES
4854 /*
4855  * Return node id of node used for "local" allocations.
4856  * I.e., first node id of first zone in arg node's generic zonelist.
4857  * Used for initializing percpu 'numa_mem', which is used primarily
4858  * for kernel allocations, so use GFP_KERNEL flags to locate zonelist.
4859  */
4860 int local_memory_node(int node)
4861 {
4862     struct zoneref *z;
4863 
4864     z = first_zones_zonelist(node_zonelist(node, GFP_KERNEL),
4865                    gfp_zone(GFP_KERNEL),
4866                    NULL);
4867     return z->zone->node;
4868 }
4869 #endif
4870 
4871 static void setup_min_unmapped_ratio(void);
4872 static void setup_min_slab_ratio(void);
4873 #else   /* CONFIG_NUMA */
4874 
4875 static void set_zonelist_order(void)
4876 {
4877     current_zonelist_order = ZONELIST_ORDER_ZONE;
4878 }
4879 
4880 static void build_zonelists(pg_data_t *pgdat)
4881 {
4882     int node, local_node;
4883     enum zone_type j;
4884     struct zonelist *zonelist;
4885 
4886     local_node = pgdat->node_id;
4887 
4888     zonelist = &pgdat->node_zonelists[ZONELIST_FALLBACK];
4889     j = build_zonelists_node(pgdat, zonelist, 0);
4890 
4891     /*
4892      * Now we build the zonelist so that it contains the zones
4893      * of all the other nodes.
4894      * We don't want to pressure a particular node, so when
4895      * building the zones for node N, we make sure that the
4896      * zones coming right after the local ones are those from
4897      * node N+1 (modulo N)
4898      */
4899     for (node = local_node + 1; node < MAX_NUMNODES; node++) {
4900         if (!node_online(node))
4901             continue;
4902         j = build_zonelists_node(NODE_DATA(node), zonelist, j);
4903     }
4904     for (node = 0; node < local_node; node++) {
4905         if (!node_online(node))
4906             continue;
4907         j = build_zonelists_node(NODE_DATA(node), zonelist, j);
4908     }
4909 
4910     zonelist->_zonerefs[j].zone = NULL;
4911     zonelist->_zonerefs[j].zone_idx = 0;
4912 }
4913 
4914 #endif  /* CONFIG_NUMA */
4915 
4916 /*
4917  * Boot pageset table. One per cpu which is going to be used for all
4918  * zones and all nodes. The parameters will be set in such a way
4919  * that an item put on a list will immediately be handed over to
4920  * the buddy list. This is safe since pageset manipulation is done
4921  * with interrupts disabled.
4922  *
4923  * The boot_pagesets must be kept even after bootup is complete for
4924  * unused processors and/or zones. They do play a role for bootstrapping
4925  * hotplugged processors.
4926  *
4927  * zoneinfo_show() and maybe other functions do
4928  * not check if the processor is online before following the pageset pointer.
4929  * Other parts of the kernel may not check if the zone is available.
4930  */
4931 static void setup_pageset(struct per_cpu_pageset *p, unsigned long batch);
4932 static DEFINE_PER_CPU(struct per_cpu_pageset, boot_pageset);
4933 static void setup_zone_pageset(struct zone *zone);
4934 
4935 /*
4936  * Global mutex to protect against size modification of zonelists
4937  * as well as to serialize pageset setup for the new populated zone.
4938  */
4939 DEFINE_MUTEX(zonelists_mutex);
4940 
4941 /* the return value is int just so it can be handed to stop_machine() */
4942 static int __build_all_zonelists(void *data)
4943 {
4944     int nid;
4945     int cpu;
4946     pg_data_t *self = data;
4947 
4948 #ifdef CONFIG_NUMA
4949     memset(node_load, 0, sizeof(node_load));
4950 #endif
4951 
4952     if (self && !node_online(self->node_id)) {
4953         build_zonelists(self);
4954     }
4955 
4956     for_each_online_node(nid) {
4957         pg_data_t *pgdat = NODE_DATA(nid);
4958 
4959         build_zonelists(pgdat);
4960     }
4961 
4962     /*
4963      * Initialize the boot_pagesets that are going to be used
4964      * for bootstrapping processors. The real pagesets for
4965      * each zone will be allocated later when the per cpu
4966      * allocator is available.
4967      *
4968      * boot_pagesets are used also for bootstrapping offline
4969      * cpus if the system is already booted because the pagesets
4970      * are needed to initialize allocators on a specific cpu too.
4971      * E.g. the percpu allocator needs the page allocator, which
4972      * needs the percpu allocator in order to allocate its pagesets
4973      * (a chicken-egg dilemma).
4974      */
4975     for_each_possible_cpu(cpu) {
4976         setup_pageset(&per_cpu(boot_pageset, cpu), 0);
4977 
4978 #ifdef CONFIG_HAVE_MEMORYLESS_NODES
4979         /*
4980          * We now know the "local memory node" for each node--
4981          * i.e., the node of the first zone in the generic zonelist.
4982          * Set up numa_mem percpu variable for on-line cpus.  During
4983          * boot, only the boot cpu should be on-line;  we'll init the
4984          * secondary cpus' numa_mem as they come on-line.  During
4985          * node/memory hotplug, we'll fixup all on-line cpus.
4986          */
4987         if (cpu_online(cpu))
4988             set_cpu_numa_mem(cpu, local_memory_node(cpu_to_node(cpu)));
4989 #endif
4990     }
4991 
4992     return 0;
4993 }
4994 
4995 static noinline void __init
4996 build_all_zonelists_init(void)
4997 {
4998     __build_all_zonelists(NULL);
4999     mminit_verify_zonelist();
5000     cpuset_init_current_mems_allowed();
5001 }
5002 
5003 /*
5004  * Called with zonelists_mutex always held,
5005  * unless system_state == SYSTEM_BOOTING.
5006  *
5007  * __ref due to (1) call of __meminit annotated setup_zone_pageset
5008  * [we're only called with non-NULL zone through __meminit paths] and
5009  * (2) call of __init annotated helper build_all_zonelists_init
5010  * [protected by SYSTEM_BOOTING].
5011  */
5012 void __ref build_all_zonelists(pg_data_t *pgdat, struct zone *zone)
5013 {
5014     set_zonelist_order();
5015 
5016     if (system_state == SYSTEM_BOOTING) {
5017         build_all_zonelists_init();
5018     } else {
5019 #ifdef CONFIG_MEMORY_HOTPLUG
5020         if (zone)
5021             setup_zone_pageset(zone);
5022 #endif
5023         /* we have to stop all cpus to guarantee there are no users
5024            of the zonelists */
5025         stop_machine(__build_all_zonelists, pgdat, NULL);
5026         /* cpuset refresh routine should be here */
5027     }
5028     vm_total_pages = nr_free_pagecache_pages();
5029     /*
5030      * Disable grouping by mobility if the number of pages in the
5031      * system is too low to allow the mechanism to work. It would be
5032      * more accurate, but expensive to check per-zone. This check is
5033      * made on memory hot-add so a system can start with mobility
5034      * disabled and enable it later
5035      */
5036     if (vm_total_pages < (pageblock_nr_pages * MIGRATE_TYPES))
5037         page_group_by_mobility_disabled = 1;
5038     else
5039         page_group_by_mobility_disabled = 0;
5040 
5041     pr_info("Built %i zonelists in %s order, mobility grouping %s.  Total pages: %ld\n",
5042         nr_online_nodes,
5043         zonelist_order_name[current_zonelist_order],
5044         page_group_by_mobility_disabled ? "off" : "on",
5045         vm_total_pages);
5046 #ifdef CONFIG_NUMA
5047     pr_info("Policy zone: %s\n", zone_names[policy_zone]);
5048 #endif
5049 }
5050 
5051 /*
5052  * Initially all pages are reserved - free ones are freed
5053  * up by free_all_bootmem() once the early boot process is
5054  * done. Non-atomic initialization, single-pass.
5055  */
5056 void __meminit memmap_init_zone(unsigned long size, int nid, unsigned long zone,
5057         unsigned long start_pfn, enum memmap_context context)
5058 {
5059     struct vmem_altmap *altmap = to_vmem_altmap(__pfn_to_phys(start_pfn));
5060     unsigned long end_pfn = start_pfn + size;
5061     pg_data_t *pgdat = NODE_DATA(nid);
5062     unsigned long pfn;
5063     unsigned long nr_initialised = 0;
5064 #ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP
5065     struct memblock_region *r = NULL, *tmp;
5066 #endif
5067 
5068     if (highest_memmap_pfn < end_pfn - 1)
5069         highest_memmap_pfn = end_pfn - 1;
5070 
5071     /*
5072      * Honor reservation requested by the driver for this ZONE_DEVICE
5073      * memory
5074      */
5075     if (altmap && start_pfn == altmap->base_pfn)
5076         start_pfn += altmap->reserve;
5077 
5078     for (pfn = start_pfn; pfn < end_pfn; pfn++) {
5079         /*
5080          * There can be holes in boot-time mem_map[]s handed to this
5081          * function.  They do not exist on hotplugged memory.
5082          */
5083         if (context != MEMMAP_EARLY)
5084             goto not_early;
5085 
5086         if (!early_pfn_valid(pfn))
5087             continue;
5088         if (!early_pfn_in_nid(pfn, nid))
5089             continue;
5090         if (!update_defer_init(pgdat, pfn, end_pfn, &nr_initialised))
5091             break;
5092 
5093 #ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP
5094         /*
5095          * Check the memblock attributes set by firmware, which can affect
5096          * the kernel memory layout.  If zone==ZONE_MOVABLE but the memory is
5097          * mirrored, it is an overlapped memmap init, so skip it.
5098          */
5099         if (mirrored_kernelcore && zone == ZONE_MOVABLE) {
5100             if (!r || pfn >= memblock_region_memory_end_pfn(r)) {
5101                 for_each_memblock(memory, tmp)
5102                     if (pfn < memblock_region_memory_end_pfn(tmp))
5103                         break;
5104                 r = tmp;
5105             }
5106             if (pfn >= memblock_region_memory_base_pfn(r) &&
5107                 memblock_is_mirror(r)) {
5108                 /* already initialized as NORMAL */
5109                 pfn = memblock_region_memory_end_pfn(r);
5110                 continue;
5111             }
5112         }
5113 #endif
5114 
5115 not_early:
5116         /*
5117          * Mark the block movable so that blocks are reserved for
5118          * movable at startup. This will force kernel allocations
5119          * to reserve their blocks rather than leaking throughout
5120          * the address space during boot when many long-lived
5121          * kernel allocations are made.
5122          *
5123          * The bitmap is created for the zone's valid pfn range, but the
5124          * memmap can be created for invalid pages (for alignment), so
5125          * check here that set_pageblock_migratetype() is not called
5126          * against a pfn outside the zone.
5127          */
5128         if (!(pfn & (pageblock_nr_pages - 1))) {
5129             struct page *page = pfn_to_page(pfn);
5130 
5131             __init_single_page(page, pfn, zone, nid);
5132             set_pageblock_migratetype(page, MIGRATE_MOVABLE);
5133         } else {
5134             __init_single_pfn(pfn, zone, nid);
5135         }
5136     }
5137 }
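/*
 * For illustration, assuming a pageblock_order of 9 (512 pages per pageblock)
 * and a zone starting at pfn 0: the loop above initialises the first page of
 * each pageblock (pfns 0, 512, 1024, ...) and marks that whole block
 * MIGRATE_MOVABLE, while all other pfns are initialised individually.  Kernel
 * allocations made during boot then tend to take over whole pageblocks rather
 * than leaking unmovable pages throughout the address space, as the comment
 * above describes.
 */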
5138 
5139 static void __meminit zone_init_free_lists(struct zone *zone)
5140 {
5141     unsigned int order, t;
5142     for_each_migratetype_order(order, t) {
5143         INIT_LIST_HEAD(&zone->free_area[order].free_list[t]);
5144         zone->free_area[order].nr_free = 0;
5145     }
5146 }
5147 
5148 #ifndef __HAVE_ARCH_MEMMAP_INIT
5149 #define memmap_init(size, nid, zone, start_pfn) \
5150     memmap_init_zone((size), (nid), (zone), (start_pfn), MEMMAP_EARLY)
5151 #endif
5152 
5153 static int zone_batchsize(struct zone *zone)
5154 {
5155 #ifdef CONFIG_MMU
5156     int batch;
5157 
5158     /*
5159      * The per-cpu-pages pools are set to around 1/1000th of the
5160      * size of the zone, but no more than 1/2 MiB.
5161      *
5162      * OK, so we don't know how big the cache is.  So guess.
5163      */
5164     batch = zone->managed_pages / 1024;
5165     if (batch * PAGE_SIZE > 512 * 1024)
5166         batch = (512 * 1024) / PAGE_SIZE;
5167     batch /= 4;     /* We effectively *= 4 below */
5168     if (batch < 1)
5169         batch = 1;
5170 
5171     /*
5172      * Clamp the batch to a 2^n - 1 value. Having a power
5173      * of 2 value was found to be more likely to have
5174      * suboptimal cache aliasing properties in some cases.
5175      *
5176      * For example if 2 tasks are alternately allocating
5177      * batches of pages, one task can end up with a lot
5178      * of pages of one half of the possible page colors
5179      * and the other with pages of the other colors.
5180      */
5181     batch = rounddown_pow_of_two(batch + batch/2) - 1;
5182 
5183     return batch;
5184 
5185 #else
5186     /* The deferral and batching of frees should be suppressed under NOMMU
5187      * conditions.
5188      *
5189      * The problem is that NOMMU needs to be able to allocate large chunks
5190      * of contiguous memory as there's no hardware page translation to
5191      * assemble apparent contiguous memory from discontiguous pages.
5192      *
5193      * Queueing large contiguous runs of pages for batching, however,
5194      * causes the pages to actually be freed in smaller chunks.  As there
5195      * can be a significant delay between the individual batches being
5196      * recycled, this leads to the once large chunks of space being
5197      * fragmented and becoming unavailable for high-order allocations.
5198      */
5199     return 0;
5200 #endif
5201 }
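/*
 * Worked example of the sizing above, assuming 4KiB pages and a zone with
 * 1GiB of managed memory (262144 pages): 262144 / 1024 = 256 pages, which is
 * 1MiB and therefore clamped to 512KiB / 4KiB = 128, then divided by 4 to
 * give 32; rounddown_pow_of_two(32 + 16) - 1 = 31, so the batch is 31 pages.
 */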
5202 
5203 /*
5204  * pcp->high and pcp->batch values are related and dependent on one another:
5205  * ->batch must never be higher than ->high.
5206  * The following function updates them in a safe manner without read side
5207  * locking.
5208  *
5209  * Any new users of pcp->batch and pcp->high should ensure they can cope with
5210  * those fields changing asynchronously (according to the above rule).
5211  *
5212  * mutex_is_locked(&pcp_batch_high_lock) required when calling this function
5213  * outside of boot time (or some other assurance that no concurrent updaters
5214  * exist).
5215  */
5216 static void pageset_update(struct per_cpu_pages *pcp, unsigned long high,
5217         unsigned long batch)
5218 {
5219     /* start with a fail-safe value for batch */
5220     pcp->batch = 1;
5221     smp_wmb();
5222 
5223     /* Update high, then batch, in order */
5224     pcp->high = high;
5225     smp_wmb();
5226 
5227     pcp->batch = batch;
5228 }
5229 
5230 /* a companion to pageset_set_high() */
5231 static void pageset_set_batch(struct per_cpu_pageset *p, unsigned long batch)
5232 {
5233     pageset_update(&p->pcp, 6 * batch, max(1UL, 1 * batch));
5234 }
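/*
 * For example, a zone_batchsize() of 31 (e.g. a 1GiB zone with 4KiB pages)
 * gives pcp->high = 6 * 31 = 186 and pcp->batch = 31; the max(1UL, ...)
 * only matters for the degenerate batch == 0 case (e.g. NOMMU).
 */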
5235 
5236 static void pageset_init(struct per_cpu_pageset *p)
5237 {
5238     struct per_cpu_pages *pcp;
5239     int migratetype;
5240 
5241     memset(p, 0, sizeof(*p));
5242 
5243     pcp = &p->pcp;
5244     pcp->count = 0;
5245     for (migratetype = 0; migratetype < MIGRATE_PCPTYPES; migratetype++)
5246         INIT_LIST_HEAD(&pcp->lists[migratetype]);
5247 }
5248 
5249 static void setup_pageset(struct per_cpu_pageset *p, unsigned long batch)
5250 {
5251     pageset_init(p);
5252     pageset_set_batch(p, batch);
5253 }
5254 
5255 /*
5256  * pageset_set_high() sets the high water mark for hot per_cpu_pagelist
5257  * to the value high for the pageset p.
5258  */
5259 static void pageset_set_high(struct per_cpu_pageset *p,
5260                 unsigned long high)
5261 {
5262     unsigned long batch = max(1UL, high / 4);
5263     if ((high / 4) > (PAGE_SHIFT * 8))
5264         batch = PAGE_SHIFT * 8;
5265 
5266     pageset_update(&p->pcp, high, batch);
5267 }
5268 
5269 static void pageset_set_high_and_batch(struct zone *zone,
5270                        struct per_cpu_pageset *pcp)
5271 {
5272     if (percpu_pagelist_fraction)
5273         pageset_set_high(pcp,
5274             (zone->managed_pages /
5275                 percpu_pagelist_fraction));
5276     else
5277         pageset_set_batch(pcp, zone_batchsize(zone));
5278 }
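/*
 * Worked example, assuming 4KiB pages (PAGE_SHIFT == 12) and a zone with
 * 262144 managed pages: with the sysctl vm.percpu_pagelist_fraction set to 8,
 * pageset_set_high() is given high = 262144 / 8 = 32768 and clamps batch to
 * PAGE_SHIFT * 8 = 96.  When the fraction is 0 (the default), the
 * zone_batchsize()-derived values from pageset_set_batch() are used instead.
 */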
5279 
5280 static void __meminit zone_pageset_init(struct zone *zone, int cpu)
5281 {
5282     struct per_cpu_pageset *pcp = per_cpu_ptr(zone->pageset, cpu);
5283 
5284     pageset_init(pcp);
5285     pageset_set_high_and_batch(zone, pcp);
5286 }
5287 
5288 static void __meminit setup_zone_pageset(struct zone *zone)
5289 {
5290     int cpu;
5291     zone->pageset = alloc_percpu(struct per_cpu_pageset);
5292     for_each_possible_cpu(cpu)
5293         zone_pageset_init(zone, cpu);
5294 }
5295 
5296 /*
5297  * Allocate per cpu pagesets and initialize them.
5298  * Before this call only boot pagesets were available.
5299  */
5300 void __init setup_per_cpu_pageset(void)
5301 {
5302     struct pglist_data *pgdat;
5303     struct zone *zone;
5304 
5305     for_each_populated_zone(zone)
5306         setup_zone_pageset(zone);
5307 
5308     for_each_online_pgdat(pgdat)
5309         pgdat->per_cpu_nodestats =
5310             alloc_percpu(struct per_cpu_nodestat);
5311 }
5312 
5313 static __meminit void zone_pcp_init(struct zone *zone)
5314 {
5315     /*
5316      * per cpu subsystem is not up at this point. The following code
5317      * relies on the ability of the linker to provide the
5318      * offset of a (static) per cpu variable into the per cpu area.
5319      */
5320     zone->pageset = &boot_pageset;
5321 
5322     if (populated_zone(zone))
5323         printk(KERN_DEBUG "  %s zone: %lu pages, LIFO batch:%u\n",
5324             zone->name, zone->present_pages,
5325                      zone_batchsize(zone));
5326 }
5327 
5328 int __meminit init_currently_empty_zone(struct zone *zone,
5329                     unsigned long zone_start_pfn,
5330                     unsigned long size)
5331 {
5332     struct pglist_data *pgdat = zone->zone_pgdat;
5333 
5334     pgdat->nr_zones = zone_idx(zone) + 1;
5335 
5336     zone->zone_start_pfn = zone_start_pfn;
5337 
5338     mminit_dprintk(MMINIT_TRACE, "memmap_init",
5339             "Initialising map node %d zone %lu pfns %lu -> %lu\n",
5340             pgdat->node_id,
5341             (unsigned long)zone_idx(zone),
5342             zone_start_pfn, (zone_start_pfn + size));
5343 
5344     zone_init_free_lists(zone);
5345     zone->initialized = 1;
5346 
5347     return 0;
5348 }
5349 
5350 #ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP
5351 #ifndef CONFIG_HAVE_ARCH_EARLY_PFN_TO_NID
5352 
5353 /*
5354  * Required by SPARSEMEM. Given a PFN, return what node the PFN is on.
5355  */
5356 int __meminit __early_pfn_to_nid(unsigned long pfn,
5357                     struct mminit_pfnnid_cache *state)
5358 {
5359     unsigned long start_pfn, end_pfn;
5360     int nid;
5361 
5362     if (state->last_start <= pfn && pfn < state->last_end)
5363         return state->last_nid;
5364 
5365     nid = memblock_search_pfn_nid(pfn, &start_pfn, &end_pfn);
5366     if (nid != -1) {
5367         state->last_start = start_pfn;
5368         state->last_end = end_pfn;
5369         state->last_nid = nid;
5370     }
5371 
5372     return nid;
5373 }
5374 #endif /* CONFIG_HAVE_ARCH_EARLY_PFN_TO_NID */
5375 
5376 /**
5377  * free_bootmem_with_active_regions - Call memblock_free_early_nid for each active range
5378  * @nid: The node to free memory on. If MAX_NUMNODES, all nodes are freed.
5379  * @max_low_pfn: The highest PFN that will be passed to memblock_free_early_nid
5380  *
5381  * If an architecture guarantees that all ranges registered contain no holes
5382  * and may be freed, this function may be used instead of calling
5383  * memblock_free_early_nid() manually.
5384  */
5385 void __init free_bootmem_with_active_regions(int nid, unsigned long max_low_pfn)
5386 {
5387     unsigned long start_pfn, end_pfn;
5388     int i, this_nid;
5389 
5390     for_each_mem_pfn_range(i, nid, &start_pfn, &end_pfn, &this_nid) {
5391         start_pfn = min(start_pfn, max_low_pfn);
5392         end_pfn = min(end_pfn, max_low_pfn);
5393 
5394         if (start_pfn < end_pfn)
5395             memblock_free_early_nid(PFN_PHYS(start_pfn),
5396                     (end_pfn - start_pfn) << PAGE_SHIFT,
5397                     this_nid);
5398     }
5399 }
5400 
5401 /**
5402  * sparse_memory_present_with_active_regions - Call memory_present for each active range
5403  * @nid: The node to call memory_present for. If MAX_NUMNODES, all nodes will be used.
5404  *
5405  * If an architecture guarantees that all ranges registered contain no holes and may
5406  * be freed, this function may be used instead of calling memory_present() manually.
5407  */
5408 void __init sparse_memory_present_with_active_regions(int nid)
5409 {
5410     unsigned long start_pfn, end_pfn;
5411     int i, this_nid;
5412 
5413     for_each_mem_pfn_range(i, nid, &start_pfn, &end_pfn, &this_nid)
5414         memory_present(this_nid, start_pfn, end_pfn);
5415 }
5416 
5417 /**
5418  * get_pfn_range_for_nid - Return the start and end page frames for a node
5419  * @nid: The nid to return the range for. If MAX_NUMNODES, the min and max PFN are returned.
5420  * @start_pfn: Passed by reference. On return, it will have the node start_pfn.
5421  * @end_pfn: Passed by reference. On return, it will have the node end_pfn.
5422  *
5423  * It returns the start and end page frame of a node based on information
5424  * provided by memblock_set_node(). If called for a node
5425  * with no available memory, a warning is printed and the start and end
5426  * PFNs will be 0.
5427  */
5428 void __meminit get_pfn_range_for_nid(unsigned int nid,
5429             unsigned long *start_pfn, unsigned long *end_pfn)
5430 {
5431     unsigned long this_start_pfn, this_end_pfn;
5432     int i;
5433 
5434     *start_pfn = -1UL;
5435     *end_pfn = 0;
5436 
5437     for_each_mem_pfn_range(i, nid, &this_start_pfn, &this_end_pfn, NULL) {
5438         *start_pfn = min(*start_pfn, this_start_pfn);
5439         *end_pfn = max(*end_pfn, this_end_pfn);
5440     }
5441 
5442     if (*start_pfn == -1UL)
5443         *start_pfn = 0;
5444 }
5445 
5446 /*
5447  * This finds a zone that can be used for ZONE_MOVABLE pages. The
5448  * assumption is made that zones within a node are ordered by monotonically
5449  * increasing memory addresses, so that the "highest" populated zone is used.
5450  */
5451 static void __init find_usable_zone_for_movable(void)
5452 {
5453     int zone_index;
5454     for (zone_index = MAX_NR_ZONES - 1; zone_index >= 0; zone_index--) {
5455         if (zone_index == ZONE_MOVABLE)
5456             continue;
5457 
5458         if (arch_zone_highest_possible_pfn[zone_index] >
5459                 arch_zone_lowest_possible_pfn[zone_index])
5460             break;
5461     }
5462 
5463     VM_BUG_ON(zone_index == -1);
5464     movable_zone = zone_index;
5465 }
5466 
5467 /*
5468  * The zone ranges provided by the architecture do not include ZONE_MOVABLE
5469  * because it is sized independent of architecture. Unlike the other zones,
5470  * the starting point for ZONE_MOVABLE is not fixed. It may be different
5471  * in each node depending on the size of each node and how evenly kernelcore
5472  * is distributed. This helper function adjusts the zone ranges
5473  * provided by the architecture for a given node by using the end of the
5474  * highest usable zone for ZONE_MOVABLE. This preserves the assumption that
5475  * zones within a node are in order of monotonically increasing memory addresses.
5476  */
5477 static void __meminit adjust_zone_range_for_zone_movable(int nid,
5478                     unsigned long zone_type,
5479                     unsigned long node_start_pfn,
5480                     unsigned long node_end_pfn,
5481                     unsigned long *zone_start_pfn,
5482                     unsigned long *zone_end_pfn)
5483 {
5484     /* Only adjust if ZONE_MOVABLE is on this node */
5485     if (zone_movable_pfn[nid]) {
5486         /* Size ZONE_MOVABLE */
5487         if (zone_type == ZONE_MOVABLE) {
5488             *zone_start_pfn = zone_movable_pfn[nid];
5489             *zone_end_pfn = min(node_end_pfn,
5490                 arch_zone_highest_possible_pfn[movable_zone]);
5491 
5492         /* Adjust for ZONE_MOVABLE starting within this range */
5493         } else if (!mirrored_kernelcore &&
5494             *zone_start_pfn < zone_movable_pfn[nid] &&
5495             *zone_end_pfn > zone_movable_pfn[nid]) {
5496             *zone_end_pfn = zone_movable_pfn[nid];
5497 
5498         /* Check if this whole range is within ZONE_MOVABLE */
5499         } else if (*zone_start_pfn >= zone_movable_pfn[nid])
5500             *zone_start_pfn = *zone_end_pfn;
5501     }
5502 }
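/*
 * Example of the adjustment above for a hypothetical node whose Normal zone
 * would span pfns [0x40000, 0x100000) with zone_movable_pfn[nid] == 0x80000
 * and kernelcore=mirror not in use: the Normal zone is clipped to end at
 * 0x80000, ZONE_MOVABLE itself runs from 0x80000 to the node end (capped at
 * the top of the highest usable zone), and any zone that would start at or
 * above 0x80000 is made empty by setting its start equal to its end.
 */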
5503 
5504 /*
5505  * Return the number of pages a zone spans in a node, including holes
5506  * present_pages = zone_spanned_pages_in_node() - zone_absent_pages_in_node()
5507  */
5508 static unsigned long __meminit zone_spanned_pages_in_node(int nid,
5509                     unsigned long zone_type,
5510                     unsigned long node_start_pfn,
5511                     unsigned long node_end_pfn,
5512                     unsigned long *zone_start_pfn,
5513                     unsigned long *zone_end_pfn,
5514                     unsigned long *ignored)
5515 {
5516     /* When hot-adding a new node from cpu_up(), the node should be empty */
5517     if (!node_start_pfn && !node_end_pfn)
5518         return 0;
5519 
5520     /* Get the start and end of the zone */
5521     *zone_start_pfn = arch_zone_lowest_possible_pfn[zone_type];
5522     *zone_end_pfn = arch_zone_highest_possible_pfn[zone_type];
5523     adjust_zone_range_for_zone_movable(nid, zone_type,
5524                 node_start_pfn, node_end_pfn,
5525                 zone_start_pfn, zone_end_pfn);
5526 
5527     /* Check that this node has pages within the zone's required range */
5528     if (*zone_end_pfn < node_start_pfn || *zone_start_pfn > node_end_pfn)
5529         return 0;
5530 
5531     /* Move the zone boundaries inside the node if necessary */
5532     *zone_end_pfn = min(*zone_end_pfn, node_end_pfn);
5533     *zone_start_pfn = max(*zone_start_pfn, node_start_pfn);
5534 
5535     /* Return the spanned pages */
5536     return *zone_end_pfn - *zone_start_pfn;
5537 }
5538 
5539 /*
5540  * Return the number of holes in a range on a node. If nid is MAX_NUMNODES,
5541  * then all holes in the requested range will be accounted for.
5542  */
5543 unsigned long __meminit __absent_pages_in_range(int nid,
5544                 unsigned long range_start_pfn,
5545                 unsigned long range_end_pfn)
5546 {
5547     unsigned long nr_absent = range_end_pfn - range_start_pfn;
5548     unsigned long start_pfn, end_pfn;
5549     int i;
5550 
5551     for_each_mem_pfn_range(i, nid, &start_pfn, &end_pfn, NULL) {
5552         start_pfn = clamp(start_pfn, range_start_pfn, range_end_pfn);
5553         end_pfn = clamp(end_pfn, range_start_pfn, range_end_pfn);
5554         nr_absent -= end_pfn - start_pfn;
5555     }
5556     return nr_absent;
5557 }
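/*
 * Example: for a requested range of pfns [0, 1000) on a node whose memblock
 * regions cover [100, 300) and [500, 900), nr_absent starts at 1000 and the
 * two overlaps subtract 200 and 400 pages respectively, so 400 pages are
 * reported as holes.
 */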
5558 
5559 /**
5560  * absent_pages_in_range - Return number of page frames in holes within a range
5561  * @start_pfn: The start PFN to start searching for holes
5562  * @end_pfn: The end PFN to stop searching for holes
5563  *
5564  * It returns the number of page frames in memory holes within a range.
5565  */
5566 unsigned long __init absent_pages_in_range(unsigned long start_pfn,
5567                             unsigned long end_pfn)
5568 {
5569     return __absent_pages_in_range(MAX_NUMNODES, start_pfn, end_pfn);
5570 }
5571 
5572 /* Return the number of page frames in holes in a zone on a node */
5573 static unsigned long __meminit zone_absent_pages_in_node(int nid,
5574                     unsigned long zone_type,
5575                     unsigned long node_start_pfn,
5576                     unsigned long node_end_pfn,
5577                     unsigned long *ignored)
5578 {
5579     unsigned long zone_low = arch_zone_lowest_possible_pfn[zone_type];
5580     unsigned long zone_high = arch_zone_highest_possible_pfn[zone_type];
5581     unsigned long zone_start_pfn, zone_end_pfn;
5582     unsigned long nr_absent;
5583 
5584     /* When hot-adding a new node from cpu_up(), the node should be empty */
5585     if (!node_start_pfn && !node_end_pfn)
5586         return 0;
5587 
5588     zone_start_pfn = clamp(node_start_pfn, zone_low, zone_high);
5589     zone_end_pfn = clamp(node_end_pfn, zone_low, zone_high);
5590 
5591     adjust_zone_range_for_zone_movable(nid, zone_type,
5592             node_start_pfn, node_end_pfn,
5593             &zone_start_pfn, &zone_end_pfn);
5594     nr_absent = __absent_pages_in_range(nid, zone_start_pfn, zone_end_pfn);
5595 
5596     /*
5597      * ZONE_MOVABLE handling.
5598      * Treat pages to be ZONE_MOVABLE in ZONE_NORMAL as absent pages
5599      * and vice versa.
5600      */
5601     if (mirrored_kernelcore && zone_movable_pfn[nid]) {
5602         unsigned long start_pfn, end_pfn;
5603         struct memblock_region *r;
5604 
5605         for_each_memblock(memory, r) {
5606             start_pfn = clamp(memblock_region_memory_base_pfn(r),
5607                       zone_start_pfn, zone_end_pfn);
5608             end_pfn = clamp(memblock_region_memory_end_pfn(r),
5609                     zone_start_pfn, zone_end_pfn);
5610 
5611             if (zone_type == ZONE_MOVABLE &&
5612                 memblock_is_mirror(r))
5613                 nr_absent += end_pfn - start_pfn;
5614 
5615             if (zone_type == ZONE_NORMAL &&
5616                 !memblock_is_mirror(r))
5617                 nr_absent += end_pfn - start_pfn;
5618         }
5619     }
5620 
5621     return nr_absent;
5622 }
5623 
5624 #else /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */
5625 static inline unsigned long __meminit zone_spanned_pages_in_node(int nid,
5626                     unsigned long zone_type,
5627                     unsigned long node_start_pfn,
5628                     unsigned long node_end_pfn,
5629                     unsigned long *zone_start_pfn,
5630                     unsigned long *zone_end_pfn,
5631                     unsigned long *zones_size)
5632 {
5633     unsigned int zone;
5634 
5635     *zone_start_pfn = node_start_pfn;
5636     for (zone = 0; zone < zone_type; zone++)
5637         *zone_start_pfn += zones_size[zone];
5638 
5639     *zone_end_pfn = *zone_start_pfn + zones_size[zone_type];
5640 
5641     return zones_size[zone_type];
5642 }
5643 
5644 static inline unsigned long __meminit zone_absent_pages_in_node(int nid,
5645                         unsigned long zone_type,
5646                         unsigned long node_start_pfn,
5647                         unsigned long node_end_pfn,
5648                         unsigned long *zholes_size)
5649 {
5650     if (!zholes_size)
5651         return 0;
5652 
5653     return zholes_size[zone_type];
5654 }
5655 
5656 #endif /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */
5657 
5658 static void __meminit calculate_node_totalpages(struct pglist_data *pgdat,
5659                         unsigned long node_start_pfn,
5660                         unsigned long node_end_pfn,
5661                         unsigned long *zones_size,
5662                         unsigned long *zholes_size)
5663 {
5664     unsigned long realtotalpages = 0, totalpages = 0;
5665     enum zone_type i;
5666 
5667     for (i = 0; i < MAX_NR_ZONES; i++) {
5668         struct zone *zone = pgdat->node_zones + i;
5669         unsigned long zone_start_pfn, zone_end_pfn;
5670         unsigned long size, real_size;
5671 
5672         size = zone_spanned_pages_in_node(pgdat->node_id, i,
5673                           node_start_pfn,
5674                           node_end_pfn,
5675                           &zone_start_pfn,
5676                           &zone_end_pfn,
5677                           zones_size);
5678         real_size = size - zone_absent_pages_in_node(pgdat->node_id, i,
5679                           node_start_pfn, node_end_pfn,
5680                           zholes_size);
5681         if (size)
5682             zone->zone_start_pfn = zone_start_pfn;
5683         else
5684             zone->zone_start_pfn = 0;
5685         zone->spanned_pages = size;
5686         zone->present_pages = real_size;
5687 
5688         totalpages += size;
5689         realtotalpages += real_size;
5690     }
5691 
5692     pgdat->node_spanned_pages = totalpages;
5693     pgdat->node_present_pages = realtotalpages;
5694     printk(KERN_DEBUG "On node %d totalpages: %lu\n", pgdat->node_id,
5695                             realtotalpages);
5696 }
5697 
5698 #ifndef CONFIG_SPARSEMEM
5699 /*
5700  * Calculate the size of the zone->blockflags rounded to an unsigned long
5701  * Start by making sure zonesize is a multiple of pageblock_order by rounding
5702  * up. Then use 1 NR_PAGEBLOCK_BITS worth of bits per pageblock, then
5703  * round what is now in bits up to the nearest long in bits, and finally
5704  * return the result in bytes.
5705  */
5706 static unsigned long __init usemap_size(unsigned long zone_start_pfn, unsigned long zonesize)
5707 {
5708     unsigned long usemapsize;
5709 
5710     zonesize += zone_start_pfn & (pageblock_nr_pages-1);
5711     usemapsize = roundup(zonesize, pageblock_nr_pages);
5712     usemapsize = usemapsize >> pageblock_order;
5713     usemapsize *= NR_PAGEBLOCK_BITS;
5714     usemapsize = roundup(usemapsize, 8 * sizeof(unsigned long));
5715 
5716     return usemapsize / 8;
5717 }
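/*
 * Worked example, assuming pageblock_order == 9 (512 pages per pageblock),
 * NR_PAGEBLOCK_BITS == 4 and 64-bit longs: a pageblock-aligned zone of
 * 262144 pages covers 512 pageblocks and needs 2048 bits, which is already
 * a multiple of 64 bits, so usemap_size() returns 2048 / 8 = 256 bytes.
 */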
5718 
5719 static void __init setup_usemap(struct pglist_data *pgdat,
5720                 struct zone *zone,
5721                 unsigned long zone_start_pfn,
5722                 unsigned long zonesize)
5723 {
5724     unsigned long usemapsize = usemap_size(zone_start_pfn, zonesize);
5725     zone->pageblock_flags = NULL;
5726     if (usemapsize)
5727         zone->pageblock_flags =
5728             memblock_virt_alloc_node_nopanic(usemapsize,
5729                              pgdat->node_id);
5730 }
5731 #else
5732 static inline void setup_usemap(struct pglist_data *pgdat, struct zone *zone,
5733                 unsigned long zone_start_pfn, unsigned long zonesize) {}
5734 #endif /* CONFIG_SPARSEMEM */
5735 
5736 #ifdef CONFIG_HUGETLB_PAGE_SIZE_VARIABLE
5737 
5738 /* Initialise the number of pages represented by NR_PAGEBLOCK_BITS */
5739 void __paginginit set_pageblock_order(void)
5740 {
5741     unsigned int order;
5742 
5743     /* Check that pageblock_nr_pages has not already been setup */
5744     if (pageblock_order)
5745         return;
5746 
5747     if (HPAGE_SHIFT > PAGE_SHIFT)
5748         order = HUGETLB_PAGE_ORDER;
5749     else
5750         order = MAX_ORDER - 1;
5751 
5752     /*
5753      * Assume the largest contiguous order of interest is a huge page.
5754      * This value may be variable depending on boot parameters on IA64 and
5755      * powerpc.
5756      */
5757     pageblock_order = order;
5758 }
5759 #else /* CONFIG_HUGETLB_PAGE_SIZE_VARIABLE */
5760 
5761 /*
5762  * When CONFIG_HUGETLB_PAGE_SIZE_VARIABLE is not set, set_pageblock_order()
5763  * is unused as pageblock_order is set at compile-time. See
5764  * include/linux/pageblock-flags.h for the values of pageblock_order based on
5765  * the kernel config
5766  */
5767 void __paginginit set_pageblock_order(void)
5768 {
5769 }
5770 
5771 #endif /* CONFIG_HUGETLB_PAGE_SIZE_VARIABLE */
5772 
5773 static unsigned long __paginginit calc_memmap_size(unsigned long spanned_pages,
5774                            unsigned long present_pages)
5775 {
5776     unsigned long pages = spanned_pages;
5777 
5778     /*
5779      * Provide a more accurate estimation if there are holes within
5780      * the zone and SPARSEMEM is in use. If there are holes within the
5781      * zone, each populated memory region may cost us one or two extra
5782      * memmap pages due to alignment because memmap pages for each
5783          * populated region may not be naturally aligned on a page boundary.
5784      * So the (present_pages >> 4) heuristic is a tradeoff for that.
5785      */
5786     if (spanned_pages > present_pages + (present_pages >> 4) &&
5787         IS_ENABLED(CONFIG_SPARSEMEM))
5788         pages = present_pages;
5789 
5790     return PAGE_ALIGN(pages * sizeof(struct page)) >> PAGE_SHIFT;
5791 }
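/*
 * Worked example, assuming 4KiB pages and a 64-byte struct page: a zone
 * spanning 262144 pages with no holes needs 262144 * 64 bytes = 16MiB of
 * memmap, i.e. 4096 pages.  If the span exceeds present_pages by more than
 * present_pages / 16 and SPARSEMEM is enabled, present_pages is used
 * instead, giving the more accurate estimate described in the comment above.
 */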
5792 
5793 /*
5794  * Set up the zone data structures:
5795  *   - mark all pages reserved
5796  *   - mark all memory queues empty
5797  *   - clear the memory bitmaps
5798  *
5799  * NOTE: pgdat should get zeroed by caller.
5800  */
5801 static void __paginginit free_area_init_core(struct pglist_data *pgdat)
5802 {
5803     enum zone_type j;
5804     int nid = pgdat->node_id;
5805     int ret;
5806 
5807     pgdat_resize_init(pgdat);
5808 #ifdef CONFIG_NUMA_BALANCING
5809     spin_lock_init(&pgdat->numabalancing_migrate_lock);
5810     pgdat->numabalancing_migrate_nr_pages = 0;
5811     pgdat->numabalancing_migrate_next_window = jiffies;
5812 #endif
5813 #ifdef CONFIG_TRANSPARENT_HUGEPAGE
5814     spin_lock_init(&pgdat->split_queue_lock);
5815     INIT_LIST_HEAD(&pgdat->split_queue);
5816     pgdat->split_queue_len = 0;
5817 #endif
5818     init_waitqueue_head(&pgdat->kswapd_wait);
5819     init_waitqueue_head(&pgdat->pfmemalloc_wait);
5820 #ifdef CONFIG_COMPACTION
5821     init_waitqueue_head(&pgdat->kcompactd_wait);
5822 #endif
5823     pgdat_page_ext_init(pgdat);
5824     spin_lock_init(&pgdat->lru_lock);
5825     lruvec_init(node_lruvec(pgdat));
5826 
5827     for (j = 0; j < MAX_NR_ZONES; j++) {
5828         struct zone *zone = pgdat->node_zones + j;
5829         unsigned long size, realsize, freesize, memmap_pages;
5830         unsigned long zone_start_pfn = zone->zone_start_pfn;
5831 
5832         size = zone->spanned_pages;
5833         realsize = freesize = zone->present_pages;
5834 
5835         /*
5836          * Adjust freesize so that it accounts for how much memory
5837          * is used by this zone for memmap. This affects the watermark
5838          * and per-cpu initialisations
5839          */
5840         memmap_pages = calc_memmap_size(size, realsize);
5841         if (!is_highmem_idx(j)) {
5842             if (freesize >= memmap_pages) {
5843                 freesize -= memmap_pages;
5844                 if (memmap_pages)
5845                     printk(KERN_DEBUG
5846                            "  %s zone: %lu pages used for memmap\n",
5847                            zone_names[j], memmap_pages);
5848             } else
5849                 pr_warn("  %s zone: %lu pages exceeds freesize %lu\n",
5850                     zone_names[j], memmap_pages, freesize);
5851         }
5852 
5853         /* Account for reserved pages */
5854         if (j == 0 && freesize > dma_reserve) {
5855             freesize -= dma_reserve;
5856             printk(KERN_DEBUG "  %s zone: %lu pages reserved\n",
5857                     zone_names[0], dma_reserve);
5858         }
5859 
5860         if (!is_highmem_idx(j))
5861             nr_kernel_pages += freesize;
5862         /* Charge for highmem memmap if there are enough kernel pages */
5863         else if (nr_kernel_pages > memmap_pages * 2)
5864             nr_kernel_pages -= memmap_pages;
5865         nr_all_pages += freesize;
5866 
5867         /*
5868          * Set an approximate value for lowmem here; it will be adjusted
5869          * when the bootmem allocator frees pages into the buddy system.
5870          * And all highmem pages will be managed by the buddy system.
5871          */
5872         zone->managed_pages = is_highmem_idx(j) ? realsize : freesize;
5873 #ifdef CONFIG_NUMA
5874         zone->node = nid;
5875 #endif
5876         zone->name = zone_names[j];
5877         zone->zone_pgdat = pgdat;
5878         spin_lock_init(&zone->lock);
5879         zone_seqlock_init(zone);
5880         zone_pcp_init(zone);
5881 
5882         if (!size)
5883             continue;
5884 
5885         set_pageblock_order();
5886         setup_usemap(pgdat, zone, zone_start_pfn, size);
5887         ret = init_currently_empty_zone(zone, zone_start_pfn, size);
5888         BUG_ON(ret);
5889         memmap_init(size, nid, j, zone_start_pfn);
5890     }
5891 }
5892 
5893 static void __ref alloc_node_mem_map(struct pglist_data *pgdat)
5894 {
5895     unsigned long __maybe_unused start = 0;
5896     unsigned long __maybe_unused offset = 0;
5897 
5898     /* Skip empty nodes */
5899     if (!pgdat->node_spanned_pages)
5900         return;
5901 
5902 #ifdef CONFIG_FLAT_NODE_MEM_MAP
5903     start = pgdat->node_start_pfn & ~(MAX_ORDER_NR_PAGES - 1);
5904     offset = pgdat->node_start_pfn - start;
5905     /* ia64 gets its own node_mem_map, before this, without bootmem */
5906     if (!pgdat->node_mem_map) {
5907         unsigned long size, end;
5908         struct page *map;
5909 
5910         /*
5911          * The zone's endpoints aren't required to be MAX_ORDER
5912          * aligned but the node_mem_map endpoints must be in order
5913          * for the buddy allocator to function correctly.
5914          */
5915         end = pgdat_end_pfn(pgdat);
5916         end = ALIGN(end, MAX_ORDER_NR_PAGES);
5917         size =  (end - start) * sizeof(struct page);
5918         map = alloc_remap(pgdat->node_id, size);
5919         if (!map)
5920             map = memblock_virt_alloc_node_nopanic(size,
5921                                    pgdat->node_id);
5922         pgdat->node_mem_map = map + offset;
5923     }
5924 #ifndef CONFIG_NEED_MULTIPLE_NODES
5925     /*
5926      * With no DISCONTIG, the global mem_map is just set as node 0's
5927      */
5928     if (pgdat == NODE_DATA(0)) {
5929         mem_map = NODE_DATA(0)->node_mem_map;
5930 #if defined(CONFIG_HAVE_MEMBLOCK_NODE_MAP) || defined(CONFIG_FLATMEM)
5931         if (page_to_pfn(mem_map) != pgdat->node_start_pfn)
5932             mem_map -= offset;
5933 #endif /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */
5934     }
5935 #endif
5936 #endif /* CONFIG_FLAT_NODE_MEM_MAP */
5937 }
5938 
5939 void __paginginit free_area_init_node(int nid, unsigned long *zones_size,
5940         unsigned long node_start_pfn, unsigned long *zholes_size)
5941 {
5942     pg_data_t *pgdat = NODE_DATA(nid);
5943     unsigned long start_pfn = 0;
5944     unsigned long end_pfn = 0;
5945 
5946     /* pg_data_t should be reset to zero when it's allocated */
5947     WARN_ON(pgdat->nr_zones || pgdat->kswapd_classzone_idx);
5948 
5949     reset_deferred_meminit(pgdat);
5950     pgdat->node_id = nid;
5951     pgdat->node_start_pfn = node_start_pfn;
5952     pgdat->per_cpu_nodestats = NULL;
5953 #ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP
5954     get_pfn_range_for_nid(nid, &start_pfn, &end_pfn);
5955     pr_info("Initmem setup node %d [mem %#018Lx-%#018Lx]\n", nid,
5956         (u64)start_pfn << PAGE_SHIFT,
5957         end_pfn ? ((u64)end_pfn << PAGE_SHIFT) - 1 : 0);
5958 #else
5959     start_pfn = node_start_pfn;
5960 #endif
5961     calculate_node_totalpages(pgdat, start_pfn, end_pfn,
5962                   zones_size, zholes_size);
5963 
5964     alloc_node_mem_map(pgdat);
5965 #ifdef CONFIG_FLAT_NODE_MEM_MAP
5966     printk(KERN_DEBUG "free_area_init_node: node %d, pgdat %08lx, node_mem_map %08lx\n",
5967         nid, (unsigned long)pgdat,
5968         (unsigned long)pgdat->node_mem_map);
5969 #endif
5970 
5971     free_area_init_core(pgdat);
5972 }
5973 
5974 #ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP
5975 
5976 #if MAX_NUMNODES > 1
5977 /*
5978  * Figure out the number of possible node ids.
5979  */
5980 void __init setup_nr_node_ids(void)
5981 {
5982     unsigned int highest;
5983 
5984     highest = find_last_bit(node_possible_map.bits, MAX_NUMNODES);
5985     nr_node_ids = highest + 1;
5986 }
5987 #endif
5988 
5989 /**
5990  * node_map_pfn_alignment - determine the maximum internode alignment
5991  *
5992  * This function should be called after node map is populated and sorted.
5993  * It calculates the maximum power of two alignment which can distinguish
5994  * all the nodes.
5995  *
5996  * For example, if all nodes are 1GiB and aligned to 1GiB, the return value
5997  * would indicate 1GiB alignment with (1 << (30 - PAGE_SHIFT)).  If the
5998  * nodes are shifted by 256MiB, it would indicate 256MiB.  Note that if only
5999  * the last node is shifted, 1GiB is enough and this function will indicate so.
6000  *
6001  * This is used to test whether pfn -> nid mapping of the chosen memory
6002  * model has fine enough granularity to avoid incorrect mapping for the
6003  * populated node map.
6004  *
6005  * Returns the determined alignment in PFNs, or 0 if there is no alignment
6006  * requirement (single node).
6007  */
6008 unsigned long __init node_map_pfn_alignment(void)
6009 {
6010     unsigned long accl_mask = 0, last_end = 0;
6011     unsigned long start, end, mask;
6012     int last_nid = -1;
6013     int i, nid;
6014 
6015     for_each_mem_pfn_range(i, MAX_NUMNODES, &start, &end, &nid) {
6016         if (!start || last_nid < 0 || last_nid == nid) {
6017             last_nid = nid;
6018             last_end = end;
6019             continue;
6020         }
6021 
6022         /*
6023          * Start with a mask granular enough to pin-point to the
6024          * start pfn and tick off bits one-by-one until it becomes
6025          * too coarse to separate the current node from the last.
6026          */
6027         mask = ~((1 << __ffs(start)) - 1);
6028         while (mask && last_end <= (start & (mask << 1)))
6029             mask <<= 1;
6030 
6031         /* accumulate all internode masks */
6032         accl_mask |= mask;
6033     }
6034 
6035     /* convert mask to number of pages */
6036     return ~accl_mask + 1;
6037 }
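/*
 * Worked example matching the comment above, assuming 4KiB pages: with
 * node 0 covering pfns [0x10000, 0x50000) and node 1 covering
 * [0x50000, 0x90000) (1GiB nodes, both shifted by 256MiB), the boundary at
 * pfn 0x50000 gives an initial mask of ~0xffff; widening it once would round
 * the boundary down to 0x40000, below node 0's end, so the mask is kept and
 * the function returns 0x10000 pfns, i.e. 256MiB.
 */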
6038 
6039 /* Find the lowest pfn for a node */
6040 static unsigned long __init find_min_pfn_for_node(int nid)
6041 {
6042     unsigned long min_pfn = ULONG_MAX;
6043     unsigned long start_pfn;
6044     int i;
6045 
6046     for_each_mem_pfn_range(i, nid, &start_pfn, NULL, NULL)
6047         min_pfn = min(min_pfn, start_pfn);
6048 
6049     if (min_pfn == ULONG_MAX) {
6050         pr_warn("Could not find start_pfn for node %d\n", nid);
6051         return 0;
6052     }
6053 
6054     return min_pfn;
6055 }
6056 
6057 /**
6058  * find_min_pfn_with_active_regions - Find the minimum PFN registered
6059  *
6060  * It returns the minimum PFN based on information provided via
6061  * memblock_set_node().
6062  */
6063 unsigned long __init find_min_pfn_with_active_regions(void)
6064 {
6065     return find_min_pfn_for_node(MAX_NUMNODES);
6066 }
6067 
6068 /*
6069  * early_calculate_totalpages()
6070  * Sum pages in active regions for movable zone.
6071  * Populate N_MEMORY for calculating usable_nodes.
6072  */
6073 static unsigned long __init early_calculate_totalpages(void)
6074 {
6075     unsigned long totalpages = 0;
6076     unsigned long start_pfn, end_pfn;
6077     int i, nid;
6078 
6079     for_each_mem_pfn_range(i, MAX_NUMNODES, &start_pfn, &end_pfn, &nid) {
6080         unsigned long pages = end_pfn - start_pfn;
6081 
6082         totalpages += pages;
6083         if (pages)
6084             node_set_state(nid, N_MEMORY);
6085     }
6086     return totalpages;
6087 }
6088 
6089 /*
6090  * Find the PFN the Movable zone begins in each node. Kernel memory
6091  * is spread evenly between nodes as long as the nodes have enough
6092  * memory. When they don't, some nodes will have more kernelcore than
6093  * others
6094  */
6095 static void __init find_zone_movable_pfns_for_nodes(void)
6096 {
6097     int i, nid;
6098     unsigned long usable_startpfn;
6099     unsigned long kernelcore_node, kernelcore_remaining;
6100     /* save the state before borrowing the nodemask */
6101     nodemask_t saved_node_state = node_states[N_MEMORY];
6102     unsigned long totalpages = early_calculate_totalpages();
6103     int usable_nodes = nodes_weight(node_states[N_MEMORY]);
6104     struct memblock_region *r;
6105 
6106     /* Need to find movable_zone earlier when movable_node is specified. */
6107     find_usable_zone_for_movable();
6108 
6109     /*
6110      * If movable_node is specified, ignore kernelcore and movablecore
6111      * options.
6112      */
6113     if (movable_node_is_enabled()) {
6114         for_each_memblock(memory, r) {
6115             if (!memblock_is_hotpluggable(r))
6116                 continue;
6117 
6118             nid = r->nid;
6119 
6120             usable_startpfn = PFN_DOWN(r->base);
6121             zone_movable_pfn[nid] = zone_movable_pfn[nid] ?
6122                 min(usable_startpfn, zone_movable_pfn[nid]) :
6123                 usable_startpfn;
6124         }
6125 
6126         goto out2;
6127     }
6128 
6129     /*
6130      * If kernelcore=mirror is specified, ignore movablecore option
6131      */
6132     if (mirrored_kernelcore) {
6133         bool mem_below_4gb_not_mirrored = false;
6134 
6135         for_each_memblock(memory, r) {
6136             if (memblock_is_mirror(r))
6137                 continue;
6138 
6139             nid = r->nid;
6140 
6141             usable_startpfn = memblock_region_memory_base_pfn(r);
6142 
6143             if (usable_startpfn < 0x100000) {
6144                 mem_below_4gb_not_mirrored = true;
6145                 continue;
6146             }
6147 
6148             zone_movable_pfn[nid] = zone_movable_pfn[nid] ?
6149                 min(usable_startpfn, zone_movable_pfn[nid]) :
6150                 usable_startpfn;
6151         }
6152 
6153         if (mem_below_4gb_not_mirrored)
6154             pr_warn("This configuration results in unmirrored kernel memory.\n");
6155 
6156         goto out2;
6157     }
6158 
6159     /*
6160      * If movablecore=nn[KMG] was specified, calculate the size of
6161      * kernelcore that corresponds to it so that memory usable for
6162      * any allocation type is evenly spread. If both kernelcore
6163      * and movablecore are specified, then the value of kernelcore
6164      * will be used for required_kernelcore if it's greater than
6165      * what movablecore would have allowed.
6166      */
6167     if (required_movablecore) {
6168         unsigned long corepages;
6169 
6170         /*
6171          * Round-up so that ZONE_MOVABLE is at least as large as what
6172          * was requested by the user
6173          */
6174         required_movablecore =
6175             roundup(required_movablecore, MAX_ORDER_NR_PAGES);
6176         required_movablecore = min(totalpages, required_movablecore);
6177         corepages = totalpages - required_movablecore;
6178 
6179         required_kernelcore = max(required_kernelcore, corepages);
6180     }
6181 
6182     /*
6183      * If kernelcore was not specified or kernelcore size is larger
6184      * than totalpages, there is no ZONE_MOVABLE.
6185      */
6186     if (!required_kernelcore || required_kernelcore >= totalpages)
6187         goto out;
6188 
6189     /* usable_startpfn is the lowest possible pfn ZONE_MOVABLE can be at */
6190     usable_startpfn = arch_zone_lowest_possible_pfn[movable_zone];
6191 
6192 restart:
6193     /* Spread kernelcore memory as evenly as possible throughout nodes */
6194     kernelcore_node = required_kernelcore / usable_nodes;
6195     for_each_node_state(nid, N_MEMORY) {
6196         unsigned long start_pfn, end_pfn;
6197 
6198         /*
6199          * Recalculate kernelcore_node if the division per node
6200          * now exceeds what is necessary to satisfy the requested
6201          * amount of memory for the kernel
6202          */
6203         if (required_kernelcore < kernelcore_node)
6204             kernelcore_node = required_kernelcore / usable_nodes;
6205 
6206         /*
6207          * As the map is walked, we track how much memory is usable
6208          * by the kernel using kernelcore_remaining. When it is
6209          * 0, the rest of the node is usable by ZONE_MOVABLE
6210          */
6211         kernelcore_remaining = kernelcore_node;
6212 
6213         /* Go through each range of PFNs within this node */
6214         for_each_mem_pfn_range(i, nid, &start_pfn, &end_pfn, NULL) {
6215             unsigned long size_pages;
6216 
6217             start_pfn = max(start_pfn, zone_movable_pfn[nid]);
6218             if (start_pfn >= end_pfn)
6219                 continue;
6220 
6221             /* Account for what is only usable for kernelcore */
6222             if (start_pfn < usable_startpfn) {
6223                 unsigned long kernel_pages;
6224                 kernel_pages = min(end_pfn, usable_startpfn)
6225                                 - start_pfn;
6226 
6227                 kernelcore_remaining -= min(kernel_pages,
6228                             kernelcore_remaining);
6229                 required_kernelcore -= min(kernel_pages,
6230                             required_kernelcore);
6231 
6232                 /* Continue if range is now fully accounted */
6233                 if (end_pfn <= usable_startpfn) {
6234 
6235                     /*
6236                      * Push zone_movable_pfn to the end so
6237                      * that if we have to rebalance
6238                      * kernelcore across nodes, we will
6239                      * not double account here
6240                      */
6241                     zone_movable_pfn[nid] = end_pfn;
6242                     continue;
6243                 }
6244                 start_pfn = usable_startpfn;
6245             }
6246 
6247             /*
6248              * The usable PFN range for ZONE_MOVABLE is from
6249              * start_pfn->end_pfn. Calculate size_pages as the
6250              * number of pages used as kernelcore
6251              */
6252             size_pages = end_pfn - start_pfn;
6253             if (size_pages > kernelcore_remaining)
6254                 size_pages = kernelcore_remaining;
6255             zone_movable_pfn[nid] = start_pfn + size_pages;
6256 
6257             /*
6258              * Some kernelcore has been met, update counts and
6259              * break if the kernelcore for this node has been
6260              * satisfied
6261              */
6262             required_kernelcore -= min(required_kernelcore,
6263                                 size_pages);
6264             kernelcore_remaining -= size_pages;
6265             if (!kernelcore_remaining)
6266                 break;
6267         }
6268     }
6269 
6270     /*
6271      * If there is still required_kernelcore, we do another pass with one
6272      * less node in the count. This will push zone_movable_pfn[nid] further
6273      * along on the nodes that still have memory until kernelcore is
6274      * satisfied
6275      */
6276     usable_nodes--;
6277     if (usable_nodes && required_kernelcore > usable_nodes)
6278         goto restart;
6279 
6280 out2:
6281     /* Align start of ZONE_MOVABLE on all nids to MAX_ORDER_NR_PAGES */
6282     for (nid = 0; nid < MAX_NUMNODES; nid++)
6283         zone_movable_pfn[nid] =
6284             roundup(zone_movable_pfn[nid], MAX_ORDER_NR_PAGES);
6285 
6286 out:
6287     /* restore the node_state */
6288     node_states[N_MEMORY] = saved_node_state;
6289 }
6290 
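/*
 * Editor's sketch (standalone, not part of page_alloc.c): a self-contained
 * model of the distribution logic in find_zone_movable_pfns_for_nodes()
 * above.  The node sizes, node count and SK_ALIGN_PAGES are invented for
 * illustration; the real code walks memblock ranges per node and aligns the
 * result to MAX_ORDER_NR_PAGES.
 */
#include <stdio.h>

#define SK_NR_NODES     3
#define SK_ALIGN_PAGES  1024UL  /* stand-in for MAX_ORDER_NR_PAGES */

static unsigned long sk_node_pages[SK_NR_NODES] = { 4096, 1024, 8192 };
static unsigned long sk_kernel_pages[SK_NR_NODES]; /* pages kept as kernelcore */

static void sk_distribute_kernelcore(unsigned long required)
{
    unsigned long per_node;
    int usable = SK_NR_NODES;
    int nid;

restart:
    /* spread the remaining request evenly over the usable nodes */
    per_node = required / usable;

    for (nid = 0; nid < SK_NR_NODES; nid++) {
        /* never take more than the node still has free */
        unsigned long take = sk_node_pages[nid] - sk_kernel_pages[nid];

        if (take > per_node)
            take = per_node;
        if (take > required)
            take = required;

        sk_kernel_pages[nid] += take;
        required -= take;
    }

    /* mirrors the "goto restart" above: retry with one node fewer */
    usable--;
    if (usable && required > (unsigned long)usable)
        goto restart;

    for (nid = 0; nid < SK_NR_NODES; nid++) {
        /* the rest of each node becomes ZONE_MOVABLE, its start aligned up */
        unsigned long movable = (sk_kernel_pages[nid] + SK_ALIGN_PAGES - 1) /
                                SK_ALIGN_PAGES * SK_ALIGN_PAGES;

        printf("node %d: kernelcore %lu pages, ZONE_MOVABLE from page %lu\n",
               nid, sk_kernel_pages[nid], movable);
    }
}
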
6291 /* Any regular or high memory on that node? */
6292 static void check_for_memory(pg_data_t *pgdat, int nid)
6293 {
6294     enum zone_type zone_type;
6295 
6296     if (N_MEMORY == N_NORMAL_MEMORY)
6297         return;
6298 
6299     for (zone_type = 0; zone_type <= ZONE_MOVABLE - 1; zone_type++) {
6300         struct zone *zone = &pgdat->node_zones[zone_type];
6301         if (populated_zone(zone)) {
6302             node_set_state(nid, N_HIGH_MEMORY);
6303             if (N_NORMAL_MEMORY != N_HIGH_MEMORY &&
6304                 zone_type <= ZONE_NORMAL)
6305                 node_set_state(nid, N_NORMAL_MEMORY);
6306             break;
6307         }
6308     }
6309 }
6310 
6311 /**
6312  * free_area_init_nodes - Initialise all pg_data_t and zone data
6313  * @max_zone_pfn: an array of max PFNs for each zone
6314  *
6315  * This will call free_area_init_node() for each active node in the system.
6316  * Using the page ranges provided by memblock_set_node(), the size of each
6317  * zone in each node, together with its holes, is calculated. If the maximum
6318  * PFNs of two adjacent zones match, the higher zone is assumed to be empty.
6319  * For example, if arch_max_dma_pfn == arch_max_dma32_pfn, it is assumed
6320  * that arch_max_dma32_pfn has no pages. It is also assumed that a zone
6321  * starts where the previous one ended. For example, ZONE_DMA32 starts
6322  * at arch_max_dma_pfn.
6323  */
6324 void __init free_area_init_nodes(unsigned long *max_zone_pfn)
6325 {
6326     unsigned long start_pfn, end_pfn;
6327     int i, nid;
6328 
6329     /* Record where the zone boundaries are */
6330     memset(arch_zone_lowest_possible_pfn, 0,
6331                 sizeof(arch_zone_lowest_possible_pfn));
6332     memset(arch_zone_highest_possible_pfn, 0,
6333                 sizeof(arch_zone_highest_possible_pfn));
6334 
6335     start_pfn = find_min_pfn_with_active_regions();
6336 
6337     for (i = 0; i < MAX_NR_ZONES; i++) {
6338         if (i == ZONE_MOVABLE)
6339             continue;
6340 
6341         end_pfn = max(max_zone_pfn[i], start_pfn);
6342         arch_zone_lowest_possible_pfn[i] = start_pfn;
6343         arch_zone_highest_possible_pfn[i] = end_pfn;
6344 
6345         start_pfn = end_pfn;
6346     }
6347     arch_zone_lowest_possible_pfn[ZONE_MOVABLE] = 0;
6348     arch_zone_highest_possible_pfn[ZONE_MOVABLE] = 0;
6349 
6350     /* Find the PFNs that ZONE_MOVABLE begins at in each node */
6351     memset(zone_movable_pfn, 0, sizeof(zone_movable_pfn));
6352     find_zone_movable_pfns_for_nodes();
6353 
6354     /* Print out the zone ranges */
6355     pr_info("Zone ranges:\n");
6356     for (i = 0; i < MAX_NR_ZONES; i++) {
6357         if (i == ZONE_MOVABLE)
6358             continue;
6359         pr_info("  %-8s ", zone_names[i]);
6360         if (arch_zone_lowest_possible_pfn[i] ==
6361                 arch_zone_highest_possible_pfn[i])
6362             pr_cont("empty\n");
6363         else
6364             pr_cont("[mem %#018Lx-%#018Lx]\n",
6365                 (u64)arch_zone_lowest_possible_pfn[i]
6366                     << PAGE_SHIFT,
6367                 ((u64)arch_zone_highest_possible_pfn[i]
6368                     << PAGE_SHIFT) - 1);
6369     }
6370 
6371     /* Print out the PFNs ZONE_MOVABLE begins at in each node */
6372     pr_info("Movable zone start for each node\n");
6373     for (i = 0; i < MAX_NUMNODES; i++) {
6374         if (zone_movable_pfn[i])
6375             pr_info("  Node %d: %#018Lx\n", i,
6376                    (u64)zone_movable_pfn[i] << PAGE_SHIFT);
6377     }
6378 
6379     /* Print out the early node map */
6380     pr_info("Early memory node ranges\n");
6381     for_each_mem_pfn_range(i, MAX_NUMNODES, &start_pfn, &end_pfn, &nid)
6382         pr_info("  node %3d: [mem %#018Lx-%#018Lx]\n", nid,
6383             (u64)start_pfn << PAGE_SHIFT,
6384             ((u64)end_pfn << PAGE_SHIFT) - 1);
6385 
6386     /* Initialise every node */
6387     mminit_verify_pageflags_layout();
6388     setup_nr_node_ids();
6389     for_each_online_node(nid) {
6390         pg_data_t *pgdat = NODE_DATA(nid);
6391         free_area_init_node(nid, NULL,
6392                 find_min_pfn_for_node(nid), NULL);
6393 
6394         /* Any memory on that node? */
6395         if (pgdat->node_present_pages)
6396             node_set_state(nid, N_MEMORY);
6397         check_for_memory(pgdat, nid);
6398     }
6399 }
6400 
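/*
 * Editor's sketch (standalone, not kernel code): the boundary walk that the
 * free_area_init_nodes() comment above describes, with made-up zone names
 * and PFNs.  In the kernel the inputs come from the architecture via
 * max_zone_pfn[] and find_min_pfn_with_active_regions().
 */
#include <stdio.h>

enum { SK_DMA, SK_DMA32, SK_NORMAL, SK_NR_ZONES };

static const char * const sk_zone_names[SK_NR_ZONES] = { "DMA", "DMA32", "Normal" };

static void sk_print_zone_ranges(void)
{
    /* 16 MiB, 16 MiB and 16 GiB ends (4 KiB pages): DMA32 ends where DMA
     * does, so it comes out empty, as the comment above explains. */
    unsigned long max_zone_pfn[SK_NR_ZONES] = { 0x1000, 0x1000, 0x400000 };
    unsigned long lowest[SK_NR_ZONES], highest[SK_NR_ZONES];
    unsigned long start_pfn = 0x10;     /* pretend first usable PFN */
    int i;

    for (i = 0; i < SK_NR_ZONES; i++) {
        unsigned long end_pfn = max_zone_pfn[i] > start_pfn ?
                                max_zone_pfn[i] : start_pfn;

        lowest[i] = start_pfn;
        highest[i] = end_pfn;
        start_pfn = end_pfn;    /* the next zone starts where this one ends */
    }

    for (i = 0; i < SK_NR_ZONES; i++) {
        if (lowest[i] == highest[i])
            printf("  %-8s empty\n", sk_zone_names[i]);
        else
            printf("  %-8s [pfn %#lx-%#lx]\n", sk_zone_names[i],
                   lowest[i], highest[i] - 1);
    }
}
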
6401 static int __init cmdline_parse_core(char *p, unsigned long *core)
6402 {
6403     unsigned long long coremem;
6404     if (!p)
6405         return -EINVAL;
6406 
6407     coremem = memparse(p, &p);
6408     *core = coremem >> PAGE_SHIFT;
6409 
6410     /* Paranoid check that UL is enough for the coremem value */
6411     WARN_ON((coremem >> PAGE_SHIFT) > ULONG_MAX);
6412 
6413     return 0;
6414 }
6415 
6416 /*
6417  * kernelcore=size sets the amount of memory to use for allocations that
6418  * cannot be reclaimed or migrated.
6419  */
6420 static int __init cmdline_parse_kernelcore(char *p)
6421 {
6422     /* parse kernelcore=mirror */
6423     if (parse_option_str(p, "mirror")) {
6424         mirrored_kernelcore = true;
6425         return 0;
6426     }
6427 
6428     return cmdline_parse_core(p, &required_kernelcore);
6429 }
6430 
6431 /*
6432  * movablecore=size sets the amount of memory to use for allocations that
6433  * can be reclaimed or migrated.
6434  */
6435 static int __init cmdline_parse_movablecore(char *p)
6436 {
6437     return cmdline_parse_core(p, &required_movablecore);
6438 }
6439 
6440 early_param("kernelcore", cmdline_parse_kernelcore);
6441 early_param("movablecore", cmdline_parse_movablecore);
6442 
6443 #endif /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */
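
/*
 * Editor's sketch: what a boot option such as "kernelcore=512M" ends up
 * meaning.  memparse() accepts K/M/G (and larger) suffixes and returns a
 * byte count; cmdline_parse_core() above then shifts by PAGE_SHIFT to get
 * pages.  sk_memparse() is a simplified userspace stand-in, not the
 * kernel's memparse().
 */
#include <stdio.h>
#include <stdlib.h>

#define SK_PAGE_SHIFT   12      /* assume 4 KiB pages for the example */

static unsigned long long sk_memparse(const char *s)
{
    char *end;
    unsigned long long bytes = strtoull(s, &end, 0);

    switch (*end) {
    case 'G': case 'g': bytes <<= 10;   /* fall through */
    case 'M': case 'm': bytes <<= 10;   /* fall through */
    case 'K': case 'k': bytes <<= 10;   break;
    default:                            break;
    }
    return bytes;
}

static void sk_parse_demo(void)
{
    /* "kernelcore=512M" -> 536870912 bytes -> 131072 pages of 4 KiB */
    printf("%llu pages\n", sk_memparse("512M") >> SK_PAGE_SHIFT);
}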
6444 
6445 void adjust_managed_page_count(struct page *page, long count)
6446 {
6447     spin_lock(&managed_page_count_lock);
6448     page_zone(page)->managed_pages += count;
6449     totalram_pages += count;
6450 #ifdef CONFIG_HIGHMEM
6451     if (PageHighMem(page))
6452         totalhigh_pages += count;
6453 #endif
6454     spin_unlock(&managed_page_count_lock);
6455 }
6456 EXPORT_SYMBOL(adjust_managed_page_count);
6457 
6458 unsigned long free_reserved_area(void *start, void *end, int poison, char *s)
6459 {
6460     void *pos;
6461     unsigned long pages = 0;
6462 
6463     start = (void *)PAGE_ALIGN((unsigned long)start);
6464     end = (void *)((unsigned long)end & PAGE_MASK);
6465     for (pos = start; pos < end; pos += PAGE_SIZE, pages++) {
6466         if ((unsigned int)poison <= 0xFF)
6467             memset(pos, poison, PAGE_SIZE);
6468         free_reserved_page(virt_to_page(pos));
6469     }
6470 
6471     if (pages && s)
6472         pr_info("Freeing %s memory: %ldK\n",
6473             s, pages << (PAGE_SHIFT - 10));
6474 
6475     return pages;
6476 }
6477 EXPORT_SYMBOL(free_reserved_area);
6478 
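/*
 * Editor's note: a typical caller of free_reserved_area(), modelled on
 * free_initmem_default() in include/linux/mm.h, which hands the kernel's
 * .init sections back to the page allocator once boot has finished.  This
 * is a sketch for illustration; see that header for the exact definition
 * in this tree.
 */
static unsigned long sk_free_initmem(int poison)
{
    extern char __init_begin[], __init_end[];

    /* a negative poison value skips the memset() in free_reserved_area() */
    return free_reserved_area(&__init_begin, &__init_end,
                              poison, "unused kernel");
}
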
6479 #ifdef  CONFIG_HIGHMEM
6480 void free_highmem_page(struct page *page)
6481 {
6482     __free_reserved_page(page);
6483     totalram_pages++;
6484     page_zone(page)->managed_pages++;
6485     totalhigh_pages++;
6486 }
6487 #endif
6488 
6489 
6490 void __init mem_init_print_info(const char *str)
6491 {
6492     unsigned long physpages, codesize, datasize, rosize, bss_size;
6493     unsigned long init_code_size, init_data_size;
6494 
6495     physpages = get_num_physpages();
6496     codesize = _etext - _stext;
6497     datasize = _edata - _sdata;
6498     rosize = __end_rodata - __start_rodata;
6499     bss_size = __bss_stop - __bss_start;
6500     init_data_size = __init_end - __init_begin;
6501     init_code_size = _einittext - _sinittext;
6502 
6503     /*
6504      * Detect special cases and adjust section sizes accordingly:
6505      * 1) .init.* may be embedded into .data sections
6506      * 2) .init.text.* may be out of [__init_begin, __init_end],
6507      *    please refer to arch/tile/kernel/vmlinux.lds.S.
6508      * 3) .rodata.* may be embedded into .text or .data sections.
6509      */
6510 #define adj_init_size(start, end, size, pos, adj) \
6511     do { \
6512         if (start <= pos && pos < end && size > adj) \
6513             size -= adj; \
6514     } while (0)
6515 
6516     adj_init_size(__init_begin, __init_end, init_data_size,
6517              _sinittext, init_code_size);
6518     adj_init_size(_stext, _etext, codesize, _sinittext, init_code_size);
6519     adj_init_size(_sdata, _edata, datasize, __init_begin, init_data_size);
6520     adj_init_size(_stext, _etext, codesize, __start_rodata, rosize);
6521     adj_init_size(_sdata, _edata, datasize, __start_rodata, rosize);
6522 
6523 #undef  adj_init_size
6524 
6525     pr_info("Memory: %luK/%luK available (%luK kernel code, %luK rwdata, %luK rodata, %luK init, %luK bss, %luK reserved, %luK cma-reserved"
6526 #ifdef  CONFIG_HIGHMEM
6527         ", %luK highmem"
6528 #endif
6529         "%s%s)\n",
6530         nr_free_pages() << (PAGE_SHIFT - 10),
6531         physpages << (PAGE_SHIFT - 10),
6532         codesize >> 10, datasize >> 10, rosize >> 10,
6533         (init_data_size + init_code_size) >> 10, bss_size >> 10,
6534         (physpages - totalram_pages - totalcma_pages) << (PAGE_SHIFT - 10),
6535         totalcma_pages << (PAGE_SHIFT - 10),
6536 #ifdef  CONFIG_HIGHMEM
6537         totalhigh_pages << (PAGE_SHIFT - 10),
6538 #endif
6539         str ? ", " : "", str ? str : "");
6540 }
6541 
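/*
 * Editor's sketch: the adj_init_size() adjustment above, worked through with
 * invented addresses.  If .init.text happens to live inside [_stext, _etext],
 * its bytes are already counted in codesize, so they are subtracted once to
 * avoid reporting them under both "kernel code" and "init" in the banner
 * printed by mem_init_print_info().
 */
static unsigned long sk_adjusted_codesize(void)
{
    unsigned long stext     = 0x1000000, etext = 0x1800000; /* 8 MiB of text */
    unsigned long sinittext = 0x1600000;                    /* falls inside text */
    unsigned long init_code = 0x100000;                     /* 1 MiB of init code */
    unsigned long codesize  = etext - stext;                /* 8 MiB */

    /* same test as adj_init_size(_stext, _etext, codesize, _sinittext, ...) */
    if (stext <= sinittext && sinittext < etext && codesize > init_code)
        codesize -= init_code;                              /* 7 MiB reported */

    return codesize;
}
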
6542 /**
6543  * set_dma_reserve - set the specified number of pages reserved in the first zone
6544  * @new_dma_reserve: The number of pages to mark reserved
6545  *
6546  * The per-cpu batchsize and zone watermarks are determined by managed_pages.
6547  * In the DMA zone, a significant percentage may be consumed by kernel image
6548  * and other unfreeable allocations which can skew the watermarks badly. This
6549  * function may optionally be used to account for unfreeable pages in the
6550  * first zone (e.g., ZONE_DMA). The effect will be lower watermarks and
6551  * smaller per-cpu batchsize.
6552  */
6553 void __init set_dma_reserve(unsigned long new_dma_reserve)
6554 {
6555     dma_reserve = new_dma_reserve;
6556 }
6557 
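/*
 * Editor's sketch: a hypothetical architecture setup path using
 * set_dma_reserve() as described above, accounting for pages in the first
 * zone that will never be freed (kernel image, firmware tables, ...) before
 * the zone watermarks are computed.  The function name and the calculation
 * are illustrative only, not taken from any real architecture.
 */
static void __init sk_arch_reserve_dma_pages(unsigned long dma_zone_pages,
                                             unsigned long freeable_dma_pages)
{
    if (dma_zone_pages > freeable_dma_pages)
        set_dma_reserve(dma_zone_pages - freeable_dma_pages);
}
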
6558 void __init free_area_init(unsigned long *zones_size)
6559 {
6560     free_area_init_node(0, zones_size,
6561             __pa(PAGE_OFFSET) >> PAGE_SHIFT, NULL);
6562 }
6563 
6564 static int page_alloc_cpu_dead(unsigned int cpu)
6565 {
6566 
6567     lru_add_drain_cpu(cpu);
6568     drain_pages(cpu);
6569 
6570     /*
6571      * Spill the event counters of the dead processor
6572      * into the current processor's event counters.
6573      * This artificially elevates the count of the current
6574      * processor.
6575      */
6576     vm_events_fold_cpu(cpu);
6577 
6578     /*
6579      * Zero the differential counters of the dead processor
6580      * so that the vm statistics are consistent.
6581      *
6582      * This is only okay since the processor is dead and cannot
6583      * race with what we are doing.
6584      */
6585     cpu_vm_stats_fold(cpu);
6586     return 0;
6587 }
6588 
6589 void __init page_alloc_init(void)
6590 {
6591     int ret;
6592 
6593     ret = cpuhp_setup_state_nocalls(CPUHP_PAGE_ALLOC_DEAD,
6594                     "mm/page_alloc:dead", NULL,
6595                     page_alloc_cpu_dead);
6596     WARN_ON(ret < 0);
6597 }
6598 
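/*
 * Editor's sketch: the same CPU-hotplug registration pattern as
 * page_alloc_init() above, applied to a hypothetical subsystem that keeps
 * per-cpu scratch data.  The dynamic prepare-stage state CPUHP_BP_PREPARE_DYN
 * is an assumption about this tree; the subsystem name, callback and data are
 * invented for illustration.
 */
static int sk_subsys_cpu_dead(unsigned int cpu)
{
    /* fold or discard the dead CPU's per-cpu scratch data here */
    return 0;
}

static int __init sk_subsys_init(void)
{
    int ret;

    /* NULL startup callback: nothing to do when a CPU comes up, only on death */
    ret = cpuhp_setup_state_nocalls(CPUHP_BP_PREPARE_DYN,
                                    "sk_subsys:dead", NULL,
                                    sk_subsys_cpu_dead);
    return ret < 0 ? ret : 0;
}
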
6599 /*
6600  * calculate_totalreserve_pages - called when sysctl_lowmem_reserve_ratio
6601  *  or min_free_kbytes changes.
6602  */
6603 static void calculate_totalreserve_pages(void)
6604 {
6605     st