// SPDX-License-Identifier: GPL-2.0
/*
 * sparse memory mappings.
 */
#include <linux/mm.h>
#include <linux/slab.h>
#include <linux/mmzone.h>
#include <linux/memblock.h>
#include <linux/compiler.h>
#include <linux/highmem.h>
#include <linux/export.h>
#include <linux/spinlock.h>
#include <linux/vmalloc.h>
#include <linux/swap.h>
#include <linux/swapops.h>
#include <linux/bootmem_info.h>

#include "internal.h"
#include <asm/dma.h>

/*
 * Permanent SPARSEMEM data:
 *
 * 1) mem_section   - memory sections, mem_map's for valid memory
 */
#ifdef CONFIG_SPARSEMEM_EXTREME
struct mem_section **mem_section;
#else
struct mem_section mem_section[NR_SECTION_ROOTS][SECTIONS_PER_ROOT]
    ____cacheline_internodealigned_in_smp;
#endif
EXPORT_SYMBOL(mem_section);

#ifdef NODE_NOT_IN_PAGE_FLAGS
/*
 * If we did not store the node number in the page then we have to
 * do a lookup in the section_to_node_table in order to find which
 * node the page belongs to.
 */
#if MAX_NUMNODES <= 256
static u8 section_to_node_table[NR_MEM_SECTIONS] __cacheline_aligned;
#else
static u16 section_to_node_table[NR_MEM_SECTIONS] __cacheline_aligned;
#endif

int page_to_nid(const struct page *page)
{
    return section_to_node_table[page_to_section(page)];
}
EXPORT_SYMBOL(page_to_nid);

static void set_section_nid(unsigned long section_nr, int nid)
{
    section_to_node_table[section_nr] = nid;
}
#else /* !NODE_NOT_IN_PAGE_FLAGS */
static inline void set_section_nid(unsigned long section_nr, int nid)
{
}
#endif

#ifdef CONFIG_SPARSEMEM_EXTREME
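/*
 * Allocate one root's worth of mem_section structures: from the slab once
 * it is available, otherwise from memblock (panicking if that fails).
 */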
static noinline struct mem_section __ref *sparse_index_alloc(int nid)
{
    struct mem_section *section = NULL;
    unsigned long array_size = SECTIONS_PER_ROOT *
                   sizeof(struct mem_section);

    if (slab_is_available()) {
        section = kzalloc_node(array_size, GFP_KERNEL, nid);
    } else {
        section = memblock_alloc_node(array_size, SMP_CACHE_BYTES,
                          nid);
        if (!section)
            panic("%s: Failed to allocate %lu bytes nid=%d\n",
                  __func__, array_size, nid);
    }

    return section;
}

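/*
 * Make sure the root array slot covering @section_nr is populated,
 * allocating it on @nid if it does not exist yet.
 */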
static int __meminit sparse_index_init(unsigned long section_nr, int nid)
{
    unsigned long root = SECTION_NR_TO_ROOT(section_nr);
    struct mem_section *section;

    /*
     * An existing section is possible in the sub-section hotplug
     * case. First hot-add instantiates, follow-on hot-add reuses
     * the existing section.
     *
     * The mem_hotplug_lock resolves the apparent race below.
     */
    if (mem_section[root])
        return 0;

    section = sparse_index_alloc(nid);
    if (!section)
        return -ENOMEM;

    mem_section[root] = section;

    return 0;
}
#else /* !SPARSEMEM_EXTREME */
static inline int sparse_index_init(unsigned long section_nr, int nid)
{
    return 0;
}
#endif

/*
 * During early boot, before section_mem_map is used for an actual
 * mem_map, we use section_mem_map to store the section's NUMA
 * node.  This keeps us from having to use another data structure.  The
 * node information is cleared just before we store the real mem_map.
 */
static inline unsigned long sparse_encode_early_nid(int nid)
{
    return ((unsigned long)nid << SECTION_NID_SHIFT);
}

static inline int sparse_early_nid(struct mem_section *section)
{
    return (section->section_mem_map >> SECTION_NID_SHIFT);
}

/* Validate the physical addressing limitations of the model */
static void __meminit mminit_validate_memmodel_limits(unsigned long *start_pfn,
                        unsigned long *end_pfn)
{
    unsigned long max_sparsemem_pfn = 1UL << (MAX_PHYSMEM_BITS-PAGE_SHIFT);

    /*
     * Sanity checks - do not allow an architecture to pass
     * in larger pfns than the maximum scope of sparsemem:
     */
    if (*start_pfn > max_sparsemem_pfn) {
        mminit_dprintk(MMINIT_WARNING, "pfnvalidation",
            "Start of range %lu -> %lu exceeds SPARSEMEM max %lu\n",
            *start_pfn, *end_pfn, max_sparsemem_pfn);
        WARN_ON_ONCE(1);
        *start_pfn = max_sparsemem_pfn;
        *end_pfn = max_sparsemem_pfn;
    } else if (*end_pfn > max_sparsemem_pfn) {
        mminit_dprintk(MMINIT_WARNING, "pfnvalidation",
            "End of range %lu -> %lu exceeds SPARSEMEM max %lu\n",
            *start_pfn, *end_pfn, max_sparsemem_pfn);
        WARN_ON_ONCE(1);
        *end_pfn = max_sparsemem_pfn;
    }
}

/*
 * There are a number of times that we loop over NR_MEM_SECTIONS,
 * looking for section_present() on each.  But, when we have very
 * large physical address spaces, NR_MEM_SECTIONS can also be
 * very large which makes the loops quite long.
 *
 * Keeping track of this gives us an easy way to break out of
 * those loops early.
 */
unsigned long __highest_present_section_nr;
static void __section_mark_present(struct mem_section *ms,
        unsigned long section_nr)
{
    if (section_nr > __highest_present_section_nr)
        __highest_present_section_nr = section_nr;

    ms->section_mem_map |= SECTION_MARKED_PRESENT;
}

#define for_each_present_section_nr(start, section_nr)      \
    for (section_nr = next_present_section_nr(start-1); \
         ((section_nr != -1) &&             \
          (section_nr <= __highest_present_section_nr));    \
         section_nr = next_present_section_nr(section_nr))

static inline unsigned long first_present_section_nr(void)
{
    return next_present_section_nr(-1);
}

#ifdef CONFIG_SPARSEMEM_VMEMMAP
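/* Set the bits covering [pfn, pfn + nr_pages) in a subsection bitmap. */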
static void subsection_mask_set(unsigned long *map, unsigned long pfn,
        unsigned long nr_pages)
{
    int idx = subsection_map_index(pfn);
    int end = subsection_map_index(pfn + nr_pages - 1);

    bitmap_set(map, idx, end - idx + 1);
}

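/*
 * Mark the subsections backing [pfn, pfn + nr_pages) as present in the
 * subsection_map of every section that the range touches.
 */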
void __init subsection_map_init(unsigned long pfn, unsigned long nr_pages)
{
    int end_sec = pfn_to_section_nr(pfn + nr_pages - 1);
    unsigned long nr, start_sec = pfn_to_section_nr(pfn);

    if (!nr_pages)
        return;

    for (nr = start_sec; nr <= end_sec; nr++) {
        struct mem_section *ms;
        unsigned long pfns;

        pfns = min(nr_pages, PAGES_PER_SECTION
                - (pfn & ~PAGE_SECTION_MASK));
        ms = __nr_to_section(nr);
        subsection_mask_set(ms->usage->subsection_map, pfn, pfns);

        pr_debug("%s: sec: %lu pfns: %lu set(%d, %d)\n", __func__, nr,
                pfns, subsection_map_index(pfn),
                subsection_map_index(pfn + pfns - 1));

        pfn += pfns;
        nr_pages -= pfns;
    }
}
#else
void __init subsection_map_init(unsigned long pfn, unsigned long nr_pages)
{
}
#endif

/* Record a memory area against a node. */
static void __init memory_present(int nid, unsigned long start, unsigned long end)
{
    unsigned long pfn;

#ifdef CONFIG_SPARSEMEM_EXTREME
    if (unlikely(!mem_section)) {
        unsigned long size, align;

        size = sizeof(struct mem_section *) * NR_SECTION_ROOTS;
        align = 1 << (INTERNODE_CACHE_SHIFT);
        mem_section = memblock_alloc(size, align);
        if (!mem_section)
            panic("%s: Failed to allocate %lu bytes align=0x%lx\n",
                  __func__, size, align);
    }
#endif

    start &= PAGE_SECTION_MASK;
    mminit_validate_memmodel_limits(&start, &end);
    for (pfn = start; pfn < end; pfn += PAGES_PER_SECTION) {
        unsigned long section = pfn_to_section_nr(pfn);
        struct mem_section *ms;

        sparse_index_init(section, nid);
        set_section_nid(section, nid);

        ms = __nr_to_section(section);
        if (!ms->section_mem_map) {
            ms->section_mem_map = sparse_encode_early_nid(nid) |
                            SECTION_IS_ONLINE;
            __section_mark_present(ms, section);
        }
    }
}

/*
 * Mark all memblocks as present using memory_present().
 * This is a convenience function useful for marking all of the system's
 * memory as present during initialization.
 */
static void __init memblocks_present(void)
{
    unsigned long start, end;
    int i, nid;

    for_each_mem_pfn_range(i, MAX_NUMNODES, &start, &end, &nid)
        memory_present(nid, start, end);
}

/*
 * Subtle, we encode the real pfn into the mem_map such that
 * the identity pfn - section_mem_map will return the actual
 * physical page frame number.
 */
static unsigned long sparse_encode_mem_map(struct page *mem_map, unsigned long pnum)
{
    unsigned long coded_mem_map =
        (unsigned long)(mem_map - (section_nr_to_pfn(pnum)));
    BUILD_BUG_ON(SECTION_MAP_LAST_BIT > PFN_SECTION_SHIFT);
    BUG_ON(coded_mem_map & ~SECTION_MAP_MASK);
    return coded_mem_map;
}

#ifdef CONFIG_MEMORY_HOTPLUG
/*
 * Decode mem_map from the coded memmap
 */
struct page *sparse_decode_mem_map(unsigned long coded_mem_map, unsigned long pnum)
{
    /* mask off the extra low bits of information */
    coded_mem_map &= SECTION_MAP_MASK;
    return ((struct page *)coded_mem_map) + section_nr_to_pfn(pnum);
}
#endif /* CONFIG_MEMORY_HOTPLUG */

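/* Hook a section's encoded mem_map and usage data into its mem_section. */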
static void __meminit sparse_init_one_section(struct mem_section *ms,
        unsigned long pnum, struct page *mem_map,
        struct mem_section_usage *usage, unsigned long flags)
{
    ms->section_mem_map &= ~SECTION_MAP_MASK;
    ms->section_mem_map |= sparse_encode_mem_map(mem_map, pnum)
        | SECTION_HAS_MEM_MAP | flags;
    ms->usage = usage;
}

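/* Size, in bytes, of one section's pageblock flags bitmap. */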
static unsigned long usemap_size(void)
{
    return BITS_TO_LONGS(SECTION_BLOCKFLAGS_BITS) * sizeof(unsigned long);
}

size_t mem_section_usage_size(void)
{
    return sizeof(struct mem_section_usage) + usemap_size();
}

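/* Physical address of a node's pglist_data. */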
static inline phys_addr_t pgdat_to_phys(struct pglist_data *pgdat)
{
#ifndef CONFIG_NUMA
    VM_BUG_ON(pgdat != &contig_page_data);
    return __pa_symbol(&contig_page_data);
#else
    return __pa(pgdat);
#endif
}

#ifdef CONFIG_MEMORY_HOTREMOVE
static struct mem_section_usage * __init
sparse_early_usemaps_alloc_pgdat_section(struct pglist_data *pgdat,
                     unsigned long size)
{
    struct mem_section_usage *usage;
    unsigned long goal, limit;
    int nid;
    /*
     * A page may contain usemaps for other sections preventing the
     * page being freed and making a section unremovable while
     * other sections referencing the usemap remain active. Similarly,
     * a pgdat can prevent a section being removed. If section A
     * contains a pgdat and section B contains the usemap, both
     * sections become inter-dependent. This allocates usemaps
     * from the same section as the pgdat where possible to avoid
     * this problem.
     */
    goal = pgdat_to_phys(pgdat) & (PAGE_SECTION_MASK << PAGE_SHIFT);
    limit = goal + (1UL << PA_SECTION_SHIFT);
    nid = early_pfn_to_nid(goal >> PAGE_SHIFT);
again:
    usage = memblock_alloc_try_nid(size, SMP_CACHE_BYTES, goal, limit, nid);
    if (!usage && limit) {
        limit = 0;
        goto again;
    }
    return usage;
}

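/*
 * Check whether the usemap was placed in the same section as the node's
 * pgdat; if not, report the cross-section dependency once per pairing.
 */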
static void __init check_usemap_section_nr(int nid,
        struct mem_section_usage *usage)
{
    unsigned long usemap_snr, pgdat_snr;
    static unsigned long old_usemap_snr;
    static unsigned long old_pgdat_snr;
    struct pglist_data *pgdat = NODE_DATA(nid);
    int usemap_nid;

    /* First call */
    if (!old_usemap_snr) {
        old_usemap_snr = NR_MEM_SECTIONS;
        old_pgdat_snr = NR_MEM_SECTIONS;
    }

    usemap_snr = pfn_to_section_nr(__pa(usage) >> PAGE_SHIFT);
    pgdat_snr = pfn_to_section_nr(pgdat_to_phys(pgdat) >> PAGE_SHIFT);
    if (usemap_snr == pgdat_snr)
        return;

    if (old_usemap_snr == usemap_snr && old_pgdat_snr == pgdat_snr)
        /* skip redundant message */
        return;

    old_usemap_snr = usemap_snr;
    old_pgdat_snr = pgdat_snr;

    usemap_nid = sparse_early_nid(__nr_to_section(usemap_snr));
    if (usemap_nid != nid) {
        pr_info("node %d must be removed before remove section %ld\n",
            nid, usemap_snr);
        return;
    }
    /*
     * There is a circular dependency.
     * Some platforms allow un-removable sections because they will just
     * gather other removable sections for dynamic partitioning.
     * Just report the un-removable section's number here.
     */
    pr_info("Section %ld and %ld (node %d) have a circular dependency on usemap and pgdat allocations\n",
        usemap_snr, pgdat_snr, nid);
}
#else
static struct mem_section_usage * __init
sparse_early_usemaps_alloc_pgdat_section(struct pglist_data *pgdat,
                     unsigned long size)
{
    return memblock_alloc_node(size, SMP_CACHE_BYTES, pgdat->node_id);
}

static void __init check_usemap_section_nr(int nid,
        struct mem_section_usage *usage)
{
}
#endif /* CONFIG_MEMORY_HOTREMOVE */

#ifdef CONFIG_SPARSEMEM_VMEMMAP
static unsigned long __init section_map_size(void)
{
    return ALIGN(sizeof(struct page) * PAGES_PER_SECTION, PMD_SIZE);
}

#else
static unsigned long __init section_map_size(void)
{
    return PAGE_ALIGN(sizeof(struct page) * PAGES_PER_SECTION);
}

struct page __init *__populate_section_memmap(unsigned long pfn,
        unsigned long nr_pages, int nid, struct vmem_altmap *altmap,
        struct dev_pagemap *pgmap)
{
    unsigned long size = section_map_size();
    struct page *map = sparse_buffer_alloc(size);
    phys_addr_t addr = __pa(MAX_DMA_ADDRESS);

    if (map)
        return map;

    map = memmap_alloc(size, size, addr, nid, false);
    if (!map)
        panic("%s: Failed to allocate %lu bytes align=0x%lx nid=%d from=%pa\n",
              __func__, size, PAGE_SIZE, nid, &addr);

    return map;
}
#endif /* !CONFIG_SPARSEMEM_VMEMMAP */

static void *sparsemap_buf __meminitdata;
static void *sparsemap_buf_end __meminitdata;

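/* Hand @size bytes at the current start of the sparsemap buffer back to memblock. */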
static inline void __meminit sparse_buffer_free(unsigned long size)
{
    WARN_ON(!sparsemap_buf || size == 0);
    memblock_free(sparsemap_buf, size);
}

static void __init sparse_buffer_init(unsigned long size, int nid)
{
    phys_addr_t addr = __pa(MAX_DMA_ADDRESS);
    WARN_ON(sparsemap_buf); /* forgot to call sparse_buffer_fini()? */
    /*
     * Pre-allocated buffer is mainly used by __populate_section_memmap
     * and we want it to be properly aligned to the section size - this is
     * especially the case for VMEMMAP which maps memmap to PMDs
     */
    sparsemap_buf = memmap_alloc(size, section_map_size(), addr, nid, true);
    sparsemap_buf_end = sparsemap_buf + size;
}

static void __init sparse_buffer_fini(void)
{
    unsigned long size = sparsemap_buf_end - sparsemap_buf;

    if (sparsemap_buf && size > 0)
        sparse_buffer_free(size);
    sparsemap_buf = NULL;
}

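/*
 * Carve a @size-aligned chunk out of the pre-allocated sparsemap buffer.
 * Any alignment gap is returned to memblock, and NULL is returned once the
 * buffer is exhausted so that callers fall back to a regular allocation.
 */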
void * __meminit sparse_buffer_alloc(unsigned long size)
{
    void *ptr = NULL;

    if (sparsemap_buf) {
        ptr = (void *) roundup((unsigned long)sparsemap_buf, size);
        if (ptr + size > sparsemap_buf_end)
            ptr = NULL;
        else {
            /* Free redundant aligned space */
            if ((unsigned long)(ptr - sparsemap_buf) > 0)
                sparse_buffer_free((unsigned long)(ptr - sparsemap_buf));
            sparsemap_buf = ptr + size;
        }
    }
    return ptr;
}

void __weak __meminit vmemmap_populate_print_last(void)
{
}

/*
 * Initialize sparse on a specific node. The node spans [pnum_begin, pnum_end),
 * and the number of present sections in this node is map_count.
 */
static void __init sparse_init_nid(int nid, unsigned long pnum_begin,
                   unsigned long pnum_end,
                   unsigned long map_count)
{
    struct mem_section_usage *usage;
    unsigned long pnum;
    struct page *map;

    usage = sparse_early_usemaps_alloc_pgdat_section(NODE_DATA(nid),
            mem_section_usage_size() * map_count);
    if (!usage) {
        pr_err("%s: node[%d] usemap allocation failed", __func__, nid);
        goto failed;
    }
    sparse_buffer_init(map_count * section_map_size(), nid);
    for_each_present_section_nr(pnum_begin, pnum) {
        unsigned long pfn = section_nr_to_pfn(pnum);

        if (pnum >= pnum_end)
            break;

        map = __populate_section_memmap(pfn, PAGES_PER_SECTION,
                nid, NULL, NULL);
        if (!map) {
            pr_err("%s: node[%d] memory map backing failed. Some memory will not be available.",
                   __func__, nid);
            pnum_begin = pnum;
            sparse_buffer_fini();
            goto failed;
        }
        check_usemap_section_nr(nid, usage);
        sparse_init_one_section(__nr_to_section(pnum), pnum, map, usage,
                SECTION_IS_EARLY);
        usage = (void *) usage + mem_section_usage_size();
    }
    sparse_buffer_fini();
    return;
failed:
    /* We failed to allocate, mark all the following pnums as not present */
    for_each_present_section_nr(pnum_begin, pnum) {
        struct mem_section *ms;

        if (pnum >= pnum_end)
            break;
        ms = __nr_to_section(pnum);
        ms->section_mem_map = 0;
    }
}

/*
 * Allocate the accumulated non-linear sections, allocate a mem_map
 * for each and record the physical to section mapping.
 */
void __init sparse_init(void)
{
    unsigned long pnum_end, pnum_begin, map_count = 1;
    int nid_begin;

    memblocks_present();

    pnum_begin = first_present_section_nr();
    nid_begin = sparse_early_nid(__nr_to_section(pnum_begin));

    /* Setup pageblock_order for HUGETLB_PAGE_SIZE_VARIABLE */
    set_pageblock_order();

    for_each_present_section_nr(pnum_begin + 1, pnum_end) {
        int nid = sparse_early_nid(__nr_to_section(pnum_end));

        if (nid == nid_begin) {
            map_count++;
            continue;
        }
        /* Init node with sections in range [pnum_begin, pnum_end) */
        sparse_init_nid(nid_begin, pnum_begin, pnum_end, map_count);
        nid_begin = nid;
        pnum_begin = pnum_end;
        map_count = 1;
    }
    /* cover the last node */
    sparse_init_nid(nid_begin, pnum_begin, pnum_end, map_count);
    vmemmap_populate_print_last();
}

#ifdef CONFIG_MEMORY_HOTPLUG

/* Mark all memory sections within the pfn range as online */
void online_mem_sections(unsigned long start_pfn, unsigned long end_pfn)
{
    unsigned long pfn;

    for (pfn = start_pfn; pfn < end_pfn; pfn += PAGES_PER_SECTION) {
        unsigned long section_nr = pfn_to_section_nr(pfn);
        struct mem_section *ms;

        /* onlining code should never touch invalid ranges */
        if (WARN_ON(!valid_section_nr(section_nr)))
            continue;

        ms = __nr_to_section(section_nr);
        ms->section_mem_map |= SECTION_IS_ONLINE;
    }
}

/* Mark all memory sections within the pfn range as offline */
void offline_mem_sections(unsigned long start_pfn, unsigned long end_pfn)
{
    unsigned long pfn;

    for (pfn = start_pfn; pfn < end_pfn; pfn += PAGES_PER_SECTION) {
        unsigned long section_nr = pfn_to_section_nr(pfn);
        struct mem_section *ms;

        /*
         * TODO this needs some double checking. Offlining code makes
         * sure to check pfn_valid but those checks might be just bogus
         */
        if (WARN_ON(!valid_section_nr(section_nr)))
            continue;

        ms = __nr_to_section(section_nr);
        ms->section_mem_map &= ~SECTION_IS_ONLINE;
    }
}

#ifdef CONFIG_SPARSEMEM_VMEMMAP
static struct page * __meminit populate_section_memmap(unsigned long pfn,
        unsigned long nr_pages, int nid, struct vmem_altmap *altmap,
        struct dev_pagemap *pgmap)
{
    return __populate_section_memmap(pfn, nr_pages, nid, altmap, pgmap);
}

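/* Tear down the vmemmap backing the memmap of [pfn, pfn + nr_pages). */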
static void depopulate_section_memmap(unsigned long pfn, unsigned long nr_pages,
        struct vmem_altmap *altmap)
{
    unsigned long start = (unsigned long) pfn_to_page(pfn);
    unsigned long end = start + nr_pages * sizeof(struct page);

    vmemmap_free(start, end, altmap);
}
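/* Free a boot-time, vmemmap-backed memmap covering one full section. */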
static void free_map_bootmem(struct page *memmap)
{
    unsigned long start = (unsigned long)memmap;
    unsigned long end = (unsigned long)(memmap + PAGES_PER_SECTION);

    vmemmap_free(start, end, NULL);
}

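/*
 * Clear the subsection_map bits for [pfn, pfn + nr_pages).  Returns -EINVAL
 * (with a warning) if the range was not active, i.e. already deactivated.
 */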
static int clear_subsection_map(unsigned long pfn, unsigned long nr_pages)
{
    DECLARE_BITMAP(map, SUBSECTIONS_PER_SECTION) = { 0 };
    DECLARE_BITMAP(tmp, SUBSECTIONS_PER_SECTION) = { 0 };
    struct mem_section *ms = __pfn_to_section(pfn);
    unsigned long *subsection_map = ms->usage
        ? &ms->usage->subsection_map[0] : NULL;

    subsection_mask_set(map, pfn, nr_pages);
    if (subsection_map)
        bitmap_and(tmp, map, subsection_map, SUBSECTIONS_PER_SECTION);

    if (WARN(!subsection_map || !bitmap_equal(tmp, map, SUBSECTIONS_PER_SECTION),
                "section already deactivated (%#lx + %ld)\n",
                pfn, nr_pages))
        return -EINVAL;

    bitmap_xor(subsection_map, map, subsection_map, SUBSECTIONS_PER_SECTION);
    return 0;
}

static bool is_subsection_map_empty(struct mem_section *ms)
{
    return bitmap_empty(&ms->usage->subsection_map[0],
                SUBSECTIONS_PER_SECTION);
}

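/*
 * Set the subsection_map bits for [pfn, pfn + nr_pages).  Returns -EINVAL
 * for an empty range and -EEXIST if any part of it is already active.
 */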
static int fill_subsection_map(unsigned long pfn, unsigned long nr_pages)
{
    struct mem_section *ms = __pfn_to_section(pfn);
    DECLARE_BITMAP(map, SUBSECTIONS_PER_SECTION) = { 0 };
    unsigned long *subsection_map;
    int rc = 0;

    subsection_mask_set(map, pfn, nr_pages);

    subsection_map = &ms->usage->subsection_map[0];

    if (bitmap_empty(map, SUBSECTIONS_PER_SECTION))
        rc = -EINVAL;
    else if (bitmap_intersects(map, subsection_map, SUBSECTIONS_PER_SECTION))
        rc = -EEXIST;
    else
        bitmap_or(subsection_map, map, subsection_map,
                SUBSECTIONS_PER_SECTION);

    return rc;
}
#else
struct page * __meminit populate_section_memmap(unsigned long pfn,
        unsigned long nr_pages, int nid, struct vmem_altmap *altmap,
        struct dev_pagemap *pgmap)
{
    return kvmalloc_node(array_size(sizeof(struct page),
                    PAGES_PER_SECTION), GFP_KERNEL, nid);
}

static void depopulate_section_memmap(unsigned long pfn, unsigned long nr_pages,
        struct vmem_altmap *altmap)
{
    kvfree(pfn_to_page(pfn));
}

static void free_map_bootmem(struct page *memmap)
{
    unsigned long maps_section_nr, removing_section_nr, i;
    unsigned long magic, nr_pages;
    struct page *page = virt_to_page(memmap);

    nr_pages = PAGE_ALIGN(PAGES_PER_SECTION * sizeof(struct page))
        >> PAGE_SHIFT;

    for (i = 0; i < nr_pages; i++, page++) {
        magic = page->index;

        BUG_ON(magic == NODE_INFO);

        maps_section_nr = pfn_to_section_nr(page_to_pfn(page));
        removing_section_nr = page_private(page);

        /*
         * When this function is called, the section being removed is in
         * the logically offlined state, meaning all of its pages have
         * been isolated from the page allocator. If the section's memmap
         * is placed in that same section, it must not be freed here: the
         * page allocator could hand it out again even though the memory
         * is about to be removed physically.
         */
        if (maps_section_nr != removing_section_nr)
            put_page_bootmem(page);
    }
}

static int clear_subsection_map(unsigned long pfn, unsigned long nr_pages)
{
    return 0;
}

static bool is_subsection_map_empty(struct mem_section *ms)
{
    return true;
}

static int fill_subsection_map(unsigned long pfn, unsigned long nr_pages)
{
    return 0;
}
#endif /* CONFIG_SPARSEMEM_VMEMMAP */

/*
 * To deactivate a memory region, there are 3 cases to handle across
 * two configurations (SPARSEMEM_VMEMMAP={y,n}):
 *
 * 1. deactivation of a partial hot-added section (only possible in
 *    the SPARSEMEM_VMEMMAP=y case).
 *      a) section was present at memory init.
 *      b) section was hot-added post memory init.
 * 2. deactivation of a complete hot-added section.
 * 3. deactivation of a complete section from memory init.
 *
 * For 1, when the subsection_map is not empty we will not be freeing the
 * usage map, but still need to free the vmemmap range.
 *
 * For 2 and 3, the SPARSEMEM_VMEMMAP={y,n} cases are unified.
 */
static void section_deactivate(unsigned long pfn, unsigned long nr_pages,
        struct vmem_altmap *altmap)
{
    struct mem_section *ms = __pfn_to_section(pfn);
    bool section_is_early = early_section(ms);
    struct page *memmap = NULL;
    bool empty;

    if (clear_subsection_map(pfn, nr_pages))
        return;

    empty = is_subsection_map_empty(ms);
    if (empty) {
        unsigned long section_nr = pfn_to_section_nr(pfn);

        /*
         * When removing an early section, the usage map is kept (as the
         * usage maps of other sections fall into the same page). It
         * will be re-used when re-adding the section - which is then no
         * longer an early section. If the usage map is PageReserved, it
         * was allocated during boot.
         */
        if (!PageReserved(virt_to_page(ms->usage))) {
            kfree(ms->usage);
            ms->usage = NULL;
        }
        memmap = sparse_decode_mem_map(ms->section_mem_map, section_nr);
        /*
         * Mark the section invalid so that valid_section()
         * returns false. This prevents code from dereferencing
         * the ms->usage array.
         */
        ms->section_mem_map &= ~SECTION_HAS_MEM_MAP;
    }

    /*
     * The memmap of early sections is always fully populated. See
     * section_activate() and pfn_valid().
     */
    if (!section_is_early)
        depopulate_section_memmap(pfn, nr_pages, altmap);
    else if (memmap)
        free_map_bootmem(memmap);

    if (empty)
        ms->section_mem_map = (unsigned long)NULL;
}

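/*
 * Allocate the usage structure and memmap for the (sub)section backing
 * [pfn, pfn + nr_pages) and mark the requested subsections as active.
 * Returns the new memmap, the existing one for a partially populated
 * early section, or an ERR_PTR() on failure.
 */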
static struct page * __meminit section_activate(int nid, unsigned long pfn,
        unsigned long nr_pages, struct vmem_altmap *altmap,
        struct dev_pagemap *pgmap)
{
    struct mem_section *ms = __pfn_to_section(pfn);
    struct mem_section_usage *usage = NULL;
    struct page *memmap;
    int rc = 0;

    if (!ms->usage) {
        usage = kzalloc(mem_section_usage_size(), GFP_KERNEL);
        if (!usage)
            return ERR_PTR(-ENOMEM);
        ms->usage = usage;
    }

    rc = fill_subsection_map(pfn, nr_pages);
    if (rc) {
        if (usage)
            ms->usage = NULL;
        kfree(usage);
        return ERR_PTR(rc);
    }

    /*
     * The early init code does not consider partially populated
     * initial sections, it simply assumes that memory will never be
     * referenced.  If we hot-add memory into such a section then we
     * do not need to populate the memmap and can simply reuse what
     * is already there.
     */
    if (nr_pages < PAGES_PER_SECTION && early_section(ms))
        return pfn_to_page(pfn);

    memmap = populate_section_memmap(pfn, nr_pages, nid, altmap, pgmap);
    if (!memmap) {
        section_deactivate(pfn, nr_pages, altmap);
        return ERR_PTR(-ENOMEM);
    }

    return memmap;
}

/**
 * sparse_add_section - add a memory section, or populate an existing one
 * @nid: The node to add section on
 * @start_pfn: start pfn of the memory range
 * @nr_pages: number of pfns to add in the section
 * @altmap: alternate pfns to allocate the memmap backing store
 * @pgmap: alternate compound page geometry for devmap mappings
 *
 * This is only intended for hotplug.
 *
 * Note that only VMEMMAP supports sub-section aligned hotplug,
 * the proper alignment and size are gated by check_pfn_span().
 *
 * Return:
 * * 0      - On success.
 * * -EEXIST    - Section is already present.
 * * -ENOMEM    - Out of memory.
 */
int __meminit sparse_add_section(int nid, unsigned long start_pfn,
        unsigned long nr_pages, struct vmem_altmap *altmap,
        struct dev_pagemap *pgmap)
{
    unsigned long section_nr = pfn_to_section_nr(start_pfn);
    struct mem_section *ms;
    struct page *memmap;
    int ret;

    ret = sparse_index_init(section_nr, nid);
    if (ret < 0)
        return ret;

    memmap = section_activate(nid, start_pfn, nr_pages, altmap, pgmap);
    if (IS_ERR(memmap))
        return PTR_ERR(memmap);

    /*
     * Poison uninitialized struct pages in order to catch invalid flags
     * combinations.
     */
    page_init_poison(memmap, sizeof(struct page) * nr_pages);

    ms = __nr_to_section(section_nr);
    set_section_nid(section_nr, nid);
    __section_mark_present(ms, section_nr);

    /* Align memmap to section boundary in the subsection case */
    if (section_nr_to_pfn(section_nr) != start_pfn)
        memmap = pfn_to_page(section_nr_to_pfn(section_nr));
    sparse_init_one_section(ms, section_nr, memmap, ms->usage, 0);

    return 0;
}

void sparse_remove_section(struct mem_section *ms, unsigned long pfn,
        unsigned long nr_pages, unsigned long map_offset,
        struct vmem_altmap *altmap)
{
    clear_hwpoisoned_pages(pfn_to_page(pfn) + map_offset,
            nr_pages - map_offset);
    section_deactivate(pfn, nr_pages, altmap);
}
#endif /* CONFIG_MEMORY_HOTPLUG */