// SPDX-License-Identifier: GPL-2.0-only
/* Common code for 32 and 64-bit NUMA */
#include <linux/acpi.h>
#include <linux/kernel.h>
#include <linux/mm.h>
#include <linux/string.h>
#include <linux/init.h>
#include <linux/memblock.h>
#include <linux/mmzone.h>
#include <linux/ctype.h>
#include <linux/nodemask.h>
#include <linux/sched.h>
#include <linux/topology.h>

#include <asm/e820/api.h>
#include <asm/proto.h>
#include <asm/dma.h>
#include <asm/amd_nb.h>

#include "numa_internal.h"

int numa_off;
nodemask_t numa_nodes_parsed __initdata;

struct pglist_data *node_data[MAX_NUMNODES] __read_mostly;
EXPORT_SYMBOL(node_data);

static struct numa_meminfo numa_meminfo __initdata_or_meminfo;
static struct numa_meminfo numa_reserved_meminfo __initdata_or_meminfo;

static int numa_distance_cnt;
static u8 *numa_distance;

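/*
 * Parse the early "numa=" command line parameter:
 *   numa=off      disable NUMA for this boot
 *   numa=fake=... hand the rest of the option to the NUMA emulation code
 *   numa=noacpi   do not parse the ACPI SRAT table
 *   numa=nohmat   do not parse the ACPI HMAT table
 */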
static __init int numa_setup(char *opt)
{
    if (!opt)
        return -EINVAL;
    if (!strncmp(opt, "off", 3))
        numa_off = 1;
    if (!strncmp(opt, "fake=", 5))
        return numa_emu_cmdline(opt + 5);
    if (!strncmp(opt, "noacpi", 6))
        disable_srat();
    if (!strncmp(opt, "nohmat", 6))
        disable_hmat();
    return 0;
}
early_param("numa", numa_setup);

/*
 * apicid, cpu, node mappings
 */
s16 __apicid_to_node[MAX_LOCAL_APIC] = {
    [0 ... MAX_LOCAL_APIC-1] = NUMA_NO_NODE
};

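/*
 * Look up the NUMA node of @cpu via its APIC ID.  Returns NUMA_NO_NODE
 * when the CPU's APIC ID is not (yet) known.
 */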
int numa_cpu_node(int cpu)
{
    int apicid = early_per_cpu(x86_cpu_to_apicid, cpu);

    if (apicid != BAD_APICID)
        return __apicid_to_node[apicid];
    return NUMA_NO_NODE;
}

cpumask_var_t node_to_cpumask_map[MAX_NUMNODES];
EXPORT_SYMBOL(node_to_cpumask_map);

/*
 * Map cpu index to node index
 */
DEFINE_EARLY_PER_CPU(int, x86_cpu_to_node_map, NUMA_NO_NODE);
EXPORT_EARLY_PER_CPU_SYMBOL(x86_cpu_to_node_map);

void numa_set_node(int cpu, int node)
{
    int *cpu_to_node_map = early_per_cpu_ptr(x86_cpu_to_node_map);

    /* early setting, no percpu area yet */
    if (cpu_to_node_map) {
        cpu_to_node_map[cpu] = node;
        return;
    }

#ifdef CONFIG_DEBUG_PER_CPU_MAPS
    if (cpu >= nr_cpu_ids || !cpu_possible(cpu)) {
        printk(KERN_ERR "numa_set_node: invalid cpu# (%d)\n", cpu);
        dump_stack();
        return;
    }
#endif
    per_cpu(x86_cpu_to_node_map, cpu) = node;

    set_cpu_numa_node(cpu, node);
}

void numa_clear_node(int cpu)
{
    numa_set_node(cpu, NUMA_NO_NODE);
}

/*
 * Allocate node_to_cpumask_map based on number of available nodes
 * Requires node_possible_map to be valid.
 *
 * Note: cpumask_of_node() is not valid until after this is done.
 * (Use CONFIG_DEBUG_PER_CPU_MAPS to check this.)
 */
void __init setup_node_to_cpumask_map(void)
{
    unsigned int node;

    /* setup nr_node_ids if not done yet */
    if (nr_node_ids == MAX_NUMNODES)
        setup_nr_node_ids();

    /* allocate the map */
    for (node = 0; node < nr_node_ids; node++)
        alloc_bootmem_cpumask_var(&node_to_cpumask_map[node]);

    /* cpumask_of_node() will now work */
    pr_debug("Node to cpumask map for %u nodes\n", nr_node_ids);
}

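/*
 * Validate and append a [@start, @end) range for node @nid to @mi.
 * Zero-length ranges are silently ignored, invalid ones are dropped with
 * a warning, and -EINVAL is returned once NR_NODE_MEMBLKS is exceeded.
 */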
static int __init numa_add_memblk_to(int nid, u64 start, u64 end,
                     struct numa_meminfo *mi)
{
    /* ignore zero length blks */
    if (start == end)
        return 0;

    /* whine about and ignore invalid blks */
    if (start > end || nid < 0 || nid >= MAX_NUMNODES) {
        pr_warn("Warning: invalid memblk node %d [mem %#010Lx-%#010Lx]\n",
            nid, start, end - 1);
        return 0;
    }

    if (mi->nr_blks >= NR_NODE_MEMBLKS) {
        pr_err("too many memblk ranges\n");
        return -EINVAL;
    }

    mi->blk[mi->nr_blks].start = start;
    mi->blk[mi->nr_blks].end = end;
    mi->blk[mi->nr_blks].nid = nid;
    mi->nr_blks++;
    return 0;
}

/**
 * numa_remove_memblk_from - Remove one numa_memblk from a numa_meminfo
 * @idx: Index of memblk to remove
 * @mi: numa_meminfo to remove memblk from
 *
 * Remove @idx'th numa_memblk from @mi by shifting @mi->blk[] and
 * decrementing @mi->nr_blks.
 */
void __init numa_remove_memblk_from(int idx, struct numa_meminfo *mi)
{
    mi->nr_blks--;
    memmove(&mi->blk[idx], &mi->blk[idx + 1],
        (mi->nr_blks - idx) * sizeof(mi->blk[0]));
}

/**
 * numa_move_tail_memblk - Move a numa_memblk from one numa_meminfo to another
 * @dst: numa_meminfo to append block to
 * @idx: Index of memblk to remove
 * @src: numa_meminfo to remove memblk from
 */
static void __init numa_move_tail_memblk(struct numa_meminfo *dst, int idx,
                     struct numa_meminfo *src)
{
    dst->blk[dst->nr_blks++] = src->blk[idx];
    numa_remove_memblk_from(idx, src);
}

/**
 * numa_add_memblk - Add one numa_memblk to numa_meminfo
 * @nid: NUMA node ID of the new memblk
 * @start: Start address of the new memblk
 * @end: End address of the new memblk
 *
 * Add a new memblk to the default numa_meminfo.
 *
 * RETURNS:
 * 0 on success, -errno on failure.
 */
int __init numa_add_memblk(int nid, u64 start, u64 end)
{
    return numa_add_memblk_to(nid, start, end, &numa_meminfo);
}

/* Allocate NODE_DATA for a node on the local memory */
static void __init alloc_node_data(int nid)
{
    const size_t nd_size = roundup(sizeof(pg_data_t), PAGE_SIZE);
    u64 nd_pa;
    void *nd;
    int tnid;

    /*
     * Allocate node data.  Try node-local memory and then any node.
     * Never allocate in DMA zone.
     */
    nd_pa = memblock_phys_alloc_try_nid(nd_size, SMP_CACHE_BYTES, nid);
    if (!nd_pa) {
        pr_err("Cannot find %zu bytes in any node (initial node: %d)\n",
               nd_size, nid);
        return;
    }
    nd = __va(nd_pa);

    /* report and initialize */
    printk(KERN_INFO "NODE_DATA(%d) allocated [mem %#010Lx-%#010Lx]\n", nid,
           nd_pa, nd_pa + nd_size - 1);
    tnid = early_pfn_to_nid(nd_pa >> PAGE_SHIFT);
    if (tnid != nid)
        printk(KERN_INFO "    NODE_DATA(%d) on node %d\n", nid, tnid);

    node_data[nid] = nd;
    memset(NODE_DATA(nid), 0, sizeof(pg_data_t));

    node_set_online(nid);
}

/**
 * numa_cleanup_meminfo - Cleanup a numa_meminfo
 * @mi: numa_meminfo to clean up
 *
 * Sanitize @mi by merging and removing unnecessary memblks.  Also check for
 * conflicts and clear unused memblks.
 *
 * RETURNS:
 * 0 on success, -errno on failure.
 */
int __init numa_cleanup_meminfo(struct numa_meminfo *mi)
{
    const u64 low = 0;
    const u64 high = PFN_PHYS(max_pfn);
    int i, j, k;

    /* first, trim all entries */
    for (i = 0; i < mi->nr_blks; i++) {
        struct numa_memblk *bi = &mi->blk[i];

        /* move / save reserved memory ranges */
        if (!memblock_overlaps_region(&memblock.memory,
                    bi->start, bi->end - bi->start)) {
            numa_move_tail_memblk(&numa_reserved_meminfo, i--, mi);
            continue;
        }

        /* make sure all non-reserved blocks are inside the limits */
        bi->start = max(bi->start, low);

        /* preserve info for non-RAM areas above 'max_pfn': */
        if (bi->end > high) {
            numa_add_memblk_to(bi->nid, high, bi->end,
                       &numa_reserved_meminfo);
            bi->end = high;
        }

        /* and there's no empty block */
        if (bi->start >= bi->end)
            numa_remove_memblk_from(i--, mi);
    }

    /* merge neighboring / overlapping entries */
    for (i = 0; i < mi->nr_blks; i++) {
        struct numa_memblk *bi = &mi->blk[i];

        for (j = i + 1; j < mi->nr_blks; j++) {
            struct numa_memblk *bj = &mi->blk[j];
            u64 start, end;

            /*
             * See whether there are overlapping blocks.  Whine
             * about but allow overlaps of the same nid.  They
             * will be merged below.
             */
            if (bi->end > bj->start && bi->start < bj->end) {
                if (bi->nid != bj->nid) {
                    pr_err("node %d [mem %#010Lx-%#010Lx] overlaps with node %d [mem %#010Lx-%#010Lx]\n",
                           bi->nid, bi->start, bi->end - 1,
                           bj->nid, bj->start, bj->end - 1);
                    return -EINVAL;
                }
                pr_warn("Warning: node %d [mem %#010Lx-%#010Lx] overlaps with itself [mem %#010Lx-%#010Lx]\n",
                    bi->nid, bi->start, bi->end - 1,
                    bj->start, bj->end - 1);
            }

            /*
             * Join together blocks on the same node, holes
             * between which don't overlap with memory on other
             * nodes.
             */
            if (bi->nid != bj->nid)
                continue;
            start = min(bi->start, bj->start);
            end = max(bi->end, bj->end);
            for (k = 0; k < mi->nr_blks; k++) {
                struct numa_memblk *bk = &mi->blk[k];

                if (bi->nid == bk->nid)
                    continue;
                if (start < bk->end && end > bk->start)
                    break;
            }
            if (k < mi->nr_blks)
                continue;
            printk(KERN_INFO "NUMA: Node %d [mem %#010Lx-%#010Lx] + [mem %#010Lx-%#010Lx] -> [mem %#010Lx-%#010Lx]\n",
                   bi->nid, bi->start, bi->end - 1, bj->start,
                   bj->end - 1, start, end - 1);
            bi->start = start;
            bi->end = end;
            numa_remove_memblk_from(j--, mi);
        }
    }

    /* clear unused ones */
    for (i = mi->nr_blks; i < ARRAY_SIZE(mi->blk); i++) {
        mi->blk[i].start = mi->blk[i].end = 0;
        mi->blk[i].nid = NUMA_NO_NODE;
    }

    return 0;
}

/*
 * Set nodes, which have memory in @mi, in *@nodemask.
 */
static void __init numa_nodemask_from_meminfo(nodemask_t *nodemask,
                          const struct numa_meminfo *mi)
{
    int i;

    for (i = 0; i < ARRAY_SIZE(mi->blk); i++)
        if (mi->blk[i].start != mi->blk[i].end &&
            mi->blk[i].nid != NUMA_NO_NODE)
            node_set(mi->blk[i].nid, *nodemask);
}

/**
 * numa_reset_distance - Reset NUMA distance table
 *
 * The current table is freed.  The next numa_set_distance() call will
 * create a new one.
 */
void __init numa_reset_distance(void)
{
    size_t size = numa_distance_cnt * numa_distance_cnt * sizeof(numa_distance[0]);

    /* numa_distance could be 1LU marking allocation failure, test cnt */
    if (numa_distance_cnt)
        memblock_free(numa_distance, size);
    numa_distance_cnt = 0;
    numa_distance = NULL;   /* enable table creation */
}

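/*
 * Allocate the NUMA distance table as a cnt x cnt byte matrix, where cnt
 * is the highest parsed node ID plus one, and fill it with the default
 * LOCAL_DISTANCE / REMOTE_DISTANCE values.  On allocation failure the
 * table pointer is poisoned so that allocation is not retried until
 * numa_reset_distance().
 */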
static int __init numa_alloc_distance(void)
{
    nodemask_t nodes_parsed;
    size_t size;
    int i, j, cnt = 0;
    u64 phys;

    /* size the new table and allocate it */
    nodes_parsed = numa_nodes_parsed;
    numa_nodemask_from_meminfo(&nodes_parsed, &numa_meminfo);

    for_each_node_mask(i, nodes_parsed)
        cnt = i;
    cnt++;
    size = cnt * cnt * sizeof(numa_distance[0]);

    phys = memblock_phys_alloc_range(size, PAGE_SIZE, 0,
                     PFN_PHYS(max_pfn_mapped));
    if (!phys) {
        pr_warn("Warning: can't allocate distance table!\n");
        /* don't retry until explicitly reset */
        numa_distance = (void *)1LU;
        return -ENOMEM;
    }

    numa_distance = __va(phys);
    numa_distance_cnt = cnt;

    /* fill with the default distances */
    for (i = 0; i < cnt; i++)
        for (j = 0; j < cnt; j++)
            numa_distance[i * cnt + j] = i == j ?
                LOCAL_DISTANCE : REMOTE_DISTANCE;
    printk(KERN_DEBUG "NUMA: Initialized distance table, cnt=%d\n", cnt);

    return 0;
}

/**
 * numa_set_distance - Set the distance from one NUMA node to another
 * @from: the 'from' node to set distance
 * @to: the 'to' node to set distance
 * @distance: NUMA distance
 *
 * Set the distance from node @from to @to to @distance.  If the distance
 * table doesn't exist, one large enough to accommodate all the currently
 * known nodes will be created.
 *
 * If such a table cannot be allocated, a warning is printed and further
 * calls are ignored until the distance table is reset with
 * numa_reset_distance().
 *
 * If @from or @to is higher than the highest known node or lower than zero
 * at the time of table creation, or @distance doesn't make sense, the call
 * is ignored.  This is to allow simplification of specific NUMA config
 * implementations.
 */
void __init numa_set_distance(int from, int to, int distance)
{
    if (!numa_distance && numa_alloc_distance() < 0)
        return;

    if (from >= numa_distance_cnt || to >= numa_distance_cnt ||
            from < 0 || to < 0) {
        pr_warn_once("Warning: node ids are out of bound, from=%d to=%d distance=%d\n",
                 from, to, distance);
        return;
    }

    if ((u8)distance != distance ||
        (from == to && distance != LOCAL_DISTANCE)) {
        pr_warn_once("Warning: invalid distance parameter, from=%d to=%d distance=%d\n",
                 from, to, distance);
        return;
    }

    numa_distance[from * numa_distance_cnt + to] = distance;
}

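/*
 * Return the distance between two nodes, falling back to the default
 * LOCAL_DISTANCE / REMOTE_DISTANCE values for nodes outside the table.
 */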
int __node_distance(int from, int to)
{
    if (from >= numa_distance_cnt || to >= numa_distance_cnt)
        return from == to ? LOCAL_DISTANCE : REMOTE_DISTANCE;
    return numa_distance[from * numa_distance_cnt + to];
}
EXPORT_SYMBOL(__node_distance);

/*
 * Sanity check to catch more bad NUMA configurations (they are amazingly
 * common).  Make sure the nodes cover all memory.
 */
static bool __init numa_meminfo_cover_memory(const struct numa_meminfo *mi)
{
    u64 numaram, e820ram;
    int i;

    numaram = 0;
    for (i = 0; i < mi->nr_blks; i++) {
        u64 s = mi->blk[i].start >> PAGE_SHIFT;
        u64 e = mi->blk[i].end >> PAGE_SHIFT;
        numaram += e - s;
        numaram -= __absent_pages_in_range(mi->blk[i].nid, s, e);
        if ((s64)numaram < 0)
            numaram = 0;
    }

    e820ram = max_pfn - absent_pages_in_range(0, max_pfn);

    /* We seem to lose 3 pages somewhere. Allow 1M of slack. */
    if ((s64)(e820ram - numaram) >= (1 << (20 - PAGE_SHIFT))) {
        printk(KERN_ERR "NUMA: nodes only cover %LuMB of your %LuMB e820 RAM. Not used.\n",
               (numaram << PAGE_SHIFT) >> 20,
               (e820ram << PAGE_SHIFT) >> 20);
        return false;
    }
    return true;
}

/*
 * Mark all currently memblock-reserved physical memory (which covers the
 * kernel's own memory ranges) as hot-unpluggable.
 */
static void __init numa_clear_kernel_node_hotplug(void)
{
    nodemask_t reserved_nodemask = NODE_MASK_NONE;
    struct memblock_region *mb_region;
    int i;

    /*
     * We have to do some preprocessing of memblock regions, to
     * make them suitable for reservation.
     *
     * At this time, all memory regions reserved by memblock are
     * used by the kernel, but those regions are not split up
     * along node boundaries yet, and don't necessarily have their
     * node ID set yet either.
     *
     * So iterate over all memory known to the x86 architecture,
     * and use those ranges to set the nid in memblock.reserved.
     * This will split up the memblock regions along node
     * boundaries and will set the node IDs as well.
     */
    for (i = 0; i < numa_meminfo.nr_blks; i++) {
        struct numa_memblk *mb = numa_meminfo.blk + i;
        int ret;

        ret = memblock_set_node(mb->start, mb->end - mb->start, &memblock.reserved, mb->nid);
        WARN_ON_ONCE(ret);
    }

    /*
     * Now go over all reserved memblock regions, to construct a
     * node mask of all kernel reserved memory areas.
     *
     * [ Note, when booting with mem=nn[kMG] or in a kdump kernel,
     *   numa_meminfo might not include all memblock.reserved
     *   memory ranges, because quirks such as trim_snb_memory()
     *   reserve specific pages for Sandy Bridge graphics. ]
     */
    for_each_reserved_mem_region(mb_region) {
        int nid = memblock_get_region_node(mb_region);

        if (nid != MAX_NUMNODES)
            node_set(nid, reserved_nodemask);
    }

    /*
     * Finally, clear the MEMBLOCK_HOTPLUG flag for all memory
     * belonging to the reserved node mask.
     *
     * Note that this will include memory regions that reside
     * on nodes that contain kernel memory - entire nodes
     * become hot-unpluggable:
     */
    for (i = 0; i < numa_meminfo.nr_blks; i++) {
        struct numa_memblk *mb = numa_meminfo.blk + i;

        if (!node_isset(mb->nid, reserved_nodemask))
            continue;

        memblock_clear_hotplug(mb->start, mb->end - mb->start);
    }
}

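/*
 * Push the cleaned-up numa_meminfo into memblock, sanity check the
 * resulting coverage and granularity, and allocate NODE_DATA() for every
 * node that ends up with a usable amount of memory.
 */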
static int __init numa_register_memblks(struct numa_meminfo *mi)
{
    int i, nid;

    /* Account for nodes with cpus and no memory */
    node_possible_map = numa_nodes_parsed;
    numa_nodemask_from_meminfo(&node_possible_map, mi);
    if (WARN_ON(nodes_empty(node_possible_map)))
        return -EINVAL;

    for (i = 0; i < mi->nr_blks; i++) {
        struct numa_memblk *mb = &mi->blk[i];
        memblock_set_node(mb->start, mb->end - mb->start,
                  &memblock.memory, mb->nid);
    }

    /*
     * Very early on, the kernel has to use some memory, e.g. for
     * loading the kernel image.  We cannot prevent this anyway, so any
     * node the kernel resides in must stay un-hotpluggable.
     *
     * And by the time we get here, allocating node data won't fail.
     */
    numa_clear_kernel_node_hotplug();

    /*
     * If the sections array is going to be used for pfn -> nid mapping,
     * check whether its granularity is fine enough.
     */
    if (IS_ENABLED(NODE_NOT_IN_PAGE_FLAGS)) {
        unsigned long pfn_align = node_map_pfn_alignment();

        if (pfn_align && pfn_align < PAGES_PER_SECTION) {
            pr_warn("Node alignment %LuMB < min %LuMB, rejecting NUMA config\n",
                PFN_PHYS(pfn_align) >> 20,
                PFN_PHYS(PAGES_PER_SECTION) >> 20);
            return -EINVAL;
        }
    }
    if (!numa_meminfo_cover_memory(mi))
        return -EINVAL;

    /* Finally register nodes. */
    for_each_node_mask(nid, node_possible_map) {
        u64 start = PFN_PHYS(max_pfn);
        u64 end = 0;

        for (i = 0; i < mi->nr_blks; i++) {
            if (nid != mi->blk[i].nid)
                continue;
            start = min(mi->blk[i].start, start);
            end = max(mi->blk[i].end, end);
        }

        if (start >= end)
            continue;

        /*
         * Don't confuse VM with a node that doesn't have the
         * minimum amount of memory:
         */
        if (end && (end - start) < NODE_MIN_SIZE)
            continue;

        alloc_node_data(nid);
    }

    /* Dump memblock with node info and return. */
    memblock_dump_all();
    return 0;
}

/*
 * There are unfortunately some poorly designed mainboards around that
 * only connect memory to a single CPU. This breaks the 1:1 cpu->node
 * mapping. To avoid this, fill in the mapping for all possible CPUs,
 * as the number of CPUs is not known yet. We round-robin the existing
 * nodes.
 */
static void __init numa_init_array(void)
{
    int rr, i;

    rr = first_node(node_online_map);
    for (i = 0; i < nr_cpu_ids; i++) {
        if (early_cpu_to_node(i) != NUMA_NO_NODE)
            continue;
        numa_set_node(i, rr);
        rr = next_node_in(rr, node_online_map);
    }
}

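/*
 * Reset all NUMA state, run @init_func to gather the platform's topology,
 * then clean up, register and activate the result.  Returns 0 on success
 * or a negative errno so the caller can try the next init method.
 */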
static int __init numa_init(int (*init_func)(void))
{
    int i;
    int ret;

    for (i = 0; i < MAX_LOCAL_APIC; i++)
        set_apicid_to_node(i, NUMA_NO_NODE);

    nodes_clear(numa_nodes_parsed);
    nodes_clear(node_possible_map);
    nodes_clear(node_online_map);
    memset(&numa_meminfo, 0, sizeof(numa_meminfo));
    WARN_ON(memblock_set_node(0, ULLONG_MAX, &memblock.memory,
                  MAX_NUMNODES));
    WARN_ON(memblock_set_node(0, ULLONG_MAX, &memblock.reserved,
                  MAX_NUMNODES));
    /* In case that parsing SRAT failed. */
    WARN_ON(memblock_clear_hotplug(0, ULLONG_MAX));
    numa_reset_distance();

    ret = init_func();
    if (ret < 0)
        return ret;

    /*
     * We reset memblock back to the top-down direction here because
     * if we configured ACPI_NUMA, we have parsed SRAT in init_func().
     * It is OK to have the reset here even if we didn't configure
     * ACPI_NUMA, or if ACPI NUMA init fails and falls back to dummy
     * NUMA init.
     */
    memblock_set_bottom_up(false);

    ret = numa_cleanup_meminfo(&numa_meminfo);
    if (ret < 0)
        return ret;

    numa_emulation(&numa_meminfo, numa_distance_cnt);

    ret = numa_register_memblks(&numa_meminfo);
    if (ret < 0)
        return ret;

    for (i = 0; i < nr_cpu_ids; i++) {
        int nid = early_cpu_to_node(i);

        if (nid == NUMA_NO_NODE)
            continue;
        if (!node_online(nid))
            numa_clear_node(i);
    }
    numa_init_array();

    return 0;
}

/**
 * dummy_numa_init - Fallback dummy NUMA init
 *
 * Used if there's no underlying NUMA architecture, NUMA initialization
 * fails, or NUMA is disabled on the command line.
 *
 * Must online at least one node and add memory blocks that cover all
 * allowed memory.  This function must not fail.
 */
static int __init dummy_numa_init(void)
{
    printk(KERN_INFO "%s\n",
           numa_off ? "NUMA turned off" : "No NUMA configuration found");
    printk(KERN_INFO "Faking a node at [mem %#018Lx-%#018Lx]\n",
           0LLU, PFN_PHYS(max_pfn) - 1);

    node_set(0, numa_nodes_parsed);
    numa_add_memblk(0, 0, PFN_PHYS(max_pfn));

    return 0;
}

/**
 * x86_numa_init - Initialize NUMA
 *
 * Try each configured NUMA initialization method until one succeeds.  The
 * last fallback is a dummy single-node config encompassing all memory,
 * which never fails.
 */
void __init x86_numa_init(void)
{
    if (!numa_off) {
#ifdef CONFIG_ACPI_NUMA
        if (!numa_init(x86_acpi_numa_init))
            return;
#endif
#ifdef CONFIG_AMD_NUMA
        if (!numa_init(amd_numa_init))
            return;
#endif
    }

    numa_init(dummy_numa_init);
}


/*
 * A node may exist which has one or more Generic Initiators but no CPUs and no
 * memory.
 *
 * This function must be called after init_cpu_to_node(), to ensure that any
 * memoryless CPU nodes have already been brought online, and before the
 * node_data[nid] is needed for zone list setup in build_all_zonelists().
 *
 * When this function is called, any nodes containing either memory and/or CPUs
 * will already be online and there is no need to do anything extra, even if
 * they also contain one or more Generic Initiators.
 */
void __init init_gi_nodes(void)
{
    int nid;

    /*
     * Exclude this node from
     * bringup_nonboot_cpus
     *  cpu_up
     *   __try_online_node
     *    register_one_node
     * because node_subsys is not initialized yet.
     * TODO remove dependency on node_online
     */
    for_each_node_state(nid, N_GENERIC_INITIATOR)
        if (!node_online(nid))
            node_set_online(nid);
}

/*
 * Set up the early cpu_to_node mapping.
 *
 * Populate cpu_to_node[] only if the x86_cpu_to_apicid[] and
 * apicid_to_node[] tables have valid entries for a CPU.
 * This means we skip cpu_to_node[] initialization for NUMA
 * emulation and the fake-node case (when running a kernel compiled
 * for NUMA on a non-NUMA box), which is OK as cpu_to_node[]
 * is already initialized in a round-robin manner by numa_init_array()
 * prior to this call, and that initialization is good enough
 * for the fake NUMA cases.
 *
 * Called before the per_cpu areas are set up.
 */
void __init init_cpu_to_node(void)
{
    int cpu;
    u16 *cpu_to_apicid = early_per_cpu_ptr(x86_cpu_to_apicid);

    BUG_ON(cpu_to_apicid == NULL);

    for_each_possible_cpu(cpu) {
        int node = numa_cpu_node(cpu);

        if (node == NUMA_NO_NODE)
            continue;

        /*
         * Exclude this node from
         * bringup_nonboot_cpus
         *  cpu_up
         *   __try_online_node
         *    register_one_node
         * because node_subsys is not initialized yet.
         * TODO remove dependency on node_online
         */
        if (!node_online(node))
            node_set_online(node);

        numa_set_node(cpu, node);
    }
}

#ifndef CONFIG_DEBUG_PER_CPU_MAPS

# ifndef CONFIG_NUMA_EMU
void numa_add_cpu(int cpu)
{
    cpumask_set_cpu(cpu, node_to_cpumask_map[early_cpu_to_node(cpu)]);
}

void numa_remove_cpu(int cpu)
{
    cpumask_clear_cpu(cpu, node_to_cpumask_map[early_cpu_to_node(cpu)]);
}
# endif /* !CONFIG_NUMA_EMU */

#else   /* !CONFIG_DEBUG_PER_CPU_MAPS */

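/*
 * Warn when cpu_to_node() is used before the per-CPU areas are set up,
 * and fall back to the early mapping in that case.
 */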
int __cpu_to_node(int cpu)
{
    if (early_per_cpu_ptr(x86_cpu_to_node_map)) {
        printk(KERN_WARNING
            "cpu_to_node(%d): usage too early!\n", cpu);
        dump_stack();
        return early_per_cpu_ptr(x86_cpu_to_node_map)[cpu];
    }
    return per_cpu(x86_cpu_to_node_map, cpu);
}
EXPORT_SYMBOL(__cpu_to_node);

/*
 * Same function as cpu_to_node() but used if called before the
 * per_cpu areas are set up.
 */
int early_cpu_to_node(int cpu)
{
    if (early_per_cpu_ptr(x86_cpu_to_node_map))
        return early_per_cpu_ptr(x86_cpu_to_node_map)[cpu];

    if (!cpu_possible(cpu)) {
        printk(KERN_WARNING
            "early_cpu_to_node(%d): no per_cpu area!\n", cpu);
        dump_stack();
        return NUMA_NO_NODE;
    }
    return per_cpu(x86_cpu_to_node_map, cpu);
}

void debug_cpumask_set_cpu(int cpu, int node, bool enable)
{
    struct cpumask *mask;

    if (node == NUMA_NO_NODE) {
        /* early_cpu_to_node() already emits a warning and trace */
        return;
    }
    mask = node_to_cpumask_map[node];
    if (!cpumask_available(mask)) {
        pr_err("node_to_cpumask_map[%i] NULL\n", node);
        dump_stack();
        return;
    }

    if (enable)
        cpumask_set_cpu(cpu, mask);
    else
        cpumask_clear_cpu(cpu, mask);

    printk(KERN_DEBUG "%s cpu %d node %d: mask now %*pbl\n",
        enable ? "numa_add_cpu" : "numa_remove_cpu",
        cpu, node, cpumask_pr_args(mask));
    return;
}

# ifndef CONFIG_NUMA_EMU
static void numa_set_cpumask(int cpu, bool enable)
{
    debug_cpumask_set_cpu(cpu, early_cpu_to_node(cpu), enable);
}

void numa_add_cpu(int cpu)
{
    numa_set_cpumask(cpu, true);
}

void numa_remove_cpu(int cpu)
{
    numa_set_cpumask(cpu, false);
}
# endif /* !CONFIG_NUMA_EMU */

/*
 * Returns a pointer to the bitmask of CPUs on Node 'node'.
 */
const struct cpumask *cpumask_of_node(int node)
{
    if ((unsigned)node >= nr_node_ids) {
        printk(KERN_WARNING
            "cpumask_of_node(%d): (unsigned)node >= nr_node_ids(%u)\n",
            node, nr_node_ids);
        dump_stack();
        return cpu_none_mask;
    }
    if (!cpumask_available(node_to_cpumask_map[node])) {
        printk(KERN_WARNING
            "cpumask_of_node(%d): no node_to_cpumask_map!\n",
            node);
        dump_stack();
        return cpu_online_mask;
    }
    return node_to_cpumask_map[node];
}
EXPORT_SYMBOL(cpumask_of_node);

#endif  /* !CONFIG_DEBUG_PER_CPU_MAPS */

#ifdef CONFIG_NUMA_KEEP_MEMINFO
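/*
 * Return the node of the numa_memblk in @mi covering @start, or
 * NUMA_NO_NODE if no memblk covers it.
 */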
static int meminfo_to_nid(struct numa_meminfo *mi, u64 start)
{
    int i;

    for (i = 0; i < mi->nr_blks; i++)
        if (mi->blk[i].start <= start && mi->blk[i].end > start)
            return mi->blk[i].nid;
    return NUMA_NO_NODE;
}

int phys_to_target_node(phys_addr_t start)
{
    int nid = meminfo_to_nid(&numa_meminfo, start);

    /*
     * Prefer online nodes, but if reserved memory might be
     * hot-added continue the search with reserved ranges.
     */
    if (nid != NUMA_NO_NODE)
        return nid;

    return meminfo_to_nid(&numa_reserved_meminfo, start);
}
EXPORT_SYMBOL_GPL(phys_to_target_node);

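/*
 * Map a physical address being hot-added to a node ID, defaulting to the
 * node of the first known memblk when the address is not covered.
 */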
int memory_add_physaddr_to_nid(u64 start)
{
    int nid = meminfo_to_nid(&numa_meminfo, start);

    if (nid == NUMA_NO_NODE)
        nid = numa_meminfo.blk[0].nid;
    return nid;
}
EXPORT_SYMBOL_GPL(memory_add_physaddr_to_nid);
#endif