/* (LXR code-browser page header removed; file body follows) */
0001 // SPDX-License-Identifier: GPL-2.0
0002 /*
0003  * NUMA emulation
0004  */
0005 #include <linux/kernel.h>
0006 #include <linux/errno.h>
0007 #include <linux/topology.h>
0008 #include <linux/memblock.h>
0009 #include <asm/dma.h>
0010 
0011 #include "numa_internal.h"
0012 
0013 static int emu_nid_to_phys[MAX_NUMNODES];
0014 static char *emu_cmdline __initdata;
0015 
0016 int __init numa_emu_cmdline(char *str)
0017 {
0018     emu_cmdline = str;
0019     return 0;
0020 }
0021 
0022 static int __init emu_find_memblk_by_nid(int nid, const struct numa_meminfo *mi)
0023 {
0024     int i;
0025 
0026     for (i = 0; i < mi->nr_blks; i++)
0027         if (mi->blk[i].nid == nid)
0028             return i;
0029     return -ENOENT;
0030 }
0031 
0032 static u64 __init mem_hole_size(u64 start, u64 end)
0033 {
0034     unsigned long start_pfn = PFN_UP(start);
0035     unsigned long end_pfn = PFN_DOWN(end);
0036 
0037     if (start_pfn < end_pfn)
0038         return PFN_PHYS(absent_pages_in_range(start_pfn, end_pfn));
0039     return 0;
0040 }
0041 
0042 /*
0043  * Sets up nid to range from @start to @end.  The return value is -errno if
0044  * something went wrong, 0 otherwise.
0045  */
0046 static int __init emu_setup_memblk(struct numa_meminfo *ei,
0047                    struct numa_meminfo *pi,
0048                    int nid, int phys_blk, u64 size)
0049 {
0050     struct numa_memblk *eb = &ei->blk[ei->nr_blks];
0051     struct numa_memblk *pb = &pi->blk[phys_blk];
0052 
0053     if (ei->nr_blks >= NR_NODE_MEMBLKS) {
0054         pr_err("NUMA: Too many emulated memblks, failing emulation\n");
0055         return -EINVAL;
0056     }
0057 
0058     ei->nr_blks++;
0059     eb->start = pb->start;
0060     eb->end = pb->start + size;
0061     eb->nid = nid;
0062 
0063     if (emu_nid_to_phys[nid] == NUMA_NO_NODE)
0064         emu_nid_to_phys[nid] = pb->nid;
0065 
0066     pb->start += size;
0067     if (pb->start >= pb->end) {
0068         WARN_ON_ONCE(pb->start > pb->end);
0069         numa_remove_memblk_from(phys_blk, pi);
0070     }
0071 
0072     printk(KERN_INFO "Faking node %d at [mem %#018Lx-%#018Lx] (%LuMB)\n",
0073            nid, eb->start, eb->end - 1, (eb->end - eb->start) >> 20);
0074     return 0;
0075 }
0076 
/*
 * Sets up nr_nodes fake nodes interleaved over physical nodes ranging from addr
 * to max_addr.  Emulated node ids are assigned round-robin (nid++ % nr_nodes),
 * so consecutive fake nodes land on successive physical nodes.
 *
 * Returns zero on success or negative on error.
 */
static int __init split_nodes_interleave(struct numa_meminfo *ei,
                     struct numa_meminfo *pi,
                     u64 addr, u64 max_addr, int nr_nodes)
{
    nodemask_t physnode_mask = numa_nodes_parsed;
    u64 size;
    int big;        /* count of nodes that get one extra FAKE_NODE_MIN_SIZE chunk */
    int nid = 0;
    int i, ret;

    if (nr_nodes <= 0)
        return -1;
    if (nr_nodes > MAX_NUMNODES) {
        pr_info("numa=fake=%d too large, reducing to %d\n",
            nr_nodes, MAX_NUMNODES);
        nr_nodes = MAX_NUMNODES;
    }

    /*
     * Calculate target node size.  x86_32 freaks on __udivdi3() so do
     * the division in ulong number of pages and convert back.
     */
    size = max_addr - addr - mem_hole_size(addr, max_addr);
    size = PFN_PHYS((unsigned long)(size >> PAGE_SHIFT) / nr_nodes);

    /*
     * Calculate the number of big nodes that can be allocated as a result
     * of consolidating the remainder.
     */
    big = ((size & ~FAKE_NODE_MIN_HASH_MASK) * nr_nodes) /
        FAKE_NODE_MIN_SIZE;

    /* Round the per-node size down to a FAKE_NODE_MIN_SIZE multiple. */
    size &= FAKE_NODE_MIN_HASH_MASK;
    if (!size) {
        pr_err("Not enough memory for each node.  "
            "NUMA emulation disabled.\n");
        return -1;
    }

    /*
     * Continue to fill physical nodes with fake nodes until there is no
     * memory left on any of them.  emu_setup_memblk() shrinks/removes the
     * physical blocks in @pi as they are consumed, so each pass over the
     * mask makes forward progress.
     */
    while (!nodes_empty(physnode_mask)) {
        for_each_node_mask(i, physnode_mask) {
            u64 dma32_end = PFN_PHYS(MAX_DMA32_PFN);
            u64 start, limit, end;
            int phys_blk;

            phys_blk = emu_find_memblk_by_nid(i, pi);
            if (phys_blk < 0) {
                /* physical node fully consumed; stop visiting it */
                node_clear(i, physnode_mask);
                continue;
            }
            start = pi->blk[phys_blk].start;
            limit = pi->blk[phys_blk].end;
            end = start + size;

            /* The first `big` fake nodes absorb the rounding remainder. */
            if (nid < big)
                end += FAKE_NODE_MIN_SIZE;

            /*
             * Continue to add memory to this fake node if its
             * non-reserved memory is less than the per-node size.
             */
            while (end - start - mem_hole_size(start, end) < size) {
                end += FAKE_NODE_MIN_SIZE;
                if (end > limit) {
                    end = limit;
                    break;
                }
            }

            /*
             * If there won't be at least FAKE_NODE_MIN_SIZE of
             * non-reserved memory in ZONE_DMA32 for the next node,
             * this one must extend to the boundary.
             */
            if (end < dma32_end && dma32_end - end -
                mem_hole_size(end, dma32_end) < FAKE_NODE_MIN_SIZE)
                end = dma32_end;

            /*
             * If there won't be enough non-reserved memory for the
             * next node, this one must extend to the end of the
             * physical node.
             */
            if (limit - end - mem_hole_size(end, limit) < size)
                end = limit;

            ret = emu_setup_memblk(ei, pi, nid++ % nr_nodes,
                           phys_blk,
                           min(end, limit) - start);
            if (ret < 0)
                return ret;
        }
    }
    return 0;
}
0182 
0183 /*
0184  * Returns the end address of a node so that there is at least `size' amount of
0185  * non-reserved memory or `max_addr' is reached.
0186  */
0187 static u64 __init find_end_of_node(u64 start, u64 max_addr, u64 size)
0188 {
0189     u64 end = start + size;
0190 
0191     while (end - start - mem_hole_size(start, end) < size) {
0192         end += FAKE_NODE_MIN_SIZE;
0193         if (end > max_addr) {
0194             end = max_addr;
0195             break;
0196         }
0197     }
0198     return end;
0199 }
0200 
0201 static u64 uniform_size(u64 max_addr, u64 base, u64 hole, int nr_nodes)
0202 {
0203     unsigned long max_pfn = PHYS_PFN(max_addr);
0204     unsigned long base_pfn = PHYS_PFN(base);
0205     unsigned long hole_pfns = PHYS_PFN(hole);
0206 
0207     return PFN_PHYS((max_pfn - base_pfn - hole_pfns) / nr_nodes);
0208 }
0209 
/*
 * Sets up fake nodes of `size' interleaved over physical nodes ranging from
 * `addr' to `max_addr'.
 *
 * Returns the next emulated nid to use (a non-negative value, so callers can
 * chain invocations across physical nodes) on success, or negative on error.
 */
static int __init split_nodes_size_interleave_uniform(struct numa_meminfo *ei,
                          struct numa_meminfo *pi,
                          u64 addr, u64 max_addr, u64 size,
                          int nr_nodes, struct numa_memblk *pblk,
                          int nid)
{
    nodemask_t physnode_mask = numa_nodes_parsed;
    int i, ret, uniform = 0;
    u64 min_size;

    /* Need either a target size (non-uniform) or a node count plus block. */
    if ((!size && !nr_nodes) || (nr_nodes && !pblk))
        return -1;

    /*
     * In the 'uniform' case split the passed in physical node by
     * nr_nodes, in the non-uniform case, ignore the passed in
     * physical block and try to create nodes of at least size
     * @size.
     *
     * In the uniform case, split the nodes strictly by physical
     * capacity, i.e. ignore holes. In the non-uniform case account
     * for holes and treat @size as a minimum floor.
     */
    if (!nr_nodes)
        nr_nodes = MAX_NUMNODES;
    else {
        /* uniform mode: restrict the walk to @pblk's physical node */
        nodes_clear(physnode_mask);
        node_set(pblk->nid, physnode_mask);
        uniform = 1;
    }

    if (uniform) {
        min_size = uniform_size(max_addr, addr, 0, nr_nodes);
        size = min_size;
    } else {
        /*
         * The limit on emulated nodes is MAX_NUMNODES, so the
         * size per node is increased accordingly if the
         * requested size is too small.  This creates a uniform
         * distribution of node sizes across the entire machine
         * (but not necessarily over physical nodes).
         */
        min_size = uniform_size(max_addr, addr,
                mem_hole_size(addr, max_addr), nr_nodes);
    }
    /* Enforce the FAKE_NODE_MIN_SIZE floor and alignment on the result. */
    min_size = ALIGN(max(min_size, FAKE_NODE_MIN_SIZE), FAKE_NODE_MIN_SIZE);
    if (size < min_size) {
        pr_err("Fake node size %LuMB too small, increasing to %LuMB\n",
            size >> 20, min_size >> 20);
        size = min_size;
    }
    size = ALIGN_DOWN(size, FAKE_NODE_MIN_SIZE);

    /*
     * Fill physical nodes with fake nodes of size until there is no memory
     * left on any of them.
     */
    while (!nodes_empty(physnode_mask)) {
        for_each_node_mask(i, physnode_mask) {
            u64 dma32_end = PFN_PHYS(MAX_DMA32_PFN);
            u64 start, limit, end;
            int phys_blk;

            phys_blk = emu_find_memblk_by_nid(i, pi);
            if (phys_blk < 0) {
                /* physical node fully consumed; stop visiting it */
                node_clear(i, physnode_mask);
                continue;
            }

            start = pi->blk[phys_blk].start;
            limit = pi->blk[phys_blk].end;

            if (uniform)
                end = start + size;
            else
                end = find_end_of_node(start, limit, size);
            /*
             * If there won't be at least FAKE_NODE_MIN_SIZE of
             * non-reserved memory in ZONE_DMA32 for the next node,
             * this one must extend to the boundary.
             */
            if (end < dma32_end && dma32_end - end -
                mem_hole_size(end, dma32_end) < FAKE_NODE_MIN_SIZE)
                end = dma32_end;

            /*
             * If there won't be enough non-reserved memory for the
             * next node, this one must extend to the end of the
             * physical node.  (Skipped in uniform mode, which splits
             * strictly by capacity.)
             */
            if ((limit - end - mem_hole_size(end, limit) < size)
                    && !uniform)
                end = limit;

            ret = emu_setup_memblk(ei, pi, nid++ % MAX_NUMNODES,
                           phys_blk,
                           min(end, limit) - start);
            if (ret < 0)
                return ret;
        }
    }
    return nid;
}
0319 
/*
 * Non-uniform wrapper: split all parsed physical nodes into fake nodes of at
 * least @size bytes each, interleaved from @addr to @max_addr.  Returns the
 * next emulated nid on success or negative on error (see the callee).
 */
static int __init split_nodes_size_interleave(struct numa_meminfo *ei,
                          struct numa_meminfo *pi,
                          u64 addr, u64 max_addr, u64 size)
{
    return split_nodes_size_interleave_uniform(ei, pi, addr, max_addr, size,
            0, NULL, 0);
}
0327 
0328 static int __init setup_emu2phys_nid(int *dfl_phys_nid)
0329 {
0330     int i, max_emu_nid = 0;
0331 
0332     *dfl_phys_nid = NUMA_NO_NODE;
0333     for (i = 0; i < ARRAY_SIZE(emu_nid_to_phys); i++) {
0334         if (emu_nid_to_phys[i] != NUMA_NO_NODE) {
0335             max_emu_nid = i;
0336             if (*dfl_phys_nid == NUMA_NO_NODE)
0337                 *dfl_phys_nid = emu_nid_to_phys[i];
0338         }
0339     }
0340 
0341     return max_emu_nid;
0342 }
0343 
/**
 * numa_emulation - Emulate NUMA nodes
 * @numa_meminfo: NUMA configuration to massage
 * @numa_dist_cnt: The size of the physical NUMA distance table
 *
 * Emulate NUMA nodes according to the numa=fake kernel parameter.
 * @numa_meminfo contains the physical memory configuration and is modified
 * to reflect the emulated configuration on success.  @numa_dist_cnt is
 * used to determine the size of the physical distance table.
 *
 * On success, the following modifications are made.
 *
 * - @numa_meminfo is updated to reflect the emulated nodes.
 *
 * - __apicid_to_node[] is updated such that APIC IDs are mapped to the
 *   emulated nodes.
 *
 * - NUMA distance table is rebuilt to represent distances between emulated
 *   nodes.  The distances are determined considering how emulated nodes
 *   are mapped to physical nodes and match the actual distances.
 *
 * - emu_nid_to_phys[] reflects how emulated nodes are mapped to physical
 *   nodes.  This is used by numa_add_cpu() and numa_remove_cpu().
 *
 * If emulation is not enabled or fails, emu_nid_to_phys[] is filled with
 *   identity mapping and no other modification is made.
 */
void __init numa_emulation(struct numa_meminfo *numa_meminfo, int numa_dist_cnt)
{
    static struct numa_meminfo ei __initdata;
    static struct numa_meminfo pi __initdata;
    const u64 max_addr = PFN_PHYS(max_pfn);
    u8 *phys_dist = NULL;
    size_t phys_size = numa_dist_cnt * numa_dist_cnt * sizeof(phys_dist[0]);
    int max_emu_nid, dfl_phys_nid;
    int i, j, ret;

    if (!emu_cmdline)
        goto no_emu;

    /* Build into ei from a scratch copy so failure leaves the caller intact. */
    memset(&ei, 0, sizeof(ei));
    pi = *numa_meminfo;

    for (i = 0; i < MAX_NUMNODES; i++)
        emu_nid_to_phys[i] = NUMA_NO_NODE;

    /*
     * If the numa=fake command-line contains a 'U', split each parsed
     * physical node into N emulated nodes ("uniform" mode).  Otherwise,
     * if it contains a 'M' or 'G', it represents the fixed node size.
     * Otherwise, if it is just a single number N, split the system RAM
     * into N fake nodes.
     */
    if (strchr(emu_cmdline, 'U')) {
        nodemask_t physnode_mask = numa_nodes_parsed;
        unsigned long n;
        int nid = 0;

        n = simple_strtoul(emu_cmdline, &emu_cmdline, 0);
        ret = -1;
        for_each_node_mask(i, physnode_mask) {
            /*
             * The reason we pass in blk[0] is due to
             * numa_remove_memblk_from() called by
             * emu_setup_memblk() will delete entry 0
             * and then move everything else up in the pi.blk
             * array. Therefore we should always be looking
             * at blk[0].
             */
            ret = split_nodes_size_interleave_uniform(&ei, &pi,
                    pi.blk[0].start, pi.blk[0].end, 0,
                    n, &pi.blk[0], nid);
            if (ret < 0)
                break;
            if (ret < n) {
                /* this physical node yielded too few fake nodes */
                pr_info("%s: phys: %d only got %d of %ld nodes, failing\n",
                        __func__, i, ret, n);
                ret = -1;
                break;
            }
            /* continue numbering where the previous node left off */
            nid = ret;
        }
    } else if (strchr(emu_cmdline, 'M') || strchr(emu_cmdline, 'G')) {
        u64 size;

        size = memparse(emu_cmdline, &emu_cmdline);
        ret = split_nodes_size_interleave(&ei, &pi, 0, max_addr, size);
    } else {
        unsigned long n;

        n = simple_strtoul(emu_cmdline, &emu_cmdline, 0);
        ret = split_nodes_interleave(&ei, &pi, 0, max_addr, n);
    }
    /* optional ':'-separated distance overrides follow; consumed below */
    if (*emu_cmdline == ':')
        emu_cmdline++;

    if (ret < 0)
        goto no_emu;

    if (numa_cleanup_meminfo(&ei) < 0) {
        pr_warn("NUMA: Warning: constructed meminfo invalid, disabling emulation\n");
        goto no_emu;
    }

    /* copy the physical distance table */
    if (numa_dist_cnt) {
        u64 phys;

        phys = memblock_phys_alloc_range(phys_size, PAGE_SIZE, 0,
                         PFN_PHYS(max_pfn_mapped));
        if (!phys) {
            pr_warn("NUMA: Warning: can't allocate copy of distance table, disabling emulation\n");
            goto no_emu;
        }
        phys_dist = __va(phys);

        /* snapshot before numa_reset_distance() below destroys the table */
        for (i = 0; i < numa_dist_cnt; i++)
            for (j = 0; j < numa_dist_cnt; j++)
                phys_dist[i * numa_dist_cnt + j] =
                    node_distance(i, j);
    }

    /*
     * Determine the max emulated nid and the default phys nid to use
     * for unmapped nodes.
     */
    max_emu_nid = setup_emu2phys_nid(&dfl_phys_nid);

    /* commit */
    *numa_meminfo = ei;

    /* Make sure numa_nodes_parsed only contains emulated nodes */
    nodes_clear(numa_nodes_parsed);
    for (i = 0; i < ARRAY_SIZE(ei.blk); i++)
        if (ei.blk[i].start != ei.blk[i].end &&
            ei.blk[i].nid != NUMA_NO_NODE)
            node_set(ei.blk[i].nid, numa_nodes_parsed);

    /*
     * Transform __apicid_to_node table to use emulated nids by
     * reverse-mapping phys_nid.  The maps should always exist but fall
     * back to zero just in case.
     */
    for (i = 0; i < ARRAY_SIZE(__apicid_to_node); i++) {
        if (__apicid_to_node[i] == NUMA_NO_NODE)
            continue;
        for (j = 0; j < ARRAY_SIZE(emu_nid_to_phys); j++)
            if (__apicid_to_node[i] == emu_nid_to_phys[j])
                break;
        __apicid_to_node[i] = j < ARRAY_SIZE(emu_nid_to_phys) ? j : 0;
    }

    /* make sure all emulated nodes are mapped to a physical node */
    for (i = 0; i < ARRAY_SIZE(emu_nid_to_phys); i++)
        if (emu_nid_to_phys[i] == NUMA_NO_NODE)
            emu_nid_to_phys[i] = dfl_phys_nid;

    /* transform distance table */
    numa_reset_distance();
    for (i = 0; i < max_emu_nid + 1; i++) {
        for (j = 0; j < max_emu_nid + 1; j++) {
            int physi = emu_nid_to_phys[i];
            int physj = emu_nid_to_phys[j];
            int dist;

            /* explicit override from the cmdline wins */
            if (get_option(&emu_cmdline, &dist) == 2)
                ;
            else if (physi >= numa_dist_cnt || physj >= numa_dist_cnt)
                dist = physi == physj ?
                    LOCAL_DISTANCE : REMOTE_DISTANCE;
            else
                dist = phys_dist[physi * numa_dist_cnt + physj];

            numa_set_distance(i, j, dist);
        }
    }

    /* free the copied physical distance table */
    memblock_free(phys_dist, phys_size);
    return;

no_emu:
    /* No emulation.  Build identity emu_nid_to_phys[] for numa_add_cpu() */
    for (i = 0; i < ARRAY_SIZE(emu_nid_to_phys); i++)
        emu_nid_to_phys[i] = i;
}
0528 
0529 #ifndef CONFIG_DEBUG_PER_CPU_MAPS
0530 void numa_add_cpu(int cpu)
0531 {
0532     int physnid, nid;
0533 
0534     nid = early_cpu_to_node(cpu);
0535     BUG_ON(nid == NUMA_NO_NODE || !node_online(nid));
0536 
0537     physnid = emu_nid_to_phys[nid];
0538 
0539     /*
0540      * Map the cpu to each emulated node that is allocated on the physical
0541      * node of the cpu's apic id.
0542      */
0543     for_each_online_node(nid)
0544         if (emu_nid_to_phys[nid] == physnid)
0545             cpumask_set_cpu(cpu, node_to_cpumask_map[nid]);
0546 }
0547 
0548 void numa_remove_cpu(int cpu)
0549 {
0550     int i;
0551 
0552     for_each_online_node(i)
0553         cpumask_clear_cpu(cpu, node_to_cpumask_map[i]);
0554 }
0555 #else   /* !CONFIG_DEBUG_PER_CPU_MAPS */
0556 static void numa_set_cpumask(int cpu, bool enable)
0557 {
0558     int nid, physnid;
0559 
0560     nid = early_cpu_to_node(cpu);
0561     if (nid == NUMA_NO_NODE) {
0562         /* early_cpu_to_node() already emits a warning and trace */
0563         return;
0564     }
0565 
0566     physnid = emu_nid_to_phys[nid];
0567 
0568     for_each_online_node(nid) {
0569         if (emu_nid_to_phys[nid] != physnid)
0570             continue;
0571 
0572         debug_cpumask_set_cpu(cpu, nid, enable);
0573     }
0574 }
0575 
/* Debug build: delegate to the validated cpumask setter. */
void numa_add_cpu(int cpu)
{
    numa_set_cpumask(cpu, true);
}
0580 
/* Debug build: delegate to the validated cpumask clearer. */
void numa_remove_cpu(int cpu)
{
    numa_set_cpumask(cpu, false);
}
0585 #endif  /* !CONFIG_DEBUG_PER_CPU_MAPS */