0001
0002
0003
0004
0005 #include <linux/kernel.h>
0006 #include <linux/errno.h>
0007 #include <linux/topology.h>
0008 #include <linux/memblock.h>
0009 #include <asm/dma.h>
0010
0011 #include "numa_internal.h"
0012
0013 static int emu_nid_to_phys[MAX_NUMNODES];
0014 static char *emu_cmdline __initdata;
0015
/*
 * Record the "numa=fake=" early parameter string for later parsing by
 * numa_emulation().  Always accepts the string and returns 0.
 */
int __init numa_emu_cmdline(char *str)
{
	emu_cmdline = str;
	return 0;
}
0021
0022 static int __init emu_find_memblk_by_nid(int nid, const struct numa_meminfo *mi)
0023 {
0024 int i;
0025
0026 for (i = 0; i < mi->nr_blks; i++)
0027 if (mi->blk[i].nid == nid)
0028 return i;
0029 return -ENOENT;
0030 }
0031
0032 static u64 __init mem_hole_size(u64 start, u64 end)
0033 {
0034 unsigned long start_pfn = PFN_UP(start);
0035 unsigned long end_pfn = PFN_DOWN(end);
0036
0037 if (start_pfn < end_pfn)
0038 return PFN_PHYS(absent_pages_in_range(start_pfn, end_pfn));
0039 return 0;
0040 }
0041
0042
0043
0044
0045
0046 static int __init emu_setup_memblk(struct numa_meminfo *ei,
0047 struct numa_meminfo *pi,
0048 int nid, int phys_blk, u64 size)
0049 {
0050 struct numa_memblk *eb = &ei->blk[ei->nr_blks];
0051 struct numa_memblk *pb = &pi->blk[phys_blk];
0052
0053 if (ei->nr_blks >= NR_NODE_MEMBLKS) {
0054 pr_err("NUMA: Too many emulated memblks, failing emulation\n");
0055 return -EINVAL;
0056 }
0057
0058 ei->nr_blks++;
0059 eb->start = pb->start;
0060 eb->end = pb->start + size;
0061 eb->nid = nid;
0062
0063 if (emu_nid_to_phys[nid] == NUMA_NO_NODE)
0064 emu_nid_to_phys[nid] = pb->nid;
0065
0066 pb->start += size;
0067 if (pb->start >= pb->end) {
0068 WARN_ON_ONCE(pb->start > pb->end);
0069 numa_remove_memblk_from(phys_blk, pi);
0070 }
0071
0072 printk(KERN_INFO "Faking node %d at [mem %#018Lx-%#018Lx] (%LuMB)\n",
0073 nid, eb->start, eb->end - 1, (eb->end - eb->start) >> 20);
0074 return 0;
0075 }
0076
0077
0078
0079
0080
0081
0082
/*
 * Sets up nr_nodes fake nodes interleaved over the physical nodes, covering
 * memory from @addr to @max_addr.
 *
 * Returns zero on success or negative on error.
 */
static int __init split_nodes_interleave(struct numa_meminfo *ei,
					 struct numa_meminfo *pi,
					 u64 addr, u64 max_addr, int nr_nodes)
{
	nodemask_t physnode_mask = numa_nodes_parsed;
	u64 size;
	int big;
	int nid = 0;
	int i, ret;

	if (nr_nodes <= 0)
		return -1;
	if (nr_nodes > MAX_NUMNODES) {
		pr_info("numa=fake=%d too large, reducing to %d\n",
			nr_nodes, MAX_NUMNODES);
		nr_nodes = MAX_NUMNODES;
	}

	/*
	 * Calculate target node size: non-hole memory divided by the node
	 * count.  The division is done in units of pages (unsigned long)
	 * rather than 64-bit bytes and converted back afterwards.
	 */
	size = max_addr - addr - mem_hole_size(addr, max_addr);
	size = PFN_PHYS((unsigned long)(size >> PAGE_SHIFT) / nr_nodes);

	/*
	 * Calculate how many "big" nodes (one FAKE_NODE_MIN_SIZE larger)
	 * can be created by consolidating the remainder lost when @size
	 * is rounded down below.
	 */
	big = ((size & ~FAKE_NODE_MIN_HASH_MASK) * nr_nodes) /
		FAKE_NODE_MIN_SIZE;

	size &= FAKE_NODE_MIN_HASH_MASK;
	if (!size) {
		pr_err("Not enough memory for each node. "
			"NUMA emulation disabled.\n");
		return -1;
	}

	/*
	 * Keep filling the physical nodes with fake nodes until there is no
	 * memory left on any of them.
	 */
	while (!nodes_empty(physnode_mask)) {
		for_each_node_mask(i, physnode_mask) {
			u64 dma32_end = PFN_PHYS(MAX_DMA32_PFN);
			u64 start, limit, end;
			int phys_blk;

			phys_blk = emu_find_memblk_by_nid(i, pi);
			if (phys_blk < 0) {
				/* physical node exhausted - stop visiting it */
				node_clear(i, physnode_mask);
				continue;
			}
			start = pi->blk[phys_blk].start;
			limit = pi->blk[phys_blk].end;
			end = start + size;

			if (nid < big)
				end += FAKE_NODE_MIN_SIZE;

			/*
			 * Keep extending this fake node while its non-hole
			 * memory is still below the target size, clamping at
			 * the end of the physical block.
			 */
			while (end - start - mem_hole_size(start, end) < size) {
				end += FAKE_NODE_MIN_SIZE;
				if (end > limit) {
					end = limit;
					break;
				}
			}

			/*
			 * If there won't be at least FAKE_NODE_MIN_SIZE of
			 * non-hole memory left below the DMA32 boundary for
			 * the next node, extend this one to the boundary.
			 */
			if (end < dma32_end && dma32_end - end -
			    mem_hole_size(end, dma32_end) < FAKE_NODE_MIN_SIZE)
				end = dma32_end;

			/*
			 * If there won't be enough non-hole memory left in
			 * the physical block for the next node, take the
			 * rest of it as well.
			 */
			if (limit - end - mem_hole_size(end, limit) < size)
				end = limit;

			ret = emu_setup_memblk(ei, pi, nid++ % nr_nodes,
					       phys_blk,
					       min(end, limit) - start);
			if (ret < 0)
				return ret;
		}
	}
	return 0;
}
0182
0183
0184
0185
0186
0187 static u64 __init find_end_of_node(u64 start, u64 max_addr, u64 size)
0188 {
0189 u64 end = start + size;
0190
0191 while (end - start - mem_hole_size(start, end) < size) {
0192 end += FAKE_NODE_MIN_SIZE;
0193 if (end > max_addr) {
0194 end = max_addr;
0195 break;
0196 }
0197 }
0198 return end;
0199 }
0200
0201 static u64 uniform_size(u64 max_addr, u64 base, u64 hole, int nr_nodes)
0202 {
0203 unsigned long max_pfn = PHYS_PFN(max_addr);
0204 unsigned long base_pfn = PHYS_PFN(base);
0205 unsigned long hole_pfns = PHYS_PFN(hole);
0206
0207 return PFN_PHYS((max_pfn - base_pfn - hole_pfns) / nr_nodes);
0208 }
0209
0210
0211
0212
0213
0214
0215
/*
 * Sets up fake nodes of @size interleaved over the physical nodes, covering
 * memory from @addr to @max_addr.  In the uniform case (@nr_nodes != 0) the
 * single physical block @pblk is split into exactly @nr_nodes pieces,
 * starting at emulated nid @nid.
 *
 * Returns the next free emulated nid on success or negative on error.
 */
static int __init split_nodes_size_interleave_uniform(struct numa_meminfo *ei,
					struct numa_meminfo *pi,
					u64 addr, u64 max_addr, u64 size,
					int nr_nodes, struct numa_memblk *pblk,
					int nid)
{
	nodemask_t physnode_mask = numa_nodes_parsed;
	int i, ret, uniform = 0;
	u64 min_size;

	if ((!size && !nr_nodes) || (nr_nodes && !pblk))
		return -1;

	/*
	 * In the 'uniform' case split the passed-in physical node into
	 * nr_nodes pieces; in the non-uniform case ignore the passed-in
	 * physical block and try to create nodes of at least size @size
	 * across all physical nodes.
	 */
	if (!nr_nodes)
		nr_nodes = MAX_NUMNODES;
	else {
		nodes_clear(physnode_mask);
		node_set(pblk->nid, physnode_mask);
		uniform = 1;
	}

	if (uniform) {
		min_size = uniform_size(max_addr, addr, 0, nr_nodes);
		size = min_size;
	} else {
		/*
		 * The limit on emulated nodes is MAX_NUMNODES, so the size
		 * per node is increased accordingly if the requested size is
		 * too small.  This creates a uniform distribution of node
		 * sizes across the entire machine (but not necessarily over
		 * physical nodes).
		 */
		min_size = uniform_size(max_addr, addr,
				mem_hole_size(addr, max_addr), nr_nodes);
	}
	min_size = ALIGN(max(min_size, FAKE_NODE_MIN_SIZE), FAKE_NODE_MIN_SIZE);
	if (size < min_size) {
		pr_err("Fake node size %LuMB too small, increasing to %LuMB\n",
			size >> 20, min_size >> 20);
		size = min_size;
	}
	size = ALIGN_DOWN(size, FAKE_NODE_MIN_SIZE);

	/*
	 * Fill physical nodes with fake nodes of size until there is no
	 * memory left on any of them.
	 */
	while (!nodes_empty(physnode_mask)) {
		for_each_node_mask(i, physnode_mask) {
			u64 dma32_end = PFN_PHYS(MAX_DMA32_PFN);
			u64 start, limit, end;
			int phys_blk;

			phys_blk = emu_find_memblk_by_nid(i, pi);
			if (phys_blk < 0) {
				/* physical node exhausted - stop visiting it */
				node_clear(i, physnode_mask);
				continue;
			}

			start = pi->blk[phys_blk].start;
			limit = pi->blk[phys_blk].end;

			if (uniform)
				end = start + size;
			else
				end = find_end_of_node(start, limit, size);
			/*
			 * If there won't be at least FAKE_NODE_MIN_SIZE of
			 * non-hole memory left below the DMA32 boundary for
			 * the next node, extend this one to the boundary.
			 */
			if (end < dma32_end && dma32_end - end -
			    mem_hole_size(end, dma32_end) < FAKE_NODE_MIN_SIZE)
				end = dma32_end;

			/*
			 * If there won't be enough non-hole memory left in
			 * the physical block for the next node, take the
			 * rest of it as well (non-uniform mode only).
			 */
			if ((limit - end - mem_hole_size(end, limit) < size)
					&& !uniform)
				end = limit;

			ret = emu_setup_memblk(ei, pi, nid++ % MAX_NUMNODES,
					       phys_blk,
					       min(end, limit) - start);
			if (ret < 0)
				return ret;
		}
	}
	return nid;
}
0319
/*
 * Non-uniform variant: interleave fake nodes of @size over all physical
 * nodes (nr_nodes = 0, no pinned physical block, nids start at 0).
 */
static int __init split_nodes_size_interleave(struct numa_meminfo *ei,
					      struct numa_meminfo *pi,
					      u64 addr, u64 max_addr, u64 size)
{
	return split_nodes_size_interleave_uniform(ei, pi, addr, max_addr, size,
			0, NULL, 0);
}
0327
0328 static int __init setup_emu2phys_nid(int *dfl_phys_nid)
0329 {
0330 int i, max_emu_nid = 0;
0331
0332 *dfl_phys_nid = NUMA_NO_NODE;
0333 for (i = 0; i < ARRAY_SIZE(emu_nid_to_phys); i++) {
0334 if (emu_nid_to_phys[i] != NUMA_NO_NODE) {
0335 max_emu_nid = i;
0336 if (*dfl_phys_nid == NUMA_NO_NODE)
0337 *dfl_phys_nid = emu_nid_to_phys[i];
0338 }
0339 }
0340
0341 return max_emu_nid;
0342 }
0343
0344
0345
0346
0347
0348
0349
0350
0351
0352
0353
0354
0355
0356
0357
0358
0359
0360
0361
0362
0363
0364
0365
0366
0367
0368
0369
0370
/**
 * numa_emulation - Emulate NUMA nodes
 * @numa_meminfo: NUMA configuration to massage
 * @numa_dist_cnt: The size of the physical NUMA distance table
 *
 * Emulate NUMA nodes according to the numa=fake kernel parameter
 * (recorded in emu_cmdline).  @numa_meminfo contains the physical memory
 * configuration and is modified to reflect the emulated configuration on
 * success.
 *
 * On success, the following modifications are made:
 *
 * - @numa_meminfo is updated to contain the emulated nodes.
 *
 * - __apicid_to_node[] is rewritten to map APIC IDs to emulated nids by
 *   reverse-mapping through emu_nid_to_phys[].
 *
 * - The NUMA distance table is rebuilt for the emulated nodes, copying
 *   the physical distances where available.
 *
 * - emu_nid_to_phys[] records how emulated nodes map to physical nodes;
 *   it is used by numa_add_cpu() and numa_remove_cpu() below.
 *
 * If emulation is not enabled or fails, emu_nid_to_phys[] is filled with
 * the identity mapping and no other modification is made.
 */
void __init numa_emulation(struct numa_meminfo *numa_meminfo, int numa_dist_cnt)
{
	static struct numa_meminfo ei __initdata;
	static struct numa_meminfo pi __initdata;
	const u64 max_addr = PFN_PHYS(max_pfn);
	u8 *phys_dist = NULL;
	size_t phys_size = numa_dist_cnt * numa_dist_cnt * sizeof(phys_dist[0]);
	int max_emu_nid, dfl_phys_nid;
	int i, j, ret;

	if (!emu_cmdline)
		goto no_emu;

	memset(&ei, 0, sizeof(ei));
	pi = *numa_meminfo;

	for (i = 0; i < MAX_NUMNODES; i++)
		emu_nid_to_phys[i] = NUMA_NO_NODE;

	/*
	 * Three cmdline forms: "<N>U" splits each physical node uniformly
	 * into N pieces; "<size>M"/"<size>G" creates nodes of a fixed size;
	 * a bare number N interleaves N nodes over all memory.
	 */
	if (strchr(emu_cmdline, 'U')) {
		nodemask_t physnode_mask = numa_nodes_parsed;
		unsigned long n;
		int nid = 0;

		n = simple_strtoul(emu_cmdline, &emu_cmdline, 0);
		ret = -1;
		for_each_node_mask(i, physnode_mask) {
			/*
			 * blk[0] is passed in every iteration:
			 * numa_remove_memblk_from(), called via
			 * emu_setup_memblk(), deletes entry 0 once a block
			 * is consumed and shifts the remaining memblks
			 * forward, so the current block is always blk[0].
			 */
			ret = split_nodes_size_interleave_uniform(&ei, &pi,
					pi.blk[0].start, pi.blk[0].end, 0,
					n, &pi.blk[0], nid);
			if (ret < 0)
				break;
			if (ret < n) {
				pr_info("%s: phys: %d only got %d of %ld nodes, failing\n",
						__func__, i, ret, n);
				ret = -1;
				break;
			}
			nid = ret;
		}
	} else if (strchr(emu_cmdline, 'M') || strchr(emu_cmdline, 'G')) {
		u64 size;

		size = memparse(emu_cmdline, &emu_cmdline);
		ret = split_nodes_size_interleave(&ei, &pi, 0, max_addr, size);
	} else {
		unsigned long n;

		n = simple_strtoul(emu_cmdline, &emu_cmdline, 0);
		ret = split_nodes_interleave(&ei, &pi, 0, max_addr, n);
	}
	/* skip the ':' separator before any user-supplied distance list */
	if (*emu_cmdline == ':')
		emu_cmdline++;

	if (ret < 0)
		goto no_emu;

	if (numa_cleanup_meminfo(&ei) < 0) {
		pr_warn("NUMA: Warning: constructed meminfo invalid, disabling emulation\n");
		goto no_emu;
	}

	/* copy the physical distance table before it is reset below */
	if (numa_dist_cnt) {
		u64 phys;

		phys = memblock_phys_alloc_range(phys_size, PAGE_SIZE, 0,
						 PFN_PHYS(max_pfn_mapped));
		if (!phys) {
			pr_warn("NUMA: Warning: can't allocate copy of distance table, disabling emulation\n");
			goto no_emu;
		}
		phys_dist = __va(phys);

		for (i = 0; i < numa_dist_cnt; i++)
			for (j = 0; j < numa_dist_cnt; j++)
				phys_dist[i * numa_dist_cnt + j] =
					node_distance(i, j);
	}

	/*
	 * Determine the max emulated nid and the default physical nid to
	 * use for emulated nids left unmapped.
	 */
	max_emu_nid = setup_emu2phys_nid(&dfl_phys_nid);

	/* commit the emulated layout */
	*numa_meminfo = ei;

	/* make sure numa_nodes_parsed only contains emulated nodes */
	nodes_clear(numa_nodes_parsed);
	for (i = 0; i < ARRAY_SIZE(ei.blk); i++)
		if (ei.blk[i].start != ei.blk[i].end &&
		    ei.blk[i].nid != NUMA_NO_NODE)
			node_set(ei.blk[i].nid, numa_nodes_parsed);

	/*
	 * Transform __apicid_to_node to use emulated nids by reverse-
	 * mapping each physical nid through emu_nid_to_phys[].  A mapping
	 * should always exist, but fall back to nid 0 just in case.
	 */
	for (i = 0; i < ARRAY_SIZE(__apicid_to_node); i++) {
		if (__apicid_to_node[i] == NUMA_NO_NODE)
			continue;
		for (j = 0; j < ARRAY_SIZE(emu_nid_to_phys); j++)
			if (__apicid_to_node[i] == emu_nid_to_phys[j])
				break;
		__apicid_to_node[i] = j < ARRAY_SIZE(emu_nid_to_phys) ? j : 0;
	}

	/* make sure all emulated nodes are mapped to a physical node */
	for (i = 0; i < ARRAY_SIZE(emu_nid_to_phys); i++)
		if (emu_nid_to_phys[i] == NUMA_NO_NODE)
			emu_nid_to_phys[i] = dfl_phys_nid;

	/*
	 * Transform the distance table: a user-supplied value (get_option
	 * == 2) wins, then the copied physical distances, then the
	 * LOCAL/REMOTE defaults for nids beyond the physical table.
	 */
	numa_reset_distance();
	for (i = 0; i < max_emu_nid + 1; i++) {
		for (j = 0; j < max_emu_nid + 1; j++) {
			int physi = emu_nid_to_phys[i];
			int physj = emu_nid_to_phys[j];
			int dist;

			if (get_option(&emu_cmdline, &dist) == 2)
				;
			else if (physi >= numa_dist_cnt || physj >= numa_dist_cnt)
				dist = physi == physj ?
					LOCAL_DISTANCE : REMOTE_DISTANCE;
			else
				dist = phys_dist[physi * numa_dist_cnt + physj];

			numa_set_distance(i, j, dist);
		}
	}

	/* free the copied physical distance table */
	memblock_free(phys_dist, phys_size);
	return;

no_emu:
	/* No emulation: identity emu_nid_to_phys[] for numa_add_cpu() */
	for (i = 0; i < ARRAY_SIZE(emu_nid_to_phys); i++)
		emu_nid_to_phys[i] = i;
}
0528
0529 #ifndef CONFIG_DEBUG_PER_CPU_MAPS
0530 void numa_add_cpu(int cpu)
0531 {
0532 int physnid, nid;
0533
0534 nid = early_cpu_to_node(cpu);
0535 BUG_ON(nid == NUMA_NO_NODE || !node_online(nid));
0536
0537 physnid = emu_nid_to_phys[nid];
0538
0539
0540
0541
0542
0543 for_each_online_node(nid)
0544 if (emu_nid_to_phys[nid] == physnid)
0545 cpumask_set_cpu(cpu, node_to_cpumask_map[nid]);
0546 }
0547
0548 void numa_remove_cpu(int cpu)
0549 {
0550 int i;
0551
0552 for_each_online_node(i)
0553 cpumask_clear_cpu(cpu, node_to_cpumask_map[i]);
0554 }
0555 #else
0556 static void numa_set_cpumask(int cpu, bool enable)
0557 {
0558 int nid, physnid;
0559
0560 nid = early_cpu_to_node(cpu);
0561 if (nid == NUMA_NO_NODE) {
0562
0563 return;
0564 }
0565
0566 physnid = emu_nid_to_phys[nid];
0567
0568 for_each_online_node(nid) {
0569 if (emu_nid_to_phys[nid] != physnid)
0570 continue;
0571
0572 debug_cpumask_set_cpu(cpu, nid, enable);
0573 }
0574 }
0575
/* Debug variant: route through the checked cpumask setter. */
void numa_add_cpu(int cpu)
{
	numa_set_cpumask(cpu, true);
}
0580
/* Debug variant: route through the checked cpumask clearer. */
void numa_remove_cpu(int cpu)
{
	numa_set_cpumask(cpu, false);
}
0585 #endif