// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * pSeries NUMA support
 *
 * Copyright (C) 2002 Anton Blanchard <anton@au.ibm.com>, IBM
 */
#define pr_fmt(fmt) "numa: " fmt

#include <linux/threads.h>
#include <linux/memblock.h>
#include <linux/init.h>
#include <linux/mm.h>
#include <linux/mmzone.h>
#include <linux/export.h>
#include <linux/nodemask.h>
#include <linux/cpu.h>
#include <linux/notifier.h>
#include <linux/of.h>
#include <linux/pfn.h>
#include <linux/cpuset.h>
#include <linux/node.h>
#include <linux/stop_machine.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
#include <linux/uaccess.h>
#include <linux/slab.h>
#include <asm/cputhreads.h>
#include <asm/sparsemem.h>
#include <asm/smp.h>
#include <asm/topology.h>
#include <asm/firmware.h>
#include <asm/paca.h>
#include <asm/hvcall.h>
#include <asm/setup.h>
#include <asm/vdso.h>
#include <asm/drmem.h>

static int numa_enabled = 1;

static char *cmdline __initdata;

int numa_cpu_lookup_table[NR_CPUS];
cpumask_var_t node_to_cpumask_map[MAX_NUMNODES];
struct pglist_data *node_data[MAX_NUMNODES];

EXPORT_SYMBOL(numa_cpu_lookup_table);
EXPORT_SYMBOL(node_to_cpumask_map);
EXPORT_SYMBOL(node_data);

static int primary_domain_index;
static int n_mem_addr_cells, n_mem_size_cells;

#define FORM0_AFFINITY 0
#define FORM1_AFFINITY 1
#define FORM2_AFFINITY 2
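/*
 * Which encoding of ibm,associativity the firmware uses: FORM0 only
 * distinguishes local from remote, FORM1 derives distance from the
 * level at which two resources' associativity domains diverge, and
 * FORM2 provides an explicit NUMA distance matrix in the device tree.
 */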
static int affinity_form;

#define MAX_DISTANCE_REF_POINTS 4
static int distance_ref_points_depth;
static const __be32 *distance_ref_points;
static int distance_lookup_table[MAX_NUMNODES][MAX_DISTANCE_REF_POINTS];
static int numa_distance_table[MAX_NUMNODES][MAX_NUMNODES] = {
    [0 ... MAX_NUMNODES - 1] = { [0 ... MAX_NUMNODES - 1] = -1 }
};
static int numa_id_index_table[MAX_NUMNODES] = { [0 ... MAX_NUMNODES - 1] = NUMA_NO_NODE };

/*
 * Allocate node_to_cpumask_map based on number of available nodes
 * Requires node_possible_map to be valid.
 *
 * Note: cpumask_of_node() is not valid until after this is done.
 */
static void __init setup_node_to_cpumask_map(void)
{
    unsigned int node;

    /* setup nr_node_ids if not done yet */
    if (nr_node_ids == MAX_NUMNODES)
        setup_nr_node_ids();

    /* allocate the map */
    for_each_node(node)
        alloc_bootmem_cpumask_var(&node_to_cpumask_map[node]);

    /* cpumask_of_node() will now work */
    pr_debug("Node to cpumask map for %u nodes\n", nr_node_ids);
}

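/*
 * Support for the "numa=fake=" early parameter: the comma-separated
 * values (parsed by memparse(), so suffixes such as 512M or 4G work)
 * are treated as ascending address boundaries, and a new fake node is
 * started each time the scanned memory crosses one. For example,
 * numa=fake=2G,4G puts [0, 2G) on node 0, [2G, 4G) on node 1 and
 * everything above 4G on node 2.
 */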
static int __init fake_numa_create_new_node(unsigned long end_pfn,
                        unsigned int *nid)
{
    unsigned long long mem;
    char *p = cmdline;
    static unsigned int fake_nid;
    static unsigned long long curr_boundary;

    /*
     * Modify the node id only if we have started creating NUMA nodes;
     * we want to continue from where we left off last time.
     */
    if (fake_nid)
        *nid = fake_nid;
    /*
     * In case there are no more arguments to parse, the
     * node_id should be the same as the last fake node id
     * (we've handled this above).
     */
    if (!p)
        return 0;

    mem = memparse(p, &p);
    if (!mem)
        return 0;

    if (mem < curr_boundary)
        return 0;

    curr_boundary = mem;

    if ((end_pfn << PAGE_SHIFT) > mem) {
        /*
         * Skip commas and spaces
         */
        while (*p == ',' || *p == ' ' || *p == '\t')
            p++;

        cmdline = p;
        fake_nid++;
        *nid = fake_nid;
        pr_debug("created new fake_node with id %d\n", fake_nid);
        return 1;
    }
    return 0;
}

static void __init reset_numa_cpu_lookup_table(void)
{
    unsigned int cpu;

    for_each_possible_cpu(cpu)
        numa_cpu_lookup_table[cpu] = -1;
}

void map_cpu_to_node(int cpu, int node)
{
    update_numa_cpu_lookup_table(cpu, node);

    if (!(cpumask_test_cpu(cpu, node_to_cpumask_map[node]))) {
        pr_debug("adding cpu %d to node %d\n", cpu, node);
        cpumask_set_cpu(cpu, node_to_cpumask_map[node]);
    }
}

#if defined(CONFIG_HOTPLUG_CPU) || defined(CONFIG_PPC_SPLPAR)
void unmap_cpu_from_node(unsigned long cpu)
{
    int node = numa_cpu_lookup_table[cpu];

    if (cpumask_test_cpu(cpu, node_to_cpumask_map[node])) {
        cpumask_clear_cpu(cpu, node_to_cpumask_map[node]);
        pr_debug("removing cpu %lu from node %d\n", cpu, node);
    } else {
        pr_warn("Warning: cpu %lu not found in node %d\n", cpu, node);
    }
}
#endif /* CONFIG_HOTPLUG_CPU || CONFIG_PPC_SPLPAR */

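/*
 * An ibm,associativity property is a counted list of domain ids: the
 * first cell holds the number of cells that follow, e.g. a four-level
 * hierarchy is encoded as { 4, d1, d2, d3, d4 }.  The domain selected
 * by primary_domain_index is used as the Linux node id.
 */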
static int __associativity_to_nid(const __be32 *associativity,
                  int max_array_sz)
{
    int nid;
    /*
     * primary_domain_index is a 1-based array index.
     */
    int index = primary_domain_index - 1;

    if (!numa_enabled || index >= max_array_sz)
        return NUMA_NO_NODE;

    nid = of_read_number(&associativity[index], 1);

    /* POWER4 LPAR uses 0xffff as invalid node */
    if (nid == 0xffff || nid >= nr_node_ids)
        nid = NUMA_NO_NODE;
    return nid;
}
/*
 * Returns nid in the range [0..nr_node_ids-1], or -1 if no useful NUMA
 * information is found.
 */
static int associativity_to_nid(const __be32 *associativity)
{
    int array_sz = of_read_number(associativity, 1);

    /* Skip the first element in the associativity array */
    return __associativity_to_nid((associativity + 1), array_sz);
}

static int __cpu_form2_relative_distance(__be32 *cpu1_assoc, __be32 *cpu2_assoc)
{
    int dist;
    int node1, node2;

    node1 = associativity_to_nid(cpu1_assoc);
    node2 = associativity_to_nid(cpu2_assoc);

    dist = numa_distance_table[node1][node2];
    if (dist <= LOCAL_DISTANCE)
        return 0;
    else if (dist <= REMOTE_DISTANCE)
        return 1;
    else
        return 2;
}

static int __cpu_form1_relative_distance(__be32 *cpu1_assoc, __be32 *cpu2_assoc)
{
    int dist = 0;
    int i, index;

    for (i = 0; i < distance_ref_points_depth; i++) {
        index = be32_to_cpu(distance_ref_points[i]);
        if (cpu1_assoc[index] == cpu2_assoc[index])
            break;
        dist++;
    }

    return dist;
}

int cpu_relative_distance(__be32 *cpu1_assoc, __be32 *cpu2_assoc)
{
    /* We should not get called with FORM0 */
    VM_WARN_ON(affinity_form == FORM0_AFFINITY);
    if (affinity_form == FORM1_AFFINITY)
        return __cpu_form1_relative_distance(cpu1_assoc, cpu2_assoc);
    return __cpu_form2_relative_distance(cpu1_assoc, cpu2_assoc);
}

/* must hold reference to node during call */
static const __be32 *of_get_associativity(struct device_node *dev)
{
    return of_get_property(dev, "ibm,associativity", NULL);
}

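/*
 * For FORM1, the reported distance starts at LOCAL_DISTANCE and
 * doubles for every reference-point level at which the two nodes
 * still differ, e.g. 10, 20, 40, 80 for a four-level hierarchy.
 */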
int __node_distance(int a, int b)
{
    int i;
    int distance = LOCAL_DISTANCE;

    if (affinity_form == FORM2_AFFINITY)
        return numa_distance_table[a][b];
    else if (affinity_form == FORM0_AFFINITY)
        return ((a == b) ? LOCAL_DISTANCE : REMOTE_DISTANCE);

    for (i = 0; i < distance_ref_points_depth; i++) {
        if (distance_lookup_table[a][i] == distance_lookup_table[b][i])
            break;

        /* Double the distance for each NUMA level */
        distance *= 2;
    }

    return distance;
}
EXPORT_SYMBOL(__node_distance);

/* Returns the nid associated with the given device tree node,
 * or -1 if not found.
 */
static int of_node_to_nid_single(struct device_node *device)
{
    int nid = NUMA_NO_NODE;
    const __be32 *tmp;

    tmp = of_get_associativity(device);
    if (tmp)
        nid = associativity_to_nid(tmp);
    return nid;
}

/* Walk the device tree upwards, looking for an associativity id */
int of_node_to_nid(struct device_node *device)
{
    int nid = NUMA_NO_NODE;

    of_node_get(device);
    while (device) {
        nid = of_node_to_nid_single(device);
        if (nid != -1)
            break;

        device = of_get_next_parent(device);
    }
    of_node_put(device);

    return nid;
}
EXPORT_SYMBOL(of_node_to_nid);

static void __initialize_form1_numa_distance(const __be32 *associativity,
                         int max_array_sz)
{
    int i, nid;

    if (affinity_form != FORM1_AFFINITY)
        return;

    nid = __associativity_to_nid(associativity, max_array_sz);
    if (nid != NUMA_NO_NODE) {
        for (i = 0; i < distance_ref_points_depth; i++) {
            const __be32 *entry;
            int index = be32_to_cpu(distance_ref_points[i]) - 1;

            /*
             * broken hierarchy, return with broken distance table
             */
            if (WARN(index >= max_array_sz, "Broken ibm,associativity property"))
                return;

            entry = &associativity[index];
            distance_lookup_table[nid][i] = of_read_number(entry, 1);
        }
    }
}

static void initialize_form1_numa_distance(const __be32 *associativity)
{
    int array_sz;

    array_sz = of_read_number(associativity, 1);
    /* Skip the first element in the associativity array */
    __initialize_form1_numa_distance(associativity + 1, array_sz);
}

/*
 * Used to update distance information w.r.t. a newly added node.
 */
void update_numa_distance(struct device_node *node)
{
    int nid;

    if (affinity_form == FORM0_AFFINITY)
        return;
    else if (affinity_form == FORM1_AFFINITY) {
        const __be32 *associativity;

        associativity = of_get_associativity(node);
        if (!associativity)
            return;

        initialize_form1_numa_distance(associativity);
        return;
    }

    /* FORM2 affinity */
    nid = of_node_to_nid_single(node);
    if (nid == NUMA_NO_NODE)
        return;

    /*
     * With FORM2 we expect NUMA distance of all possible NUMA
     * nodes to be provided during boot.
     */
    WARN(numa_distance_table[nid][nid] == -1,
         "NUMA distance details for node %d not provided\n", nid);
}

/*
 * ibm,numa-lookup-index-table = {N, domainid1, domainid2, ..... domainidN}
 * ibm,numa-distance-table = { N, 1, 2, 4, 5, 1, 6, .... N elements}
 */
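/*
 * Worked example: with two nodes whose ids are {0, 1},
 * ibm,numa-lookup-index-table = { 2, 0, 1 } and a distance table of
 * { 4, 10, 20, 20, 10 } decode as dist[0][0] = dist[1][1] = 10 and
 * dist[0][1] = dist[1][0] = 20.
 */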
static void __init initialize_form2_numa_distance_lookup_table(void)
{
    int i, j;
    struct device_node *root;
    const __u8 *form2_distances;
    const __be32 *numa_lookup_index;
    int form2_distances_length;
    int max_numa_index, distance_index;

    if (firmware_has_feature(FW_FEATURE_OPAL))
        root = of_find_node_by_path("/ibm,opal");
    else
        root = of_find_node_by_path("/rtas");
    if (!root)
        root = of_find_node_by_path("/");

    numa_lookup_index = of_get_property(root, "ibm,numa-lookup-index-table", NULL);
    max_numa_index = of_read_number(&numa_lookup_index[0], 1);

    /* The first element of the property is its length, encoded as an int */
    form2_distances = of_get_property(root, "ibm,numa-distance-table", NULL);
    form2_distances_length = of_read_number((const __be32 *)&form2_distances[0], 1);
    /* Skip the length, which is an encoded int */
    form2_distances += sizeof(__be32);

    pr_debug("form2_distances_len = %d, numa_dist_indexes_len = %d\n",
         form2_distances_length, max_numa_index);

    for (i = 0; i < max_numa_index; i++)
        /* +1 skip the max_numa_index in the property */
        numa_id_index_table[i] = of_read_number(&numa_lookup_index[i + 1], 1);

    if (form2_distances_length != max_numa_index * max_numa_index) {
        WARN(1, "Wrong NUMA distance information\n");
        form2_distances = NULL; /* don't use it */
    }
    distance_index = 0;
    for (i = 0; i < max_numa_index; i++) {
        for (j = 0; j < max_numa_index; j++) {
            int nodeA = numa_id_index_table[i];
            int nodeB = numa_id_index_table[j];
            int dist;

            if (form2_distances)
                dist = form2_distances[distance_index++];
            else if (nodeA == nodeB)
                dist = LOCAL_DISTANCE;
            else
                dist = REMOTE_DISTANCE;
            numa_distance_table[nodeA][nodeB] = dist;
            pr_debug("dist[%d][%d]=%d ", nodeA, nodeB, dist);
        }
    }

    of_node_put(root);
}

static int __init find_primary_domain_index(void)
{
    int index;
    struct device_node *root;

    /*
     * Check for which form of affinity.
     */
    if (firmware_has_feature(FW_FEATURE_OPAL)) {
        affinity_form = FORM1_AFFINITY;
    } else if (firmware_has_feature(FW_FEATURE_FORM2_AFFINITY)) {
        pr_debug("Using form 2 affinity\n");
        affinity_form = FORM2_AFFINITY;
    } else if (firmware_has_feature(FW_FEATURE_FORM1_AFFINITY)) {
        pr_debug("Using form 1 affinity\n");
        affinity_form = FORM1_AFFINITY;
    } else
        affinity_form = FORM0_AFFINITY;

    if (firmware_has_feature(FW_FEATURE_OPAL))
        root = of_find_node_by_path("/ibm,opal");
    else
        root = of_find_node_by_path("/rtas");
    if (!root)
        root = of_find_node_by_path("/");

    /*
     * This property is a set of 32-bit integers, each representing
     * an index into the ibm,associativity nodes.
     *
     * With form 0 affinity the first integer is for an SMP configuration
     * (should be all 0's) and the second is for a normal NUMA
     * configuration. We have only one level of NUMA.
     *
     * With form 1 affinity the first integer is the most significant
     * NUMA boundary and the following are progressively less significant
     * boundaries. There can be more than one level of NUMA.
     */
    distance_ref_points = of_get_property(root,
                    "ibm,associativity-reference-points",
                    &distance_ref_points_depth);

    if (!distance_ref_points) {
        pr_debug("ibm,associativity-reference-points not found.\n");
        goto err;
    }

    distance_ref_points_depth /= sizeof(int);
    if (affinity_form == FORM0_AFFINITY) {
        if (distance_ref_points_depth < 2) {
            pr_warn("short ibm,associativity-reference-points\n");
            goto err;
        }

        index = of_read_number(&distance_ref_points[1], 1);
    } else {
        /*
         * Both FORM1 and FORM2 affinity find the primary domain details
         * at the same offset.
         */
        index = of_read_number(distance_ref_points, 1);
    }
    /*
     * Warn and cap if the hardware supports more than
     * MAX_DISTANCE_REF_POINTS domains.
     */
    if (distance_ref_points_depth > MAX_DISTANCE_REF_POINTS) {
        pr_warn("distance array capped at %d entries\n",
            MAX_DISTANCE_REF_POINTS);
        distance_ref_points_depth = MAX_DISTANCE_REF_POINTS;
    }

    of_node_put(root);
    return index;

err:
    of_node_put(root);
    return -1;
}

static void __init get_n_mem_cells(int *n_addr_cells, int *n_size_cells)
{
    struct device_node *memory = NULL;

    memory = of_find_node_by_type(memory, "memory");
    if (!memory)
        panic("numa.c: No memory nodes found!");

    *n_addr_cells = of_n_addr_cells(memory);
    *n_size_cells = of_n_size_cells(memory);
    of_node_put(memory);
}

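/*
 * Example: with n == 2 and *buf pointing at cells { 0x1, 0x2 },
 * read_n_cells() returns 0x100000002 and leaves *buf two cells
 * further on.
 */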
static unsigned long read_n_cells(int n, const __be32 **buf)
{
    unsigned long result = 0;

    while (n--) {
        result = (result << 32) | of_read_number(*buf, 1);
        (*buf)++;
    }
    return result;
}

struct assoc_arrays {
    u32 n_arrays;
    u32 array_sz;
    const __be32 *arrays;
};

/*
 * Retrieve and validate the list of associativity arrays for drconf
 * memory from the ibm,associativity-lookup-arrays property of the
 * device tree.
 *
 * The layout of the ibm,associativity-lookup-arrays property is a number N
 * indicating the number of associativity arrays, followed by a number M
 * indicating the size of each associativity array, followed by a list
 * of N associativity arrays.
 */
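/*
 * For instance, { 2, 3, a0, a1, a2, b0, b1, b2 } describes two
 * three-cell associativity arrays; an LMB with aa_index == 1 uses
 * { b0, b1, b2 }.
 */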
static int of_get_assoc_arrays(struct assoc_arrays *aa)
{
    struct device_node *memory;
    const __be32 *prop;
    u32 len;

    memory = of_find_node_by_path("/ibm,dynamic-reconfiguration-memory");
    if (!memory)
        return -1;

    prop = of_get_property(memory, "ibm,associativity-lookup-arrays", &len);
    if (!prop || len < 2 * sizeof(unsigned int)) {
        of_node_put(memory);
        return -1;
    }

    aa->n_arrays = of_read_number(prop++, 1);
    aa->array_sz = of_read_number(prop++, 1);

    of_node_put(memory);

    /* Now that we know the number of arrays and size of each array,
     * revalidate the size of the property read in.
     */
    if (len < (aa->n_arrays * aa->array_sz + 2) * sizeof(unsigned int))
        return -1;

    aa->arrays = prop;
    return 0;
}

static int __init get_nid_and_numa_distance(struct drmem_lmb *lmb)
{
    struct assoc_arrays aa = { .arrays = NULL };
    int default_nid = NUMA_NO_NODE;
    int nid = default_nid;
    int rc, index;

    if ((primary_domain_index < 0) || !numa_enabled)
        return default_nid;

    rc = of_get_assoc_arrays(&aa);
    if (rc)
        return default_nid;

    if (primary_domain_index <= aa.array_sz &&
        !(lmb->flags & DRCONF_MEM_AI_INVALID) && lmb->aa_index < aa.n_arrays) {
        const __be32 *associativity;

        index = lmb->aa_index * aa.array_sz;
        associativity = &aa.arrays[index];
        nid = __associativity_to_nid(associativity, aa.array_sz);
        if (nid > 0 && affinity_form == FORM1_AFFINITY) {
            /*
             * Lookup-array associativity entries do not carry
             * the array length as their first element.
             */
            __initialize_form1_numa_distance(associativity, aa.array_sz);
        }
    }
    return nid;
}

/*
 * This is like of_node_to_nid_single() for memory represented in the
 * ibm,dynamic-reconfiguration-memory node.
 */
int of_drconf_to_nid_single(struct drmem_lmb *lmb)
{
    struct assoc_arrays aa = { .arrays = NULL };
    int default_nid = NUMA_NO_NODE;
    int nid = default_nid;
    int rc, index;

    if ((primary_domain_index < 0) || !numa_enabled)
        return default_nid;

    rc = of_get_assoc_arrays(&aa);
    if (rc)
        return default_nid;

    if (primary_domain_index <= aa.array_sz &&
        !(lmb->flags & DRCONF_MEM_AI_INVALID) && lmb->aa_index < aa.n_arrays) {
        const __be32 *associativity;

        index = lmb->aa_index * aa.array_sz;
        associativity = &aa.arrays[index];
        nid = __associativity_to_nid(associativity, aa.array_sz);
    }
    return nid;
}

#ifdef CONFIG_PPC_SPLPAR

static int __vphn_get_associativity(long lcpu, __be32 *associativity)
{
    long rc, hwid;

    /*
     * On a shared LPAR, the device tree will not have node associativity.
     * At this time lppaca, or its __old_status field, may not be updated,
     * so the kernel cannot detect whether it is on a shared LPAR. Hence
     * request an explicit associativity irrespective of whether the LPAR
     * is shared or dedicated, and use the device tree property as a
     * fallback. cpu_to_phys_id is only valid between
     * smp_setup_cpu_maps() and smp_setup_pacas().
     */
    if (firmware_has_feature(FW_FEATURE_VPHN)) {
        if (cpu_to_phys_id)
            hwid = cpu_to_phys_id[lcpu];
        else
            hwid = get_hard_smp_processor_id(lcpu);

        rc = hcall_vphn(hwid, VPHN_FLAG_VCPU, associativity);
        if (rc == H_SUCCESS)
            return 0;
    }

    return -1;
}

static int vphn_get_nid(long lcpu)
{
    __be32 associativity[VPHN_ASSOC_BUFSIZE] = {0};

    if (!__vphn_get_associativity(lcpu, associativity))
        return associativity_to_nid(associativity);

    return NUMA_NO_NODE;
}
#else

static int __vphn_get_associativity(long lcpu, __be32 *associativity)
{
    return -1;
}

static int vphn_get_nid(long unused)
{
    return NUMA_NO_NODE;
}
#endif /* CONFIG_PPC_SPLPAR */

/*
 * Figure out to which domain a cpu belongs and stick it there.
 * Return the id of the domain used.
 */
static int numa_setup_cpu(unsigned long lcpu)
{
    struct device_node *cpu;
    int fcpu = cpu_first_thread_sibling(lcpu);
    int nid = NUMA_NO_NODE;

    if (!cpu_present(lcpu)) {
        set_cpu_numa_node(lcpu, first_online_node);
        return first_online_node;
    }

    /*
     * If a valid cpu-to-node mapping is already available, use it
     * directly instead of querying the firmware, since it represents
     * the most recent mapping notified to us by the platform (eg: VPHN).
     * The cpu-to-node binding is the same for all threads in a core,
     * so if a valid mapping already exists for the first thread of
     * the core, use it.
     */
    nid = numa_cpu_lookup_table[fcpu];
    if (nid >= 0) {
        map_cpu_to_node(lcpu, nid);
        return nid;
    }

    nid = vphn_get_nid(lcpu);
    if (nid != NUMA_NO_NODE)
        goto out_present;

    cpu = of_get_cpu_node(lcpu, NULL);

    if (!cpu) {
        WARN_ON(1);
        if (cpu_present(lcpu))
            goto out_present;
        else
            goto out;
    }

    nid = of_node_to_nid_single(cpu);
    of_node_put(cpu);

out_present:
    if (nid < 0 || !node_possible(nid))
        nid = first_online_node;

    /*
     * Update the mapping for the first thread of the core. All threads
     * of a core have to be part of the same node. This not only avoids
     * querying for every other thread in the core, but also avoids a
     * case where a virtual node associativity change causes subsequent
     * threads of a core to be associated with a different nid. However,
     * if the first thread is already online, expect it to have a valid
     * mapping.
     */
    if (fcpu != lcpu) {
        WARN_ON(cpu_online(fcpu));
        map_cpu_to_node(fcpu, nid);
    }

    map_cpu_to_node(lcpu, nid);
out:
    return nid;
}

static void verify_cpu_node_mapping(int cpu, int node)
{
    int base, sibling, i;

    /* Verify that all the threads in the core belong to the same node */
    base = cpu_first_thread_sibling(cpu);

    for (i = 0; i < threads_per_core; i++) {
        sibling = base + i;

        if (sibling == cpu || cpu_is_offline(sibling))
            continue;

        if (cpu_to_node(sibling) != node) {
            WARN(1, "CPU thread siblings %d and %d don't belong to the same node!\n",
                 cpu, sibling);
            break;
        }
    }
}

/* Must run before sched domains notifier. */
static int ppc_numa_cpu_prepare(unsigned int cpu)
{
    int nid;

    nid = numa_setup_cpu(cpu);
    verify_cpu_node_mapping(cpu, nid);
    return 0;
}

static int ppc_numa_cpu_dead(unsigned int cpu)
{
    return 0;
}

/*
 * Check and possibly modify a memory region to enforce the memory limit.
 *
 * Returns the size the region should have to enforce the memory limit.
 * This will either be the original value of size, a truncated value,
 * or zero. If the returned value of size is 0 the region should be
 * discarded as it lies wholly above the memory limit.
 */
static unsigned long __init numa_enforce_memory_limit(unsigned long start,
                              unsigned long size)
{
    /*
     * We use memblock_end_of_DRAM() in here instead of memory_limit because
     * we've already adjusted it for the limit and it takes care of
     * having memory holes below the limit.  Also, in the case of
     * iommu_is_off, memory_limit is not set but is implicitly enforced.
     */

    if (start + size <= memblock_end_of_DRAM())
        return size;

    if (start >= memblock_end_of_DRAM())
        return 0;

    return memblock_end_of_DRAM() - start;
}

/*
 * Reads the counter for a given entry in the
 * linux,drconf-usable-memory property
 */
static inline int __init read_usm_ranges(const __be32 **usm)
{
    /*
     * For each lmb in ibm,dynamic-memory, the corresponding
     * entry in the linux,drconf-usable-memory property contains
     * a counter followed by that many (base, size) tuples.
     * Read the counter from linux,drconf-usable-memory.
     */
    return read_n_cells(n_mem_size_cells, usm);
}

/*
 * Extract NUMA information from the ibm,dynamic-reconfiguration-memory
 * node.  This assumes n_mem_{addr,size}_cells have been set.
 */
static int __init numa_setup_drmem_lmb(struct drmem_lmb *lmb,
                    const __be32 **usm,
                    void *data)
{
    unsigned int ranges, is_kexec_kdump = 0;
    unsigned long base, size, sz;
    int nid;

    /*
     * Skip this block if the reserved bit is set in flags (0x80)
     * or if the block is not assigned to this partition (0x8)
     */
    if ((lmb->flags & DRCONF_MEM_RESERVED)
        || !(lmb->flags & DRCONF_MEM_ASSIGNED))
        return 0;

    if (*usm)
        is_kexec_kdump = 1;

    base = lmb->base_addr;
    size = drmem_lmb_size();
    ranges = 1;

    if (is_kexec_kdump) {
        ranges = read_usm_ranges(usm);
        if (!ranges) /* there are no (base, size) tuples */
            return 0;
    }

    do {
        if (is_kexec_kdump) {
            base = read_n_cells(n_mem_addr_cells, usm);
            size = read_n_cells(n_mem_size_cells, usm);
        }

        nid = get_nid_and_numa_distance(lmb);
        fake_numa_create_new_node(((base + size) >> PAGE_SHIFT),
                      &nid);
        node_set_online(nid);
        sz = numa_enforce_memory_limit(base, size);
        if (sz)
            memblock_set_node(base, sz, &memblock.memory, nid);
    } while (--ranges);

    return 0;
}

static int __init parse_numa_properties(void)
{
    struct device_node *memory;
    int default_nid = 0;
    unsigned long i;
    const __be32 *associativity;

    if (numa_enabled == 0) {
        pr_warn("disabled by user\n");
        return -1;
    }

    primary_domain_index = find_primary_domain_index();

    if (primary_domain_index < 0) {
        /*
         * If we fail to parse primary_domain_index from the device
         * tree, mark NUMA as disabled and boot with NUMA disabled.
         */
        numa_enabled = false;
        return primary_domain_index;
    }

    pr_debug("associativity depth for CPU/Memory: %d\n", primary_domain_index);

    /*
     * If it is FORM2, initialize the distance table here.
     */
    if (affinity_form == FORM2_AFFINITY)
        initialize_form2_numa_distance_lookup_table();

    /*
     * Even though we connect cpus to numa domains later in SMP
     * init, we need to know the node ids now. This is because
     * each node to be onlined must have NODE_DATA etc backing it.
     */
    for_each_present_cpu(i) {
        __be32 vphn_assoc[VPHN_ASSOC_BUFSIZE];
        struct device_node *cpu;
        int nid = NUMA_NO_NODE;

        memset(vphn_assoc, 0, VPHN_ASSOC_BUFSIZE * sizeof(__be32));

        if (__vphn_get_associativity(i, vphn_assoc) == 0) {
            nid = associativity_to_nid(vphn_assoc);
            initialize_form1_numa_distance(vphn_assoc);
        } else {
            /*
             * Don't fall back to default_nid yet -- we will plug
             * cpus into nodes once the memory scan has discovered
             * the topology.
             */
            cpu = of_get_cpu_node(i, NULL);
            BUG_ON(!cpu);

            associativity = of_get_associativity(cpu);
            if (associativity) {
                nid = associativity_to_nid(associativity);
                initialize_form1_numa_distance(associativity);
            }
            of_node_put(cpu);
        }

        /* node_set_online() is undefined behavior if 'nid' is negative */
        if (likely(nid >= 0))
            node_set_online(nid);
    }

    get_n_mem_cells(&n_mem_addr_cells, &n_mem_size_cells);

    for_each_node_by_type(memory, "memory") {
        unsigned long start;
        unsigned long size;
        int nid;
        int ranges;
        const __be32 *memcell_buf;
        unsigned int len;

        memcell_buf = of_get_property(memory,
            "linux,usable-memory", &len);
        if (!memcell_buf || len <= 0)
            memcell_buf = of_get_property(memory, "reg", &len);
        if (!memcell_buf || len <= 0)
            continue;

        /* len is in bytes; convert to cells, then to (addr, size) ranges */
        ranges = (len >> 2) / (n_mem_addr_cells + n_mem_size_cells);
new_range:
        /* these are order-sensitive, and modify the buffer pointer */
        start = read_n_cells(n_mem_addr_cells, &memcell_buf);
        size = read_n_cells(n_mem_size_cells, &memcell_buf);

        /*
         * Assumption: either all memory nodes or none will
         * have associativity properties.  If none, then
         * everything goes to default_nid.
         */
        associativity = of_get_associativity(memory);
        if (associativity) {
            nid = associativity_to_nid(associativity);
            initialize_form1_numa_distance(associativity);
        } else
            nid = default_nid;

        fake_numa_create_new_node(((start + size) >> PAGE_SHIFT), &nid);
        node_set_online(nid);

        size = numa_enforce_memory_limit(start, size);
        if (size)
            memblock_set_node(start, size, &memblock.memory, nid);

        if (--ranges)
            goto new_range;
    }

    /*
     * Now do the same thing for each MEMBLOCK listed in the
     * ibm,dynamic-memory property in the
     * ibm,dynamic-reconfiguration-memory node.
     */
    memory = of_find_node_by_path("/ibm,dynamic-reconfiguration-memory");
    if (memory) {
        walk_drmem_lmbs(memory, NULL, numa_setup_drmem_lmb);
        of_node_put(memory);
    }

    return 0;
}

static void __init setup_nonnuma(void)
{
    unsigned long top_of_ram = memblock_end_of_DRAM();
    unsigned long total_ram = memblock_phys_mem_size();
    unsigned long start_pfn, end_pfn;
    unsigned int nid = 0;
    int i;

    pr_debug("Top of RAM: 0x%lx, Total RAM: 0x%lx\n", top_of_ram, total_ram);
    pr_debug("Memory hole size: %ldMB\n", (top_of_ram - total_ram) >> 20);

    for_each_mem_pfn_range(i, MAX_NUMNODES, &start_pfn, &end_pfn, NULL) {
        fake_numa_create_new_node(end_pfn, &nid);
        memblock_set_node(PFN_PHYS(start_pfn),
                  PFN_PHYS(end_pfn - start_pfn),
                  &memblock.memory, nid);
        node_set_online(nid);
    }
}

void __init dump_numa_cpu_topology(void)
{
    unsigned int node;
    unsigned int cpu, count;

    if (!numa_enabled)
        return;

    for_each_online_node(node) {
        pr_info("Node %d CPUs:", node);

        count = 0;
        /*
         * If we used a CPU iterator here we would miss printing
         * the holes in the cpumap.
         */
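        /*
         * Runs of present CPUs are coalesced, so the output reads
         * e.g. "Node 0 CPUs: 0-7 16-23".
         */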
        for (cpu = 0; cpu < nr_cpu_ids; cpu++) {
            if (cpumask_test_cpu(cpu,
                    node_to_cpumask_map[node])) {
                if (count == 0)
                    pr_cont(" %u", cpu);
                ++count;
            } else {
                if (count > 1)
                    pr_cont("-%u", cpu - 1);
                count = 0;
            }
        }

        if (count > 1)
            pr_cont("-%u", nr_cpu_ids - 1);
        pr_cont("\n");
    }
}

/* Initialize NODE_DATA for a node on the local memory */
static void __init setup_node_data(int nid, u64 start_pfn, u64 end_pfn)
{
    u64 spanned_pages = end_pfn - start_pfn;
    const size_t nd_size = roundup(sizeof(pg_data_t), SMP_CACHE_BYTES);
    u64 nd_pa;
    void *nd;
    int tnid;

    nd_pa = memblock_phys_alloc_try_nid(nd_size, SMP_CACHE_BYTES, nid);
    if (!nd_pa)
        panic("Cannot allocate %zu bytes for node %d data\n",
              nd_size, nid);

    nd = __va(nd_pa);

    /* report and initialize */
    pr_info("  NODE_DATA [mem %#010Lx-%#010Lx]\n",
        nd_pa, nd_pa + nd_size - 1);
    tnid = early_pfn_to_nid(nd_pa >> PAGE_SHIFT);
    if (tnid != nid)
        pr_info("    NODE_DATA(%d) on node %d\n", nid, tnid);

    node_data[nid] = nd;
    memset(NODE_DATA(nid), 0, sizeof(pg_data_t));
    NODE_DATA(nid)->node_id = nid;
    NODE_DATA(nid)->node_start_pfn = start_pfn;
    NODE_DATA(nid)->node_spanned_pages = spanned_pages;
}

static void __init find_possible_nodes(void)
{
    struct device_node *rtas;
    const __be32 *domains = NULL;
    int prop_length, max_nodes;
    u32 i;

    if (!numa_enabled)
        return;

    rtas = of_find_node_by_path("/rtas");
    if (!rtas)
        return;

    /*
     * ibm,current-associativity-domains is a fairly recent property. If
     * it doesn't exist, then fall back to ibm,max-associativity-domains.
     * "Current" denotes what the platform can support compared to "max",
     * which denotes what the hypervisor can support.
     *
     * If the LPAR is migratable, new nodes might be activated after an
     * LPM, so we should consider the max number in that case.
     */
    if (!of_get_property(of_root, "ibm,migratable-partition", NULL))
        domains = of_get_property(rtas,
                      "ibm,current-associativity-domains",
                      &prop_length);
    if (!domains) {
        domains = of_get_property(rtas, "ibm,max-associativity-domains",
                    &prop_length);
        if (!domains)
            goto out;
    }

    max_nodes = of_read_number(&domains[primary_domain_index], 1);
    pr_info("Partition configured for %d NUMA nodes.\n", max_nodes);

    for (i = 0; i < max_nodes; i++) {
        if (!node_possible(i))
            node_set(i, node_possible_map);
    }

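    /*
     * If the property describes domain levels beyond the primary
     * domain (plus one), the platform is taken to expose core groups,
     * which the scheduler can use as an extra topology level.
     */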
    prop_length /= sizeof(int);
    if (prop_length > primary_domain_index + 2)
        coregroup_enabled = 1;

out:
    of_node_put(rtas);
}

void __init mem_topology_setup(void)
{
    int cpu;

    /*
     * Linux/mm assumes node 0 to be online at boot. However this is not
     * true on PowerPC, where node 0 is similar to any other node: it
     * could be a cpuless, memoryless node. So force node 0 offline for
     * now. This prevents a cpuless, memoryless node 0 from showing up
     * unnecessarily as online. If a node has cpus or memory that need
     * to be online, the node will be marked online anyway.
     */
    node_set_offline(0);

    if (parse_numa_properties())
        setup_nonnuma();

    /*
     * Modify the set of possible NUMA nodes to reflect information
     * available about the set of online nodes, and the set of nodes
     * that we expect to make use of for this platform's affinity
     * calculations.
     */
    nodes_and(node_possible_map, node_possible_map, node_online_map);

    find_possible_nodes();

    setup_node_to_cpumask_map();

    reset_numa_cpu_lookup_table();

    for_each_possible_cpu(cpu) {
        /*
         * Powerpc with CONFIG_NUMA always used to have a node 0,
         * even if it was memoryless or cpuless. For all cpus that
         * are possible but not present, cpu_to_node() would point
         * to node 0. To remove a cpuless, memoryless dummy node,
         * powerpc needs to make sure all possible but not present
         * cpu_to_node entries are set to a proper node.
         */
        numa_setup_cpu(cpu);
    }
}

void __init initmem_init(void)
{
    int nid;

    max_low_pfn = memblock_end_of_DRAM() >> PAGE_SHIFT;
    max_pfn = max_low_pfn;

    memblock_dump_all();

    for_each_online_node(nid) {
        unsigned long start_pfn, end_pfn;

        get_pfn_range_for_nid(nid, &start_pfn, &end_pfn);
        setup_node_data(nid, start_pfn, end_pfn);
    }

    sparse_init();

    /*
     * We need the numa_cpu_lookup_table to be accurate for all CPUs,
     * even before we online them, so that we can use cpu_to_{node,mem}
     * early in boot, cf. smp_prepare_cpus().
     * _nocalls() + manual invocation is used because cpuhp is not yet
     * initialized for the boot CPU.
     */
    cpuhp_setup_state_nocalls(CPUHP_POWER_NUMA_PREPARE, "powerpc/numa:prepare",
                  ppc_numa_cpu_prepare, ppc_numa_cpu_dead);
}

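/* Handle the early "numa=off" and "numa=fake=..." command line options. */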
static int __init early_numa(char *p)
{
    if (!p)
        return 0;

    if (strstr(p, "off"))
        numa_enabled = 0;

    p = strstr(p, "fake=");
    if (p)
        cmdline = p + strlen("fake=");

    return 0;
}
early_param("numa", early_numa);

#ifdef CONFIG_MEMORY_HOTPLUG
/*
 * Find the node associated with a hot added memory section for
 * memory represented in the device tree by the property
 * ibm,dynamic-reconfiguration-memory/ibm,dynamic-memory.
 */
static int hot_add_drconf_scn_to_nid(unsigned long scn_addr)
{
    struct drmem_lmb *lmb;
    unsigned long lmb_size;
    int nid = NUMA_NO_NODE;

    lmb_size = drmem_lmb_size();

    for_each_drmem_lmb(lmb) {
        /*
         * Skip this block if it is reserved or not assigned to
         * this partition.
         */
        if ((lmb->flags & DRCONF_MEM_RESERVED)
            || !(lmb->flags & DRCONF_MEM_ASSIGNED))
            continue;

        if ((scn_addr < lmb->base_addr)
            || (scn_addr >= (lmb->base_addr + lmb_size)))
            continue;

        nid = of_drconf_to_nid_single(lmb);
        break;
    }

    return nid;
}

/*
 * Find the node associated with a hot added memory section for memory
 * represented in the device tree as a node (i.e. memory@XXXX) for
 * each memblock.
 */
static int hot_add_node_scn_to_nid(unsigned long scn_addr)
{
    struct device_node *memory;
    int nid = NUMA_NO_NODE;

    for_each_node_by_type(memory, "memory") {
        unsigned long start, size;
        int ranges;
        const __be32 *memcell_buf;
        unsigned int len;

        memcell_buf = of_get_property(memory, "reg", &len);
        if (!memcell_buf || len <= 0)
            continue;

        /* len is in bytes; convert to cells, then to (addr, size) ranges */
        ranges = (len >> 2) / (n_mem_addr_cells + n_mem_size_cells);

        while (ranges--) {
            start = read_n_cells(n_mem_addr_cells, &memcell_buf);
            size = read_n_cells(n_mem_size_cells, &memcell_buf);

            if ((scn_addr < start) || (scn_addr >= (start + size)))
                continue;

            nid = of_node_to_nid_single(memory);
            break;
        }

        if (nid >= 0)
            break;
    }

    of_node_put(memory);

    return nid;
}

/*
 * Find the node associated with a hot added memory section.  Section
 * corresponds to a SPARSEMEM section, not a MEMBLOCK.  It is assumed that
 * sections are fully contained within a single MEMBLOCK.
 */
int hot_add_scn_to_nid(unsigned long scn_addr)
{
    struct device_node *memory = NULL;
    int nid;

    if (!numa_enabled)
        return first_online_node;

    memory = of_find_node_by_path("/ibm,dynamic-reconfiguration-memory");
    if (memory) {
        nid = hot_add_drconf_scn_to_nid(scn_addr);
        of_node_put(memory);
    } else {
        nid = hot_add_node_scn_to_nid(scn_addr);
    }

    if (nid < 0 || !node_possible(nid))
        nid = first_online_node;

    return nid;
}

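/*
 * The first 64-bit cell of ibm,lrdr-capacity gives the maximum address
 * to which memory may be dynamically added; when the property is
 * absent, the drconf LMB layout bounds the hot-add range instead.
 */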
static u64 hot_add_drconf_memory_max(void)
{
    struct device_node *memory = NULL;
    struct device_node *dn = NULL;
    const __be64 *lrdr = NULL;

    dn = of_find_node_by_path("/rtas");
    if (dn) {
        lrdr = of_get_property(dn, "ibm,lrdr-capacity", NULL);
        of_node_put(dn);
        if (lrdr)
            return be64_to_cpup(lrdr);
    }

    memory = of_find_node_by_path("/ibm,dynamic-reconfiguration-memory");
    if (memory) {
        of_node_put(memory);
        return drmem_lmb_memory_max();
    }
    return 0;
}

/*
 * memory_hotplug_max - return max address of memory that may be added
 *
 * This is currently only used on systems that support drconfig memory
 * hotplug.
 */
u64 memory_hotplug_max(void)
{
    return max(hot_add_drconf_memory_max(), memblock_end_of_DRAM());
}
#endif /* CONFIG_MEMORY_HOTPLUG */

/* Virtual Processor Home Node (VPHN) support */
#ifdef CONFIG_PPC_SPLPAR
static int topology_inited;

/*
 * Retrieve the new associativity information for a virtual processor's
 * home node.
 */
static long vphn_get_associativity(unsigned long cpu,
                    __be32 *associativity)
{
    long rc;

    rc = hcall_vphn(get_hard_smp_processor_id(cpu),
                VPHN_FLAG_VCPU, associativity);

    switch (rc) {
    case H_SUCCESS:
        pr_debug("VPHN hcall succeeded. Reset polling...\n");
        goto out;

    case H_FUNCTION:
        pr_err_ratelimited("VPHN unsupported. Disabling polling...\n");
        break;
    case H_HARDWARE:
        pr_err_ratelimited("hcall_vphn() experienced a hardware fault preventing VPHN. Disabling polling...\n");
        break;
    case H_PARAMETER:
        pr_err_ratelimited("hcall_vphn() was passed an invalid parameter. Disabling polling...\n");
        break;
    default:
        pr_err_ratelimited("hcall_vphn() returned %ld. Disabling polling...\n",
            rc);
        break;
    }
out:
    return rc;
}

void find_and_update_cpu_nid(int cpu)
{
    __be32 associativity[VPHN_ASSOC_BUFSIZE] = {0};
    int new_nid;

    /* Use associativity from first thread for all siblings */
    if (vphn_get_associativity(cpu, associativity))
        return;

    /* We do not have a previous associativity, so find it now. */
    new_nid = associativity_to_nid(associativity);

    if (new_nid < 0 || !node_possible(new_nid))
        new_nid = first_online_node;
    else
        /*
         * Associate node <-> cpu, so cpu_up() calls
         * try_online_node() on the right node.
         */
        set_cpu_numa_node(cpu, new_nid);

    pr_debug("%s:%d cpu %d nid %d\n", __func__, __LINE__, cpu, new_nid);
}

int cpu_to_coregroup_id(int cpu)
{
    __be32 associativity[VPHN_ASSOC_BUFSIZE] = {0};
    int index;

    if (cpu < 0 || cpu >= nr_cpu_ids)
        return -1;

    if (!coregroup_enabled)
        goto out;

    if (!firmware_has_feature(FW_FEATURE_VPHN))
        goto out;

    if (vphn_get_associativity(cpu, associativity))
        goto out;

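    /*
     * associativity[0] is the number of entries that follow, so the
     * last domain sits at associativity[index]; the coregroup id is
     * taken from the entry just before it.
     */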
    index = of_read_number(associativity, 1);
    if (index > primary_domain_index + 1)
        return of_read_number(&associativity[index - 1], 1);

out:
    return cpu_to_core_id(cpu);
}

static int topology_update_init(void)
{
    topology_inited = 1;
    return 0;
}
device_initcall(topology_update_init);
#endif /* CONFIG_PPC_SPLPAR */