// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright (C) 2016 Thomas Gleixner.
 * Copyright (C) 2016-2017 Christoph Hellwig.
 */
#include <linux/interrupt.h>
#include <linux/kernel.h>
#include <linux/slab.h>
#include <linux/cpu.h>
#include <linux/sort.h>

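/*
 * Move @cpus_per_vec CPUs from @nmsk to @irqmsk, preferring to pick up the
 * SMT siblings of an already chosen CPU before moving on to the next one.
 */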
static void irq_spread_init_one(struct cpumask *irqmsk, struct cpumask *nmsk,
                unsigned int cpus_per_vec)
{
    const struct cpumask *siblmsk;
    int cpu, sibl;

    for ( ; cpus_per_vec > 0; ) {
        cpu = cpumask_first(nmsk);

        /* Should not happen, but I'm too lazy to think about it */
        if (cpu >= nr_cpu_ids)
            return;

        cpumask_clear_cpu(cpu, nmsk);
        cpumask_set_cpu(cpu, irqmsk);
        cpus_per_vec--;

        /* If the cpu has siblings, use them first */
        siblmsk = topology_sibling_cpumask(cpu);
        for (sibl = -1; cpus_per_vec > 0; ) {
            sibl = cpumask_next(sibl, siblmsk);
            if (sibl >= nr_cpu_ids)
                break;
            if (!cpumask_test_and_clear_cpu(sibl, nmsk))
                continue;
            cpumask_set_cpu(sibl, irqmsk);
            cpus_per_vec--;
        }
    }
}

static cpumask_var_t *alloc_node_to_cpumask(void)
{
    cpumask_var_t *masks;
    int node;

    masks = kcalloc(nr_node_ids, sizeof(cpumask_var_t), GFP_KERNEL);
    if (!masks)
        return NULL;

    for (node = 0; node < nr_node_ids; node++) {
        if (!zalloc_cpumask_var(&masks[node], GFP_KERNEL))
            goto out_unwind;
    }

    return masks;

out_unwind:
    while (--node >= 0)
        free_cpumask_var(masks[node]);
    kfree(masks);
    return NULL;
}

static void free_node_to_cpumask(cpumask_var_t *masks)
{
    int node;

    for (node = 0; node < nr_node_ids; node++)
        free_cpumask_var(masks[node]);
    kfree(masks);
}

static void build_node_to_cpumask(cpumask_var_t *masks)
{
    int cpu;

    for_each_possible_cpu(cpu)
        cpumask_set_cpu(cpu, masks[cpu_to_node(cpu)]);
}

static int get_nodes_in_cpumask(cpumask_var_t *node_to_cpumask,
                const struct cpumask *mask, nodemask_t *nodemsk)
{
    int n, nodes = 0;

    /* Calculate the number of nodes in the supplied affinity mask */
    for_each_node(n) {
        if (cpumask_intersects(mask, node_to_cpumask[n])) {
            node_set(n, *nodemsk);
            nodes++;
        }
    }
    return nodes;
}

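/*
 * Per-node bookkeeping for spreading: ncpus is filled in first (number of
 * CPUs of the node covered by the current mask) and is later overwritten,
 * through the union, with the number of vectors assigned to that node.
 */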
struct node_vectors {
    unsigned id;

    union {
        unsigned nvectors;
        unsigned ncpus;
    };
};

static int ncpus_cmp_func(const void *l, const void *r)
{
    const struct node_vectors *ln = l;
    const struct node_vectors *rn = r;

    return ln->ncpus - rn->ncpus;
}

/*
 * Allocate a vector count for each node, so that for each node:
 *
 * 1) the allocated number is >= 1
 *
 * 2) the allocated number is <= the number of active CPUs of this node
 *
 * The total number of allocated vectors may be less than @numvecs when
 * the total number of active CPUs is less than @numvecs.
 *
 * "Active CPUs" means the CPUs in '@cpu_mask AND @node_to_cpumask[]'
 * for each node.
 */
static void alloc_nodes_vectors(unsigned int numvecs,
                cpumask_var_t *node_to_cpumask,
                const struct cpumask *cpu_mask,
                const nodemask_t nodemsk,
                struct cpumask *nmsk,
                struct node_vectors *node_vectors)
{
    unsigned n, remaining_ncpus = 0;

    for (n = 0; n < nr_node_ids; n++) {
        node_vectors[n].id = n;
        node_vectors[n].ncpus = UINT_MAX;
    }

    for_each_node_mask(n, nodemsk) {
        unsigned ncpus;

        cpumask_and(nmsk, cpu_mask, node_to_cpumask[n]);
        ncpus = cpumask_weight(nmsk);

        if (!ncpus)
            continue;
        remaining_ncpus += ncpus;
        node_vectors[n].ncpus = ncpus;
    }

    numvecs = min_t(unsigned, remaining_ncpus, numvecs);

    sort(node_vectors, nr_node_ids, sizeof(node_vectors[0]),
         ncpus_cmp_func, NULL);

    /*
     * Allocate vectors for each node according to the ratio of this
     * node's nr_cpus to the remaining un-assigned ncpus. 'numvecs' is
     * bigger than the number of active numa nodes here. Always start
     * the allocation from the node with the smallest nr_cpus.
     *
     * This guarantees that each active node gets allocated at least
     * one vector, and the reasoning is simple: over-allocation only
     * happens when a node is assigned a single vector, so the other
     * nodes are still left with >= 1 vector each, since 'numvecs' is
     * bigger than the number of numa nodes.
     *
     * A key invariant is that the number of vectors allocated to each
     * node is <= the CPU count of that node:
     *
     * 1) suppose there are two nodes: A and B
     *  ncpu(X) is the CPU count of node X
     *  vecs(X) is the vector count allocated to node X via this
     *  algorithm
     *
     *  ncpu(A) <= ncpu(B)
     *  ncpu(A) + ncpu(B) = N
     *  vecs(A) + vecs(B) = V
     *
     *  vecs(A) = max(1, round_down(V * ncpu(A) / N))
     *  vecs(B) = V - vecs(A)
     *
     *  both N and V are integers, and 2 <= V <= N; write
     *  V = N - delta, with 0 <= delta <= N - 2
     *
     * 2) obviously vecs(A) <= ncpu(A), because:
     *
     *  if vecs(A) is 1, then vecs(A) <= ncpu(A) given
     *  ncpu(A) >= 1
     *
     *  otherwise,
     *      vecs(A) <= V * ncpu(A) / N <= ncpu(A), given V <= N
     *
     * 3) prove that vecs(B) <= ncpu(B):
     *
     *  if round_down(V * ncpu(A) / N) == 0, then vecs(A) = 1 and
     *  vecs(B) won't be over-allocated, so vecs(B) <= ncpu(B),
     *
     *  otherwise:
     *
     *  vecs(A) =
     *      round_down(V * ncpu(A) / N) =
     *      round_down((N - delta) * ncpu(A) / N) =
     *      round_down((N * ncpu(A) - delta * ncpu(A)) / N)  >=
     *      round_down((N * ncpu(A) - delta * N) / N)    =
     *      ncpu(A) - delta
     *
     *  then:
     *
     *  vecs(A) - V >= ncpu(A) - delta - V
     *  =>
     *  V - vecs(A) <= V + delta - ncpu(A)
     *  =>
     *  vecs(B) <= N - ncpu(A)
     *  =>
     *  vecs(B) <= ncpu(B)
     *
     * For three or more nodes, treat the current node as A and the
     * remaining nodes combined as one big node B; that is exactly how
     * the algorithm proceeds, re-calculating 'remaining_ncpus' and
     * 'numvecs' after every node, so finally vecs(X) <= ncpu(X) holds
     * for each node X.
     */
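    /*
     * Worked example (illustrative): two active nodes A and B with
     * ncpu(A) = 2 and ncpu(B) = 6, so N = 8, and numvecs = V = 4.
     * Starting from the smaller node:
     *  vecs(A) = max(1, 4 * 2 / 8) = 1, leaving 6 CPUs and 3 vectors,
     *  vecs(B) = max(1, 3 * 6 / 6) = 3.
     * Each node gets at least one vector and neither exceeds its CPU count.
     */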
    for (n = 0; n < nr_node_ids; n++) {
        unsigned nvectors, ncpus;

        if (node_vectors[n].ncpus == UINT_MAX)
            continue;

        WARN_ON_ONCE(numvecs == 0);

        ncpus = node_vectors[n].ncpus;
        nvectors = max_t(unsigned, 1,
                 numvecs * ncpus / remaining_ncpus);
        WARN_ON_ONCE(nvectors > ncpus);

        node_vectors[n].nvectors = nvectors;

        remaining_ncpus -= ncpus;
        numvecs -= nvectors;
    }
}

static int __irq_build_affinity_masks(unsigned int startvec,
                      unsigned int numvecs,
                      unsigned int firstvec,
                      cpumask_var_t *node_to_cpumask,
                      const struct cpumask *cpu_mask,
                      struct cpumask *nmsk,
                      struct irq_affinity_desc *masks)
{
    unsigned int i, n, nodes, cpus_per_vec, extra_vecs, done = 0;
    unsigned int last_affv = firstvec + numvecs;
    unsigned int curvec = startvec;
    nodemask_t nodemsk = NODE_MASK_NONE;
    struct node_vectors *node_vectors;

    if (cpumask_empty(cpu_mask))
        return 0;

    nodes = get_nodes_in_cpumask(node_to_cpumask, cpu_mask, &nodemsk);

    /*
     * If the number of nodes in the mask is greater than or equal to the
     * number of vectors, just spread the vectors across the nodes.
     */
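    /*
     * E.g. with two vectors and four nodes in the mask, each vector ends
     * up covering the CPUs of two nodes, thanks to the wrap below.
     */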
    if (numvecs <= nodes) {
        for_each_node_mask(n, nodemsk) {
            /* Ensure that only CPUs which are in both masks are set */
            cpumask_and(nmsk, cpu_mask, node_to_cpumask[n]);
            cpumask_or(&masks[curvec].mask, &masks[curvec].mask, nmsk);
            if (++curvec == last_affv)
                curvec = firstvec;
        }
        return numvecs;
    }

    node_vectors = kcalloc(nr_node_ids,
                   sizeof(struct node_vectors),
                   GFP_KERNEL);
    if (!node_vectors)
        return -ENOMEM;

    /* allocate vector number for each node */
    alloc_nodes_vectors(numvecs, node_to_cpumask, cpu_mask,
                nodemsk, nmsk, node_vectors);

    for (i = 0; i < nr_node_ids; i++) {
        unsigned int ncpus, v;
        struct node_vectors *nv = &node_vectors[i];

        if (nv->nvectors == UINT_MAX)
            continue;

        /* Get the cpus on this node which are in the mask */
        cpumask_and(nmsk, cpu_mask, node_to_cpumask[nv->id]);
        ncpus = cpumask_weight(nmsk);
        if (!ncpus)
            continue;

        WARN_ON_ONCE(nv->nvectors > ncpus);

        /* Account for rounding errors */
        extra_vecs = ncpus - nv->nvectors * (ncpus / nv->nvectors);
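        /*
         * extra_vecs is simply ncpus % nv->nvectors: the first extra_vecs
         * vectors below get one CPU more (e.g. 5 CPUs on 2 vectors -> 3 + 2).
         */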

        /* Spread allocated vectors on CPUs of the current node */
        for (v = 0; v < nv->nvectors; v++, curvec++) {
            cpus_per_vec = ncpus / nv->nvectors;

            /* Account for extra vectors to compensate for rounding errors */
            if (extra_vecs) {
                cpus_per_vec++;
                --extra_vecs;
            }

            /*
             * Wrapping has to be considered since 'startvec'
             * may start anywhere.
             */
            if (curvec >= last_affv)
                curvec = firstvec;
            irq_spread_init_one(&masks[curvec].mask, nmsk,
                        cpus_per_vec);
        }
        done += nv->nvectors;
    }
    kfree(node_vectors);
    return done;
}

/*
 * Build the affinity masks in two stages:
 *  1) spread present CPUs on these vectors
 *  2) spread other possible CPUs on these vectors
 */
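/*
 * Spreading over the possible-but-not-present CPUs as well means that a CPU
 * which is hotplugged later already has vectors whose masks include it.
 */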
static int irq_build_affinity_masks(unsigned int startvec, unsigned int numvecs,
                    unsigned int firstvec,
                    struct irq_affinity_desc *masks)
{
    unsigned int curvec = startvec, nr_present = 0, nr_others = 0;
    cpumask_var_t *node_to_cpumask;
    cpumask_var_t nmsk, npresmsk;
    int ret = -ENOMEM;

    if (!zalloc_cpumask_var(&nmsk, GFP_KERNEL))
        return ret;

    if (!zalloc_cpumask_var(&npresmsk, GFP_KERNEL))
        goto fail_nmsk;

    node_to_cpumask = alloc_node_to_cpumask();
    if (!node_to_cpumask)
        goto fail_npresmsk;

    /* Stabilize the cpumasks */
    cpus_read_lock();
    build_node_to_cpumask(node_to_cpumask);

    /* Spread on present CPUs starting from affd->pre_vectors */
    ret = __irq_build_affinity_masks(curvec, numvecs, firstvec,
                     node_to_cpumask, cpu_present_mask,
                     nmsk, masks);
    if (ret < 0)
        goto fail_build_affinity;
    nr_present = ret;

    /*
     * Spread on non-present CPUs starting from the next vector to be
     * handled. If the spreading of present CPUs already exhausted the
     * vector space, assign the non-present CPUs to the already spread
     * out vectors.
     */
    if (nr_present >= numvecs)
        curvec = firstvec;
    else
        curvec = firstvec + nr_present;
    cpumask_andnot(npresmsk, cpu_possible_mask, cpu_present_mask);
    ret = __irq_build_affinity_masks(curvec, numvecs, firstvec,
                     node_to_cpumask, npresmsk, nmsk,
                     masks);
    if (ret >= 0)
        nr_others = ret;

 fail_build_affinity:
    cpus_read_unlock();

    if (ret >= 0)
        WARN_ON(nr_present + nr_others < numvecs);

    free_node_to_cpumask(node_to_cpumask);

 fail_npresmsk:
    free_cpumask_var(npresmsk);

 fail_nmsk:
    free_cpumask_var(nmsk);
    return ret < 0 ? ret : 0;
}

static void default_calc_sets(struct irq_affinity *affd, unsigned int affvecs)
{
    affd->nr_sets = 1;
    affd->set_size[0] = affvecs;
}

/**
 * irq_create_affinity_masks - Create affinity masks for multiqueue spreading
 * @nvecs:  The total number of vectors
 * @affd:   Description of the affinity requirements
 *
 * Returns the irq_affinity_desc pointer or NULL if allocation failed.
 */
struct irq_affinity_desc *
irq_create_affinity_masks(unsigned int nvecs, struct irq_affinity *affd)
{
    unsigned int affvecs, curvec, usedvecs, i;
    struct irq_affinity_desc *masks = NULL;

    /*
     * Determine the number of vectors which need interrupt affinities
     * assigned. If the pre/post request exhausts the available vectors
     * then there is nothing to do here except invoke the calc_sets()
     * callback so the device driver can adjust to the situation.
     */
    if (nvecs > affd->pre_vectors + affd->post_vectors)
        affvecs = nvecs - affd->pre_vectors - affd->post_vectors;
    else
        affvecs = 0;

    /*
     * Simple invocations do not provide a calc_sets() callback. Install
     * the generic one.
     */
    if (!affd->calc_sets)
        affd->calc_sets = default_calc_sets;

    /* Recalculate the sets */
    affd->calc_sets(affd, affvecs);

    if (WARN_ON_ONCE(affd->nr_sets > IRQ_AFFINITY_MAX_SETS))
        return NULL;

    /* Nothing to assign? */
    if (!affvecs)
        return NULL;

    masks = kcalloc(nvecs, sizeof(*masks), GFP_KERNEL);
    if (!masks)
        return NULL;

    /* Fill out vectors at the beginning that don't need affinity */
    for (curvec = 0; curvec < affd->pre_vectors; curvec++)
        cpumask_copy(&masks[curvec].mask, irq_default_affinity);

    /*
     * Spread on present CPUs starting from affd->pre_vectors. If we
     * have multiple sets, build each set's affinity mask separately.
     */
    for (i = 0, usedvecs = 0; i < affd->nr_sets; i++) {
        unsigned int this_vecs = affd->set_size[i];
        int ret;

        ret = irq_build_affinity_masks(curvec, this_vecs,
                           curvec, masks);
        if (ret) {
            kfree(masks);
            return NULL;
        }
        curvec += this_vecs;
        usedvecs += this_vecs;
    }

    /* Fill out vectors at the end that don't need affinity */
    if (usedvecs >= affvecs)
        curvec = affd->pre_vectors + affvecs;
    else
        curvec = affd->pre_vectors + usedvecs;
    for (; curvec < nvecs; curvec++)
        cpumask_copy(&masks[curvec].mask, irq_default_affinity);

    /* Mark the managed interrupts */
    for (i = affd->pre_vectors; i < nvecs - affd->post_vectors; i++)
        masks[i].is_managed = 1;

    return masks;
}
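
/*
 * For instance (an illustrative sketch, not lifted from any particular
 * driver): a PCI driver that reserves one vector for its admin interrupt
 * and spreads the rest over its queues might request vectors roughly as
 *
 *        struct irq_affinity affd = { .pre_vectors = 1 };
 *
 *        nvecs = pci_alloc_irq_vectors_affinity(pdev, 2, nr_queues + 1,
 *                        PCI_IRQ_MSIX | PCI_IRQ_AFFINITY, &affd);
 *
 * which reaches irq_create_affinity_masks() via the PCI/MSI core: vector 0
 * keeps the default affinity and the remaining vectors are spread as above.
 */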

/**
 * irq_calc_affinity_vectors - Calculate the optimal number of vectors
 * @minvec: The minimum number of vectors available
 * @maxvec: The maximum number of vectors available
 * @affd:   Description of the affinity requirements
 */
unsigned int irq_calc_affinity_vectors(unsigned int minvec, unsigned int maxvec,
                       const struct irq_affinity *affd)
{
    unsigned int resv = affd->pre_vectors + affd->post_vectors;
    unsigned int set_vecs;

    if (resv > minvec)
        return 0;

    if (affd->calc_sets) {
        set_vecs = maxvec - resv;
    } else {
        cpus_read_lock();
        set_vecs = cpumask_weight(cpu_possible_mask);
        cpus_read_unlock();
    }

    return resv + min(set_vecs, maxvec - resv);
}
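
/*
 * For instance, with pre_vectors = 1, post_vectors = 0 and no calc_sets()
 * callback on a machine with 16 possible CPUs, a caller asking for
 * minvec = 2 and maxvec = 32 gets resv = 1, set_vecs = 16 and therefore
 * 1 + min(16, 31) = 17 vectors.
 */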