// SPDX-License-Identifier: GPL-2.0-only
/*
 *  kernel/sched/cpupri.c
 *
 *  CPU priority management
 *
 *  Copyright (C) 2007-2008 Novell
 *
 *  Author: Gregory Haskins <ghaskins@novell.com>
 *
 *  This code tracks the priority of each CPU so that global migration
 *  decisions are easy to calculate.  Each CPU can be in a state as follows:
 *
 *                 (INVALID), NORMAL, RT1, ... RT99, HIGHER
 *
 *  going from the lowest priority to the highest.  CPUs in the INVALID state
 *  are not eligible for routing.  The system maintains this state with
 *  a 2 dimensional bitmap (the first for priority class, the second for CPUs
 *  in that class).  Therefore a typical application without affinity
 *  restrictions can find a suitable CPU with O(1) complexity (e.g. two bit
 *  searches).  For tasks with affinity restrictions, the algorithm has a
 *  worst case complexity of O(min(101, nr_domcpus)), though the scenario that
 *  yields the worst case search is fairly contrived.
 */
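/*
 * For orientation, the backing data structures (declared in
 * kernel/sched/cpupri.h) look roughly like this -- a reader's sketch, not a
 * verbatim copy of the header:
 *
 *     struct cpupri_vec {
 *         atomic_t      count;   -- nr of CPUs currently at this priority
 *         cpumask_var_t mask;    -- which CPUs those are
 *     };
 *
 *     struct cpupri {
 *         struct cpupri_vec pri_to_cpu[CPUPRI_NR_PRIORITIES];
 *         int               *cpu_to_pri;   -- per-CPU reverse mapping
 *     };
 */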

/*
 * p->rt_priority   p->prio   newpri   cpupri
 *
 *                                -1       -1   (CPUPRI_INVALID)
 *
 *                                99        0   (CPUPRI_NORMAL)
 *
 *              1        98       98        1
 *            ...
 *             49        50       50       49
 *             50        49       49       50
 *            ...
 *             99         0        0       99
 *
 *                               100      100   (CPUPRI_HIGHER)
 */
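/*
 * For example (reading the table above together with convert_prio() below):
 * an RT task with rt_priority 50 has p->prio == 49, so convert_prio(49)
 * returns MAX_RT_PRIO-1 - 49 == 50. A higher rt_priority therefore maps to a
 * higher cpupri index, while p->prio runs in the opposite direction.
 */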
static int convert_prio(int prio)
{
    int cpupri;

    switch (prio) {
    case CPUPRI_INVALID:
        cpupri = CPUPRI_INVALID;    /* -1 */
        break;

    case 0 ... 98:
        cpupri = MAX_RT_PRIO-1 - prio;  /* 1 ... 99 */
        break;

    case MAX_RT_PRIO-1:
        cpupri = CPUPRI_NORMAL;     /*  0 */
        break;

    case MAX_RT_PRIO:
        cpupri = CPUPRI_HIGHER;     /* 100 */
        break;
    }

    return cpupri;
}

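/*
 * __cpupri_find: helper for the priority scan. Returns 1 if at least one CPU
 * in @p's affinity mask currently sits in the priority vector at index @idx,
 * 0 otherwise; when @lowest_mask is non-NULL it is filled with that
 * intersection. The smp_rmb() below is intended to pair with the
 * smp_mb__before_atomic()/smp_mb__after_atomic() ordering in cpupri_set().
 */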
static inline int __cpupri_find(struct cpupri *cp, struct task_struct *p,
                struct cpumask *lowest_mask, int idx)
{
    struct cpupri_vec *vec  = &cp->pri_to_cpu[idx];
    int skip = 0;

    if (!atomic_read(&(vec)->count))
        skip = 1;
    /*
     * When looking at the vector, we need to read the counter,
     * do a memory barrier, then read the mask.
     *
     * Note: This is still all racy, but we can deal with it.
     *  Ideally, we only want to look at masks that are set.
     *
     *  If a mask is not set, then the only thing wrong is that we
     *  did a little more work than necessary.
     *
     *  If we read a zero count but the mask is set, because of the
     *  memory barriers, that can only happen when the highest prio
     *  task for a run queue has left the run queue, in which case,
     *  it will be followed by a pull. If the task we are processing
     *  fails to find a proper place to go, that pull request will
     *  pull this task if the run queue is running at a lower
     *  priority.
     */
    smp_rmb();

    /* Need to do the rmb for every iteration */
    if (skip)
        return 0;

    if (cpumask_any_and(&p->cpus_mask, vec->mask) >= nr_cpu_ids)
        return 0;

    if (lowest_mask) {
        cpumask_and(lowest_mask, &p->cpus_mask, vec->mask);

        /*
         * We have to ensure that we have at least one bit
         * still set in the array, since the map could have
         * been concurrently emptied between the first and
         * second reads of vec->mask.  If we hit this
         * condition, simply act as though we never hit this
         * priority level and continue on.
         */
        if (cpumask_empty(lowest_mask))
            return 0;
    }

    return 1;
}

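/*
 * cpupri_find: convenience wrapper around cpupri_find_fitness() for callers
 * with no extra fitness criteria; see the kernel-doc below for the shared
 * semantics.
 */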
int cpupri_find(struct cpupri *cp, struct task_struct *p,
        struct cpumask *lowest_mask)
{
    return cpupri_find_fitness(cp, p, lowest_mask, NULL);
}

/**
 * cpupri_find_fitness - find the best (lowest-pri) CPU in the system
 * @cp: The cpupri context
 * @p: The task
 * @lowest_mask: A mask to fill in with selected CPUs (or NULL)
 * @fitness_fn: A pointer to a function that does custom checks on whether the
 *              CPU fits specific criteria, so that we only return those CPUs.
 *
 * Note: This function returns the recommended CPUs as calculated during the
 * current invocation.  By the time the call returns, the CPUs may have in
 * fact changed priorities any number of times.  While not ideal, it is not
 * an issue of correctness since the normal rebalancer logic will correct
 * any discrepancies created by racing against the uncertainty of the current
 * priority configuration.
 *
 * Return: (int)bool - CPUs were found
 */
int cpupri_find_fitness(struct cpupri *cp, struct task_struct *p,
        struct cpumask *lowest_mask,
        bool (*fitness_fn)(struct task_struct *p, int cpu))
{
    int task_pri = convert_prio(p->prio);
    int idx, cpu;

    BUG_ON(task_pri >= CPUPRI_NR_PRIORITIES);

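    /*
     * Scan the priority vectors from the lowest (CPUPRI_NORMAL) upwards,
     * stopping before the task's own converted priority: the first
     * non-empty vector that intersects p's affinity mask yields the
     * lowest-priority CPUs that p could displace.
     */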
    for (idx = 0; idx < task_pri; idx++) {

        if (!__cpupri_find(cp, p, lowest_mask, idx))
            continue;

        if (!lowest_mask || !fitness_fn)
            return 1;

        /* Ensure the capacity of the CPUs fits the task */
        for_each_cpu(cpu, lowest_mask) {
            if (!fitness_fn(p, cpu))
                cpumask_clear_cpu(cpu, lowest_mask);
        }

        /*
         * If no CPU at the current priority can fit the task,
         * continue looking
         */
        if (cpumask_empty(lowest_mask))
            continue;

        return 1;
    }

    /*
     * If we failed to find a fitting lowest_mask, kick off a new search
     * but without taking into account any fitness criteria this time.
     *
     * This rule favours honouring priority over fitting the task in the
     * correct CPU (Capacity Awareness being the only user now).
     * The idea is that if a higher priority task can run, then it should
     * run even if it ends up on an unfitting CPU.
     *
     * The cost of this trade-off is not entirely clear and will probably
     * be good for some workloads and bad for others.
     *
     * The main idea here is that if some CPUs were over-committed, we try
     * to spread, which is what the scheduler traditionally did. Sys admins
     * must do proper RT planning to avoid overloading the system if they
     * really care.
     */
    if (fitness_fn)
        return cpupri_find(cp, p, lowest_mask);

    return 0;
}

/**
 * cpupri_set - update the CPU priority setting
 * @cp: The cpupri context
 * @cpu: The target CPU
 * @newpri: The priority (INVALID, NORMAL, RT1-RT99, HIGHER) to assign to this CPU
 *
 * Note: Assumes cpu_rq(cpu)->lock is locked
 *
 * Returns: (void)
 */
void cpupri_set(struct cpupri *cp, int cpu, int newpri)
{
    int *currpri = &cp->cpu_to_pri[cpu];
    int oldpri = *currpri;
    int do_mb = 0;

    newpri = convert_prio(newpri);

    BUG_ON(newpri >= CPUPRI_NR_PRIORITIES);

    if (newpri == oldpri)
        return;

    /*
     * If the CPU was currently mapped to a different value, we
     * need to map it to the new value then remove the old value.
     * Note, we must add the new value first, otherwise we risk the
     * cpu being missed by the priority loop in cpupri_find.
     */
    if (likely(newpri != CPUPRI_INVALID)) {
        struct cpupri_vec *vec = &cp->pri_to_cpu[newpri];

        cpumask_set_cpu(cpu, vec->mask);
        /*
         * When adding a new vector, we update the mask first,
         * do a write memory barrier, and then update the count, to
         * make sure the vector is visible when count is set.
         */
        smp_mb__before_atomic();
        atomic_inc(&(vec)->count);
        do_mb = 1;
    }
    if (likely(oldpri != CPUPRI_INVALID)) {
        struct cpupri_vec *vec  = &cp->pri_to_cpu[oldpri];

        /*
         * Because the order of modification of the vec->count
         * is important, we must make sure that the update
         * of the new prio is seen before we decrement the
         * old prio. This makes sure that the loop sees
         * one or the other when we raise the priority of
         * the run queue. We don't care about when we lower the
         * priority, as that will trigger an rt pull anyway.
         *
         * We only need to do a memory barrier if we updated
         * the new priority vec.
         */
        if (do_mb)
            smp_mb__after_atomic();

        /*
         * When removing from the vector, we decrement the counter first,
         * do a memory barrier, and then clear the mask.
         */
        atomic_dec(&(vec)->count);
        smp_mb__after_atomic();
        cpumask_clear_cpu(cpu, vec->mask);
    }

    *currpri = newpri;
}
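/*
 * Usage note (a sketch of the call sites, not part of this file): the RT
 * scheduling class in kernel/sched/rt.c is the typical caller, invoking
 * something like cpupri_set(&rq->rd->cpupri, rq->cpu, prio) whenever a run
 * queue's highest runnable RT priority changes, with that rq's lock held as
 * the kernel-doc above requires.
 */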

/**
 * cpupri_init - initialize the cpupri structure
 * @cp: The cpupri context
 *
 * Return: 0 on success, -ENOMEM on memory allocation failure.
 */
int cpupri_init(struct cpupri *cp)
{
    int i;

    for (i = 0; i < CPUPRI_NR_PRIORITIES; i++) {
        struct cpupri_vec *vec = &cp->pri_to_cpu[i];

        atomic_set(&vec->count, 0);
        if (!zalloc_cpumask_var(&vec->mask, GFP_KERNEL))
            goto cleanup;
    }

    cp->cpu_to_pri = kcalloc(nr_cpu_ids, sizeof(int), GFP_KERNEL);
    if (!cp->cpu_to_pri)
        goto cleanup;

    for_each_possible_cpu(i)
        cp->cpu_to_pri[i] = CPUPRI_INVALID;

    return 0;

cleanup:
    for (i--; i >= 0; i--)
        free_cpumask_var(cp->pri_to_cpu[i].mask);
    return -ENOMEM;
}

/**
 * cpupri_cleanup - clean up the cpupri structure
 * @cp: The cpupri context
 */
void cpupri_cleanup(struct cpupri *cp)
{
    int i;

    kfree(cp->cpu_to_pri);
    for (i = 0; i < CPUPRI_NR_PRIORITIES; i++)
        free_cpumask_var(cp->pri_to_cpu[i].mask);
}
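/*
 * Usage note (a sketch, based on the root-domain code in
 * kernel/sched/topology.c): cpupri_init() and cpupri_cleanup() are normally
 * driven from root-domain setup and teardown, e.g. init_rootdomain() calling
 * cpupri_init(&rd->cpupri) and the root-domain free path calling
 * cpupri_cleanup(&rd->cpupri).
 */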