// SPDX-License-Identifier: GPL-2.0-only
/*
 * menu.c - the menu idle governor
 *
 * Copyright (C) 2006-2007 Adam Belay <abelay@novell.com>
 * Copyright (C) 2009 Intel Corporation
 * Author:
 *        Arjan van de Ven <arjan@linux.intel.com>
 */

#include <linux/kernel.h>
#include <linux/cpuidle.h>
#include <linux/time.h>
#include <linux/ktime.h>
#include <linux/hrtimer.h>
#include <linux/tick.h>
#include <linux/sched.h>
#include <linux/sched/loadavg.h>
#include <linux/sched/stat.h>
#include <linux/math64.h>

#define BUCKETS 12
#define INTERVAL_SHIFT 3
#define INTERVALS (1UL << INTERVAL_SHIFT)
#define RESOLUTION 1024
#define DECAY 8
#define MAX_INTERESTING (50000 * NSEC_PER_USEC)

/*
 * Concepts and ideas behind the menu governor
 *
 * For the menu governor, there are 3 decision factors for picking a C
 * state:
 * 1) Energy break even point
 * 2) Performance impact
 * 3) Latency tolerance (from pmqos infrastructure)
 * These three factors are treated independently.
 *
 * Energy break even point
 * -----------------------
 * C state entry and exit have an energy cost, and a certain amount of time in
 * the C state is required to actually break even on this cost. CPUIDLE
 * provides us this duration in the "target_residency" field. So all that we
 * need is a good prediction of how long we'll be idle. Like the traditional
 * menu governor, we start with the actual known "next timer event" time.
 *
 * Since there are other sources of wakeups (interrupts for example) than
 * the next timer event, this estimation is rather optimistic. To get a
 * more realistic estimate, a correction factor is applied to the estimate,
 * that is based on historic behavior. For example, if in the past the actual
 * duration always was 50% of the next timer tick, the correction factor will
 * be 0.5.
 *
 * menu uses a running average for this correction factor, however it uses a
 * set of factors, not just a single factor. This stems from the realization
 * that the ratio is dependent on the order of magnitude of the expected
 * duration; if we expect 500 milliseconds of idle time the likelihood of
 * getting an interrupt very early is much higher than if we expect 50
 * microseconds of idle time. A second independent factor that has a big
 * impact on the actual factor is whether there is (disk) IO outstanding or
 * not. (As a special twist, we consider every sleep longer than 50
 * milliseconds as perfect; there are no power gains for sleeping longer than
 * this.)
 *
 * For these two reasons we keep an array of 12 independent factors, that gets
 * indexed based on the magnitude of the expected duration as well as the
 * "is IO outstanding" property.
 *
 * Repeatable-interval-detector
 * ----------------------------
 * There are some cases where "next timer" is a completely unusable predictor:
 * those cases where the interval is fixed, for example due to hardware
 * interrupt mitigation, but also due to fixed transfer rate devices such as
 * mice.
 * For this, we use a different predictor: we track the duration of the last 8
 * intervals and if the standard deviation of these 8 intervals is below a
 * threshold value, we use the average of these intervals as prediction.
 *
 * Limiting Performance Impact
 * ---------------------------
 * C states, especially those with large exit latencies, can have a real
 * noticeable impact on workloads, which is not acceptable for most sysadmins,
 * and in addition, less performance has a power price of its own.
 *
 * As a general rule of thumb, menu assumes that the following heuristic
 * holds:
 *     The busier the system, the less C-state impact is acceptable
 *
 * This rule-of-thumb is implemented using a performance multiplier:
 * if the exit latency times the performance multiplier is longer than
 * the predicted duration, the C state is not considered a candidate
 * for selection due to too high a performance impact. So the higher
 * this multiplier is, the longer we need to be idle to pick a deep C
 * state, and thus the less likely a busy CPU will hit such a deep
 * C state.
 *
 * The multiplier is determined from the number of processes that are
 * waiting for IO on this CPU: a value of 10 is added for each such
 * process (this weight is experimentally determined).
 *
 * The iowait count gives a CPU-local, instantaneous input to the
 * decision. It may look coarse, but keep in mind that outstanding IO
 * is also already reflected in the system load average.
 */
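
/*
 * Illustrative example of the prediction (assumed numbers, not taken from
 * any measurement): if the next timer event is 1 ms away but this bucket's
 * correction factor is about 0.5 (stored as 4096, with RESOLUTION * DECAY =
 * 8192 meaning 1.0), the predicted idle time is roughly 500 us. A state
 * with a 300 us target residency is then still a candidate, while an
 * 800 us one is not.
 */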

struct menu_device {
    int             needs_update;
    int             tick_wakeup;

    u64     next_timer_ns;
    unsigned int    bucket;
    unsigned int    correction_factor[BUCKETS];
    unsigned int    intervals[INTERVALS];
    int     interval_ptr;
};

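/*
 * Map an expected sleep length and the "is IO outstanding" condition to one
 * of the BUCKETS correction-factor slots. For example (illustrative values
 * only): with tasks waiting for IO (nr_iowaiters > 0) and an expected sleep
 * of 250 us, the result is bucket 6 + 2 = 8.
 */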
static inline int which_bucket(u64 duration_ns, unsigned int nr_iowaiters)
{
    int bucket = 0;

    /*
     * We keep two groups of stats; one with IO pending, one
     * without.
     * This allows us to calculate
     * E(duration)|iowait
     */
    if (nr_iowaiters)
        bucket = BUCKETS/2;

    if (duration_ns < 10ULL * NSEC_PER_USEC)
        return bucket;
    if (duration_ns < 100ULL * NSEC_PER_USEC)
        return bucket + 1;
    if (duration_ns < 1000ULL * NSEC_PER_USEC)
        return bucket + 2;
    if (duration_ns < 10000ULL * NSEC_PER_USEC)
        return bucket + 3;
    if (duration_ns < 100000ULL * NSEC_PER_USEC)
        return bucket + 4;
    return bucket + 5;
}

/*
 * Return a multiplier for the exit latency that is intended
 * to take performance requirements into account.
 * The more performance critical we estimate the system
 * to be, the higher this multiplier, and thus the higher
 * the barrier to go to an expensive C state.
 */
static inline int performance_multiplier(unsigned int nr_iowaiters)
{
    /* for IO wait tasks (per cpu!) we add 10x each */
    return 1 + 10 * nr_iowaiters;
}
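
/*
 * Illustrative example (assumed numbers): with 3 tasks waiting for IO the
 * multiplier is 1 + 10 * 3 = 31. In menu_select() below, a predicted idle
 * time of 3.1 ms divided by 31 then caps the tolerated exit latency at
 * about 100 us, so deeper states with larger exit latencies are skipped
 * while the CPU looks busy with IO.
 */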

static DEFINE_PER_CPU(struct menu_device, menu_devices);

static void menu_update(struct cpuidle_driver *drv, struct cpuidle_device *dev);

/*
 * Try detecting repeating patterns by keeping track of the last 8
 * intervals, and checking if the standard deviation of that set
 * of points is below a threshold. If it is... then use the
 * average of these 8 points as the estimated value.
 */
static unsigned int get_typical_interval(struct menu_device *data,
                     unsigned int predicted_us)
{
    int i, divisor;
    unsigned int min, max, thresh, avg;
    uint64_t sum, variance;

    thresh = INT_MAX; /* Discard outliers above this value */

again:

    /* First calculate the average of past intervals */
    min = UINT_MAX;
    max = 0;
    sum = 0;
    divisor = 0;
    for (i = 0; i < INTERVALS; i++) {
        unsigned int value = data->intervals[i];
        if (value <= thresh) {
            sum += value;
            divisor++;
            if (value > max)
                max = value;

            if (value < min)
                min = value;
        }
    }

    /*
     * If the result of the computation is going to be discarded anyway,
     * avoid the computation altogether.
     */
    if (min >= predicted_us)
        return UINT_MAX;

    if (divisor == INTERVALS)
        avg = sum >> INTERVAL_SHIFT;
    else
        avg = div_u64(sum, divisor);

    /* Then try to determine variance */
    variance = 0;
    for (i = 0; i < INTERVALS; i++) {
        unsigned int value = data->intervals[i];
        if (value <= thresh) {
            int64_t diff = (int64_t)value - avg;
            variance += diff * diff;
        }
    }
    if (divisor == INTERVALS)
        variance >>= INTERVAL_SHIFT;
    else
        do_div(variance, divisor);

    /*
     * The typical interval is obtained when standard deviation is
     * small (stddev <= 20 us, variance <= 400 us^2) or standard
     * deviation is small compared to the average interval (avg >
     * 6*stddev, avg^2 > 36*variance). The average is smaller than
     * UINT_MAX aka U32_MAX, so computing its square does not
     * overflow a u64. We simply reject this candidate average if
     * the standard deviation is greater than 715 s (which is
     * rather unlikely).
     *
     * Use this result only if there is no timer to wake us up sooner.
     */
    if (likely(variance <= U64_MAX/36)) {
        if ((((u64)avg*avg > variance*36) && (divisor * 4 >= INTERVALS * 3))
                            || variance <= 400) {
            return avg;
        }
    }

    /*
     * If we have outliers to the upside in our distribution, discard
     * those by setting the threshold to exclude these outliers, then
     * calculate the average and standard deviation again. Once we get
     * down to the bottom 3/4 of our samples, stop excluding samples.
     *
     * This can deal with workloads that have long pauses interspersed
     * with sporadic activity with a bunch of short pauses.
     */
    if ((divisor * 4) <= INTERVALS * 3)
        return UINT_MAX;

    thresh = max - 1;
    goto again;
}
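
/*
 * Worked example for the detector above (illustrative values only): eight
 * recorded intervals clustered around 500 us with a spread of a few us give
 * a variance well below the 400 us^2 bound, so ~500 us is returned as the
 * typical interval. If one of the eight samples is a multi-millisecond
 * outlier, the first pass fails the tests, thresh is lowered to exclude it,
 * and the remaining seven samples (still at least 3/4 of the set) yield the
 * ~500 us estimate on the second pass.
 */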

/**
 * menu_select - selects the next idle state to enter
 * @drv: cpuidle driver containing state data
 * @dev: the CPU
 * @stop_tick: indication on whether or not to stop the tick
 */
static int menu_select(struct cpuidle_driver *drv, struct cpuidle_device *dev,
               bool *stop_tick)
{
    struct menu_device *data = this_cpu_ptr(&menu_devices);
    s64 latency_req = cpuidle_governor_latency_req(dev->cpu);
    unsigned int predicted_us;
    u64 predicted_ns;
    u64 interactivity_req;
    unsigned int nr_iowaiters;
    ktime_t delta, delta_tick;
    int i, idx;

    if (data->needs_update) {
        menu_update(drv, dev);
        data->needs_update = 0;
    }

    /* determine the expected residency time, round up */
    delta = tick_nohz_get_sleep_length(&delta_tick);
    if (unlikely(delta < 0)) {
        delta = 0;
        delta_tick = 0;
    }
    data->next_timer_ns = delta;

    nr_iowaiters = nr_iowait_cpu(dev->cpu);
    data->bucket = which_bucket(data->next_timer_ns, nr_iowaiters);

    if (unlikely(drv->state_count <= 1 || latency_req == 0) ||
        ((data->next_timer_ns < drv->states[1].target_residency_ns ||
          latency_req < drv->states[1].exit_latency_ns) &&
         !dev->states_usage[0].disable)) {
        /*
         * In this case state[0] will be used no matter what, so return
         * it right away and keep the tick running if state[0] is a
         * polling one.
         */
        *stop_tick = !(drv->states[0].flags & CPUIDLE_FLAG_POLLING);
        return 0;
    }

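    /*
     * Note on the units below: correction_factor is a fixed-point ratio
     * scaled by RESOLUTION * DECAY (8192 represents a factor of 1.0).
     * Dividing by RESOLUTION * DECAY * NSEC_PER_USEC therefore applies the
     * ratio and converts nanoseconds to microseconds in one step; the added
     * half-divisor term rounds to the nearest microsecond.
     */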
    /* Round up the result for half microseconds. */
    predicted_us = div_u64(data->next_timer_ns *
                   data->correction_factor[data->bucket] +
                   (RESOLUTION * DECAY * NSEC_PER_USEC) / 2,
                   RESOLUTION * DECAY * NSEC_PER_USEC);
    /* Use the lowest expected idle interval to pick the idle state. */
    predicted_ns = (u64)min(predicted_us,
                get_typical_interval(data, predicted_us)) *
                NSEC_PER_USEC;

    if (tick_nohz_tick_stopped()) {
        /*
         * If the tick is already stopped, the cost of possible short
         * idle duration misprediction is much higher, because the CPU
         * may be stuck in a shallow idle state for a long time as a
         * result of it.  In that case say we might mispredict and use
         * the known time till the closest timer event for the idle
         * state selection.
         */
        if (predicted_ns < TICK_NSEC)
            predicted_ns = data->next_timer_ns;
    } else {
        /*
         * Use the performance multiplier and the user-configurable
         * latency_req to determine the maximum exit latency.
         */
        interactivity_req = div64_u64(predicted_ns,
                          performance_multiplier(nr_iowaiters));
        if (latency_req > interactivity_req)
            latency_req = interactivity_req;
    }

    /*
     * Find the idle state with the lowest power while satisfying
     * our constraints.
     */
    idx = -1;
    for (i = 0; i < drv->state_count; i++) {
        struct cpuidle_state *s = &drv->states[i];

        if (dev->states_usage[i].disable)
            continue;

        if (idx == -1)
            idx = i; /* first enabled state */

        if (s->target_residency_ns > predicted_ns) {
            /*
             * Use a physical idle state, not busy polling, unless
             * a timer is going to trigger soon enough.
             */
            if ((drv->states[idx].flags & CPUIDLE_FLAG_POLLING) &&
                s->exit_latency_ns <= latency_req &&
                s->target_residency_ns <= data->next_timer_ns) {
                predicted_ns = s->target_residency_ns;
                idx = i;
                break;
            }
            if (predicted_ns < TICK_NSEC)
                break;

            if (!tick_nohz_tick_stopped()) {
                /*
                 * If the state selected so far is shallow,
                 * waking up early won't hurt, so retain the
                 * tick in that case and let the governor run
                 * again in the next iteration of the loop.
                 */
                predicted_ns = drv->states[idx].target_residency_ns;
                break;
            }

            /*
             * If the state selected so far is shallow and this
             * state's target residency matches the time till the
             * closest timer event, select this one to avoid getting
             * stuck in the shallow one for too long.
             */
            if (drv->states[idx].target_residency_ns < TICK_NSEC &&
                s->target_residency_ns <= delta_tick)
                idx = i;

            return idx;
        }
        if (s->exit_latency_ns > latency_req)
            break;

        idx = i;
    }

    if (idx == -1)
        idx = 0; /* No states enabled. Must use 0. */

    /*
     * Don't stop the tick if the selected state is a polling one or if the
     * expected idle duration is shorter than the tick period length.
     */
    if (((drv->states[idx].flags & CPUIDLE_FLAG_POLLING) ||
         predicted_ns < TICK_NSEC) && !tick_nohz_tick_stopped()) {
        *stop_tick = false;

        if (idx > 0 && drv->states[idx].target_residency_ns > delta_tick) {
            /*
             * The tick is not going to be stopped and the target
             * residency of the state to be returned is not within
             * the time until the next timer event including the
             * tick, so try to correct that.
             */
            for (i = idx - 1; i >= 0; i--) {
                if (dev->states_usage[i].disable)
                    continue;

                idx = i;
                if (drv->states[i].target_residency_ns <= delta_tick)
                    break;
            }
        }
    }

    return idx;
}

/**
 * menu_reflect - records that data structures need update
 * @dev: the CPU
 * @index: the index of actual entered state
 *
 * NOTE: it's important to be fast here because this operation will add to
 *       the overall exit latency.
 */
static void menu_reflect(struct cpuidle_device *dev, int index)
{
    struct menu_device *data = this_cpu_ptr(&menu_devices);

    dev->last_state_idx = index;
    data->needs_update = 1;
    data->tick_wakeup = tick_nohz_idle_got_tick();
}

/**
 * menu_update - attempts to guess what happened after entry
 * @drv: cpuidle driver containing state data
 * @dev: the CPU
 */
static void menu_update(struct cpuidle_driver *drv, struct cpuidle_device *dev)
{
    struct menu_device *data = this_cpu_ptr(&menu_devices);
    int last_idx = dev->last_state_idx;
    struct cpuidle_state *target = &drv->states[last_idx];
    u64 measured_ns;
    unsigned int new_factor;

    /*
     * Try to figure out how much time passed between entry to low
     * power state and occurrence of the wakeup event.
     *
     * If the entered idle state didn't support residency measurements,
     * we still use the measured value if it is short, and truncate it
     * to the whole expected time if it is long.
     *
     * Any measured amount of time will include the exit latency.
     * Since we are interested in when the wakeup began, not when it
     * was completed, we must subtract the exit latency. However, if
     * the measured amount of time is less than the exit latency,
     * assume the state was never reached and the exit latency is 0.
     */

    if (data->tick_wakeup && data->next_timer_ns > TICK_NSEC) {
        /*
         * The nohz code said that there wouldn't be any events within
         * the tick boundary (if the tick was stopped), but the idle
         * duration predictor had a differing opinion.  Since the CPU
         * was woken up by a tick (that wasn't stopped after all), the
         * predictor was not quite right, so assume that the CPU could
         * have been idle long (but not forever) to help the idle
         * duration predictor do a better job next time.
         */
        measured_ns = 9 * MAX_INTERESTING / 10;
    } else if ((drv->states[last_idx].flags & CPUIDLE_FLAG_POLLING) &&
           dev->poll_time_limit) {
        /*
         * The CPU exited the "polling" state due to a time limit, so
         * the idle duration prediction leading to the selection of that
         * state was inaccurate.  If a better prediction had been made,
         * the CPU might have been woken up from idle by the next timer.
         * Assume that to be the case.
         */
        measured_ns = data->next_timer_ns;
    } else {
        /* measured value */
        measured_ns = dev->last_residency_ns;

        /* Deduct exit latency */
        if (measured_ns > 2 * target->exit_latency_ns)
            measured_ns -= target->exit_latency_ns;
        else
            measured_ns /= 2;
    }

    /* Make sure our coefficients do not exceed unity */
    if (measured_ns > data->next_timer_ns)
        measured_ns = data->next_timer_ns;

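    /*
     * Worked example of the update below (illustrative numbers): with the
     * factor at 8192 (unity) and wakeups consistently arriving at half of
     * next_timer_ns, each pass computes 8192 - 8192/8 + 1024/2 = 7680, then
     * 7680 - 960 + 512 = 7232, and so on, converging on 4096, i.e. a
     * correction factor of 0.5.
     */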
    /* Update our correction ratio */
    new_factor = data->correction_factor[data->bucket];
    new_factor -= new_factor / DECAY;

    if (data->next_timer_ns > 0 && measured_ns < MAX_INTERESTING)
        new_factor += div64_u64(RESOLUTION * measured_ns,
                    data->next_timer_ns);
    else
        /*
         * we were idle so long that we count it as a perfect
         * prediction
         */
        new_factor += RESOLUTION;

    /*
     * We don't want 0 as factor; we always want at least
     * a tiny bit of estimated time. Fortunately, due to rounding,
     * new_factor will stay nonzero regardless of measured_ns values
     * and the compiler can eliminate this test as long as DECAY > 1.
     */
    if (DECAY == 1 && unlikely(new_factor == 0))
        new_factor = 1;

    data->correction_factor[data->bucket] = new_factor;

    /* update the repeating-pattern data */
    data->intervals[data->interval_ptr++] = ktime_to_us(measured_ns);
    if (data->interval_ptr >= INTERVALS)
        data->interval_ptr = 0;
}

/**
 * menu_enable_device - scans a CPU's states and does setup
 * @drv: cpuidle driver
 * @dev: the CPU
 */
static int menu_enable_device(struct cpuidle_driver *drv,
                struct cpuidle_device *dev)
{
    struct menu_device *data = &per_cpu(menu_devices, dev->cpu);
    int i;

    memset(data, 0, sizeof(struct menu_device));

    /*
     * if the correction factor is 0 (e.g. first time init or cpu hotplug
     * etc), we actually want to start out with a unity factor.
     */
    for (i = 0; i < BUCKETS; i++)
        data->correction_factor[i] = RESOLUTION * DECAY;

    return 0;
}

static struct cpuidle_governor menu_governor = {
    .name =     "menu",
    .rating =   20,
    .enable =   menu_enable_device,
    .select =   menu_select,
    .reflect =  menu_reflect,
};

/**
 * init_menu - initializes the governor
 */
static int __init init_menu(void)
{
    return cpuidle_register_governor(&menu_governor);
}

postcore_initcall(init_menu);