Back to home page

OSCL-LXR

 
 

    


0001 // SPDX-License-Identifier: GPL-2.0
0002 /*
0003  * Timer events oriented CPU idle governor
0004  *
0005  * Copyright (C) 2018 - 2021 Intel Corporation
0006  * Author: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
0007  */
0008 
0009 /**
0010  * DOC: teo-description
0011  *
0012  * The idea of this governor is based on the observation that on many systems
0013  * timer events are two or more orders of magnitude more frequent than any
0014  * other interrupts, so they are likely to be the most significant cause of CPU
0015  * wakeups from idle states.  Moreover, information about what happened in the
0016  * (relatively recent) past can be used to estimate whether or not the deepest
0017  * idle state with target residency within the (known) time till the closest
0018  * timer event, referred to as the sleep length, is likely to be suitable for
0019  * the upcoming CPU idle period and, if not, then which of the shallower idle
0020  * states to choose instead of it.
0021  *
0022  * Of course, non-timer wakeup sources are more important in some use cases
0023  * which can be covered by taking a few most recent idle time intervals of the
0024  * CPU into account.  However, even in that context it is not necessary to
0025  * consider idle duration values greater than the sleep length, because the
0026  * closest timer will ultimately wake up the CPU anyway unless it is woken up
0027  * earlier.
0028  *
0029  * Thus this governor estimates whether or not the prospective idle duration of
0030  * a CPU is likely to be significantly shorter than the sleep length and selects
0031  * an idle state for it accordingly.
0032  *
0033  * The computations carried out by this governor are based on using bins whose
0034  * boundaries are aligned with the target residency parameter values of the CPU
0035  * idle states provided by the %CPUIdle driver in the ascending order.  That is,
0036  * the first bin spans from 0 up to, but not including, the target residency of
0037  * the second idle state (idle state 1), the second bin spans from the target
0038  * residency of idle state 1 up to, but not including, the target residency of
0039  * idle state 2, the third bin spans from the target residency of idle state 2
0040  * up to, but not including, the target residency of idle state 3 and so on.
0041  * The last bin spans from the target residency of the deepest idle state
0042  * supplied by the driver to infinity.
0043  *
0044  * Two metrics called "hits" and "intercepts" are associated with each bin.
0045  * They are updated every time before selecting an idle state for the given CPU
0046  * in accordance with what happened last time.
0047  *
0048  * The "hits" metric reflects the relative frequency of situations in which the
0049  * sleep length and the idle duration measured after CPU wakeup fall into the
0050  * same bin (that is, the CPU appears to wake up "on time" relative to the sleep
0051  * length).  In turn, the "intercepts" metric reflects the relative frequency of
0052  * situations in which the measured idle duration is so much shorter than the
0053  * sleep length that the bin it falls into corresponds to an idle state
0054  * shallower than the one whose bin is fallen into by the sleep length (these
0055  * situations are referred to as "intercepts" below).
0056  *
0057  * In addition to the metrics described above, the governor counts recent
0058  * intercepts (that is, intercepts that have occurred during the last
0059  * %NR_RECENT invocations of it for the given CPU) for each bin.
0060  *
0061  * In order to select an idle state for a CPU, the governor takes the following
0062  * steps (modulo the possible latency constraint that must be taken into account
0063  * too):
0064  *
0065  * 1. Find the deepest CPU idle state whose target residency does not exceed
0066  *    the current sleep length (the candidate idle state) and compute 3 sums as
0067  *    follows:
0068  *
0069  *    - The sum of the "hits" and "intercepts" metrics for the candidate state
0070  *      and all of the deeper idle states (it represents the cases in which the
0071  *      CPU was idle long enough to avoid being intercepted if the sleep length
0072  *      had been equal to the current one).
0073  *
0074  *    - The sum of the "intercepts" metrics for all of the idle states shallower
0075  *      than the candidate one (it represents the cases in which the CPU was not
0076  *      idle long enough to avoid being intercepted if the sleep length had been
0077  *      equal to the current one).
0078  *
0079  *    - The sum of the numbers of recent intercepts for all of the idle states
0080  *      shallower than the candidate one.
0081  *
0082  * 2. If the second sum is greater than the first one or the third sum is
0083  *    greater than %NR_RECENT / 2, the CPU is likely to wake up early, so look
0084  *    for an alternative idle state to select.
0085  *
0086  *    - Traverse the idle states shallower than the candidate one in the
0087  *      descending order.
0088  *
0089  *    - For each of them compute the sum of the "intercepts" metrics and the sum
0090  *      of the numbers of recent intercepts over all of the idle states between
0091  *      it and the candidate one (including the former and excluding the
0092  *      latter).
0093  *
0094  *    - If each of these sums that needs to be taken into account (because the
0095  *      check related to it has indicated that the CPU is likely to wake up
0096  *      early) is greater than a half of the corresponding sum computed in step
0097  *      1 (which means that the target residency of the state in question had
0098  *      not exceeded the idle duration in over a half of the relevant cases),
0099  *      select the given idle state instead of the candidate one.
0100  *
0101  * 3. By default, select the candidate state.
0102  */
0103 
0104 #include <linux/cpuidle.h>
0105 #include <linux/jiffies.h>
0106 #include <linux/kernel.h>
0107 #include <linux/sched/clock.h>
0108 #include <linux/tick.h>
0109 
0110 /*
0111  * The PULSE value is added to metrics when they grow and the DECAY_SHIFT value
0112  * is used for decreasing metrics on a regular basis.
0113  */
0114 #define PULSE       1024
0115 #define DECAY_SHIFT 3
0116 
0117 /*
0118  * Number of the most recent idle duration values to take into consideration for
0119  * the detection of recent early wakeup patterns.
0120  */
0121 #define NR_RECENT   9
0122 
0123 /**
0124  * struct teo_bin - Metrics used by the TEO cpuidle governor.
0125  * @intercepts: The "intercepts" metric.
0126  * @hits: The "hits" metric.
0127  * @recent: The number of recent "intercepts".
0128  */
0129 struct teo_bin {
0130     unsigned int intercepts;
0131     unsigned int hits;
0132     unsigned int recent;
0133 };
0134 
0135 /**
0136  * struct teo_cpu - CPU data used by the TEO cpuidle governor.
0137  * @time_span_ns: Time between idle state selection and post-wakeup update.
0138  * @sleep_length_ns: Time till the closest timer event (at the selection time).
0139  * @state_bins: Idle state data bins for this CPU.
0140  * @total: Grand total of the "intercepts" and "hits" mertics for all bins.
0141  * @next_recent_idx: Index of the next @recent_idx entry to update.
0142  * @recent_idx: Indices of bins corresponding to recent "intercepts".
0143  */
0144 struct teo_cpu {
0145     s64 time_span_ns;
0146     s64 sleep_length_ns;
0147     struct teo_bin state_bins[CPUIDLE_STATE_MAX];
0148     unsigned int total;
0149     int next_recent_idx;
0150     int recent_idx[NR_RECENT];
0151 };
0152 
0153 static DEFINE_PER_CPU(struct teo_cpu, teo_cpus);
0154 
0155 /**
0156  * teo_update - Update CPU metrics after wakeup.
0157  * @drv: cpuidle driver containing state data.
0158  * @dev: Target CPU.
0159  */
0160 static void teo_update(struct cpuidle_driver *drv, struct cpuidle_device *dev)
0161 {
0162     struct teo_cpu *cpu_data = per_cpu_ptr(&teo_cpus, dev->cpu);
0163     int i, idx_timer = 0, idx_duration = 0;
0164     u64 measured_ns;
0165 
0166     if (cpu_data->time_span_ns >= cpu_data->sleep_length_ns) {
0167         /*
0168          * One of the safety nets has triggered or the wakeup was close
0169          * enough to the closest timer event expected at the idle state
0170          * selection time to be discarded.
0171          */
0172         measured_ns = U64_MAX;
0173     } else {
0174         u64 lat_ns = drv->states[dev->last_state_idx].exit_latency_ns;
0175 
0176         /*
0177          * The computations below are to determine whether or not the
0178          * (saved) time till the next timer event and the measured idle
0179          * duration fall into the same "bin", so use last_residency_ns
0180          * for that instead of time_span_ns which includes the cpuidle
0181          * overhead.
0182          */
0183         measured_ns = dev->last_residency_ns;
0184         /*
0185          * The delay between the wakeup and the first instruction
0186          * executed by the CPU is not likely to be worst-case every
0187          * time, so take 1/2 of the exit latency as a very rough
0188          * approximation of the average of it.
0189          */
0190         if (measured_ns >= lat_ns)
0191             measured_ns -= lat_ns / 2;
0192         else
0193             measured_ns /= 2;
0194     }
0195 
0196     cpu_data->total = 0;
0197 
0198     /*
0199      * Decay the "hits" and "intercepts" metrics for all of the bins and
0200      * find the bins that the sleep length and the measured idle duration
0201      * fall into.
0202      */
0203     for (i = 0; i < drv->state_count; i++) {
0204         s64 target_residency_ns = drv->states[i].target_residency_ns;
0205         struct teo_bin *bin = &cpu_data->state_bins[i];
0206 
0207         bin->hits -= bin->hits >> DECAY_SHIFT;
0208         bin->intercepts -= bin->intercepts >> DECAY_SHIFT;
0209 
0210         cpu_data->total += bin->hits + bin->intercepts;
0211 
0212         if (target_residency_ns <= cpu_data->sleep_length_ns) {
0213             idx_timer = i;
0214             if (target_residency_ns <= measured_ns)
0215                 idx_duration = i;
0216         }
0217     }
0218 
0219     i = cpu_data->next_recent_idx++;
0220     if (cpu_data->next_recent_idx >= NR_RECENT)
0221         cpu_data->next_recent_idx = 0;
0222 
0223     if (cpu_data->recent_idx[i] >= 0)
0224         cpu_data->state_bins[cpu_data->recent_idx[i]].recent--;
0225 
0226     /*
0227      * If the measured idle duration falls into the same bin as the sleep
0228      * length, this is a "hit", so update the "hits" metric for that bin.
0229      * Otherwise, update the "intercepts" metric for the bin fallen into by
0230      * the measured idle duration.
0231      */
0232     if (idx_timer == idx_duration) {
0233         cpu_data->state_bins[idx_timer].hits += PULSE;
0234         cpu_data->recent_idx[i] = -1;
0235     } else {
0236         cpu_data->state_bins[idx_duration].intercepts += PULSE;
0237         cpu_data->state_bins[idx_duration].recent++;
0238         cpu_data->recent_idx[i] = idx_duration;
0239     }
0240 
0241     cpu_data->total += PULSE;
0242 }
0243 
0244 static bool teo_time_ok(u64 interval_ns)
0245 {
0246     return !tick_nohz_tick_stopped() || interval_ns >= TICK_NSEC;
0247 }
0248 
0249 static s64 teo_middle_of_bin(int idx, struct cpuidle_driver *drv)
0250 {
0251     return (drv->states[idx].target_residency_ns +
0252         drv->states[idx+1].target_residency_ns) / 2;
0253 }
0254 
0255 /**
0256  * teo_find_shallower_state - Find shallower idle state matching given duration.
0257  * @drv: cpuidle driver containing state data.
0258  * @dev: Target CPU.
0259  * @state_idx: Index of the capping idle state.
0260  * @duration_ns: Idle duration value to match.
0261  */
0262 static int teo_find_shallower_state(struct cpuidle_driver *drv,
0263                     struct cpuidle_device *dev, int state_idx,
0264                     s64 duration_ns)
0265 {
0266     int i;
0267 
0268     for (i = state_idx - 1; i >= 0; i--) {
0269         if (dev->states_usage[i].disable)
0270             continue;
0271 
0272         state_idx = i;
0273         if (drv->states[i].target_residency_ns <= duration_ns)
0274             break;
0275     }
0276     return state_idx;
0277 }
0278 
0279 /**
0280  * teo_select - Selects the next idle state to enter.
0281  * @drv: cpuidle driver containing state data.
0282  * @dev: Target CPU.
0283  * @stop_tick: Indication on whether or not to stop the scheduler tick.
0284  */
0285 static int teo_select(struct cpuidle_driver *drv, struct cpuidle_device *dev,
0286               bool *stop_tick)
0287 {
0288     struct teo_cpu *cpu_data = per_cpu_ptr(&teo_cpus, dev->cpu);
0289     s64 latency_req = cpuidle_governor_latency_req(dev->cpu);
0290     unsigned int idx_intercept_sum = 0;
0291     unsigned int intercept_sum = 0;
0292     unsigned int idx_recent_sum = 0;
0293     unsigned int recent_sum = 0;
0294     unsigned int idx_hit_sum = 0;
0295     unsigned int hit_sum = 0;
0296     int constraint_idx = 0;
0297     int idx0 = 0, idx = -1;
0298     bool alt_intercepts, alt_recent;
0299     ktime_t delta_tick;
0300     s64 duration_ns;
0301     int i;
0302 
0303     if (dev->last_state_idx >= 0) {
0304         teo_update(drv, dev);
0305         dev->last_state_idx = -1;
0306     }
0307 
0308     cpu_data->time_span_ns = local_clock();
0309 
0310     duration_ns = tick_nohz_get_sleep_length(&delta_tick);
0311     cpu_data->sleep_length_ns = duration_ns;
0312 
0313     /* Check if there is any choice in the first place. */
0314     if (drv->state_count < 2) {
0315         idx = 0;
0316         goto end;
0317     }
0318     if (!dev->states_usage[0].disable) {
0319         idx = 0;
0320         if (drv->states[1].target_residency_ns > duration_ns)
0321             goto end;
0322     }
0323 
0324     /*
0325      * Find the deepest idle state whose target residency does not exceed
0326      * the current sleep length and the deepest idle state not deeper than
0327      * the former whose exit latency does not exceed the current latency
0328      * constraint.  Compute the sums of metrics for early wakeup pattern
0329      * detection.
0330      */
0331     for (i = 1; i < drv->state_count; i++) {
0332         struct teo_bin *prev_bin = &cpu_data->state_bins[i-1];
0333         struct cpuidle_state *s = &drv->states[i];
0334 
0335         /*
0336          * Update the sums of idle state mertics for all of the states
0337          * shallower than the current one.
0338          */
0339         intercept_sum += prev_bin->intercepts;
0340         hit_sum += prev_bin->hits;
0341         recent_sum += prev_bin->recent;
0342 
0343         if (dev->states_usage[i].disable)
0344             continue;
0345 
0346         if (idx < 0) {
0347             idx = i; /* first enabled state */
0348             idx0 = i;
0349         }
0350 
0351         if (s->target_residency_ns > duration_ns)
0352             break;
0353 
0354         idx = i;
0355 
0356         if (s->exit_latency_ns <= latency_req)
0357             constraint_idx = i;
0358 
0359         idx_intercept_sum = intercept_sum;
0360         idx_hit_sum = hit_sum;
0361         idx_recent_sum = recent_sum;
0362     }
0363 
0364     /* Avoid unnecessary overhead. */
0365     if (idx < 0) {
0366         idx = 0; /* No states enabled, must use 0. */
0367         goto end;
0368     } else if (idx == idx0) {
0369         goto end;
0370     }
0371 
0372     /*
0373      * If the sum of the intercepts metric for all of the idle states
0374      * shallower than the current candidate one (idx) is greater than the
0375      * sum of the intercepts and hits metrics for the candidate state and
0376      * all of the deeper states, or the sum of the numbers of recent
0377      * intercepts over all of the states shallower than the candidate one
0378      * is greater than a half of the number of recent events taken into
0379      * account, the CPU is likely to wake up early, so find an alternative
0380      * idle state to select.
0381      */
0382     alt_intercepts = 2 * idx_intercept_sum > cpu_data->total - idx_hit_sum;
0383     alt_recent = idx_recent_sum > NR_RECENT / 2;
0384     if (alt_recent || alt_intercepts) {
0385         s64 first_suitable_span_ns = duration_ns;
0386         int first_suitable_idx = idx;
0387 
0388         /*
0389          * Look for the deepest idle state whose target residency had
0390          * not exceeded the idle duration in over a half of the relevant
0391          * cases (both with respect to intercepts overall and with
0392          * respect to the recent intercepts only) in the past.
0393          *
0394          * Take the possible latency constraint and duration limitation
0395          * present if the tick has been stopped already into account.
0396          */
0397         intercept_sum = 0;
0398         recent_sum = 0;
0399 
0400         for (i = idx - 1; i >= 0; i--) {
0401             struct teo_bin *bin = &cpu_data->state_bins[i];
0402             s64 span_ns;
0403 
0404             intercept_sum += bin->intercepts;
0405             recent_sum += bin->recent;
0406 
0407             span_ns = teo_middle_of_bin(i, drv);
0408 
0409             if ((!alt_recent || 2 * recent_sum > idx_recent_sum) &&
0410                 (!alt_intercepts ||
0411                  2 * intercept_sum > idx_intercept_sum)) {
0412                 if (teo_time_ok(span_ns) &&
0413                     !dev->states_usage[i].disable) {
0414                     idx = i;
0415                     duration_ns = span_ns;
0416                 } else {
0417                     /*
0418                      * The current state is too shallow or
0419                      * disabled, so take the first enabled
0420                      * deeper state with suitable time span.
0421                      */
0422                     idx = first_suitable_idx;
0423                     duration_ns = first_suitable_span_ns;
0424                 }
0425                 break;
0426             }
0427 
0428             if (dev->states_usage[i].disable)
0429                 continue;
0430 
0431             if (!teo_time_ok(span_ns)) {
0432                 /*
0433                  * The current state is too shallow, but if an
0434                  * alternative candidate state has been found,
0435                  * it may still turn out to be a better choice.
0436                  */
0437                 if (first_suitable_idx != idx)
0438                     continue;
0439 
0440                 break;
0441             }
0442 
0443             first_suitable_span_ns = span_ns;
0444             first_suitable_idx = i;
0445         }
0446     }
0447 
0448     /*
0449      * If there is a latency constraint, it may be necessary to select an
0450      * idle state shallower than the current candidate one.
0451      */
0452     if (idx > constraint_idx)
0453         idx = constraint_idx;
0454 
0455 end:
0456     /*
0457      * Don't stop the tick if the selected state is a polling one or if the
0458      * expected idle duration is shorter than the tick period length.
0459      */
0460     if (((drv->states[idx].flags & CPUIDLE_FLAG_POLLING) ||
0461         duration_ns < TICK_NSEC) && !tick_nohz_tick_stopped()) {
0462         *stop_tick = false;
0463 
0464         /*
0465          * The tick is not going to be stopped, so if the target
0466          * residency of the state to be returned is not within the time
0467          * till the closest timer including the tick, try to correct
0468          * that.
0469          */
0470         if (idx > idx0 &&
0471             drv->states[idx].target_residency_ns > delta_tick)
0472             idx = teo_find_shallower_state(drv, dev, idx, delta_tick);
0473     }
0474 
0475     return idx;
0476 }
0477 
0478 /**
0479  * teo_reflect - Note that governor data for the CPU need to be updated.
0480  * @dev: Target CPU.
0481  * @state: Entered state.
0482  */
0483 static void teo_reflect(struct cpuidle_device *dev, int state)
0484 {
0485     struct teo_cpu *cpu_data = per_cpu_ptr(&teo_cpus, dev->cpu);
0486 
0487     dev->last_state_idx = state;
0488     /*
0489      * If the wakeup was not "natural", but triggered by one of the safety
0490      * nets, assume that the CPU might have been idle for the entire sleep
0491      * length time.
0492      */
0493     if (dev->poll_time_limit ||
0494         (tick_nohz_idle_got_tick() && cpu_data->sleep_length_ns > TICK_NSEC)) {
0495         dev->poll_time_limit = false;
0496         cpu_data->time_span_ns = cpu_data->sleep_length_ns;
0497     } else {
0498         cpu_data->time_span_ns = local_clock() - cpu_data->time_span_ns;
0499     }
0500 }
0501 
0502 /**
0503  * teo_enable_device - Initialize the governor's data for the target CPU.
0504  * @drv: cpuidle driver (not used).
0505  * @dev: Target CPU.
0506  */
0507 static int teo_enable_device(struct cpuidle_driver *drv,
0508                  struct cpuidle_device *dev)
0509 {
0510     struct teo_cpu *cpu_data = per_cpu_ptr(&teo_cpus, dev->cpu);
0511     int i;
0512 
0513     memset(cpu_data, 0, sizeof(*cpu_data));
0514 
0515     for (i = 0; i < NR_RECENT; i++)
0516         cpu_data->recent_idx[i] = -1;
0517 
0518     return 0;
0519 }
0520 
0521 static struct cpuidle_governor teo_governor = {
0522     .name =     "teo",
0523     .rating =   19,
0524     .enable =   teo_enable_device,
0525     .select =   teo_select,
0526     .reflect =  teo_reflect,
0527 };
0528 
0529 static int __init teo_governor_init(void)
0530 {
0531     return cpuidle_register_governor(&teo_governor);
0532 }
0533 
0534 postcore_initcall(teo_governor_init);