![]() |
|
|||
0001 // SPDX-License-Identifier: GPL-2.0 0002 /* 0003 * Timer events oriented CPU idle governor 0004 * 0005 * Copyright (C) 2018 - 2021 Intel Corporation 0006 * Author: Rafael J. Wysocki <rafael.j.wysocki@intel.com> 0007 */ 0008 0009 /** 0010 * DOC: teo-description 0011 * 0012 * The idea of this governor is based on the observation that on many systems 0013 * timer events are two or more orders of magnitude more frequent than any 0014 * other interrupts, so they are likely to be the most significant cause of CPU 0015 * wakeups from idle states. Moreover, information about what happened in the 0016 * (relatively recent) past can be used to estimate whether or not the deepest 0017 * idle state with target residency within the (known) time till the closest 0018 * timer event, referred to as the sleep length, is likely to be suitable for 0019 * the upcoming CPU idle period and, if not, then which of the shallower idle 0020 * states to choose instead of it. 0021 * 0022 * Of course, non-timer wakeup sources are more important in some use cases 0023 * which can be covered by taking a few most recent idle time intervals of the 0024 * CPU into account. However, even in that context it is not necessary to 0025 * consider idle duration values greater than the sleep length, because the 0026 * closest timer will ultimately wake up the CPU anyway unless it is woken up 0027 * earlier. 0028 * 0029 * Thus this governor estimates whether or not the prospective idle duration of 0030 * a CPU is likely to be significantly shorter than the sleep length and selects 0031 * an idle state for it accordingly. 0032 * 0033 * The computations carried out by this governor are based on using bins whose 0034 * boundaries are aligned with the target residency parameter values of the CPU 0035 * idle states provided by the %CPUIdle driver in the ascending order. That is, 0036 * the first bin spans from 0 up to, but not including, the target residency of 0037 * the second idle state (idle state 1), the second bin spans from the target 0038 * residency of idle state 1 up to, but not including, the target residency of 0039 * idle state 2, the third bin spans from the target residency of idle state 2 0040 * up to, but not including, the target residency of idle state 3 and so on. 0041 * The last bin spans from the target residency of the deepest idle state 0042 * supplied by the driver to infinity. 0043 * 0044 * Two metrics called "hits" and "intercepts" are associated with each bin. 0045 * They are updated every time before selecting an idle state for the given CPU 0046 * in accordance with what happened last time. 0047 * 0048 * The "hits" metric reflects the relative frequency of situations in which the 0049 * sleep length and the idle duration measured after CPU wakeup fall into the 0050 * same bin (that is, the CPU appears to wake up "on time" relative to the sleep 0051 * length). In turn, the "intercepts" metric reflects the relative frequency of 0052 * situations in which the measured idle duration is so much shorter than the 0053 * sleep length that the bin it falls into corresponds to an idle state 0054 * shallower than the one whose bin is fallen into by the sleep length (these 0055 * situations are referred to as "intercepts" below). 0056 * 0057 * In addition to the metrics described above, the governor counts recent 0058 * intercepts (that is, intercepts that have occurred during the last 0059 * %NR_RECENT invocations of it for the given CPU) for each bin. 0060 * 0061 * In order to select an idle state for a CPU, the governor takes the following 0062 * steps (modulo the possible latency constraint that must be taken into account 0063 * too): 0064 * 0065 * 1. Find the deepest CPU idle state whose target residency does not exceed 0066 * the current sleep length (the candidate idle state) and compute 3 sums as 0067 * follows: 0068 * 0069 * - The sum of the "hits" and "intercepts" metrics for the candidate state 0070 * and all of the deeper idle states (it represents the cases in which the 0071 * CPU was idle long enough to avoid being intercepted if the sleep length 0072 * had been equal to the current one). 0073 * 0074 * - The sum of the "intercepts" metrics for all of the idle states shallower 0075 * than the candidate one (it represents the cases in which the CPU was not 0076 * idle long enough to avoid being intercepted if the sleep length had been 0077 * equal to the current one). 0078 * 0079 * - The sum of the numbers of recent intercepts for all of the idle states 0080 * shallower than the candidate one. 0081 * 0082 * 2. If the second sum is greater than the first one or the third sum is 0083 * greater than %NR_RECENT / 2, the CPU is likely to wake up early, so look 0084 * for an alternative idle state to select. 0085 * 0086 * - Traverse the idle states shallower than the candidate one in the 0087 * descending order. 0088 * 0089 * - For each of them compute the sum of the "intercepts" metrics and the sum 0090 * of the numbers of recent intercepts over all of the idle states between 0091 * it and the candidate one (including the former and excluding the 0092 * latter). 0093 * 0094 * - If each of these sums that needs to be taken into account (because the 0095 * check related to it has indicated that the CPU is likely to wake up 0096 * early) is greater than a half of the corresponding sum computed in step 0097 * 1 (which means that the target residency of the state in question had 0098 * not exceeded the idle duration in over a half of the relevant cases), 0099 * select the given idle state instead of the candidate one. 0100 * 0101 * 3. By default, select the candidate state. 0102 */ 0103 0104 #include <linux/cpuidle.h> 0105 #include <linux/jiffies.h> 0106 #include <linux/kernel.h> 0107 #include <linux/sched/clock.h> 0108 #include <linux/tick.h> 0109 0110 /* 0111 * The PULSE value is added to metrics when they grow and the DECAY_SHIFT value 0112 * is used for decreasing metrics on a regular basis. 0113 */ 0114 #define PULSE 1024 0115 #define DECAY_SHIFT 3 0116 0117 /* 0118 * Number of the most recent idle duration values to take into consideration for 0119 * the detection of recent early wakeup patterns. 0120 */ 0121 #define NR_RECENT 9 0122 0123 /** 0124 * struct teo_bin - Metrics used by the TEO cpuidle governor. 0125 * @intercepts: The "intercepts" metric. 0126 * @hits: The "hits" metric. 0127 * @recent: The number of recent "intercepts". 0128 */ 0129 struct teo_bin { 0130 unsigned int intercepts; 0131 unsigned int hits; 0132 unsigned int recent; 0133 }; 0134 0135 /** 0136 * struct teo_cpu - CPU data used by the TEO cpuidle governor. 0137 * @time_span_ns: Time between idle state selection and post-wakeup update. 0138 * @sleep_length_ns: Time till the closest timer event (at the selection time). 0139 * @state_bins: Idle state data bins for this CPU. 0140 * @total: Grand total of the "intercepts" and "hits" mertics for all bins. 0141 * @next_recent_idx: Index of the next @recent_idx entry to update. 0142 * @recent_idx: Indices of bins corresponding to recent "intercepts". 0143 */ 0144 struct teo_cpu { 0145 s64 time_span_ns; 0146 s64 sleep_length_ns; 0147 struct teo_bin state_bins[CPUIDLE_STATE_MAX]; 0148 unsigned int total; 0149 int next_recent_idx; 0150 int recent_idx[NR_RECENT]; 0151 }; 0152 0153 static DEFINE_PER_CPU(struct teo_cpu, teo_cpus); 0154 0155 /** 0156 * teo_update - Update CPU metrics after wakeup. 0157 * @drv: cpuidle driver containing state data. 0158 * @dev: Target CPU. 0159 */ 0160 static void teo_update(struct cpuidle_driver *drv, struct cpuidle_device *dev) 0161 { 0162 struct teo_cpu *cpu_data = per_cpu_ptr(&teo_cpus, dev->cpu); 0163 int i, idx_timer = 0, idx_duration = 0; 0164 u64 measured_ns; 0165 0166 if (cpu_data->time_span_ns >= cpu_data->sleep_length_ns) { 0167 /* 0168 * One of the safety nets has triggered or the wakeup was close 0169 * enough to the closest timer event expected at the idle state 0170 * selection time to be discarded. 0171 */ 0172 measured_ns = U64_MAX; 0173 } else { 0174 u64 lat_ns = drv->states[dev->last_state_idx].exit_latency_ns; 0175 0176 /* 0177 * The computations below are to determine whether or not the 0178 * (saved) time till the next timer event and the measured idle 0179 * duration fall into the same "bin", so use last_residency_ns 0180 * for that instead of time_span_ns which includes the cpuidle 0181 * overhead. 0182 */ 0183 measured_ns = dev->last_residency_ns; 0184 /* 0185 * The delay between the wakeup and the first instruction 0186 * executed by the CPU is not likely to be worst-case every 0187 * time, so take 1/2 of the exit latency as a very rough 0188 * approximation of the average of it. 0189 */ 0190 if (measured_ns >= lat_ns) 0191 measured_ns -= lat_ns / 2; 0192 else 0193 measured_ns /= 2; 0194 } 0195 0196 cpu_data->total = 0; 0197 0198 /* 0199 * Decay the "hits" and "intercepts" metrics for all of the bins and 0200 * find the bins that the sleep length and the measured idle duration 0201 * fall into. 0202 */ 0203 for (i = 0; i < drv->state_count; i++) { 0204 s64 target_residency_ns = drv->states[i].target_residency_ns; 0205 struct teo_bin *bin = &cpu_data->state_bins[i]; 0206 0207 bin->hits -= bin->hits >> DECAY_SHIFT; 0208 bin->intercepts -= bin->intercepts >> DECAY_SHIFT; 0209 0210 cpu_data->total += bin->hits + bin->intercepts; 0211 0212 if (target_residency_ns <= cpu_data->sleep_length_ns) { 0213 idx_timer = i; 0214 if (target_residency_ns <= measured_ns) 0215 idx_duration = i; 0216 } 0217 } 0218 0219 i = cpu_data->next_recent_idx++; 0220 if (cpu_data->next_recent_idx >= NR_RECENT) 0221 cpu_data->next_recent_idx = 0; 0222 0223 if (cpu_data->recent_idx[i] >= 0) 0224 cpu_data->state_bins[cpu_data->recent_idx[i]].recent--; 0225 0226 /* 0227 * If the measured idle duration falls into the same bin as the sleep 0228 * length, this is a "hit", so update the "hits" metric for that bin. 0229 * Otherwise, update the "intercepts" metric for the bin fallen into by 0230 * the measured idle duration. 0231 */ 0232 if (idx_timer == idx_duration) { 0233 cpu_data->state_bins[idx_timer].hits += PULSE; 0234 cpu_data->recent_idx[i] = -1; 0235 } else { 0236 cpu_data->state_bins[idx_duration].intercepts += PULSE; 0237 cpu_data->state_bins[idx_duration].recent++; 0238 cpu_data->recent_idx[i] = idx_duration; 0239 } 0240 0241 cpu_data->total += PULSE; 0242 } 0243 0244 static bool teo_time_ok(u64 interval_ns) 0245 { 0246 return !tick_nohz_tick_stopped() || interval_ns >= TICK_NSEC; 0247 } 0248 0249 static s64 teo_middle_of_bin(int idx, struct cpuidle_driver *drv) 0250 { 0251 return (drv->states[idx].target_residency_ns + 0252 drv->states[idx+1].target_residency_ns) / 2; 0253 } 0254 0255 /** 0256 * teo_find_shallower_state - Find shallower idle state matching given duration. 0257 * @drv: cpuidle driver containing state data. 0258 * @dev: Target CPU. 0259 * @state_idx: Index of the capping idle state. 0260 * @duration_ns: Idle duration value to match. 0261 */ 0262 static int teo_find_shallower_state(struct cpuidle_driver *drv, 0263 struct cpuidle_device *dev, int state_idx, 0264 s64 duration_ns) 0265 { 0266 int i; 0267 0268 for (i = state_idx - 1; i >= 0; i--) { 0269 if (dev->states_usage[i].disable) 0270 continue; 0271 0272 state_idx = i; 0273 if (drv->states[i].target_residency_ns <= duration_ns) 0274 break; 0275 } 0276 return state_idx; 0277 } 0278 0279 /** 0280 * teo_select - Selects the next idle state to enter. 0281 * @drv: cpuidle driver containing state data. 0282 * @dev: Target CPU. 0283 * @stop_tick: Indication on whether or not to stop the scheduler tick. 0284 */ 0285 static int teo_select(struct cpuidle_driver *drv, struct cpuidle_device *dev, 0286 bool *stop_tick) 0287 { 0288 struct teo_cpu *cpu_data = per_cpu_ptr(&teo_cpus, dev->cpu); 0289 s64 latency_req = cpuidle_governor_latency_req(dev->cpu); 0290 unsigned int idx_intercept_sum = 0; 0291 unsigned int intercept_sum = 0; 0292 unsigned int idx_recent_sum = 0; 0293 unsigned int recent_sum = 0; 0294 unsigned int idx_hit_sum = 0; 0295 unsigned int hit_sum = 0; 0296 int constraint_idx = 0; 0297 int idx0 = 0, idx = -1; 0298 bool alt_intercepts, alt_recent; 0299 ktime_t delta_tick; 0300 s64 duration_ns; 0301 int i; 0302 0303 if (dev->last_state_idx >= 0) { 0304 teo_update(drv, dev); 0305 dev->last_state_idx = -1; 0306 } 0307 0308 cpu_data->time_span_ns = local_clock(); 0309 0310 duration_ns = tick_nohz_get_sleep_length(&delta_tick); 0311 cpu_data->sleep_length_ns = duration_ns; 0312 0313 /* Check if there is any choice in the first place. */ 0314 if (drv->state_count < 2) { 0315 idx = 0; 0316 goto end; 0317 } 0318 if (!dev->states_usage[0].disable) { 0319 idx = 0; 0320 if (drv->states[1].target_residency_ns > duration_ns) 0321 goto end; 0322 } 0323 0324 /* 0325 * Find the deepest idle state whose target residency does not exceed 0326 * the current sleep length and the deepest idle state not deeper than 0327 * the former whose exit latency does not exceed the current latency 0328 * constraint. Compute the sums of metrics for early wakeup pattern 0329 * detection. 0330 */ 0331 for (i = 1; i < drv->state_count; i++) { 0332 struct teo_bin *prev_bin = &cpu_data->state_bins[i-1]; 0333 struct cpuidle_state *s = &drv->states[i]; 0334 0335 /* 0336 * Update the sums of idle state mertics for all of the states 0337 * shallower than the current one. 0338 */ 0339 intercept_sum += prev_bin->intercepts; 0340 hit_sum += prev_bin->hits; 0341 recent_sum += prev_bin->recent; 0342 0343 if (dev->states_usage[i].disable) 0344 continue; 0345 0346 if (idx < 0) { 0347 idx = i; /* first enabled state */ 0348 idx0 = i; 0349 } 0350 0351 if (s->target_residency_ns > duration_ns) 0352 break; 0353 0354 idx = i; 0355 0356 if (s->exit_latency_ns <= latency_req) 0357 constraint_idx = i; 0358 0359 idx_intercept_sum = intercept_sum; 0360 idx_hit_sum = hit_sum; 0361 idx_recent_sum = recent_sum; 0362 } 0363 0364 /* Avoid unnecessary overhead. */ 0365 if (idx < 0) { 0366 idx = 0; /* No states enabled, must use 0. */ 0367 goto end; 0368 } else if (idx == idx0) { 0369 goto end; 0370 } 0371 0372 /* 0373 * If the sum of the intercepts metric for all of the idle states 0374 * shallower than the current candidate one (idx) is greater than the 0375 * sum of the intercepts and hits metrics for the candidate state and 0376 * all of the deeper states, or the sum of the numbers of recent 0377 * intercepts over all of the states shallower than the candidate one 0378 * is greater than a half of the number of recent events taken into 0379 * account, the CPU is likely to wake up early, so find an alternative 0380 * idle state to select. 0381 */ 0382 alt_intercepts = 2 * idx_intercept_sum > cpu_data->total - idx_hit_sum; 0383 alt_recent = idx_recent_sum > NR_RECENT / 2; 0384 if (alt_recent || alt_intercepts) { 0385 s64 first_suitable_span_ns = duration_ns; 0386 int first_suitable_idx = idx; 0387 0388 /* 0389 * Look for the deepest idle state whose target residency had 0390 * not exceeded the idle duration in over a half of the relevant 0391 * cases (both with respect to intercepts overall and with 0392 * respect to the recent intercepts only) in the past. 0393 * 0394 * Take the possible latency constraint and duration limitation 0395 * present if the tick has been stopped already into account. 0396 */ 0397 intercept_sum = 0; 0398 recent_sum = 0; 0399 0400 for (i = idx - 1; i >= 0; i--) { 0401 struct teo_bin *bin = &cpu_data->state_bins[i]; 0402 s64 span_ns; 0403 0404 intercept_sum += bin->intercepts; 0405 recent_sum += bin->recent; 0406 0407 span_ns = teo_middle_of_bin(i, drv); 0408 0409 if ((!alt_recent || 2 * recent_sum > idx_recent_sum) && 0410 (!alt_intercepts || 0411 2 * intercept_sum > idx_intercept_sum)) { 0412 if (teo_time_ok(span_ns) && 0413 !dev->states_usage[i].disable) { 0414 idx = i; 0415 duration_ns = span_ns; 0416 } else { 0417 /* 0418 * The current state is too shallow or 0419 * disabled, so take the first enabled 0420 * deeper state with suitable time span. 0421 */ 0422 idx = first_suitable_idx; 0423 duration_ns = first_suitable_span_ns; 0424 } 0425 break; 0426 } 0427 0428 if (dev->states_usage[i].disable) 0429 continue; 0430 0431 if (!teo_time_ok(span_ns)) { 0432 /* 0433 * The current state is too shallow, but if an 0434 * alternative candidate state has been found, 0435 * it may still turn out to be a better choice. 0436 */ 0437 if (first_suitable_idx != idx) 0438 continue; 0439 0440 break; 0441 } 0442 0443 first_suitable_span_ns = span_ns; 0444 first_suitable_idx = i; 0445 } 0446 } 0447 0448 /* 0449 * If there is a latency constraint, it may be necessary to select an 0450 * idle state shallower than the current candidate one. 0451 */ 0452 if (idx > constraint_idx) 0453 idx = constraint_idx; 0454 0455 end: 0456 /* 0457 * Don't stop the tick if the selected state is a polling one or if the 0458 * expected idle duration is shorter than the tick period length. 0459 */ 0460 if (((drv->states[idx].flags & CPUIDLE_FLAG_POLLING) || 0461 duration_ns < TICK_NSEC) && !tick_nohz_tick_stopped()) { 0462 *stop_tick = false; 0463 0464 /* 0465 * The tick is not going to be stopped, so if the target 0466 * residency of the state to be returned is not within the time 0467 * till the closest timer including the tick, try to correct 0468 * that. 0469 */ 0470 if (idx > idx0 && 0471 drv->states[idx].target_residency_ns > delta_tick) 0472 idx = teo_find_shallower_state(drv, dev, idx, delta_tick); 0473 } 0474 0475 return idx; 0476 } 0477 0478 /** 0479 * teo_reflect - Note that governor data for the CPU need to be updated. 0480 * @dev: Target CPU. 0481 * @state: Entered state. 0482 */ 0483 static void teo_reflect(struct cpuidle_device *dev, int state) 0484 { 0485 struct teo_cpu *cpu_data = per_cpu_ptr(&teo_cpus, dev->cpu); 0486 0487 dev->last_state_idx = state; 0488 /* 0489 * If the wakeup was not "natural", but triggered by one of the safety 0490 * nets, assume that the CPU might have been idle for the entire sleep 0491 * length time. 0492 */ 0493 if (dev->poll_time_limit || 0494 (tick_nohz_idle_got_tick() && cpu_data->sleep_length_ns > TICK_NSEC)) { 0495 dev->poll_time_limit = false; 0496 cpu_data->time_span_ns = cpu_data->sleep_length_ns; 0497 } else { 0498 cpu_data->time_span_ns = local_clock() - cpu_data->time_span_ns; 0499 } 0500 } 0501 0502 /** 0503 * teo_enable_device - Initialize the governor's data for the target CPU. 0504 * @drv: cpuidle driver (not used). 0505 * @dev: Target CPU. 0506 */ 0507 static int teo_enable_device(struct cpuidle_driver *drv, 0508 struct cpuidle_device *dev) 0509 { 0510 struct teo_cpu *cpu_data = per_cpu_ptr(&teo_cpus, dev->cpu); 0511 int i; 0512 0513 memset(cpu_data, 0, sizeof(*cpu_data)); 0514 0515 for (i = 0; i < NR_RECENT; i++) 0516 cpu_data->recent_idx[i] = -1; 0517 0518 return 0; 0519 } 0520 0521 static struct cpuidle_governor teo_governor = { 0522 .name = "teo", 0523 .rating = 19, 0524 .enable = teo_enable_device, 0525 .select = teo_select, 0526 .reflect = teo_reflect, 0527 }; 0528 0529 static int __init teo_governor_init(void) 0530 { 0531 return cpuidle_register_governor(&teo_governor); 0532 } 0533 0534 postcore_initcall(teo_governor_init);
[ Source navigation ] | [ Diff markup ] | [ Identifier search ] | [ general search ] |
This page was automatically generated by the 2.1.0 LXR engine. The LXR team |
![]() ![]() |