include/linux/energy_model.h

0001 /* SPDX-License-Identifier: GPL-2.0 */
0002 #ifndef _LINUX_ENERGY_MODEL_H
0003 #define _LINUX_ENERGY_MODEL_H
0004 #include <linux/cpumask.h>
0005 #include <linux/device.h>
0006 #include <linux/jump_label.h>
0007 #include <linux/kobject.h>
0008 #include <linux/rcupdate.h>
0009 #include <linux/sched/cpufreq.h>
0010 #include <linux/sched/topology.h>
0011 #include <linux/types.h>
0012
/**
 * struct em_perf_state - Performance state of a performance domain
 * @frequency:  The frequency in kHz, for consistency with CPUFreq
 * @power:  The power consumed at this level (by 1 CPU or by a registered
 *      device). It can be a total power: static and dynamic.
 * @cost:   The cost coefficient associated with this level, used during
 *      energy calculation. Equal to: power * max_frequency / frequency
 * @flags:  Bitmask of per-state flags; see the "em_perf_state flags"
 *      description below.
 */
struct em_perf_state {
    unsigned long frequency;
    unsigned long power;
    unsigned long cost;
    unsigned long flags;
};
0028
/*
 * em_perf_state flags:
 *
 * EM_PERF_STATE_INEFFICIENT: The performance state is inefficient: the same
 * em_perf_domain contains another performance state with a higher frequency
 * but a lower or equal power cost. Such inefficient states are ignored by the
 * em_pd_get_efficient_*() functions when the owning domain also has
 * EM_PERF_DOMAIN_SKIP_INEFFICIENCIES set.
 */
#define EM_PERF_STATE_INEFFICIENT BIT(0)
0038
/**
 * struct em_perf_domain - Performance domain
 * @table:      List of performance states, in ascending order
 * @nr_perf_states: Number of performance states
 * @flags:      See "em_perf_domain flags"
 * @cpus:       Cpumask covering the CPUs of the domain. It is stored
 *          inline, as a trailing flexible array, for performance
 *          reasons: this avoids potential cache misses during
 *          energy calculations in the scheduler, and it also
 *          simplifies allocating/freeing that memory region.
 *
 * In case of CPU device, a "performance domain" represents a group of CPUs
 * whose performance is scaled together. All CPUs of a performance domain
 * must have the same micro-architecture. Performance domains often have
 * a 1-to-1 mapping with CPUFreq policies. In case of other devices the @cpus
 * field is unused.
 */
struct em_perf_domain {
    struct em_perf_state *table;
    int nr_perf_states;
    unsigned long flags;
    unsigned long cpus[];
};
0061
/*
 *  em_perf_domain flags:
 *
 *  EM_PERF_DOMAIN_MICROWATTS: The power values are in micro-Watts or some
 *  other scale.
 *
 *  EM_PERF_DOMAIN_SKIP_INEFFICIENCIES: Skip inefficient states (those marked
 *  EM_PERF_STATE_INEFFICIENT) when estimating energy consumption.
 *
 *  EM_PERF_DOMAIN_ARTIFICIAL: The power values are artificial and might have
 *  been created by a platform that is missing real power information.
 */
#define EM_PERF_DOMAIN_MICROWATTS BIT(0)
#define EM_PERF_DOMAIN_SKIP_INEFFICIENCIES BIT(1)
#define EM_PERF_DOMAIN_ARTIFICIAL BIT(2)
0077
/* Pass the domain's inline @cpus bitmap through to_cpumask(). */
#define em_span_cpus(em) (to_cpumask((em)->cpus))
/* Non-zero when EM_PERF_DOMAIN_ARTIFICIAL is set in the domain's @flags. */
#define em_is_artificial(em) ((em)->flags & EM_PERF_DOMAIN_ARTIFICIAL)
0080
0081 #ifdef CONFIG_ENERGY_MODEL
/*
 * The maximum power value, in micro-Watts. The 64 Watts limit is a safety
 * net so that multiplications do not overflow on 32bit platforms. Keeping
 * the total Perf Domain power within a 32bit value then implies at most
 * 64 CPUs in such a domain.
 */
#define EM_MAX_POWER (64000000) /* 64 Watts */
0089
/*
 * To avoid a possible energy estimation overflow on 32bit machines, limit
 * the number of CPUs in a Perf. Domain.
 * 64bit machines are safe, hence the large value used there.
 */
#ifdef CONFIG_64BIT
#define EM_MAX_NUM_CPUS 4096
#else
#define EM_MAX_NUM_CPUS 16
#endif
0100
/*
 * em_estimate_energy() - scale a 'cost' by the ratio 'sum_util' / 'scale_cpu'.
 *
 * 64bit machines are safe from overflow, so the product is computed first and
 * divided afterwards, which keeps full precision.
 *
 * 32bit machines use a different order of operations: 'cost' is divided by
 * 'scale_cpu' first, which shrinks the potentially big value held in the
 * 'cost' field, and only then multiplied by 'sum_util'. This allows handling
 * existing platforms with e.g. ~1.3 Watt at max freq, i.e. a 'cost' above
 * 1mln micro-Watts: with 4 CPUs in the Perf. Domain 'sum_util' can reach 4096
 * and the plain 'cost' * 'sum_util' product would overflow.
 * The price of this reordering is a small loss of precision in the estimate
 * compared to the 64bit variant.
 */
#ifdef CONFIG_64BIT
#define em_estimate_energy(_cost, _sum_util, _scale_cpu) \
    (((_cost) * (_sum_util)) / (_scale_cpu))
#else
#define em_estimate_energy(_cost, _sum_util, _scale_cpu) \
    (((_cost) / (_scale_cpu)) * (_sum_util))
#endif
0121
/*
 * struct em_data_callback - callbacks that supply the per-performance-state
 * data (active power and, optionally, cost) of a device.
 */
struct em_data_callback {
    /**
     * active_power() - Provide power at the next performance state of
     *      a device
     * @dev     : Device for which we do this operation (can be a CPU)
     * @power   : Active power at the performance state
     *      (modified)
     * @freq    : Frequency at the performance state in kHz
     *      (modified)
     *
     * active_power() must find the lowest performance state of 'dev' above
     * 'freq' and update 'power' and 'freq' to the matching active power
     * and frequency.
     *
     * In case of CPUs, the power is the one of a single CPU in the domain,
     * expressed in micro-Watts or an abstract scale. It is expected to
     * fit in the [0, EM_MAX_POWER] range.
     *
     * Return 0 on success.
     */
    int (*active_power)(struct device *dev, unsigned long *power,
                unsigned long *freq);

    /**
     * get_cost() - Provide the cost at the given performance state of
     *      a device
     * @dev     : Device for which we do this operation (can be a CPU)
     * @freq    : Frequency at the performance state in kHz
     * @cost    : The cost value for the performance state
     *      (modified)
     *
     * In case of CPUs, the cost is the one of a single CPU in the domain.
     * It is expected to fit in the [0, EM_MAX_POWER] range due to internal
     * usage in EAS calculation.
     *
     * Return 0 on success, or appropriate error value in case of failure.
     */
    int (*get_cost)(struct device *dev, unsigned long freq,
            unsigned long *cost);
};
/*
 * Helpers for filling in a struct em_data_callback:
 *
 * EM_SET_ACTIVE_POWER_CB(): assign the active_power() callback of an
 *      existing callback structure.
 * EM_ADV_DATA_CB(): initializer setting both active_power() and get_cost().
 * EM_DATA_CB(): initializer setting active_power() only; get_cost() is NULL.
 *
 * Every macro argument is parenthesized in the expansion so that the result
 * does not depend on the precedence of the expression passed by the caller.
 */
#define EM_SET_ACTIVE_POWER_CB(em_cb, cb) ((em_cb).active_power = (cb))
#define EM_ADV_DATA_CB(_active_power_cb, _cost_cb)  \
    { .active_power = (_active_power_cb),       \
      .get_cost = (_cost_cb) }
#define EM_DATA_CB(_active_power_cb)            \
        EM_ADV_DATA_CB(_active_power_cb, NULL)
0168
/*
 * Energy Model API declared only when CONFIG_ENERGY_MODEL is enabled; the
 * definitions are not part of this header. Without CONFIG_ENERGY_MODEL the
 * same names are provided as inline stubs further down in this file.
 */
struct em_perf_domain *em_cpu_get(int cpu);
struct em_perf_domain *em_pd_get(struct device *dev);
int em_dev_register_perf_domain(struct device *dev, unsigned int nr_states,
                struct em_data_callback *cb, cpumask_t *span,
                bool microwatts);
void em_dev_unregister_perf_domain(struct device *dev);
0175
0176 /**
0177  * em_pd_get_efficient_state() - Get an efficient performance state from the EM
0178  * @pd   : Performance domain for which we want an efficient frequency
0179  * @freq : Frequency to map with the EM
0180  *
0181  * It is called from the scheduler code quite frequently and as a consequence
0182  * doesn't implement any check.
0183  *
0184  * Return: An efficient performance state, high enough to meet @freq
0185  * requirement.
0186  */
0187 static inline
0188 struct em_perf_state *em_pd_get_efficient_state(struct em_perf_domain *pd,
0189                         unsigned long freq)
0190 {
0191     struct em_perf_state *ps;
0192     int i;
0193
0194     for (i = 0; i < pd->nr_perf_states; i++) {
0195         ps = &pd->table[i];
0196         if (ps->frequency >= freq) {
0197             if (pd->flags & EM_PERF_DOMAIN_SKIP_INEFFICIENCIES &&
0198                 ps->flags & EM_PERF_STATE_INEFFICIENT)
0199                 continue;
0200             break;
0201         }
0202     }
0203
0204     return ps;
0205 }
0206
0207 /**
0208  * em_cpu_energy() - Estimates the energy consumed by the CPUs of a
0209  *      performance domain
0210  * @pd      : performance domain for which energy has to be estimated
0211  * @max_util    : highest utilization among CPUs of the domain
0212  * @sum_util    : sum of the utilization of all CPUs in the domain
0213  * @allowed_cpu_cap : maximum allowed CPU capacity for the @pd, which
0214  *            might reflect reduced frequency (due to thermal)
0215  *
0216  * This function must be used only for CPU devices. There is no validation,
0217  * i.e. if the EM is a CPU type and has cpumask allocated. It is called from
0218  * the scheduler code quite frequently and that is why there is not checks.
0219  *
0220  * Return: the sum of the energy consumed by the CPUs of the domain assuming
0221  * a capacity state satisfying the max utilization of the domain.
0222  */
0223 static inline unsigned long em_cpu_energy(struct em_perf_domain *pd,
0224                 unsigned long max_util, unsigned long sum_util,
0225                 unsigned long allowed_cpu_cap)
0226 {
0227     unsigned long freq, scale_cpu;
0228     struct em_perf_state *ps;
0229     int cpu;
0230
0231     if (!sum_util)
0232         return 0;
0233
0234     /*
0235      * In order to predict the performance state, map the utilization of
0236      * the most utilized CPU of the performance domain to a requested
0237      * frequency, like schedutil. Take also into account that the real
0238      * frequency might be set lower (due to thermal capping). Thus, clamp
0239      * max utilization to the allowed CPU capacity before calculating
0240      * effective frequency.
0241      */
0242     cpu = cpumask_first(to_cpumask(pd->cpus));
0243     scale_cpu = arch_scale_cpu_capacity(cpu);
0244     ps = &pd->table[pd->nr_perf_states - 1];
0245
0246     max_util = map_util_perf(max_util);
0247     max_util = min(max_util, allowed_cpu_cap);
0248     freq = map_util_freq(max_util, ps->frequency, scale_cpu);
0249
0250     /*
0251      * Find the lowest performance state of the Energy Model above the
0252      * requested frequency.
0253      */
0254     ps = em_pd_get_efficient_state(pd, freq);
0255
0256     /*
0257      * The capacity of a CPU in the domain at the performance state (ps)
0258      * can be computed as:
0259      *
0260      *             ps->freq * scale_cpu
0261      *   ps->cap = --------------------                          (1)
0262      *                 cpu_max_freq
0263      *
0264      * So, ignoring the costs of idle states (which are not available in
0265      * the EM), the energy consumed by this CPU at that performance state
0266      * is estimated as:
0267      *
0268      *             ps->power * cpu_util
0269      *   cpu_nrg = --------------------                          (2)
0270      *                   ps->cap
0271      *
0272      * since 'cpu_util / ps->cap' represents its percentage of busy time.
0273      *
0274      *   NOTE: Although the result of this computation actually is in
0275      *         units of power, it can be manipulated as an energy value
0276      *         over a scheduling period, since it is assumed to be
0277      *         constant during that interval.
0278      *
0279      * By injecting (1) in (2), 'cpu_nrg' can be re-expressed as a product
0280      * of two terms:
0281      *
0282      *             ps->power * cpu_max_freq   cpu_util
0283      *   cpu_nrg = ------------------------ * ---------          (3)
0284      *                    ps->freq            scale_cpu
0285      *
0286      * The first term is static, and is stored in the em_perf_state struct
0287      * as 'ps->cost'.
0288      *
0289      * Since all CPUs of the domain have the same micro-architecture, they
0290      * share the same 'ps->cost', and the same CPU capacity. Hence, the
0291      * total energy of the domain (which is the simple sum of the energy of
0292      * all of its CPUs) can be factorized as:
0293      *
0294      *            ps->cost * \Sum cpu_util
0295      *   pd_nrg = ------------------------                       (4)
0296      *                  scale_cpu
0297      */
0298     return em_estimate_energy(ps->cost, sum_util, scale_cpu);
0299 }
0300
/**
 * em_pd_nr_perf_states() - Get the number of performance states of a perf.
 *              domain
 * @pd      : performance domain for which this must be done; it is
 *        dereferenced without any check
 *
 * Return: the number of performance states in the performance domain table
 */
static inline int em_pd_nr_perf_states(struct em_perf_domain *pd)
{
    return pd->nr_perf_states;
}
0312
0313 #else
/*
 * !CONFIG_ENERGY_MODEL placeholders: the callback structure has no members,
 * the initializer macros expand to an empty initializer and setting the
 * active_power() callback is a no-op, so users need no #ifdef of their own.
 */
struct em_data_callback {};
#define EM_ADV_DATA_CB(_active_power_cb, _cost_cb) { }
#define EM_DATA_CB(_active_power_cb) { }
#define EM_SET_ACTIVE_POWER_CB(em_cb, cb) do { } while (0)
0318
/* !CONFIG_ENERGY_MODEL stub: registration always fails with -EINVAL. */
static inline
int em_dev_register_perf_domain(struct device *dev, unsigned int nr_states,
                struct em_data_callback *cb, cpumask_t *span,
                bool microwatts)
{
    return -EINVAL;
}
/* !CONFIG_ENERGY_MODEL stub: nothing was registered, so nothing to undo. */
static inline void em_dev_unregister_perf_domain(struct device *dev)
{
}
/* !CONFIG_ENERGY_MODEL stub: no performance domain exists; returns NULL. */
static inline struct em_perf_domain *em_cpu_get(int cpu)
{
    return NULL;
}
/* !CONFIG_ENERGY_MODEL stub: no performance domain exists; returns NULL. */
static inline struct em_perf_domain *em_pd_get(struct device *dev)
{
    return NULL;
}
/* !CONFIG_ENERGY_MODEL stub: the energy estimate is always 0. */
static inline unsigned long em_cpu_energy(struct em_perf_domain *pd,
            unsigned long max_util, unsigned long sum_util,
            unsigned long allowed_cpu_cap)
{
    return 0;
}
/* !CONFIG_ENERGY_MODEL stub: reports 0 performance states; @pd is unused. */
static inline int em_pd_nr_perf_states(struct em_perf_domain *pd)
{
    return 0;
}
0347 #endif
0348
0349 #endif