// SPDX-License-Identifier: GPL-2.0-only
/*
 * x86 APERF/MPERF KHz calculation for
 * /sys/.../cpufreq/scaling_cur_freq
 *
 * Copyright (C) 2017 Intel Corp.
 * Author: Len Brown <len.brown@intel.com>
 */
#include <linux/cpufreq.h>
#include <linux/delay.h>
#include <linux/ktime.h>
#include <linux/math64.h>
#include <linux/percpu.h>
#include <linux/rcupdate.h>
#include <linux/sched/isolation.h>
#include <linux/sched/topology.h>
#include <linux/smp.h>
#include <linux/syscore_ops.h>

#include <asm/cpu.h>
#include <asm/cpu_device_id.h>
#include <asm/intel-family.h>

#include "cpu.h"

struct aperfmperf {
    seqcount_t  seq;
    unsigned long   last_update;
    u64     acnt;
    u64     mcnt;
    u64     aperf;
    u64     mperf;
};

static DEFINE_PER_CPU_SHARED_ALIGNED(struct aperfmperf, cpu_samples) = {
    .seq = SEQCNT_ZERO(cpu_samples.seq)
};

static void init_counter_refs(void)
{
    u64 aperf, mperf;

    rdmsrl(MSR_IA32_APERF, aperf);
    rdmsrl(MSR_IA32_MPERF, mperf);

    this_cpu_write(cpu_samples.aperf, aperf);
    this_cpu_write(cpu_samples.mperf, mperf);
}

#if defined(CONFIG_X86_64) && defined(CONFIG_SMP)
/*
 * APERF/MPERF frequency ratio computation.
 *
 * The scheduler wants to do frequency-invariant accounting and needs a <1
 * ratio to account for the 'current' frequency, corresponding to
 * freq_curr / freq_max.
 *
 * Since the frequency freq_curr on x86 is controlled by a micro-controller
 * and our P-state setting is little more than a request/hint, we need to
 * observe the effective frequency 'BusyMHz', i.e. the average frequency over
 * a time interval after discarding idle time. This is given by:
 *
 *   BusyMHz = delta_APERF / delta_MPERF * freq_base
 *
 * where freq_base is the max non-turbo P-state.
 *
 * The freq_max term has to be set to a somewhat arbitrary value, because we
 * can't know which turbo states will be available at a given point in time:
 * it all depends on the thermal headroom of the entire package. We set it to
 * the turbo level with 4 cores active.
 *
 * Benchmarks show that's a good compromise between the 1C turbo ratio
 * (freq_curr/freq_max would rarely reach 1) and something close to freq_base,
 * which would ignore the entire turbo range (a conspicuous part, making
 * freq_curr/freq_max always maxed out).
 *
 * An exception to the heuristic above is the Atom uarch, where we choose the
 * highest turbo level for freq_max since Atoms are generally oriented towards
 * power efficiency.
 *
 * Setting freq_max to anything less than the 1C turbo ratio makes the ratio
 * freq_curr / freq_max eventually grow >1, in which case we clip it to 1.
 */
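
/*
 * Worked example with made-up numbers: a part with a 2.0 GHz base frequency
 * and a 3.0 GHz 4C turbo level gets freq_max = 3.0 GHz, so the ratio below
 * becomes 3000 / 2000 * SCHED_CAPACITY_SCALE = 1536. A CPU that averaged
 * 2.4 GHz while busy during the last tick has delta_APERF / delta_MPERF = 1.2,
 * so freq_curr / freq_max = 2.4 / 3.0 and the per-CPU scale factor works out
 * to 1.2 * 1024 * 1024 / 1536 = 819, i.e. roughly 80% of the assumed maximum.
 */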

DEFINE_STATIC_KEY_FALSE(arch_scale_freq_key);

static u64 arch_turbo_freq_ratio = SCHED_CAPACITY_SCALE;
static u64 arch_max_freq_ratio = SCHED_CAPACITY_SCALE;

void arch_set_max_freq_ratio(bool turbo_disabled)
{
    arch_max_freq_ratio = turbo_disabled ? SCHED_CAPACITY_SCALE :
                    arch_turbo_freq_ratio;
}
EXPORT_SYMBOL_GPL(arch_set_max_freq_ratio);

static bool __init turbo_disabled(void)
{
    u64 misc_en;
    int err;

    err = rdmsrl_safe(MSR_IA32_MISC_ENABLE, &misc_en);
    if (err)
        return false;

    return (misc_en & MSR_IA32_MISC_ENABLE_TURBO_DISABLE);
}

static bool __init slv_set_max_freq_ratio(u64 *base_freq, u64 *turbo_freq)
{
    int err;

    err = rdmsrl_safe(MSR_ATOM_CORE_RATIOS, base_freq);
    if (err)
        return false;

    err = rdmsrl_safe(MSR_ATOM_CORE_TURBO_RATIOS, turbo_freq);
    if (err)
        return false;

    *base_freq = (*base_freq >> 16) & 0x3F;     /* max P state */
    *turbo_freq = *turbo_freq & 0x3F;           /* 1C turbo    */

    return true;
}

#define X86_MATCH(model)                    \
    X86_MATCH_VENDOR_FAM_MODEL_FEATURE(INTEL, 6,        \
        INTEL_FAM6_##model, X86_FEATURE_APERFMPERF, NULL)

static const struct x86_cpu_id has_knl_turbo_ratio_limits[] __initconst = {
    X86_MATCH(XEON_PHI_KNL),
    X86_MATCH(XEON_PHI_KNM),
    {}
};

static const struct x86_cpu_id has_skx_turbo_ratio_limits[] __initconst = {
    X86_MATCH(SKYLAKE_X),
    {}
};

static const struct x86_cpu_id has_glm_turbo_ratio_limits[] __initconst = {
    X86_MATCH(ATOM_GOLDMONT),
    X86_MATCH(ATOM_GOLDMONT_D),
    X86_MATCH(ATOM_GOLDMONT_PLUS),
    {}
};

static bool __init knl_set_max_freq_ratio(u64 *base_freq, u64 *turbo_freq,
                      int num_delta_fratio)
{
    int fratio, delta_fratio, found;
    int err, i;
    u64 msr;

    err = rdmsrl_safe(MSR_PLATFORM_INFO, base_freq);
    if (err)
        return false;

    *base_freq = (*base_freq >> 8) & 0xFF;      /* max P state */

    err = rdmsrl_safe(MSR_TURBO_RATIO_LIMIT, &msr);
    if (err)
        return false;

    fratio = (msr >> 8) & 0xFF;
    i = 16;
    found = 0;
    do {
        if (found >= num_delta_fratio) {
            *turbo_freq = fratio;
            return true;
        }

        delta_fratio = (msr >> (i + 5)) & 0x7;

        if (delta_fratio) {
            found += 1;
            fratio -= delta_fratio;
        }

        i += 8;
    } while (i < 64);

    return true;
}

static bool __init skx_set_max_freq_ratio(u64 *base_freq, u64 *turbo_freq, int size)
{
    u64 ratios, counts;
    u32 group_size;
    int err, i;

    err = rdmsrl_safe(MSR_PLATFORM_INFO, base_freq);
    if (err)
        return false;

    *base_freq = (*base_freq >> 8) & 0xFF;      /* max P state */

    err = rdmsrl_safe(MSR_TURBO_RATIO_LIMIT, &ratios);
    if (err)
        return false;

    err = rdmsrl_safe(MSR_TURBO_RATIO_LIMIT1, &counts);
    if (err)
        return false;

    for (i = 0; i < 64; i += 8) {
        group_size = (counts >> i) & 0xFF;
        if (group_size >= size) {
            *turbo_freq = (ratios >> i) & 0xFF;
            return true;
        }
    }

    return false;
}

static bool __init core_set_max_freq_ratio(u64 *base_freq, u64 *turbo_freq)
{
    u64 msr;
    int err;

    err = rdmsrl_safe(MSR_PLATFORM_INFO, base_freq);
    if (err)
        return false;

    err = rdmsrl_safe(MSR_TURBO_RATIO_LIMIT, &msr);
    if (err)
        return false;

    *base_freq = (*base_freq >> 8) & 0xFF;    /* max P state */
    *turbo_freq = (msr >> 24) & 0xFF;         /* 4C turbo    */

    /* The CPU may have less than 4 cores */
    if (!*turbo_freq)
        *turbo_freq = msr & 0xFF;         /* 1C turbo    */

    return true;
}

static bool __init intel_set_max_freq_ratio(void)
{
    u64 base_freq, turbo_freq;
    u64 turbo_ratio;

    if (slv_set_max_freq_ratio(&base_freq, &turbo_freq))
        goto out;

    if (x86_match_cpu(has_glm_turbo_ratio_limits) &&
        skx_set_max_freq_ratio(&base_freq, &turbo_freq, 1))
        goto out;

    if (x86_match_cpu(has_knl_turbo_ratio_limits) &&
        knl_set_max_freq_ratio(&base_freq, &turbo_freq, 1))
        goto out;

    if (x86_match_cpu(has_skx_turbo_ratio_limits) &&
        skx_set_max_freq_ratio(&base_freq, &turbo_freq, 4))
        goto out;

    if (core_set_max_freq_ratio(&base_freq, &turbo_freq))
        goto out;

    return false;

out:
    /*
     * Some hypervisors advertise X86_FEATURE_APERFMPERF
     * but then fill all MSRs with zeroes.
     * Some CPUs have turbo boost but don't declare any turbo ratio
     * in MSR_TURBO_RATIO_LIMIT.
     */
    if (!base_freq || !turbo_freq) {
        pr_debug("Couldn't determine cpu base or turbo frequency, necessary for scale-invariant accounting.\n");
        return false;
    }

    turbo_ratio = div_u64(turbo_freq * SCHED_CAPACITY_SCALE, base_freq);
    if (!turbo_ratio) {
        pr_debug("Non-zero turbo and base frequencies led to a 0 ratio.\n");
        return false;
    }

    arch_turbo_freq_ratio = turbo_ratio;
    arch_set_max_freq_ratio(turbo_disabled());

    return true;
}

#ifdef CONFIG_PM_SLEEP
static struct syscore_ops freq_invariance_syscore_ops = {
    .resume = init_counter_refs,
};

static void register_freq_invariance_syscore_ops(void)
{
    register_syscore_ops(&freq_invariance_syscore_ops);
}
#else
static inline void register_freq_invariance_syscore_ops(void) {}
#endif

static void freq_invariance_enable(void)
{
    if (static_branch_unlikely(&arch_scale_freq_key)) {
        WARN_ON_ONCE(1);
        return;
    }
    static_branch_enable(&arch_scale_freq_key);
    register_freq_invariance_syscore_ops();
    pr_info("Estimated ratio of average max frequency by base frequency (times 1024): %llu\n", arch_max_freq_ratio);
}

void freq_invariance_set_perf_ratio(u64 ratio, bool turbo_disabled)
{
    arch_turbo_freq_ratio = ratio;
    arch_set_max_freq_ratio(turbo_disabled);
    freq_invariance_enable();
}

static void __init bp_init_freq_invariance(void)
{
    if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL)
        return;

    if (intel_set_max_freq_ratio())
        freq_invariance_enable();
}

static void disable_freq_invariance_workfn(struct work_struct *work)
{
    static_branch_disable(&arch_scale_freq_key);
}

static DECLARE_WORK(disable_freq_invariance_work,
            disable_freq_invariance_workfn);

DEFINE_PER_CPU(unsigned long, arch_freq_scale) = SCHED_CAPACITY_SCALE;

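/*
 * Note on scaling: arch_max_freq_ratio already carries one factor of
 * SCHED_CAPACITY_SCALE (it is freq_max / freq_base * SCHED_CAPACITY_SCALE),
 * so the numerator below is shifted by 2 * SCHED_CAPACITY_SHIFT. The result
 * is acnt / mcnt * freq_base / freq_max * SCHED_CAPACITY_SCALE, i.e.
 * freq_curr / freq_max expressed in units of SCHED_CAPACITY_SCALE.
 */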
static void scale_freq_tick(u64 acnt, u64 mcnt)
{
    u64 freq_scale;

    if (!arch_scale_freq_invariant())
        return;

    if (check_shl_overflow(acnt, 2*SCHED_CAPACITY_SHIFT, &acnt))
        goto error;

    if (check_mul_overflow(mcnt, arch_max_freq_ratio, &mcnt) || !mcnt)
        goto error;

    freq_scale = div64_u64(acnt, mcnt);
    if (!freq_scale)
        goto error;

    if (freq_scale > SCHED_CAPACITY_SCALE)
        freq_scale = SCHED_CAPACITY_SCALE;

    this_cpu_write(arch_freq_scale, freq_scale);
    return;

error:
    pr_warn("Scheduler frequency invariance went wobbly, disabling!\n");
    schedule_work(&disable_freq_invariance_work);
}
#else
static inline void bp_init_freq_invariance(void) { }
static inline void scale_freq_tick(u64 acnt, u64 mcnt) { }
#endif /* CONFIG_X86_64 && CONFIG_SMP */

void arch_scale_freq_tick(void)
{
    struct aperfmperf *s = this_cpu_ptr(&cpu_samples);
    u64 acnt, mcnt, aperf, mperf;

    if (!cpu_feature_enabled(X86_FEATURE_APERFMPERF))
        return;

    rdmsrl(MSR_IA32_APERF, aperf);
    rdmsrl(MSR_IA32_MPERF, mperf);
    acnt = aperf - s->aperf;
    mcnt = mperf - s->mperf;

    s->aperf = aperf;
    s->mperf = mperf;

    raw_write_seqcount_begin(&s->seq);
    s->last_update = jiffies;
    s->acnt = acnt;
    s->mcnt = mcnt;
    raw_write_seqcount_end(&s->seq);

    scale_freq_tick(acnt, mcnt);
}

/*
 * Discard samples older than the defined maximum sample age of 20ms. There
 * is no point in sending IPIs in such a case. If the scheduler tick was not
 * running, then the CPU is either idle or isolated.
 */
#define MAX_SAMPLE_AGE  ((unsigned long)HZ / 50)
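/* E.g. HZ=1000 yields 20 jiffies and HZ=250 yields 5 jiffies, both 20ms. */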

unsigned int arch_freq_get_on_cpu(int cpu)
{
    struct aperfmperf *s = per_cpu_ptr(&cpu_samples, cpu);
    unsigned int seq, freq;
    unsigned long last;
    u64 acnt, mcnt;

    if (!cpu_feature_enabled(X86_FEATURE_APERFMPERF))
        goto fallback;

    do {
        seq = raw_read_seqcount_begin(&s->seq);
        last = s->last_update;
        acnt = s->acnt;
        mcnt = s->mcnt;
    } while (read_seqcount_retry(&s->seq, seq));

    /*
     * Bail on invalid count and when the last update was too long ago,
     * which covers idle and NOHZ full CPUs.
     */
    if (!mcnt || (jiffies - last) > MAX_SAMPLE_AGE)
        goto fallback;

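    /*
     * Effective kHz over the last tick: cpu_khz * delta_APERF / delta_MPERF.
     * As a hypothetical example, cpu_khz = 2000000 (a 2 GHz base clock) with
     * acnt/mcnt = 1.2 reports ~2400000, i.e. the CPU averaged 2.4 GHz while
     * it was not idle.
     */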
    return div64_u64((cpu_khz * acnt), mcnt);

fallback:
    freq = cpufreq_quick_get(cpu);
    return freq ? freq : cpu_khz;
}

static int __init bp_init_aperfmperf(void)
{
    if (!cpu_feature_enabled(X86_FEATURE_APERFMPERF))
        return 0;

    init_counter_refs();
    bp_init_freq_invariance();
    return 0;
}
early_initcall(bp_init_aperfmperf);

void ap_init_aperfmperf(void)
{
    if (cpu_feature_enabled(X86_FEATURE_APERFMPERF))
        init_counter_refs();
}