// SPDX-License-Identifier: GPL-2.0-only
/*
 * intel_powerclamp.c - package C-state idle injection for Intel CPUs
 *
 * Copyright (c) Intel Corporation.
 *
 * Authors:
 *	Arjan van de Ven <arjan@linux.intel.com>
 *	Jacob Pan <jacob.jun.pan@linux.intel.com>
 */

#define pr_fmt(fmt)	KBUILD_MODNAME ": " fmt

#include <linux/module.h>
#include <linux/kernel.h>
#include <linux/delay.h>
#include <linux/kthread.h>
#include <linux/cpu.h>
#include <linux/thermal.h>
#include <linux/slab.h>
#include <linux/tick.h>
#include <linux/debugfs.h>
#include <linux/seq_file.h>
#include <linux/sched/rt.h>
#include <uapi/linux/sched/types.h>

#include <asm/nmi.h>
#include <asm/msr.h>
#include <asm/mwait.h>
#include <asm/cpu_device_id.h>
#include <asm/hardirq.h>

/* maximum injected idle ratio, in percent, exposed as the cooling device max state */
#define MAX_TARGET_RATIO (50U)
/*
 * For each undisturbed clamping window (no extra wakeups during idle time),
 * the confidence counter for the given target ratio is incremented. Once a
 * counter reaches CONFIDENCE_OK, the runtime calibration result for that
 * ratio is considered usable.
 */
#define CONFIDENCE_OK (3)
/*
 * Default idle injection duration, in jiffies, used when the "duration"
 * module parameter has not been set.
 */
#define DEFAULT_DURATION_JIFFIES (6)

static unsigned int target_mwait;
static struct dentry *debug_dir;

/* user selected target idle ratio, in percent */
static unsigned int set_target_ratio;
static unsigned int current_ratio;
static bool should_skip;
static bool reduce_irq;
static atomic_t idle_wakeup_counter;
/*
 * The CPU assigned to collect package C-state statistics and update the
 * control parameters. Defaults to CPU 0, but that CPU can be offlined, in
 * which case control moves to another online CPU.
 */
static unsigned int control_cpu;

static bool clamping;

struct powerclamp_worker_data {
	struct kthread_worker *worker;
	struct kthread_work balancing_work;
	struct kthread_delayed_work idle_injection_work;
	unsigned int cpu;
	unsigned int count;
	unsigned int guard;
	unsigned int window_size_now;
	unsigned int target_ratio;
	unsigned int duration_jiffies;
	bool clamping;
};

static struct powerclamp_worker_data __percpu *worker_data;
static struct thermal_cooling_device *cooling_dev;
/* bitmap tracking which CPUs have a clamping kthread worker running */
static unsigned long *cpu_clamping_mask;

static unsigned int duration;			/* idle injection duration, in msec */
static unsigned int pkg_cstate_ratio_cur;	/* measured package C-state ratio, in percent */
static unsigned int window_size;		/* sliding window, in clamping cycles */

static int duration_set(const char *arg, const struct kernel_param *kp)
{
	int ret = 0;
	unsigned long new_duration;

	ret = kstrtoul(arg, 10, &new_duration);
	if (ret)
		goto exit;
	if (new_duration > 25 || new_duration < 6) {
		pr_err("Out of recommended range %lu, between 6-25ms\n",
			new_duration);
		ret = -EINVAL;
	}

	duration = clamp(new_duration, 6ul, 25ul);
	smp_mb();

exit:

	return ret;
}

static const struct kernel_param_ops duration_ops = {
	.set = duration_set,
	.get = param_get_int,
};

module_param_cb(duration, &duration_ops, &duration, 0644);
MODULE_PARM_DESC(duration, "forced idle time for each attempt in msec.");

struct powerclamp_calibration_data {
	unsigned long confidence;	/* used for calibration: incremented
					 * each time a clamping window
					 * completes without extra wakeups;
					 * once it reaches CONFIDENCE_OK the
					 * compensation is deemed usable.
					 */
	unsigned long steady_comp;	/* steady state compensation used when
					 * no extra wakeups occurred.
					 */
	unsigned long dynamic_comp;	/* compensation for excessive wakeups
					 * from idle; only reported via
					 * debugfs in this driver.
					 */
};

static struct powerclamp_calibration_data cal_data[MAX_TARGET_RATIO];

static int window_size_set(const char *arg, const struct kernel_param *kp)
{
	int ret = 0;
	unsigned long new_window_size;

	ret = kstrtoul(arg, 10, &new_window_size);
	if (ret)
		goto exit_win;
	if (new_window_size > 10 || new_window_size < 2) {
		pr_err("Out of recommended window size %lu, between 2-10\n",
			new_window_size);
		ret = -EINVAL;
	}

	window_size = clamp(new_window_size, 2ul, 10ul);
	smp_mb();

exit_win:

	return ret;
}

static const struct kernel_param_ops window_size_ops = {
	.set = window_size_set,
	.get = param_get_int,
};

module_param_cb(window_size, &window_size_ops, &window_size, 0644);
MODULE_PARM_DESC(window_size, "sliding window in number of clamping cycles\n"
	"\tpowerclamp controls idle ratio within this window. larger\n"
	"\twindow size results in slower response time but more smooth\n"
	"\tclamping results. default to 2.");

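/*
 * Find the deepest MWAIT hint (C-state and sub-state) advertised by CPUID
 * leaf 5 and store it in target_mwait. Bails out if the MWAIT extensions or
 * the interrupt-break capability are not available.
 */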
static void find_target_mwait(void)
{
	unsigned int eax, ebx, ecx, edx;
	unsigned int highest_cstate = 0;
	unsigned int highest_subcstate = 0;
	int i;

	if (boot_cpu_data.cpuid_level < CPUID_MWAIT_LEAF)
		return;

	cpuid(CPUID_MWAIT_LEAF, &eax, &ebx, &ecx, &edx);

	if (!(ecx & CPUID5_ECX_EXTENSIONS_SUPPORTED) ||
	    !(ecx & CPUID5_ECX_INTERRUPT_BREAK))
		return;

	edx >>= MWAIT_SUBSTATE_SIZE;
	for (i = 0; i < 7 && edx; i++, edx >>= MWAIT_SUBSTATE_SIZE) {
		if (edx & MWAIT_SUBSTATE_MASK) {
			highest_cstate = i;
			highest_subcstate = edx & MWAIT_SUBSTATE_MASK;
		}
	}
	target_mwait = (highest_cstate << MWAIT_SUBSTATE_SIZE) |
		(highest_subcstate - 1);
}

struct pkg_cstate_info {
	bool skip;
	int msr_index;
	int cstate_id;
};

#define PKG_CSTATE_INIT(id) {				\
		.msr_index = MSR_PKG_C##id##_RESIDENCY, \
		.cstate_id = id				\
}

static struct pkg_cstate_info pkg_cstates[] = {
	PKG_CSTATE_INIT(2),
	PKG_CSTATE_INIT(3),
	PKG_CSTATE_INIT(6),
	PKG_CSTATE_INIT(7),
	PKG_CSTATE_INIT(8),
	PKG_CSTATE_INIT(9),
	PKG_CSTATE_INIT(10),
	{NULL},
};

static bool has_pkg_state_counter(void)
{
	u64 val;
	struct pkg_cstate_info *info = pkg_cstates;

	/* check if any one of the counter msrs exists */
	while (info->msr_index) {
		if (!rdmsrl_safe(info->msr_index, &val))
			return true;
		info++;
	}

	return false;
}

static u64 pkg_state_counter(void)
{
	u64 val;
	u64 count = 0;
	struct pkg_cstate_info *info = pkg_cstates;

	while (info->msr_index) {
		if (!info->skip) {
			if (!rdmsrl_safe(info->msr_index, &val))
				count += val;
			else
				info->skip = true;
		}
		info++;
	}

	return count;
}

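/*
 * Return the extra idle ratio ("compensation") to add on top of the requested
 * target. The value comes from runtime-calibrated steady-state data, averaged
 * over the target ratio and its neighbours, and is only used once all three
 * entries have reached CONFIDENCE_OK. The result is capped so that
 * target + compensation stays below MAX_TARGET_RATIO.
 */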
static unsigned int get_compensation(int ratio)
{
	unsigned int comp = 0;

	/* we only use compensation if all adjacent calibration data are good */
	if (ratio == 1 &&
	    cal_data[ratio].confidence >= CONFIDENCE_OK &&
	    cal_data[ratio + 1].confidence >= CONFIDENCE_OK &&
	    cal_data[ratio + 2].confidence >= CONFIDENCE_OK) {
		comp = (cal_data[ratio].steady_comp +
			cal_data[ratio + 1].steady_comp +
			cal_data[ratio + 2].steady_comp) / 3;
	} else if (ratio == MAX_TARGET_RATIO - 1 &&
		   cal_data[ratio].confidence >= CONFIDENCE_OK &&
		   cal_data[ratio - 1].confidence >= CONFIDENCE_OK &&
		   cal_data[ratio - 2].confidence >= CONFIDENCE_OK) {
		comp = (cal_data[ratio].steady_comp +
			cal_data[ratio - 1].steady_comp +
			cal_data[ratio - 2].steady_comp) / 3;
	} else if (cal_data[ratio].confidence >= CONFIDENCE_OK &&
		   cal_data[ratio - 1].confidence >= CONFIDENCE_OK &&
		   cal_data[ratio + 1].confidence >= CONFIDENCE_OK) {
		comp = (cal_data[ratio].steady_comp +
			cal_data[ratio - 1].steady_comp +
			cal_data[ratio + 1].steady_comp) / 3;
	}

	/* REVISIT: simple penalty of double idle injection */
	if (reduce_irq)
		comp = ratio;

	/* do not exceed the limit */
	if (comp + ratio >= MAX_TARGET_RATIO)
		comp = MAX_TARGET_RATIO - ratio - 1;

	return comp;
}

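/*
 * Record steady-state compensation for the current target ratio. Calibration
 * data is only updated when the last window saw few idle wakeups and the
 * measured ratio is close to (but not above) the target; each such clean
 * window also bumps the confidence counter.
 */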
static void adjust_compensation(int target_ratio, unsigned int win)
{
	int delta;
	struct powerclamp_calibration_data *d = &cal_data[target_ratio];

	/*
	 * adjust compensation only if the confidence level has not been
	 * reached yet and there were not too many wakeups during the last
	 * window; otherwise the measured ratio cannot be trusted.
	 */
	if (d->confidence >= CONFIDENCE_OK ||
	    atomic_read(&idle_wakeup_counter) >
	    win * num_online_cpus())
		return;

	delta = set_target_ratio - current_ratio;
	/* filter out bad data */
	if (delta >= 0 && delta <= (1 + target_ratio / 10)) {
		if (d->steady_comp)
			d->steady_comp =
				roundup(delta + d->steady_comp, 2) / 2;
		else
			d->steady_comp = delta;
		d->confidence++;
	}
}

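/*
 * Called once per window on the control CPU: compute the actual package
 * C-state residency ratio (residency MSR delta relative to the TSC delta),
 * refresh the calibration data, flag reduce_irq when idle wakeups are
 * excessive, and report whether the next injection can be skipped because
 * the measured ratio already exceeds target + guard.
 */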
static bool powerclamp_adjust_controls(unsigned int target_ratio,
				unsigned int guard, unsigned int win)
{
	static u64 msr_last, tsc_last;
	u64 msr_now, tsc_now;
	u64 val64;

	/* check the result for the last window */
	msr_now = pkg_state_counter();
	tsc_now = rdtsc();

	/* calculate pkg cstate vs tsc ratio */
	if (!msr_last || !tsc_last)
		current_ratio = 1;
	else if (tsc_now - tsc_last) {
		val64 = 100 * (msr_now - msr_last);
		do_div(val64, (tsc_now - tsc_last));
		current_ratio = val64;
	}

	/* update record */
	msr_last = msr_now;
	tsc_last = tsc_now;

	adjust_compensation(target_ratio, win);

	/*
	 * too many external interrupts during idle time: set reduce_irq so
	 * that a penalty can be applied in the next round.
	 */
	reduce_irq = atomic_read(&idle_wakeup_counter) >=
		2 * win * num_online_cpus();

	atomic_set(&idle_wakeup_counter, 0);

	/* if we are above target + guard, skip the next injection */
	return set_target_ratio + guard <= current_ratio;
}

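/*
 * Each online CPU runs a dedicated kthread worker with two work items that
 * re-queue each other: clamp_balancing_func() picks up the latest target
 * ratio and window size, computes the compensated injection interval and
 * schedules clamp_idle_injection_func(), which forces the CPU idle for
 * duration_jiffies and then queues the balancing work again.
 */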
static void clamp_balancing_func(struct kthread_work *work)
{
	struct powerclamp_worker_data *w_data;
	int sleeptime;
	unsigned long target_jiffies;
	unsigned int compensated_ratio;
	int interval; /* jiffies between idle injection attempts */

	w_data = container_of(work, struct powerclamp_worker_data,
			      balancing_work);

	/*
	 * make sure a user-selected ratio does not take effect until
	 * the next round; re-read the target each round so that a changed
	 * target converges quickly.
	 */
	w_data->target_ratio = READ_ONCE(set_target_ratio);
	w_data->guard = 1 + w_data->target_ratio / 20;
	w_data->window_size_now = window_size;
	w_data->duration_jiffies = msecs_to_jiffies(duration);
	w_data->count++;

	/*
	 * systems vary in their ability to enter package C-states, so the
	 * injected idle ratio is compensated to reach the actual target
	 * reported by the hardware.
	 */
	compensated_ratio = w_data->target_ratio +
		get_compensation(w_data->target_ratio);
	if (compensated_ratio <= 0)
		compensated_ratio = 1;
	interval = w_data->duration_jiffies * 100 / compensated_ratio;

	/* align idle time */
	target_jiffies = roundup(jiffies, interval);
	sleeptime = target_jiffies - jiffies;
	if (sleeptime <= 0)
		sleeptime = 1;

	if (clamping && w_data->clamping && cpu_online(w_data->cpu))
		kthread_queue_delayed_work(w_data->worker,
					   &w_data->idle_injection_work,
					   sleeptime);
}

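/*
 * Inject one idle period on this CPU. Every window_size_now iterations the
 * control CPU additionally re-evaluates the controls; when the measured ratio
 * already exceeds target + guard, the injection is skipped for this round.
 */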
static void clamp_idle_injection_func(struct kthread_work *work)
{
	struct powerclamp_worker_data *w_data;

	w_data = container_of(work, struct powerclamp_worker_data,
			      idle_injection_work.work);

	/*
	 * only the elected controlling cpu collects statistics and updates
	 * the control parameters, once per window.
	 */
	if (w_data->cpu == control_cpu &&
	    !(w_data->count % w_data->window_size_now)) {
		should_skip =
			powerclamp_adjust_controls(w_data->target_ratio,
						   w_data->guard,
						   w_data->window_size_now);
		smp_mb();
	}

	if (should_skip)
		goto balance;

	play_idle(jiffies_to_usecs(w_data->duration_jiffies));

balance:
	if (clamping && w_data->clamping && cpu_online(w_data->cpu))
		kthread_queue_work(w_data->worker, &w_data->balancing_work);
}

/*
 * Poll the package C-state residency counters once per second while clamping
 * is active, so that the measured idle ratio can be reported through the
 * cooling device's current state.
 */
static void poll_pkg_cstate(struct work_struct *dummy);
static DECLARE_DELAYED_WORK(poll_pkg_cstate_work, poll_pkg_cstate);
static void poll_pkg_cstate(struct work_struct *dummy)
{
	static u64 msr_last;
	static u64 tsc_last;

	u64 msr_now;
	u64 tsc_now;
	u64 val64;

	msr_now = pkg_state_counter();
	tsc_now = rdtsc();

	/* calculate pkg cstate vs tsc ratio */
	if (!msr_last || !tsc_last)
		pkg_cstate_ratio_cur = 1;
	else {
		if (tsc_now - tsc_last) {
			val64 = 100 * (msr_now - msr_last);
			do_div(val64, (tsc_now - tsc_last));
			pkg_cstate_ratio_cur = val64;
		}
	}

	/* update record */
	msr_last = msr_now;
	tsc_last = tsc_now;

	if (clamping)
		schedule_delayed_work(&poll_pkg_cstate_work, HZ);
}

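/*
 * Create and start the clamping kthread worker for one CPU: pin it to the
 * CPU, give it FIFO scheduling, mark the CPU in cpu_clamping_mask and kick
 * off the balancing work.
 */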
static void start_power_clamp_worker(unsigned long cpu)
{
	struct powerclamp_worker_data *w_data = per_cpu_ptr(worker_data, cpu);
	struct kthread_worker *worker;

	worker = kthread_create_worker_on_cpu(cpu, 0, "kidle_inj/%ld", cpu);
	if (IS_ERR(worker))
		return;

	w_data->worker = worker;
	w_data->count = 0;
	w_data->cpu = cpu;
	w_data->clamping = true;
	set_bit(cpu, cpu_clamping_mask);
	sched_set_fifo(worker->task);
	kthread_init_work(&w_data->balancing_work, clamp_balancing_func);
	kthread_init_delayed_work(&w_data->idle_injection_work,
				  clamp_idle_injection_func);
	kthread_queue_work(w_data->worker, &w_data->balancing_work);
}

static void stop_power_clamp_worker(unsigned long cpu)
{
	struct powerclamp_worker_data *w_data = per_cpu_ptr(worker_data, cpu);

	if (!w_data->worker)
		return;

	w_data->clamping = false;
	/*
	 * Make sure that all work queued after this point sees clamping
	 * disabled. A counterpart barrier is not needed because there is an
	 * implicit memory barrier when the queued work is processed.
	 */
	smp_wmb();
	kthread_cancel_work_sync(&w_data->balancing_work);
	kthread_cancel_delayed_work_sync(&w_data->idle_injection_work);
	/*
	 * The balancing work may still be queued here because the handling
	 * of the "clamping" variable, the cancellation of the work, and the
	 * destruction of the worker are not fully synchronized.
	 */
	clear_bit(w_data->cpu, cpu_clamping_mask);
	kthread_destroy_worker(w_data->worker);

	w_data->worker = NULL;
}

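/*
 * Start clamping on all online CPUs: pick the control CPU, mark clamping
 * active, start the 1 Hz package C-state poll and spawn one clamping worker
 * per online CPU under cpus_read_lock().
 */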
static int start_power_clamp(void)
{
	unsigned long cpu;

	set_target_ratio = clamp(set_target_ratio, 0U, MAX_TARGET_RATIO - 1);
	/* prevent cpu hotplug */
	cpus_read_lock();

	/* prefer the BSP as the controlling CPU */
	control_cpu = 0;
	if (!cpu_online(control_cpu))
		control_cpu = smp_processor_id();

	clamping = true;
	schedule_delayed_work(&poll_pkg_cstate_work, 0);

	/* start one kthread worker per online cpu */
	for_each_online_cpu(cpu) {
		start_power_clamp_worker(cpu);
	}
	cpus_read_unlock();

	return 0;
}

static void end_power_clamp(void)
{
	int i;

	/*
	 * Block requeuing in all the kthread workers so that they flush
	 * and stop faster.
	 */
	clamping = false;
	for_each_set_bit(i, cpu_clamping_mask, num_possible_cpus()) {
		pr_debug("clamping worker for cpu %d alive, destroy\n", i);
		stop_power_clamp_worker(i);
	}
}

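/*
 * CPU hotplug callbacks: a newly onlined CPU gets its own clamping worker
 * while clamping is active, and a CPU going down has its worker stopped. If
 * the control CPU goes away, control moves to another online CPU.
 */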
static int powerclamp_cpu_online(unsigned int cpu)
{
	if (!clamping)
		return 0;
	start_power_clamp_worker(cpu);
	/* prefer the BSP as the controlling CPU */
	if (cpu == 0) {
		control_cpu = 0;
		smp_mb();
	}
	return 0;
}

static int powerclamp_cpu_predown(unsigned int cpu)
{
	if (!clamping)
		return 0;

	stop_power_clamp_worker(cpu);
	if (cpu != control_cpu)
		return 0;

	/* the control CPU is going away, hand control to another online CPU */
	control_cpu = cpumask_first(cpu_online_mask);
	if (control_cpu == cpu)
		control_cpu = cpumask_next(cpu, cpu_online_mask);
	smp_mb();
	return 0;
}

static int powerclamp_get_max_state(struct thermal_cooling_device *cdev,
				 unsigned long *state)
{
	*state = MAX_TARGET_RATIO;

	return 0;
}

static int powerclamp_get_cur_state(struct thermal_cooling_device *cdev,
				 unsigned long *state)
{
	if (clamping)
		*state = pkg_cstate_ratio_cur;
	else
		/* to save power, do not poll idle ratio while not clamping */
		*state = -1; /* indicates invalid state */

	return 0;
}

static int powerclamp_set_cur_state(struct thermal_cooling_device *cdev,
				 unsigned long new_target_ratio)
{
	int ret = 0;

	new_target_ratio = clamp(new_target_ratio, 0UL,
				(unsigned long) (MAX_TARGET_RATIO - 1));
	if (set_target_ratio == 0 && new_target_ratio > 0) {
		pr_info("Start idle injection to reduce power\n");
		set_target_ratio = new_target_ratio;
		ret = start_power_clamp();
		goto exit_set;
	} else if (set_target_ratio > 0 && new_target_ratio == 0) {
		pr_info("Stop forced idle injection\n");
		end_power_clamp();
		set_target_ratio = 0;
	} else {
		/* adjust the currently running clamping */
		set_target_ratio = new_target_ratio;
		/* make the new set_target_ratio visible to other cpus */
		smp_mb();
	}

exit_set:
	return ret;
}

/* bind to the generic thermal layer as a cooling device */
static const struct thermal_cooling_device_ops powerclamp_cooling_ops = {
	.get_max_state = powerclamp_get_max_state,
	.get_cur_state = powerclamp_get_cur_state,
	.set_cur_state = powerclamp_set_cur_state,
};

static const struct x86_cpu_id __initconst intel_powerclamp_ids[] = {
	X86_MATCH_VENDOR_FEATURE(INTEL, X86_FEATURE_MWAIT, NULL),
	{}
};
MODULE_DEVICE_TABLE(x86cpu, intel_powerclamp_ids);

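/*
 * The driver is only useful on Intel CPUs that support MWAIT and expose at
 * least one readable package C-state residency counter; refuse to load
 * otherwise.
 */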
static int __init powerclamp_probe(void)
{
	if (!x86_match_cpu(intel_powerclamp_ids)) {
		pr_err("CPU does not support MWAIT\n");
		return -ENODEV;
	}

	/* the goal of idle injection is to enter package C-states */
	if (!has_pkg_state_counter()) {
		pr_info("No package C-state available\n");
		return -ENODEV;
	}

	/* find the deepest mwait hint */
	find_target_mwait();

	return 0;
}

static int powerclamp_debug_show(struct seq_file *m, void *unused)
{
	int i = 0;

	seq_printf(m, "controlling cpu: %d\n", control_cpu);
	seq_printf(m, "pct confidence steady dynamic (compensation)\n");
	for (i = 0; i < MAX_TARGET_RATIO; i++) {
		seq_printf(m, "%d\t%lu\t%lu\t%lu\n",
			i,
			cal_data[i].confidence,
			cal_data[i].steady_comp,
			cal_data[i].dynamic_comp);
	}

	return 0;
}

DEFINE_SHOW_ATTRIBUTE(powerclamp_debug);

static inline void powerclamp_create_debug_files(void)
{
	debug_dir = debugfs_create_dir("intel_powerclamp", NULL);

	debugfs_create_file("powerclamp_calib", S_IRUGO, debug_dir, cal_data,
			    &powerclamp_debug_fops);
}

static enum cpuhp_state hp_state;

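/*
 * Module init: allocate the clamping bitmap, probe for hardware support,
 * register the CPU hotplug callbacks, allocate the per-CPU worker data,
 * register the "intel_powerclamp" cooling device and create the debugfs
 * files.
 */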
static int __init powerclamp_init(void)
{
	int retval;

	cpu_clamping_mask = bitmap_zalloc(num_possible_cpus(), GFP_KERNEL);
	if (!cpu_clamping_mask)
		return -ENOMEM;

	/* probe cpu features and ids here */
	retval = powerclamp_probe();
	if (retval)
		goto exit_free;

	/* set the default window size; adjustable via the module parameter */
	window_size = 2;
	retval = cpuhp_setup_state_nocalls(CPUHP_AP_ONLINE_DYN,
					   "thermal/intel_powerclamp:online",
					   powerclamp_cpu_online,
					   powerclamp_cpu_predown);
	if (retval < 0)
		goto exit_free;

	hp_state = retval;

	worker_data = alloc_percpu(struct powerclamp_worker_data);
	if (!worker_data) {
		retval = -ENOMEM;
		goto exit_unregister;
	}

	cooling_dev = thermal_cooling_device_register("intel_powerclamp", NULL,
						      &powerclamp_cooling_ops);
	if (IS_ERR(cooling_dev)) {
		retval = -ENODEV;
		goto exit_free_thread;
	}

	if (!duration)
		duration = jiffies_to_msecs(DEFAULT_DURATION_JIFFIES);

	powerclamp_create_debug_files();

	return 0;

exit_free_thread:
	free_percpu(worker_data);
exit_unregister:
	cpuhp_remove_state_nocalls(hp_state);
exit_free:
	bitmap_free(cpu_clamping_mask);
	return retval;
}
module_init(powerclamp_init);

static void __exit powerclamp_exit(void)
{
	end_power_clamp();
	cpuhp_remove_state_nocalls(hp_state);
	free_percpu(worker_data);
	thermal_cooling_device_unregister(cooling_dev);
	bitmap_free(cpu_clamping_mask);

	cancel_delayed_work_sync(&poll_pkg_cstate_work);
	debugfs_remove_recursive(debug_dir);
}
module_exit(powerclamp_exit);

MODULE_LICENSE("GPL");
MODULE_AUTHOR("Arjan van de Ven <arjan@linux.intel.com>");
MODULE_AUTHOR("Jacob Pan <jacob.jun.pan@linux.intel.com>");
MODULE_DESCRIPTION("Package Level C-state Idle Injection for Intel CPUs");