// SPDX-License-Identifier: GPL-2.0
/*
 * Pressure stall information for CPU, memory and IO.
 */
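/*
 * Include block as in the upstream kernel/sched/psi.c this code comes from
 * (reconstructed here; verify against the tree you build against).
 */
#include "../workqueue_internal.h"
#include <linux/sched/loadavg.h>
#include <linux/seq_file.h>
#include <linux/proc_fs.h>
#include <linux/seqlock.h>
#include <linux/uaccess.h>
#include <linux/cgroup.h>
#include <linux/module.h>
#include <linux/sched.h>
#include <linux/ctype.h>
#include <linux/file.h>
#include <linux/poll.h>
#include <linux/psi.h>
#include "sched.h"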
static int psi_bug __read_mostly;

DEFINE_STATIC_KEY_FALSE(psi_disabled);
DEFINE_STATIC_KEY_TRUE(psi_cgroups_enabled);

#ifdef CONFIG_PSI_DEFAULT_DISABLED
static bool psi_enable;
#else
static bool psi_enable = true;
#endif
static int __init setup_psi(char *str)
{
        return kstrtobool(str, &psi_enable) == 0;
}
__setup("psi=", setup_psi);

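/*
 * Running-average bookkeeping: stall times are sampled every 2 seconds and
 * folded into exponentially decaying averages over 10s, 60s and 300s
 * windows.  The EXP_* constants are the per-sample decay factors in 1/2048
 * fixed point (2048/exp(2s/10s), 2048/exp(2s/60s), 2048/exp(2s/300s)),
 * consumed by the loadavg-style calc_load() math in calc_avgs().
 */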
#define PSI_FREQ	(2*HZ+1)
#define EXP_10s		1677
#define EXP_60s		1981
#define EXP_300s	2034

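/*
 * Bounds for userspace-defined trigger windows: between 500ms and 10s, with
 * each window sampled in at most UPDATES_PER_WINDOW slices, which caps how
 * often the poll worker needs to run per trigger.
 */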
#define WINDOW_MIN_US 500000
#define WINDOW_MAX_US 10000000
#define UPDATES_PER_WINDOW 10

static u64 psi_period __read_mostly;

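/* System-level pressure and stall tracking */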
static DEFINE_PER_CPU(struct psi_group_cpu, system_group_pcpu);
struct psi_group psi_system = {
        .pcpu = &system_group_pcpu,
};

static void psi_avgs_work(struct work_struct *work);

static void poll_timer_fn(struct timer_list *t);

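/*
 * Initialize a pressure group: per-CPU seqcounts, the averaging worker and
 * its clock, and the (initially idle) trigger polling machinery.
 */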
static void group_init(struct psi_group *group)
{
        int cpu;

        for_each_possible_cpu(cpu)
                seqcount_init(&per_cpu_ptr(group->pcpu, cpu)->seq);
        group->avg_last_update = sched_clock();
        group->avg_next_update = group->avg_last_update + psi_period;
        INIT_DELAYED_WORK(&group->avgs_work, psi_avgs_work);
        mutex_init(&group->avgs_lock);

        mutex_init(&group->trigger_lock);
        INIT_LIST_HEAD(&group->triggers);
        group->poll_min_period = U32_MAX;
        group->polling_next_update = ULLONG_MAX;
        init_waitqueue_head(&group->poll_wait);
        timer_setup(&group->poll_timer, poll_timer_fn, 0);
        rcu_assign_pointer(group->poll_task, NULL);
}

void __init psi_init(void)
{
        if (!psi_enable) {
                static_branch_enable(&psi_disabled);
                return;
        }

        if (!cgroup_psi_enabled())
                static_branch_disable(&psi_cgroups_enabled);

        psi_period = jiffies_to_nsecs(PSI_FREQ);
        group_init(&psi_system);
}

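/*
 * SOME: at least one non-idle task on this CPU is stalled on the resource.
 * FULL: all non-idle tasks on this CPU are stalled on it at the same time
 * (for CPU, runnable tasks exist but none of them is currently on the CPU).
 */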
static bool test_state(unsigned int *tasks, enum psi_states state)
{
        switch (state) {
        case PSI_IO_SOME:
                return unlikely(tasks[NR_IOWAIT]);
        case PSI_IO_FULL:
                return unlikely(tasks[NR_IOWAIT] && !tasks[NR_RUNNING]);
        case PSI_MEM_SOME:
                return unlikely(tasks[NR_MEMSTALL]);
        case PSI_MEM_FULL:
                return unlikely(tasks[NR_MEMSTALL] &&
                        tasks[NR_RUNNING] == tasks[NR_MEMSTALL_RUNNING]);
        case PSI_CPU_SOME:
                return unlikely(tasks[NR_RUNNING] > tasks[NR_ONCPU]);
        case PSI_CPU_FULL:
                return unlikely(tasks[NR_RUNNING] && !tasks[NR_ONCPU]);
        case PSI_NONIDLE:
                return tasks[NR_IOWAIT] || tasks[NR_MEMSTALL] ||
                        tasks[NR_RUNNING];
        default:
                return false;
        }
}

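/*
 * Snapshot one CPU's stall counters for the given aggregator.  The times are
 * read under the seqcount, extended by the time accrued by states that are
 * still active right now, and returned as deltas since this aggregator last
 * sampled the CPU, along with a bitmask of the states that changed.
 */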
static void get_recent_times(struct psi_group *group, int cpu,
                             enum psi_aggregators aggregator, u32 *times,
                             u32 *pchanged_states)
{
        struct psi_group_cpu *groupc = per_cpu_ptr(group->pcpu, cpu);
        u64 now, state_start;
        enum psi_states s;
        unsigned int seq;
        u32 state_mask;

        *pchanged_states = 0;

        do {
                seq = read_seqcount_begin(&groupc->seq);
                now = cpu_clock(cpu);
                memcpy(times, groupc->times, sizeof(groupc->times));
                state_mask = groupc->state_mask;
                state_start = groupc->state_start;
        } while (read_seqcount_retry(&groupc->seq, seq));

        for (s = 0; s < NR_PSI_STATES; s++) {
                u32 delta;

                if (state_mask & (1 << s))
                        times[s] += now - state_start;

                delta = times[s] - groupc->times_prev[aggregator][s];
                groupc->times_prev[aggregator][s] = times[s];

                times[s] = delta;
                if (delta)
                        *pchanged_states |= (1 << s);
        }
}

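/*
 * Fold one sample into the 10s/60s/300s running averages.  Periods that were
 * missed entirely are decayed with zero samples first; the current period's
 * stall time is then converted to a percentage and fed into the loadavg
 * fixed-point EWMAs: avg = avg * exp + pct * (FIXED_1 - exp).
 */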
static void calc_avgs(unsigned long avg[3], int missed_periods,
                      u64 time, u64 period)
{
        unsigned long pct;

        if (missed_periods) {
                avg[0] = calc_load_n(avg[0], EXP_10s, 0, missed_periods);
                avg[1] = calc_load_n(avg[1], EXP_60s, 0, missed_periods);
                avg[2] = calc_load_n(avg[2], EXP_300s, 0, missed_periods);
        }

        pct = div_u64(time * 100, period);
        pct *= FIXED_1;
        avg[0] = calc_load(avg[0], EXP_10s, pct);
        avg[1] = calc_load(avg[1], EXP_60s, pct);
        avg[2] = calc_load(avg[2], EXP_300s, pct);
}

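/*
 * Aggregate the per-CPU deltas into the group-wide totals.  Each CPU's stall
 * time is weighted by how long that CPU was non-idle during the sampling
 * period, so pressure on a mostly idle machine still shows up while fully
 * idle CPUs don't dilute the result; the weighted sum is normalized by the
 * total non-idle time afterwards.
 */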
static void collect_percpu_times(struct psi_group *group,
                                 enum psi_aggregators aggregator,
                                 u32 *pchanged_states)
{
        u64 deltas[NR_PSI_STATES - 1] = { 0, };
        unsigned long nonidle_total = 0;
        u32 changed_states = 0;
        int cpu;
        int s;

        for_each_possible_cpu(cpu) {
                u32 times[NR_PSI_STATES];
                u32 nonidle;
                u32 cpu_changed_states;

                get_recent_times(group, cpu, aggregator, times,
                                 &cpu_changed_states);
                changed_states |= cpu_changed_states;

                nonidle = nsecs_to_jiffies(times[PSI_NONIDLE]);
                nonidle_total += nonidle;

                for (s = 0; s < PSI_NONIDLE; s++)
                        deltas[s] += (u64)times[s] * nonidle;
        }

        for (s = 0; s < NR_PSI_STATES - 1; s++)
                group->total[aggregator][s] +=
                                div_u64(deltas[s], max(nonidle_total, 1UL));

        if (pchanged_states)
                *pchanged_states = changed_states;
}

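/*
 * Advance the averaging clock.  Called with avgs_lock held: works out how
 * many 2s periods elapsed (including fully missed ones), clamps each state's
 * new stall sample to the period length so percentages stay within 100%, and
 * returns when the next update is due.
 */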
static u64 update_averages(struct psi_group *group, u64 now)
{
        unsigned long missed_periods = 0;
        u64 expires, period;
        u64 avg_next_update;
        int s;

        expires = group->avg_next_update;
        if (now - expires >= psi_period)
                missed_periods = div_u64(now - expires, psi_period);

        avg_next_update = expires + ((1 + missed_periods) * psi_period);
        period = now - (group->avg_last_update + (missed_periods * psi_period));
        group->avg_last_update = now;

        for (s = 0; s < NR_PSI_STATES - 1; s++) {
                u32 sample;

                sample = group->total[PSI_AVGS][s] - group->avg_total[s];

                if (sample > period)
                        sample = period;
                group->avg_total[s] += sample;
                calc_avgs(group->avg[s], missed_periods, sample, period);
        }

        return avg_next_update;
}

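/*
 * Periodic averaging worker: collects per-CPU deltas, updates the averages
 * when due, and re-arms itself only while some CPU is non-idle.  On a fully
 * idle group the clock stops; psi_group_change() restarts it.
 */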
static void psi_avgs_work(struct work_struct *work)
{
        struct delayed_work *dwork;
        struct psi_group *group;
        u32 changed_states;
        bool nonidle;
        u64 now;

        dwork = to_delayed_work(work);
        group = container_of(dwork, struct psi_group, avgs_work);

        mutex_lock(&group->avgs_lock);

        now = sched_clock();

        collect_percpu_times(group, PSI_AVGS, &changed_states);
        nonidle = changed_states & (1 << PSI_NONIDLE);

        if (now >= group->avg_next_update)
                group->avg_next_update = update_averages(group, now);

        if (nonidle) {
                schedule_delayed_work(dwork, nsecs_to_jiffies(
                                group->avg_next_update - now) + 1);
        }

        mutex_unlock(&group->avgs_lock);
}

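/*
 * Trigger windows: window_reset() starts a new tracking window at @now with
 * the current total stall @value as its baseline.
 */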
static void window_reset(struct psi_window *win, u64 now, u64 value,
                         u64 prev_growth)
{
        win->start_time = now;
        win->start_value = value;
        win->prev_growth = prev_growth;
}

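/*
 * Return the growth of the stall counter within the trigger's window.  If
 * the window has expired, a new one is started; otherwise the growth
 * observed so far is combined with a proportional share of the previous
 * window's growth to approximate a continuously sliding window.
 */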
static u64 window_update(struct psi_window *win, u64 now, u64 value)
{
        u64 elapsed;
        u64 growth;

        elapsed = now - win->start_time;
        growth = value - win->start_value;

        if (elapsed > win->size)
                window_reset(win, now, value, growth);
        else {
                u32 remaining;

                remaining = win->size - elapsed;
                growth += div64_u64(win->prev_growth * remaining, win->size);
        }

        return growth;
}

static void init_triggers(struct psi_group *group, u64 now)
{
        struct psi_trigger *t;

        list_for_each_entry(t, &group->triggers, node)
                window_reset(&t->win, now,
                             group->total[PSI_POLL][t->state], 0);
        memcpy(group->polling_total, group->total[PSI_POLL],
               sizeof(group->polling_total));
        group->polling_next_update = now + group->poll_min_period;
}

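/*
 * Walk all triggers of a group, compare each one's window growth against its
 * threshold, and fire at most one event per trigger per window.  A trigger
 * that crossed its threshold while still rate-limited stays pending and
 * fires once the window since its last event has passed.  Returns the time
 * of the next trigger update.
 */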
static u64 update_triggers(struct psi_group *group, u64 now)
{
        struct psi_trigger *t;
        bool update_total = false;
        u64 *total = group->total[PSI_POLL];

        list_for_each_entry(t, &group->triggers, node) {
                u64 growth;
                bool new_stall;

                new_stall = group->polling_total[t->state] != total[t->state];

                if (!new_stall && !t->pending_event)
                        continue;

                if (new_stall) {
                        update_total = true;

                        growth = window_update(&t->win, now, total[t->state]);
                        if (growth < t->threshold)
                                continue;

                        t->pending_event = true;
                }

                if (now < t->last_event_time + t->win.size)
                        continue;

                if (cmpxchg(&t->event, 0, 1) == 0)
                        wake_up_interruptible(&t->event_wait);
                t->last_event_time = now;

                t->pending_event = false;
        }

        if (update_total)
                memcpy(group->polling_total, total,
                       sizeof(group->polling_total));

        return now + group->poll_min_period;
}

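/*
 * Schedule the poll worker to run after @delay jiffies, but only if the
 * polling kthread exists and no wakeup is already pending on its timer.
 */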
static void psi_schedule_poll_work(struct psi_group *group, unsigned long delay)
{
        struct task_struct *task;

        if (timer_pending(&group->poll_timer))
                return;

        rcu_read_lock();

        task = rcu_dereference(group->poll_task);

        if (likely(task))
                mod_timer(&group->poll_timer, jiffies + delay);

        rcu_read_unlock();
}

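/*
 * Poll worker body: re-collects the stall counters, (re)starts the polling
 * window when a monitored state grows, evaluates triggers when due, and lets
 * polling lapse once no monitored state has changed for a full window's
 * worth of updates.
 */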
static void psi_poll_work(struct psi_group *group)
{
        u32 changed_states;
        u64 now;

        mutex_lock(&group->trigger_lock);

        now = sched_clock();

        collect_percpu_times(group, PSI_POLL, &changed_states);

        if (changed_states & group->poll_states) {
                if (now > group->polling_until)
                        init_triggers(group, now);

                group->polling_until = now +
                        group->poll_min_period * UPDATES_PER_WINDOW;
        }

        if (now > group->polling_until) {
                group->polling_next_update = ULLONG_MAX;
                goto out;
        }

        if (now >= group->polling_next_update)
                group->polling_next_update = update_triggers(group, now);

        psi_schedule_poll_work(group,
                nsecs_to_jiffies(group->polling_next_update - now) + 1);

out:
        mutex_unlock(&group->trigger_lock);
}

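/*
 * Dedicated RT kthread ("psimon") that performs the polling work; it is
 * woken from poll_timer_fn() and parks in an interruptible wait between
 * polling passes.
 */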
static int psi_poll_worker(void *data)
{
        struct psi_group *group = (struct psi_group *)data;

        sched_set_fifo_low(current);

        while (true) {
                wait_event_interruptible(group->poll_wait,
                                atomic_cmpxchg(&group->poll_wakeup, 1, 0) ||
                                kthread_should_stop());
                if (kthread_should_stop())
                        break;

                psi_poll_work(group);
        }
        return 0;
}

static void poll_timer_fn(struct timer_list *t)
{
        struct psi_group *group = from_timer(group, t, poll_timer);

        atomic_set(&group->poll_wakeup, 1);
        wake_up_interruptible(&group->poll_wait);
}

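/*
 * Account the time since the last state change to every state that was
 * active in the current state_mask, then restart the state clock at @now.
 */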
static void record_times(struct psi_group_cpu *groupc, u64 now)
{
        u32 delta;

        delta = now - groupc->state_start;
        groupc->state_start = now;

        if (groupc->state_mask & (1 << PSI_IO_SOME)) {
                groupc->times[PSI_IO_SOME] += delta;
                if (groupc->state_mask & (1 << PSI_IO_FULL))
                        groupc->times[PSI_IO_FULL] += delta;
        }

        if (groupc->state_mask & (1 << PSI_MEM_SOME)) {
                groupc->times[PSI_MEM_SOME] += delta;
                if (groupc->state_mask & (1 << PSI_MEM_FULL))
                        groupc->times[PSI_MEM_FULL] += delta;
        }

        if (groupc->state_mask & (1 << PSI_CPU_SOME)) {
                groupc->times[PSI_CPU_SOME] += delta;
                if (groupc->state_mask & (1 << PSI_CPU_FULL))
                        groupc->times[PSI_CPU_FULL] += delta;
        }

        if (groupc->state_mask & (1 << PSI_NONIDLE))
                groupc->times[PSI_NONIDLE] += delta;
}

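/*
 * Apply a task state change (@clear/@set bits) to one CPU of a group: close
 * out the old state's times, adjust the per-state task counts, recompute the
 * state mask, and kick the poll worker and/or the averaging clock if needed.
 * Callers hold the runqueue lock of @cpu; concurrent readers are excluded
 * via the per-CPU seqcount.
 */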
static void psi_group_change(struct psi_group *group, int cpu,
                             unsigned int clear, unsigned int set, u64 now,
                             bool wake_clock)
{
        struct psi_group_cpu *groupc;
        u32 state_mask = 0;
        unsigned int t, m;
        enum psi_states s;

        groupc = per_cpu_ptr(group->pcpu, cpu);

        write_seqcount_begin(&groupc->seq);

        record_times(groupc, now);

        for (t = 0, m = clear; m; m &= ~(1 << t), t++) {
                if (!(m & (1 << t)))
                        continue;
                if (groupc->tasks[t]) {
                        groupc->tasks[t]--;
                } else if (!psi_bug) {
                        printk_deferred(KERN_ERR "psi: task underflow! cpu=%d t=%d tasks=[%u %u %u %u %u] clear=%x set=%x\n",
                                        cpu, t, groupc->tasks[0],
                                        groupc->tasks[1], groupc->tasks[2],
                                        groupc->tasks[3], groupc->tasks[4],
                                        clear, set);
                        psi_bug = 1;
                }
        }

        for (t = 0; set; set &= ~(1 << t), t++)
                if (set & (1 << t))
                        groupc->tasks[t]++;

        for (s = 0; s < NR_PSI_STATES; s++) {
                if (test_state(groupc->tasks, s))
                        state_mask |= (1 << s);
        }

        if (unlikely(groupc->tasks[NR_ONCPU] && cpu_curr(cpu)->in_memstall))
                state_mask |= (1 << PSI_MEM_FULL);

        groupc->state_mask = state_mask;

        write_seqcount_end(&groupc->seq);

        if (state_mask & group->poll_states)
                psi_schedule_poll_work(group, 1);

        if (wake_clock && !delayed_work_pending(&group->avgs_work))
                schedule_delayed_work(&group->avgs_work, PSI_FREQ);
}

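/*
 * Iterate over all psi groups a task belongs to: the cgroups from the task's
 * leaf cgroup up to (but not including) the root, followed by the
 * system-wide group.  Returns NULL when the walk is complete.
 */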
static struct psi_group *iterate_groups(struct task_struct *task, void **iter)
{
        if (*iter == &psi_system)
                return NULL;

#ifdef CONFIG_CGROUPS
        if (static_branch_likely(&psi_cgroups_enabled)) {
                struct cgroup *cgroup = NULL;

                if (!*iter)
                        cgroup = task->cgroups->dfl_cgrp;
                else
                        cgroup = cgroup_parent(*iter);

                if (cgroup && cgroup_parent(cgroup)) {
                        *iter = cgroup;
                        return cgroup_psi(cgroup);
                }
        }
#endif
        *iter = &psi_system;
        return &psi_system;
}

static void psi_flags_change(struct task_struct *task, int clear, int set)
{
        if (((task->psi_flags & set) ||
             (task->psi_flags & clear) != clear) &&
            !psi_bug) {
                printk_deferred(KERN_ERR "psi: inconsistent task state! task=%d:%s cpu=%d psi_flags=%x clear=%x set=%x\n",
                                task->pid, task->comm, task_cpu(task),
                                task->psi_flags, clear, set);
                psi_bug = 1;
        }

        task->psi_flags &= ~clear;
        task->psi_flags |= set;
}

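/*
 * Update a task's psi flags and propagate the change to every group the task
 * belongs to.  A workqueue worker that goes to sleep right after having run
 * psi_avgs_work() must not re-arm the averaging clock, otherwise an
 * otherwise idle group would keep waking the worker indefinitely.
 */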
void psi_task_change(struct task_struct *task, int clear, int set)
{
        int cpu = task_cpu(task);
        struct psi_group *group;
        bool wake_clock = true;
        void *iter = NULL;
        u64 now;

        if (!task->pid)
                return;

        psi_flags_change(task, clear, set);

        now = cpu_clock(cpu);

        if (unlikely((clear & TSK_RUNNING) &&
                     (task->flags & PF_WQ_WORKER) &&
                     wq_worker_last_func(task) == psi_avgs_work))
                wake_clock = false;

        while ((group = iterate_groups(task, &iter)))
                psi_group_change(group, cpu, clear, set, now, wake_clock);
}

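/*
 * Called on context switch: TSK_ONCPU moves from @prev to @next.  Groups
 * shared by both tasks whose state would not change are skipped for the
 * ONCPU update; if @prev is going to sleep, its RUNNING/IOWAIT changes are
 * still applied to those shared groups in a second pass with ONCPU removed
 * from the clear mask.
 */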
void psi_task_switch(struct task_struct *prev, struct task_struct *next,
                     bool sleep)
{
        struct psi_group *group, *common = NULL;
        int cpu = task_cpu(prev);
        void *iter;
        u64 now = cpu_clock(cpu);

        if (next->pid) {
                bool identical_state;

                psi_flags_change(next, 0, TSK_ONCPU);

                identical_state = prev->psi_flags == next->psi_flags;
                iter = NULL;
                while ((group = iterate_groups(next, &iter))) {
                        if (identical_state &&
                            per_cpu_ptr(group->pcpu, cpu)->tasks[NR_ONCPU]) {
                                common = group;
                                break;
                        }

                        psi_group_change(group, cpu, 0, TSK_ONCPU, now, true);
                }
        }

        if (prev->pid) {
                int clear = TSK_ONCPU, set = 0;

                if (sleep) {
                        clear |= TSK_RUNNING;
                        if (prev->in_memstall)
                                clear |= TSK_MEMSTALL_RUNNING;
                        if (prev->in_iowait)
                                set |= TSK_IOWAIT;
                }

                psi_flags_change(prev, clear, set);

                iter = NULL;
                while ((group = iterate_groups(prev, &iter)) && group != common)
                        psi_group_change(group, cpu, clear, set, now, true);

                if (sleep) {
                        clear &= ~TSK_ONCPU;
                        for (; group; group = iterate_groups(prev, &iter))
                                psi_group_change(group, cpu, clear, set, now, true);
                }
        }
}

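/**
 * psi_memstall_enter - mark the beginning of a memory stall section
 * @flags: flags to handle nested sections
 *
 * Marks the calling task as being stalled due to a lack of memory, such as
 * waiting on a refault or in direct reclaim.  Pair with psi_memstall_leave()
 * using the same @flags.
 */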
void psi_memstall_enter(unsigned long *flags)
{
        struct rq_flags rf;
        struct rq *rq;

        if (static_branch_likely(&psi_disabled))
                return;

        *flags = current->in_memstall;
        if (*flags)
                return;

        rq = this_rq_lock_irq(&rf);

        current->in_memstall = 1;
        psi_task_change(current, 0, TSK_MEMSTALL | TSK_MEMSTALL_RUNNING);

        rq_unlock_irq(rq, &rf);
}

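/**
 * psi_memstall_leave - mark the end of a memory stall section
 * @flags: flags to handle nested sections
 *
 * Marks the calling task as no longer stalled due to lack of memory; @flags
 * must come from the matching psi_memstall_enter().
 */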
void psi_memstall_leave(unsigned long *flags)
{
        struct rq_flags rf;
        struct rq *rq;

        if (static_branch_likely(&psi_disabled))
                return;

        if (*flags)
                return;

        rq = this_rq_lock_irq(&rf);

        current->in_memstall = 0;
        psi_task_change(current, TSK_MEMSTALL | TSK_MEMSTALL_RUNNING, 0);

        rq_unlock_irq(rq, &rf);
}

#ifdef CONFIG_CGROUPS
int psi_cgroup_alloc(struct cgroup *cgroup)
{
        if (static_branch_likely(&psi_disabled))
                return 0;

        cgroup->psi = kzalloc(sizeof(struct psi_group), GFP_KERNEL);
        if (!cgroup->psi)
                return -ENOMEM;

        cgroup->psi->pcpu = alloc_percpu(struct psi_group_cpu);
        if (!cgroup->psi->pcpu) {
                kfree(cgroup->psi);
                return -ENOMEM;
        }
        group_init(cgroup->psi);
        return 0;
}

void psi_cgroup_free(struct cgroup *cgroup)
{
        if (static_branch_likely(&psi_disabled))
                return;

        cancel_delayed_work_sync(&cgroup->psi->avgs_work);
        free_percpu(cgroup->psi->pcpu);

        WARN_ONCE(cgroup->psi->poll_states, "psi: trigger leak\n");
        kfree(cgroup->psi);
}

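/*
 * When a task moves between cgroups, its pressure state has to move with it:
 * the task is removed from the accounting of its old groups, the cgroup
 * pointer is switched, and the same state is re-added under the new
 * hierarchy.  The rq lock is held across the move so the scheduler cannot
 * change the task's state in between.
 */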
void cgroup_move_task(struct task_struct *task, struct css_set *to)
{
        unsigned int task_flags;
        struct rq_flags rf;
        struct rq *rq;

        if (static_branch_likely(&psi_disabled)) {
                rcu_assign_pointer(task->cgroups, to);
                return;
        }

        rq = task_rq_lock(task, &rf);

        task_flags = task->psi_flags;

        if (task_flags)
                psi_task_change(task, task_flags, 0);

        rcu_assign_pointer(task->cgroups, to);

        if (task_flags)
                psi_task_change(task, 0, task_flags);

        task_rq_unlock(rq, task, &rf);
}
#endif

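/*
 * Produce the "some"/"full" pressure lines for one resource, e.g.:
 *
 *   some avg10=0.00 avg60=0.00 avg300=0.00 total=0
 *   full avg10=0.00 avg60=0.00 avg300=0.00 total=0
 *
 * The averages are refreshed on demand; total is reported in microseconds.
 * The system-wide CPU "full" line is left at zero here, as the aggregate
 * FULL state is not defined at the system level.
 */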
int psi_show(struct seq_file *m, struct psi_group *group, enum psi_res res)
{
        int full;
        u64 now;

        if (static_branch_likely(&psi_disabled))
                return -EOPNOTSUPP;

        mutex_lock(&group->avgs_lock);
        now = sched_clock();
        collect_percpu_times(group, PSI_AVGS, NULL);
        if (now >= group->avg_next_update)
                group->avg_next_update = update_averages(group, now);
        mutex_unlock(&group->avgs_lock);

        for (full = 0; full < 2; full++) {
                unsigned long avg[3] = { 0, };
                u64 total = 0;
                int w;

                if (!(group == &psi_system && res == PSI_CPU && full)) {
                        for (w = 0; w < 3; w++)
                                avg[w] = group->avg[res * 2 + full][w];
                        total = div_u64(group->total[PSI_AVGS][res * 2 + full],
                                        NSEC_PER_USEC);
                }

                seq_printf(m, "%s avg10=%lu.%02lu avg60=%lu.%02lu avg300=%lu.%02lu total=%llu\n",
                           full ? "full" : "some",
                           LOAD_INT(avg[0]), LOAD_FRAC(avg[0]),
                           LOAD_INT(avg[1]), LOAD_FRAC(avg[1]),
                           LOAD_INT(avg[2]), LOAD_FRAC(avg[2]),
                           total);
        }

        return 0;
}

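/*
 * Create a trigger from a user-supplied string of the form
 * "some <threshold_us> <window_us>" or "full <threshold_us> <window_us>",
 * e.g. "some 150000 1000000" for 150ms of some-stall within a 1s window.
 * The first trigger on a group also starts the "psimon" polling kthread.
 */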
struct psi_trigger *psi_trigger_create(struct psi_group *group,
                                       char *buf, enum psi_res res)
{
        struct psi_trigger *t;
        enum psi_states state;
        u32 threshold_us;
        u32 window_us;

        if (static_branch_likely(&psi_disabled))
                return ERR_PTR(-EOPNOTSUPP);

        if (sscanf(buf, "some %u %u", &threshold_us, &window_us) == 2)
                state = PSI_IO_SOME + res * 2;
        else if (sscanf(buf, "full %u %u", &threshold_us, &window_us) == 2)
                state = PSI_IO_FULL + res * 2;
        else
                return ERR_PTR(-EINVAL);

        if (state >= PSI_NONIDLE)
                return ERR_PTR(-EINVAL);

        if (window_us < WINDOW_MIN_US ||
            window_us > WINDOW_MAX_US)
                return ERR_PTR(-EINVAL);

        if (threshold_us == 0 || threshold_us > window_us)
                return ERR_PTR(-EINVAL);

        t = kmalloc(sizeof(*t), GFP_KERNEL);
        if (!t)
                return ERR_PTR(-ENOMEM);

        t->group = group;
        t->state = state;
        t->threshold = threshold_us * NSEC_PER_USEC;
        t->win.size = window_us * NSEC_PER_USEC;
        window_reset(&t->win, sched_clock(),
                     group->total[PSI_POLL][t->state], 0);

        t->event = 0;
        t->last_event_time = 0;
        init_waitqueue_head(&t->event_wait);
        t->pending_event = false;

        mutex_lock(&group->trigger_lock);

        if (!rcu_access_pointer(group->poll_task)) {
                struct task_struct *task;

                task = kthread_create(psi_poll_worker, group, "psimon");
                if (IS_ERR(task)) {
                        kfree(t);
                        mutex_unlock(&group->trigger_lock);
                        return ERR_CAST(task);
                }
                atomic_set(&group->poll_wakeup, 0);
                wake_up_process(task);
                rcu_assign_pointer(group->poll_task, task);
        }

        list_add(&t->node, &group->triggers);
        group->poll_min_period = min(group->poll_min_period,
                div_u64(t->win.size, UPDATES_PER_WINDOW));
        group->nr_triggers[t->state]++;
        group->poll_states |= (1 << t->state);

        mutex_unlock(&group->trigger_lock);

        return t;
}

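/*
 * Tear down a trigger: wake any waiters, unlink it, recompute the group's
 * polling parameters, and stop the polling kthread when the last trigger
 * goes away.  synchronize_rcu() waits for any concurrent
 * psi_schedule_poll_work() read-side section to finish before the kthread is
 * stopped and the trigger freed.
 */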
void psi_trigger_destroy(struct psi_trigger *t)
{
        struct psi_group *group;
        struct task_struct *task_to_destroy = NULL;

        if (!t)
                return;

        group = t->group;

        wake_up_interruptible(&t->event_wait);

        mutex_lock(&group->trigger_lock);

        if (!list_empty(&t->node)) {
                struct psi_trigger *tmp;
                u64 period = ULLONG_MAX;

                list_del(&t->node);
                group->nr_triggers[t->state]--;
                if (!group->nr_triggers[t->state])
                        group->poll_states &= ~(1 << t->state);

                list_for_each_entry(tmp, &group->triggers, node)
                        period = min(period, div_u64(tmp->win.size,
                                        UPDATES_PER_WINDOW));
                group->poll_min_period = period;

                if (group->poll_states == 0) {
                        group->polling_until = 0;
                        task_to_destroy = rcu_dereference_protected(
                                        group->poll_task,
                                        lockdep_is_held(&group->trigger_lock));
                        rcu_assign_pointer(group->poll_task, NULL);
                        del_timer(&group->poll_timer);
                }
        }

        mutex_unlock(&group->trigger_lock);

        synchronize_rcu();

        if (task_to_destroy) {
                kthread_stop(task_to_destroy);
        }
        kfree(t);
}

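/*
 * poll()/epoll support for trigger fds: EPOLLPRI is raised once per fired
 * event; the event flag is consumed by this read.
 */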
__poll_t psi_trigger_poll(void **trigger_ptr,
                          struct file *file, poll_table *wait)
{
        __poll_t ret = DEFAULT_POLLMASK;
        struct psi_trigger *t;

        if (static_branch_likely(&psi_disabled))
                return DEFAULT_POLLMASK | EPOLLERR | EPOLLPRI;

        t = smp_load_acquire(trigger_ptr);
        if (!t)
                return DEFAULT_POLLMASK | EPOLLERR | EPOLLPRI;

        poll_wait(file, &t->event_wait, wait);

        if (cmpxchg(&t->event, 1, 0) == 1)
                ret |= EPOLLPRI;

        return ret;
}

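/*
 * The /proc/pressure/{io,memory,cpu} interface: reading returns the
 * psi_show() output for the system group; writing a trigger string such as
 * "some 150000 1000000" registers a trigger on the open file, which can then
 * be poll()ed for EPOLLPRI events.
 */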
#ifdef CONFIG_PROC_FS
static int psi_io_show(struct seq_file *m, void *v)
{
        return psi_show(m, &psi_system, PSI_IO);
}

static int psi_memory_show(struct seq_file *m, void *v)
{
        return psi_show(m, &psi_system, PSI_MEM);
}

static int psi_cpu_show(struct seq_file *m, void *v)
{
        return psi_show(m, &psi_system, PSI_CPU);
}

static int psi_open(struct file *file, int (*psi_show)(struct seq_file *, void *))
{
        if (file->f_mode & FMODE_WRITE && !capable(CAP_SYS_RESOURCE))
                return -EPERM;

        return single_open(file, psi_show, NULL);
}

static int psi_io_open(struct inode *inode, struct file *file)
{
        return psi_open(file, psi_io_show);
}

static int psi_memory_open(struct inode *inode, struct file *file)
{
        return psi_open(file, psi_memory_show);
}

static int psi_cpu_open(struct inode *inode, struct file *file)
{
        return psi_open(file, psi_cpu_show);
}

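/*
 * Common write handler for the /proc/pressure files: parses the trigger
 * string, creates the trigger against the system group, and stashes it in
 * the seq_file's private pointer.  Only one trigger per open file descriptor
 * is allowed.
 */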
static ssize_t psi_write(struct file *file, const char __user *user_buf,
                         size_t nbytes, enum psi_res res)
{
        char buf[32];
        size_t buf_size;
        struct seq_file *seq;
        struct psi_trigger *new;

        if (static_branch_likely(&psi_disabled))
                return -EOPNOTSUPP;

        if (!nbytes)
                return -EINVAL;

        buf_size = min(nbytes, sizeof(buf));
        if (copy_from_user(buf, user_buf, buf_size))
                return -EFAULT;

        buf[buf_size - 1] = '\0';

        seq = file->private_data;

        mutex_lock(&seq->lock);

        if (seq->private) {
                mutex_unlock(&seq->lock);
                return -EBUSY;
        }

        new = psi_trigger_create(&psi_system, buf, res);
        if (IS_ERR(new)) {
                mutex_unlock(&seq->lock);
                return PTR_ERR(new);
        }

        smp_store_release(&seq->private, new);
        mutex_unlock(&seq->lock);

        return nbytes;
}

static ssize_t psi_io_write(struct file *file, const char __user *user_buf,
                            size_t nbytes, loff_t *ppos)
{
        return psi_write(file, user_buf, nbytes, PSI_IO);
}

static ssize_t psi_memory_write(struct file *file, const char __user *user_buf,
                                size_t nbytes, loff_t *ppos)
{
        return psi_write(file, user_buf, nbytes, PSI_MEM);
}

static ssize_t psi_cpu_write(struct file *file, const char __user *user_buf,
                             size_t nbytes, loff_t *ppos)
{
        return psi_write(file, user_buf, nbytes, PSI_CPU);
}

static __poll_t psi_fop_poll(struct file *file, poll_table *wait)
{
        struct seq_file *seq = file->private_data;

        return psi_trigger_poll(&seq->private, file, wait);
}

static int psi_fop_release(struct inode *inode, struct file *file)
{
        struct seq_file *seq = file->private_data;

        psi_trigger_destroy(seq->private);
        return single_release(inode, file);
}

static const struct proc_ops psi_io_proc_ops = {
        .proc_open = psi_io_open,
        .proc_read = seq_read,
        .proc_lseek = seq_lseek,
        .proc_write = psi_io_write,
        .proc_poll = psi_fop_poll,
        .proc_release = psi_fop_release,
};

static const struct proc_ops psi_memory_proc_ops = {
        .proc_open = psi_memory_open,
        .proc_read = seq_read,
        .proc_lseek = seq_lseek,
        .proc_write = psi_memory_write,
        .proc_poll = psi_fop_poll,
        .proc_release = psi_fop_release,
};

static const struct proc_ops psi_cpu_proc_ops = {
        .proc_open = psi_cpu_open,
        .proc_read = seq_read,
        .proc_lseek = seq_lseek,
        .proc_write = psi_cpu_write,
        .proc_poll = psi_fop_poll,
        .proc_release = psi_fop_release,
};

static int __init psi_proc_init(void)
{
        if (psi_enable) {
                proc_mkdir("pressure", NULL);
                proc_create("pressure/io", 0666, NULL, &psi_io_proc_ops);
                proc_create("pressure/memory", 0666, NULL, &psi_memory_proc_ops);
                proc_create("pressure/cpu", 0666, NULL, &psi_cpu_proc_ops);
        }
        return 0;
}
module_init(psi_proc_init);

#endif