0001 // SPDX-License-Identifier: GPL-2.0
0002 /*
0003  * Pressure stall information for CPU, memory and IO
0004  *
0005  * Copyright (c) 2018 Facebook, Inc.
0006  * Author: Johannes Weiner <hannes@cmpxchg.org>
0007  *
0008  * Polling support by Suren Baghdasaryan <surenb@google.com>
0009  * Copyright (c) 2018 Google, Inc.
0010  *
0011  * When CPU, memory and IO are contended, tasks experience delays that
0012  * reduce throughput and introduce latencies into the workload. Memory
0013  * and IO contention, in addition, can cause a full loss of forward
0014  * progress in which the CPU goes idle.
0015  *
0016  * This code aggregates individual task delays into resource pressure
0017  * metrics that indicate problems with both workload health and
0018  * resource utilization.
0019  *
0020  *          Model
0021  *
0022  * The time in which a task can execute on a CPU is our baseline for
0023  * productivity. Pressure expresses the amount of time in which this
0024  * potential cannot be realized due to resource contention.
0025  *
0026  * This concept of productivity has two components: the workload and
0027  * the CPU. To measure the impact of pressure on both, we define two
0028  * contention states for a resource: SOME and FULL.
0029  *
0030  * In the SOME state of a given resource, one or more tasks are
0031  * delayed on that resource. This affects the workload's ability to
0032  * perform work, but the CPU may still be executing other tasks.
0033  *
0034  * In the FULL state of a given resource, all non-idle tasks are
0035  * delayed on that resource such that nobody is advancing and the CPU
0036  * goes idle. This leaves both workload and CPU unproductive.
0037  *
0038  *  SOME = nr_delayed_tasks != 0
0039  *  FULL = nr_delayed_tasks != 0 && nr_productive_tasks == 0
0040  *
0041  * What it means for a task to be productive is defined differently
0042  * for each resource. For IO, productive means a running task. For
0043  * memory, productive means a running task that isn't a reclaimer. For
0044  * CPU, productive means an oncpu task.
0045  *
0046  * Naturally, the FULL state doesn't exist for the CPU resource at the
0047  * system level, but exists at the cgroup level. At the cgroup level,
0048  * FULL means all non-idle tasks in the cgroup are delayed on the CPU
0049  * resource which is being used by others outside of the cgroup or
0050  * throttled by the cgroup cpu.max configuration.
0051  *
0052  * The percentage of wallclock time spent in those compound stall
0053  * states gives pressure numbers between 0 and 100 for each resource,
0054  * where the SOME percentage indicates workload slowdowns and the FULL
0055  * percentage indicates reduced CPU utilization:
0056  *
0057  *  %SOME = time(SOME) / period
0058  *  %FULL = time(FULL) / period
0059  *
0060  *          Multiple CPUs
0061  *
0062  * The more tasks and available CPUs there are, the more work can be
0063  * performed concurrently. This means that the potential that can go
0064  * unrealized due to resource contention *also* scales with non-idle
0065  * tasks and CPUs.
0066  *
0067  * Consider a scenario where 257 number crunching tasks are trying to
0068  * run concurrently on 256 CPUs. If we simply aggregated the task
0069  * states, we would have to conclude a CPU SOME pressure number of
0070  * 100%, since *somebody* is waiting on a runqueue at all
0071  * times. However, that is clearly not the amount of contention the
0072  * workload is experiencing: only one out of 256 possible execution
0073  * threads will be contended at any given time, or about 0.4%.
0074  *
0075  * Conversely, consider a scenario of 4 tasks and 4 CPUs where at any
0076  * given time *one* of the tasks is delayed due to a lack of memory.
0077  * Again, looking purely at the task state would yield a memory FULL
0078  * pressure number of 0%, since *somebody* is always making forward
0079  * progress. But again this wouldn't capture the amount of execution
0080  * potential lost, which is 1 out of 4 CPUs, or 25%.
0081  *
0082  * To calculate wasted potential (pressure) with multiple processors,
0083  * we have to base our calculation on the number of non-idle tasks in
0084  * conjunction with the number of available CPUs, which is the number
0085  * of potential execution threads. SOME becomes then the proportion of
0086  * delayed tasks to possible threads, and FULL is the share of possible
0087  * threads that are unproductive due to delays:
0088  *
0089  *  threads = min(nr_nonidle_tasks, nr_cpus)
0090  *     SOME = min(nr_delayed_tasks / threads, 1)
0091  *     FULL = (threads - min(nr_productive_tasks, threads)) / threads
0092  *
0093  * For the 257 number crunchers on 256 CPUs, this yields:
0094  *
0095  *  threads = min(257, 256)
0096  *     SOME = min(1 / 256, 1)             = 0.4%
0097  *     FULL = (256 - min(256, 256)) / 256 = 0%
0098  *
0099  * For the 1 out of 4 memory-delayed tasks, this yields:
0100  *
0101  *  threads = min(4, 4)
0102  *     SOME = min(1 / 4, 1)               = 25%
0103  *     FULL = (4 - min(3, 4)) / 4         = 25%
0104  *
0105  * [ Substitute nr_cpus with 1, and you can see that it's a natural
0106  *   extension of the single-CPU model. ]
0107  *
0108  *          Implementation
0109  *
0110  * To assess the precise time spent in each such state, we would have
0111  * to freeze the system on task changes and start/stop the state
0112  * clocks accordingly. Obviously that doesn't scale in practice.
0113  *
0114  * Because the scheduler aims to distribute the compute load evenly
0115  * among the available CPUs, we can track task state locally to each
0116  * CPU and, at much lower frequency, extrapolate the global state for
0117  * the cumulative stall times and the running averages.
0118  *
0119  * For each runqueue, we track:
0120  *
0121  *     tSOME[cpu] = time(nr_delayed_tasks[cpu] != 0)
0122  *     tFULL[cpu] = time(nr_delayed_tasks[cpu] && !nr_productive_tasks[cpu])
0123  *  tNONIDLE[cpu] = time(nr_nonidle_tasks[cpu] != 0)
0124  *
0125  * and then periodically aggregate:
0126  *
0127  *  tNONIDLE = sum(tNONIDLE[i])
0128  *
0129  *     tSOME = sum(tSOME[i] * tNONIDLE[i]) / tNONIDLE
0130  *     tFULL = sum(tFULL[i] * tNONIDLE[i]) / tNONIDLE
0131  *
0132  *     %SOME = tSOME / period
0133  *     %FULL = tFULL / period
0134  *
0135  * This gives us an approximation of pressure that is practical
0136  * cost-wise, yet way more sensitive and accurate than periodic
0137  * sampling of the aggregate task states would be.
0138  */
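/*
 * Illustrative sketch (not kernel code): the per-period model above,
 * written out as plain C for a single sampling period. The helper
 * names and the floating-point math are assumptions made for this
 * example only; the kernel never computes these ratios directly but
 * accumulates state *times* per CPU instead, see get_recent_times()
 * and collect_percpu_times() below.
 *
 *	static double psi_some_pct(double nr_delayed, double nr_nonidle,
 *				   double nr_cpus)
 *	{
 *		double threads = nr_nonidle < nr_cpus ? nr_nonidle : nr_cpus;
 *		double some = nr_delayed / threads;
 *
 *		return 100.0 * (some < 1.0 ? some : 1.0);
 *	}
 *
 *	static double psi_full_pct(double nr_productive, double nr_nonidle,
 *				   double nr_cpus)
 *	{
 *		double threads = nr_nonidle < nr_cpus ? nr_nonidle : nr_cpus;
 *		double prod = nr_productive < threads ? nr_productive : threads;
 *
 *		return 100.0 * (threads - prod) / threads;
 *	}
 *
 * Both helpers assume at least one non-idle task (threads > 0).
 * psi_some_pct(1, 257, 256) ~= 0.4 and psi_full_pct(3, 4, 4) == 25.0,
 * matching the two worked examples in the comment above.
 */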
0139 
0140 static int psi_bug __read_mostly;
0141 
0142 DEFINE_STATIC_KEY_FALSE(psi_disabled);
0143 DEFINE_STATIC_KEY_TRUE(psi_cgroups_enabled);
0144 
0145 #ifdef CONFIG_PSI_DEFAULT_DISABLED
0146 static bool psi_enable;
0147 #else
0148 static bool psi_enable = true;
0149 #endif
0150 static int __init setup_psi(char *str)
0151 {
0152     return kstrtobool(str, &psi_enable) == 0;
0153 }
0154 __setup("psi=", setup_psi);
0155 
0156 /* Running averages - we need to be higher-res than loadavg */
0157 #define PSI_FREQ    (2*HZ+1)    /* 2 sec intervals */
0158 #define EXP_10s     1677        /* 1/exp(2s/10s) as fixed-point */
0159 #define EXP_60s     1981        /* 1/exp(2s/60s) */
0160 #define EXP_300s    2034        /* 1/exp(2s/300s) */
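/*
 * Derivation of the constants above (worked out here, not normative):
 * each average update multiplies the old value by exp(-PSI_FREQ/X) in
 * calc_load()'s 11-bit fixed point, where FIXED_1 == 2048:
 *
 *	EXP_10s  ~= 2048 * exp(-2/10)  ~= 2048 * 0.8187 ~= 1677
 *	EXP_60s  ~= 2048 * exp(-2/60)  ~= 2048 * 0.9672 ~= 1981
 *	EXP_300s ~= 2048 * exp(-2/300) ~= 2048 * 0.9934 ~= 2034
 */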
0161 
0162 /* PSI trigger definitions */
0163 #define WINDOW_MIN_US 500000    /* Min window size is 500ms */
0164 #define WINDOW_MAX_US 10000000  /* Max window size is 10s */
0165 #define UPDATES_PER_WINDOW 10   /* 10 updates per window */
0166 
0167 /* Sampling frequency in nanoseconds */
0168 static u64 psi_period __read_mostly;
0169 
0170 /* System-level pressure and stall tracking */
0171 static DEFINE_PER_CPU(struct psi_group_cpu, system_group_pcpu);
0172 struct psi_group psi_system = {
0173     .pcpu = &system_group_pcpu,
0174 };
0175 
0176 static void psi_avgs_work(struct work_struct *work);
0177 
0178 static void poll_timer_fn(struct timer_list *t);
0179 
0180 static void group_init(struct psi_group *group)
0181 {
0182     int cpu;
0183 
0184     for_each_possible_cpu(cpu)
0185         seqcount_init(&per_cpu_ptr(group->pcpu, cpu)->seq);
0186     group->avg_last_update = sched_clock();
0187     group->avg_next_update = group->avg_last_update + psi_period;
0188     INIT_DELAYED_WORK(&group->avgs_work, psi_avgs_work);
0189     mutex_init(&group->avgs_lock);
0190     /* Init trigger-related members */
0191     mutex_init(&group->trigger_lock);
0192     INIT_LIST_HEAD(&group->triggers);
0193     group->poll_min_period = U32_MAX;
0194     group->polling_next_update = ULLONG_MAX;
0195     init_waitqueue_head(&group->poll_wait);
0196     timer_setup(&group->poll_timer, poll_timer_fn, 0);
0197     rcu_assign_pointer(group->poll_task, NULL);
0198 }
0199 
0200 void __init psi_init(void)
0201 {
0202     if (!psi_enable) {
0203         static_branch_enable(&psi_disabled);
0204         return;
0205     }
0206 
0207     if (!cgroup_psi_enabled())
0208         static_branch_disable(&psi_cgroups_enabled);
0209 
0210     psi_period = jiffies_to_nsecs(PSI_FREQ);
0211     group_init(&psi_system);
0212 }
0213 
0214 static bool test_state(unsigned int *tasks, enum psi_states state)
0215 {
0216     switch (state) {
0217     case PSI_IO_SOME:
0218         return unlikely(tasks[NR_IOWAIT]);
0219     case PSI_IO_FULL:
0220         return unlikely(tasks[NR_IOWAIT] && !tasks[NR_RUNNING]);
0221     case PSI_MEM_SOME:
0222         return unlikely(tasks[NR_MEMSTALL]);
0223     case PSI_MEM_FULL:
0224         return unlikely(tasks[NR_MEMSTALL] &&
0225             tasks[NR_RUNNING] == tasks[NR_MEMSTALL_RUNNING]);
0226     case PSI_CPU_SOME:
0227         return unlikely(tasks[NR_RUNNING] > tasks[NR_ONCPU]);
0228     case PSI_CPU_FULL:
0229         return unlikely(tasks[NR_RUNNING] && !tasks[NR_ONCPU]);
0230     case PSI_NONIDLE:
0231         return tasks[NR_IOWAIT] || tasks[NR_MEMSTALL] ||
0232             tasks[NR_RUNNING];
0233     default:
0234         return false;
0235     }
0236 }
0237 
0238 static void get_recent_times(struct psi_group *group, int cpu,
0239                  enum psi_aggregators aggregator, u32 *times,
0240                  u32 *pchanged_states)
0241 {
0242     struct psi_group_cpu *groupc = per_cpu_ptr(group->pcpu, cpu);
0243     u64 now, state_start;
0244     enum psi_states s;
0245     unsigned int seq;
0246     u32 state_mask;
0247 
0248     *pchanged_states = 0;
0249 
0250     /* Snapshot a coherent view of the CPU state */
0251     do {
0252         seq = read_seqcount_begin(&groupc->seq);
0253         now = cpu_clock(cpu);
0254         memcpy(times, groupc->times, sizeof(groupc->times));
0255         state_mask = groupc->state_mask;
0256         state_start = groupc->state_start;
0257     } while (read_seqcount_retry(&groupc->seq, seq));
0258 
0259     /* Calculate state time deltas against the previous snapshot */
0260     for (s = 0; s < NR_PSI_STATES; s++) {
0261         u32 delta;
0262         /*
0263          * In addition to already concluded states, we also
0264          * incorporate currently active states on the CPU,
0265          * since states may last for many sampling periods.
0266          *
0267          * This way we keep our delta sampling buckets small
0268          * (u32) and our reported pressure close to what's
0269          * actually happening.
0270          */
0271         if (state_mask & (1 << s))
0272             times[s] += now - state_start;
0273 
0274         delta = times[s] - groupc->times_prev[aggregator][s];
0275         groupc->times_prev[aggregator][s] = times[s];
0276 
0277         times[s] = delta;
0278         if (delta)
0279             *pchanged_states |= (1 << s);
0280     }
0281 }
0282 
0283 static void calc_avgs(unsigned long avg[3], int missed_periods,
0284               u64 time, u64 period)
0285 {
0286     unsigned long pct;
0287 
0288     /* Fill in zeroes for periods of no activity */
0289     if (missed_periods) {
0290         avg[0] = calc_load_n(avg[0], EXP_10s, 0, missed_periods);
0291         avg[1] = calc_load_n(avg[1], EXP_60s, 0, missed_periods);
0292         avg[2] = calc_load_n(avg[2], EXP_300s, 0, missed_periods);
0293     }
0294 
0295     /* Sample the most recent active period */
0296     pct = div_u64(time * 100, period);
0297     pct *= FIXED_1;
0298     avg[0] = calc_load(avg[0], EXP_10s, pct);
0299     avg[1] = calc_load(avg[1], EXP_60s, pct);
0300     avg[2] = calc_load(avg[2], EXP_300s, pct);
0301 }
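/*
 * For reference, calc_load() from <linux/sched/loadavg.h> computes,
 * approximately and in the same FIXED_1 == 2048 fixed point:
 *
 *	avg' = (avg * exp + pct * (FIXED_1 - exp)) / FIXED_1
 *
 * i.e. an exponentially weighted moving average, so a sustained
 * constant pct pulls the 10s/60s/300s averages toward that value with
 * the corresponding time constants.
 */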
0302 
0303 static void collect_percpu_times(struct psi_group *group,
0304                  enum psi_aggregators aggregator,
0305                  u32 *pchanged_states)
0306 {
0307     u64 deltas[NR_PSI_STATES - 1] = { 0, };
0308     unsigned long nonidle_total = 0;
0309     u32 changed_states = 0;
0310     int cpu;
0311     int s;
0312 
0313     /*
0314      * Collect the per-cpu time buckets and average them into a
0315      * single time sample that is normalized to wallclock time.
0316      *
0317      * For averaging, each CPU is weighted by its non-idle time in
0318      * the sampling period. This eliminates artifacts from uneven
0319      * loading, or even entirely idle CPUs.
0320      */
0321     for_each_possible_cpu(cpu) {
0322         u32 times[NR_PSI_STATES];
0323         u32 nonidle;
0324         u32 cpu_changed_states;
0325 
0326         get_recent_times(group, cpu, aggregator, times,
0327                 &cpu_changed_states);
0328         changed_states |= cpu_changed_states;
0329 
0330         nonidle = nsecs_to_jiffies(times[PSI_NONIDLE]);
0331         nonidle_total += nonidle;
0332 
0333         for (s = 0; s < PSI_NONIDLE; s++)
0334             deltas[s] += (u64)times[s] * nonidle;
0335     }
0336 
0337     /*
0338      * Integrate the sample into the running statistics that are
0339      * reported to userspace: the cumulative stall times and the
0340      * decaying averages.
0341      *
0342      * Pressure percentages are sampled at PSI_FREQ. We might be
0343      * called more often when the user polls more frequently than
0344      * that; we might be called less often when there is no task
0345      * activity, thus no data, and clock ticks are sporadic. The
0346      * below handles both.
0347      */
0348 
0349     /* total= */
0350     for (s = 0; s < NR_PSI_STATES - 1; s++)
0351         group->total[aggregator][s] +=
0352                 div_u64(deltas[s], max(nonidle_total, 1UL));
0353 
0354     if (pchanged_states)
0355         *pchanged_states = changed_states;
0356 }
0357 
0358 static u64 update_averages(struct psi_group *group, u64 now)
0359 {
0360     unsigned long missed_periods = 0;
0361     u64 expires, period;
0362     u64 avg_next_update;
0363     int s;
0364 
0365     /* avgX= */
0366     expires = group->avg_next_update;
0367     if (now - expires >= psi_period)
0368         missed_periods = div_u64(now - expires, psi_period);
0369 
0370     /*
0371      * The periodic clock tick can get delayed for various
0372      * reasons, especially on loaded systems. To avoid clock
0373      * drift, we schedule the clock in fixed psi_period intervals.
0374      * But the deltas we sample out of the per-cpu buckets above
0375      * are based on the actual time elapsing between clock ticks.
0376      */
0377     avg_next_update = expires + ((1 + missed_periods) * psi_period);
0378     period = now - (group->avg_last_update + (missed_periods * psi_period));
0379     group->avg_last_update = now;
0380 
0381     for (s = 0; s < NR_PSI_STATES - 1; s++) {
0382         u32 sample;
0383 
0384         sample = group->total[PSI_AVGS][s] - group->avg_total[s];
0385         /*
0386          * Due to the lockless sampling of the time buckets,
0387          * recorded time deltas can slip into the next period,
0388          * which under full pressure can result in samples in
0389          * excess of the period length.
0390          *
0391          * We don't want to report non-sensical pressures in
0392          * excess of 100%, nor do we want to drop such events
0393          * on the floor. Instead we punt any overage into the
0394          * future until pressure subsides. By doing this we
0395          * don't underreport the occurring pressure curve, we
0396          * just report it delayed by one period length.
0397          *
0398          * The error isn't cumulative. As soon as another
0399          * delta slips from a period P to P+1, by definition
0400          * it frees up its time T in P.
0401          */
0402         if (sample > period)
0403             sample = period;
0404         group->avg_total[s] += sample;
0405         calc_avgs(group->avg[s], missed_periods, sample, period);
0406     }
0407 
0408     return avg_next_update;
0409 }
0410 
0411 static void psi_avgs_work(struct work_struct *work)
0412 {
0413     struct delayed_work *dwork;
0414     struct psi_group *group;
0415     u32 changed_states;
0416     bool nonidle;
0417     u64 now;
0418 
0419     dwork = to_delayed_work(work);
0420     group = container_of(dwork, struct psi_group, avgs_work);
0421 
0422     mutex_lock(&group->avgs_lock);
0423 
0424     now = sched_clock();
0425 
0426     collect_percpu_times(group, PSI_AVGS, &changed_states);
0427     nonidle = changed_states & (1 << PSI_NONIDLE);
0428     /*
0429      * If there is task activity, periodically fold the per-cpu
0430      * times and feed samples into the running averages. If things
0431      * are idle and there is no data to process, stop the clock.
0432      * Once restarted, we'll catch up the running averages in one
0433      * go - see calc_avgs() and missed_periods.
0434      */
0435     if (now >= group->avg_next_update)
0436         group->avg_next_update = update_averages(group, now);
0437 
0438     if (nonidle) {
0439         schedule_delayed_work(dwork, nsecs_to_jiffies(
0440                 group->avg_next_update - now) + 1);
0441     }
0442 
0443     mutex_unlock(&group->avgs_lock);
0444 }
0445 
0446 /* Trigger tracking window manipulations */
0447 static void window_reset(struct psi_window *win, u64 now, u64 value,
0448              u64 prev_growth)
0449 {
0450     win->start_time = now;
0451     win->start_value = value;
0452     win->prev_growth = prev_growth;
0453 }
0454 
0455 /*
0456  * PSI growth tracking window update and growth calculation routine.
0457  *
0458  * This approximates a sliding tracking window by interpolating
0459  * partially elapsed windows using historical growth data from the
0460  * previous intervals. This minimizes memory requirements (by not storing
0461  * all the intermediate values in the previous window) and simplifies
0462  * the calculations. It works well because the PSI signal only ever
0463  * grows, and over relatively small window sizes its growth is close
0464  * to linear.
0465  */
0466 static u64 window_update(struct psi_window *win, u64 now, u64 value)
0467 {
0468     u64 elapsed;
0469     u64 growth;
0470 
0471     elapsed = now - win->start_time;
0472     growth = value - win->start_value;
0473     /*
0474      * After each tracking window passes, win->start_value and
0475      * win->start_time get reset and win->prev_growth stores
0476      * the average per-window growth of the previous window.
0477      * win->prev_growth is then used to interpolate additional
0478      * growth from the previous window assuming it was linear.
0479      */
0480     if (elapsed > win->size)
0481         window_reset(win, now, value, growth);
0482     else {
0483         u32 remaining;
0484 
0485         remaining = win->size - elapsed;
0486         growth += div64_u64(win->prev_growth * remaining, win->size);
0487     }
0488 
0489     return growth;
0490 }
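/*
 * Worked example with illustrative numbers: win->size = 1s, the
 * previous window grew by 300ms of stall (prev_growth), and we are
 * 600ms into the current window with 100ms of new growth. Then
 * remaining = 400ms and the reported growth is
 *
 *	100ms + 300ms * 400ms / 1000ms = 220ms
 *
 * which approximates the growth over the most recent full 1s span by
 * assuming the previous window's growth was linear.
 */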
0491 
0492 static void init_triggers(struct psi_group *group, u64 now)
0493 {
0494     struct psi_trigger *t;
0495 
0496     list_for_each_entry(t, &group->triggers, node)
0497         window_reset(&t->win, now,
0498                 group->total[PSI_POLL][t->state], 0);
0499     memcpy(group->polling_total, group->total[PSI_POLL],
0500            sizeof(group->polling_total));
0501     group->polling_next_update = now + group->poll_min_period;
0502 }
0503 
0504 static u64 update_triggers(struct psi_group *group, u64 now)
0505 {
0506     struct psi_trigger *t;
0507     bool update_total = false;
0508     u64 *total = group->total[PSI_POLL];
0509 
0510     /*
0511      * On subsequent updates, calculate growth deltas and let
0512      * watchers know when their specified thresholds are exceeded.
0513      */
0514     list_for_each_entry(t, &group->triggers, node) {
0515         u64 growth;
0516         bool new_stall;
0517 
0518         new_stall = group->polling_total[t->state] != total[t->state];
0519 
0520         /* Check for stall activity or a previous threshold breach */
0521         if (!new_stall && !t->pending_event)
0522             continue;
0523         /*
0524          * Check for new stall activity, as well as deferred
0525          * events that occurred in the last window after the
0526          * trigger had already fired (we want to ratelimit
0527          * events without dropping any).
0528          */
0529         if (new_stall) {
0530             /*
0531              * Multiple triggers might be looking at the same state,
0532              * remember to update group->polling_total[] once we've
0533              * been through all of them. Also remember to extend the
0534              * polling time if we see new stall activity.
0535              */
0536             update_total = true;
0537 
0538             /* Calculate growth since last update */
0539             growth = window_update(&t->win, now, total[t->state]);
0540             if (growth < t->threshold)
0541                 continue;
0542 
0543             t->pending_event = true;
0544         }
0545         /* Limit event signaling to once per window */
0546         if (now < t->last_event_time + t->win.size)
0547             continue;
0548 
0549         /* Generate an event */
0550         if (cmpxchg(&t->event, 0, 1) == 0)
0551             wake_up_interruptible(&t->event_wait);
0552         t->last_event_time = now;
0553         /* Reset threshold breach flag once event got generated */
0554         t->pending_event = false;
0555     }
0556 
0557     if (update_total)
0558         memcpy(group->polling_total, total,
0559                 sizeof(group->polling_total));
0560 
0561     return now + group->poll_min_period;
0562 }
0563 
0564 /* Schedule polling if it's not already scheduled. */
0565 static void psi_schedule_poll_work(struct psi_group *group, unsigned long delay)
0566 {
0567     struct task_struct *task;
0568 
0569     /*
0570      * Do not reschedule if already scheduled.
0571      * Possible race with a timer scheduled after this check but before
0572      * mod_timer below can be tolerated because group->polling_next_update
0573      * will keep updates on schedule.
0574      */
0575     if (timer_pending(&group->poll_timer))
0576         return;
0577 
0578     rcu_read_lock();
0579 
0580     task = rcu_dereference(group->poll_task);
0581     /*
0582      * kworker might be NULL in case psi_trigger_destroy races with
0583      * psi_task_change (hotpath) which can't use locks
0584      */
0585     if (likely(task))
0586         mod_timer(&group->poll_timer, jiffies + delay);
0587 
0588     rcu_read_unlock();
0589 }
0590 
0591 static void psi_poll_work(struct psi_group *group)
0592 {
0593     u32 changed_states;
0594     u64 now;
0595 
0596     mutex_lock(&group->trigger_lock);
0597 
0598     now = sched_clock();
0599 
0600     collect_percpu_times(group, PSI_POLL, &changed_states);
0601 
0602     if (changed_states & group->poll_states) {
0603         /* Initialize trigger windows when entering polling mode */
0604         if (now > group->polling_until)
0605             init_triggers(group, now);
0606 
0607         /*
0608          * Keep the monitor active for at least the duration of the
0609          * minimum tracking window as long as monitor states are
0610          * changing.
0611          */
0612         group->polling_until = now +
0613             group->poll_min_period * UPDATES_PER_WINDOW;
0614     }
0615 
0616     if (now > group->polling_until) {
0617         group->polling_next_update = ULLONG_MAX;
0618         goto out;
0619     }
0620 
0621     if (now >= group->polling_next_update)
0622         group->polling_next_update = update_triggers(group, now);
0623 
0624     psi_schedule_poll_work(group,
0625         nsecs_to_jiffies(group->polling_next_update - now) + 1);
0626 
0627 out:
0628     mutex_unlock(&group->trigger_lock);
0629 }
0630 
0631 static int psi_poll_worker(void *data)
0632 {
0633     struct psi_group *group = (struct psi_group *)data;
0634 
0635     sched_set_fifo_low(current);
0636 
0637     while (true) {
0638         wait_event_interruptible(group->poll_wait,
0639                 atomic_cmpxchg(&group->poll_wakeup, 1, 0) ||
0640                 kthread_should_stop());
0641         if (kthread_should_stop())
0642             break;
0643 
0644         psi_poll_work(group);
0645     }
0646     return 0;
0647 }
0648 
0649 static void poll_timer_fn(struct timer_list *t)
0650 {
0651     struct psi_group *group = from_timer(group, t, poll_timer);
0652 
0653     atomic_set(&group->poll_wakeup, 1);
0654     wake_up_interruptible(&group->poll_wait);
0655 }
0656 
0657 static void record_times(struct psi_group_cpu *groupc, u64 now)
0658 {
0659     u32 delta;
0660 
0661     delta = now - groupc->state_start;
0662     groupc->state_start = now;
0663 
0664     if (groupc->state_mask & (1 << PSI_IO_SOME)) {
0665         groupc->times[PSI_IO_SOME] += delta;
0666         if (groupc->state_mask & (1 << PSI_IO_FULL))
0667             groupc->times[PSI_IO_FULL] += delta;
0668     }
0669 
0670     if (groupc->state_mask & (1 << PSI_MEM_SOME)) {
0671         groupc->times[PSI_MEM_SOME] += delta;
0672         if (groupc->state_mask & (1 << PSI_MEM_FULL))
0673             groupc->times[PSI_MEM_FULL] += delta;
0674     }
0675 
0676     if (groupc->state_mask & (1 << PSI_CPU_SOME)) {
0677         groupc->times[PSI_CPU_SOME] += delta;
0678         if (groupc->state_mask & (1 << PSI_CPU_FULL))
0679             groupc->times[PSI_CPU_FULL] += delta;
0680     }
0681 
0682     if (groupc->state_mask & (1 << PSI_NONIDLE))
0683         groupc->times[PSI_NONIDLE] += delta;
0684 }
0685 
0686 static void psi_group_change(struct psi_group *group, int cpu,
0687                  unsigned int clear, unsigned int set, u64 now,
0688                  bool wake_clock)
0689 {
0690     struct psi_group_cpu *groupc;
0691     u32 state_mask = 0;
0692     unsigned int t, m;
0693     enum psi_states s;
0694 
0695     groupc = per_cpu_ptr(group->pcpu, cpu);
0696 
0697     /*
0698      * First we assess the aggregate resource states this CPU's
0699      * tasks have been in since the last change, and account any
0700      * SOME and FULL time these may have resulted in.
0701      *
0702      * Then we update the task counts according to the state
0703      * change requested through the @clear and @set bits.
0704      */
0705     write_seqcount_begin(&groupc->seq);
0706 
0707     record_times(groupc, now);
0708 
0709     for (t = 0, m = clear; m; m &= ~(1 << t), t++) {
0710         if (!(m & (1 << t)))
0711             continue;
0712         if (groupc->tasks[t]) {
0713             groupc->tasks[t]--;
0714         } else if (!psi_bug) {
0715             printk_deferred(KERN_ERR "psi: task underflow! cpu=%d t=%d tasks=[%u %u %u %u %u] clear=%x set=%x\n",
0716                     cpu, t, groupc->tasks[0],
0717                     groupc->tasks[1], groupc->tasks[2],
0718                     groupc->tasks[3], groupc->tasks[4],
0719                     clear, set);
0720             psi_bug = 1;
0721         }
0722     }
0723 
0724     for (t = 0; set; set &= ~(1 << t), t++)
0725         if (set & (1 << t))
0726             groupc->tasks[t]++;
0727 
0728     /* Calculate state mask representing active states */
0729     for (s = 0; s < NR_PSI_STATES; s++) {
0730         if (test_state(groupc->tasks, s))
0731             state_mask |= (1 << s);
0732     }
0733 
0734     /*
0735      * Since we care about lost potential, a memstall is FULL
0736      * when there are no other working tasks, but also when
0737      * the CPU is actively reclaiming and nothing productive
0738      * could run even if it were runnable. So when the current
0739      * task in a cgroup is in_memstall, the corresponding groupc
0740      * on that cpu is in PSI_MEM_FULL state.
0741      */
0742     if (unlikely(groupc->tasks[NR_ONCPU] && cpu_curr(cpu)->in_memstall))
0743         state_mask |= (1 << PSI_MEM_FULL);
0744 
0745     groupc->state_mask = state_mask;
0746 
0747     write_seqcount_end(&groupc->seq);
0748 
0749     if (state_mask & group->poll_states)
0750         psi_schedule_poll_work(group, 1);
0751 
0752     if (wake_clock && !delayed_work_pending(&group->avgs_work))
0753         schedule_delayed_work(&group->avgs_work, PSI_FREQ);
0754 }
0755 
0756 static struct psi_group *iterate_groups(struct task_struct *task, void **iter)
0757 {
0758     if (*iter == &psi_system)
0759         return NULL;
0760 
0761 #ifdef CONFIG_CGROUPS
0762     if (static_branch_likely(&psi_cgroups_enabled)) {
0763         struct cgroup *cgroup = NULL;
0764 
0765         if (!*iter)
0766             cgroup = task->cgroups->dfl_cgrp;
0767         else
0768             cgroup = cgroup_parent(*iter);
0769 
0770         if (cgroup && cgroup_parent(cgroup)) {
0771             *iter = cgroup;
0772             return cgroup_psi(cgroup);
0773         }
0774     }
0775 #endif
0776     *iter = &psi_system;
0777     return &psi_system;
0778 }
0779 
0780 static void psi_flags_change(struct task_struct *task, int clear, int set)
0781 {
0782     if (((task->psi_flags & set) ||
0783          (task->psi_flags & clear) != clear) &&
0784         !psi_bug) {
0785         printk_deferred(KERN_ERR "psi: inconsistent task state! task=%d:%s cpu=%d psi_flags=%x clear=%x set=%x\n",
0786                 task->pid, task->comm, task_cpu(task),
0787                 task->psi_flags, clear, set);
0788         psi_bug = 1;
0789     }
0790 
0791     task->psi_flags &= ~clear;
0792     task->psi_flags |= set;
0793 }
0794 
0795 void psi_task_change(struct task_struct *task, int clear, int set)
0796 {
0797     int cpu = task_cpu(task);
0798     struct psi_group *group;
0799     bool wake_clock = true;
0800     void *iter = NULL;
0801     u64 now;
0802 
0803     if (!task->pid)
0804         return;
0805 
0806     psi_flags_change(task, clear, set);
0807 
0808     now = cpu_clock(cpu);
0809     /*
0810      * Periodic aggregation shuts off if there is a period of no
0811      * task changes, so we wake it back up if necessary. However,
0812      * don't do this if the task change is the aggregation worker
0813      * itself going to sleep, or we'll ping-pong forever.
0814      */
0815     if (unlikely((clear & TSK_RUNNING) &&
0816              (task->flags & PF_WQ_WORKER) &&
0817              wq_worker_last_func(task) == psi_avgs_work))
0818         wake_clock = false;
0819 
0820     while ((group = iterate_groups(task, &iter)))
0821         psi_group_change(group, cpu, clear, set, now, wake_clock);
0822 }
0823 
0824 void psi_task_switch(struct task_struct *prev, struct task_struct *next,
0825              bool sleep)
0826 {
0827     struct psi_group *group, *common = NULL;
0828     int cpu = task_cpu(prev);
0829     void *iter;
0830     u64 now = cpu_clock(cpu);
0831 
0832     if (next->pid) {
0833         bool identical_state;
0834 
0835         psi_flags_change(next, 0, TSK_ONCPU);
0836         /*
0837          * When switching between tasks with an identical runtime state,
0838          * the cgroup containing both tasks does not change: we can stop
0839          * updating the tree once we reach the first common ancestor.
0840          * Iterate @next's ancestors only until we encounter @prev's ONCPU.
0841          */
0842         identical_state = prev->psi_flags == next->psi_flags;
0843         iter = NULL;
0844         while ((group = iterate_groups(next, &iter))) {
0845             if (identical_state &&
0846                 per_cpu_ptr(group->pcpu, cpu)->tasks[NR_ONCPU]) {
0847                 common = group;
0848                 break;
0849             }
0850 
0851             psi_group_change(group, cpu, 0, TSK_ONCPU, now, true);
0852         }
0853     }
0854 
0855     if (prev->pid) {
0856         int clear = TSK_ONCPU, set = 0;
0857 
0858         /*
0859          * When we're going to sleep, psi_dequeue() lets us
0860          * handle TSK_RUNNING, TSK_MEMSTALL_RUNNING and
0861          * TSK_IOWAIT here, where we can combine it with
0862          * TSK_ONCPU and save walking common ancestors twice.
0863          */
0864         if (sleep) {
0865             clear |= TSK_RUNNING;
0866             if (prev->in_memstall)
0867                 clear |= TSK_MEMSTALL_RUNNING;
0868             if (prev->in_iowait)
0869                 set |= TSK_IOWAIT;
0870         }
0871 
0872         psi_flags_change(prev, clear, set);
0873 
0874         iter = NULL;
0875         while ((group = iterate_groups(prev, &iter)) && group != common)
0876             psi_group_change(group, cpu, clear, set, now, true);
0877 
0878         /*
0879          * TSK_ONCPU is handled up to the common ancestor. If we're tasked
0880          * with dequeuing too, finish that for the rest of the hierarchy.
0881          */
0882         if (sleep) {
0883             clear &= ~TSK_ONCPU;
0884             for (; group; group = iterate_groups(prev, &iter))
0885                 psi_group_change(group, cpu, clear, set, now, true);
0886         }
0887     }
0888 }
0889 
0890 /**
0891  * psi_memstall_enter - mark the beginning of a memory stall section
0892  * @flags: flags to handle nested sections
0893  *
0894  * Marks the calling task as being stalled due to a lack of memory,
0895  * such as waiting for a refault or performing reclaim.
0896  */
0897 void psi_memstall_enter(unsigned long *flags)
0898 {
0899     struct rq_flags rf;
0900     struct rq *rq;
0901 
0902     if (static_branch_likely(&psi_disabled))
0903         return;
0904 
0905     *flags = current->in_memstall;
0906     if (*flags)
0907         return;
0908     /*
0909      * in_memstall setting & accounting needs to be atomic wrt
0910      * changes to the task's scheduling state, otherwise we can
0911      * race with CPU migration.
0912      */
0913     rq = this_rq_lock_irq(&rf);
0914 
0915     current->in_memstall = 1;
0916     psi_task_change(current, 0, TSK_MEMSTALL | TSK_MEMSTALL_RUNNING);
0917 
0918     rq_unlock_irq(rq, &rf);
0919 }
0920 
0921 /**
0922  * psi_memstall_leave - mark the end of a memory stall section
0923  * @flags: flags to handle nested memdelay sections
0924  *
0925  * Marks the calling task as no longer stalled due to lack of memory.
0926  */
0927 void psi_memstall_leave(unsigned long *flags)
0928 {
0929     struct rq_flags rf;
0930     struct rq *rq;
0931 
0932     if (static_branch_likely(&psi_disabled))
0933         return;
0934 
0935     if (*flags)
0936         return;
0937     /*
0938      * in_memstall clearing & accounting needs to be atomic wrt
0939      * changes to the task's scheduling state, otherwise we could
0940      * race with CPU migration.
0941      */
0942     rq = this_rq_lock_irq(&rf);
0943 
0944     current->in_memstall = 0;
0945     psi_task_change(current, TSK_MEMSTALL | TSK_MEMSTALL_RUNNING, 0);
0946 
0947     rq_unlock_irq(rq, &rf);
0948 }
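/*
 * Typical usage elsewhere in the kernel (illustrative): callers
 * bracket a section that stalls on memory, such as direct reclaim,
 * compaction or a refault wait, with a caller-provided flags word so
 * that nested sections are accounted only once:
 *
 *	unsigned long pflags;
 *
 *	psi_memstall_enter(&pflags);
 *	// ... stall on memory: reclaim, compaction, refault I/O ...
 *	psi_memstall_leave(&pflags);
 */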
0949 
0950 #ifdef CONFIG_CGROUPS
0951 int psi_cgroup_alloc(struct cgroup *cgroup)
0952 {
0953     if (static_branch_likely(&psi_disabled))
0954         return 0;
0955 
0956     cgroup->psi = kzalloc(sizeof(struct psi_group), GFP_KERNEL);
0957     if (!cgroup->psi)
0958         return -ENOMEM;
0959 
0960     cgroup->psi->pcpu = alloc_percpu(struct psi_group_cpu);
0961     if (!cgroup->psi->pcpu) {
0962         kfree(cgroup->psi);
0963         return -ENOMEM;
0964     }
0965     group_init(cgroup->psi);
0966     return 0;
0967 }
0968 
0969 void psi_cgroup_free(struct cgroup *cgroup)
0970 {
0971     if (static_branch_likely(&psi_disabled))
0972         return;
0973 
0974     cancel_delayed_work_sync(&cgroup->psi->avgs_work);
0975     free_percpu(cgroup->psi->pcpu);
0976     /* All triggers must be removed by now */
0977     WARN_ONCE(cgroup->psi->poll_states, "psi: trigger leak\n");
0978     kfree(cgroup->psi);
0979 }
0980 
0981 /**
0982  * cgroup_move_task - move task to a different cgroup
0983  * @task: the task
0984  * @to: the target css_set
0985  *
0986  * Move task to a new cgroup and safely migrate its associated stall
0987  * state between the different groups.
0988  *
0989  * This function acquires the task's rq lock to lock out concurrent
0990  * changes to the task's scheduling state and - in case the task is
0991  * running - concurrent changes to its stall state.
0992  */
0993 void cgroup_move_task(struct task_struct *task, struct css_set *to)
0994 {
0995     unsigned int task_flags;
0996     struct rq_flags rf;
0997     struct rq *rq;
0998 
0999     if (static_branch_likely(&psi_disabled)) {
1000         /*
1001          * Lame to do this here, but the scheduler cannot be locked
1002          * from the outside, so we move cgroups from inside sched/.
1003          */
1004         rcu_assign_pointer(task->cgroups, to);
1005         return;
1006     }
1007 
1008     rq = task_rq_lock(task, &rf);
1009 
1010     /*
1011      * We may race with schedule() dropping the rq lock between
1012      * deactivating prev and switching to next. Because the psi
1013      * updates from the deactivation are deferred to the switch
1014      * callback to save cgroup tree updates, the task's scheduling
1015      * state here is not coherent with its psi state:
1016      *
1017      * schedule()                   cgroup_move_task()
1018      *   rq_lock()
1019      *   deactivate_task()
1020      *     p->on_rq = 0
1021      *     psi_dequeue() // defers TSK_RUNNING & TSK_IOWAIT updates
1022      *   pick_next_task()
1023      *     rq_unlock()
1024      *                                rq_lock()
1025      *                                psi_task_change() // old cgroup
1026      *                                task->cgroups = to
1027      *                                psi_task_change() // new cgroup
1028      *                                rq_unlock()
1029      *     rq_lock()
1030      *   psi_sched_switch() // does deferred updates in new cgroup
1031      *
1032      * Don't rely on the scheduling state. Use psi_flags instead.
1033      */
1034     task_flags = task->psi_flags;
1035 
1036     if (task_flags)
1037         psi_task_change(task, task_flags, 0);
1038 
1039     /* See comment above */
1040     rcu_assign_pointer(task->cgroups, to);
1041 
1042     if (task_flags)
1043         psi_task_change(task, 0, task_flags);
1044 
1045     task_rq_unlock(rq, task, &rf);
1046 }
1047 #endif /* CONFIG_CGROUPS */
1048 
1049 int psi_show(struct seq_file *m, struct psi_group *group, enum psi_res res)
1050 {
1051     int full;
1052     u64 now;
1053 
1054     if (static_branch_likely(&psi_disabled))
1055         return -EOPNOTSUPP;
1056 
1057     /* Update averages before reporting them */
1058     mutex_lock(&group->avgs_lock);
1059     now = sched_clock();
1060     collect_percpu_times(group, PSI_AVGS, NULL);
1061     if (now >= group->avg_next_update)
1062         group->avg_next_update = update_averages(group, now);
1063     mutex_unlock(&group->avgs_lock);
1064 
1065     for (full = 0; full < 2; full++) {
1066         unsigned long avg[3] = { 0, };
1067         u64 total = 0;
1068         int w;
1069 
1070         /* CPU FULL is undefined at the system level */
1071         if (!(group == &psi_system && res == PSI_CPU && full)) {
1072             for (w = 0; w < 3; w++)
1073                 avg[w] = group->avg[res * 2 + full][w];
1074             total = div_u64(group->total[PSI_AVGS][res * 2 + full],
1075                     NSEC_PER_USEC);
1076         }
1077 
1078         seq_printf(m, "%s avg10=%lu.%02lu avg60=%lu.%02lu avg300=%lu.%02lu total=%llu\n",
1079                full ? "full" : "some",
1080                LOAD_INT(avg[0]), LOAD_FRAC(avg[0]),
1081                LOAD_INT(avg[1]), LOAD_FRAC(avg[1]),
1082                LOAD_INT(avg[2]), LOAD_FRAC(avg[2]),
1083                total);
1084     }
1085 
1086     return 0;
1087 }
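/*
 * The resulting format of /proc/pressure/<resource> and the cgroup
 * pressure files, with made-up numbers:
 *
 *	some avg10=2.04 avg60=0.75 avg300=0.40 total=157656722
 *	full avg10=1.02 avg60=0.30 avg300=0.12 total=85154757
 *
 * The averages are percentages of wallclock time; total is the
 * cumulative stall time in microseconds.
 */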
1088 
1089 struct psi_trigger *psi_trigger_create(struct psi_group *group,
1090             char *buf, enum psi_res res)
1091 {
1092     struct psi_trigger *t;
1093     enum psi_states state;
1094     u32 threshold_us;
1095     u32 window_us;
1096 
1097     if (static_branch_likely(&psi_disabled))
1098         return ERR_PTR(-EOPNOTSUPP);
1099 
1100     if (sscanf(buf, "some %u %u", &threshold_us, &window_us) == 2)
1101         state = PSI_IO_SOME + res * 2;
1102     else if (sscanf(buf, "full %u %u", &threshold_us, &window_us) == 2)
1103         state = PSI_IO_FULL + res * 2;
1104     else
1105         return ERR_PTR(-EINVAL);
1106 
1107     if (state >= PSI_NONIDLE)
1108         return ERR_PTR(-EINVAL);
1109 
1110     if (window_us < WINDOW_MIN_US ||
1111         window_us > WINDOW_MAX_US)
1112         return ERR_PTR(-EINVAL);
1113 
1114     /* Check threshold */
1115     if (threshold_us == 0 || threshold_us > window_us)
1116         return ERR_PTR(-EINVAL);
1117 
1118     t = kmalloc(sizeof(*t), GFP_KERNEL);
1119     if (!t)
1120         return ERR_PTR(-ENOMEM);
1121 
1122     t->group = group;
1123     t->state = state;
1124     t->threshold = threshold_us * NSEC_PER_USEC;
1125     t->win.size = window_us * NSEC_PER_USEC;
1126     window_reset(&t->win, sched_clock(),
1127             group->total[PSI_POLL][t->state], 0);
1128 
1129     t->event = 0;
1130     t->last_event_time = 0;
1131     init_waitqueue_head(&t->event_wait);
1132     t->pending_event = false;
1133 
1134     mutex_lock(&group->trigger_lock);
1135 
1136     if (!rcu_access_pointer(group->poll_task)) {
1137         struct task_struct *task;
1138 
1139         task = kthread_create(psi_poll_worker, group, "psimon");
1140         if (IS_ERR(task)) {
1141             kfree(t);
1142             mutex_unlock(&group->trigger_lock);
1143             return ERR_CAST(task);
1144         }
1145         atomic_set(&group->poll_wakeup, 0);
1146         wake_up_process(task);
1147         rcu_assign_pointer(group->poll_task, task);
1148     }
1149 
1150     list_add(&t->node, &group->triggers);
1151     group->poll_min_period = min(group->poll_min_period,
1152         div_u64(t->win.size, UPDATES_PER_WINDOW));
1153     group->nr_triggers[t->state]++;
1154     group->poll_states |= (1 << t->state);
1155 
1156     mutex_unlock(&group->trigger_lock);
1157 
1158     return t;
1159 }
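/*
 * Worked example: writing "some 150000 1000000" against PSI_MEM
 * creates a trigger with state PSI_MEM_SOME, a 150ms threshold and a
 * 1s tracking window. poll_min_period then becomes at most
 * 1s / UPDATES_PER_WINDOW = 100ms, so while stalls are occurring the
 * psimon kthread re-evaluates the trigger roughly every 100ms and
 * signals at most one event per 1s window.
 */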
1160 
1161 void psi_trigger_destroy(struct psi_trigger *t)
1162 {
1163     struct psi_group *group;
1164     struct task_struct *task_to_destroy = NULL;
1165 
1166     /*
1167      * We do not check psi_disabled since it might have been disabled after
1168      * the trigger got created.
1169      */
1170     if (!t)
1171         return;
1172 
1173     group = t->group;
1174     /*
1175      * Wake up waiters to stop polling. This can happen if a cgroup is
1176      * deleted from under a polling process.
1177      */
1178     wake_up_interruptible(&t->event_wait);
1179 
1180     mutex_lock(&group->trigger_lock);
1181 
1182     if (!list_empty(&t->node)) {
1183         struct psi_trigger *tmp;
1184         u64 period = ULLONG_MAX;
1185 
1186         list_del(&t->node);
1187         group->nr_triggers[t->state]--;
1188         if (!group->nr_triggers[t->state])
1189             group->poll_states &= ~(1 << t->state);
1190         /* reset min update period for the remaining triggers */
1191         list_for_each_entry(tmp, &group->triggers, node)
1192             period = min(period, div_u64(tmp->win.size,
1193                     UPDATES_PER_WINDOW));
1194         group->poll_min_period = period;
1195         /* Destroy poll_task when the last trigger is destroyed */
1196         if (group->poll_states == 0) {
1197             group->polling_until = 0;
1198             task_to_destroy = rcu_dereference_protected(
1199                     group->poll_task,
1200                     lockdep_is_held(&group->trigger_lock));
1201             rcu_assign_pointer(group->poll_task, NULL);
1202             del_timer(&group->poll_timer);
1203         }
1204     }
1205 
1206     mutex_unlock(&group->trigger_lock);
1207 
1208     /*
1209      * Wait for psi_schedule_poll_work RCU to complete its read-side
1210      * critical section before destroying the trigger and optionally the
1211      * poll_task.
1212      */
1213     synchronize_rcu();
1214     /*
1215      * Stop kthread 'psimon' after releasing trigger_lock to prevent a
1216      * deadlock while waiting for psi_poll_work to acquire trigger_lock
1217      */
1218     if (task_to_destroy) {
1219         /*
1220          * After the RCU grace period has expired, the worker
1221          * can no longer be found through group->poll_task.
1222          */
1223         kthread_stop(task_to_destroy);
1224     }
1225     kfree(t);
1226 }
1227 
1228 __poll_t psi_trigger_poll(void **trigger_ptr,
1229                 struct file *file, poll_table *wait)
1230 {
1231     __poll_t ret = DEFAULT_POLLMASK;
1232     struct psi_trigger *t;
1233 
1234     if (static_branch_likely(&psi_disabled))
1235         return DEFAULT_POLLMASK | EPOLLERR | EPOLLPRI;
1236 
1237     t = smp_load_acquire(trigger_ptr);
1238     if (!t)
1239         return DEFAULT_POLLMASK | EPOLLERR | EPOLLPRI;
1240 
1241     poll_wait(file, &t->event_wait, wait);
1242 
1243     if (cmpxchg(&t->event, 1, 0) == 1)
1244         ret |= EPOLLPRI;
1245 
1246     return ret;
1247 }
1248 
1249 #ifdef CONFIG_PROC_FS
1250 static int psi_io_show(struct seq_file *m, void *v)
1251 {
1252     return psi_show(m, &psi_system, PSI_IO);
1253 }
1254 
1255 static int psi_memory_show(struct seq_file *m, void *v)
1256 {
1257     return psi_show(m, &psi_system, PSI_MEM);
1258 }
1259 
1260 static int psi_cpu_show(struct seq_file *m, void *v)
1261 {
1262     return psi_show(m, &psi_system, PSI_CPU);
1263 }
1264 
1265 static int psi_open(struct file *file, int (*psi_show)(struct seq_file *, void *))
1266 {
1267     if (file->f_mode & FMODE_WRITE && !capable(CAP_SYS_RESOURCE))
1268         return -EPERM;
1269 
1270     return single_open(file, psi_show, NULL);
1271 }
1272 
1273 static int psi_io_open(struct inode *inode, struct file *file)
1274 {
1275     return psi_open(file, psi_io_show);
1276 }
1277 
1278 static int psi_memory_open(struct inode *inode, struct file *file)
1279 {
1280     return psi_open(file, psi_memory_show);
1281 }
1282 
1283 static int psi_cpu_open(struct inode *inode, struct file *file)
1284 {
1285     return psi_open(file, psi_cpu_show);
1286 }
1287 
1288 static ssize_t psi_write(struct file *file, const char __user *user_buf,
1289              size_t nbytes, enum psi_res res)
1290 {
1291     char buf[32];
1292     size_t buf_size;
1293     struct seq_file *seq;
1294     struct psi_trigger *new;
1295 
1296     if (static_branch_likely(&psi_disabled))
1297         return -EOPNOTSUPP;
1298 
1299     if (!nbytes)
1300         return -EINVAL;
1301 
1302     buf_size = min(nbytes, sizeof(buf));
1303     if (copy_from_user(buf, user_buf, buf_size))
1304         return -EFAULT;
1305 
1306     buf[buf_size - 1] = '\0';
1307 
1308     seq = file->private_data;
1309 
1310     /* Take seq->lock to protect seq->private from concurrent writes */
1311     mutex_lock(&seq->lock);
1312 
1313     /* Allow only one trigger per file descriptor */
1314     if (seq->private) {
1315         mutex_unlock(&seq->lock);
1316         return -EBUSY;
1317     }
1318 
1319     new = psi_trigger_create(&psi_system, buf, res);
1320     if (IS_ERR(new)) {
1321         mutex_unlock(&seq->lock);
1322         return PTR_ERR(new);
1323     }
1324 
1325     smp_store_release(&seq->private, new);
1326     mutex_unlock(&seq->lock);
1327 
1328     return nbytes;
1329 }
1330 
1331 static ssize_t psi_io_write(struct file *file, const char __user *user_buf,
1332                 size_t nbytes, loff_t *ppos)
1333 {
1334     return psi_write(file, user_buf, nbytes, PSI_IO);
1335 }
1336 
1337 static ssize_t psi_memory_write(struct file *file, const char __user *user_buf,
1338                 size_t nbytes, loff_t *ppos)
1339 {
1340     return psi_write(file, user_buf, nbytes, PSI_MEM);
1341 }
1342 
1343 static ssize_t psi_cpu_write(struct file *file, const char __user *user_buf,
1344                  size_t nbytes, loff_t *ppos)
1345 {
1346     return psi_write(file, user_buf, nbytes, PSI_CPU);
1347 }
1348 
1349 static __poll_t psi_fop_poll(struct file *file, poll_table *wait)
1350 {
1351     struct seq_file *seq = file->private_data;
1352 
1353     return psi_trigger_poll(&seq->private, file, wait);
1354 }
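/*
 * Minimal user-space sketch of the trigger interface implemented
 * above, modelled on Documentation/accounting/psi.rst; error handling
 * is trimmed and the file and threshold values are examples only:
 *
 *	#include <fcntl.h>
 *	#include <poll.h>
 *	#include <string.h>
 *	#include <unistd.h>
 *
 *	int main(void)
 *	{
 *		const char trig[] = "some 150000 1000000";
 *		struct pollfd fds;
 *
 *		fds.fd = open("/proc/pressure/memory", O_RDWR | O_NONBLOCK);
 *		fds.events = POLLPRI;
 *		write(fds.fd, trig, strlen(trig) + 1);
 *
 *		while (poll(&fds, 1, -1) >= 0) {
 *			if (fds.revents & POLLERR)
 *				break;	// event source is gone
 *			if (fds.revents & POLLPRI) {
 *				// memory SOME stall exceeded 150ms within a 1s window
 *			}
 *		}
 *		return 0;
 *	}
 */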
1355 
1356 static int psi_fop_release(struct inode *inode, struct file *file)
1357 {
1358     struct seq_file *seq = file->private_data;
1359 
1360     psi_trigger_destroy(seq->private);
1361     return single_release(inode, file);
1362 }
1363 
1364 static const struct proc_ops psi_io_proc_ops = {
1365     .proc_open  = psi_io_open,
1366     .proc_read  = seq_read,
1367     .proc_lseek = seq_lseek,
1368     .proc_write = psi_io_write,
1369     .proc_poll  = psi_fop_poll,
1370     .proc_release   = psi_fop_release,
1371 };
1372 
1373 static const struct proc_ops psi_memory_proc_ops = {
1374     .proc_open  = psi_memory_open,
1375     .proc_read  = seq_read,
1376     .proc_lseek = seq_lseek,
1377     .proc_write = psi_memory_write,
1378     .proc_poll  = psi_fop_poll,
1379     .proc_release   = psi_fop_release,
1380 };
1381 
1382 static const struct proc_ops psi_cpu_proc_ops = {
1383     .proc_open  = psi_cpu_open,
1384     .proc_read  = seq_read,
1385     .proc_lseek = seq_lseek,
1386     .proc_write = psi_cpu_write,
1387     .proc_poll  = psi_fop_poll,
1388     .proc_release   = psi_fop_release,
1389 };
1390 
1391 static int __init psi_proc_init(void)
1392 {
1393     if (psi_enable) {
1394         proc_mkdir("pressure", NULL);
1395         proc_create("pressure/io", 0666, NULL, &psi_io_proc_ops);
1396         proc_create("pressure/memory", 0666, NULL, &psi_memory_proc_ops);
1397         proc_create("pressure/cpu", 0666, NULL, &psi_cpu_proc_ops);
1398     }
1399     return 0;
1400 }
1401 module_init(psi_proc_init);
1402 
1403 #endif /* CONFIG_PROC_FS */