/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _LINUX_PSI_TYPES_H
#define _LINUX_PSI_TYPES_H

#include <linux/kthread.h>
#include <linux/seqlock.h>
#include <linux/types.h>
#include <linux/kref.h>
#include <linux/wait.h>

#ifdef CONFIG_PSI

/* Tracked task states */
enum psi_task_count {
    NR_IOWAIT,
    NR_MEMSTALL,
    NR_RUNNING,
    /*
     * This can't have values other than 0 or 1 and could be
     * implemented as a bit flag. But for now we still have room
     * in the first cacheline of psi_group_cpu, and this way we
     * don't have to special case any state tracking for it.
     */
    NR_ONCPU,
    /*
     * For IO and CPU stalls the presence of running/oncpu tasks
     * in the domain means a partial rather than a full stall.
     * For memory it's not so simple because of page reclaimers:
     * they are running/oncpu while representing a stall. To tell
     * whether a domain has productivity left or not, we need to
     * distinguish between regular running (i.e. productive)
     * threads and memstall ones.
     */
    NR_MEMSTALL_RUNNING,
    NR_PSI_TASK_COUNTS = 5,
};

/* Task state bitmasks */
#define TSK_IOWAIT  (1 << NR_IOWAIT)
#define TSK_MEMSTALL    (1 << NR_MEMSTALL)
#define TSK_RUNNING (1 << NR_RUNNING)
#define TSK_ONCPU   (1 << NR_ONCPU)
#define TSK_MEMSTALL_RUNNING    (1 << NR_MEMSTALL_RUNNING)

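/*
 * Illustrative example, not part of this header: per the comments above,
 * a task sleeping on I/O is tracked as TSK_IOWAIT, while a runnable page
 * reclaimer carries TSK_RUNNING | TSK_MEMSTALL | TSK_MEMSTALL_RUNNING
 * (plus TSK_ONCPU while it actually occupies the CPU). The extra
 * MEMSTALL_RUNNING bit is what lets memory FULL be told apart from
 * genuinely productive work, e.g.:
 *
 *    unsigned int iowait_flags    = TSK_IOWAIT;
 *    unsigned int reclaimer_flags = TSK_RUNNING | TSK_MEMSTALL |
 *                                   TSK_MEMSTALL_RUNNING;
 */
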
/* Resources that workloads could be stalled on */
enum psi_res {
    PSI_IO,
    PSI_MEM,
    PSI_CPU,
    NR_PSI_RESOURCES = 3,
};

/*
 * Pressure states for each resource:
 *
 * SOME: Stalled tasks & working tasks
 * FULL: Stalled tasks & no working tasks
 */
enum psi_states {
    PSI_IO_SOME,
    PSI_IO_FULL,
    PSI_MEM_SOME,
    PSI_MEM_FULL,
    PSI_CPU_SOME,
    PSI_CPU_FULL,
    /* Only per-CPU, to weigh the CPU in the global average: */
    PSI_NONIDLE,
    NR_PSI_STATES = 7,
};

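/*
 * Illustrative sketch, not part of this header: following the SOME/FULL
 * semantics documented above, the aggregate state of one CPU could be
 * derived from its task counts roughly as below. The helper name is
 * hypothetical; the authoritative derivation lives in kernel/sched/psi.c.
 */
static inline bool psi_test_state_sketch(const unsigned int *tasks,
                                         enum psi_states state)
{
    switch (state) {
    case PSI_IO_SOME:
        return tasks[NR_IOWAIT];
    case PSI_IO_FULL:
        return tasks[NR_IOWAIT] && !tasks[NR_RUNNING];
    case PSI_MEM_SOME:
        return tasks[NR_MEMSTALL];
    case PSI_MEM_FULL:
        /* only reclaimers are runnable: nothing productive left */
        return tasks[NR_MEMSTALL] &&
               tasks[NR_RUNNING] == tasks[NR_MEMSTALL_RUNNING];
    case PSI_CPU_SOME:
        return tasks[NR_RUNNING] > tasks[NR_ONCPU];
    case PSI_CPU_FULL:
        return tasks[NR_RUNNING] && !tasks[NR_ONCPU];
    case PSI_NONIDLE:
        return tasks[NR_IOWAIT] || tasks[NR_MEMSTALL] ||
               tasks[NR_RUNNING];
    default:
        return false;
    }
}
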
enum psi_aggregators {
    PSI_AVGS = 0,
    PSI_POLL,
    NR_PSI_AGGREGATORS,
};

struct psi_group_cpu {
    /* 1st cacheline updated by the scheduler */

    /* Aggregator needs to know of concurrent changes */
    seqcount_t seq ____cacheline_aligned_in_smp;

    /* States of the tasks belonging to this group */
    unsigned int tasks[NR_PSI_TASK_COUNTS];

    /* Aggregate pressure state derived from the tasks */
    u32 state_mask;

    /* Period time sampling buckets for each state of interest (ns) */
    u32 times[NR_PSI_STATES];

    /* Time of last task change in this group (rq_clock) */
    u64 state_start;

    /* 2nd cacheline updated by the aggregator */

    /* Delta detection against the sampling buckets */
    u32 times_prev[NR_PSI_AGGREGATORS][NR_PSI_STATES]
            ____cacheline_aligned_in_smp;
};

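/*
 * Illustrative sketch, not part of this header: on every task state
 * change the scheduler closes out the interval since state_start,
 * charges it to each state bit currently set in state_mask, and then
 * recomputes the mask from the new task counts. The helper name is
 * hypothetical and the seq protection is omitted (the real accounting,
 * done under groupc->seq so the aggregator sees consistent snapshots,
 * is in kernel/sched/psi.c).
 */
static inline void psi_record_times_sketch(struct psi_group_cpu *groupc,
                                           u64 now)
{
    u32 delta = now - groupc->state_start;
    enum psi_states s;

    groupc->state_start = now;

    for (s = 0; s < NR_PSI_STATES; s++) {
        if (groupc->state_mask & (1 << s))
            groupc->times[s] += delta;
    }
}
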
/* PSI growth tracking window */
struct psi_window {
    /* Window size in ns */
    u64 size;

    /* Start time of the current window in ns */
    u64 start_time;

    /* Value at the start of the window */
    u64 start_value;

    /* Value growth in the previous window */
    u64 prev_growth;
};

struct psi_trigger {
    /* PSI state being monitored by the trigger */
    enum psi_states state;

    /* User-specified threshold in ns */
    u64 threshold;

    /* List node inside triggers list */
    struct list_head node;

    /* Backpointer needed during trigger destruction */
    struct psi_group *group;

    /* Wait queue for polling */
    wait_queue_head_t event_wait;

    /* Pending event flag */
    int event;

    /* Tracking window */
    struct psi_window win;

    /*
     * Time last event was generated. Used for rate-limiting
     * events to one per window
     */
    u64 last_event_time;

    /* Deferred event(s) from previous ratelimit window */
    bool pending_event;
};

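/*
 * Illustrative sketch, not part of this header: a trigger fires when the
 * total stall time for its state grows by at least 'threshold' ns within
 * one 'win.size' ns window, rate-limited to one event per window via
 * last_event_time. The helper name is hypothetical and the window
 * handling is simplified; the real monitor logic is in kernel/sched/psi.c.
 */
static inline bool psi_trigger_test_sketch(struct psi_trigger *t,
                                           u64 total_stall, u64 now)
{
    u64 growth;

    /* roll over to a new window once the current one has elapsed */
    if (now - t->win.start_time >= t->win.size) {
        t->win.prev_growth = total_stall - t->win.start_value;
        t->win.start_value = total_stall;
        t->win.start_time = now;
    }

    growth = total_stall - t->win.start_value;
    if (growth < t->threshold)
        return false;

    /* at most one event per window */
    if (now - t->last_event_time < t->win.size)
        return false;

    t->last_event_time = now;
    t->event = 1;
    wake_up_interruptible(&t->event_wait);
    return true;
}
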
struct psi_group {
    /* Protects data used by the aggregator */
    struct mutex avgs_lock;

    /* Per-cpu task state & time tracking */
    struct psi_group_cpu __percpu *pcpu;

    /* Running pressure averages */
    u64 avg_total[NR_PSI_STATES - 1];
    u64 avg_last_update;
    u64 avg_next_update;

    /* Aggregator work control */
    struct delayed_work avgs_work;

    /* Total stall times and sampled pressure averages */
    u64 total[NR_PSI_AGGREGATORS][NR_PSI_STATES - 1];
    unsigned long avg[NR_PSI_STATES - 1][3];

    /* Monitor work control */
    struct task_struct __rcu *poll_task;
    struct timer_list poll_timer;
    wait_queue_head_t poll_wait;
    atomic_t poll_wakeup;

    /* Protects data used by the monitor */
    struct mutex trigger_lock;

    /* Configured polling triggers */
    struct list_head triggers;
    u32 nr_triggers[NR_PSI_STATES - 1];
    u32 poll_states;
    u64 poll_min_period;

    /* Total stall times at the start of monitor activation */
    u64 polling_total[NR_PSI_STATES - 1];
    u64 polling_next_update;
    u64 polling_until;
};

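/*
 * Illustrative userspace usage, not part of this header: a trigger is
 * configured by writing "<some|full> <threshold us> <window us>" to a
 * pressure file and waiting for POLLPRI, roughly as described in
 * Documentation/accounting/psi.rst (error handling omitted):
 *
 *    int fd = open("/proc/pressure/memory", O_RDWR | O_NONBLOCK);
 *    write(fd, "some 150000 1000000", 19);
 *    struct pollfd pfd = { .fd = fd, .events = POLLPRI };
 *    poll(&pfd, 1, -1);    (wakes up when the trigger fires)
 */
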
#else /* CONFIG_PSI */

struct psi_group { };

#endif /* CONFIG_PSI */

#endif /* _LINUX_PSI_TYPES_H */