#include "vmlinux.h"
#include <bpf/bpf_helpers.h>
#include <bpf/bpf_tracing.h>
#include <bpf/bpf_core_read.h>

/* task->flags bit for kernel threads */
#define PF_KTHREAD		0x00200000

/* task states checked in can_record() */
#define TASK_INTERRUPTIBLE	0x0001
#define TASK_UNINTERRUPTIBLE	0x0002

/* clone flag: child shares the thread group of the parent */
#define CLONE_THREAD		0x10000

#define MAX_STACKS		32
#define MAX_ENTRIES		102400

/* per-task data recorded when the task is switched out */
struct tstamp_data {
	__u32 stack_id;
	__u32 state;
	__u64 timestamp;
};

/* key of the off_cpu map: off-CPU time is aggregated per task, stack and state */
struct offcpu_key {
	__u32 pid;
	__u32 tgid;
	__u32 stack_id;
	__u32 state;
	__u64 cgroup_id;
};

/* user stack traces, indexed by stack_id */
struct {
	__uint(type, BPF_MAP_TYPE_STACK_TRACE);
	__uint(key_size, sizeof(__u32));
	__uint(value_size, MAX_STACKS * sizeof(__u64));
	__uint(max_entries, MAX_ENTRIES);
} stacks SEC(".maps");

/* per-task storage for the off-CPU timestamp, state and stack */
struct {
	__uint(type, BPF_MAP_TYPE_TASK_STORAGE);
	__uint(map_flags, BPF_F_NO_PREALLOC);
	__type(key, int);
	__type(value, struct tstamp_data);
} tstamp SEC(".maps");

/* accumulated off-CPU time in nanoseconds, keyed by offcpu_key */
struct {
	__uint(type, BPF_MAP_TYPE_HASH);
	__uint(key_size, sizeof(struct offcpu_key));
	__uint(value_size, sizeof(__u64));
	__uint(max_entries, MAX_ENTRIES);
} off_cpu SEC(".maps");

/* optional filter maps, populated from user space */
struct {
	__uint(type, BPF_MAP_TYPE_HASH);
	__uint(key_size, sizeof(__u32));
	__uint(value_size, sizeof(__u8));
	__uint(max_entries, 1);
} cpu_filter SEC(".maps");

struct {
	__uint(type, BPF_MAP_TYPE_HASH);
	__uint(key_size, sizeof(__u32));
	__uint(value_size, sizeof(__u8));
	__uint(max_entries, 1);
} task_filter SEC(".maps");

struct {
	__uint(type, BPF_MAP_TYPE_HASH);
	__uint(key_size, sizeof(__u64));
	__uint(value_size, sizeof(__u8));
	__uint(max_entries, 1);
} cgroup_filter SEC(".maps");

/* newer kernels name the field task_struct::__state */
struct task_struct___new {
	long __state;
} __attribute__((preserve_access_index));

/* older kernels name it task_struct::state */
struct task_struct___old {
	long state;
} __attribute__((preserve_access_index));

/* runtime knobs set from user space */
int enabled = 0;
int has_cpu = 0;
int has_task = 0;
int has_cgroup = 0;
int uses_tgid = 0;

const volatile bool has_prev_state = false;
const volatile bool needs_cgroup = false;
const volatile bool uses_cgroup_v1 = false;

/* cached cgroup v1 perf_event subsystem id */
int perf_subsys_id = -1;

/*
 * task_struct::state was renamed to '__state'.  Use the BPF CO-RE
 * "ignored suffix rule" with the ___new/___old variants above to read
 * whichever field exists on the running kernel.
 */
static inline int get_task_state(struct task_struct *t)
{
	/* recast pointer to capture the new type for the compiler */
	struct task_struct___new *t_new = (void *)t;

	if (bpf_core_field_exists(t_new->__state)) {
		return BPF_CORE_READ(t_new, __state);
	} else {
		/* recast pointer to capture the old type for the compiler */
		struct task_struct___old *t_old = (void *)t;

		return BPF_CORE_READ(t_old, state);
	}
}

/* return the cgroup id of the task, for cgroup v2 or v1 (perf_event subsystem) */
static inline __u64 get_cgroup_id(struct task_struct *t)
{
	struct cgroup *cgrp;

	if (!uses_cgroup_v1)
		return BPF_CORE_READ(t, cgroups, dfl_cgrp, kn, id);

	if (perf_subsys_id == -1) {
#if __has_builtin(__builtin_preserve_enum_value)
		perf_subsys_id = bpf_core_enum_value(enum cgroup_subsys_id,
						     perf_event_cgrp_id);
#else
		perf_subsys_id = perf_event_cgrp_id;
#endif
	}

	cgrp = BPF_CORE_READ(t, cgroups, subsys[perf_subsys_id], cgroup);
	return BPF_CORE_READ(cgrp, kn, id);
}

/* return 1 if the given task should be recorded, 0 otherwise */
static inline int can_record(struct task_struct *t, int state)
{
	/* kernel threads don't have a user stack */
	if (t->flags & PF_KTHREAD)
		return 0;

	/* only track sleeping (S) and uninterruptible (D) states */
	if (state != TASK_INTERRUPTIBLE &&
	    state != TASK_UNINTERRUPTIBLE)
		return 0;

	if (has_cpu) {
		__u32 cpu = bpf_get_smp_processor_id();
		__u8 *ok;

		ok = bpf_map_lookup_elem(&cpu_filter, &cpu);
		if (!ok)
			return 0;
	}

	if (has_task) {
		__u8 *ok;
		__u32 pid;

		if (uses_tgid)
			pid = t->tgid;
		else
			pid = t->pid;

		ok = bpf_map_lookup_elem(&task_filter, &pid);
		if (!ok)
			return 0;
	}

	if (has_cgroup) {
		__u8 *ok;
		__u64 cgrp_id = get_cgroup_id(t);

		ok = bpf_map_lookup_elem(&cgroup_filter, &cgrp_id);
		if (!ok)
			return 0;
	}

	return 1;
}

static int off_cpu_stat(u64 *ctx, struct task_struct *prev,
			struct task_struct *next, int state)
{
	__u64 ts;
	__u32 stack_id;
	struct tstamp_data *pelem;

	ts = bpf_ktime_get_ns();

	if (!can_record(prev, state))
		goto next;

	/* record the user stack of the task going off-CPU */
	stack_id = bpf_get_stackid(ctx, &stacks,
				   BPF_F_FAST_STACK_CMP | BPF_F_USER_STACK);

	pelem = bpf_task_storage_get(&tstamp, prev, NULL,
				     BPF_LOCAL_STORAGE_GET_F_CREATE);
	if (!pelem)
		goto next;

	pelem->timestamp = ts;
	pelem->state = state;
	pelem->stack_id = stack_id;

next:
	/* the task coming back on-CPU: account its off-CPU time */
	pelem = bpf_task_storage_get(&tstamp, next, NULL, 0);

	if (pelem && pelem->timestamp) {
		struct offcpu_key key = {
			.pid = next->pid,
			.tgid = next->tgid,
			.stack_id = pelem->stack_id,
			.state = pelem->state,
			.cgroup_id = needs_cgroup ? get_cgroup_id(next) : 0,
		};
		__u64 delta = ts - pelem->timestamp;
		__u64 *total;

		total = bpf_map_lookup_elem(&off_cpu, &key);
		if (total)
			*total += delta;
		else
			bpf_map_update_elem(&off_cpu, &key, &delta, BPF_ANY);

		/* don't reuse the timestamp later */
		pelem->timestamp = 0;
	}

	return 0;
}

SEC("tp_btf/task_newtask")
int on_newtask(u64 *ctx)
{
	struct task_struct *task;
	u64 clone_flags;
	u32 pid;
	u8 val = 1;

	if (!uses_tgid)
		return 0;

	/* only follow children of tasks that are already in the filter */
	task = (struct task_struct *)bpf_get_current_task();

	pid = BPF_CORE_READ(task, tgid);
	if (!bpf_map_lookup_elem(&task_filter, &pid))
		return 0;

	task = (struct task_struct *)ctx[0];
	clone_flags = ctx[1];

	/* add new processes (not threads) to the task filter */
	pid = task->tgid;
	if (!(clone_flags & CLONE_THREAD))
		bpf_map_update_elem(&task_filter, &pid, &val, BPF_NOEXIST);

	return 0;
}

SEC("tp_btf/sched_switch")
int on_switch(u64 *ctx)
{
	struct task_struct *prev, *next;
	int prev_state;

	if (!enabled)
		return 0;

	prev = (struct task_struct *)ctx[1];
	next = (struct task_struct *)ctx[2];

	/* some kernels pass the prev task state as an extra tracepoint argument */
	if (has_prev_state)
		prev_state = (int)ctx[3];
	else
		prev_state = get_task_state(prev);

	return off_cpu_stat(ctx, prev, next, prev_state);
}

char LICENSE[] SEC("license") = "Dual BSD/GPL";