Back to home page

OSCL-LXR

 
 

    


0001 // SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause)
0002 // Copyright (c) 2022 Google
0003 #include "vmlinux.h"
0004 #include <bpf/bpf_helpers.h>
0005 #include <bpf/bpf_tracing.h>
0006 #include <bpf/bpf_core_read.h>
0007 
0008 /* task->flags bit tested by can_record() to skip kernel threads */
0009 #define PF_KTHREAD   0x00200000  /* I am a kernel thread */
0010 
0011 /* task->state values: can_record() accepts a switched-out task only in one of these */
0012 #define TASK_INTERRUPTIBLE  0x0001
0013 #define TASK_UNINTERRUPTIBLE    0x0002
0014 
0015 /* create a new thread; on_newtask() tests this bit in the clone flags */
0016 #define CLONE_THREAD  0x10000
0017 
0018 #define MAX_STACKS   32  /* __u64 slots in each value of the 'stacks' map */
0019 #define MAX_ENTRIES  102400  /* max_entries of the 'stacks' and 'off_cpu' maps */
0020 
/*
 * Per-task record kept in the 'tstamp' task-storage map.  off_cpu_stat()
 * fills it when the task is switched out and consumes it when the same
 * task is switched back in.
 */
0021 struct tstamp_data {
0022     __u32 stack_id;   /* value returned by bpf_get_stackid() at switch-out */
0023     __u32 state;      /* state argument of off_cpu_stat() at switch-out */
0024     __u64 timestamp;  /* bpf_ktime_get_ns() at switch-out; reset to 0 once accounted */
0025 };
0026 
/*
 * Key of the 'off_cpu' hash map.  off_cpu_stat() builds one for the task
 * being switched in and adds the elapsed off-CPU time to the matching entry.
 */
0027 struct offcpu_key {
0028     __u32 pid;        /* next->pid */
0029     __u32 tgid;       /* next->tgid */
0030     __u32 stack_id;   /* copied from the task's tstamp_data */
0031     __u32 state;      /* copied from the task's tstamp_data */
0032     __u64 cgroup_id;  /* get_cgroup_id(next) when needs_cgroup is set, else 0 */
0033 };
0034 
/*
 * Stack-trace storage filled by bpf_get_stackid() in off_cpu_stat(), which
 * requests user stacks (BPF_F_USER_STACK).  Keys are __u32 stack ids; each
 * value is MAX_STACKS * sizeof(__u64) bytes.
 */
0035 struct {
0036     __uint(type, BPF_MAP_TYPE_STACK_TRACE);
0037     __uint(key_size, sizeof(__u32));
0038     __uint(value_size, MAX_STACKS * sizeof(__u64));
0039     __uint(max_entries, MAX_ENTRIES);
0040 } stacks SEC(".maps");
0041 
/*
 * Task-local storage holding one struct tstamp_data per task.
 * off_cpu_stat() creates the entry for the task being switched out
 * (BPF_LOCAL_STORAGE_GET_F_CREATE) and only looks it up, without creating,
 * for the task being switched in.
 */
0042 struct {
0043     __uint(type, BPF_MAP_TYPE_TASK_STORAGE);
0044     __uint(map_flags, BPF_F_NO_PREALLOC);
0045     __type(key, int);
0046     __type(value, struct tstamp_data);
0047 } tstamp SEC(".maps");
0048 
/*
 * Result map: struct offcpu_key -> __u64 sum of off-CPU intervals.  The
 * intervals are differences of bpf_ktime_get_ns() readings taken in
 * off_cpu_stat().  Nothing in this file reads the totals back; presumably
 * user space consumes them (confirm against the loader).
 */
0049 struct {
0050     __uint(type, BPF_MAP_TYPE_HASH);
0051     __uint(key_size, sizeof(struct offcpu_key));
0052     __uint(value_size, sizeof(__u64));
0053     __uint(max_entries, MAX_ENTRIES);
0054 } off_cpu SEC(".maps");
0055 
/*
 * Set of CPUs to profile, keyed by the __u32 returned from
 * bpf_get_smp_processor_id().  can_record() only tests whether the key is
 * present (and only when has_cpu is set); the __u8 value is never read here.
 * NOTE(review): max_entries is 1 in this definition - presumably resized by
 * the loader before the object is loaded; confirm.
 */
0056 struct {
0057     __uint(type, BPF_MAP_TYPE_HASH);
0058     __uint(key_size, sizeof(__u32));
0059     __uint(value_size, sizeof(__u8));
0060     __uint(max_entries, 1);
0061 } cpu_filter SEC(".maps");
0062 
/*
 * Set of tasks to profile, keyed by a __u32 that is the task's tgid when
 * uses_tgid is set and its pid otherwise.  can_record() tests key presence
 * when has_task is set; on_newtask() inserts the tgid of a new task (value 1)
 * when the current task's tgid is already in the set and CLONE_THREAD is
 * not among the clone flags.
 * NOTE(review): max_entries is 1 in this definition - presumably resized by
 * the loader before the object is loaded; confirm.
 */
0063 struct {
0064     __uint(type, BPF_MAP_TYPE_HASH);
0065     __uint(key_size, sizeof(__u32));
0066     __uint(value_size, sizeof(__u8));
0067     __uint(max_entries, 1);
0068 } task_filter SEC(".maps");
0069 
/*
 * Set of cgroups to profile, keyed by the __u64 id returned from
 * get_cgroup_id().  can_record() tests key presence when has_cgroup is set;
 * the __u8 value is never read here.
 * NOTE(review): max_entries is 1 in this definition - presumably resized by
 * the loader before the object is loaded; confirm.
 */
0070 struct {
0071     __uint(type, BPF_MAP_TYPE_HASH);
0072     __uint(key_size, sizeof(__u64));
0073     __uint(value_size, sizeof(__u8));
0074     __uint(max_entries, 1);
0075 } cgroup_filter SEC(".maps");
0076 
0077 /* new kernel task_struct definition: the state field is named '__state' (see get_task_state()) */
0078 struct task_struct___new {
0079     long __state;
0080 } __attribute__((preserve_access_index));
0081 
0082 /* old kernel task_struct definition: the state field is named 'state' (see get_task_state()) */
0083 struct task_struct___old {
0084     long state;
0085 } __attribute__((preserve_access_index));
0086 
/*
 * Configuration read by the programs below.  Apart from perf_subsys_id,
 * nothing in this file writes these - presumably the user-space loader
 * sets them (confirm).
 */
0087 int enabled = 0;  /* on_switch() returns immediately while this is 0 */
0088 int has_cpu = 0;  /* can_record(): require the current CPU in cpu_filter */
0089 int has_task = 0;  /* can_record(): require the task's pid/tgid in task_filter */
0090 int has_cgroup = 0;  /* can_record(): require the task's cgroup id in cgroup_filter */
0091 int uses_tgid = 0;  /* task_filter is keyed by tgid instead of pid; also enables on_newtask() */
0092 
0093 const volatile bool has_prev_state = false;  /* on_switch(): take the previous state from ctx[3] instead of reading the task */
0094 const volatile bool needs_cgroup = false;  /* off_cpu_stat(): fill offcpu_key.cgroup_id (left 0 otherwise) */
0095 const volatile bool uses_cgroup_v1 = false;  /* get_cgroup_id(): use the perf_event subsystem's cgroup instead of dfl_cgrp */
0096 
0097 int perf_subsys_id = -1;  /* cached perf_event_cgrp_id; -1 until get_cgroup_id() resolves it */
0098 
0099 /*
0100  * Old kernel used to call it task_struct->state and now it's '__state'.
0101  * Use BPF CO-RE "ignored suffix rule" to deal with it like below:
0102  *
0103  * https://nakryiko.com/posts/bpf-core-reference-guide/#handling-incompatible-field-and-type-changes
0104  */
0105 static inline int get_task_state(struct task_struct *t)
0106 {
0107     /* recast pointer to capture new type for compiler */
0108     struct task_struct___new *t_new = (void *)t;
0109 
0110     if (bpf_core_field_exists(t_new->__state)) {
0111         return BPF_CORE_READ(t_new, __state);
0112     } else {
0113         /* recast pointer to capture old type for compiler */
0114         struct task_struct___old *t_old = (void *)t;
0115 
0116         return BPF_CORE_READ(t_old, state);
0117     }
0118 }
0119 
0120 static inline __u64 get_cgroup_id(struct task_struct *t)
0121 {
0122     struct cgroup *cgrp;
0123 
0124     if (!uses_cgroup_v1)
0125         return BPF_CORE_READ(t, cgroups, dfl_cgrp, kn, id);
0126 
0127     if (perf_subsys_id == -1) {
0128 #if __has_builtin(__builtin_preserve_enum_value)
0129         perf_subsys_id = bpf_core_enum_value(enum cgroup_subsys_id,
0130                              perf_event_cgrp_id);
0131 #else
0132         perf_subsys_id = perf_event_cgrp_id;
0133 #endif
0134     }
0135 
0136     cgrp = BPF_CORE_READ(t, cgroups, subsys[perf_subsys_id], cgroup);
0137     return BPF_CORE_READ(cgrp, kn, id);
0138 }
0139 
0140 static inline int can_record(struct task_struct *t, int state)
0141 {
0142     /* kernel threads don't have user stack */
0143     if (t->flags & PF_KTHREAD)
0144         return 0;
0145 
0146     if (state != TASK_INTERRUPTIBLE &&
0147         state != TASK_UNINTERRUPTIBLE)
0148         return 0;
0149 
0150     if (has_cpu) {
0151         __u32 cpu = bpf_get_smp_processor_id();
0152         __u8 *ok;
0153 
0154         ok = bpf_map_lookup_elem(&cpu_filter, &cpu);
0155         if (!ok)
0156             return 0;
0157     }
0158 
0159     if (has_task) {
0160         __u8 *ok;
0161         __u32 pid;
0162 
0163         if (uses_tgid)
0164             pid = t->tgid;
0165         else
0166             pid = t->pid;
0167 
0168         ok = bpf_map_lookup_elem(&task_filter, &pid);
0169         if (!ok)
0170             return 0;
0171     }
0172 
0173     if (has_cgroup) {
0174         __u8 *ok;
0175         __u64 cgrp_id = get_cgroup_id(t);
0176 
0177         ok = bpf_map_lookup_elem(&cgroup_filter, &cgrp_id);
0178         if (!ok)
0179             return 0;
0180     }
0181 
0182     return 1;
0183 }
0184 
0185 static int off_cpu_stat(u64 *ctx, struct task_struct *prev,
0186             struct task_struct *next, int state)
0187 {
0188     __u64 ts;
0189     __u32 stack_id;
0190     struct tstamp_data *pelem;
0191 
0192     ts = bpf_ktime_get_ns();
0193 
0194     if (!can_record(prev, state))
0195         goto next;
0196 
0197     stack_id = bpf_get_stackid(ctx, &stacks,
0198                    BPF_F_FAST_STACK_CMP | BPF_F_USER_STACK);
0199 
0200     pelem = bpf_task_storage_get(&tstamp, prev, NULL,
0201                      BPF_LOCAL_STORAGE_GET_F_CREATE);
0202     if (!pelem)
0203         goto next;
0204 
0205     pelem->timestamp = ts;
0206     pelem->state = state;
0207     pelem->stack_id = stack_id;
0208 
0209 next:
0210     pelem = bpf_task_storage_get(&tstamp, next, NULL, 0);
0211 
0212     if (pelem && pelem->timestamp) {
0213         struct offcpu_key key = {
0214             .pid = next->pid,
0215             .tgid = next->tgid,
0216             .stack_id = pelem->stack_id,
0217             .state = pelem->state,
0218             .cgroup_id = needs_cgroup ? get_cgroup_id(next) : 0,
0219         };
0220         __u64 delta = ts - pelem->timestamp;
0221         __u64 *total;
0222 
0223         total = bpf_map_lookup_elem(&off_cpu, &key);
0224         if (total)
0225             *total += delta;
0226         else
0227             bpf_map_update_elem(&off_cpu, &key, &delta, BPF_ANY);
0228 
0229         /* prevent to reuse the timestamp later */
0230         pelem->timestamp = 0;
0231     }
0232 
0233     return 0;
0234 }
0235 
0236 SEC("tp_btf/task_newtask")
0237 int on_newtask(u64 *ctx)
0238 {
0239     struct task_struct *task;
0240     u64 clone_flags;
0241     u32 pid;
0242     u8 val = 1;
0243 
0244     if (!uses_tgid)
0245         return 0;
0246 
0247     task = (struct task_struct *)bpf_get_current_task();
0248 
0249     pid = BPF_CORE_READ(task, tgid);
0250     if (!bpf_map_lookup_elem(&task_filter, &pid))
0251         return 0;
0252 
0253     task = (struct task_struct *)ctx[0];
0254     clone_flags = ctx[1];
0255 
0256     pid = task->tgid;
0257     if (!(clone_flags & CLONE_THREAD))
0258         bpf_map_update_elem(&task_filter, &pid, &val, BPF_NOEXIST);
0259 
0260     return 0;
0261 }
0262 
0263 SEC("tp_btf/sched_switch")
0264 int on_switch(u64 *ctx)
0265 {
0266     struct task_struct *prev, *next;
0267     int prev_state;
0268 
0269     if (!enabled)
0270         return 0;
0271 
0272     prev = (struct task_struct *)ctx[1];
0273     next = (struct task_struct *)ctx[2];
0274 
0275     if (has_prev_state)
0276         prev_state = (int)ctx[3];
0277     else
0278         prev_state = get_task_state(prev);
0279 
0280     return off_cpu_stat(ctx, prev, next, prev_state);
0281 }
0282 
/* License string placed in the object's "license" section. */
0283 char LICENSE[] SEC("license") = "Dual BSD/GPL";