0001
0002
0003
0004 #include "vmlinux.h"
0005 #include <bpf/bpf_helpers.h>
0006 #include <bpf/bpf_tracing.h>
0007 #include <bpf/bpf_core_read.h>
0008
0009 #define MAX_LEVELS 10
0010 #define MAX_EVENTS 32
0011
0012
0013
0014
0015
// perf event fds, one slot per (event, cpu) pair, read via
// bpf_perf_event_read_value().  max_entries 1 looks like a placeholder —
// presumably userspace resizes this map before load; confirm against loader.
struct {
	__uint(type, BPF_MAP_TYPE_PERF_EVENT_ARRAY);
	__uint(key_size, sizeof(__u32));
	__uint(value_size, sizeof(int));
	__uint(max_entries, 1);
} events SEC(".maps");
0022
0023
// maps a 64-bit cgroup id to its dense index used to address the
// cgrp_readings map.  Populated by userspace; cgroups absent from this
// map are simply not monitored (lookups in the programs below skip them).
// max_entries 1 looks like a placeholder — presumably resized by
// userspace before load; confirm against loader.
struct {
	__uint(type, BPF_MAP_TYPE_HASH);
	__uint(key_size, sizeof(__u64));
	__uint(value_size, sizeof(__u32));
	__uint(max_entries, 1);
} cgrp_idx SEC(".maps");
0030
0031
0032 struct {
0033 __uint(type, BPF_MAP_TYPE_PERCPU_ARRAY);
0034 __uint(key_size, sizeof(__u32));
0035 __uint(value_size, sizeof(struct bpf_perf_event_value));
0036 } prev_readings SEC(".maps");
0037
0038
0039
0040 struct {
0041 __uint(type, BPF_MAP_TYPE_PERCPU_ARRAY);
0042 __uint(key_size, sizeof(__u32));
0043 __uint(value_size, sizeof(struct bpf_perf_event_value));
0044 } cgrp_readings SEC(".maps");
0045
// Constants fixed by userspace before load (const volatile = rodata,
// patched at skeleton-open time).
const volatile __u32 num_events = 1;	// number of distinct perf events
const volatile __u32 num_cpus = 1;	// number of cpus events are opened on

int enabled = 0;	// toggled by userspace: accumulate deltas only while set
int use_cgroup_v2 = 0;	// select get_cgroup_v2_idx() vs get_cgroup_v1_idx()
int perf_subsys_id = -1;	// lazily-resolved perf_event cgroup subsys id (v1 path)
0052
// Walk the current task's perf_event (cgroup v1) hierarchy and collect,
// into cgrps[], the cgrp_idx map index of every level that is being
// monitored.  Writes at most 'size' entries; returns the number written.
// Levels with no entry in the cgrp_idx hash are skipped.
static inline int get_cgroup_v1_idx(__u32 *cgrps, int size)
{
	struct task_struct *p = (void *)bpf_get_current_task();
	struct cgroup *cgrp;
	register int i = 0;
	__u32 *elem;
	int level;
	int cnt;

	// Resolve the perf_event subsystem id once and cache it in the
	// global; prefer a CO-RE enum relocation so the value matches the
	// running kernel's enum cgroup_subsys_id layout.
	if (perf_subsys_id == -1) {
#if __has_builtin(__builtin_preserve_enum_value)
		perf_subsys_id = bpf_core_enum_value(enum cgroup_subsys_id,
						     perf_event_cgrp_id);
#else
		perf_subsys_id = perf_event_cgrp_id;
#endif
	}
	cgrp = BPF_CORE_READ(p, cgroups, subsys[perf_subsys_id], cgroup);
	level = BPF_CORE_READ(cgrp, level);

	// Constant MAX_LEVELS bound keeps the loop verifiable; the i > level
	// check below is the real termination condition.
	for (cnt = 0; i < MAX_LEVELS; i++) {
		__u64 cgrp_id;

		if (i > level)
			break;

		// NOTE(review): assumes ancestor_ids[0..level] covers the
		// whole path including the cgroup itself — confirm against
		// the kernel's struct cgroup definition.
		cgrp_id = BPF_CORE_READ(cgrp, ancestor_ids[i]);
		elem = bpf_map_lookup_elem(&cgrp_idx, &cgrp_id);
		if (!elem)
			continue;	// this level is not monitored

		cgrps[cnt++] = *elem;
		if (cnt == size)
			break;
	}

	return cnt;
}
0092
// cgroup v2 counterpart of get_cgroup_v1_idx(): collect the cgrp_idx map
// index of each monitored ancestor of the current task's cgroup, using the
// bpf_get_current_ancestor_cgroup_id() helper instead of CO-RE reads.
// Writes at most 'size' entries into cgrps[]; returns the number written.
static inline int get_cgroup_v2_idx(__u32 *cgrps, int size)
{
	register int i = 0;
	__u32 *elem;
	int cnt;

	for (cnt = 0; i < MAX_LEVELS; i++) {
		__u64 cgrp_id = bpf_get_current_ancestor_cgroup_id(i);

		// id 0 means we walked past the deepest ancestor level
		if (cgrp_id == 0)
			break;

		// skip levels that are not being monitored
		elem = bpf_map_lookup_elem(&cgrp_idx, &cgrp_id);
		if (!elem)
			continue;

		cgrps[cnt++] = *elem;
		if (cnt == size)
			break;
	}

	return cnt;
}
0117
// Core counting routine, shared by both programs below.  For every event:
// read the current per-cpu counter value, diff it against the snapshot in
// prev_readings, and (while 'enabled') add that delta to the per-cpu
// totals of every monitored cgroup the current task belongs to.  Finally
// store the new reading as the next snapshot.  Always returns 0.
static int bperf_cgroup_count(void)
{
	register __u32 idx = 0;	// 'register' to help the BPF verifier track bounds
	register int c = 0;
	struct bpf_perf_event_value val, delta, *prev_val, *cgrp_val;
	__u32 cpu = bpf_get_smp_processor_id();
	__u32 cgrp_idx[MAX_LEVELS];
	int cgrp_cnt;
	__u32 key, cgrp;
	long err;

	if (use_cgroup_v2)
		cgrp_cnt = get_cgroup_v2_idx(cgrp_idx, MAX_LEVELS);
	else
		cgrp_cnt = get_cgroup_v1_idx(cgrp_idx, MAX_LEVELS);

	// constant MAX_EVENTS bound with an explicit num_events break,
	// so the verifier sees a bounded loop even though num_events is
	// only known at load time
	for ( ; idx < MAX_EVENTS; idx++) {
		if (idx == num_events)
			break;

		// prev_readings is keyed by event index only (it is per-cpu,
		// so the cpu dimension is implicit)
		key = idx;

		prev_val = bpf_map_lookup_elem(&prev_readings, &key);
		if (!prev_val) {
			// first time on this cpu: seed a zero snapshot, then
			// re-lookup to get a pointer into the map
			val.counter = val.enabled = val.running = 0;
			bpf_map_update_elem(&prev_readings, &key, &val, BPF_ANY);

			prev_val = bpf_map_lookup_elem(&prev_readings, &key);
			if (!prev_val)
				continue;
		}

		// events map is laid out as [event][cpu]
		key = idx * num_cpus + cpu;
		err = bpf_perf_event_read_value(&events, key, &val, sizeof(val));
		if (err)
			continue;	// event not active on this cpu

		if (enabled) {
			delta.counter = val.counter - prev_val->counter;
			delta.enabled = val.enabled - prev_val->enabled;
			delta.running = val.running - prev_val->running;

			// attribute the delta to every monitored ancestor
			for (c = 0; c < MAX_LEVELS; c++) {
				if (c == cgrp_cnt)
					break;

				cgrp = cgrp_idx[c];

				// cgrp_readings is laid out as [cgroup][event]
				key = cgrp * num_events + idx;
				cgrp_val = bpf_map_lookup_elem(&cgrp_readings, &key);
				if (cgrp_val) {
					cgrp_val->counter += delta.counter;
					cgrp_val->enabled += delta.enabled;
					cgrp_val->running += delta.running;
				} else {
					bpf_map_update_elem(&cgrp_readings, &key,
							    &delta, BPF_ANY);
				}
			}
		}

		// new reading becomes the snapshot for the next invocation;
		// done even while disabled so enabling does not count the past
		*prev_val = val;
	}
	return 0;
}
0186
0187
// Attached to a perf event (presumably a cgroup-switch software event —
// confirm against the userspace loader); snapshots counters at the
// boundary so deltas land in the correct cgroup's totals.
SEC("perf_event")
int BPF_PROG(on_cgrp_switch)
{
	return bperf_cgroup_count();
}
0193
// Attached to the sched_switch raw tracepoint: forces a counter read on
// every context switch so readings stay fresh between userspace polls.
SEC("raw_tp/sched_switch")
int BPF_PROG(trigger_read)
{
	return bperf_cgroup_count();
}
0199
0200 char LICENSE[] SEC("license") = "Dual BSD/GPL";