// SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause)
// Copyright (c) 2021 Facebook
// Copyright (c) 2021 Google
#include "vmlinux.h"
#include <bpf/bpf_helpers.h>
#include <bpf/bpf_tracing.h>
#include <bpf/bpf_core_read.h>

#define MAX_LEVELS  10  // max cgroup hierarchy level: arbitrary
#define MAX_EVENTS  32  // max events per cgroup: arbitrary

// NOTE: many of the maps and global data will be modified before loading
//       from userspace (the perf tool) using the skeleton helpers.
//       (An illustrative userspace sketch follows this listing.)

// single set of global perf events to measure
struct {
    __uint(type, BPF_MAP_TYPE_PERF_EVENT_ARRAY);
    __uint(key_size, sizeof(__u32));
    __uint(value_size, sizeof(int));
    __uint(max_entries, 1);
} events SEC(".maps");

// from cgroup id to a per-cgroup index (used to locate its slots in cgrp_readings)
struct {
    __uint(type, BPF_MAP_TYPE_HASH);
    __uint(key_size, sizeof(__u64));
    __uint(value_size, sizeof(__u32));
    __uint(max_entries, 1);
} cgrp_idx SEC(".maps");

// per-cpu event snapshots to calculate delta
struct {
    __uint(type, BPF_MAP_TYPE_PERCPU_ARRAY);
    __uint(key_size, sizeof(__u32));
    __uint(value_size, sizeof(struct bpf_perf_event_value));
} prev_readings SEC(".maps");

// aggregated event values for each cgroup (per-cpu)
// will be read from user space
struct {
    __uint(type, BPF_MAP_TYPE_PERCPU_ARRAY);
    __uint(key_size, sizeof(__u32));
    __uint(value_size, sizeof(struct bpf_perf_event_value));
} cgrp_readings SEC(".maps");
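
// Index layout implied by the keys computed in bperf_cgroup_count() below
// (the corresponding sizes are set from userspace before loading):
//   events:         key = event_idx * num_cpus + cpu
//   prev_readings:  key = event_idx
//   cgrp_readings:  key = cgroup_idx * num_events + event_idx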

const volatile __u32 num_events = 1;
const volatile __u32 num_cpus = 1;

int enabled = 0;
int use_cgroup_v2 = 0;
int perf_subsys_id = -1;

static inline int get_cgroup_v1_idx(__u32 *cgrps, int size)
{
    struct task_struct *p = (void *)bpf_get_current_task();
    struct cgroup *cgrp;
    register int i = 0;
    __u32 *elem;
    int level;
    int cnt;

    if (perf_subsys_id == -1) {
#if __has_builtin(__builtin_preserve_enum_value)
        perf_subsys_id = bpf_core_enum_value(enum cgroup_subsys_id,
                                             perf_event_cgrp_id);
#else
        perf_subsys_id = perf_event_cgrp_id;
#endif
    }
    cgrp = BPF_CORE_READ(p, cgroups, subsys[perf_subsys_id], cgroup);
    level = BPF_CORE_READ(cgrp, level);

    for (cnt = 0; i < MAX_LEVELS; i++) {
        __u64 cgrp_id;

        if (i > level)
            break;

        // convert cgroup-id to a map index
        cgrp_id = BPF_CORE_READ(cgrp, ancestor_ids[i]);
        elem = bpf_map_lookup_elem(&cgrp_idx, &cgrp_id);
        if (!elem)
            continue;

        cgrps[cnt++] = *elem;
        if (cnt == size)
            break;
    }

    return cnt;
}

static inline int get_cgroup_v2_idx(__u32 *cgrps, int size)
{
    register int i = 0;
    __u32 *elem;
    int cnt;

    for (cnt = 0; i < MAX_LEVELS; i++) {
        __u64 cgrp_id = bpf_get_current_ancestor_cgroup_id(i);

        if (cgrp_id == 0)
            break;

        // convert cgroup-id to a map index
        elem = bpf_map_lookup_elem(&cgrp_idx, &cgrp_id);
        if (!elem)
            continue;

        cgrps[cnt++] = *elem;
        if (cnt == size)
            break;
    }

    return cnt;
}

static int bperf_cgroup_count(void)
{
    register __u32 idx = 0;  // to have it in a register to pass BPF verifier
    register int c = 0;
    struct bpf_perf_event_value val, delta, *prev_val, *cgrp_val;
    __u32 cpu = bpf_get_smp_processor_id();
    __u32 cgrp_idx[MAX_LEVELS];
    int cgrp_cnt;
    __u32 key, cgrp;
    long err;

    if (use_cgroup_v2)
        cgrp_cnt = get_cgroup_v2_idx(cgrp_idx, MAX_LEVELS);
    else
        cgrp_cnt = get_cgroup_v1_idx(cgrp_idx, MAX_LEVELS);

    for ( ; idx < MAX_EVENTS; idx++) {
        if (idx == num_events)
            break;

        // XXX: do not pass idx directly (for verifier)
        key = idx;
        // this is per-cpu array for diff
        prev_val = bpf_map_lookup_elem(&prev_readings, &key);
        if (!prev_val) {
            val.counter = val.enabled = val.running = 0;
            bpf_map_update_elem(&prev_readings, &key, &val, BPF_ANY);

            prev_val = bpf_map_lookup_elem(&prev_readings, &key);
            if (!prev_val)
                continue;
        }

        // read from global perf_event array
        key = idx * num_cpus + cpu;
        err = bpf_perf_event_read_value(&events, key, &val, sizeof(val));
        if (err)
            continue;

        if (enabled) {
            delta.counter = val.counter - prev_val->counter;
            delta.enabled = val.enabled - prev_val->enabled;
            delta.running = val.running - prev_val->running;

            for (c = 0; c < MAX_LEVELS; c++) {
                if (c == cgrp_cnt)
                    break;

                cgrp = cgrp_idx[c];

                // aggregate the result by cgroup
                key = cgrp * num_events + idx;
                cgrp_val = bpf_map_lookup_elem(&cgrp_readings, &key);
                if (cgrp_val) {
                    cgrp_val->counter += delta.counter;
                    cgrp_val->enabled += delta.enabled;
                    cgrp_val->running += delta.running;
                } else {
                    bpf_map_update_elem(&cgrp_readings, &key,
                                        &delta, BPF_ANY);
                }
            }
        }

        *prev_val = val;
    }
    return 0;
}

// This will be attached to a cgroup-switches perf event on each CPU
SEC("perf_event")
int BPF_PROG(on_cgrp_switch)
{
    return bperf_cgroup_count();
}

SEC("raw_tp/sched_switch")
int BPF_PROG(trigger_read)
{
    return bperf_cgroup_count();
}

char LICENSE[] SEC("license") = "Dual BSD/GPL";
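
As the NOTE at the top says, the map sizes, the const volatile constants, and the
global flags are patched from userspace through the skeleton helpers before the
object is loaded. The sketch below illustrates that flow only; it is not the perf
tool's actual implementation (which lives alongside this file in the perf sources,
bpf_counter_cgroup.c). It assumes a skeleton header named bperf_cgroup.skel.h
generated by bpftool gen skeleton, assumes the zero-initialized globals are exposed
via skel->bss, and assumes the caller already opened one perf event fd per
(event, cpu) pair; bperf_cgroup_setup() and its parameters are hypothetical names.
The map sizes simply follow from the keys computed in bperf_cgroup_count().

#include <unistd.h>
#include <sys/syscall.h>
#include <linux/perf_event.h>
#include <bpf/bpf.h>
#include <bpf/libbpf.h>
#include "bperf_cgroup.skel.h"   // assumed name of the generated skeleton

// Illustrative setup: size the maps, load, fill them, attach, and enable.
// cgroup_ids[ncgroups] are the cgroup ids to track; event_fds[nevents * ncpus]
// are perf event fds laid out as event_idx * ncpus + cpu.
static int bperf_cgroup_setup(int ncpus, int nevents, int ncgroups,
                              const __u64 *cgroup_ids, const int *event_fds)
{
    struct bperf_cgroup_bpf *skel = bperf_cgroup_bpf__open();
    if (!skel)
        return -1;

    // patch the constants and map sizes before loading
    skel->rodata->num_events = nevents;
    skel->rodata->num_cpus = ncpus;
    bpf_map__set_max_entries(skel->maps.events, nevents * ncpus);
    bpf_map__set_max_entries(skel->maps.cgrp_idx, ncgroups);
    bpf_map__set_max_entries(skel->maps.prev_readings, nevents);
    bpf_map__set_max_entries(skel->maps.cgrp_readings, ncgroups * nevents);

    if (bperf_cgroup_bpf__load(skel))
        goto err;

    // map each cgroup id to a dense index in [0, ncgroups)
    for (__u32 i = 0; i < (__u32)ncgroups; i++)
        bpf_map_update_elem(bpf_map__fd(skel->maps.cgrp_idx),
                            &cgroup_ids[i], &i, BPF_ANY);

    // fill the perf event array: key = event_idx * num_cpus + cpu
    for (int e = 0; e < nevents; e++) {
        for (int c = 0; c < ncpus; c++) {
            __u32 key = e * ncpus + c;

            bpf_map_update_elem(bpf_map__fd(skel->maps.events),
                                &key, &event_fds[key], BPF_ANY);
        }
    }

    // attach on_cgrp_switch to a cgroup-switches software event on each CPU
    // (PERF_COUNT_SW_CGROUP_SWITCHES needs a kernel that provides it)
    for (int c = 0; c < ncpus; c++) {
        struct perf_event_attr attr = {
            .type = PERF_TYPE_SOFTWARE,
            .size = sizeof(attr),
            .config = PERF_COUNT_SW_CGROUP_SWITCHES,
            .sample_period = 1,  // run the BPF program on every switch
        };
        int pfd = syscall(__NR_perf_event_open, &attr, -1, c, -1, 0);

        if (pfd < 0 ||
            !bpf_program__attach_perf_event(skel->progs.on_cgrp_switch, pfd))
            goto err;
    }

    skel->bss->enabled = 1;   // assumption: 'enabled' lands in .bss
    return 0;

err:
    bperf_cgroup_bpf__destroy(skel);
    return -1;
}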