// SPDX-License-Identifier: GPL-2.0
#include "util/bpf_counter.h"
#include "util/debug.h"
#include "util/evsel.h"
#include "util/evlist.h"
#include "util/off_cpu.h"
#include "util/perf-hooks.h"
#include "util/record.h"
#include "util/session.h"
#include "util/target.h"
#include "util/cpumap.h"
#include "util/thread_map.h"
#include "util/cgroup.h"
#include "util/strlist.h"
#include <bpf/bpf.h>

#include "bpf_skel/off_cpu.skel.h"
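/*
 * Support for off-cpu profiling in perf record (the --off-cpu option):
 * a BPF program in bpf_skel/off_cpu.bpf.c accumulates the time each
 * task spends off the CPU, keyed by thread, call stack, task state and
 * cgroup, and off_cpu_write() below converts the accumulated data into
 * synthetic PERF_RECORD_SAMPLE records at the end of the session.
 */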
#define MAX_STACKS  32
#define MAX_PROC  4096
/* no real timestamp is needed; this just makes the samples sort last */
#define OFF_CPU_TIMESTAMP  (~0ull << 32)

static struct off_cpu_bpf *skel;
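/*
 * Key of the BPF off_cpu map: off-cpu time is accumulated separately
 * for each thread/stack/task-state/cgroup combination.
 */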
struct off_cpu_key {
    u32 pid;
    u32 tgid;
    u32 stack_id;
    u32 state;
    u64 cgroup_id;
};
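/*
 * Scratch buffer for one synthetic sample: viewed either as the record
 * header or as an array of u64 sample fields (1kB in total).
 */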
union off_cpu_data {
    struct perf_event_header hdr;
    u64 array[1024 / sizeof(u64)];
};
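/*
 * Add a software BPF_OUTPUT event named OFFCPU_EVENT to the evlist;
 * the synthetic samples emitted by off_cpu_write() are attributed to
 * this event.
 */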
static int off_cpu_config(struct evlist *evlist)
{
    struct evsel *evsel;
    struct perf_event_attr attr = {
        .type   = PERF_TYPE_SOFTWARE,
        .config = PERF_COUNT_SW_BPF_OUTPUT,
        .size   = sizeof(attr), /* to capture ABI version */
    };
    char *evname = strdup(OFFCPU_EVENT);

    if (evname == NULL)
        return -ENOMEM;

    evsel = evsel__new(&attr);
    if (!evsel) {
        free(evname);
        return -ENOMEM;
    }

    evsel->core.attr.freq = 1;
    evsel->core.attr.sample_period = 1;
    /* off-cpu analysis depends on stack trace */
    evsel->core.attr.sample_type = PERF_SAMPLE_CALLCHAIN;

    evlist__add(evlist, evsel);

    free(evsel->name);
    evsel->name = evname;

    return 0;
}
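/*
 * "record_start" perf hook: if recording a forked workload (no cpu or
 * task filter was set up at prepare time), filter on the workload's
 * pid, then let the BPF program start accumulating.
 */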
static void off_cpu_start(void *arg)
{
    struct evlist *evlist = arg;

    /* update task filter for the given workload */
    if (!skel->bss->has_cpu && !skel->bss->has_task &&
        perf_thread_map__pid(evlist->core.threads, 0) != -1) {
        int fd;
        u32 pid;
        u8 val = 1;

        skel->bss->has_task = 1;
        skel->bss->uses_tgid = 1;
        fd = bpf_map__fd(skel->maps.task_filter);
        pid = perf_thread_map__pid(evlist->core.threads, 0);
        bpf_map_update_elem(fd, &pid, &val, BPF_ANY);
    }

    skel->bss->enabled = 1;
}
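/* "record_end" perf hook: stop accumulating and tear down the skeleton */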
static void off_cpu_finish(void *arg __maybe_unused)
{
    skel->bss->enabled = 0;
    off_cpu_bpf__destroy(skel);
}
/* the v5.18 kernel added a prev_state arg, so check the tracepoint signature */
static void check_sched_switch_args(void)
{
    const struct btf *btf = bpf_object__btf(skel->obj);
    const struct btf_type *t1, *t2, *t3;
    u32 type_id;
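    /*
     * The tracepoint is described in BTF by the btf_trace_sched_switch
     * typedef: a pointer to the tracepoint function prototype.  Follow
     * typedef -> pointer -> func_proto and count its arguments.
     */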
    type_id = btf__find_by_name_kind(btf, "btf_trace_sched_switch",
                                     BTF_KIND_TYPEDEF);
    if ((s32)type_id < 0)
        return;
    t1 = btf__type_by_id(btf, type_id);
    if (t1 == NULL)
        return;

    t2 = btf__type_by_id(btf, t1->type);
    if (t2 == NULL || !btf_is_ptr(t2))
        return;
    t3 = btf__type_by_id(btf, t2->type);
    /* the btf_trace func proto has an extra first argument for the context */
    if (t3 && btf_is_func_proto(t3) && btf_vlen(t3) == 5) {
        /* new format: pass prev_state as 4th tracepoint arg */
        skel->rodata->has_prev_state = true;
    }
}
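/*
 * Set everything up before the workload starts: open the skeleton, size
 * the filter maps (which must happen before loading), load and populate
 * them, attach the BPF programs, and register the record start/end hooks.
 */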
int off_cpu_prepare(struct evlist *evlist, struct target *target,
            struct record_opts *opts)
{
    int err, fd, i;
    int ncpus = 1, ntasks = 1, ncgrps = 1;
    struct strlist *pid_slist = NULL;
    struct str_node *pos;

    if (off_cpu_config(evlist) < 0) {
        pr_err("Failed to config off-cpu BPF event\n");
        return -1;
    }

    skel = off_cpu_bpf__open();
    if (!skel) {
        pr_err("Failed to open off-cpu BPF skeleton\n");
        return -1;
    }

    /* don't need to set cpu filter for system-wide mode */
    if (target->cpu_list) {
        ncpus = perf_cpu_map__nr(evlist->core.user_requested_cpus);
        bpf_map__set_max_entries(skel->maps.cpu_filter, ncpus);
    }

    if (target->pid) {
        pid_slist = strlist__new(target->pid, NULL);
        if (!pid_slist) {
            pr_err("Failed to create a strlist for pid\n");
            return -1;
        }

        ntasks = 0;
        strlist__for_each_entry(pos, pid_slist) {
            char *end_ptr;
            int pid = strtol(pos->s, &end_ptr, 10);

            if (pid == INT_MIN || pid == INT_MAX ||
                (*end_ptr != '\0' && *end_ptr != ','))
                continue;

            ntasks++;
        }

        if (ntasks < MAX_PROC)
            ntasks = MAX_PROC;

        bpf_map__set_max_entries(skel->maps.task_filter, ntasks);
    } else if (target__has_task(target)) {
        ntasks = perf_thread_map__nr(evlist->core.threads);
        bpf_map__set_max_entries(skel->maps.task_filter, ntasks);
    } else if (target__none(target)) {
        bpf_map__set_max_entries(skel->maps.task_filter, MAX_PROC);
    }

    if (evlist__first(evlist)->cgrp) {
        ncgrps = evlist->core.nr_entries - 1; /* excluding a dummy */
        bpf_map__set_max_entries(skel->maps.cgroup_filter, ncgrps);

        if (!cgroup_is_v2("perf_event"))
            skel->rodata->uses_cgroup_v1 = true;
    }

    if (opts->record_cgroup) {
        skel->rodata->needs_cgroup = true;

        if (!cgroup_is_v2("perf_event"))
            skel->rodata->uses_cgroup_v1 = true;
    }

    set_max_rlimit();
    check_sched_switch_args();
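    /*
     * Note the ordering: a map's max_entries can only be changed before
     * the skeleton is loaded, but the filter maps can only be populated
     * through their fd after it has been loaded.
     */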
    err = off_cpu_bpf__load(skel);
    if (err) {
        pr_err("Failed to load off-cpu skeleton\n");
        goto out;
    }

    if (target->cpu_list) {
        u32 cpu;
        u8 val = 1;

        skel->bss->has_cpu = 1;
        fd = bpf_map__fd(skel->maps.cpu_filter);

        for (i = 0; i < ncpus; i++) {
            cpu = perf_cpu_map__cpu(evlist->core.user_requested_cpus, i).cpu;
            bpf_map_update_elem(fd, &cpu, &val, BPF_ANY);
        }
    }

    if (target->pid) {
        u8 val = 1;

        skel->bss->has_task = 1;
        skel->bss->uses_tgid = 1;
        fd = bpf_map__fd(skel->maps.task_filter);

        strlist__for_each_entry(pos, pid_slist) {
            char *end_ptr;
            u32 tgid;
            int pid = strtol(pos->s, &end_ptr, 10);

            if (pid == INT_MIN || pid == INT_MAX ||
                (*end_ptr != '\0' && *end_ptr != ','))
                continue;

            tgid = pid;
            bpf_map_update_elem(fd, &tgid, &val, BPF_ANY);
        }
    } else if (target__has_task(target)) {
        u32 pid;
        u8 val = 1;

        skel->bss->has_task = 1;
        fd = bpf_map__fd(skel->maps.task_filter);

        for (i = 0; i < ntasks; i++) {
            pid = perf_thread_map__pid(evlist->core.threads, i);
            bpf_map_update_elem(fd, &pid, &val, BPF_ANY);
        }
    }

    if (evlist__first(evlist)->cgrp) {
        struct evsel *evsel;
        u8 val = 1;

        skel->bss->has_cgroup = 1;
        fd = bpf_map__fd(skel->maps.cgroup_filter);

        evlist__for_each_entry(evlist, evsel) {
            struct cgroup *cgrp = evsel->cgrp;

            if (cgrp == NULL)
                continue;

            if (!cgrp->id && read_cgroup_id(cgrp) < 0) {
                pr_err("Failed to read cgroup id of %s\n",
                       cgrp->name);
                goto out;
            }

            bpf_map_update_elem(fd, &cgrp->id, &val, BPF_ANY);
        }
    }

    err = off_cpu_bpf__attach(skel);
    if (err) {
        pr_err("Failed to attach off-cpu BPF skeleton\n");
        goto out;
    }
    if (perf_hooks__set_hook("record_start", off_cpu_start, evlist) ||
        perf_hooks__set_hook("record_end", off_cpu_finish, evlist)) {
        pr_err("Failed to set off-cpu perf hooks\n");
        goto out;
    }

    return 0;

out:
    off_cpu_bpf__destroy(skel);
    return -1;
}
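/*
 * Called at the end of the record session: drain the BPF off_cpu map and
 * append one synthetic PERF_RECORD_SAMPLE per entry to the data file.
 * Returns the number of bytes written.
 */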
int off_cpu_write(struct perf_session *session)
{
    int bytes = 0, size;
    int fd, stack;
    u64 sample_type, val, sid = 0;
    struct evsel *evsel;
    struct perf_data_file *file = &session->data->file;
    struct off_cpu_key prev, key;
    union off_cpu_data data = {
        .hdr = {
            .type = PERF_RECORD_SAMPLE,
            .misc = PERF_RECORD_MISC_USER,
        },
    };
    u64 tstamp = OFF_CPU_TIMESTAMP;

    skel->bss->enabled = 0;

    evsel = evlist__find_evsel_by_str(session->evlist, OFFCPU_EVENT);
    if (evsel == NULL) {
        pr_err("%s evsel not found\n", OFFCPU_EVENT);
        return 0;
    }

    sample_type = evsel->core.attr.sample_type;
    if (sample_type & ~OFFCPU_SAMPLE_TYPES) {
        pr_err("unsupported sample type: %llx\n",
               (unsigned long long)sample_type);
        return -1;
    }

    if (sample_type & (PERF_SAMPLE_ID | PERF_SAMPLE_IDENTIFIER)) {
        if (evsel->core.id)
            sid = evsel->core.id[0];
    }

    fd = bpf_map__fd(skel->maps.off_cpu);
    stack = bpf_map__fd(skel->maps.stacks);
    memset(&prev, 0, sizeof(prev));
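    /*
     * Build each sample record by hand: the fields below must be
     * emitted in exactly the order the perf sample parser expects
     * for the bits set in sample_type.
     */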
    while (!bpf_map_get_next_key(fd, &prev, &key)) {
        int n = 1;  /* start from perf_event_header */
        int ip_pos = -1;

        bpf_map_lookup_elem(fd, &key, &val);

        if (sample_type & PERF_SAMPLE_IDENTIFIER)
            data.array[n++] = sid;
        if (sample_type & PERF_SAMPLE_IP) {
            ip_pos = n;
            data.array[n++] = 0;  /* will be updated */
        }
        if (sample_type & PERF_SAMPLE_TID)
            data.array[n++] = (u64)key.pid << 32 | key.tgid;
        if (sample_type & PERF_SAMPLE_TIME)
            data.array[n++] = tstamp;
        if (sample_type & PERF_SAMPLE_ID)
            data.array[n++] = sid;
        if (sample_type & PERF_SAMPLE_CPU)
            data.array[n++] = 0;
        if (sample_type & PERF_SAMPLE_PERIOD)
            data.array[n++] = val;
        if (sample_type & PERF_SAMPLE_CALLCHAIN) {
            int len = 0;

            /* data.array[n] is callchain->nr (updated later) */
            data.array[n + 1] = PERF_CONTEXT_USER;
            data.array[n + 2] = 0;
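            /*
             * The stacks map value holds up to MAX_STACKS entries and
             * the kernel zero-fills the unused tail, so the scan below
             * can stop at the first zero entry.
             */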
            bpf_map_lookup_elem(stack, &key.stack_id, &data.array[n + 2]);
            while (data.array[n + 2 + len])
                len++;

            /* update length of callchain */
            data.array[n] = len + 1;

            /* update sample ip with the first callchain entry */
            if (ip_pos >= 0)
                data.array[ip_pos] = data.array[n + 2];

            /* calculate sample callchain data array length */
            n += len + 2;
        }
        if (sample_type & PERF_SAMPLE_CGROUP)
            data.array[n++] = key.cgroup_id;

        size = n * sizeof(u64);
        data.hdr.size = size;
        bytes += size;

        if (perf_data_file__write(file, &data, size) < 0) {
            pr_err("failed to write perf data, error: %m\n");
            return bytes;
        }
        prev = key;
        /* bump the dummy timestamp so samples keep their insertion order */
        tstamp++;
    }
    return bytes;
}