0001 // SPDX-License-Identifier: GPL-2.0
0002 
0003 /* Copyright (c) 2019 Facebook */
0004 
0005 #include <assert.h>
0006 #include <limits.h>
0007 #include <unistd.h>
0008 #include <sys/file.h>
0009 #include <sys/time.h>
0010 #include <linux/err.h>
0011 #include <linux/zalloc.h>
0012 #include <api/fs/fs.h>
0013 #include <perf/bpf_perf.h>
0014 
0015 #include "bpf_counter.h"
0016 #include "bpf-utils.h"
0017 #include "counts.h"
0018 #include "debug.h"
0019 #include "evsel.h"
0020 #include "evlist.h"
0021 #include "target.h"
0022 #include "cgroup.h"
0023 #include "cpumap.h"
0024 #include "thread_map.h"
0025 
0026 #include "bpf_skel/bpf_prog_profiler.skel.h"
0027 #include "bpf_skel/bperf_u.h"
0028 #include "bpf_skel/bperf_leader.skel.h"
0029 #include "bpf_skel/bperf_follower.skel.h"
0030 
0031 #define ATTR_MAP_SIZE 16
0032 
0033 static inline void *u64_to_ptr(__u64 ptr)
0034 {
0035     return (void *)(unsigned long)ptr;
0036 }
0037 
0038 static struct bpf_counter *bpf_counter_alloc(void)
0039 {
0040     struct bpf_counter *counter;
0041 
0042     counter = zalloc(sizeof(*counter));
0043     if (counter)
0044         INIT_LIST_HEAD(&counter->list);
0045     return counter;
0046 }
0047 
0048 static int bpf_program_profiler__destroy(struct evsel *evsel)
0049 {
0050     struct bpf_counter *counter, *tmp;
0051 
0052     list_for_each_entry_safe(counter, tmp,
0053                  &evsel->bpf_counter_list, list) {
0054         list_del_init(&counter->list);
0055         bpf_prog_profiler_bpf__destroy(counter->skel);
0056         free(counter);
0057     }
0058     assert(list_empty(&evsel->bpf_counter_list));
0059 
0060     return 0;
0061 }
0062 
0063 static char *bpf_target_prog_name(int tgt_fd)
0064 {
0065     struct bpf_func_info *func_info;
0066     struct perf_bpil *info_linear;
0067     const struct btf_type *t;
0068     struct btf *btf = NULL;
0069     char *name = NULL;
0070 
0071     info_linear = get_bpf_prog_info_linear(tgt_fd, 1UL << PERF_BPIL_FUNC_INFO);
0072     if (IS_ERR_OR_NULL(info_linear)) {
0073         pr_debug("failed to get info_linear for prog FD %d\n", tgt_fd);
0074         return NULL;
0075     }
0076 
0077     if (info_linear->info.btf_id == 0) {
0078         pr_debug("prog FD %d doesn't have valid btf\n", tgt_fd);
0079         goto out;
0080     }
0081 
0082     btf = btf__load_from_kernel_by_id(info_linear->info.btf_id);
0083     if (libbpf_get_error(btf)) {
0084         pr_debug("failed to load btf for prog FD %d\n", tgt_fd);
0085         goto out;
0086     }
0087 
0088     func_info = u64_to_ptr(info_linear->info.func_info);
0089     t = btf__type_by_id(btf, func_info[0].type_id);
0090     if (!t) {
0091         pr_debug("btf %d doesn't have type %d\n",
0092              info_linear->info.btf_id, func_info[0].type_id);
0093         goto out;
0094     }
0095     name = strdup(btf__name_by_offset(btf, t->name_off));
0096 out:
0097     btf__free(btf);
0098     free(info_linear);
0099     return name;
0100 }
0101 
0102 static int bpf_program_profiler_load_one(struct evsel *evsel, u32 prog_id)
0103 {
0104     struct bpf_prog_profiler_bpf *skel;
0105     struct bpf_counter *counter;
0106     struct bpf_program *prog;
0107     char *prog_name;
0108     int prog_fd;
0109     int err;
0110 
0111     prog_fd = bpf_prog_get_fd_by_id(prog_id);
0112     if (prog_fd < 0) {
0113         pr_err("Failed to open fd for bpf prog %u\n", prog_id);
0114         return -1;
0115     }
0116     counter = bpf_counter_alloc();
0117     if (!counter) {
0118         close(prog_fd);
0119         return -1;
0120     }
0121 
0122     skel = bpf_prog_profiler_bpf__open();
0123     if (!skel) {
0124         pr_err("Failed to open bpf skeleton\n");
0125         goto err_out;
0126     }
0127 
0128     skel->rodata->num_cpu = evsel__nr_cpus(evsel);
0129 
0130     bpf_map__set_max_entries(skel->maps.events, evsel__nr_cpus(evsel));
0131     bpf_map__set_max_entries(skel->maps.fentry_readings, 1);
0132     bpf_map__set_max_entries(skel->maps.accum_readings, 1);
0133 
0134     prog_name = bpf_target_prog_name(prog_fd);
0135     if (!prog_name) {
0136         pr_err("Failed to get program name for bpf prog %u. Does it have BTF?\n", prog_id);
0137         goto err_out;
0138     }
0139 
0140     bpf_object__for_each_program(prog, skel->obj) {
0141         err = bpf_program__set_attach_target(prog, prog_fd, prog_name);
0142         if (err) {
0143             pr_err("bpf_program__set_attach_target failed.\n"
0144                    "Does bpf prog %u have BTF?\n", prog_id);
0145             goto err_out;
0146         }
0147     }
0148     set_max_rlimit();
0149     err = bpf_prog_profiler_bpf__load(skel);
0150     if (err) {
0151         pr_err("bpf_prog_profiler_bpf__load failed\n");
0152         goto err_out;
0153     }
0154 
0155     assert(skel != NULL);
0156     counter->skel = skel;
0157     list_add(&counter->list, &evsel->bpf_counter_list);
0158     close(prog_fd);
0159     return 0;
0160 err_out:
0161     bpf_prog_profiler_bpf__destroy(skel);
0162     free(counter);
0163     close(prog_fd);
0164     return -1;
0165 }
0166 
0167 static int bpf_program_profiler__load(struct evsel *evsel, struct target *target)
0168 {
0169     char *bpf_str, *bpf_str_, *tok, *saveptr = NULL, *p;
0170     u32 prog_id;
0171     int ret;
0172 
0173     bpf_str_ = bpf_str = strdup(target->bpf_str);
0174     if (!bpf_str)
0175         return -1;
0176 
0177     while ((tok = strtok_r(bpf_str, ",", &saveptr)) != NULL) {
0178         prog_id = strtoul(tok, &p, 10);
0179         if (prog_id == 0 || prog_id == UINT_MAX ||
0180             (*p != '\0' && *p != ',')) {
0181             pr_err("Failed to parse bpf prog ids %s\n",
0182                    target->bpf_str);
0183             free(bpf_str_);
0184             return -1;
0185         }
0186         ret = bpf_program_profiler_load_one(evsel, prog_id);
0187         if (ret) {
0188             bpf_program_profiler__destroy(evsel);
0189             free(bpf_str_);
0190             return -1;
0191         }
0192         bpf_str = NULL;
0193     }
0194     free(bpf_str_);
0195     return 0;
0196 }
0197 
0198 static int bpf_program_profiler__enable(struct evsel *evsel)
0199 {
0200     struct bpf_counter *counter;
0201     int ret;
0202 
0203     list_for_each_entry(counter, &evsel->bpf_counter_list, list) {
0204         assert(counter->skel != NULL);
0205         ret = bpf_prog_profiler_bpf__attach(counter->skel);
0206         if (ret) {
0207             bpf_program_profiler__destroy(evsel);
0208             return ret;
0209         }
0210     }
0211     return 0;
0212 }
0213 
0214 static int bpf_program_profiler__disable(struct evsel *evsel)
0215 {
0216     struct bpf_counter *counter;
0217 
0218     list_for_each_entry(counter, &evsel->bpf_counter_list, list) {
0219         assert(counter->skel != NULL);
0220         bpf_prog_profiler_bpf__detach(counter->skel);
0221     }
0222     return 0;
0223 }
0224 
0225 static int bpf_program_profiler__read(struct evsel *evsel)
0226 {
0227     // BPF_MAP_TYPE_PERCPU_ARRAY uses /sys/devices/system/cpu/possible
0228     // Sometimes possible > online, like on a Ryzen 3900X that has 24
0229     // threads but its possible showed 0-31 -acme
0230     int num_cpu_bpf = libbpf_num_possible_cpus();
0231     struct bpf_perf_event_value values[num_cpu_bpf];
0232     struct bpf_counter *counter;
0233     struct perf_counts_values *counts;
0234     int reading_map_fd;
0235     __u32 key = 0;
0236     int err, idx, bpf_cpu;
0237 
0238     if (list_empty(&evsel->bpf_counter_list))
0239         return -EAGAIN;
0240 
0241     perf_cpu_map__for_each_idx(idx, evsel__cpus(evsel)) {
0242         counts = perf_counts(evsel->counts, idx, 0);
0243         counts->val = 0;
0244         counts->ena = 0;
0245         counts->run = 0;
0246     }
0247     list_for_each_entry(counter, &evsel->bpf_counter_list, list) {
0248         struct bpf_prog_profiler_bpf *skel = counter->skel;
0249 
0250         assert(skel != NULL);
0251         reading_map_fd = bpf_map__fd(skel->maps.accum_readings);
0252 
0253         err = bpf_map_lookup_elem(reading_map_fd, &key, values);
0254         if (err) {
0255             pr_err("failed to read value\n");
0256             return err;
0257         }
0258 
0259         for (bpf_cpu = 0; bpf_cpu < num_cpu_bpf; bpf_cpu++) {
0260             idx = perf_cpu_map__idx(evsel__cpus(evsel),
0261                         (struct perf_cpu){.cpu = bpf_cpu});
0262             if (idx == -1)
0263                 continue;
0264             counts = perf_counts(evsel->counts, idx, 0);
0265             counts->val += values[bpf_cpu].counter;
0266             counts->ena += values[bpf_cpu].enabled;
0267             counts->run += values[bpf_cpu].running;
0268         }
0269     }
0270     return 0;
0271 }
0272 
0273 static int bpf_program_profiler__install_pe(struct evsel *evsel, int cpu_map_idx,
0274                         int fd)
0275 {
0276     struct bpf_prog_profiler_bpf *skel;
0277     struct bpf_counter *counter;
0278     int ret;
0279 
0280     list_for_each_entry(counter, &evsel->bpf_counter_list, list) {
0281         skel = counter->skel;
0282         assert(skel != NULL);
0283 
0284         ret = bpf_map_update_elem(bpf_map__fd(skel->maps.events),
0285                       &cpu_map_idx, &fd, BPF_ANY);
0286         if (ret)
0287             return ret;
0288     }
0289     return 0;
0290 }
0291 
0292 struct bpf_counter_ops bpf_program_profiler_ops = {
0293     .load       = bpf_program_profiler__load,
0294     .enable     = bpf_program_profiler__enable,
0295     .disable    = bpf_program_profiler__disable,
0296     .read       = bpf_program_profiler__read,
0297     .destroy    = bpf_program_profiler__destroy,
0298     .install_pe = bpf_program_profiler__install_pe,
0299 };
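The ops table above backs perf stat's BPF-program profiling mode, which is selected when target->bpf_str is set. A hedged usage sketch, assuming the usual -b/--bpf-prog spelling of that option and placeholder prog ids 254 and 255 (the target programs must already be loaded and carry BTF, as bpf_target_prog_name() requires):

/*
 *   # profile two already-loaded BPF programs by id; the comma-separated
 *   # list matches the strtok_r(",") parsing in bpf_program_profiler__load()
 *   perf stat -e cycles,instructions -b 254,255 --timeout 10000
 */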
0300 
0301 static bool bperf_attr_map_compatible(int attr_map_fd)
0302 {
0303     struct bpf_map_info map_info = {0};
0304     __u32 map_info_len = sizeof(map_info);
0305     int err;
0306 
0307     err = bpf_obj_get_info_by_fd(attr_map_fd, &map_info, &map_info_len);
0308 
0309     if (err)
0310         return false;
0311     return (map_info.key_size == sizeof(struct perf_event_attr)) &&
0312         (map_info.value_size == sizeof(struct perf_event_attr_map_entry));
0313 }
0314 
0315 #ifndef HAVE_LIBBPF_BPF_MAP_CREATE
0316 LIBBPF_API int bpf_create_map(enum bpf_map_type map_type, int key_size,
0317                               int value_size, int max_entries, __u32 map_flags);
0318 int
0319 bpf_map_create(enum bpf_map_type map_type,
0320            const char *map_name __maybe_unused,
0321            __u32 key_size,
0322            __u32 value_size,
0323            __u32 max_entries,
0324            const struct bpf_map_create_opts *opts __maybe_unused)
0325 {
0326 #pragma GCC diagnostic push
0327 #pragma GCC diagnostic ignored "-Wdeprecated-declarations"
0328     return bpf_create_map(map_type, key_size, value_size, max_entries, 0);
0329 #pragma GCC diagnostic pop
0330 }
0331 #endif
0332 
0333 static int bperf_lock_attr_map(struct target *target)
0334 {
0335     char path[PATH_MAX];
0336     int map_fd, err;
0337 
0338     if (target->attr_map) {
0339         scnprintf(path, PATH_MAX, "%s", target->attr_map);
0340     } else {
0341         scnprintf(path, PATH_MAX, "%s/fs/bpf/%s", sysfs__mountpoint(),
0342               BPF_PERF_DEFAULT_ATTR_MAP_PATH);
0343     }
0344 
0345     if (access(path, F_OK)) {
0346         map_fd = bpf_map_create(BPF_MAP_TYPE_HASH, NULL,
0347                     sizeof(struct perf_event_attr),
0348                     sizeof(struct perf_event_attr_map_entry),
0349                     ATTR_MAP_SIZE, NULL);
0350         if (map_fd < 0)
0351             return -1;
0352 
0353         err = bpf_obj_pin(map_fd, path);
0354         if (err) {
0355             /* someone pinned the map in parallel? */
0356             close(map_fd);
0357             map_fd = bpf_obj_get(path);
0358             if (map_fd < 0)
0359                 return -1;
0360         }
0361     } else {
0362         map_fd = bpf_obj_get(path);
0363         if (map_fd < 0)
0364             return -1;
0365     }
0366 
0367     if (!bperf_attr_map_compatible(map_fd)) {
0368         close(map_fd);
0369         return -1;
0370 
0371     }
0372     err = flock(map_fd, LOCK_EX);
0373     if (err) {
0374         close(map_fd);
0375         return -1;
0376     }
0377     return map_fd;
0378 }
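bperf_lock_attr_map() pins (or reuses) a hash map keyed by struct perf_event_attr under the bpffs mount and serializes access to it with flock(). A quick way to inspect that map from the shell, assuming the default BPF_PERF_DEFAULT_ATTR_MAP_PATH name of "perf_attr_map" and bpffs mounted at /sys/fs/bpf:

/*
 *   bpftool map show pinned /sys/fs/bpf/perf_attr_map
 *   bpftool map dump pinned /sys/fs/bpf/perf_attr_map
 */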
0379 
0380 static int bperf_check_target(struct evsel *evsel,
0381                   struct target *target,
0382                   enum bperf_filter_type *filter_type,
0383                   __u32 *filter_entry_cnt)
0384 {
0385     if (evsel->core.leader->nr_members > 1) {
0386         pr_err("bpf managed perf events do not yet support groups.\n");
0387         return -1;
0388     }
0389 
0390     /* determine filter type based on target */
0391     if (target->system_wide) {
0392         *filter_type = BPERF_FILTER_GLOBAL;
0393         *filter_entry_cnt = 1;
0394     } else if (target->cpu_list) {
0395         *filter_type = BPERF_FILTER_CPU;
0396         *filter_entry_cnt = perf_cpu_map__nr(evsel__cpus(evsel));
0397     } else if (target->tid) {
0398         *filter_type = BPERF_FILTER_PID;
0399         *filter_entry_cnt = perf_thread_map__nr(evsel->core.threads);
0400     } else if (target->pid || evsel->evlist->workload.pid != -1) {
0401         *filter_type = BPERF_FILTER_TGID;
0402         *filter_entry_cnt = perf_thread_map__nr(evsel->core.threads);
0403     } else {
0404         pr_err("bpf managed perf events do not yet support these targets.\n");
0405         return -1;
0406     }
0407 
0408     return 0;
0409 }
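In perf stat terms, the branches above roughly correspond to the following invocations (illustrative only; the option spellings are the usual perf stat ones and are not defined in this file):

/*
 *   perf stat --bpf-counters -e cycles -a            -> BPERF_FILTER_GLOBAL
 *   perf stat --bpf-counters -e cycles -C 0,1        -> BPERF_FILTER_CPU
 *   perf stat --bpf-counters -e cycles -t <tid>      -> BPERF_FILTER_PID
 *   perf stat --bpf-counters -e cycles -p <pid>      -> BPERF_FILTER_TGID
 *   perf stat --bpf-counters -e cycles -- <workload> -> BPERF_FILTER_TGID
 */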
0410 
0411 static  struct perf_cpu_map *all_cpu_map;
0412 
0413 static int bperf_reload_leader_program(struct evsel *evsel, int attr_map_fd,
0414                        struct perf_event_attr_map_entry *entry)
0415 {
0416     struct bperf_leader_bpf *skel = bperf_leader_bpf__open();
0417     int link_fd, diff_map_fd, err;
0418     struct bpf_link *link = NULL;
0419 
0420     if (!skel) {
0421         pr_err("Failed to open leader skeleton\n");
0422         return -1;
0423     }
0424 
0425     bpf_map__set_max_entries(skel->maps.events, libbpf_num_possible_cpus());
0426     err = bperf_leader_bpf__load(skel);
0427     if (err) {
0428         pr_err("Failed to load leader skeleton\n");
0429         goto out;
0430     }
0431 
0432     link = bpf_program__attach(skel->progs.on_switch);
0433     if (IS_ERR(link)) {
0434         pr_err("Failed to attach leader program\n");
0435         err = PTR_ERR(link);
0436         goto out;
0437     }
0438 
0439     link_fd = bpf_link__fd(link);
0440     diff_map_fd = bpf_map__fd(skel->maps.diff_readings);
0441     entry->link_id = bpf_link_get_id(link_fd);
0442     entry->diff_map_id = bpf_map_get_id(diff_map_fd);
0443     err = bpf_map_update_elem(attr_map_fd, &evsel->core.attr, entry, BPF_ANY);
0444     assert(err == 0);
0445 
0446     evsel->bperf_leader_link_fd = bpf_link_get_fd_by_id(entry->link_id);
0447     assert(evsel->bperf_leader_link_fd >= 0);
0448 
0449     /*
0450      * save leader_skel for install_pe, which is called within the
0451      * following evsel__open_per_cpu() call.
0452      */
0453     evsel->leader_skel = skel;
0454     evsel__open_per_cpu(evsel, all_cpu_map, -1);
0455 
0456 out:
0457     bperf_leader_bpf__destroy(skel);
0458     bpf_link__destroy(link);
0459     return err;
0460 }
0461 
0462 static int bperf__load(struct evsel *evsel, struct target *target)
0463 {
0464     struct perf_event_attr_map_entry entry = {0xffffffff, 0xffffffff};
0465     int attr_map_fd, diff_map_fd = -1, err;
0466     enum bperf_filter_type filter_type;
0467     __u32 filter_entry_cnt, i;
0468 
0469     if (bperf_check_target(evsel, target, &filter_type, &filter_entry_cnt))
0470         return -1;
0471 
0472     if (!all_cpu_map) {
0473         all_cpu_map = perf_cpu_map__new(NULL);
0474         if (!all_cpu_map)
0475             return -1;
0476     }
0477 
0478     evsel->bperf_leader_prog_fd = -1;
0479     evsel->bperf_leader_link_fd = -1;
0480 
0481     /*
0482      * Step 1: hold an fd on the leader program and its bpf_link; if
0483      * the program is already gone, reload it.
0484      * Use flock() to ensure exclusive access to the perf_event_attr
0485      * map.
0486      */
0487     attr_map_fd = bperf_lock_attr_map(target);
0488     if (attr_map_fd < 0) {
0489         pr_err("Failed to lock perf_event_attr map\n");
0490         return -1;
0491     }
0492 
0493     err = bpf_map_lookup_elem(attr_map_fd, &evsel->core.attr, &entry);
0494     if (err) {
0495         err = bpf_map_update_elem(attr_map_fd, &evsel->core.attr, &entry, BPF_ANY);
0496         if (err)
0497             goto out;
0498     }
0499 
0500     evsel->bperf_leader_link_fd = bpf_link_get_fd_by_id(entry.link_id);
0501     if (evsel->bperf_leader_link_fd < 0 &&
0502         bperf_reload_leader_program(evsel, attr_map_fd, &entry)) {
0503         err = -1;
0504         goto out;
0505     }
0506     /*
0507      * The bpf_link holds reference to the leader program, and the
0508      * leader program holds reference to the maps. Therefore, if
0509      * link_id is valid, diff_map_id should also be valid.
0510      */
0511     evsel->bperf_leader_prog_fd = bpf_prog_get_fd_by_id(
0512         bpf_link_get_prog_id(evsel->bperf_leader_link_fd));
0513     assert(evsel->bperf_leader_prog_fd >= 0);
0514 
0515     diff_map_fd = bpf_map_get_fd_by_id(entry.diff_map_id);
0516     assert(diff_map_fd >= 0);
0517 
0518     /*
0519      * bperf uses BPF_PROG_TEST_RUN to get accurate readings. Check
0520      * whether the kernel supports it.
0521      */
0522     err = bperf_trigger_reading(evsel->bperf_leader_prog_fd, 0);
0523     if (err) {
0524         pr_err("The kernel does not support test_run for raw_tp BPF programs.\n"
0525                "Therefore, --use-bpf might show inaccurate readings\n");
0526         goto out;
0527     }
0528 
0529     /* Step 2: load the follower skeleton */
0530     evsel->follower_skel = bperf_follower_bpf__open();
0531     if (!evsel->follower_skel) {
0532         err = -1;
0533         pr_err("Failed to open follower skeleton\n");
0534         goto out;
0535     }
0536 
0537     /* attach fexit program to the leader program */
0538     bpf_program__set_attach_target(evsel->follower_skel->progs.fexit_XXX,
0539                        evsel->bperf_leader_prog_fd, "on_switch");
0540 
0541     /* connect to leader diff_reading map */
0542     bpf_map__reuse_fd(evsel->follower_skel->maps.diff_readings, diff_map_fd);
0543 
0544     /* set up reading map */
0545     bpf_map__set_max_entries(evsel->follower_skel->maps.accum_readings,
0546                  filter_entry_cnt);
0547     /* set up follower filter based on target */
0548     bpf_map__set_max_entries(evsel->follower_skel->maps.filter,
0549                  filter_entry_cnt);
0550     err = bperf_follower_bpf__load(evsel->follower_skel);
0551     if (err) {
0552         pr_err("Failed to load follower skeleton\n");
0553         bperf_follower_bpf__destroy(evsel->follower_skel);
0554         evsel->follower_skel = NULL;
0555         goto out;
0556     }
0557 
0558     for (i = 0; i < filter_entry_cnt; i++) {
0559         int filter_map_fd;
0560         __u32 key;
0561 
0562         if (filter_type == BPERF_FILTER_PID ||
0563             filter_type == BPERF_FILTER_TGID)
0564             key = evsel->core.threads->map[i].pid;
0565         else if (filter_type == BPERF_FILTER_CPU)
0566             key = evsel->core.cpus->map[i].cpu;
0567         else
0568             break;
0569 
0570         filter_map_fd = bpf_map__fd(evsel->follower_skel->maps.filter);
0571         bpf_map_update_elem(filter_map_fd, &key, &i, BPF_ANY);
0572     }
0573 
0574     evsel->follower_skel->bss->type = filter_type;
0575 
0576     err = bperf_follower_bpf__attach(evsel->follower_skel);
0577 
0578 out:
0579     if (err && evsel->bperf_leader_link_fd >= 0)
0580         close(evsel->bperf_leader_link_fd);
0581     if (err && evsel->bperf_leader_prog_fd >= 0)
0582         close(evsel->bperf_leader_prog_fd);
0583     if (diff_map_fd >= 0)
0584         close(diff_map_fd);
0585 
0586     flock(attr_map_fd, LOCK_UN);
0587     close(attr_map_fd);
0588 
0589     return err;
0590 }
0591 
0592 static int bperf__install_pe(struct evsel *evsel, int cpu_map_idx, int fd)
0593 {
0594     struct bperf_leader_bpf *skel = evsel->leader_skel;
0595 
0596     return bpf_map_update_elem(bpf_map__fd(skel->maps.events),
0597                    &cpu_map_idx, &fd, BPF_ANY);
0598 }
0599 
0600 /*
0601  * trigger the leader prog on each cpu, so the accum_readings map can get
0602  * the latest readings.
0603  */
0604 static int bperf_sync_counters(struct evsel *evsel)
0605 {
0606     int num_cpu, i, cpu;
0607 
0608     num_cpu = all_cpu_map->nr;
0609     for (i = 0; i < num_cpu; i++) {
0610         cpu = all_cpu_map->map[i].cpu;
0611         bperf_trigger_reading(evsel->bperf_leader_prog_fd, cpu);
0612     }
0613     return 0;
0614 }
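bperf_trigger_reading() is declared in bpf_counter.h rather than in this file; conceptually it is a BPF_PROG_TEST_RUN of the leader program pinned to one CPU. A minimal libbpf sketch of that idea (the function name here is illustrative, not the perf tree's):

#include <bpf/bpf.h>
#include <linux/bpf.h>

/* Run @leader_prog_fd once on @cpu so the current perf_event deltas are
 * folded into the diff/accum maps before user space reads them. */
static int trigger_leader_on_cpu(int leader_prog_fd, int cpu)
{
    LIBBPF_OPTS(bpf_test_run_opts, opts,
                .flags = BPF_F_TEST_RUN_ON_CPU,
                .cpu = cpu);

    return bpf_prog_test_run_opts(leader_prog_fd, &opts);
}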
0615 
0616 static int bperf__enable(struct evsel *evsel)
0617 {
0618     evsel->follower_skel->bss->enabled = 1;
0619     return 0;
0620 }
0621 
0622 static int bperf__disable(struct evsel *evsel)
0623 {
0624     evsel->follower_skel->bss->enabled = 0;
0625     return 0;
0626 }
0627 
0628 static int bperf__read(struct evsel *evsel)
0629 {
0630     struct bperf_follower_bpf *skel = evsel->follower_skel;
0631     __u32 num_cpu_bpf = cpu__max_cpu().cpu;
0632     struct bpf_perf_event_value values[num_cpu_bpf];
0633     struct perf_counts_values *counts;
0634     int reading_map_fd, err = 0;
0635     __u32 i;
0636     int j;
0637 
0638     bperf_sync_counters(evsel);
0639     reading_map_fd = bpf_map__fd(skel->maps.accum_readings);
0640 
0641     for (i = 0; i < bpf_map__max_entries(skel->maps.accum_readings); i++) {
0642         struct perf_cpu entry;
0643         __u32 cpu;
0644 
0645         err = bpf_map_lookup_elem(reading_map_fd, &i, values);
0646         if (err)
0647             goto out;
0648         switch (evsel->follower_skel->bss->type) {
0649         case BPERF_FILTER_GLOBAL:
0650             assert(i == 0);
0651 
0652             perf_cpu_map__for_each_cpu(entry, j, evsel__cpus(evsel)) {
0653                 counts = perf_counts(evsel->counts, j, 0);
0654                 counts->val = values[entry.cpu].counter;
0655                 counts->ena = values[entry.cpu].enabled;
0656                 counts->run = values[entry.cpu].running;
0657             }
0658             break;
0659         case BPERF_FILTER_CPU:
0660             cpu = perf_cpu_map__cpu(evsel__cpus(evsel), i).cpu;
0661             assert(cpu >= 0);
0662             counts = perf_counts(evsel->counts, i, 0);
0663             counts->val = values[cpu].counter;
0664             counts->ena = values[cpu].enabled;
0665             counts->run = values[cpu].running;
0666             break;
0667         case BPERF_FILTER_PID:
0668         case BPERF_FILTER_TGID:
0669             counts = perf_counts(evsel->counts, 0, i);
0670             counts->val = 0;
0671             counts->ena = 0;
0672             counts->run = 0;
0673 
0674             for (cpu = 0; cpu < num_cpu_bpf; cpu++) {
0675                 counts->val += values[cpu].counter;
0676                 counts->ena += values[cpu].enabled;
0677                 counts->run += values[cpu].running;
0678             }
0679             break;
0680         default:
0681             break;
0682         }
0683     }
0684 out:
0685     return err;
0686 }
0687 
0688 static int bperf__destroy(struct evsel *evsel)
0689 {
0690     bperf_follower_bpf__destroy(evsel->follower_skel);
0691     close(evsel->bperf_leader_prog_fd);
0692     close(evsel->bperf_leader_link_fd);
0693     return 0;
0694 }
0695 
0696 /*
0697  * bperf: share hardware PMCs with BPF
0698  *
0699  * perf uses performance monitoring counters (PMC) to monitor system
0700  * performance. The PMCs are limited hardware resources. For example,
0701  * Intel CPUs have 3x fixed PMCs and 4x programmable PMCs per cpu.
0702  *
0703  * Modern data center systems use these PMCs in many different ways:
0704  * system level monitoring, (maybe nested) container level monitoring, per
0705  * process monitoring, profiling (in sample mode), etc. In some cases,
0706  * there are more active perf_events than available hardware PMCs. To allow
0707  * all perf_events to have a chance to run, it is necessary to do expensive
0708  * time multiplexing of events.
0709  *
0710  * On the other hand, many monitoring tools count the common metrics
0711  * (cycles, instructions). It is a waste to have multiple tools create
0712  * multiple perf_events of "cycles" and occupy multiple PMCs.
0713  *
0714  * bperf tries to reduce such wastes by allowing multiple perf_events of
0715  * "cycles" or "instructions" (at different scopes) to share PMUs. Instead
0716  * of having each perf-stat session to read its own perf_events, bperf uses
0717  * BPF programs to read the perf_events and aggregate readings to BPF maps.
0718  * Then, the perf-stat session(s) reads the values from these BPF maps.
0719  *
0720  *                                ||
0721  *       shared progs and maps <- || -> per session progs and maps
0722  *                                ||
0723  *   ---------------              ||
0724  *   | perf_events |              ||
0725  *   ---------------       fexit  ||      -----------------
0726  *          |             --------||----> | follower prog |
0727  *       --------------- /        || ---  -----------------
0728  * cs -> | leader prog |/         ||/        |         |
0729  *   --> ---------------         /||  --------------  ------------------
0730  *  /       |         |         / ||  | filter map |  | accum_readings |
0731  * /  ------------  ------------  ||  --------------  ------------------
0732  * |  | prev map |  | diff map |  ||                        |
0733  * |  ------------  ------------  ||                        |
0734  *  \                             ||                        |
0735  * = \ ==================================================== | ============
0736  *    \                                                    /   user space
0737  *     \                                                  /
0738  *      \                                                /
0739  *    BPF_PROG_TEST_RUN                    BPF_MAP_LOOKUP_ELEM
0740  *        \                                            /
0741  *         \                                          /
0742  *          \------  perf-stat ----------------------/
0743  *
0744  * The figure above shows the architecture of bperf. Note that the figure
0745  * is divided into 3 regions: shared progs and maps (top left), per session
0746  * progs and maps (top right), and user space (bottom).
0747  *
0748  * The leader prog is triggered on each context switch (cs). The leader
0749  * prog reads perf_events and stores the difference (current_reading -
0750  * previous_reading) to the diff map. For the same metric, e.g. "cycles",
0751  * multiple perf-stat sessions share the same leader prog.
0752  *
0753  * Each perf-stat session creates a follower prog as fexit program to the
0754  * leader prog. It is possible to attach up to BPF_MAX_TRAMP_PROGS (38)
0755  * follower progs to the same leader prog. The follower prog checks current
0756  * task and processor ID to decide whether to add the value from the diff
0757  * map to its accumulated reading map (accum_readings).
0758  *
0759  * Finally, perf-stat user space reads the value from the accum_readings map.
0760  *
0761  * Besides context switch, it is also necessary to trigger the leader prog
0762  * before perf-stat reads the value. Otherwise, the accum_readings map may
0763  * not have the latest reading from the perf_events. This is achieved by
0764  * triggering the event via sys_bpf(BPF_PROG_TEST_RUN) to each CPU.
0765  *
0766  * Comment before the definition of struct perf_event_attr_map_entry
0767  * describes how different sessions of perf-stat share information about
0768  * the leader prog.
0769  */
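As a concrete (hedged) illustration of the sharing described above: two independent perf stat sessions counting the same event can end up behind one leader program, assuming the stock --bpf-counters and --bpf-attr-map options:

/*
 *   # terminal 1: system-wide cycles, BPF-managed
 *   perf stat --bpf-counters -e cycles -a -- sleep 10
 *
 *   # terminal 2: same metric for one pid; finds the existing leader prog
 *   # through the pinned attr map instead of taking another hardware PMC
 *   perf stat --bpf-counters -e cycles -p 1234 -- sleep 10
 *
 *   # a non-default pinned attr map can be supplied explicitly
 *   perf stat --bpf-counters --bpf-attr-map /sys/fs/bpf/my_attr_map -e cycles -a
 */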
0770 
0771 struct bpf_counter_ops bperf_ops = {
0772     .load       = bperf__load,
0773     .enable     = bperf__enable,
0774     .disable    = bperf__disable,
0775     .read       = bperf__read,
0776     .install_pe = bperf__install_pe,
0777     .destroy    = bperf__destroy,
0778 };
0779 
0780 extern struct bpf_counter_ops bperf_cgrp_ops;
0781 
0782 static inline bool bpf_counter_skip(struct evsel *evsel)
0783 {
0784     return list_empty(&evsel->bpf_counter_list) &&
0785         evsel->follower_skel == NULL;
0786 }
0787 
0788 int bpf_counter__install_pe(struct evsel *evsel, int cpu_map_idx, int fd)
0789 {
0790     if (bpf_counter_skip(evsel))
0791         return 0;
0792     return evsel->bpf_counter_ops->install_pe(evsel, cpu_map_idx, fd);
0793 }
0794 
0795 int bpf_counter__load(struct evsel *evsel, struct target *target)
0796 {
0797     if (target->bpf_str)
0798         evsel->bpf_counter_ops = &bpf_program_profiler_ops;
0799     else if (cgrp_event_expanded && target->use_bpf)
0800         evsel->bpf_counter_ops = &bperf_cgrp_ops;
0801     else if (target->use_bpf || evsel->bpf_counter ||
0802          evsel__match_bpf_counter_events(evsel->name))
0803         evsel->bpf_counter_ops = &bperf_ops;
0804 
0805     if (evsel->bpf_counter_ops)
0806         return evsel->bpf_counter_ops->load(evsel, target);
0807     return 0;
0808 }
0809 
0810 int bpf_counter__enable(struct evsel *evsel)
0811 {
0812     if (bpf_counter_skip(evsel))
0813         return 0;
0814     return evsel->bpf_counter_ops->enable(evsel);
0815 }
0816 
0817 int bpf_counter__disable(struct evsel *evsel)
0818 {
0819     if (bpf_counter_skip(evsel))
0820         return 0;
0821     return evsel->bpf_counter_ops->disable(evsel);
0822 }
0823 
0824 int bpf_counter__read(struct evsel *evsel)
0825 {
0826     if (bpf_counter_skip(evsel))
0827         return -EAGAIN;
0828     return evsel->bpf_counter_ops->read(evsel);
0829 }
0830 
0831 void bpf_counter__destroy(struct evsel *evsel)
0832 {
0833     if (bpf_counter_skip(evsel))
0834         return;
0835     evsel->bpf_counter_ops->destroy(evsel);
0836     evsel->bpf_counter_ops = NULL;
0837 }
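Taken together, the wrappers above are the only entry points the rest of perf needs: bpf_counter__load() picks an ops table based on the target, and the normal evsel lifecycle (open, which hands each perf_event fd to install_pe, then enable, read, disable and destroy) is routed through evsel->bpf_counter_ops. The actual call sites live outside this file, in the evsel and perf stat code, so this ordering is a reader's summary rather than something enforced here.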