Back to home page

OSCL-LXR

 
 

    


0001 // SPDX-License-Identifier: GPL-2.0
0002 // Copyright (c) 2019 Facebook
0003 
0004 #include <stdint.h>
0005 #include <stddef.h>
0006 #include <stdbool.h>
0007 #include <linux/bpf.h>
0008 #include <linux/ptrace.h>
0009 #include <linux/sched.h>
0010 #include <linux/types.h>
0011 #include <bpf/bpf_helpers.h>
0012 
/* Process id type used as the key when looking up per-process configuration. */
typedef uint32_t pid_t;

/*
 * Opaque stand-in: the real task_struct layout is not available to this
 * program, so pointers to it are only ever passed around, never dereferenced.
 */
struct task_struct {};
0015 
/* Size of the command-name buffer stored in every sample. */
#define TASK_COMM_LEN 16
/* Number of 64-bit frames in one stack-trace map value. */
#define PERF_MAX_STACK_DEPTH 127

/* Kinds of metavariables. */
#define STROBE_TYPE_INVALID 0
#define STROBE_TYPE_INT 1
#define STROBE_TYPE_STR 2
#define STROBE_TYPE_MAP 3

/* Bit of the ktime (ns) value that selects which stack table is used. */
#define STACK_TABLE_EPOCH_SHIFT 20
/* Size limit handed to every string read into the payload. */
#define STROBE_MAX_STR_LEN 1
/* Capacity of the per-process configuration map. */
#define STROBE_MAX_CFGS 32
/*
 * Worst-case payload size: one string per string metavar plus, for every map
 * metavar, one tag string and a key/value string pair per entry.
 * STROBE_MAX_STRS, STROBE_MAX_MAPS and STROBE_MAX_MAP_ENTRIES are not defined
 * in this file and must be provided before this macro is expanded.
 */
#define STROBE_MAX_PAYLOAD                                              \
    (STROBE_MAX_STRS * STROBE_MAX_STR_LEN +                             \
     STROBE_MAX_MAPS * (1 + STROBE_MAX_MAP_ENTRIES * 2) * STROBE_MAX_STR_LEN)
0030 
/* Fixed-size prefix placed in front of every metavariable value. */
struct strobe_value_header {
    /*
     * Interpretation depends on the value type:
     *  - int: 0 when the value is not set, 1 otherwise;
     *  - str: always 1; whether a value is set is decided by the pointer;
     *  - map: always 1; the pointer refers to a separate struct holding
     *    the number of entries (at most STROBE_MAX_MAP_ENTRIES).
     */
    uint16_t len;
    /*
     * Room for future fields/flags. The header has to stay exactly
     * 8 bytes so that BPF can fetch header and value together with a
     * single 16-byte read.
     */
    uint8_t _reserved[6];
};
0047 
0048 /*
0049  * strobe_value_generic is used from BPF probe only, but needs to be a union
0050  * of strobe_value_int/strobe_value_str/strobe_value_map
0051  */
0052 struct strobe_value_generic {
0053     struct strobe_value_header header;
0054     union {
0055         int64_t val;
0056         void *ptr;
0057     };
0058 };
0059 
0060 struct strobe_value_int {
0061     struct strobe_value_header header;
0062     int64_t value;
0063 };
0064 
0065 struct strobe_value_str {
0066     struct strobe_value_header header;
0067     const char* value;
0068 };
0069 
0070 struct strobe_value_map {
0071     struct strobe_value_header header;
0072     const struct strobe_map_raw* value;
0073 };
0074 
/* One key/value pair of a map; both members point to C strings. */
struct strobe_map_entry {
    const char *key;
    const char *val;
};
0079 
0080 /*
0081  * Map of C-string key/value pairs with fixed maximum capacity. Each map has
0082  * corresponding int64 ID, which application can use (or ignore) in whatever
0083  * way appropriate. Map is "write-only", there is no way to get data out of
0084  * map. Map is intended to be used to provide metadata for profilers and is
0085  * not to be used for internal in-app communication. All methods are
0086  * thread-safe.
0087  */
0088 struct strobe_map_raw {
0089     /*
0090      * general purpose unique ID that's up to application to decide
0091      * whether and how to use; for request metadata use case id is unique
0092      * request ID that's used to match metadata with stack traces on
0093      * Strobelight backend side
0094      */
0095     int64_t id;
0096     /* number of used entries in map */
0097     int64_t cnt;
0098     /*
0099      * having volatile doesn't change anything on BPF side, but clang
0100      * emits warnings for passing `volatile const char *` into
0101      * bpf_probe_read_user_str that expects just `const char *`
0102      */
0103     const char* tag;
0104     /*
0105      * key/value entries, each consisting of 2 pointers to key and value
0106      * C strings
0107      */
0108     struct strobe_map_entry entries[STROBE_MAX_MAP_ENTRIES];
0109 };
0110 
/* Supported values of the TLS mode stored in strobe_value_loc.tls_mode. */
#define TLS_NOT_SET (-1)
#define TLS_LOCAL_EXEC 0
#define TLS_IMM_EXEC 1
#define TLS_GENERAL_DYN 2
0116 
/*
 * Universal description of where a metavariable lives in TLS; covers both
 * static executables and shared libraries.
 */
struct strobe_value_loc {
    /*
     * TLS mode used for this particular metavariable:
     *  - -1 (TLS_NOT_SET)     - no metavariable;
     *  -  0 (TLS_LOCAL_EXEC)  - Local Executable mode;
     *  -  1 (TLS_IMM_EXEC)    - Immediate Executable mode;
     *  -  2 (TLS_GENERAL_DYN) - General Dynamic mode.
     * Local Dynamic mode is not supported yet, because it has never been
     * seen in practice. The mode determines how the offset field is
     * interpreted; see calc_location() below for details.
     */
    int64_t tls_mode;
    /*
     * TLS_LOCAL_EXEC: offset from the thread pointer (fs:0 on x86-64,
     * tpidr_el0 on aarch64);
     * TLS_IMM_EXEC: absolute address of the GOT entry that contains the
     * offset from the thread pointer;
     * TLS_GENERAL_DYN: absolute address of the double GOT entry that
     * contains the tls_index_t struct.
     */
    int64_t offset;
};
0143 
0144 struct strobemeta_cfg {
0145     int64_t req_meta_idx;
0146     struct strobe_value_loc int_locs[STROBE_MAX_INTS];
0147     struct strobe_value_loc str_locs[STROBE_MAX_STRS];
0148     struct strobe_value_loc map_locs[STROBE_MAX_MAPS];
0149 };
0150 
0151 struct strobe_map_descr {
0152     uint64_t id;
0153     int16_t tag_len;
0154     /*
0155      * cnt <0 - map value isn't set;
0156      * 0 - map has id set, but no key/value entries
0157      */
0158     int16_t cnt;
0159     /*
0160      * both key_lens[i] and val_lens[i] should be >0 for present key/value
0161      * entry
0162      */
0163     uint16_t key_lens[STROBE_MAX_MAP_ENTRIES];
0164     uint16_t val_lens[STROBE_MAX_MAP_ENTRIES];
0165 };
0166 
0167 struct strobemeta_payload {
0168     /* req_id has valid request ID, if req_meta_valid == 1 */
0169     int64_t req_id;
0170     uint8_t req_meta_valid;
0171     /*
0172      * mask has Nth bit set to 1, if Nth metavar was present and
0173      * successfully read
0174      */
0175     uint64_t int_vals_set_mask;
0176     int64_t int_vals[STROBE_MAX_INTS];
0177     /* len is >0 for present values */
0178     uint16_t str_lens[STROBE_MAX_STRS];
0179     /* if map_descrs[i].cnt == -1, metavar is not present/set */
0180     struct strobe_map_descr map_descrs[STROBE_MAX_MAPS];
0181     /*
0182      * payload has compactly packed values of str and map variables in the
0183      * form: strval1\0strval2\0map1key1\0map1val1\0map2key1\0map2val1\0
0184      * (and so on); str_lens[i], key_lens[i] and val_lens[i] determines
0185      * value length
0186      */
0187     char payload[STROBE_MAX_PAYLOAD];
0188 };
0189 
0190 struct strobelight_bpf_sample {
0191     uint64_t ktime;
0192     char comm[TASK_COMM_LEN];
0193     pid_t pid;
0194     int user_stack_id;
0195     int kernel_stack_id;
0196     int has_meta;
0197     struct strobemeta_payload metadata;
0198     /*
0199      * makes it possible to pass (<real payload size> + 1) as data size to
0200      * perf_submit() to avoid perf_submit's paranoia about passing zero as
0201      * size, as it deduces that <real payload size> might be
0202      * **theoretically** zero
0203      */
0204     char dummy_safeguard;
0205 };
0206 
0207 struct {
0208     __uint(type, BPF_MAP_TYPE_PERF_EVENT_ARRAY);
0209     __uint(max_entries, 32);
0210     __uint(key_size, sizeof(int));
0211     __uint(value_size, sizeof(int));
0212 } samples SEC(".maps");
0213 
0214 struct {
0215     __uint(type, BPF_MAP_TYPE_STACK_TRACE);
0216     __uint(max_entries, 16);
0217     __uint(key_size, sizeof(uint32_t));
0218     __uint(value_size, sizeof(uint64_t) * PERF_MAX_STACK_DEPTH);
0219 } stacks_0 SEC(".maps");
0220 
0221 struct {
0222     __uint(type, BPF_MAP_TYPE_STACK_TRACE);
0223     __uint(max_entries, 16);
0224     __uint(key_size, sizeof(uint32_t));
0225     __uint(value_size, sizeof(uint64_t) * PERF_MAX_STACK_DEPTH);
0226 } stacks_1 SEC(".maps");
0227 
0228 struct {
0229     __uint(type, BPF_MAP_TYPE_PERCPU_ARRAY);
0230     __uint(max_entries, 1);
0231     __type(key, uint32_t);
0232     __type(value, struct strobelight_bpf_sample);
0233 } sample_heap SEC(".maps");
0234 
0235 struct {
0236     __uint(type, BPF_MAP_TYPE_PERCPU_ARRAY);
0237     __uint(max_entries, STROBE_MAX_CFGS);
0238     __type(key, pid_t);
0239     __type(value, struct strobemeta_cfg);
0240 } strobemeta_cfgs SEC(".maps");
0241 
/*
 * Element type of the dynamic thread vector (dtv); mirrors the definition at
 * https://github.com/lattera/glibc/blob/master/nptl/sysdeps/x86_64/tls.h#L34
 */
typedef union dtv {
    size_t counter;
    struct {
        void *val;
        bool is_static;
    } pointer;
} dtv_t;
0251 
0252 /* Partial definition for tcbhead_t */
0253 /* https://github.com/bminor/glibc/blob/master/sysdeps/x86_64/nptl/tls.h#L42 */
0254 struct tcbhead {
0255     void* tcb;
0256     dtv_t* dtv;
0257 };
0258 
/*
 * TLS module/offset pair used in the shared-library case.
 * On x86-64 it is mapped directly onto two GOT entries.
 * On aarch64 the second GOT entry points to it.
 */
struct tls_index {
    uint64_t module;
    uint64_t offset;
};
0268 
0269 #ifdef SUBPROGS
0270 __noinline
0271 #else
0272 __always_inline
0273 #endif
0274 static void *calc_location(struct strobe_value_loc *loc, void *tls_base)
0275 {
0276     /*
0277      * tls_mode value is:
0278      * - -1 (TLS_NOT_SET), if no metavar is present;
0279      * - 0 (TLS_LOCAL_EXEC), if metavar uses Local Executable mode of TLS
0280      * (offset from fs:0 for x86-64 or tpidr_el0 for aarch64);
0281      * - 1 (TLS_IMM_EXEC), if metavar uses Immediate Executable mode of TLS;
0282      * - 2 (TLS_GENERAL_DYN), if metavar uses General Dynamic mode of TLS;
0283      * This schema allows to use something like:
0284      * (tls_mode + 1) * (tls_base + offset)
0285      * to get NULL for "no metavar" location, or correct pointer for local
0286      * executable mode without doing extra ifs.
0287      */
0288     if (loc->tls_mode <= TLS_LOCAL_EXEC) {
0289         /* static executable is simple, we just have offset from
0290          * tls_base */
0291         void *addr = tls_base + loc->offset;
0292         /* multiply by (tls_mode + 1) to get NULL, if we have no
0293          * metavar in this slot */
0294         return (void *)((loc->tls_mode + 1) * (int64_t)addr);
0295     }
0296     /*
0297      * Other modes are more complicated, we need to jump through few hoops.
0298      *
0299      * For immediate executable mode (currently supported only for aarch64):
0300      *  - loc->offset is pointing to a GOT entry containing fixed offset
0301      *  relative to tls_base;
0302      *
0303      * For general dynamic mode:
0304      *  - loc->offset is pointing to a beginning of double GOT entries;
0305      *  - (for aarch64 only) second entry points to tls_index_t struct;
0306      *  - (for x86-64 only) two GOT entries are already tls_index_t;
0307      *  - tls_index_t->module is used to find start of TLS section in
0308      *  which variable resides;
0309      *  - tls_index_t->offset provides offset within that TLS section,
0310      *  pointing to value of variable.
0311      */
0312     struct tls_index tls_index;
0313     dtv_t *dtv;
0314     void *tls_ptr;
0315 
0316     bpf_probe_read_user(&tls_index, sizeof(struct tls_index),
0317                 (void *)loc->offset);
0318     /* valid module index is always positive */
0319     if (tls_index.module > 0) {
0320         /* dtv = ((struct tcbhead *)tls_base)->dtv[tls_index.module] */
0321         bpf_probe_read_user(&dtv, sizeof(dtv),
0322                     &((struct tcbhead *)tls_base)->dtv);
0323         dtv += tls_index.module;
0324     } else {
0325         dtv = NULL;
0326     }
0327     bpf_probe_read_user(&tls_ptr, sizeof(void *), dtv);
0328     /* if pointer has (void *)-1 value, then TLS wasn't initialized yet */
0329     return tls_ptr && tls_ptr != (void *)-1
0330         ? tls_ptr + tls_index.offset
0331         : NULL;
0332 }
0333 
0334 #ifdef SUBPROGS
0335 __noinline
0336 #else
0337 __always_inline
0338 #endif
0339 static void read_int_var(struct strobemeta_cfg *cfg,
0340              size_t idx, void *tls_base,
0341              struct strobe_value_generic *value,
0342              struct strobemeta_payload *data)
0343 {
0344     void *location = calc_location(&cfg->int_locs[idx], tls_base);
0345     if (!location)
0346         return;
0347 
0348     bpf_probe_read_user(value, sizeof(struct strobe_value_generic), location);
0349     data->int_vals[idx] = value->val;
0350     if (value->header.len)
0351         data->int_vals_set_mask |= (1 << idx);
0352 }
0353 
0354 static __always_inline uint64_t read_str_var(struct strobemeta_cfg *cfg,
0355                          size_t idx, void *tls_base,
0356                          struct strobe_value_generic *value,
0357                          struct strobemeta_payload *data,
0358                          void *payload)
0359 {
0360     void *location;
0361     uint64_t len;
0362 
0363     data->str_lens[idx] = 0;
0364     location = calc_location(&cfg->str_locs[idx], tls_base);
0365     if (!location)
0366         return 0;
0367 
0368     bpf_probe_read_user(value, sizeof(struct strobe_value_generic), location);
0369     len = bpf_probe_read_user_str(payload, STROBE_MAX_STR_LEN, value->ptr);
0370     /*
0371      * if bpf_probe_read_user_str returns error (<0), due to casting to
0372      * unsinged int, it will become big number, so next check is
0373      * sufficient to check for errors AND prove to BPF verifier, that
0374      * bpf_probe_read_user_str won't return anything bigger than
0375      * STROBE_MAX_STR_LEN
0376      */
0377     if (len > STROBE_MAX_STR_LEN)
0378         return 0;
0379 
0380     data->str_lens[idx] = len;
0381     return len;
0382 }
0383 
0384 static __always_inline void *read_map_var(struct strobemeta_cfg *cfg,
0385                       size_t idx, void *tls_base,
0386                       struct strobe_value_generic *value,
0387                       struct strobemeta_payload *data,
0388                       void *payload)
0389 {
0390     struct strobe_map_descr* descr = &data->map_descrs[idx];
0391     struct strobe_map_raw map;
0392     void *location;
0393     uint64_t len;
0394     int i;
0395 
0396     descr->tag_len = 0; /* presume no tag is set */
0397     descr->cnt = -1; /* presume no value is set */
0398 
0399     location = calc_location(&cfg->map_locs[idx], tls_base);
0400     if (!location)
0401         return payload;
0402 
0403     bpf_probe_read_user(value, sizeof(struct strobe_value_generic), location);
0404     if (bpf_probe_read_user(&map, sizeof(struct strobe_map_raw), value->ptr))
0405         return payload;
0406 
0407     descr->id = map.id;
0408     descr->cnt = map.cnt;
0409     if (cfg->req_meta_idx == idx) {
0410         data->req_id = map.id;
0411         data->req_meta_valid = 1;
0412     }
0413 
0414     len = bpf_probe_read_user_str(payload, STROBE_MAX_STR_LEN, map.tag);
0415     if (len <= STROBE_MAX_STR_LEN) {
0416         descr->tag_len = len;
0417         payload += len;
0418     }
0419 
0420 #ifdef NO_UNROLL
0421 #pragma clang loop unroll(disable)
0422 #else
0423 #pragma unroll
0424 #endif
0425     for (int i = 0; i < STROBE_MAX_MAP_ENTRIES; ++i) {
0426         if (i >= map.cnt)
0427             break;
0428 
0429         descr->key_lens[i] = 0;
0430         len = bpf_probe_read_user_str(payload, STROBE_MAX_STR_LEN,
0431                           map.entries[i].key);
0432         if (len <= STROBE_MAX_STR_LEN) {
0433             descr->key_lens[i] = len;
0434             payload += len;
0435         }
0436         descr->val_lens[i] = 0;
0437         len = bpf_probe_read_user_str(payload, STROBE_MAX_STR_LEN,
0438                           map.entries[i].val);
0439         if (len <= STROBE_MAX_STR_LEN) {
0440             descr->val_lens[i] = len;
0441             payload += len;
0442         }
0443     }
0444 
0445     return payload;
0446 }
0447 
#ifdef USE_BPF_LOOP
/* Selects which kind of metavariable read_var_callback() reads. */
enum read_type {
    READ_INT_VAR,
    READ_MAP_VAR,
    READ_STR_VAR,
};

/* State shared between read_strobe_meta() and read_var_callback(). */
struct read_var_ctx {
    struct strobemeta_payload *data;
    void *tls_base;
    struct strobemeta_cfg *cfg;
    /* current write position; advanced by string and map reads */
    void *payload;
    /* value gets mutated */
    struct strobe_value_generic *value;
    enum read_type type;
};

/*
 * bpf_loop() callback: reads the metavariable with the given index of the
 * kind selected by ctx->type.
 *
 * Returns 1 when index is out of range for that kind (which asks bpf_loop()
 * to stop iterating) and 0 otherwise.
 */
static int read_var_callback(__u32 index, struct read_var_ctx *ctx)
{
    switch (ctx->type) {
    case READ_INT_VAR:
        if (index >= STROBE_MAX_INTS)
            return 1;
        read_int_var(ctx->cfg, index, ctx->tls_base, ctx->value,
                 ctx->data);
        break;
    case READ_MAP_VAR:
        if (index >= STROBE_MAX_MAPS)
            return 1;
        /* the map reader hands back the new write position */
        ctx->payload = read_map_var(ctx->cfg, index, ctx->tls_base,
                        ctx->value, ctx->data,
                        ctx->payload);
        break;
    case READ_STR_VAR:
        if (index >= STROBE_MAX_STRS)
            return 1;
        /* the string reader hands back the number of bytes stored */
        ctx->payload += read_str_var(ctx->cfg, index, ctx->tls_base,
                         ctx->value, ctx->data,
                         ctx->payload);
        break;
    default:
        break;
    }

    return 0;
}
#endif /* USE_BPF_LOOP */
0489 
0490 /*
0491  * read_strobe_meta returns NULL, if no metadata was read; otherwise returns
0492  * pointer to *right after* payload ends
0493  */
0494 #ifdef SUBPROGS
0495 __noinline
0496 #else
0497 __always_inline
0498 #endif
0499 static void *read_strobe_meta(struct task_struct *task,
0500                   struct strobemeta_payload *data)
0501 {
0502     pid_t pid = bpf_get_current_pid_tgid() >> 32;
0503     struct strobe_value_generic value = {0};
0504     struct strobemeta_cfg *cfg;
0505     void *tls_base, *payload;
0506 
0507     cfg = bpf_map_lookup_elem(&strobemeta_cfgs, &pid);
0508     if (!cfg)
0509         return NULL;
0510 
0511     data->int_vals_set_mask = 0;
0512     data->req_meta_valid = 0;
0513     payload = data->payload;
0514     /*
0515      * we don't have struct task_struct definition, it should be:
0516      * tls_base = (void *)task->thread.fsbase;
0517      */
0518     tls_base = (void *)task;
0519 
0520 #ifdef USE_BPF_LOOP
0521     struct read_var_ctx ctx = {
0522         .cfg = cfg,
0523         .tls_base = tls_base,
0524         .value = &value,
0525         .data = data,
0526         .payload = payload,
0527     };
0528     int err;
0529 
0530     ctx.type = READ_INT_VAR;
0531     err = bpf_loop(STROBE_MAX_INTS, read_var_callback, &ctx, 0);
0532     if (err != STROBE_MAX_INTS)
0533         return NULL;
0534 
0535     ctx.type = READ_STR_VAR;
0536     err = bpf_loop(STROBE_MAX_STRS, read_var_callback, &ctx, 0);
0537     if (err != STROBE_MAX_STRS)
0538         return NULL;
0539 
0540     ctx.type = READ_MAP_VAR;
0541     err = bpf_loop(STROBE_MAX_MAPS, read_var_callback, &ctx, 0);
0542     if (err != STROBE_MAX_MAPS)
0543         return NULL;
0544 #else
0545 #ifdef NO_UNROLL
0546 #pragma clang loop unroll(disable)
0547 #else
0548 #pragma unroll
0549 #endif /* NO_UNROLL */
0550     for (int i = 0; i < STROBE_MAX_INTS; ++i) {
0551         read_int_var(cfg, i, tls_base, &value, data);
0552     }
0553 #ifdef NO_UNROLL
0554 #pragma clang loop unroll(disable)
0555 #else
0556 #pragma unroll
0557 #endif /* NO_UNROLL */
0558     for (int i = 0; i < STROBE_MAX_STRS; ++i) {
0559         payload += read_str_var(cfg, i, tls_base, &value, data, payload);
0560     }
0561 #ifdef NO_UNROLL
0562 #pragma clang loop unroll(disable)
0563 #else
0564 #pragma unroll
0565 #endif /* NO_UNROLL */
0566     for (int i = 0; i < STROBE_MAX_MAPS; ++i) {
0567         payload = read_map_var(cfg, i, tls_base, &value, data, payload);
0568     }
0569 #endif /* USE_BPF_LOOP */
0570 
0571     /*
0572      * return pointer right after end of payload, so it's possible to
0573      * calculate exact amount of useful data that needs to be sent
0574      */
0575     return payload;
0576 }
0577 
0578 SEC("raw_tracepoint/kfree_skb")
0579 int on_event(struct pt_regs *ctx) {
0580     pid_t pid =  bpf_get_current_pid_tgid() >> 32;
0581     struct strobelight_bpf_sample* sample;
0582     struct task_struct *task;
0583     uint32_t zero = 0;
0584     uint64_t ktime_ns;
0585     void *sample_end;
0586 
0587     sample = bpf_map_lookup_elem(&sample_heap, &zero);
0588     if (!sample)
0589         return 0; /* this will never happen */
0590 
0591     sample->pid = pid;
0592     bpf_get_current_comm(&sample->comm, TASK_COMM_LEN);
0593     ktime_ns = bpf_ktime_get_ns();
0594     sample->ktime = ktime_ns;
0595 
0596     task = (struct task_struct *)bpf_get_current_task();
0597     sample_end = read_strobe_meta(task, &sample->metadata);
0598     sample->has_meta = sample_end != NULL;
0599     sample_end = sample_end ? : &sample->metadata;
0600 
0601     if ((ktime_ns >> STACK_TABLE_EPOCH_SHIFT) & 1) {
0602         sample->kernel_stack_id = bpf_get_stackid(ctx, &stacks_1, 0);
0603         sample->user_stack_id = bpf_get_stackid(ctx, &stacks_1, BPF_F_USER_STACK);
0604     } else {
0605         sample->kernel_stack_id = bpf_get_stackid(ctx, &stacks_0, 0);
0606         sample->user_stack_id = bpf_get_stackid(ctx, &stacks_0, BPF_F_USER_STACK);
0607     }
0608 
0609     uint64_t sample_size = sample_end - (void *)sample;
0610     /* should always be true */
0611     if (sample_size < sizeof(struct strobelight_bpf_sample))
0612         bpf_perf_event_output(ctx, &samples, 0, sample, 1 + sample_size);
0613     return 0;
0614 }
0615 
0616 char _license[] SEC("license") = "GPL";