#include <linux/bpf.h>
#include <linux/btf.h>
#include <linux/err.h>
#include <linux/irq_work.h>
#include <linux/slab.h>
#include <linux/filter.h>
#include <linux/mm.h>
#include <linux/vmalloc.h>
#include <linux/wait.h>
#include <linux/poll.h>
#include <linux/kmemleak.h>
#include <uapi/linux/btf.h>
#include <linux/btf_ids.h>

#define RINGBUF_CREATE_FLAG_MASK (BPF_F_NUMA_NODE)

/* non-mmap()'able part of bpf_ringbuf (everything up to consumer page) */
#define RINGBUF_PGOFF \
    (offsetof(struct bpf_ringbuf, consumer_pos) >> PAGE_SHIFT)
/* consumer page and producer page */
#define RINGBUF_POS_PAGES 2

#define RINGBUF_MAX_RECORD_SZ (UINT_MAX/4)

/* Maximum size of the ring buffer data area is limited by the 32-bit page
 * offset within the record header, counted in pages. Reserve 8 bits for
 * extensibility, and take into account a few extra pages for the
 * consumer/producer pages and the non-mmap()'able parts. This gives a 64GB
 * limit, which seems plenty for a single ring buffer.
 */
#define RINGBUF_MAX_DATA_SZ \
    (((1ULL << 24) - RINGBUF_POS_PAGES - RINGBUF_PGOFF) * PAGE_SIZE)

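/* A quick sanity check of the limit above, assuming the common 4KB page
 * size: (1ULL << 24) pages * 4096 bytes/page = 1ULL << 36 = 64GB, minus a
 * handful of metadata pages. The 24-bit page budget is what remains of the
 * 32-bit pg_off field in the record header after reserving 8 bits.
 */
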
struct bpf_ringbuf {
    wait_queue_head_t waitq;
    struct irq_work work;
    u64 mask;
    struct page **pages;
    int nr_pages;
    spinlock_t spinlock ____cacheline_aligned_in_smp;
    /* Consumer and producer counters are put into separate pages to allow
     * mapping the consumer page as r/w while restricting the producer page
     * to r/o. This protects the producer position from being modified by a
     * user-space application, which would ruin in-kernel position tracking.
     */
    unsigned long consumer_pos __aligned(PAGE_SIZE);
    unsigned long producer_pos __aligned(PAGE_SIZE);
    char data[] __aligned(PAGE_SIZE);
};

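/* An illustrative page-level layout implied by the __aligned(PAGE_SIZE)
 * annotations above and the vmap()/mmap() logic below (a sketch, not a
 * layout guaranteed outside this file):
 *
 *    page 0 .. RINGBUF_PGOFF-1 : waitq, work, mask, pages, nr_pages,
 *                                spinlock; never exposed via mmap()
 *    page RINGBUF_PGOFF        : consumer_pos (user-space r/w)
 *    page RINGBUF_PGOFF + 1    : producer_pos (user-space r/o)
 *    page RINGBUF_PGOFF + 2 .. : data pages, physically mapped twice
 */
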
struct bpf_ringbuf_map {
    struct bpf_map map;
    struct bpf_ringbuf *rb;
};

/* 8-byte ring buffer record header structure */
struct bpf_ringbuf_hdr {
    u32 len;
    u32 pg_off;
};

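/* The top two bits of len carry record state: BPF_RINGBUF_BUSY_BIT marks a
 * record that is reserved but not yet committed, and BPF_RINGBUF_DISCARD_BIT
 * marks a committed record that the consumer should skip. This is why
 * RINGBUF_MAX_RECORD_SZ above is UINT_MAX/4: the actual record length has to
 * fit in the remaining 30 bits.
 */
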
static struct bpf_ringbuf *bpf_ringbuf_area_alloc(size_t data_sz, int numa_node)
{
    const gfp_t flags = GFP_KERNEL_ACCOUNT | __GFP_RETRY_MAYFAIL |
                __GFP_NOWARN | __GFP_ZERO;
    int nr_meta_pages = RINGBUF_PGOFF + RINGBUF_POS_PAGES;
    int nr_data_pages = data_sz >> PAGE_SHIFT;
    int nr_pages = nr_meta_pages + nr_data_pages;
    struct page **pages, *page;
    struct bpf_ringbuf *rb;
    size_t array_size;
    int i;

    /* Each data page is mapped twice to allow a "virtual" contiguous
     * read of samples that wrap around the end of the ring buffer area:
     * ------------------------------------------------------
     * | meta pages |  real data pages  |  same data pages  |
     * ------------------------------------------------------
     * |            | 1 2 3 4 5 6 7 8 9 | 1 2 3 4 5 6 7 8 9 |
     * ------------------------------------------------------
     * |            | TA             DA | TA             DA |
     * ------------------------------------------------------
     *                               ^^^^^^^
     *                                  |
     * Thanks to the double-mapped data pages, there is no need for special
     * handling of wrapped-around data. This works both in the kernel and
     * when mmap()'ed in user-space, significantly simplifying both
     * implementations.
     */
    array_size = (nr_meta_pages + 2 * nr_data_pages) * sizeof(*pages);
    pages = bpf_map_area_alloc(array_size, numa_node);
    if (!pages)
        return NULL;

    for (i = 0; i < nr_pages; i++) {
        page = alloc_pages_node(numa_node, flags, 0);
        if (!page) {
            nr_pages = i;
            goto err_free_pages;
        }
        pages[i] = page;
        if (i >= nr_meta_pages)
            pages[nr_data_pages + i] = page;
    }

    rb = vmap(pages, nr_meta_pages + 2 * nr_data_pages,
          VM_MAP | VM_USERMAP, PAGE_KERNEL);
    if (rb) {
        kmemleak_not_leak(pages);
        rb->pages = pages;
        rb->nr_pages = nr_pages;
        return rb;
    }

err_free_pages:
    for (i = 0; i < nr_pages; i++)
        __free_page(pages[i]);
    kvfree(pages);
    return NULL;
}

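/* To make the double mapping concrete, a sketch with an (assumed) 8KB data
 * area and a record reserved 16 bytes before its end:
 *
 *    hdr = (void *)rb->data + (prod_pos & rb->mask);  // 16B before 8KB mark
 *    memcpy(hdr + 1, sample, 64);                     // runs past 8KB mark
 *
 * The overflowing bytes land in the second mapping of the data pages, which
 * aliases the same physical pages as the start of the data area, so both the
 * kernel and an mmap()'ed consumer see the sample contiguously.
 */
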
static void bpf_ringbuf_notify(struct irq_work *work)
{
    struct bpf_ringbuf *rb = container_of(work, struct bpf_ringbuf, work);

    wake_up_all(&rb->waitq);
}

static struct bpf_ringbuf *bpf_ringbuf_alloc(size_t data_sz, int numa_node)
{
    struct bpf_ringbuf *rb;

    rb = bpf_ringbuf_area_alloc(data_sz, numa_node);
    if (!rb)
        return NULL;

    spin_lock_init(&rb->spinlock);
    init_waitqueue_head(&rb->waitq);
    init_irq_work(&rb->work, bpf_ringbuf_notify);

    rb->mask = data_sz - 1;
    rb->consumer_pos = 0;
    rb->producer_pos = 0;

    return rb;
}

static struct bpf_map *ringbuf_map_alloc(union bpf_attr *attr)
{
    struct bpf_ringbuf_map *rb_map;

    if (attr->map_flags & ~RINGBUF_CREATE_FLAG_MASK)
        return ERR_PTR(-EINVAL);

    if (attr->key_size || attr->value_size ||
        !is_power_of_2(attr->max_entries) ||
        !PAGE_ALIGNED(attr->max_entries))
        return ERR_PTR(-EINVAL);

#ifdef CONFIG_64BIT
    /* on a 32-bit arch, it's impossible to overflow a record's hdr->pg_off */
    if (attr->max_entries > RINGBUF_MAX_DATA_SZ)
        return ERR_PTR(-E2BIG);
#endif

    rb_map = kzalloc(sizeof(*rb_map), GFP_USER | __GFP_ACCOUNT);
    if (!rb_map)
        return ERR_PTR(-ENOMEM);

    bpf_map_init_from_attr(&rb_map->map, attr);

    rb_map->rb = bpf_ringbuf_alloc(attr->max_entries, rb_map->map.numa_node);
    if (!rb_map->rb) {
        kfree(rb_map);
        return ERR_PTR(-ENOMEM);
    }

    return &rb_map->map;
}

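/* A minimal user-space creation sketch (assuming libbpf >= 0.7 for
 * bpf_map_create()); per the checks above, key_size/value_size must be 0 and
 * max_entries is the data area size in bytes, a page-aligned power of two:
 *
 *    int map_fd = bpf_map_create(BPF_MAP_TYPE_RINGBUF, NULL,
 *                                0, 0,            // no keys, no values
 *                                4096 * 1024,     // 4MB data area
 *                                NULL);
 */
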
static void bpf_ringbuf_free(struct bpf_ringbuf *rb)
{
    /* copy the pages pointer and nr_pages to local variables, as we are
     * going to unmap rb itself with vunmap() below
     */
    struct page **pages = rb->pages;
    int i, nr_pages = rb->nr_pages;

    vunmap(rb);
    for (i = 0; i < nr_pages; i++)
        __free_page(pages[i]);
    kvfree(pages);
}

static void ringbuf_map_free(struct bpf_map *map)
{
    struct bpf_ringbuf_map *rb_map;

    rb_map = container_of(map, struct bpf_ringbuf_map, map);
    bpf_ringbuf_free(rb_map->rb);
    kfree(rb_map);
}

static void *ringbuf_map_lookup_elem(struct bpf_map *map, void *key)
{
    return ERR_PTR(-ENOTSUPP);
}

static int ringbuf_map_update_elem(struct bpf_map *map, void *key, void *value,
                   u64 flags)
{
    return -ENOTSUPP;
}

static int ringbuf_map_delete_elem(struct bpf_map *map, void *key)
{
    return -ENOTSUPP;
}

static int ringbuf_map_get_next_key(struct bpf_map *map, void *key,
                    void *next_key)
{
    return -ENOTSUPP;
}

static int ringbuf_map_mmap(struct bpf_map *map, struct vm_area_struct *vma)
{
    struct bpf_ringbuf_map *rb_map;

    rb_map = container_of(map, struct bpf_ringbuf_map, map);

    if (vma->vm_flags & VM_WRITE) {
        /* allow a writable mapping for the consumer_pos page only */
        if (vma->vm_pgoff != 0 || vma->vm_end - vma->vm_start != PAGE_SIZE)
            return -EPERM;
    } else {
        vma->vm_flags &= ~VM_MAYWRITE;
    }
    /* remap_vmalloc_range() checks size and offset constraints */
    return remap_vmalloc_range(vma, rb_map->rb,
                   vma->vm_pgoff + RINGBUF_PGOFF);
}

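/* Seen through mmap() of the map fd, page offset 0 is the consumer page (the
 * only page allowed to be mapped writable above), the producer page follows
 * it, and the double-mapped data area comes after that. A hedged sketch of
 * what a user-space consumer typically does:
 *
 *    cons = mmap(NULL, page_sz, PROT_READ | PROT_WRITE, MAP_SHARED,
 *                map_fd, 0);
 *    prod = mmap(NULL, page_sz + 2 * data_sz, PROT_READ, MAP_SHARED,
 *                map_fd, page_sz);
 */
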
static unsigned long ringbuf_avail_data_sz(struct bpf_ringbuf *rb)
{
    unsigned long cons_pos, prod_pos;

    cons_pos = smp_load_acquire(&rb->consumer_pos);
    prod_pos = smp_load_acquire(&rb->producer_pos);
    return prod_pos - cons_pos;
}

static __poll_t ringbuf_map_poll(struct bpf_map *map, struct file *filp,
                 struct poll_table_struct *pts)
{
    struct bpf_ringbuf_map *rb_map;

    rb_map = container_of(map, struct bpf_ringbuf_map, map);
    poll_wait(filp, &rb_map->rb->waitq, pts);

    if (ringbuf_avail_data_sz(rb_map->rb))
        return EPOLLIN | EPOLLRDNORM;
    return 0;
}

BTF_ID_LIST_SINGLE(ringbuf_map_btf_ids, struct, bpf_ringbuf_map)
const struct bpf_map_ops ringbuf_map_ops = {
    .map_meta_equal = bpf_map_meta_equal,
    .map_alloc = ringbuf_map_alloc,
    .map_free = ringbuf_map_free,
    .map_mmap = ringbuf_map_mmap,
    .map_poll = ringbuf_map_poll,
    .map_lookup_elem = ringbuf_map_lookup_elem,
    .map_update_elem = ringbuf_map_update_elem,
    .map_delete_elem = ringbuf_map_delete_elem,
    .map_get_next_key = ringbuf_map_get_next_key,
    .map_btf_id = &ringbuf_map_btf_ids[0],
};

/* Given a pointer to ring buffer record metadata and the struct bpf_ringbuf
 * itself, calculate the offset from the record metadata to the ring buffer
 * in pages, rounded down. This page offset is stored at offset 4 of the
 * record metadata header and allows restoring the struct bpf_ringbuf * from
 * a record pointer.
 */
static size_t bpf_ringbuf_rec_pg_off(struct bpf_ringbuf *rb,
                     struct bpf_ringbuf_hdr *hdr)
{
    return ((void *)hdr - (void *)rb) >> PAGE_SHIFT;
}

/* Given a pointer to a ring buffer record header, restore the pointer to the
 * struct bpf_ringbuf itself using the page offset stored at offset 4
 */
static struct bpf_ringbuf *
bpf_ringbuf_restore_from_rec(struct bpf_ringbuf_hdr *hdr)
{
    unsigned long addr = (unsigned long)(void *)hdr;
    unsigned long off = (unsigned long)hdr->pg_off << PAGE_SHIFT;

    return (void *)((addr & PAGE_MASK) - off);
}

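/* A worked example of the pg_off round trip, assuming 4KB pages and a header
 * placed 100 bytes into the third page past rb:
 *
 *    hdr      = (void *)rb + 3 * 4096 + 100;
 *    pg_off   = ((void *)hdr - (void *)rb) >> PAGE_SHIFT;    // = 3
 *    restored = (void *)(((unsigned long)hdr & PAGE_MASK)    // page start
 *                        - (3UL << PAGE_SHIFT));             // = rb
 *
 * This only works because rb is page-aligned (it comes from vmap()) and a
 * header can never straddle a page boundary (records are 8-byte aligned and
 * the header is exactly 8 bytes).
 */
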
static void *__bpf_ringbuf_reserve(struct bpf_ringbuf *rb, u64 size)
{
    unsigned long cons_pos, prod_pos, new_prod_pos, flags;
    u32 len, pg_off;
    struct bpf_ringbuf_hdr *hdr;

    if (unlikely(size > RINGBUF_MAX_RECORD_SZ))
        return NULL;

    len = round_up(size + BPF_RINGBUF_HDR_SZ, 8);
    if (len > rb->mask + 1)
        return NULL;

    cons_pos = smp_load_acquire(&rb->consumer_pos);

    if (in_nmi()) {
        if (!spin_trylock_irqsave(&rb->spinlock, flags))
            return NULL;
    } else {
        spin_lock_irqsave(&rb->spinlock, flags);
    }

    prod_pos = rb->producer_pos;
    new_prod_pos = prod_pos + len;

    /* check for out of ringbuf space by ensuring producer position
     * doesn't advance more than (ringbuf_size - 1) ahead
     */
    if (new_prod_pos - cons_pos > rb->mask) {
        spin_unlock_irqrestore(&rb->spinlock, flags);
        return NULL;
    }

    hdr = (void *)rb->data + (prod_pos & rb->mask);
    pg_off = bpf_ringbuf_rec_pg_off(rb, hdr);
    hdr->len = size | BPF_RINGBUF_BUSY_BIT;
    hdr->pg_off = pg_off;

    /* pairs with consumer's smp_load_acquire() */
    smp_store_release(&rb->producer_pos, new_prod_pos);

    spin_unlock_irqrestore(&rb->spinlock, flags);

    return (void *)hdr + BPF_RINGBUF_HDR_SZ;
}

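/* Example of the accounting above: bpf_ringbuf_reserve(&rb, 24, 0) consumes
 * round_up(24 + 8, 8) = 32 bytes of the data area, sets hdr->len to
 * 24 | BPF_RINGBUF_BUSY_BIT, and hands the caller hdr + 8, i.e. the 24
 * usable bytes. Publishing producer_pos with the busy bit still set is what
 * lets the consumer safely observe a record before its contents are final.
 */
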
BPF_CALL_3(bpf_ringbuf_reserve, struct bpf_map *, map, u64, size, u64, flags)
{
    struct bpf_ringbuf_map *rb_map;

    if (unlikely(flags))
        return 0;

    rb_map = container_of(map, struct bpf_ringbuf_map, map);
    return (unsigned long)__bpf_ringbuf_reserve(rb_map->rb, size);
}

const struct bpf_func_proto bpf_ringbuf_reserve_proto = {
    .func       = bpf_ringbuf_reserve,
    .ret_type   = RET_PTR_TO_ALLOC_MEM_OR_NULL,
    .arg1_type  = ARG_CONST_MAP_PTR,
    .arg2_type  = ARG_CONST_ALLOC_SIZE_OR_ZERO,
    .arg3_type  = ARG_ANYTHING,
};

static void bpf_ringbuf_commit(void *sample, u64 flags, bool discard)
{
    unsigned long rec_pos, cons_pos;
    struct bpf_ringbuf_hdr *hdr;
    struct bpf_ringbuf *rb;
    u32 new_len;

    hdr = sample - BPF_RINGBUF_HDR_SZ;
    rb = bpf_ringbuf_restore_from_rec(hdr);
    new_len = hdr->len ^ BPF_RINGBUF_BUSY_BIT;
    if (discard)
        new_len |= BPF_RINGBUF_DISCARD_BIT;

    /* update the record header with the correct final size prefix */
    xchg(&hdr->len, new_len);

    /* if the consumer has caught up and is waiting for our record, notify
     * it about new data availability
     */
    rec_pos = (void *)hdr - (void *)rb->data;
    cons_pos = smp_load_acquire(&rb->consumer_pos) & rb->mask;

    if (flags & BPF_RB_FORCE_WAKEUP)
        irq_work_queue(&rb->work);
    else if (cons_pos == rec_pos && !(flags & BPF_RB_NO_WAKEUP))
        irq_work_queue(&rb->work);
}

BPF_CALL_2(bpf_ringbuf_submit, void *, sample, u64, flags)
{
    bpf_ringbuf_commit(sample, flags, false /* discard */);
    return 0;
}

const struct bpf_func_proto bpf_ringbuf_submit_proto = {
    .func       = bpf_ringbuf_submit,
    .ret_type   = RET_VOID,
    .arg1_type  = ARG_PTR_TO_ALLOC_MEM | OBJ_RELEASE,
    .arg2_type  = ARG_ANYTHING,
};

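/* A minimal BPF-program-side sketch of the reserve/submit pair; the map "rb"
 * and "struct event" are illustrative names, not from this file:
 *
 *    struct event { int pid; };
 *
 *    struct {
 *        __uint(type, BPF_MAP_TYPE_RINGBUF);
 *        __uint(max_entries, 256 * 1024);
 *    } rb SEC(".maps");
 *
 *    struct event *e = bpf_ringbuf_reserve(&rb, sizeof(*e), 0);
 *    if (!e)
 *        return 0;              // ring buffer full
 *    e->pid = bpf_get_current_pid_tgid() >> 32;
 *    bpf_ringbuf_submit(e, 0);  // or bpf_ringbuf_discard(e, 0)
 */
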
BPF_CALL_2(bpf_ringbuf_discard, void *, sample, u64, flags)
{
    bpf_ringbuf_commit(sample, flags, true /* discard */);
    return 0;
}

const struct bpf_func_proto bpf_ringbuf_discard_proto = {
    .func       = bpf_ringbuf_discard,
    .ret_type   = RET_VOID,
    .arg1_type  = ARG_PTR_TO_ALLOC_MEM | OBJ_RELEASE,
    .arg2_type  = ARG_ANYTHING,
};

BPF_CALL_4(bpf_ringbuf_output, struct bpf_map *, map, void *, data, u64, size,
       u64, flags)
{
    struct bpf_ringbuf_map *rb_map;
    void *rec;

    if (unlikely(flags & ~(BPF_RB_NO_WAKEUP | BPF_RB_FORCE_WAKEUP)))
        return -EINVAL;

    rb_map = container_of(map, struct bpf_ringbuf_map, map);
    rec = __bpf_ringbuf_reserve(rb_map->rb, size);
    if (!rec)
        return -EAGAIN;

    memcpy(rec, data, size);
    bpf_ringbuf_commit(rec, flags, false /* discard */);
    return 0;
}

const struct bpf_func_proto bpf_ringbuf_output_proto = {
    .func       = bpf_ringbuf_output,
    .ret_type   = RET_INTEGER,
    .arg1_type  = ARG_CONST_MAP_PTR,
    .arg2_type  = ARG_PTR_TO_MEM | MEM_RDONLY,
    .arg3_type  = ARG_CONST_SIZE_OR_ZERO,
    .arg4_type  = ARG_ANYTHING,
};

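/* Unlike the reserve/submit pair, bpf_ringbuf_output() pays for an extra
 * copy of the sample but takes an already-assembled buffer, mirroring the
 * familiar bpf_perf_event_output() style of use:
 *
 *    bpf_ringbuf_output(&rb, &event, sizeof(event), 0);
 */
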
BPF_CALL_2(bpf_ringbuf_query, struct bpf_map *, map, u64, flags)
{
    struct bpf_ringbuf *rb;

    rb = container_of(map, struct bpf_ringbuf_map, map)->rb;

    switch (flags) {
    case BPF_RB_AVAIL_DATA:
        return ringbuf_avail_data_sz(rb);
    case BPF_RB_RING_SIZE:
        return rb->mask + 1;
    case BPF_RB_CONS_POS:
        return smp_load_acquire(&rb->consumer_pos);
    case BPF_RB_PROD_POS:
        return smp_load_acquire(&rb->producer_pos);
    default:
        return 0;
    }
}

const struct bpf_func_proto bpf_ringbuf_query_proto = {
    .func       = bpf_ringbuf_query,
    .ret_type   = RET_INTEGER,
    .arg1_type  = ARG_CONST_MAP_PTR,
    .arg2_type  = ARG_ANYTHING,
};

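/* The query values are momentary snapshots; a program might use them for
 * load shedding, e.g. (with "rb" an illustrative ring buffer map):
 *
 *    if (bpf_ringbuf_query(&rb, BPF_RB_AVAIL_DATA) >
 *        bpf_ringbuf_query(&rb, BPF_RB_RING_SIZE) / 2)
 *        return 0;    // consumer is lagging, drop this sample
 */
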
BPF_CALL_4(bpf_ringbuf_reserve_dynptr, struct bpf_map *, map, u32, size, u64, flags,
       struct bpf_dynptr_kern *, ptr)
{
    struct bpf_ringbuf_map *rb_map;
    void *sample;
    int err;

    if (unlikely(flags)) {
        bpf_dynptr_set_null(ptr);
        return -EINVAL;
    }

    err = bpf_dynptr_check_size(size);
    if (err) {
        bpf_dynptr_set_null(ptr);
        return err;
    }

    rb_map = container_of(map, struct bpf_ringbuf_map, map);

    sample = __bpf_ringbuf_reserve(rb_map->rb, size);
    if (!sample) {
        bpf_dynptr_set_null(ptr);
        return -EINVAL;
    }

    bpf_dynptr_init(ptr, sample, BPF_DYNPTR_TYPE_RINGBUF, 0, size);

    return 0;
}

const struct bpf_func_proto bpf_ringbuf_reserve_dynptr_proto = {
    .func       = bpf_ringbuf_reserve_dynptr,
    .ret_type   = RET_INTEGER,
    .arg1_type  = ARG_CONST_MAP_PTR,
    .arg2_type  = ARG_ANYTHING,
    .arg3_type  = ARG_ANYTHING,
    .arg4_type  = ARG_PTR_TO_DYNPTR | DYNPTR_TYPE_RINGBUF | MEM_UNINIT,
};

BPF_CALL_2(bpf_ringbuf_submit_dynptr, struct bpf_dynptr_kern *, ptr, u64, flags)
{
    if (!ptr->data)
        return 0;

    bpf_ringbuf_commit(ptr->data, flags, false /* discard */);

    bpf_dynptr_set_null(ptr);

    return 0;
}

const struct bpf_func_proto bpf_ringbuf_submit_dynptr_proto = {
    .func       = bpf_ringbuf_submit_dynptr,
    .ret_type   = RET_VOID,
    .arg1_type  = ARG_PTR_TO_DYNPTR | DYNPTR_TYPE_RINGBUF | OBJ_RELEASE,
    .arg2_type  = ARG_ANYTHING,
};

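/* A hedged sketch of the dynptr flavor of reserve/submit ("rb", "buf" and
 * the sizes are illustrative). Even on reservation failure the dynptr must
 * still be released, which the !ptr->data checks above make a no-op:
 *
 *    struct bpf_dynptr ptr;
 *
 *    bpf_ringbuf_reserve_dynptr(&rb, 64, 0, &ptr);
 *    bpf_dynptr_write(&ptr, 0, buf, 64, 0);   // -EINVAL if ptr is null
 *    bpf_ringbuf_submit_dynptr(&ptr, 0);      // no-op if reservation failed
 */
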
BPF_CALL_2(bpf_ringbuf_discard_dynptr, struct bpf_dynptr_kern *, ptr, u64, flags)
{
    if (!ptr->data)
        return 0;

    bpf_ringbuf_commit(ptr->data, flags, true /* discard */);

    bpf_dynptr_set_null(ptr);

    return 0;
}

const struct bpf_func_proto bpf_ringbuf_discard_dynptr_proto = {
    .func       = bpf_ringbuf_discard_dynptr,
    .ret_type   = RET_VOID,
    .arg1_type  = ARG_PTR_TO_DYNPTR | DYNPTR_TYPE_RINGBUF | OBJ_RELEASE,
    .arg2_type  = ARG_ANYTHING,
};