#include <linux/bpf.h>
#include <linux/btf.h>
#include <linux/err.h>
#include <linux/irq_work.h>
#include <linux/slab.h>
#include <linux/filter.h>
#include <linux/mm.h>
#include <linux/vmalloc.h>
#include <linux/wait.h>
#include <linux/poll.h>
#include <linux/kmemleak.h>
#include <uapi/linux/btf.h>
#include <linux/btf_ids.h>

#define RINGBUF_CREATE_FLAG_MASK (BPF_F_NUMA_NODE)

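/* non-mmap()'able part of bpf_ringbuf (everything up to consumer page) */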
#define RINGBUF_PGOFF \
	(offsetof(struct bpf_ringbuf, consumer_pos) >> PAGE_SHIFT)
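/* consumer page and producer page */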
#define RINGBUF_POS_PAGES 2

#define RINGBUF_MAX_RECORD_SZ (UINT_MAX/4)

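/* Maximum size of the ring buffer data area is limited by the 32-bit page
 * offset (pg_off) stored in each record header. The cap below allows up to
 * 2^24 pages minus the consumer/producer and other meta pages, i.e. 64GB
 * with 4KB pages, which leaves the upper bits of pg_off unused.
 */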
#define RINGBUF_MAX_DATA_SZ \
	(((1ULL << 24) - RINGBUF_POS_PAGES - RINGBUF_PGOFF) * PAGE_SIZE)

struct bpf_ringbuf {
	wait_queue_head_t waitq;
	struct irq_work work;
	u64 mask;
	struct page **pages;
	int nr_pages;
	spinlock_t spinlock ____cacheline_aligned_in_smp;
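	/* Consumer and producer counters are put into separate pages to
	 * allow mapping the consumer page as r/w while keeping the producer
	 * page r/o. This prevents user-space from modifying the producer
	 * position and corrupting in-kernel position tracking.
	 */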
	unsigned long consumer_pos __aligned(PAGE_SIZE);
	unsigned long producer_pos __aligned(PAGE_SIZE);
	char data[] __aligned(PAGE_SIZE);
};

struct bpf_ringbuf_map {
	struct bpf_map map;
	struct bpf_ringbuf *rb;
};

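/* 8-byte ring buffer record header structure */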
struct bpf_ringbuf_hdr {
	u32 len;
	u32 pg_off;
};

static struct bpf_ringbuf *bpf_ringbuf_area_alloc(size_t data_sz, int numa_node)
{
	const gfp_t flags = GFP_KERNEL_ACCOUNT | __GFP_RETRY_MAYFAIL |
			    __GFP_NOWARN | __GFP_ZERO;
	int nr_meta_pages = RINGBUF_PGOFF + RINGBUF_POS_PAGES;
	int nr_data_pages = data_sz >> PAGE_SHIFT;
	int nr_pages = nr_meta_pages + nr_data_pages;
	struct page **pages, *page;
	struct bpf_ringbuf *rb;
	size_t array_size;
	int i;

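	/* Each data page is mapped twice to allow "virtual" continuous reads
	 * of samples that wrap around the end of the ring buffer area:
	 *
	 *   | meta pages | real data pages | same data pages |
	 *
	 * A record that starts near the end of the data area simply spills
	 * into the second mapping of the same pages, so neither the kernel
	 * nor user-space (once mmap()'ed) needs any special handling of
	 * wrap-around.
	 */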
	array_size = (nr_meta_pages + 2 * nr_data_pages) * sizeof(*pages);
	pages = bpf_map_area_alloc(array_size, numa_node);
	if (!pages)
		return NULL;

	for (i = 0; i < nr_pages; i++) {
		page = alloc_pages_node(numa_node, flags, 0);
		if (!page) {
			nr_pages = i;
			goto err_free_pages;
		}
		pages[i] = page;
		if (i >= nr_meta_pages)
			pages[nr_data_pages + i] = page;
	}

	rb = vmap(pages, nr_meta_pages + 2 * nr_data_pages,
		  VM_MAP | VM_USERMAP, PAGE_KERNEL);
	if (rb) {
		kmemleak_not_leak(pages);
		rb->pages = pages;
		rb->nr_pages = nr_pages;
		return rb;
	}

err_free_pages:
	for (i = 0; i < nr_pages; i++)
		__free_page(pages[i]);
	kvfree(pages);
	return NULL;
}

static void bpf_ringbuf_notify(struct irq_work *work)
{
	struct bpf_ringbuf *rb = container_of(work, struct bpf_ringbuf, work);

	wake_up_all(&rb->waitq);
}

static struct bpf_ringbuf *bpf_ringbuf_alloc(size_t data_sz, int numa_node)
{
	struct bpf_ringbuf *rb;

	rb = bpf_ringbuf_area_alloc(data_sz, numa_node);
	if (!rb)
		return NULL;

	spin_lock_init(&rb->spinlock);
	init_waitqueue_head(&rb->waitq);
	init_irq_work(&rb->work, bpf_ringbuf_notify);

	rb->mask = data_sz - 1;
	rb->consumer_pos = 0;
	rb->producer_pos = 0;

	return rb;
}

static struct bpf_map *ringbuf_map_alloc(union bpf_attr *attr)
{
	struct bpf_ringbuf_map *rb_map;

	if (attr->map_flags & ~RINGBUF_CREATE_FLAG_MASK)
		return ERR_PTR(-EINVAL);

	if (attr->key_size || attr->value_size ||
	    !is_power_of_2(attr->max_entries) ||
	    !PAGE_ALIGNED(attr->max_entries))
		return ERR_PTR(-EINVAL);

#ifdef CONFIG_64BIT
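	/* on 32-bit arch, it's impossible to overflow record's hdr->pg_off */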
	if (attr->max_entries > RINGBUF_MAX_DATA_SZ)
		return ERR_PTR(-E2BIG);
#endif

	rb_map = kzalloc(sizeof(*rb_map), GFP_USER | __GFP_ACCOUNT);
	if (!rb_map)
		return ERR_PTR(-ENOMEM);

	bpf_map_init_from_attr(&rb_map->map, attr);

	rb_map->rb = bpf_ringbuf_alloc(attr->max_entries, rb_map->map.numa_node);
	if (!rb_map->rb) {
		kfree(rb_map);
		return ERR_PTR(-ENOMEM);
	}

	return &rb_map->map;
}

static void bpf_ringbuf_free(struct bpf_ringbuf *rb)
{
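	/* copy pages pointer and nr_pages to local variables, as we are going
	 * to unmap rb itself with vunmap() below
	 */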
	struct page **pages = rb->pages;
	int i, nr_pages = rb->nr_pages;

	vunmap(rb);
	for (i = 0; i < nr_pages; i++)
		__free_page(pages[i]);
	kvfree(pages);
}

static void ringbuf_map_free(struct bpf_map *map)
{
	struct bpf_ringbuf_map *rb_map;

	rb_map = container_of(map, struct bpf_ringbuf_map, map);
	bpf_ringbuf_free(rb_map->rb);
	kfree(rb_map);
}

static void *ringbuf_map_lookup_elem(struct bpf_map *map, void *key)
{
	return ERR_PTR(-ENOTSUPP);
}

static int ringbuf_map_update_elem(struct bpf_map *map, void *key, void *value,
				   u64 flags)
{
	return -ENOTSUPP;
}

static int ringbuf_map_delete_elem(struct bpf_map *map, void *key)
{
	return -ENOTSUPP;
}

static int ringbuf_map_get_next_key(struct bpf_map *map, void *key,
				    void *next_key)
{
	return -ENOTSUPP;
}

static int ringbuf_map_mmap(struct bpf_map *map, struct vm_area_struct *vma)
{
	struct bpf_ringbuf_map *rb_map;

	rb_map = container_of(map, struct bpf_ringbuf_map, map);

	if (vma->vm_flags & VM_WRITE) {
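		/* allow writable mapping for the consumer_pos page only */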
		if (vma->vm_pgoff != 0 || vma->vm_end - vma->vm_start != PAGE_SIZE)
			return -EPERM;
	} else {
		vma->vm_flags &= ~VM_MAYWRITE;
	}
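	/* remap_vmalloc_range() checks size and offset constraints */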
	return remap_vmalloc_range(vma, rb_map->rb,
				   vma->vm_pgoff + RINGBUF_PGOFF);
}

static unsigned long ringbuf_avail_data_sz(struct bpf_ringbuf *rb)
{
	unsigned long cons_pos, prod_pos;

	cons_pos = smp_load_acquire(&rb->consumer_pos);
	prod_pos = smp_load_acquire(&rb->producer_pos);
	return prod_pos - cons_pos;
}

static __poll_t ringbuf_map_poll(struct bpf_map *map, struct file *filp,
				 struct poll_table_struct *pts)
{
	struct bpf_ringbuf_map *rb_map;

	rb_map = container_of(map, struct bpf_ringbuf_map, map);
	poll_wait(filp, &rb_map->rb->waitq, pts);

	if (ringbuf_avail_data_sz(rb_map->rb))
		return EPOLLIN | EPOLLRDNORM;
	return 0;
}

BTF_ID_LIST_SINGLE(ringbuf_map_btf_ids, struct, bpf_ringbuf_map)
const struct bpf_map_ops ringbuf_map_ops = {
	.map_meta_equal = bpf_map_meta_equal,
	.map_alloc = ringbuf_map_alloc,
	.map_free = ringbuf_map_free,
	.map_mmap = ringbuf_map_mmap,
	.map_poll = ringbuf_map_poll,
	.map_lookup_elem = ringbuf_map_lookup_elem,
	.map_update_elem = ringbuf_map_update_elem,
	.map_delete_elem = ringbuf_map_delete_elem,
	.map_get_next_key = ringbuf_map_get_next_key,
	.map_btf_id = &ringbuf_map_btf_ids[0],
};

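/* Given a pointer to a ring buffer record header and the struct bpf_ringbuf
 * itself, calculate the offset from the record to the start of the ring
 * buffer, in pages, rounded down. The offset is stored in the record header
 * and is what later allows restoring the struct bpf_ringbuf * from a record
 * pointer.
 */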
static size_t bpf_ringbuf_rec_pg_off(struct bpf_ringbuf *rb,
				     struct bpf_ringbuf_hdr *hdr)
{
	return ((void *)hdr - (void *)rb) >> PAGE_SHIFT;
}

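/* Given a pointer to a ring buffer record header, restore the pointer to the
 * struct bpf_ringbuf itself by using the page offset stored in the header.
 */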
static struct bpf_ringbuf *
bpf_ringbuf_restore_from_rec(struct bpf_ringbuf_hdr *hdr)
{
	unsigned long addr = (unsigned long)(void *)hdr;
	unsigned long off = (unsigned long)hdr->pg_off << PAGE_SHIFT;

	return (void *)((addr & PAGE_MASK) - off);
}

static void *__bpf_ringbuf_reserve(struct bpf_ringbuf *rb, u64 size)
{
	unsigned long cons_pos, prod_pos, new_prod_pos, flags;
	u32 len, pg_off;
	struct bpf_ringbuf_hdr *hdr;

	if (unlikely(size > RINGBUF_MAX_RECORD_SZ))
		return NULL;

	len = round_up(size + BPF_RINGBUF_HDR_SZ, 8);
	if (len > rb->mask + 1)
		return NULL;

	cons_pos = smp_load_acquire(&rb->consumer_pos);

	if (in_nmi()) {
		if (!spin_trylock_irqsave(&rb->spinlock, flags))
			return NULL;
	} else {
		spin_lock_irqsave(&rb->spinlock, flags);
	}

	prod_pos = rb->producer_pos;
	new_prod_pos = prod_pos + len;

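	/* check for out of ringbuf space by ensuring producer position
	 * doesn't advance more than (ringbuf_size - 1) ahead
	 */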
	if (new_prod_pos - cons_pos > rb->mask) {
		spin_unlock_irqrestore(&rb->spinlock, flags);
		return NULL;
	}

	hdr = (void *)rb->data + (prod_pos & rb->mask);
	pg_off = bpf_ringbuf_rec_pg_off(rb, hdr);
	hdr->len = size | BPF_RINGBUF_BUSY_BIT;
	hdr->pg_off = pg_off;

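	/* pairs with consumer's smp_load_acquire() of producer_pos */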
	smp_store_release(&rb->producer_pos, new_prod_pos);

	spin_unlock_irqrestore(&rb->spinlock, flags);

	return (void *)hdr + BPF_RINGBUF_HDR_SZ;
}

BPF_CALL_3(bpf_ringbuf_reserve, struct bpf_map *, map, u64, size, u64, flags)
{
	struct bpf_ringbuf_map *rb_map;

	if (unlikely(flags))
		return 0;

	rb_map = container_of(map, struct bpf_ringbuf_map, map);
	return (unsigned long)__bpf_ringbuf_reserve(rb_map->rb, size);
}

const struct bpf_func_proto bpf_ringbuf_reserve_proto = {
	.func = bpf_ringbuf_reserve,
	.ret_type = RET_PTR_TO_ALLOC_MEM_OR_NULL,
	.arg1_type = ARG_CONST_MAP_PTR,
	.arg2_type = ARG_CONST_ALLOC_SIZE_OR_ZERO,
	.arg3_type = ARG_ANYTHING,
};

static void bpf_ringbuf_commit(void *sample, u64 flags, bool discard)
{
	unsigned long rec_pos, cons_pos;
	struct bpf_ringbuf_hdr *hdr;
	struct bpf_ringbuf *rb;
	u32 new_len;

	hdr = sample - BPF_RINGBUF_HDR_SZ;
	rb = bpf_ringbuf_restore_from_rec(hdr);
	new_len = hdr->len ^ BPF_RINGBUF_BUSY_BIT;
	if (discard)
		new_len |= BPF_RINGBUF_DISCARD_BIT;

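	/* atomically update record header with the final length prefix,
	 * clearing the busy bit (and setting the discard bit if requested)
	 */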
	xchg(&hdr->len, new_len);

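	/* if consumer caught up and is waiting for our record, notify about
	 * new data availability
	 */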
	rec_pos = (void *)hdr - (void *)rb->data;
	cons_pos = smp_load_acquire(&rb->consumer_pos) & rb->mask;

	if (flags & BPF_RB_FORCE_WAKEUP)
		irq_work_queue(&rb->work);
	else if (cons_pos == rec_pos && !(flags & BPF_RB_NO_WAKEUP))
		irq_work_queue(&rb->work);
}

BPF_CALL_2(bpf_ringbuf_submit, void *, sample, u64, flags)
{
	bpf_ringbuf_commit(sample, flags, false /* discard */);
	return 0;
}

const struct bpf_func_proto bpf_ringbuf_submit_proto = {
	.func = bpf_ringbuf_submit,
	.ret_type = RET_VOID,
	.arg1_type = ARG_PTR_TO_ALLOC_MEM | OBJ_RELEASE,
	.arg2_type = ARG_ANYTHING,
};

BPF_CALL_2(bpf_ringbuf_discard, void *, sample, u64, flags)
{
	bpf_ringbuf_commit(sample, flags, true /* discard */);
	return 0;
}

const struct bpf_func_proto bpf_ringbuf_discard_proto = {
	.func = bpf_ringbuf_discard,
	.ret_type = RET_VOID,
	.arg1_type = ARG_PTR_TO_ALLOC_MEM | OBJ_RELEASE,
	.arg2_type = ARG_ANYTHING,
};

BPF_CALL_4(bpf_ringbuf_output, struct bpf_map *, map, void *, data, u64, size,
	   u64, flags)
{
	struct bpf_ringbuf_map *rb_map;
	void *rec;

	if (unlikely(flags & ~(BPF_RB_NO_WAKEUP | BPF_RB_FORCE_WAKEUP)))
		return -EINVAL;

	rb_map = container_of(map, struct bpf_ringbuf_map, map);
	rec = __bpf_ringbuf_reserve(rb_map->rb, size);
	if (!rec)
		return -EAGAIN;

	memcpy(rec, data, size);
	bpf_ringbuf_commit(rec, flags, false /* discard */);
	return 0;
}

const struct bpf_func_proto bpf_ringbuf_output_proto = {
	.func = bpf_ringbuf_output,
	.ret_type = RET_INTEGER,
	.arg1_type = ARG_CONST_MAP_PTR,
	.arg2_type = ARG_PTR_TO_MEM | MEM_RDONLY,
	.arg3_type = ARG_CONST_SIZE_OR_ZERO,
	.arg4_type = ARG_ANYTHING,
};

BPF_CALL_2(bpf_ringbuf_query, struct bpf_map *, map, u64, flags)
{
	struct bpf_ringbuf *rb;

	rb = container_of(map, struct bpf_ringbuf_map, map)->rb;

	switch (flags) {
	case BPF_RB_AVAIL_DATA:
		return ringbuf_avail_data_sz(rb);
	case BPF_RB_RING_SIZE:
		return rb->mask + 1;
	case BPF_RB_CONS_POS:
		return smp_load_acquire(&rb->consumer_pos);
	case BPF_RB_PROD_POS:
		return smp_load_acquire(&rb->producer_pos);
	default:
		return 0;
	}
}

const struct bpf_func_proto bpf_ringbuf_query_proto = {
	.func = bpf_ringbuf_query,
	.ret_type = RET_INTEGER,
	.arg1_type = ARG_CONST_MAP_PTR,
	.arg2_type = ARG_ANYTHING,
};

BPF_CALL_4(bpf_ringbuf_reserve_dynptr, struct bpf_map *, map, u32, size, u64, flags,
	   struct bpf_dynptr_kern *, ptr)
{
	struct bpf_ringbuf_map *rb_map;
	void *sample;
	int err;

	if (unlikely(flags)) {
		bpf_dynptr_set_null(ptr);
		return -EINVAL;
	}

	err = bpf_dynptr_check_size(size);
	if (err) {
		bpf_dynptr_set_null(ptr);
		return err;
	}

	rb_map = container_of(map, struct bpf_ringbuf_map, map);

	sample = __bpf_ringbuf_reserve(rb_map->rb, size);
	if (!sample) {
		bpf_dynptr_set_null(ptr);
		return -EINVAL;
	}

	bpf_dynptr_init(ptr, sample, BPF_DYNPTR_TYPE_RINGBUF, 0, size);

	return 0;
}

const struct bpf_func_proto bpf_ringbuf_reserve_dynptr_proto = {
	.func = bpf_ringbuf_reserve_dynptr,
	.ret_type = RET_INTEGER,
	.arg1_type = ARG_CONST_MAP_PTR,
	.arg2_type = ARG_ANYTHING,
	.arg3_type = ARG_ANYTHING,
	.arg4_type = ARG_PTR_TO_DYNPTR | DYNPTR_TYPE_RINGBUF | MEM_UNINIT,
};

BPF_CALL_2(bpf_ringbuf_submit_dynptr, struct bpf_dynptr_kern *, ptr, u64, flags)
{
	if (!ptr->data)
		return 0;

	bpf_ringbuf_commit(ptr->data, flags, false /* discard */);

	bpf_dynptr_set_null(ptr);

	return 0;
}

const struct bpf_func_proto bpf_ringbuf_submit_dynptr_proto = {
	.func = bpf_ringbuf_submit_dynptr,
	.ret_type = RET_VOID,
	.arg1_type = ARG_PTR_TO_DYNPTR | DYNPTR_TYPE_RINGBUF | OBJ_RELEASE,
	.arg2_type = ARG_ANYTHING,
};

BPF_CALL_2(bpf_ringbuf_discard_dynptr, struct bpf_dynptr_kern *, ptr, u64, flags)
{
	if (!ptr->data)
		return 0;

	bpf_ringbuf_commit(ptr->data, flags, true /* discard */);

	bpf_dynptr_set_null(ptr);

	return 0;
}

const struct bpf_func_proto bpf_ringbuf_discard_dynptr_proto = {
	.func = bpf_ringbuf_discard_dynptr,
	.ret_type = RET_VOID,
	.arg1_type = ARG_PTR_TO_DYNPTR | DYNPTR_TYPE_RINGBUF | OBJ_RELEASE,
	.arg2_type = ARG_ANYTHING,
};