0001 // SPDX-License-Identifier: GPL-2.0-only
0002 /* Copyright (c) 2011-2014 PLUMgrid, http://plumgrid.com
0003  * Copyright (c) 2016,2017 Facebook
0004  */
0005 #include <linux/bpf.h>
0006 #include <linux/btf.h>
0007 #include <linux/err.h>
0008 #include <linux/slab.h>
0009 #include <linux/mm.h>
0010 #include <linux/filter.h>
0011 #include <linux/perf_event.h>
0012 #include <uapi/linux/btf.h>
0013 #include <linux/rcupdate_trace.h>
0014 #include <linux/btf_ids.h>
0015 
0016 #include "map_in_map.h"
0017 
0018 #define ARRAY_CREATE_FLAG_MASK \
0019     (BPF_F_NUMA_NODE | BPF_F_MMAPABLE | BPF_F_ACCESS_MASK | \
0020      BPF_F_PRESERVE_ELEMS | BPF_F_INNER_MAP)
0021 
0022 static void bpf_array_free_percpu(struct bpf_array *array)
0023 {
0024     int i;
0025 
0026     for (i = 0; i < array->map.max_entries; i++) {
0027         free_percpu(array->pptrs[i]);
0028         cond_resched();
0029     }
0030 }
0031 
0032 static int bpf_array_alloc_percpu(struct bpf_array *array)
0033 {
0034     void __percpu *ptr;
0035     int i;
0036 
0037     for (i = 0; i < array->map.max_entries; i++) {
0038         ptr = bpf_map_alloc_percpu(&array->map, array->elem_size, 8,
0039                        GFP_USER | __GFP_NOWARN);
0040         if (!ptr) {
0041             bpf_array_free_percpu(array);
0042             return -ENOMEM;
0043         }
0044         array->pptrs[i] = ptr;
0045         cond_resched();
0046     }
0047 
0048     return 0;
0049 }
0050 
0051 /* Called from syscall */
0052 int array_map_alloc_check(union bpf_attr *attr)
0053 {
0054     bool percpu = attr->map_type == BPF_MAP_TYPE_PERCPU_ARRAY;
0055     int numa_node = bpf_map_attr_numa_node(attr);
0056 
0057     /* check sanity of attributes */
0058     if (attr->max_entries == 0 || attr->key_size != 4 ||
0059         attr->value_size == 0 ||
0060         attr->map_flags & ~ARRAY_CREATE_FLAG_MASK ||
0061         !bpf_map_flags_access_ok(attr->map_flags) ||
0062         (percpu && numa_node != NUMA_NO_NODE))
0063         return -EINVAL;
0064 
0065     if (attr->map_type != BPF_MAP_TYPE_ARRAY &&
0066         attr->map_flags & (BPF_F_MMAPABLE | BPF_F_INNER_MAP))
0067         return -EINVAL;
0068 
0069     if (attr->map_type != BPF_MAP_TYPE_PERF_EVENT_ARRAY &&
0070         attr->map_flags & BPF_F_PRESERVE_ELEMS)
0071         return -EINVAL;
0072 
0073     /* avoid overflow on round_up(map->value_size) */
0074     if (attr->value_size > INT_MAX)
0075         return -E2BIG;
0076 
0077     return 0;
0078 }
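
/* Usage sketch (user space, assuming libbpf's bpf_map_create(); the map name
 * and sizes are illustrative): the checks above force key_size == 4 and a
 * non-zero value_size, so a matching creation call looks like:
 *
 *	int fd = bpf_map_create(BPF_MAP_TYPE_ARRAY, "my_vals",
 *				sizeof(__u32), sizeof(__u64), 256, NULL);
 *	if (fd < 0)
 *		return -errno;
 */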
0079 
0080 static struct bpf_map *array_map_alloc(union bpf_attr *attr)
0081 {
0082     bool percpu = attr->map_type == BPF_MAP_TYPE_PERCPU_ARRAY;
0083     int numa_node = bpf_map_attr_numa_node(attr);
0084     u32 elem_size, index_mask, max_entries;
0085     bool bypass_spec_v1 = bpf_bypass_spec_v1();
0086     u64 array_size, mask64;
0087     struct bpf_array *array;
0088 
0089     elem_size = round_up(attr->value_size, 8);
0090 
0091     max_entries = attr->max_entries;
0092 
0093     /* On 32 bit archs roundup_pow_of_two() with max_entries that has
0094      * upper most bit set in u32 space is undefined behavior due to
0095      * resulting 1U << 32, so do it manually here in u64 space.
0096      */
0097     mask64 = fls_long(max_entries - 1);
0098     mask64 = 1ULL << mask64;
0099     mask64 -= 1;
0100 
0101     index_mask = mask64;
0102     if (!bypass_spec_v1) {
0103         /* round up array size to nearest power of 2,
0104          * since cpu will speculate within index_mask limits
0105          */
0106         max_entries = index_mask + 1;
0107         /* Check for overflows. */
0108         if (max_entries < attr->max_entries)
0109             return ERR_PTR(-E2BIG);
0110     }
0111 
0112     array_size = sizeof(*array);
0113     if (percpu) {
0114         array_size += (u64) max_entries * sizeof(void *);
0115     } else {
0116         /* rely on vmalloc() to return page-aligned memory and
0117          * ensure array->value is exactly page-aligned
0118          */
0119         if (attr->map_flags & BPF_F_MMAPABLE) {
0120             array_size = PAGE_ALIGN(array_size);
0121             array_size += PAGE_ALIGN((u64) max_entries * elem_size);
0122         } else {
0123             array_size += (u64) max_entries * elem_size;
0124         }
0125     }
0126 
0127     /* allocate all map elements and zero-initialize them */
0128     if (attr->map_flags & BPF_F_MMAPABLE) {
0129         void *data;
0130 
0131         /* kmalloc'ed memory can't be mmap'ed, use explicit vmalloc */
0132         data = bpf_map_area_mmapable_alloc(array_size, numa_node);
0133         if (!data)
0134             return ERR_PTR(-ENOMEM);
0135         array = data + PAGE_ALIGN(sizeof(struct bpf_array))
0136             - offsetof(struct bpf_array, value);
0137     } else {
0138         array = bpf_map_area_alloc(array_size, numa_node);
0139     }
0140     if (!array)
0141         return ERR_PTR(-ENOMEM);
0142     array->index_mask = index_mask;
0143     array->map.bypass_spec_v1 = bypass_spec_v1;
0144 
0145     /* copy mandatory map attributes */
0146     bpf_map_init_from_attr(&array->map, attr);
0147     array->elem_size = elem_size;
0148 
0149     if (percpu && bpf_array_alloc_percpu(array)) {
0150         bpf_map_area_free(array);
0151         return ERR_PTR(-ENOMEM);
0152     }
0153 
0154     return &array->map;
0155 }
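
/* Worked example for the sizing logic above: with attr->max_entries == 200,
 * fls_long(199) == 8, so mask64 == (1ULL << 8) - 1 == 255 and
 * index_mask == 255.  Without bpf_bypass_spec_v1() the array is then rounded
 * up to max_entries == 256, so "index & index_mask" can never speculatively
 * reach past the allocation.
 */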
0156 
0157 static void *array_map_elem_ptr(struct bpf_array* array, u32 index)
0158 {
0159     return array->value + (u64)array->elem_size * index;
0160 }
0161 
0162 /* Called from syscall or from eBPF program */
0163 static void *array_map_lookup_elem(struct bpf_map *map, void *key)
0164 {
0165     struct bpf_array *array = container_of(map, struct bpf_array, map);
0166     u32 index = *(u32 *)key;
0167 
0168     if (unlikely(index >= array->map.max_entries))
0169         return NULL;
0170 
0171     return array->value + (u64)array->elem_size * (index & array->index_mask);
0172 }
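
/* From a BPF program this path is reached through the bpf_map_lookup_elem()
 * helper.  Minimal sketch, assuming a hypothetical array of __u64 named
 * "my_array":
 *
 *	__u32 key = 0;
 *	__u64 *val = bpf_map_lookup_elem(&my_array, &key);
 *	if (val)
 *		__sync_fetch_and_add(val, 1);
 *
 * The NULL check is mandatory; the verifier rejects programs that
 * dereference the possibly-NULL return value.
 */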
0173 
0174 static int array_map_direct_value_addr(const struct bpf_map *map, u64 *imm,
0175                        u32 off)
0176 {
0177     struct bpf_array *array = container_of(map, struct bpf_array, map);
0178 
0179     if (map->max_entries != 1)
0180         return -ENOTSUPP;
0181     if (off >= map->value_size)
0182         return -EINVAL;
0183 
0184     *imm = (unsigned long)array->value;
0185     return 0;
0186 }
0187 
0188 static int array_map_direct_value_meta(const struct bpf_map *map, u64 imm,
0189                        u32 *off)
0190 {
0191     struct bpf_array *array = container_of(map, struct bpf_array, map);
0192     u64 base = (unsigned long)array->value;
0193     u64 range = array->elem_size;
0194 
0195     if (map->max_entries != 1)
0196         return -ENOTSUPP;
0197     if (imm < base || imm >= base + range)
0198         return -ENOENT;
0199 
0200     *off = imm - base;
0201     return 0;
0202 }
0203 
0204 /* emit BPF instructions equivalent to C code of array_map_lookup_elem() */
0205 static int array_map_gen_lookup(struct bpf_map *map, struct bpf_insn *insn_buf)
0206 {
0207     struct bpf_array *array = container_of(map, struct bpf_array, map);
0208     struct bpf_insn *insn = insn_buf;
0209     u32 elem_size = array->elem_size;
0210     const int ret = BPF_REG_0;
0211     const int map_ptr = BPF_REG_1;
0212     const int index = BPF_REG_2;
0213 
0214     if (map->map_flags & BPF_F_INNER_MAP)
0215         return -EOPNOTSUPP;
0216 
0217     *insn++ = BPF_ALU64_IMM(BPF_ADD, map_ptr, offsetof(struct bpf_array, value));
0218     *insn++ = BPF_LDX_MEM(BPF_W, ret, index, 0);
0219     if (!map->bypass_spec_v1) {
0220         *insn++ = BPF_JMP_IMM(BPF_JGE, ret, map->max_entries, 4);
0221         *insn++ = BPF_ALU32_IMM(BPF_AND, ret, array->index_mask);
0222     } else {
0223         *insn++ = BPF_JMP_IMM(BPF_JGE, ret, map->max_entries, 3);
0224     }
0225 
0226     if (is_power_of_2(elem_size)) {
0227         *insn++ = BPF_ALU64_IMM(BPF_LSH, ret, ilog2(elem_size));
0228     } else {
0229         *insn++ = BPF_ALU64_IMM(BPF_MUL, ret, elem_size);
0230     }
0231     *insn++ = BPF_ALU64_REG(BPF_ADD, ret, map_ptr);
0232     *insn++ = BPF_JMP_IMM(BPF_JA, 0, 0, 1);
0233     *insn++ = BPF_MOV64_IMM(ret, 0);
0234     return insn - insn_buf;
0235 }
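
/* With Spectre v1 mitigation active and a power-of-two elem_size, the
 * sequence emitted above corresponds roughly to (r1 = map, r2 = key,
 * r0 = result):
 *
 *	r1 += offsetof(struct bpf_array, value)
 *	r0 = *(u32 *)(r2 + 0)
 *	if r0 >= max_entries goto +4	// falls through to r0 = 0
 *	r0 &= index_mask
 *	r0 <<= ilog2(elem_size)
 *	r0 += r1
 *	goto +1
 *	r0 = 0
 */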
0236 
0237 /* Called from eBPF program */
0238 static void *percpu_array_map_lookup_elem(struct bpf_map *map, void *key)
0239 {
0240     struct bpf_array *array = container_of(map, struct bpf_array, map);
0241     u32 index = *(u32 *)key;
0242 
0243     if (unlikely(index >= array->map.max_entries))
0244         return NULL;
0245 
0246     return this_cpu_ptr(array->pptrs[index & array->index_mask]);
0247 }
0248 
0249 static void *percpu_array_map_lookup_percpu_elem(struct bpf_map *map, void *key, u32 cpu)
0250 {
0251     struct bpf_array *array = container_of(map, struct bpf_array, map);
0252     u32 index = *(u32 *)key;
0253 
0254     if (cpu >= nr_cpu_ids)
0255         return NULL;
0256 
0257     if (unlikely(index >= array->map.max_entries))
0258         return NULL;
0259 
0260     return per_cpu_ptr(array->pptrs[index & array->index_mask], cpu);
0261 }
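
/* This op backs the bpf_map_lookup_percpu_elem() helper.  Minimal sketch
 * from a BPF program, with a hypothetical per-cpu array "pcpu_counts":
 *
 *	__u32 key = 0, cpu = 1;
 *	__u64 *val = bpf_map_lookup_percpu_elem(&pcpu_counts, &key, cpu);
 *	if (val)
 *		bpf_printk("cpu1 count: %llu", *val);
 */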
0262 
0263 int bpf_percpu_array_copy(struct bpf_map *map, void *key, void *value)
0264 {
0265     struct bpf_array *array = container_of(map, struct bpf_array, map);
0266     u32 index = *(u32 *)key;
0267     void __percpu *pptr;
0268     int cpu, off = 0;
0269     u32 size;
0270 
0271     if (unlikely(index >= array->map.max_entries))
0272         return -ENOENT;
0273 
0274     /* per_cpu areas are zero-filled and bpf programs can only
0275      * access 'value_size' of them, so copying rounded areas
0276      * will not leak any kernel data
0277      */
0278     size = array->elem_size;
0279     rcu_read_lock();
0280     pptr = array->pptrs[index & array->index_mask];
0281     for_each_possible_cpu(cpu) {
0282         bpf_long_memcpy(value + off, per_cpu_ptr(pptr, cpu), size);
0283         off += size;
0284     }
0285     rcu_read_unlock();
0286     return 0;
0287 }
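
/* User-space counterpart (sketch, assuming libbpf): the value buffer passed
 * to bpf_map_lookup_elem() must hold one round_up(value_size, 8) slot per
 * possible CPU, e.g. for a __u64 value:
 *
 *	int ncpus = libbpf_num_possible_cpus();
 *	__u64 *vals = calloc(ncpus, sizeof(__u64));
 *	__u32 key = 0;
 *
 *	if (!bpf_map_lookup_elem(map_fd, &key, vals))
 *		;	// vals[cpu] now holds each CPU's copy
 */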
0288 
0289 /* Called from syscall */
0290 static int array_map_get_next_key(struct bpf_map *map, void *key, void *next_key)
0291 {
0292     struct bpf_array *array = container_of(map, struct bpf_array, map);
0293     u32 index = key ? *(u32 *)key : U32_MAX;
0294     u32 *next = (u32 *)next_key;
0295 
0296     if (index >= array->map.max_entries) {
0297         *next = 0;
0298         return 0;
0299     }
0300 
0301     if (index == array->map.max_entries - 1)
0302         return -ENOENT;
0303 
0304     *next = index + 1;
0305     return 0;
0306 }
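
/* Typical user-space iteration built on this op (sketch, assuming libbpf):
 * a NULL key yields index 0, and the last index returns -ENOENT:
 *
 *	__u32 key, next;
 *	int err = bpf_map_get_next_key(map_fd, NULL, &next);
 *
 *	while (!err) {
 *		key = next;
 *		// ... bpf_map_lookup_elem(map_fd, &key, &value) ...
 *		err = bpf_map_get_next_key(map_fd, &key, &next);
 *	}
 */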
0307 
0308 static void check_and_free_fields(struct bpf_array *arr, void *val)
0309 {
0310     if (map_value_has_timer(&arr->map))
0311         bpf_timer_cancel_and_free(val + arr->map.timer_off);
0312     if (map_value_has_kptrs(&arr->map))
0313         bpf_map_free_kptrs(&arr->map, val);
0314 }
0315 
0316 /* Called from syscall or from eBPF program */
0317 static int array_map_update_elem(struct bpf_map *map, void *key, void *value,
0318                  u64 map_flags)
0319 {
0320     struct bpf_array *array = container_of(map, struct bpf_array, map);
0321     u32 index = *(u32 *)key;
0322     char *val;
0323 
0324     if (unlikely((map_flags & ~BPF_F_LOCK) > BPF_EXIST))
0325         /* unknown flags */
0326         return -EINVAL;
0327 
0328     if (unlikely(index >= array->map.max_entries))
0329         /* all elements were pre-allocated, cannot insert a new one */
0330         return -E2BIG;
0331 
0332     if (unlikely(map_flags & BPF_NOEXIST))
0333         /* all elements already exist */
0334         return -EEXIST;
0335 
0336     if (unlikely((map_flags & BPF_F_LOCK) &&
0337              !map_value_has_spin_lock(map)))
0338         return -EINVAL;
0339 
0340     if (array->map.map_type == BPF_MAP_TYPE_PERCPU_ARRAY) {
0341         memcpy(this_cpu_ptr(array->pptrs[index & array->index_mask]),
0342                value, map->value_size);
0343     } else {
0344         val = array->value +
0345             (u64)array->elem_size * (index & array->index_mask);
0346         if (map_flags & BPF_F_LOCK)
0347             copy_map_value_locked(map, val, value, false);
0348         else
0349             copy_map_value(map, val, value);
0350         check_and_free_fields(array, val);
0351     }
0352     return 0;
0353 }
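
/* From user space (sketch, assuming libbpf): since every index already
 * exists, only BPF_ANY and BPF_EXIST succeed:
 *
 *	__u32 key = 7;
 *	__u64 val = 42;
 *
 *	bpf_map_update_elem(map_fd, &key, &val, BPF_ANY);	// ok
 *	bpf_map_update_elem(map_fd, &key, &val, BPF_NOEXIST);	// -EEXIST
 */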
0354 
0355 int bpf_percpu_array_update(struct bpf_map *map, void *key, void *value,
0356                 u64 map_flags)
0357 {
0358     struct bpf_array *array = container_of(map, struct bpf_array, map);
0359     u32 index = *(u32 *)key;
0360     void __percpu *pptr;
0361     int cpu, off = 0;
0362     u32 size;
0363 
0364     if (unlikely(map_flags > BPF_EXIST))
0365         /* unknown flags */
0366         return -EINVAL;
0367 
0368     if (unlikely(index >= array->map.max_entries))
0369         /* all elements were pre-allocated, cannot insert a new one */
0370         return -E2BIG;
0371 
0372     if (unlikely(map_flags == BPF_NOEXIST))
0373         /* all elements already exist */
0374         return -EEXIST;
0375 
0376     /* user space provides round_up(value_size, 8) bytes that will be
0377      * copied into the per-cpu area. bpf programs can only access
0378      * value_size of it. During lookup the same extra bytes will be
0379      * returned, or zeros which were zero-filled by percpu_alloc,
0380      * so no kernel data leak is possible
0381      */
0382     size = array->elem_size;
0383     rcu_read_lock();
0384     pptr = array->pptrs[index & array->index_mask];
0385     for_each_possible_cpu(cpu) {
0386         bpf_long_memcpy(per_cpu_ptr(pptr, cpu), value + off, size);
0387         off += size;
0388     }
0389     rcu_read_unlock();
0390     return 0;
0391 }
0392 
0393 /* Called from syscall or from eBPF program */
0394 static int array_map_delete_elem(struct bpf_map *map, void *key)
0395 {
0396     return -EINVAL;
0397 }
0398 
0399 static void *array_map_vmalloc_addr(struct bpf_array *array)
0400 {
0401     return (void *)round_down((unsigned long)array, PAGE_SIZE);
0402 }
0403 
0404 static void array_map_free_timers(struct bpf_map *map)
0405 {
0406     struct bpf_array *array = container_of(map, struct bpf_array, map);
0407     int i;
0408 
0409     /* We don't reset or free kptr on uref dropping to zero. */
0410     if (!map_value_has_timer(map))
0411         return;
0412 
0413     for (i = 0; i < array->map.max_entries; i++)
0414         bpf_timer_cancel_and_free(array_map_elem_ptr(array, i) + map->timer_off);
0415 }
0416 
0417 /* Called when map->refcnt goes to zero, either from workqueue or from syscall */
0418 static void array_map_free(struct bpf_map *map)
0419 {
0420     struct bpf_array *array = container_of(map, struct bpf_array, map);
0421     int i;
0422 
0423     if (map_value_has_kptrs(map)) {
0424         for (i = 0; i < array->map.max_entries; i++)
0425             bpf_map_free_kptrs(map, array_map_elem_ptr(array, i));
0426         bpf_map_free_kptr_off_tab(map);
0427     }
0428 
0429     if (array->map.map_type == BPF_MAP_TYPE_PERCPU_ARRAY)
0430         bpf_array_free_percpu(array);
0431 
0432     if (array->map.map_flags & BPF_F_MMAPABLE)
0433         bpf_map_area_free(array_map_vmalloc_addr(array));
0434     else
0435         bpf_map_area_free(array);
0436 }
0437 
0438 static void array_map_seq_show_elem(struct bpf_map *map, void *key,
0439                     struct seq_file *m)
0440 {
0441     void *value;
0442 
0443     rcu_read_lock();
0444 
0445     value = array_map_lookup_elem(map, key);
0446     if (!value) {
0447         rcu_read_unlock();
0448         return;
0449     }
0450 
0451     if (map->btf_key_type_id)
0452         seq_printf(m, "%u: ", *(u32 *)key);
0453     btf_type_seq_show(map->btf, map->btf_value_type_id, value, m);
0454     seq_puts(m, "\n");
0455 
0456     rcu_read_unlock();
0457 }
0458 
0459 static void percpu_array_map_seq_show_elem(struct bpf_map *map, void *key,
0460                        struct seq_file *m)
0461 {
0462     struct bpf_array *array = container_of(map, struct bpf_array, map);
0463     u32 index = *(u32 *)key;
0464     void __percpu *pptr;
0465     int cpu;
0466 
0467     rcu_read_lock();
0468 
0469     seq_printf(m, "%u: {\n", *(u32 *)key);
0470     pptr = array->pptrs[index & array->index_mask];
0471     for_each_possible_cpu(cpu) {
0472         seq_printf(m, "\tcpu%d: ", cpu);
0473         btf_type_seq_show(map->btf, map->btf_value_type_id,
0474                   per_cpu_ptr(pptr, cpu), m);
0475         seq_puts(m, "\n");
0476     }
0477     seq_puts(m, "}\n");
0478 
0479     rcu_read_unlock();
0480 }
0481 
0482 static int array_map_check_btf(const struct bpf_map *map,
0483                    const struct btf *btf,
0484                    const struct btf_type *key_type,
0485                    const struct btf_type *value_type)
0486 {
0487     u32 int_data;
0488 
0489     /* One exception for keyless BTF: .bss/.data/.rodata map */
0490     if (btf_type_is_void(key_type)) {
0491         if (map->map_type != BPF_MAP_TYPE_ARRAY ||
0492             map->max_entries != 1)
0493             return -EINVAL;
0494 
0495         if (BTF_INFO_KIND(value_type->info) != BTF_KIND_DATASEC)
0496             return -EINVAL;
0497 
0498         return 0;
0499     }
0500 
0501     if (BTF_INFO_KIND(key_type->info) != BTF_KIND_INT)
0502         return -EINVAL;
0503 
0504     int_data = *(u32 *)(key_type + 1);
0505     /* bpf array can only take a u32 key. This check makes sure
0506      * that the btf matches the attr used during map_create.
0507      */
0508     if (BTF_INT_BITS(int_data) != 32 || BTF_INT_OFFSET(int_data))
0509         return -EINVAL;
0510 
0511     return 0;
0512 }
0513 
0514 static int array_map_mmap(struct bpf_map *map, struct vm_area_struct *vma)
0515 {
0516     struct bpf_array *array = container_of(map, struct bpf_array, map);
0517     pgoff_t pgoff = PAGE_ALIGN(sizeof(*array)) >> PAGE_SHIFT;
0518 
0519     if (!(map->map_flags & BPF_F_MMAPABLE))
0520         return -EINVAL;
0521 
0522     if (vma->vm_pgoff * PAGE_SIZE + (vma->vm_end - vma->vm_start) >
0523         PAGE_ALIGN((u64)array->map.max_entries * array->elem_size))
0524         return -EINVAL;
0525 
0526     return remap_vmalloc_range(vma, array_map_vmalloc_addr(array),
0527                    vma->vm_pgoff + pgoff);
0528 }
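
/* User-space mapping of a BPF_F_MMAPABLE array (sketch): offset 0 maps the
 * first element and the mapping must stay within the page-aligned value
 * area checked above:
 *
 *	size_t len = max_entries * round_up(value_size, 8);
 *	__u64 *vals = mmap(NULL, len, PROT_READ | PROT_WRITE,
 *			   MAP_SHARED, map_fd, 0);
 *	if (vals != MAP_FAILED)
 *		vals[0] = 1;	// writes are visible to BPF programs
 */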
0529 
0530 static bool array_map_meta_equal(const struct bpf_map *meta0,
0531                  const struct bpf_map *meta1)
0532 {
0533     if (!bpf_map_meta_equal(meta0, meta1))
0534         return false;
0535     return meta0->map_flags & BPF_F_INNER_MAP ? true :
0536            meta0->max_entries == meta1->max_entries;
0537 }
0538 
0539 struct bpf_iter_seq_array_map_info {
0540     struct bpf_map *map;
0541     void *percpu_value_buf;
0542     u32 index;
0543 };
0544 
0545 static void *bpf_array_map_seq_start(struct seq_file *seq, loff_t *pos)
0546 {
0547     struct bpf_iter_seq_array_map_info *info = seq->private;
0548     struct bpf_map *map = info->map;
0549     struct bpf_array *array;
0550     u32 index;
0551 
0552     if (info->index >= map->max_entries)
0553         return NULL;
0554 
0555     if (*pos == 0)
0556         ++*pos;
0557     array = container_of(map, struct bpf_array, map);
0558     index = info->index & array->index_mask;
0559     if (info->percpu_value_buf)
0560            return array->pptrs[index];
0561     return array_map_elem_ptr(array, index);
0562 }
0563 
0564 static void *bpf_array_map_seq_next(struct seq_file *seq, void *v, loff_t *pos)
0565 {
0566     struct bpf_iter_seq_array_map_info *info = seq->private;
0567     struct bpf_map *map = info->map;
0568     struct bpf_array *array;
0569     u32 index;
0570 
0571     ++*pos;
0572     ++info->index;
0573     if (info->index >= map->max_entries)
0574         return NULL;
0575 
0576     array = container_of(map, struct bpf_array, map);
0577     index = info->index & array->index_mask;
0578     if (info->percpu_value_buf)
0579            return array->pptrs[index];
0580     return array_map_elem_ptr(array, index);
0581 }
0582 
0583 static int __bpf_array_map_seq_show(struct seq_file *seq, void *v)
0584 {
0585     struct bpf_iter_seq_array_map_info *info = seq->private;
0586     struct bpf_iter__bpf_map_elem ctx = {};
0587     struct bpf_map *map = info->map;
0588     struct bpf_array *array = container_of(map, struct bpf_array, map);
0589     struct bpf_iter_meta meta;
0590     struct bpf_prog *prog;
0591     int off = 0, cpu = 0;
0592     void __percpu **pptr;
0593     u32 size;
0594 
0595     meta.seq = seq;
0596     prog = bpf_iter_get_info(&meta, v == NULL);
0597     if (!prog)
0598         return 0;
0599 
0600     ctx.meta = &meta;
0601     ctx.map = info->map;
0602     if (v) {
0603         ctx.key = &info->index;
0604 
0605         if (!info->percpu_value_buf) {
0606             ctx.value = v;
0607         } else {
0608             pptr = v;
0609             size = array->elem_size;
0610             for_each_possible_cpu(cpu) {
0611                 bpf_long_memcpy(info->percpu_value_buf + off,
0612                         per_cpu_ptr(pptr, cpu),
0613                         size);
0614                 off += size;
0615             }
0616             ctx.value = info->percpu_value_buf;
0617         }
0618     }
0619 
0620     return bpf_iter_run_prog(prog, &ctx);
0621 }
0622 
0623 static int bpf_array_map_seq_show(struct seq_file *seq, void *v)
0624 {
0625     return __bpf_array_map_seq_show(seq, v);
0626 }
0627 
0628 static void bpf_array_map_seq_stop(struct seq_file *seq, void *v)
0629 {
0630     if (!v)
0631         (void)__bpf_array_map_seq_show(seq, NULL);
0632 }
0633 
0634 static int bpf_iter_init_array_map(void *priv_data,
0635                    struct bpf_iter_aux_info *aux)
0636 {
0637     struct bpf_iter_seq_array_map_info *seq_info = priv_data;
0638     struct bpf_map *map = aux->map;
0639     struct bpf_array *array = container_of(map, struct bpf_array, map);
0640     void *value_buf;
0641     u32 buf_size;
0642 
0643     if (map->map_type == BPF_MAP_TYPE_PERCPU_ARRAY) {
0644         buf_size = array->elem_size * num_possible_cpus();
0645         value_buf = kmalloc(buf_size, GFP_USER | __GFP_NOWARN);
0646         if (!value_buf)
0647             return -ENOMEM;
0648 
0649         seq_info->percpu_value_buf = value_buf;
0650     }
0651 
0652     /* bpf_iter_attach_map() acquires a map uref, and the uref may be
0653      * released before or in the middle of iterating map elements, so
0654      * acquire an extra map uref for iterator.
0655      */
0656     bpf_map_inc_with_uref(map);
0657     seq_info->map = map;
0658     return 0;
0659 }
0660 
0661 static void bpf_iter_fini_array_map(void *priv_data)
0662 {
0663     struct bpf_iter_seq_array_map_info *seq_info = priv_data;
0664 
0665     bpf_map_put_with_uref(seq_info->map);
0666     kfree(seq_info->percpu_value_buf);
0667 }
0668 
0669 static const struct seq_operations bpf_array_map_seq_ops = {
0670     .start  = bpf_array_map_seq_start,
0671     .next   = bpf_array_map_seq_next,
0672     .stop   = bpf_array_map_seq_stop,
0673     .show   = bpf_array_map_seq_show,
0674 };
0675 
0676 static const struct bpf_iter_seq_info iter_seq_info = {
0677     .seq_ops        = &bpf_array_map_seq_ops,
0678     .init_seq_private   = bpf_iter_init_array_map,
0679     .fini_seq_private   = bpf_iter_fini_array_map,
0680     .seq_priv_size      = sizeof(struct bpf_iter_seq_array_map_info),
0681 };
0682 
0683 static int bpf_for_each_array_elem(struct bpf_map *map, bpf_callback_t callback_fn,
0684                    void *callback_ctx, u64 flags)
0685 {
0686     u32 i, key, num_elems = 0;
0687     struct bpf_array *array;
0688     bool is_percpu;
0689     u64 ret = 0;
0690     void *val;
0691 
0692     if (flags != 0)
0693         return -EINVAL;
0694 
0695     is_percpu = map->map_type == BPF_MAP_TYPE_PERCPU_ARRAY;
0696     array = container_of(map, struct bpf_array, map);
0697     if (is_percpu)
0698         migrate_disable();
0699     for (i = 0; i < map->max_entries; i++) {
0700         if (is_percpu)
0701             val = this_cpu_ptr(array->pptrs[i]);
0702         else
0703             val = array_map_elem_ptr(array, i);
0704         num_elems++;
0705         key = i;
0706         ret = callback_fn((u64)(long)map, (u64)(long)&key,
0707                   (u64)(long)val, (u64)(long)callback_ctx, 0);
0708         /* return value: 0 - continue, 1 - stop and return */
0709         if (ret)
0710             break;
0711     }
0712 
0713     if (is_percpu)
0714         migrate_enable();
0715     return num_elems;
0716 }
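
/* This is the backend of the bpf_for_each_map_elem() helper.  Sketch of the
 * BPF-program side, assuming a hypothetical array of __u64 counters:
 *
 *	static long sum_cb(struct bpf_map *map, __u32 *key, __u64 *val, void *ctx)
 *	{
 *		*(__u64 *)ctx += *val;
 *		return 0;	// 0 = continue, 1 = stop
 *	}
 *
 *	__u64 sum = 0;
 *	bpf_for_each_map_elem(&my_array, sum_cb, &sum, 0);
 */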
0717 
0718 BTF_ID_LIST_SINGLE(array_map_btf_ids, struct, bpf_array)
0719 const struct bpf_map_ops array_map_ops = {
0720     .map_meta_equal = array_map_meta_equal,
0721     .map_alloc_check = array_map_alloc_check,
0722     .map_alloc = array_map_alloc,
0723     .map_free = array_map_free,
0724     .map_get_next_key = array_map_get_next_key,
0725     .map_release_uref = array_map_free_timers,
0726     .map_lookup_elem = array_map_lookup_elem,
0727     .map_update_elem = array_map_update_elem,
0728     .map_delete_elem = array_map_delete_elem,
0729     .map_gen_lookup = array_map_gen_lookup,
0730     .map_direct_value_addr = array_map_direct_value_addr,
0731     .map_direct_value_meta = array_map_direct_value_meta,
0732     .map_mmap = array_map_mmap,
0733     .map_seq_show_elem = array_map_seq_show_elem,
0734     .map_check_btf = array_map_check_btf,
0735     .map_lookup_batch = generic_map_lookup_batch,
0736     .map_update_batch = generic_map_update_batch,
0737     .map_set_for_each_callback_args = map_set_for_each_callback_args,
0738     .map_for_each_callback = bpf_for_each_array_elem,
0739     .map_btf_id = &array_map_btf_ids[0],
0740     .iter_seq_info = &iter_seq_info,
0741 };
0742 
0743 const struct bpf_map_ops percpu_array_map_ops = {
0744     .map_meta_equal = bpf_map_meta_equal,
0745     .map_alloc_check = array_map_alloc_check,
0746     .map_alloc = array_map_alloc,
0747     .map_free = array_map_free,
0748     .map_get_next_key = array_map_get_next_key,
0749     .map_lookup_elem = percpu_array_map_lookup_elem,
0750     .map_update_elem = array_map_update_elem,
0751     .map_delete_elem = array_map_delete_elem,
0752     .map_lookup_percpu_elem = percpu_array_map_lookup_percpu_elem,
0753     .map_seq_show_elem = percpu_array_map_seq_show_elem,
0754     .map_check_btf = array_map_check_btf,
0755     .map_lookup_batch = generic_map_lookup_batch,
0756     .map_update_batch = generic_map_update_batch,
0757     .map_set_for_each_callback_args = map_set_for_each_callback_args,
0758     .map_for_each_callback = bpf_for_each_array_elem,
0759     .map_btf_id = &array_map_btf_ids[0],
0760     .iter_seq_info = &iter_seq_info,
0761 };
0762 
0763 static int fd_array_map_alloc_check(union bpf_attr *attr)
0764 {
0765     /* only file descriptors can be stored in this type of map */
0766     if (attr->value_size != sizeof(u32))
0767         return -EINVAL;
0768     /* Program read-only/write-only not supported for special maps yet. */
0769     if (attr->map_flags & (BPF_F_RDONLY_PROG | BPF_F_WRONLY_PROG))
0770         return -EINVAL;
0771     return array_map_alloc_check(attr);
0772 }
0773 
0774 static void fd_array_map_free(struct bpf_map *map)
0775 {
0776     struct bpf_array *array = container_of(map, struct bpf_array, map);
0777     int i;
0778 
0779     /* make sure it's empty */
0780     for (i = 0; i < array->map.max_entries; i++)
0781         BUG_ON(array->ptrs[i] != NULL);
0782 
0783     bpf_map_area_free(array);
0784 }
0785 
0786 static void *fd_array_map_lookup_elem(struct bpf_map *map, void *key)
0787 {
0788     return ERR_PTR(-EOPNOTSUPP);
0789 }
0790 
0791 /* only called from syscall */
0792 int bpf_fd_array_map_lookup_elem(struct bpf_map *map, void *key, u32 *value)
0793 {
0794     void **elem, *ptr;
0795     int ret = 0;
0796 
0797     if (!map->ops->map_fd_sys_lookup_elem)
0798         return -ENOTSUPP;
0799 
0800     rcu_read_lock();
0801     elem = array_map_lookup_elem(map, key);
0802     if (elem && (ptr = READ_ONCE(*elem)))
0803         *value = map->ops->map_fd_sys_lookup_elem(ptr);
0804     else
0805         ret = -ENOENT;
0806     rcu_read_unlock();
0807 
0808     return ret;
0809 }
0810 
0811 /* only called from syscall */
0812 int bpf_fd_array_map_update_elem(struct bpf_map *map, struct file *map_file,
0813                  void *key, void *value, u64 map_flags)
0814 {
0815     struct bpf_array *array = container_of(map, struct bpf_array, map);
0816     void *new_ptr, *old_ptr;
0817     u32 index = *(u32 *)key, ufd;
0818 
0819     if (map_flags != BPF_ANY)
0820         return -EINVAL;
0821 
0822     if (index >= array->map.max_entries)
0823         return -E2BIG;
0824 
0825     ufd = *(u32 *)value;
0826     new_ptr = map->ops->map_fd_get_ptr(map, map_file, ufd);
0827     if (IS_ERR(new_ptr))
0828         return PTR_ERR(new_ptr);
0829 
0830     if (map->ops->map_poke_run) {
0831         mutex_lock(&array->aux->poke_mutex);
0832         old_ptr = xchg(array->ptrs + index, new_ptr);
0833         map->ops->map_poke_run(map, index, old_ptr, new_ptr);
0834         mutex_unlock(&array->aux->poke_mutex);
0835     } else {
0836         old_ptr = xchg(array->ptrs + index, new_ptr);
0837     }
0838 
0839     if (old_ptr)
0840         map->ops->map_fd_put_ptr(old_ptr);
0841     return 0;
0842 }
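
/* The value written here from user space is a program fd; at run time the
 * slot is consumed by the bpf_tail_call() helper (sketch, hypothetical
 * "jmp_table" prog array):
 *
 *	bpf_tail_call(ctx, &jmp_table, idx);
 *	// only reached if the slot was empty or the tail call failed
 */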
0843 
0844 static int fd_array_map_delete_elem(struct bpf_map *map, void *key)
0845 {
0846     struct bpf_array *array = container_of(map, struct bpf_array, map);
0847     void *old_ptr;
0848     u32 index = *(u32 *)key;
0849 
0850     if (index >= array->map.max_entries)
0851         return -E2BIG;
0852 
0853     if (map->ops->map_poke_run) {
0854         mutex_lock(&array->aux->poke_mutex);
0855         old_ptr = xchg(array->ptrs + index, NULL);
0856         map->ops->map_poke_run(map, index, old_ptr, NULL);
0857         mutex_unlock(&array->aux->poke_mutex);
0858     } else {
0859         old_ptr = xchg(array->ptrs + index, NULL);
0860     }
0861 
0862     if (old_ptr) {
0863         map->ops->map_fd_put_ptr(old_ptr);
0864         return 0;
0865     } else {
0866         return -ENOENT;
0867     }
0868 }
0869 
0870 static void *prog_fd_array_get_ptr(struct bpf_map *map,
0871                    struct file *map_file, int fd)
0872 {
0873     struct bpf_prog *prog = bpf_prog_get(fd);
0874 
0875     if (IS_ERR(prog))
0876         return prog;
0877 
0878     if (!bpf_prog_map_compatible(map, prog)) {
0879         bpf_prog_put(prog);
0880         return ERR_PTR(-EINVAL);
0881     }
0882 
0883     return prog;
0884 }
0885 
0886 static void prog_fd_array_put_ptr(void *ptr)
0887 {
0888     bpf_prog_put(ptr);
0889 }
0890 
0891 static u32 prog_fd_array_sys_lookup_elem(void *ptr)
0892 {
0893     return ((struct bpf_prog *)ptr)->aux->id;
0894 }
0895 
0896 /* decrement refcnt of all bpf_progs that are stored in this map */
0897 static void bpf_fd_array_map_clear(struct bpf_map *map)
0898 {
0899     struct bpf_array *array = container_of(map, struct bpf_array, map);
0900     int i;
0901 
0902     for (i = 0; i < array->map.max_entries; i++)
0903         fd_array_map_delete_elem(map, &i);
0904 }
0905 
0906 static void prog_array_map_seq_show_elem(struct bpf_map *map, void *key,
0907                      struct seq_file *m)
0908 {
0909     void **elem, *ptr;
0910     u32 prog_id;
0911 
0912     rcu_read_lock();
0913 
0914     elem = array_map_lookup_elem(map, key);
0915     if (elem) {
0916         ptr = READ_ONCE(*elem);
0917         if (ptr) {
0918             seq_printf(m, "%u: ", *(u32 *)key);
0919             prog_id = prog_fd_array_sys_lookup_elem(ptr);
0920             btf_type_seq_show(map->btf, map->btf_value_type_id,
0921                       &prog_id, m);
0922             seq_puts(m, "\n");
0923         }
0924     }
0925 
0926     rcu_read_unlock();
0927 }
0928 
0929 struct prog_poke_elem {
0930     struct list_head list;
0931     struct bpf_prog_aux *aux;
0932 };
0933 
0934 static int prog_array_map_poke_track(struct bpf_map *map,
0935                      struct bpf_prog_aux *prog_aux)
0936 {
0937     struct prog_poke_elem *elem;
0938     struct bpf_array_aux *aux;
0939     int ret = 0;
0940 
0941     aux = container_of(map, struct bpf_array, map)->aux;
0942     mutex_lock(&aux->poke_mutex);
0943     list_for_each_entry(elem, &aux->poke_progs, list) {
0944         if (elem->aux == prog_aux)
0945             goto out;
0946     }
0947 
0948     elem = kmalloc(sizeof(*elem), GFP_KERNEL);
0949     if (!elem) {
0950         ret = -ENOMEM;
0951         goto out;
0952     }
0953 
0954     INIT_LIST_HEAD(&elem->list);
0955     /* We must track the program's aux info at this point in time
0956      * since the program pointer itself may not be stable yet, see
0957      * also comment in prog_array_map_poke_run().
0958      */
0959     elem->aux = prog_aux;
0960 
0961     list_add_tail(&elem->list, &aux->poke_progs);
0962 out:
0963     mutex_unlock(&aux->poke_mutex);
0964     return ret;
0965 }
0966 
0967 static void prog_array_map_poke_untrack(struct bpf_map *map,
0968                     struct bpf_prog_aux *prog_aux)
0969 {
0970     struct prog_poke_elem *elem, *tmp;
0971     struct bpf_array_aux *aux;
0972 
0973     aux = container_of(map, struct bpf_array, map)->aux;
0974     mutex_lock(&aux->poke_mutex);
0975     list_for_each_entry_safe(elem, tmp, &aux->poke_progs, list) {
0976         if (elem->aux == prog_aux) {
0977             list_del_init(&elem->list);
0978             kfree(elem);
0979             break;
0980         }
0981     }
0982     mutex_unlock(&aux->poke_mutex);
0983 }
0984 
0985 static void prog_array_map_poke_run(struct bpf_map *map, u32 key,
0986                     struct bpf_prog *old,
0987                     struct bpf_prog *new)
0988 {
0989     u8 *old_addr, *new_addr, *old_bypass_addr;
0990     struct prog_poke_elem *elem;
0991     struct bpf_array_aux *aux;
0992 
0993     aux = container_of(map, struct bpf_array, map)->aux;
0994     WARN_ON_ONCE(!mutex_is_locked(&aux->poke_mutex));
0995 
0996     list_for_each_entry(elem, &aux->poke_progs, list) {
0997         struct bpf_jit_poke_descriptor *poke;
0998         int i, ret;
0999 
1000         for (i = 0; i < elem->aux->size_poke_tab; i++) {
1001             poke = &elem->aux->poke_tab[i];
1002 
1003             /* Few things to be aware of:
1004              *
1005              * 1) We can only ever access aux in this context, but
1006              *    not aux->prog since it might not be stable yet and
1007              *    there could be danger of use after free otherwise.
1008              * 2) Initially when we start tracking aux, the program
1009              *    is not JITed yet and also does not have a kallsyms
1010              *    entry. We skip these as poke->tailcall_target_stable
1011              *    is not active yet. The JIT will do the final fixup
1012              *    before setting it stable. The various
1013              *    poke->tailcall_target_stable are successively
1014              *    activated, so tail call updates can arrive from here
1015              *    while JIT is still finishing its final fixup for
1016              *    non-activated poke entries.
1017              * 3) On program teardown, the program's kallsym entry gets
1018              *    removed out of RCU callback, but we can only untrack
1019              *    from sleepable context, therefore bpf_arch_text_poke()
1020              *    might not see that this is in BPF text section and
1021              *    bails out with -EINVAL. As these are unreachable since
1022              *    RCU grace period already passed, we simply skip them.
1023              * 4) It is also okay for a program to reach a refcount of
1024              *    zero while patching is in progress, since we're protected
1025              *    under poke_mutex and untrack the programs before the JIT
1026              *    buffer is freed. When we're still in the middle of
1027              *    patching and the kallsyms entry of the program suddenly
1028              *    gets evicted, we just skip the rest, which is fine due
1029              *    to point 3).
1030              * 5) Any other error happening below from bpf_arch_text_poke()
1031              *    is an unexpected bug.
1032              */
1033             if (!READ_ONCE(poke->tailcall_target_stable))
1034                 continue;
1035             if (poke->reason != BPF_POKE_REASON_TAIL_CALL)
1036                 continue;
1037             if (poke->tail_call.map != map ||
1038                 poke->tail_call.key != key)
1039                 continue;
1040 
1041             old_bypass_addr = old ? NULL : poke->bypass_addr;
1042             old_addr = old ? (u8 *)old->bpf_func + poke->adj_off : NULL;
1043             new_addr = new ? (u8 *)new->bpf_func + poke->adj_off : NULL;
1044 
1045             if (new) {
1046                 ret = bpf_arch_text_poke(poke->tailcall_target,
1047                              BPF_MOD_JUMP,
1048                              old_addr, new_addr);
1049                 BUG_ON(ret < 0 && ret != -EINVAL);
1050                 if (!old) {
1051                     ret = bpf_arch_text_poke(poke->tailcall_bypass,
1052                                  BPF_MOD_JUMP,
1053                                  poke->bypass_addr,
1054                                  NULL);
1055                     BUG_ON(ret < 0 && ret != -EINVAL);
1056                 }
1057             } else {
1058                 ret = bpf_arch_text_poke(poke->tailcall_bypass,
1059                              BPF_MOD_JUMP,
1060                              old_bypass_addr,
1061                              poke->bypass_addr);
1062                 BUG_ON(ret < 0 && ret != -EINVAL);
1063                 /* let other CPUs finish executing the program so that
1064                  * it will not be possible to expose them to an invalid
1065                  * nop, stack unwind, or nop state
1066                  */
1067                 if (!ret)
1068                     synchronize_rcu();
1069                 ret = bpf_arch_text_poke(poke->tailcall_target,
1070                              BPF_MOD_JUMP,
1071                              old_addr, NULL);
1072                 BUG_ON(ret < 0 && ret != -EINVAL);
1073             }
1074         }
1075     }
1076 }
1077 
1078 static void prog_array_map_clear_deferred(struct work_struct *work)
1079 {
1080     struct bpf_map *map = container_of(work, struct bpf_array_aux,
1081                        work)->map;
1082     bpf_fd_array_map_clear(map);
1083     bpf_map_put(map);
1084 }
1085 
1086 static void prog_array_map_clear(struct bpf_map *map)
1087 {
1088     struct bpf_array_aux *aux = container_of(map, struct bpf_array,
1089                          map)->aux;
1090     bpf_map_inc(map);
1091     schedule_work(&aux->work);
1092 }
1093 
1094 static struct bpf_map *prog_array_map_alloc(union bpf_attr *attr)
1095 {
1096     struct bpf_array_aux *aux;
1097     struct bpf_map *map;
1098 
1099     aux = kzalloc(sizeof(*aux), GFP_KERNEL_ACCOUNT);
1100     if (!aux)
1101         return ERR_PTR(-ENOMEM);
1102 
1103     INIT_WORK(&aux->work, prog_array_map_clear_deferred);
1104     INIT_LIST_HEAD(&aux->poke_progs);
1105     mutex_init(&aux->poke_mutex);
1106 
1107     map = array_map_alloc(attr);
1108     if (IS_ERR(map)) {
1109         kfree(aux);
1110         return map;
1111     }
1112 
1113     container_of(map, struct bpf_array, map)->aux = aux;
1114     aux->map = map;
1115 
1116     return map;
1117 }
1118 
1119 static void prog_array_map_free(struct bpf_map *map)
1120 {
1121     struct prog_poke_elem *elem, *tmp;
1122     struct bpf_array_aux *aux;
1123 
1124     aux = container_of(map, struct bpf_array, map)->aux;
1125     list_for_each_entry_safe(elem, tmp, &aux->poke_progs, list) {
1126         list_del_init(&elem->list);
1127         kfree(elem);
1128     }
1129     kfree(aux);
1130     fd_array_map_free(map);
1131 }
1132 
1133 /* prog_array->aux->{type,jited} is a runtime binding.
1134  * Doing static check alone in the verifier is not enough.
1135  * Thus, prog_array_map cannot be used as an inner_map
1136  * and map_meta_equal is not implemented.
1137  */
1138 const struct bpf_map_ops prog_array_map_ops = {
1139     .map_alloc_check = fd_array_map_alloc_check,
1140     .map_alloc = prog_array_map_alloc,
1141     .map_free = prog_array_map_free,
1142     .map_poke_track = prog_array_map_poke_track,
1143     .map_poke_untrack = prog_array_map_poke_untrack,
1144     .map_poke_run = prog_array_map_poke_run,
1145     .map_get_next_key = array_map_get_next_key,
1146     .map_lookup_elem = fd_array_map_lookup_elem,
1147     .map_delete_elem = fd_array_map_delete_elem,
1148     .map_fd_get_ptr = prog_fd_array_get_ptr,
1149     .map_fd_put_ptr = prog_fd_array_put_ptr,
1150     .map_fd_sys_lookup_elem = prog_fd_array_sys_lookup_elem,
1151     .map_release_uref = prog_array_map_clear,
1152     .map_seq_show_elem = prog_array_map_seq_show_elem,
1153     .map_btf_id = &array_map_btf_ids[0],
1154 };
1155 
1156 static struct bpf_event_entry *bpf_event_entry_gen(struct file *perf_file,
1157                            struct file *map_file)
1158 {
1159     struct bpf_event_entry *ee;
1160 
1161     ee = kzalloc(sizeof(*ee), GFP_ATOMIC);
1162     if (ee) {
1163         ee->event = perf_file->private_data;
1164         ee->perf_file = perf_file;
1165         ee->map_file = map_file;
1166     }
1167 
1168     return ee;
1169 }
1170 
1171 static void __bpf_event_entry_free(struct rcu_head *rcu)
1172 {
1173     struct bpf_event_entry *ee;
1174 
1175     ee = container_of(rcu, struct bpf_event_entry, rcu);
1176     fput(ee->perf_file);
1177     kfree(ee);
1178 }
1179 
1180 static void bpf_event_entry_free_rcu(struct bpf_event_entry *ee)
1181 {
1182     call_rcu(&ee->rcu, __bpf_event_entry_free);
1183 }
1184 
1185 static void *perf_event_fd_array_get_ptr(struct bpf_map *map,
1186                      struct file *map_file, int fd)
1187 {
1188     struct bpf_event_entry *ee;
1189     struct perf_event *event;
1190     struct file *perf_file;
1191     u64 value;
1192 
1193     perf_file = perf_event_get(fd);
1194     if (IS_ERR(perf_file))
1195         return perf_file;
1196 
1197     ee = ERR_PTR(-EOPNOTSUPP);
1198     event = perf_file->private_data;
1199     if (perf_event_read_local(event, &value, NULL, NULL) == -EOPNOTSUPP)
1200         goto err_out;
1201 
1202     ee = bpf_event_entry_gen(perf_file, map_file);
1203     if (ee)
1204         return ee;
1205     ee = ERR_PTR(-ENOMEM);
1206 err_out:
1207     fput(perf_file);
1208     return ee;
1209 }
1210 
1211 static void perf_event_fd_array_put_ptr(void *ptr)
1212 {
1213     bpf_event_entry_free_rcu(ptr);
1214 }
1215 
1216 static void perf_event_fd_array_release(struct bpf_map *map,
1217                     struct file *map_file)
1218 {
1219     struct bpf_array *array = container_of(map, struct bpf_array, map);
1220     struct bpf_event_entry *ee;
1221     int i;
1222 
1223     if (map->map_flags & BPF_F_PRESERVE_ELEMS)
1224         return;
1225 
1226     rcu_read_lock();
1227     for (i = 0; i < array->map.max_entries; i++) {
1228         ee = READ_ONCE(array->ptrs[i]);
1229         if (ee && ee->map_file == map_file)
1230             fd_array_map_delete_elem(map, &i);
1231     }
1232     rcu_read_unlock();
1233 }
1234 
1235 static void perf_event_fd_array_map_free(struct bpf_map *map)
1236 {
1237     if (map->map_flags & BPF_F_PRESERVE_ELEMS)
1238         bpf_fd_array_map_clear(map);
1239     fd_array_map_free(map);
1240 }
1241 
1242 const struct bpf_map_ops perf_event_array_map_ops = {
1243     .map_meta_equal = bpf_map_meta_equal,
1244     .map_alloc_check = fd_array_map_alloc_check,
1245     .map_alloc = array_map_alloc,
1246     .map_free = perf_event_fd_array_map_free,
1247     .map_get_next_key = array_map_get_next_key,
1248     .map_lookup_elem = fd_array_map_lookup_elem,
1249     .map_delete_elem = fd_array_map_delete_elem,
1250     .map_fd_get_ptr = perf_event_fd_array_get_ptr,
1251     .map_fd_put_ptr = perf_event_fd_array_put_ptr,
1252     .map_release = perf_event_fd_array_release,
1253     .map_check_btf = map_check_no_btf,
1254     .map_btf_id = &array_map_btf_ids[0],
1255 };
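
/* The usual producer into a perf event array is bpf_perf_event_output()
 * from a BPF program (sketch, hypothetical "events" map and sample struct):
 *
 *	bpf_perf_event_output(ctx, &events, BPF_F_CURRENT_CPU,
 *			      &sample, sizeof(sample));
 */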
1256 
1257 #ifdef CONFIG_CGROUPS
1258 static void *cgroup_fd_array_get_ptr(struct bpf_map *map,
1259                      struct file *map_file /* not used */,
1260                      int fd)
1261 {
1262     return cgroup_get_from_fd(fd);
1263 }
1264 
1265 static void cgroup_fd_array_put_ptr(void *ptr)
1266 {
1267     /* cgroup_put() frees cgrp after an RCU grace period */
1268     cgroup_put(ptr);
1269 }
1270 
1271 static void cgroup_fd_array_free(struct bpf_map *map)
1272 {
1273     bpf_fd_array_map_clear(map);
1274     fd_array_map_free(map);
1275 }
1276 
1277 const struct bpf_map_ops cgroup_array_map_ops = {
1278     .map_meta_equal = bpf_map_meta_equal,
1279     .map_alloc_check = fd_array_map_alloc_check,
1280     .map_alloc = array_map_alloc,
1281     .map_free = cgroup_fd_array_free,
1282     .map_get_next_key = array_map_get_next_key,
1283     .map_lookup_elem = fd_array_map_lookup_elem,
1284     .map_delete_elem = fd_array_map_delete_elem,
1285     .map_fd_get_ptr = cgroup_fd_array_get_ptr,
1286     .map_fd_put_ptr = cgroup_fd_array_put_ptr,
1287     .map_check_btf = map_check_no_btf,
1288     .map_btf_id = &array_map_btf_ids[0],
1289 };
1290 #endif
1291 
1292 static struct bpf_map *array_of_map_alloc(union bpf_attr *attr)
1293 {
1294     struct bpf_map *map, *inner_map_meta;
1295 
1296     inner_map_meta = bpf_map_meta_alloc(attr->inner_map_fd);
1297     if (IS_ERR(inner_map_meta))
1298         return inner_map_meta;
1299 
1300     map = array_map_alloc(attr);
1301     if (IS_ERR(map)) {
1302         bpf_map_meta_free(inner_map_meta);
1303         return map;
1304     }
1305 
1306     map->inner_map_meta = inner_map_meta;
1307 
1308     return map;
1309 }
1310 
1311 static void array_of_map_free(struct bpf_map *map)
1312 {
1313     /* map->inner_map_meta is only accessed by syscall which
1314      * is protected by fdget/fdput.
1315      */
1316     bpf_map_meta_free(map->inner_map_meta);
1317     bpf_fd_array_map_clear(map);
1318     fd_array_map_free(map);
1319 }
1320 
1321 static void *array_of_map_lookup_elem(struct bpf_map *map, void *key)
1322 {
1323     struct bpf_map **inner_map = array_map_lookup_elem(map, key);
1324 
1325     if (!inner_map)
1326         return NULL;
1327 
1328     return READ_ONCE(*inner_map);
1329 }
1330 
1331 static int array_of_map_gen_lookup(struct bpf_map *map,
1332                    struct bpf_insn *insn_buf)
1333 {
1334     struct bpf_array *array = container_of(map, struct bpf_array, map);
1335     u32 elem_size = array->elem_size;
1336     struct bpf_insn *insn = insn_buf;
1337     const int ret = BPF_REG_0;
1338     const int map_ptr = BPF_REG_1;
1339     const int index = BPF_REG_2;
1340 
1341     *insn++ = BPF_ALU64_IMM(BPF_ADD, map_ptr, offsetof(struct bpf_array, value));
1342     *insn++ = BPF_LDX_MEM(BPF_W, ret, index, 0);
1343     if (!map->bypass_spec_v1) {
1344         *insn++ = BPF_JMP_IMM(BPF_JGE, ret, map->max_entries, 6);
1345         *insn++ = BPF_ALU32_IMM(BPF_AND, ret, array->index_mask);
1346     } else {
1347         *insn++ = BPF_JMP_IMM(BPF_JGE, ret, map->max_entries, 5);
1348     }
1349     if (is_power_of_2(elem_size))
1350         *insn++ = BPF_ALU64_IMM(BPF_LSH, ret, ilog2(elem_size));
1351     else
1352         *insn++ = BPF_ALU64_IMM(BPF_MUL, ret, elem_size);
1353     *insn++ = BPF_ALU64_REG(BPF_ADD, ret, map_ptr);
1354     *insn++ = BPF_LDX_MEM(BPF_DW, ret, ret, 0);
1355     *insn++ = BPF_JMP_IMM(BPF_JEQ, ret, 0, 1);
1356     *insn++ = BPF_JMP_IMM(BPF_JA, 0, 0, 1);
1357     *insn++ = BPF_MOV64_IMM(ret, 0);
1358 
1359     return insn - insn_buf;
1360 }
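
/* From a BPF program, an array-of-maps lookup is a two-step dereference
 * (sketch, hypothetical outer/inner maps and keys):
 *
 *	void *inner = bpf_map_lookup_elem(&outer_array, &outer_key);
 *	if (inner) {
 *		__u64 *val = bpf_map_lookup_elem(inner, &inner_key);
 *		...
 *	}
 */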
1361 
1362 const struct bpf_map_ops array_of_maps_map_ops = {
1363     .map_alloc_check = fd_array_map_alloc_check,
1364     .map_alloc = array_of_map_alloc,
1365     .map_free = array_of_map_free,
1366     .map_get_next_key = array_map_get_next_key,
1367     .map_lookup_elem = array_of_map_lookup_elem,
1368     .map_delete_elem = fd_array_map_delete_elem,
1369     .map_fd_get_ptr = bpf_map_fd_get_ptr,
1370     .map_fd_put_ptr = bpf_map_fd_put_ptr,
1371     .map_fd_sys_lookup_elem = bpf_map_fd_sys_lookup_elem,
1372     .map_gen_lookup = array_of_map_gen_lookup,
1373     .map_lookup_batch = generic_map_lookup_batch,
1374     .map_update_batch = generic_map_update_batch,
1375     .map_check_btf = map_check_no_btf,
1376     .map_btf_id = &array_map_btf_ids[0],
1377 };