Back to home page

OSCL-LXR

 
 

    


0001 // SPDX-License-Identifier: GPL-2.0-only
0002 /*
0003  * Longest prefix match list implementation
0004  *
0005  * Copyright (c) 2016,2017 Daniel Mack
0006  * Copyright (c) 2016 David Herrmann
0007  */
0008 
0009 #include <linux/bpf.h>
0010 #include <linux/btf.h>
0011 #include <linux/err.h>
0012 #include <linux/slab.h>
0013 #include <linux/spinlock.h>
0014 #include <linux/vmalloc.h>
0015 #include <net/ipv6.h>
0016 #include <uapi/linux/btf.h>
0017 #include <linux/btf_ids.h>
0018 
0019 /* Intermediate node */
0020 #define LPM_TREE_NODE_FLAG_IM BIT(0)
0021 
struct lpm_trie_node;

/* A single trie node, used both for real entries and for artificial
 * intermediate nodes (flagged LPM_TREE_NODE_FLAG_IM).  For real entries,
 * the map value (trie->map.value_size bytes) is stored directly behind
 * @data; see lpm_trie_node_alloc() and trie_lookup_elem().
 */
struct lpm_trie_node {
    struct rcu_head rcu;                        /* for kfree_rcu() deferred free */
    struct lpm_trie_node __rcu  *child[2];      /* [b]: subtree where the next key bit is b */
    u32             prefixlen;                  /* number of valid prefix bits in @data */
    u32             flags;                      /* LPM_TREE_NODE_FLAG_* */
    u8              data[];                     /* big-endian prefix bytes; value follows for non-IM nodes */
};
0031 
/* Map-private state for a BPF_MAP_TYPE_LPM_TRIE map. */
struct lpm_trie {
    struct bpf_map          map;                /* must be first: container_of() in all ops */
    struct lpm_trie_node __rcu  *root;          /* root of the binary trie, RCU-protected */
    size_t              n_entries;              /* number of real (non-intermediate) nodes */
    size_t              max_prefixlen;          /* data_size * 8, upper bound for key->prefixlen */
    size_t              data_size;              /* prefix bytes per key/node */
    spinlock_t          lock;                   /* serializes all writers (update/delete) */
};
0040 
0041 /* This trie implements a longest prefix match algorithm that can be used to
0042  * match IP addresses to a stored set of ranges.
0043  *
0044  * Data stored in @data of struct bpf_lpm_key and struct lpm_trie_node is
0045  * interpreted as big endian, so data[0] stores the most significant byte.
0046  *
0047  * Match ranges are internally stored in instances of struct lpm_trie_node
0048  * which each contain their prefix length as well as two pointers that may
0049  * lead to more nodes containing more specific matches. Each node also stores
0050  * a value that is defined by and returned to userspace via the update_elem
0051  * and lookup functions.
0052  *
0053  * For instance, let's start with a trie that was created with a prefix length
0054  * of 32, so it can be used for IPv4 addresses, and one single element that
0055  * matches 192.168.0.0/16. The data array would hence contain
0056  * [0xc0, 0xa8, 0x00, 0x00] in big-endian notation. This documentation will
0057  * stick to IP-address notation for readability though.
0058  *
 * As the trie is empty initially, the new node (1) will be placed as the root
 * node, denoted as (R) in the example below. As there are no other nodes, both
 * child pointers are %NULL.
0062  *
0063  *              +----------------+
0064  *              |       (1)  (R) |
0065  *              | 192.168.0.0/16 |
0066  *              |    value: 1    |
0067  *              |   [0]    [1]   |
0068  *              +----------------+
0069  *
0070  * Next, let's add a new node (2) matching 192.168.0.0/24. As there is already
0071  * a node with the same data and a smaller prefix (ie, a less specific one),
 * node (2) will become a child of (1). The child index depends on the next bit
0073  * that is outside of what (1) matches, and that bit is 0, so (2) will be
0074  * child[0] of (1):
0075  *
0076  *              +----------------+
0077  *              |       (1)  (R) |
0078  *              | 192.168.0.0/16 |
0079  *              |    value: 1    |
0080  *              |   [0]    [1]   |
0081  *              +----------------+
0082  *                   |
0083  *    +----------------+
0084  *    |       (2)      |
0085  *    | 192.168.0.0/24 |
0086  *    |    value: 2    |
0087  *    |   [0]    [1]   |
0088  *    +----------------+
0089  *
0090  * The child[1] slot of (1) could be filled with another node which has bit #17
0091  * (the next bit after the ones that (1) matches on) set to 1. For instance,
0092  * 192.168.128.0/24:
0093  *
0094  *              +----------------+
0095  *              |       (1)  (R) |
0096  *              | 192.168.0.0/16 |
0097  *              |    value: 1    |
0098  *              |   [0]    [1]   |
0099  *              +----------------+
0100  *                   |      |
0101  *    +----------------+  +------------------+
0102  *    |       (2)      |  |        (3)       |
0103  *    | 192.168.0.0/24 |  | 192.168.128.0/24 |
0104  *    |    value: 2    |  |     value: 3     |
0105  *    |   [0]    [1]   |  |    [0]    [1]    |
0106  *    +----------------+  +------------------+
0107  *
 * Let's add another node (4) to the game for 192.168.1.0/24. In order to place
 * it, node (1) is looked at first, and because of the semantics laid out
 * above (bit #17 is 0), node (4) would normally be attached to (1) as child[0].
0111  * However, that slot is already allocated, so a new node is needed in between.
0112  * That node does not have a value attached to it and it will never be
0113  * returned to users as result of a lookup. It is only there to differentiate
0114  * the traversal further. It will get a prefix as wide as necessary to
0115  * distinguish its two children:
0116  *
0117  *                      +----------------+
0118  *                      |       (1)  (R) |
0119  *                      | 192.168.0.0/16 |
0120  *                      |    value: 1    |
0121  *                      |   [0]    [1]   |
0122  *                      +----------------+
0123  *                           |      |
0124  *            +----------------+  +------------------+
0125  *            |       (4)  (I) |  |        (3)       |
0126  *            | 192.168.0.0/23 |  | 192.168.128.0/24 |
0127  *            |    value: ---  |  |     value: 3     |
0128  *            |   [0]    [1]   |  |    [0]    [1]    |
0129  *            +----------------+  +------------------+
0130  *                 |      |
0131  *  +----------------+  +----------------+
0132  *  |       (2)      |  |       (5)      |
0133  *  | 192.168.0.0/24 |  | 192.168.1.0/24 |
0134  *  |    value: 2    |  |     value: 5   |
0135  *  |   [0]    [1]   |  |   [0]    [1]   |
0136  *  +----------------+  +----------------+
0137  *
0138  * 192.168.1.1/32 would be a child of (5) etc.
0139  *
0140  * An intermediate node will be turned into a 'real' node on demand. In the
0141  * example above, (4) would be re-used if 192.168.0.0/23 is added to the trie.
0142  *
0143  * A fully populated trie would have a height of 32 nodes, as the trie was
0144  * created with a prefix length of 32.
0145  *
0146  * The lookup starts at the root node. If the current node matches and if there
0147  * is a child that can be used to become more specific, the trie is traversed
0148  * downwards. The last node in the traversal that is a non-intermediate one is
0149  * returned.
0150  */
0151 
0152 static inline int extract_bit(const u8 *data, size_t index)
0153 {
0154     return !!(data[index / 8] & (1 << (7 - (index % 8))));
0155 }
0156 
/**
 * longest_prefix_match() - determine the longest prefix
 * @trie:   The trie to get internal sizes from
 * @node:   The node to operate on
 * @key:    The key to compare to @node
 *
 * Determine the longest prefix of @node that matches the bits in @key.
 */
static size_t longest_prefix_match(const struct lpm_trie *trie,
                   const struct lpm_trie_node *node,
                   const struct bpf_lpm_trie_key *key)
{
    /* The match can never be longer than the shorter of the two prefixes. */
    u32 limit = min(node->prefixlen, key->prefixlen);
    u32 prefixlen = 0, i = 0;

    /* The word-sized loads below require the data arrays to start on a
     * 4-byte boundary within their containing structs.
     */
    BUILD_BUG_ON(offsetof(struct lpm_trie_node, data) % sizeof(u32));
    BUILD_BUG_ON(offsetof(struct bpf_lpm_trie_key, data) % sizeof(u32));

#if defined(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) && defined(CONFIG_64BIT)

    /* data_size >= 16 has very small probability.
     * We do not use a loop for optimal code generation.
     */
    if (trie->data_size >= 8) {
        /* XOR leaves the matching leading bits zero; fls64() then
         * locates the first differing bit, so 64 - fls64(diff) is the
         * number of leading bits in common.
         */
        u64 diff = be64_to_cpu(*(__be64 *)node->data ^
                       *(__be64 *)key->data);

        prefixlen = 64 - fls64(diff);
        if (prefixlen >= limit)
            return limit;
        if (diff)
            return prefixlen;
        i = 8;
    }
#endif

    /* Compare the remainder 32 bits at a time ... */
    while (trie->data_size >= i + 4) {
        u32 diff = be32_to_cpu(*(__be32 *)&node->data[i] ^
                       *(__be32 *)&key->data[i]);

        prefixlen += 32 - fls(diff);
        if (prefixlen >= limit)
            return limit;
        if (diff)
            return prefixlen;
        i += 4;
    }

    /* ... then a 16-bit tail, if present ... */
    if (trie->data_size >= i + 2) {
        u16 diff = be16_to_cpu(*(__be16 *)&node->data[i] ^
                       *(__be16 *)&key->data[i]);

        prefixlen += 16 - fls(diff);
        if (prefixlen >= limit)
            return limit;
        if (diff)
            return prefixlen;
        i += 2;
    }

    /* ... and finally a single trailing byte. */
    if (trie->data_size >= i + 1) {
        prefixlen += 8 - fls(node->data[i] ^ key->data[i]);

        if (prefixlen >= limit)
            return limit;
    }

    return prefixlen;
}
0226 
0227 /* Called from syscall or from eBPF program */
0228 static void *trie_lookup_elem(struct bpf_map *map, void *_key)
0229 {
0230     struct lpm_trie *trie = container_of(map, struct lpm_trie, map);
0231     struct lpm_trie_node *node, *found = NULL;
0232     struct bpf_lpm_trie_key *key = _key;
0233 
0234     /* Start walking the trie from the root node ... */
0235 
0236     for (node = rcu_dereference_check(trie->root, rcu_read_lock_bh_held());
0237          node;) {
0238         unsigned int next_bit;
0239         size_t matchlen;
0240 
0241         /* Determine the longest prefix of @node that matches @key.
0242          * If it's the maximum possible prefix for this trie, we have
0243          * an exact match and can return it directly.
0244          */
0245         matchlen = longest_prefix_match(trie, node, key);
0246         if (matchlen == trie->max_prefixlen) {
0247             found = node;
0248             break;
0249         }
0250 
0251         /* If the number of bits that match is smaller than the prefix
0252          * length of @node, bail out and return the node we have seen
0253          * last in the traversal (ie, the parent).
0254          */
0255         if (matchlen < node->prefixlen)
0256             break;
0257 
0258         /* Consider this node as return candidate unless it is an
0259          * artificially added intermediate one.
0260          */
0261         if (!(node->flags & LPM_TREE_NODE_FLAG_IM))
0262             found = node;
0263 
0264         /* If the node match is fully satisfied, let's see if we can
0265          * become more specific. Determine the next bit in the key and
0266          * traverse down.
0267          */
0268         next_bit = extract_bit(key->data, node->prefixlen);
0269         node = rcu_dereference_check(node->child[next_bit],
0270                          rcu_read_lock_bh_held());
0271     }
0272 
0273     if (!found)
0274         return NULL;
0275 
0276     return found->data + trie->data_size;
0277 }
0278 
0279 static struct lpm_trie_node *lpm_trie_node_alloc(const struct lpm_trie *trie,
0280                          const void *value)
0281 {
0282     struct lpm_trie_node *node;
0283     size_t size = sizeof(struct lpm_trie_node) + trie->data_size;
0284 
0285     if (value)
0286         size += trie->map.value_size;
0287 
0288     node = bpf_map_kmalloc_node(&trie->map, size, GFP_NOWAIT | __GFP_NOWARN,
0289                     trie->map.numa_node);
0290     if (!node)
0291         return NULL;
0292 
0293     node->flags = 0;
0294 
0295     if (value)
0296         memcpy(node->data + trie->data_size, value,
0297                trie->map.value_size);
0298 
0299     return node;
0300 }
0301 
/* Called from syscall or from eBPF program.
 *
 * Insert or replace the entry for @_key.  A replacement is done by
 * swapping in a freshly allocated node under the writer lock and freeing
 * the old one after an RCU grace period, so concurrent lookups always see
 * a consistent trie.
 *
 * NOTE(review): @flags is only range-checked (> BPF_EXIST is rejected);
 * BPF_NOEXIST/BPF_EXIST are not otherwise enforced here -- an update
 * succeeds whether or not the key already exists.  Confirm this is the
 * intended semantic for this map type.
 */
static int trie_update_elem(struct bpf_map *map,
                void *_key, void *value, u64 flags)
{
    struct lpm_trie *trie = container_of(map, struct lpm_trie, map);
    struct lpm_trie_node *node, *im_node = NULL, *new_node = NULL;
    struct lpm_trie_node __rcu **slot;
    struct bpf_lpm_trie_key *key = _key;
    unsigned long irq_flags;
    unsigned int next_bit;
    size_t matchlen = 0;
    int ret = 0;

    if (unlikely(flags > BPF_EXIST))
        return -EINVAL;

    if (key->prefixlen > trie->max_prefixlen)
        return -EINVAL;

    /* Serialize against other writers; IRQs disabled because updates may
     * also run from eBPF program context.
     */
    spin_lock_irqsave(&trie->lock, irq_flags);

    /* Allocate and fill a new node */

    if (trie->n_entries == trie->map.max_entries) {
        ret = -ENOSPC;
        goto out;
    }

    new_node = lpm_trie_node_alloc(trie, value);
    if (!new_node) {
        ret = -ENOMEM;
        goto out;
    }

    /* Optimistically count the new entry; rolled back on error and when
     * it turns out to replace an existing non-intermediate node.
     */
    trie->n_entries++;

    new_node->prefixlen = key->prefixlen;
    RCU_INIT_POINTER(new_node->child[0], NULL);
    RCU_INIT_POINTER(new_node->child[1], NULL);
    memcpy(new_node->data, key->data, trie->data_size);

    /* Now find a slot to attach the new node. To do that, walk the tree
     * from the root and match as many bits as possible for each node until
     * we either find an empty slot or a slot that needs to be replaced by
     * an intermediate node.
     */
    slot = &trie->root;

    while ((node = rcu_dereference_protected(*slot,
                    lockdep_is_held(&trie->lock)))) {
        matchlen = longest_prefix_match(trie, node, key);

        /* Stop when @node no longer fully matches, or descending further
         * cannot help (exact prefix length reached, or @node is already
         * as specific as the trie allows).
         */
        if (node->prefixlen != matchlen ||
            node->prefixlen == key->prefixlen ||
            node->prefixlen == trie->max_prefixlen)
            break;

        next_bit = extract_bit(key->data, node->prefixlen);
        slot = &node->child[next_bit];
    }

    /* If the slot is empty (a free child pointer or an empty root),
     * simply assign the @new_node to that slot and be done.
     */
    if (!node) {
        rcu_assign_pointer(*slot, new_node);
        goto out;
    }

    /* If the slot we picked already exists, replace it with @new_node
     * which already has the correct data array set.
     */
    if (node->prefixlen == matchlen) {
        new_node->child[0] = node->child[0];
        new_node->child[1] = node->child[1];

        /* Replacing a real node does not change the entry count; only
         * converting an intermediate node into a real one adds one.
         */
        if (!(node->flags & LPM_TREE_NODE_FLAG_IM))
            trie->n_entries--;

        rcu_assign_pointer(*slot, new_node);
        kfree_rcu(node, rcu);

        goto out;
    }

    /* If the new node matches the prefix completely, it must be inserted
     * as an ancestor. Simply insert it between @node and *@slot.
     */
    if (matchlen == key->prefixlen) {
        next_bit = extract_bit(node->data, matchlen);
        rcu_assign_pointer(new_node->child[next_bit], node);
        rcu_assign_pointer(*slot, new_node);
        goto out;
    }

    /* Neither is an ancestor of the other: fork with a valueless
     * intermediate node whose prefix is the common part of both.
     */
    im_node = lpm_trie_node_alloc(trie, NULL);
    if (!im_node) {
        ret = -ENOMEM;
        goto out;
    }

    im_node->prefixlen = matchlen;
    im_node->flags |= LPM_TREE_NODE_FLAG_IM;
    memcpy(im_node->data, node->data, trie->data_size);

    /* Now determine which child to install in which slot */
    if (extract_bit(key->data, matchlen)) {
        rcu_assign_pointer(im_node->child[0], node);
        rcu_assign_pointer(im_node->child[1], new_node);
    } else {
        rcu_assign_pointer(im_node->child[0], new_node);
        rcu_assign_pointer(im_node->child[1], node);
    }

    /* Finally, assign the intermediate node to the determined slot */
    rcu_assign_pointer(*slot, im_node);

out:
    if (ret) {
        /* Roll back the optimistic n_entries increment and free any
         * nodes that were never published into the trie.
         */
        if (new_node)
            trie->n_entries--;

        kfree(new_node);
        kfree(im_node);
    }

    spin_unlock_irqrestore(&trie->lock, irq_flags);

    return ret;
}
0432 
/* Called from syscall or from eBPF program.
 *
 * Remove the entry that exactly matches @_key (same data and prefix
 * length).  Deleted nodes are freed after an RCU grace period; the trie
 * invariant that every intermediate node has exactly two children is
 * restored on the way out.  Returns 0 or -ENOENT/-EINVAL.
 */
static int trie_delete_elem(struct bpf_map *map, void *_key)
{
    struct lpm_trie *trie = container_of(map, struct lpm_trie, map);
    struct bpf_lpm_trie_key *key = _key;
    struct lpm_trie_node __rcu **trim, **trim2;   /* slot of @node / slot of @parent */
    struct lpm_trie_node *node, *parent;
    unsigned long irq_flags;
    unsigned int next_bit;
    size_t matchlen = 0;
    int ret = 0;

    if (key->prefixlen > trie->max_prefixlen)
        return -EINVAL;

    spin_lock_irqsave(&trie->lock, irq_flags);

    /* Walk the tree looking for an exact key/length match and keeping
     * track of the path we traverse.  We will need to know the node
     * we wish to delete, and the slot that points to the node we want
     * to delete.  We may also need to know the nodes parent and the
     * slot that contains it.
     */
    trim = &trie->root;
    trim2 = trim;
    parent = NULL;
    while ((node = rcu_dereference_protected(
               *trim, lockdep_is_held(&trie->lock)))) {
        matchlen = longest_prefix_match(trie, node, key);

        /* Stop on a partial match or once the requested prefix length
         * is reached; the checks below decide whether it is an exact hit.
         */
        if (node->prefixlen != matchlen ||
            node->prefixlen == key->prefixlen)
            break;

        parent = node;
        trim2 = trim;
        next_bit = extract_bit(key->data, node->prefixlen);
        trim = &node->child[next_bit];
    }

    /* Only a fully-matching, non-intermediate node counts as an entry. */
    if (!node || node->prefixlen != key->prefixlen ||
        node->prefixlen != matchlen ||
        (node->flags & LPM_TREE_NODE_FLAG_IM)) {
        ret = -ENOENT;
        goto out;
    }

    trie->n_entries--;

    /* If the node we are removing has two children, simply mark it
     * as intermediate and we are done.
     */
    if (rcu_access_pointer(node->child[0]) &&
        rcu_access_pointer(node->child[1])) {
        node->flags |= LPM_TREE_NODE_FLAG_IM;
        goto out;
    }

    /* If the parent of the node we are about to delete is an intermediate
     * node, and the deleted node doesn't have any children, we can delete
     * the intermediate parent as well and promote its other child
     * up the tree.  Doing this maintains the invariant that all
     * intermediate nodes have exactly 2 children and that there are no
     * unnecessary intermediate nodes in the tree.
     */
    if (parent && (parent->flags & LPM_TREE_NODE_FLAG_IM) &&
        !node->child[0] && !node->child[1]) {
        if (node == rcu_access_pointer(parent->child[0]))
            rcu_assign_pointer(
                *trim2, rcu_access_pointer(parent->child[1]));
        else
            rcu_assign_pointer(
                *trim2, rcu_access_pointer(parent->child[0]));
        kfree_rcu(parent, rcu);
        kfree_rcu(node, rcu);
        goto out;
    }

    /* The node we are removing has either zero or one child. If there
     * is a child, move it into the removed node's slot then delete
     * the node.  Otherwise just clear the slot and delete the node.
     */
    if (node->child[0])
        rcu_assign_pointer(*trim, rcu_access_pointer(node->child[0]));
    else if (node->child[1])
        rcu_assign_pointer(*trim, rcu_access_pointer(node->child[1]));
    else
        RCU_INIT_POINTER(*trim, NULL);
    kfree_rcu(node, rcu);

out:
    spin_unlock_irqrestore(&trie->lock, irq_flags);

    return ret;
}
0528 
/* Bounds on the per-key prefix data (in bytes). */
#define LPM_DATA_SIZE_MAX   256
#define LPM_DATA_SIZE_MIN   1

/* Cap value_size so node header + data + value still fits one kmalloc slab. */
#define LPM_VAL_SIZE_MAX    (KMALLOC_MAX_SIZE - LPM_DATA_SIZE_MAX - \
                 sizeof(struct lpm_trie_node))
#define LPM_VAL_SIZE_MIN    1

/* A key is the fixed bpf_lpm_trie_key header plus X bytes of prefix data. */
#define LPM_KEY_SIZE(X)     (sizeof(struct bpf_lpm_trie_key) + (X))
#define LPM_KEY_SIZE_MAX    LPM_KEY_SIZE(LPM_DATA_SIZE_MAX)
#define LPM_KEY_SIZE_MIN    LPM_KEY_SIZE(LPM_DATA_SIZE_MIN)

/* map_flags accepted at map creation time; see trie_alloc(). */
#define LPM_CREATE_FLAG_MASK    (BPF_F_NO_PREALLOC | BPF_F_NUMA_NODE |  \
                 BPF_F_ACCESS_MASK)
0542 
0543 static struct bpf_map *trie_alloc(union bpf_attr *attr)
0544 {
0545     struct lpm_trie *trie;
0546 
0547     if (!bpf_capable())
0548         return ERR_PTR(-EPERM);
0549 
0550     /* check sanity of attributes */
0551     if (attr->max_entries == 0 ||
0552         !(attr->map_flags & BPF_F_NO_PREALLOC) ||
0553         attr->map_flags & ~LPM_CREATE_FLAG_MASK ||
0554         !bpf_map_flags_access_ok(attr->map_flags) ||
0555         attr->key_size < LPM_KEY_SIZE_MIN ||
0556         attr->key_size > LPM_KEY_SIZE_MAX ||
0557         attr->value_size < LPM_VAL_SIZE_MIN ||
0558         attr->value_size > LPM_VAL_SIZE_MAX)
0559         return ERR_PTR(-EINVAL);
0560 
0561     trie = kzalloc(sizeof(*trie), GFP_USER | __GFP_NOWARN | __GFP_ACCOUNT);
0562     if (!trie)
0563         return ERR_PTR(-ENOMEM);
0564 
0565     /* copy mandatory map attributes */
0566     bpf_map_init_from_attr(&trie->map, attr);
0567     trie->data_size = attr->key_size -
0568               offsetof(struct bpf_lpm_trie_key, data);
0569     trie->max_prefixlen = trie->data_size * 8;
0570 
0571     spin_lock_init(&trie->lock);
0572 
0573     return &trie->map;
0574 }
0575 
0576 static void trie_free(struct bpf_map *map)
0577 {
0578     struct lpm_trie *trie = container_of(map, struct lpm_trie, map);
0579     struct lpm_trie_node __rcu **slot;
0580     struct lpm_trie_node *node;
0581 
0582     /* Always start at the root and walk down to a node that has no
0583      * children. Then free that node, nullify its reference in the parent
0584      * and start over.
0585      */
0586 
0587     for (;;) {
0588         slot = &trie->root;
0589 
0590         for (;;) {
0591             node = rcu_dereference_protected(*slot, 1);
0592             if (!node)
0593                 goto out;
0594 
0595             if (rcu_access_pointer(node->child[0])) {
0596                 slot = &node->child[0];
0597                 continue;
0598             }
0599 
0600             if (rcu_access_pointer(node->child[1])) {
0601                 slot = &node->child[1];
0602                 continue;
0603             }
0604 
0605             kfree(node);
0606             RCU_INIT_POINTER(*slot, NULL);
0607             break;
0608         }
0609     }
0610 
0611 out:
0612     kfree(trie);
0613 }
0614 
0615 static int trie_get_next_key(struct bpf_map *map, void *_key, void *_next_key)
0616 {
0617     struct lpm_trie_node *node, *next_node = NULL, *parent, *search_root;
0618     struct lpm_trie *trie = container_of(map, struct lpm_trie, map);
0619     struct bpf_lpm_trie_key *key = _key, *next_key = _next_key;
0620     struct lpm_trie_node **node_stack = NULL;
0621     int err = 0, stack_ptr = -1;
0622     unsigned int next_bit;
0623     size_t matchlen;
0624 
0625     /* The get_next_key follows postorder. For the 4 node example in
0626      * the top of this file, the trie_get_next_key() returns the following
0627      * one after another:
0628      *   192.168.0.0/24
0629      *   192.168.1.0/24
0630      *   192.168.128.0/24
0631      *   192.168.0.0/16
0632      *
0633      * The idea is to return more specific keys before less specific ones.
0634      */
0635 
0636     /* Empty trie */
0637     search_root = rcu_dereference(trie->root);
0638     if (!search_root)
0639         return -ENOENT;
0640 
0641     /* For invalid key, find the leftmost node in the trie */
0642     if (!key || key->prefixlen > trie->max_prefixlen)
0643         goto find_leftmost;
0644 
0645     node_stack = kmalloc_array(trie->max_prefixlen,
0646                    sizeof(struct lpm_trie_node *),
0647                    GFP_ATOMIC | __GFP_NOWARN);
0648     if (!node_stack)
0649         return -ENOMEM;
0650 
0651     /* Try to find the exact node for the given key */
0652     for (node = search_root; node;) {
0653         node_stack[++stack_ptr] = node;
0654         matchlen = longest_prefix_match(trie, node, key);
0655         if (node->prefixlen != matchlen ||
0656             node->prefixlen == key->prefixlen)
0657             break;
0658 
0659         next_bit = extract_bit(key->data, node->prefixlen);
0660         node = rcu_dereference(node->child[next_bit]);
0661     }
0662     if (!node || node->prefixlen != key->prefixlen ||
0663         (node->flags & LPM_TREE_NODE_FLAG_IM))
0664         goto find_leftmost;
0665 
0666     /* The node with the exactly-matching key has been found,
0667      * find the first node in postorder after the matched node.
0668      */
0669     node = node_stack[stack_ptr];
0670     while (stack_ptr > 0) {
0671         parent = node_stack[stack_ptr - 1];
0672         if (rcu_dereference(parent->child[0]) == node) {
0673             search_root = rcu_dereference(parent->child[1]);
0674             if (search_root)
0675                 goto find_leftmost;
0676         }
0677         if (!(parent->flags & LPM_TREE_NODE_FLAG_IM)) {
0678             next_node = parent;
0679             goto do_copy;
0680         }
0681 
0682         node = parent;
0683         stack_ptr--;
0684     }
0685 
0686     /* did not find anything */
0687     err = -ENOENT;
0688     goto free_stack;
0689 
0690 find_leftmost:
0691     /* Find the leftmost non-intermediate node, all intermediate nodes
0692      * have exact two children, so this function will never return NULL.
0693      */
0694     for (node = search_root; node;) {
0695         if (node->flags & LPM_TREE_NODE_FLAG_IM) {
0696             node = rcu_dereference(node->child[0]);
0697         } else {
0698             next_node = node;
0699             node = rcu_dereference(node->child[0]);
0700             if (!node)
0701                 node = rcu_dereference(next_node->child[1]);
0702         }
0703     }
0704 do_copy:
0705     next_key->prefixlen = next_node->prefixlen;
0706     memcpy((void *)next_key + offsetof(struct bpf_lpm_trie_key, data),
0707            next_node->data, trie->data_size);
0708 free_stack:
0709     kfree(node_stack);
0710     return err;
0711 }
0712 
0713 static int trie_check_btf(const struct bpf_map *map,
0714               const struct btf *btf,
0715               const struct btf_type *key_type,
0716               const struct btf_type *value_type)
0717 {
0718     /* Keys must have struct bpf_lpm_trie_key embedded. */
0719     return BTF_INFO_KIND(key_type->info) != BTF_KIND_STRUCT ?
0720            -EINVAL : 0;
0721 }
0722 
BTF_ID_LIST_SINGLE(trie_map_btf_ids, struct, lpm_trie)
/* Operations table registered for BPF_MAP_TYPE_LPM_TRIE maps. */
const struct bpf_map_ops trie_map_ops = {
    .map_meta_equal = bpf_map_meta_equal,
    .map_alloc = trie_alloc,
    .map_free = trie_free,
    .map_get_next_key = trie_get_next_key,
    .map_lookup_elem = trie_lookup_elem,
    .map_update_elem = trie_update_elem,
    .map_delete_elem = trie_delete_elem,
    .map_lookup_batch = generic_map_lookup_batch,
    .map_update_batch = generic_map_update_batch,
    .map_delete_batch = generic_map_delete_batch,
    .map_check_btf = trie_check_btf,
    .map_btf_id = &trie_map_btf_ids[0],
};