// SPDX-License-Identifier: GPL-2.0
/*
 * To speed up listener socket lookup, create an array to store all sockets
 * listening on the same port.  This allows a decision to be made after finding
 * the first socket.  An optional BPF program can also be configured for
 * selecting the socket index from the array of available sockets.
 */

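/*
 * Illustrative userspace sketch (not part of this file; names, the port
 * number, and the missing error handling are purely for illustration, and a
 * libc that exposes SO_REUSEPORT is assumed): two UDP sockets join the same
 * reuseport group by enabling SO_REUSEPORT before bind().  Incoming datagrams
 * are then distributed across the group, by flow hash unless a BPF program is
 * attached.
 *
 *	#include <arpa/inet.h>
 *	#include <netinet/in.h>
 *	#include <string.h>
 *	#include <sys/socket.h>
 *
 *	static int reuseport_udp_sock(unsigned short port)
 *	{
 *		struct sockaddr_in addr;
 *		int one = 1;
 *		int fd;
 *
 *		fd = socket(AF_INET, SOCK_DGRAM, 0);
 *		setsockopt(fd, SOL_SOCKET, SO_REUSEPORT, &one, sizeof(one));
 *
 *		memset(&addr, 0, sizeof(addr));
 *		addr.sin_family = AF_INET;
 *		addr.sin_addr.s_addr = htonl(INADDR_ANY);
 *		addr.sin_port = htons(port);
 *		bind(fd, (struct sockaddr *)&addr, sizeof(addr));
 *
 *		return fd;
 *	}
 *
 *	// Both sockets now share one struct sock_reuseport:
 *	//	int a = reuseport_udp_sock(7777);
 *	//	int b = reuseport_udp_sock(7777);
 */
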
#include <net/ip.h>
#include <net/sock_reuseport.h>
#include <linux/bpf.h>
#include <linux/idr.h>
#include <linux/filter.h>
#include <linux/rcupdate.h>

#define INIT_SOCKS 128

DEFINE_SPINLOCK(reuseport_lock);

static DEFINE_IDA(reuseport_ida);
static int reuseport_resurrect(struct sock *sk, struct sock_reuseport *old_reuse,
                   struct sock_reuseport *reuse, bool bind_inany);

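/* The socks[] array is split into two sections: sockets usable for lookup
 * occupy indices [0, num_socks), while shutdown()ed/close()d sockets kept
 * only as request-migration sources occupy the tail,
 * [max_socks - num_closed_socks, max_socks).  @closed selects which section
 * the helpers below search and update.
 */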
static int reuseport_sock_index(struct sock *sk,
                const struct sock_reuseport *reuse,
                bool closed)
{
    int left, right;

    if (!closed) {
        left = 0;
        right = reuse->num_socks;
    } else {
        left = reuse->max_socks - reuse->num_closed_socks;
        right = reuse->max_socks;
    }

    for (; left < right; left++)
        if (reuse->socks[left] == sk)
            return left;
    return -1;
}

static void __reuseport_add_sock(struct sock *sk,
                 struct sock_reuseport *reuse)
{
    reuse->socks[reuse->num_socks] = sk;
    /* paired with smp_rmb() in reuseport_(select|migrate)_sock() */
    smp_wmb();
    reuse->num_socks++;
}

static bool __reuseport_detach_sock(struct sock *sk,
                    struct sock_reuseport *reuse)
{
    int i = reuseport_sock_index(sk, reuse, false);

    if (i == -1)
        return false;

    reuse->socks[i] = reuse->socks[reuse->num_socks - 1];
    reuse->num_socks--;

    return true;
}

static void __reuseport_add_closed_sock(struct sock *sk,
                    struct sock_reuseport *reuse)
{
    reuse->socks[reuse->max_socks - reuse->num_closed_socks - 1] = sk;
    /* paired with READ_ONCE() in inet_csk_bind_conflict() */
    WRITE_ONCE(reuse->num_closed_socks, reuse->num_closed_socks + 1);
}

static bool __reuseport_detach_closed_sock(struct sock *sk,
                       struct sock_reuseport *reuse)
{
    int i = reuseport_sock_index(sk, reuse, true);

    if (i == -1)
        return false;

    reuse->socks[i] = reuse->socks[reuse->max_socks - reuse->num_closed_socks];
    /* paired with READ_ONCE() in inet_csk_bind_conflict() */
    WRITE_ONCE(reuse->num_closed_socks, reuse->num_closed_socks - 1);

    return true;
}

static struct sock_reuseport *__reuseport_alloc(unsigned int max_socks)
{
    unsigned int size = sizeof(struct sock_reuseport) +
              sizeof(struct sock *) * max_socks;
    struct sock_reuseport *reuse = kzalloc(size, GFP_ATOMIC);

    if (!reuse)
        return NULL;

    reuse->max_socks = max_socks;

    RCU_INIT_POINTER(reuse->prog, NULL);
    return reuse;
}

int reuseport_alloc(struct sock *sk, bool bind_inany)
{
    struct sock_reuseport *reuse;
    int id, ret = 0;

    /* bh lock used since this function call may precede hlist lock in
     * soft irq of receive path or setsockopt from process context
     */
    spin_lock_bh(&reuseport_lock);

    /* Allocation attempts can occur concurrently via the setsockopt path
     * and the bind/hash path.  Nothing to do when we lose the race.
     */
    reuse = rcu_dereference_protected(sk->sk_reuseport_cb,
                      lockdep_is_held(&reuseport_lock));
    if (reuse) {
        if (reuse->num_closed_socks) {
            /* sk was shutdown()ed before */
            ret = reuseport_resurrect(sk, reuse, NULL, bind_inany);
            goto out;
        }

        /* Only set reuse->bind_inany if the bind_inany is true.
         * Otherwise, it will overwrite the reuse->bind_inany
         * which was set by the bind/hash path.
         */
        if (bind_inany)
            reuse->bind_inany = bind_inany;
        goto out;
    }

    reuse = __reuseport_alloc(INIT_SOCKS);
    if (!reuse) {
        ret = -ENOMEM;
        goto out;
    }

    id = ida_alloc(&reuseport_ida, GFP_ATOMIC);
    if (id < 0) {
        kfree(reuse);
        ret = id;
        goto out;
    }

    reuse->reuseport_id = id;
    reuse->bind_inany = bind_inany;
    reuse->socks[0] = sk;
    reuse->num_socks = 1;
    rcu_assign_pointer(sk->sk_reuseport_cb, reuse);

out:
    spin_unlock_bh(&reuseport_lock);

    return ret;
}
EXPORT_SYMBOL(reuseport_alloc);

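/* Double the socket array.  Growth stops once the doubled size would no
 * longer fit in u16; in that case a closed socket is dropped from the tail
 * section to make room if possible, otherwise NULL is returned and the
 * callers report -ENOMEM.
 */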
static struct sock_reuseport *reuseport_grow(struct sock_reuseport *reuse)
{
    struct sock_reuseport *more_reuse;
    u32 more_socks_size, i;

    more_socks_size = reuse->max_socks * 2U;
    if (more_socks_size > U16_MAX) {
        if (reuse->num_closed_socks) {
            /* Make room by removing a closed sk.
             * The child has already been migrated.
             * Only reqsk left at this point.
             */
            struct sock *sk;

            sk = reuse->socks[reuse->max_socks - reuse->num_closed_socks];
            RCU_INIT_POINTER(sk->sk_reuseport_cb, NULL);
            __reuseport_detach_closed_sock(sk, reuse);

            return reuse;
        }

        return NULL;
    }

    more_reuse = __reuseport_alloc(more_socks_size);
    if (!more_reuse)
        return NULL;

    more_reuse->num_socks = reuse->num_socks;
    more_reuse->num_closed_socks = reuse->num_closed_socks;
    more_reuse->prog = reuse->prog;
    more_reuse->reuseport_id = reuse->reuseport_id;
    more_reuse->bind_inany = reuse->bind_inany;
    more_reuse->has_conns = reuse->has_conns;

    memcpy(more_reuse->socks, reuse->socks,
           reuse->num_socks * sizeof(struct sock *));
    memcpy(more_reuse->socks +
           (more_reuse->max_socks - more_reuse->num_closed_socks),
           reuse->socks + (reuse->max_socks - reuse->num_closed_socks),
           reuse->num_closed_socks * sizeof(struct sock *));
    more_reuse->synq_overflow_ts = READ_ONCE(reuse->synq_overflow_ts);

    for (i = 0; i < reuse->max_socks; ++i)
        rcu_assign_pointer(reuse->socks[i]->sk_reuseport_cb,
                   more_reuse);

    /* Note: we use kfree_rcu here instead of reuseport_free_rcu so
     * that reuse and more_reuse can temporarily share a reference
     * to prog.
     */
    kfree_rcu(reuse, rcu);
    return more_reuse;
}

static void reuseport_free_rcu(struct rcu_head *head)
{
    struct sock_reuseport *reuse;

    reuse = container_of(head, struct sock_reuseport, rcu);
    sk_reuseport_prog_free(rcu_dereference_protected(reuse->prog, 1));
    ida_free(&reuseport_ida, reuse->reuseport_id);
    kfree(reuse);
}

/**
 *  reuseport_add_sock - Add a socket to the reuseport group of another.
 *  @sk:  New socket to add to the group.
 *  @sk2: Socket belonging to the existing reuseport group.
 *  @bind_inany: Whether or not the group is bound to a local INANY address.
 *
 *  May return ENOMEM and not add socket to group under memory pressure.
 */
int reuseport_add_sock(struct sock *sk, struct sock *sk2, bool bind_inany)
{
    struct sock_reuseport *old_reuse, *reuse;

    if (!rcu_access_pointer(sk2->sk_reuseport_cb)) {
        int err = reuseport_alloc(sk2, bind_inany);

        if (err)
            return err;
    }

    spin_lock_bh(&reuseport_lock);
    reuse = rcu_dereference_protected(sk2->sk_reuseport_cb,
                      lockdep_is_held(&reuseport_lock));
    old_reuse = rcu_dereference_protected(sk->sk_reuseport_cb,
                          lockdep_is_held(&reuseport_lock));
    if (old_reuse && old_reuse->num_closed_socks) {
        /* sk was shutdown()ed before */
        int err = reuseport_resurrect(sk, old_reuse, reuse, reuse->bind_inany);

        spin_unlock_bh(&reuseport_lock);
        return err;
    }

    if (old_reuse && old_reuse->num_socks != 1) {
        spin_unlock_bh(&reuseport_lock);
        return -EBUSY;
    }

    if (reuse->num_socks + reuse->num_closed_socks == reuse->max_socks) {
        reuse = reuseport_grow(reuse);
        if (!reuse) {
            spin_unlock_bh(&reuseport_lock);
            return -ENOMEM;
        }
    }

    __reuseport_add_sock(sk, reuse);
    rcu_assign_pointer(sk->sk_reuseport_cb, reuse);

    spin_unlock_bh(&reuseport_lock);

    if (old_reuse)
        call_rcu(&old_reuse->rcu, reuseport_free_rcu);
    return 0;
}
EXPORT_SYMBOL(reuseport_add_sock);

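/* Bring a shutdown()ed socket parked in the closed section of @old_reuse
 * back into service when it listen()s again.  Depending on what happened to
 * the port in the meantime, sk is pushed back into the listening section of
 * the same group, into a newly allocated group (@reuse == NULL), or moved
 * over to the existing group @reuse.
 */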
static int reuseport_resurrect(struct sock *sk, struct sock_reuseport *old_reuse,
                   struct sock_reuseport *reuse, bool bind_inany)
{
    if (old_reuse == reuse) {
        /* If sk was in the same reuseport group, just pop sk out of
         * the closed section and push sk into the listening section.
         */
        __reuseport_detach_closed_sock(sk, old_reuse);
        __reuseport_add_sock(sk, old_reuse);
        return 0;
    }

    if (!reuse) {
        /* In bind()/listen() path, we cannot carry over the eBPF prog
         * for the shutdown()ed socket. In setsockopt() path, we should
         * not change the eBPF prog of listening sockets by attaching a
         * prog to the shutdown()ed socket. Thus, we will allocate a new
         * reuseport group and detach sk from the old group.
         */
        int id;

        reuse = __reuseport_alloc(INIT_SOCKS);
        if (!reuse)
            return -ENOMEM;

        id = ida_alloc(&reuseport_ida, GFP_ATOMIC);
        if (id < 0) {
            kfree(reuse);
            return id;
        }

        reuse->reuseport_id = id;
        reuse->bind_inany = bind_inany;
    } else {
        /* Move sk from the old group to the new one if
         * - all the other listeners in the old group were close()d or
         *   shutdown()ed, and then sk2 has listen()ed on the same port
         * OR
         * - sk listen()ed without bind() (or with autobind), was
         *   shutdown()ed, and then listen()s on another port which
         *   sk2 listen()s on.
         */
        if (reuse->num_socks + reuse->num_closed_socks == reuse->max_socks) {
            reuse = reuseport_grow(reuse);
            if (!reuse)
                return -ENOMEM;
        }
    }

    __reuseport_detach_closed_sock(sk, old_reuse);
    __reuseport_add_sock(sk, reuse);
    rcu_assign_pointer(sk->sk_reuseport_cb, reuse);

    if (old_reuse->num_socks + old_reuse->num_closed_socks == 0)
        call_rcu(&old_reuse->rcu, reuseport_free_rcu);

    return 0;
}

void reuseport_detach_sock(struct sock *sk)
{
    struct sock_reuseport *reuse;

    spin_lock_bh(&reuseport_lock);
    reuse = rcu_dereference_protected(sk->sk_reuseport_cb,
                      lockdep_is_held(&reuseport_lock));

    /* reuseport_grow() has detached a closed sk */
    if (!reuse)
        goto out;

    /* Notify the bpf side. The sk may be added to a sockarray
     * map. If so, sockarray logic will remove it from the map.
     *
     * Other bpf map types that work with reuseport, like sockmap,
     * don't need an explicit callback from here. They override sk
     * unhash/close ops to remove the sk from the map before we
     * get to this point.
     */
    bpf_sk_reuseport_detach(sk);

    rcu_assign_pointer(sk->sk_reuseport_cb, NULL);

    if (!__reuseport_detach_closed_sock(sk, reuse))
        __reuseport_detach_sock(sk, reuse);

    if (reuse->num_socks + reuse->num_closed_socks == 0)
        call_rcu(&reuse->rcu, reuseport_free_rcu);

out:
    spin_unlock_bh(&reuseport_lock);
}
EXPORT_SYMBOL(reuseport_detach_sock);

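/* Called when a TCP listener stops listening.  If request migration is
 * possible, i.e. net.ipv4.tcp_migrate_req is enabled or the attached program
 * was loaded with BPF_SK_REUSEPORT_SELECT_OR_MIGRATE, the socket is only
 * moved to the closed section so that its pending requests and children can
 * still be migrated to the remaining listeners; otherwise it is detached
 * from the group immediately.
 */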
void reuseport_stop_listen_sock(struct sock *sk)
{
    if (sk->sk_protocol == IPPROTO_TCP) {
        struct sock_reuseport *reuse;
        struct bpf_prog *prog;

        spin_lock_bh(&reuseport_lock);

        reuse = rcu_dereference_protected(sk->sk_reuseport_cb,
                          lockdep_is_held(&reuseport_lock));
        prog = rcu_dereference_protected(reuse->prog,
                         lockdep_is_held(&reuseport_lock));

        if (READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_migrate_req) ||
            (prog && prog->expected_attach_type == BPF_SK_REUSEPORT_SELECT_OR_MIGRATE)) {
            /* Migration capable, move sk from the listening section
             * to the closed section.
             */
            bpf_sk_reuseport_detach(sk);

            __reuseport_detach_sock(sk, reuse);
            __reuseport_add_closed_sock(sk, reuse);

            spin_unlock_bh(&reuseport_lock);
            return;
        }

        spin_unlock_bh(&reuseport_lock);
    }

    /* Not capable to do migration, detach immediately */
    reuseport_detach_sock(sk);
}
EXPORT_SYMBOL(reuseport_stop_listen_sock);

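/* Run a BPF program that is not of type BPF_PROG_TYPE_SK_REUSEPORT (e.g. a
 * classic filter attached with SO_ATTACH_REUSEPORT_CBPF).  The skb data
 * pointer is temporarily advanced past the protocol header, and the
 * program's return value is used as an index into the socket array; an
 * out-of-range index makes the caller fall back to hash selection.
 */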
static struct sock *run_bpf_filter(struct sock_reuseport *reuse, u16 socks,
                   struct bpf_prog *prog, struct sk_buff *skb,
                   int hdr_len)
{
    struct sk_buff *nskb = NULL;
    u32 index;

    if (skb_shared(skb)) {
        nskb = skb_clone(skb, GFP_ATOMIC);
        if (!nskb)
            return NULL;
        skb = nskb;
    }

    /* temporarily advance data past protocol header */
    if (!pskb_pull(skb, hdr_len)) {
        kfree_skb(nskb);
        return NULL;
    }
    index = bpf_prog_run_save_cb(prog, skb);
    __skb_push(skb, hdr_len);

    consume_skb(nskb);

    if (index >= socks)
        return NULL;

    return reuse->socks[index];
}

static struct sock *reuseport_select_sock_by_hash(struct sock_reuseport *reuse,
                          u32 hash, u16 num_socks)
{
    int i, j;

    i = j = reciprocal_scale(hash, num_socks);
    while (reuse->socks[i]->sk_state == TCP_ESTABLISHED) {
        i++;
        if (i >= num_socks)
            i = 0;
        if (i == j)
            return NULL;
    }

    return reuse->socks[i];
}

/**
 *  reuseport_select_sock - Select a socket from an SO_REUSEPORT group.
 *  @sk: First socket in the group.
 *  @hash: When no BPF filter is available, use this hash to select.
 *  @skb: skb to run through BPF filter.
 *  @hdr_len: BPF filter expects skb data pointer at payload data.  If
 *    the skb does not yet point at the payload, this parameter represents
 *    how far the pointer needs to advance to reach the payload.
 *  Returns a socket that should receive the packet (or NULL on error).
 */
struct sock *reuseport_select_sock(struct sock *sk,
                   u32 hash,
                   struct sk_buff *skb,
                   int hdr_len)
{
    struct sock_reuseport *reuse;
    struct bpf_prog *prog;
    struct sock *sk2 = NULL;
    u16 socks;

    rcu_read_lock();
    reuse = rcu_dereference(sk->sk_reuseport_cb);

    /* if memory allocation failed or add call is not yet complete */
    if (!reuse)
        goto out;

    prog = rcu_dereference(reuse->prog);
    socks = READ_ONCE(reuse->num_socks);
    if (likely(socks)) {
        /* paired with smp_wmb() in __reuseport_add_sock() */
        smp_rmb();

        if (!prog || !skb)
            goto select_by_hash;

        if (prog->type == BPF_PROG_TYPE_SK_REUSEPORT)
            sk2 = bpf_run_sk_reuseport(reuse, sk, prog, skb, NULL, hash);
        else
            sk2 = run_bpf_filter(reuse, socks, prog, skb, hdr_len);

select_by_hash:
        /* no bpf or invalid bpf result: fall back to hash usage */
        if (!sk2)
            sk2 = reuseport_select_sock_by_hash(reuse, hash, socks);
    }

out:
    rcu_read_unlock();
    return sk2;
}
EXPORT_SYMBOL(reuseport_select_sock);

/**
 *  reuseport_migrate_sock - Select a socket from an SO_REUSEPORT group.
 *  @sk: close()ed or shutdown()ed socket in the group.
 *  @migrating_sk: ESTABLISHED/SYN_RECV full socket in the accept queue or
 *    NEW_SYN_RECV request socket during 3WHS.
 *  @skb: skb to run through BPF filter.
 *  Returns a socket (with sk_refcnt +1) that should accept the child socket
 *  (or NULL on error).
 */
struct sock *reuseport_migrate_sock(struct sock *sk,
                    struct sock *migrating_sk,
                    struct sk_buff *skb)
{
    struct sock_reuseport *reuse;
    struct sock *nsk = NULL;
    bool allocated = false;
    struct bpf_prog *prog;
    u16 socks;
    u32 hash;

    rcu_read_lock();

    reuse = rcu_dereference(sk->sk_reuseport_cb);
    if (!reuse)
        goto out;

    socks = READ_ONCE(reuse->num_socks);
    if (unlikely(!socks))
        goto failure;

    /* paired with smp_wmb() in __reuseport_add_sock() */
    smp_rmb();

    hash = migrating_sk->sk_hash;
    prog = rcu_dereference(reuse->prog);
    if (!prog || prog->expected_attach_type != BPF_SK_REUSEPORT_SELECT_OR_MIGRATE) {
        if (READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_migrate_req))
            goto select_by_hash;
        goto failure;
    }

    if (!skb) {
        skb = alloc_skb(0, GFP_ATOMIC);
        if (!skb)
            goto failure;
        allocated = true;
    }

    nsk = bpf_run_sk_reuseport(reuse, sk, prog, skb, migrating_sk, hash);

    if (allocated)
        kfree_skb(skb);

select_by_hash:
    if (!nsk)
        nsk = reuseport_select_sock_by_hash(reuse, hash, socks);

    if (IS_ERR_OR_NULL(nsk) || unlikely(!refcount_inc_not_zero(&nsk->sk_refcnt))) {
        nsk = NULL;
        goto failure;
    }

out:
    rcu_read_unlock();
    return nsk;

failure:
    __NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMIGRATEREQFAILURE);
    goto out;
}
EXPORT_SYMBOL(reuseport_migrate_sock);

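/*
 * Illustrative userspace sketch (not part of this file; error handling
 * omitted, and a libc/uapi that exposes SO_ATTACH_REUSEPORT_CBPF is
 * assumed): attach a classic BPF program whose return value picks the
 * receiving socket.  This one distributes packets by the CPU handling them.
 * Attaching through any one member is enough, since the program is stored
 * per group (reuse->prog); an out-of-range return value (e.g. fewer group
 * members than CPUs) falls back to hash-based selection, see
 * run_bpf_filter() above.
 *
 *	#include <linux/filter.h>
 *	#include <sys/socket.h>
 *
 *	static void attach_cpu_cbpf(int fd)
 *	{
 *		struct sock_filter code[] = {
 *			// A = current CPU (ancillary load)
 *			{ BPF_LD | BPF_W | BPF_ABS, 0, 0, SKF_AD_OFF + SKF_AD_CPU },
 *			// return A, used as the index into the socket array
 *			{ BPF_RET | BPF_A, 0, 0, 0 },
 *		};
 *		struct sock_fprog prog = {
 *			.len = sizeof(code) / sizeof(code[0]),
 *			.filter = code,
 *		};
 *
 *		setsockopt(fd, SOL_SOCKET, SO_ATTACH_REUSEPORT_CBPF,
 *			   &prog, sizeof(prog));
 *	}
 */
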
int reuseport_attach_prog(struct sock *sk, struct bpf_prog *prog)
{
    struct sock_reuseport *reuse;
    struct bpf_prog *old_prog;

    if (sk_unhashed(sk)) {
        int err;

        if (!sk->sk_reuseport)
            return -EINVAL;

        err = reuseport_alloc(sk, false);
        if (err)
            return err;
    } else if (!rcu_access_pointer(sk->sk_reuseport_cb)) {
        /* The socket wasn't bound with SO_REUSEPORT */
        return -EINVAL;
    }

    spin_lock_bh(&reuseport_lock);
    reuse = rcu_dereference_protected(sk->sk_reuseport_cb,
                      lockdep_is_held(&reuseport_lock));
    old_prog = rcu_dereference_protected(reuse->prog,
                         lockdep_is_held(&reuseport_lock));
    rcu_assign_pointer(reuse->prog, prog);
    spin_unlock_bh(&reuseport_lock);

    sk_reuseport_prog_free(old_prog);
    return 0;
}
EXPORT_SYMBOL(reuseport_attach_prog);

int reuseport_detach_prog(struct sock *sk)
{
    struct sock_reuseport *reuse;
    struct bpf_prog *old_prog;

    old_prog = NULL;
    spin_lock_bh(&reuseport_lock);
    reuse = rcu_dereference_protected(sk->sk_reuseport_cb,
                      lockdep_is_held(&reuseport_lock));

    /* reuse must be checked after acquiring the reuseport_lock
     * because reuseport_grow() can detach a closed sk.
     */
    if (!reuse) {
        spin_unlock_bh(&reuseport_lock);
        return sk->sk_reuseport ? -ENOENT : -EINVAL;
    }

    if (sk_unhashed(sk) && reuse->num_closed_socks) {
        spin_unlock_bh(&reuseport_lock);
        return -ENOENT;
    }

    old_prog = rcu_replace_pointer(reuse->prog, old_prog,
                       lockdep_is_held(&reuseport_lock));
    spin_unlock_bh(&reuseport_lock);

    if (!old_prog)
        return -ENOENT;

    sk_reuseport_prog_free(old_prog);
    return 0;
}
EXPORT_SYMBOL(reuseport_detach_prog);