// SPDX-License-Identifier: GPL-2.0
/*
 * To speed up listener socket lookup, create an array to store all sockets
 * listening on the same port.  This allows a decision to be made after finding
 * the first socket.  An optional BPF program can also be configured for
 * selecting the socket index from the array of available sockets.
 */

#include <net/ip.h>
#include <net/sock_reuseport.h>
#include <linux/bpf.h>
#include <linux/idr.h>
#include <linux/filter.h>
#include <linux/rcupdate.h>

#define INIT_SOCKS 128

DEFINE_SPINLOCK(reuseport_lock);

static DEFINE_IDA(reuseport_ida);
static int reuseport_resurrect(struct sock *sk, struct sock_reuseport *old_reuse,
                               struct sock_reuseport *reuse, bool bind_inany);

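/* The socks array is laid out in two sections: indices [0, num_socks)
 * hold listening sockets, and indices [max_socks - num_closed_socks,
 * max_socks) hold shutdown()ed/close()d sockets kept for request
 * migration.  Find sk in whichever section @closed selects.
 */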
static int reuseport_sock_index(struct sock *sk,
                                const struct sock_reuseport *reuse,
                                bool closed)
{
        int left, right;

        if (!closed) {
                left = 0;
                right = reuse->num_socks;
        } else {
                left = reuse->max_socks - reuse->num_closed_socks;
                right = reuse->max_socks;
        }

        for (; left < right; left++)
                if (reuse->socks[left] == sk)
                        return left;
        return -1;
}

static void __reuseport_add_sock(struct sock *sk,
                                 struct sock_reuseport *reuse)
{
        reuse->socks[reuse->num_socks] = sk;
        /* paired with smp_rmb() in reuseport_(select|migrate)_sock() */
        smp_wmb();
        reuse->num_socks++;
}

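/* Remove sk from the listening section by moving the last listening
 * socket into its slot; ordering within the array does not matter.
 */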
static bool __reuseport_detach_sock(struct sock *sk,
                                    struct sock_reuseport *reuse)
{
        int i = reuseport_sock_index(sk, reuse, false);

        if (i == -1)
                return false;

        reuse->socks[i] = reuse->socks[reuse->num_socks - 1];
        reuse->num_socks--;

        return true;
}

static void __reuseport_add_closed_sock(struct sock *sk,
                                        struct sock_reuseport *reuse)
{
        reuse->socks[reuse->max_socks - reuse->num_closed_socks - 1] = sk;
        /* paired with READ_ONCE() in inet_csk_bind_conflict() */
        WRITE_ONCE(reuse->num_closed_socks, reuse->num_closed_socks + 1);
}

static bool __reuseport_detach_closed_sock(struct sock *sk,
                                           struct sock_reuseport *reuse)
{
        int i = reuseport_sock_index(sk, reuse, true);

        if (i == -1)
                return false;

        reuse->socks[i] = reuse->socks[reuse->max_socks - reuse->num_closed_socks];
        /* paired with READ_ONCE() in inet_csk_bind_conflict() */
        WRITE_ONCE(reuse->num_closed_socks, reuse->num_closed_socks - 1);

        return true;
}

static struct sock_reuseport *__reuseport_alloc(unsigned int max_socks)
{
        unsigned int size = sizeof(struct sock_reuseport) +
                            sizeof(struct sock *) * max_socks;
        struct sock_reuseport *reuse = kzalloc(size, GFP_ATOMIC);

        if (!reuse)
                return NULL;

        reuse->max_socks = max_socks;

        RCU_INIT_POINTER(reuse->prog, NULL);
        return reuse;
}

int reuseport_alloc(struct sock *sk, bool bind_inany)
{
        struct sock_reuseport *reuse;
        int id, ret = 0;

        /* bh lock used since this function call may precede hlist lock in
         * soft irq of receive path or setsockopt from process context
         */
        spin_lock_bh(&reuseport_lock);

        /* Allocation attempts can occur concurrently via the setsockopt path
         * and the bind/hash path.  Nothing to do when we lose the race.
         */
        reuse = rcu_dereference_protected(sk->sk_reuseport_cb,
                                          lockdep_is_held(&reuseport_lock));
        if (reuse) {
                if (reuse->num_closed_socks) {
                        /* sk was shutdown()ed before */
                        ret = reuseport_resurrect(sk, reuse, NULL, bind_inany);
                        goto out;
                }

                /* Only set reuse->bind_inany if the bind_inany is true.
                 * Otherwise, it will overwrite the reuse->bind_inany
                 * which was set by the bind/hash path.
                 */
                if (bind_inany)
                        reuse->bind_inany = bind_inany;
                goto out;
        }

        reuse = __reuseport_alloc(INIT_SOCKS);
        if (!reuse) {
                ret = -ENOMEM;
                goto out;
        }

        id = ida_alloc(&reuseport_ida, GFP_ATOMIC);
        if (id < 0) {
                kfree(reuse);
                ret = id;
                goto out;
        }

        reuse->reuseport_id = id;
        reuse->bind_inany = bind_inany;
        reuse->socks[0] = sk;
        reuse->num_socks = 1;
        rcu_assign_pointer(sk->sk_reuseport_cb, reuse);

out:
        spin_unlock_bh(&reuseport_lock);

        return ret;
}
EXPORT_SYMBOL(reuseport_alloc);

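/* Allocate a sock_reuseport with twice the capacity, copy over both the
 * listening and the closed sections, and repoint every member socket's
 * sk_reuseport_cb at the new group.
 */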
static struct sock_reuseport *reuseport_grow(struct sock_reuseport *reuse)
{
        struct sock_reuseport *more_reuse;
        u32 more_socks_size, i;

        more_socks_size = reuse->max_socks * 2U;
        if (more_socks_size > U16_MAX) {
                if (reuse->num_closed_socks) {
                        /* The socks array cannot grow past U16_MAX entries,
                         * so make room by detaching one closed socket
                         * instead of growing.
                         */
                        struct sock *sk;

                        sk = reuse->socks[reuse->max_socks - reuse->num_closed_socks];
                        RCU_INIT_POINTER(sk->sk_reuseport_cb, NULL);
                        __reuseport_detach_closed_sock(sk, reuse);

                        return reuse;
                }

                return NULL;
        }

        more_reuse = __reuseport_alloc(more_socks_size);
        if (!more_reuse)
                return NULL;

        more_reuse->num_socks = reuse->num_socks;
        more_reuse->num_closed_socks = reuse->num_closed_socks;
        more_reuse->prog = reuse->prog;
        more_reuse->reuseport_id = reuse->reuseport_id;
        more_reuse->bind_inany = reuse->bind_inany;
        more_reuse->has_conns = reuse->has_conns;

        memcpy(more_reuse->socks, reuse->socks,
               reuse->num_socks * sizeof(struct sock *));
        memcpy(more_reuse->socks +
               (more_reuse->max_socks - more_reuse->num_closed_socks),
               reuse->socks + (reuse->max_socks - reuse->num_closed_socks),
               reuse->num_closed_socks * sizeof(struct sock *));
        more_reuse->synq_overflow_ts = READ_ONCE(reuse->synq_overflow_ts);

        for (i = 0; i < reuse->max_socks; ++i)
                rcu_assign_pointer(reuse->socks[i]->sk_reuseport_cb,
                                   more_reuse);

        /* Note: the prog and the reuseport_id now belong to more_reuse, so
         * use plain kfree_rcu() here rather than reuseport_free_rcu(),
         * which would free both of them.
         */
        kfree_rcu(reuse, rcu);
        return more_reuse;
}

static void reuseport_free_rcu(struct rcu_head *head)
{
        struct sock_reuseport *reuse;

        reuse = container_of(head, struct sock_reuseport, rcu);
        sk_reuseport_prog_free(rcu_dereference_protected(reuse->prog, 1));
        ida_free(&reuseport_ida, reuse->reuseport_id);
        kfree(reuse);
}

/**
 *  reuseport_add_sock - Add a socket to the reuseport group of another.
 *  @sk:  New socket to add to the group.
 *  @sk2: Socket belonging to the existing reuseport group.
 *  @bind_inany: Whether or not the group is bound to a local INANY address.
 *
 *  May return ENOMEM and not add socket to group under memory pressure.
 */
int reuseport_add_sock(struct sock *sk, struct sock *sk2, bool bind_inany)
{
        struct sock_reuseport *old_reuse, *reuse;

        if (!rcu_access_pointer(sk2->sk_reuseport_cb)) {
                int err = reuseport_alloc(sk2, bind_inany);

                if (err)
                        return err;
        }

        spin_lock_bh(&reuseport_lock);
        reuse = rcu_dereference_protected(sk2->sk_reuseport_cb,
                                          lockdep_is_held(&reuseport_lock));
        old_reuse = rcu_dereference_protected(sk->sk_reuseport_cb,
                                              lockdep_is_held(&reuseport_lock));
        if (old_reuse && old_reuse->num_closed_socks) {
                /* sk was shutdown()ed before */
                int err = reuseport_resurrect(sk, old_reuse, reuse, reuse->bind_inany);

                spin_unlock_bh(&reuseport_lock);
                return err;
        }

        if (old_reuse && old_reuse->num_socks != 1) {
                spin_unlock_bh(&reuseport_lock);
                return -EBUSY;
        }

        if (reuse->num_socks + reuse->num_closed_socks == reuse->max_socks) {
                reuse = reuseport_grow(reuse);
                if (!reuse) {
                        spin_unlock_bh(&reuseport_lock);
                        return -ENOMEM;
                }
        }

        __reuseport_add_sock(sk, reuse);
        rcu_assign_pointer(sk->sk_reuseport_cb, reuse);

        spin_unlock_bh(&reuseport_lock);

        if (old_reuse)
                call_rcu(&old_reuse->rcu, reuseport_free_rcu);
        return 0;
}
EXPORT_SYMBOL(reuseport_add_sock);

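/* Move a shutdown()ed sk out of the closed section and back into the
 * listening section of a group: its own group if it is still usable,
 * otherwise the given (or a freshly allocated) one.
 */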
static int reuseport_resurrect(struct sock *sk, struct sock_reuseport *old_reuse,
                               struct sock_reuseport *reuse, bool bind_inany)
{
        if (old_reuse == reuse) {
                /* If sk was in the same reuseport group, just pop sk out of
                 * the closed section and push sk into the listening section.
                 */
                __reuseport_detach_closed_sock(sk, old_reuse);
                __reuseport_add_sock(sk, old_reuse);
                return 0;
        }

        if (!reuse) {
                /* In bind()/listen() path, we cannot carry over the eBPF prog
                 * for the shutdown()ed socket. In setsockopt() path, we should
                 * not change the eBPF prog of listening sockets by attaching a
                 * prog to the shutdown()ed socket. Thus, we will allocate a new
                 * reuseport group and detach sk from the old group.
                 */
                int id;

                reuse = __reuseport_alloc(INIT_SOCKS);
                if (!reuse)
                        return -ENOMEM;

                id = ida_alloc(&reuseport_ida, GFP_ATOMIC);
                if (id < 0) {
                        kfree(reuse);
                        return id;
                }

                reuse->reuseport_id = id;
                reuse->bind_inany = bind_inany;
        } else {
                /* sk is joining an existing, still-listening reuseport group
                 * (e.g. sk2's group in reuseport_add_sock()).  Make sure the
                 * group has room for one more socket, growing the array if
                 * necessary.
                 */
                if (reuse->num_socks + reuse->num_closed_socks == reuse->max_socks) {
                        reuse = reuseport_grow(reuse);
                        if (!reuse)
                                return -ENOMEM;
                }
        }

        __reuseport_detach_closed_sock(sk, old_reuse);
        __reuseport_add_sock(sk, reuse);
        rcu_assign_pointer(sk->sk_reuseport_cb, reuse);

        if (old_reuse->num_socks + old_reuse->num_closed_socks == 0)
                call_rcu(&old_reuse->rcu, reuseport_free_rcu);

        return 0;
}

void reuseport_detach_sock(struct sock *sk)
{
        struct sock_reuseport *reuse;

        spin_lock_bh(&reuseport_lock);
        reuse = rcu_dereference_protected(sk->sk_reuseport_cb,
                                          lockdep_is_held(&reuseport_lock));

        /* reuseport_grow() has detached a closed sk */
        if (!reuse)
                goto out;

        /* Notify the bpf side. The sk may be added to a sockarray map.
         * If so, sockarray logic will remove it from the map.
         *
         * Other bpf map types that work with reuseport, like sockmap,
         * don't need an explicit callback from here. They override sk
         * unhash/close ops to remove the sk from the map before we
         * get to this point.
         */
        bpf_sk_reuseport_detach(sk);

        rcu_assign_pointer(sk->sk_reuseport_cb, NULL);

        if (!__reuseport_detach_closed_sock(sk, reuse))
                __reuseport_detach_sock(sk, reuse);

        if (reuse->num_socks + reuse->num_closed_socks == 0)
                call_rcu(&reuse->rcu, reuseport_free_rcu);

out:
        spin_unlock_bh(&reuseport_lock);
}
EXPORT_SYMBOL(reuseport_detach_sock);

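/* Called when a listening TCP socket stops listening.  If request
 * migration is possible (sysctl or BPF prog), keep sk in the group by
 * moving it to the closed section; otherwise detach it completely.
 */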
void reuseport_stop_listen_sock(struct sock *sk)
{
        if (sk->sk_protocol == IPPROTO_TCP) {
                struct sock_reuseport *reuse;
                struct bpf_prog *prog;

                spin_lock_bh(&reuseport_lock);

                reuse = rcu_dereference_protected(sk->sk_reuseport_cb,
                                                  lockdep_is_held(&reuseport_lock));
                prog = rcu_dereference_protected(reuse->prog,
                                                 lockdep_is_held(&reuseport_lock));

                if (READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_migrate_req) ||
                    (prog && prog->expected_attach_type == BPF_SK_REUSEPORT_SELECT_OR_MIGRATE)) {
                        /* Migration capable, move sk from the listening section
                         * to the closed section.
                         */
                        bpf_sk_reuseport_detach(sk);

                        __reuseport_detach_sock(sk, reuse);
                        __reuseport_add_closed_sock(sk, reuse);

                        spin_unlock_bh(&reuseport_lock);
                        return;
                }

                spin_unlock_bh(&reuseport_lock);
        }

        /* Not capable to do migration, detach immediately */
        reuseport_detach_sock(sk);
}
EXPORT_SYMBOL(reuseport_stop_listen_sock);

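/* Run the attached socket filter (a non-SK_REUSEPORT prog) over the
 * packet payload and use its return value as an index into the
 * listening section of the group.
 */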
static struct sock *run_bpf_filter(struct sock_reuseport *reuse, u16 socks,
                                   struct bpf_prog *prog, struct sk_buff *skb,
                                   int hdr_len)
{
        struct sk_buff *nskb = NULL;
        u32 index;

        if (skb_shared(skb)) {
                nskb = skb_clone(skb, GFP_ATOMIC);
                if (!nskb)
                        return NULL;
                skb = nskb;
        }

        /* temporarily advance data past protocol header */
        if (!pskb_pull(skb, hdr_len)) {
                kfree_skb(nskb);
                return NULL;
        }
        index = bpf_prog_run_save_cb(prog, skb);
        __skb_push(skb, hdr_len);

        consume_skb(nskb);

        if (index >= socks)
                return NULL;

        return reuse->socks[index];
}

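/* Scale the flow hash into the listening section and linearly probe past
 * sockets in TCP_ESTABLISHED state (e.g. connected UDP sockets).
 */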
static struct sock *reuseport_select_sock_by_hash(struct sock_reuseport *reuse,
                                                  u32 hash, u16 num_socks)
{
        int i, j;

        i = j = reciprocal_scale(hash, num_socks);
        while (reuse->socks[i]->sk_state == TCP_ESTABLISHED) {
                i++;
                if (i >= num_socks)
                        i = 0;
                if (i == j)
                        return NULL;
        }

        return reuse->socks[i];
}

/**
 *  reuseport_select_sock - Select a socket from an SO_REUSEPORT group.
 *  @sk: First socket in the group.
 *  @hash: When no BPF filter is available, use this hash to select.
 *  @skb: skb to run through BPF filter.
 *  @hdr_len: BPF filter expects skb data pointer at payload data.  If
 *    the skb does not yet point at the payload, this parameter represents
 *    how far the pointer needs to advance to reach the payload.
 *  Returns a socket that should receive the packet (or NULL on error).
 */
struct sock *reuseport_select_sock(struct sock *sk,
                                   u32 hash,
                                   struct sk_buff *skb,
                                   int hdr_len)
{
        struct sock_reuseport *reuse;
        struct bpf_prog *prog;
        struct sock *sk2 = NULL;
        u16 socks;

        rcu_read_lock();
        reuse = rcu_dereference(sk->sk_reuseport_cb);

        /* if memory allocation failed or add call is not yet complete */
        if (!reuse)
                goto out;

        prog = rcu_dereference(reuse->prog);
        socks = READ_ONCE(reuse->num_socks);
        if (likely(socks)) {
                /* paired with smp_wmb() in __reuseport_add_sock() */
                smp_rmb();

                if (!prog || !skb)
                        goto select_by_hash;

                if (prog->type == BPF_PROG_TYPE_SK_REUSEPORT)
                        sk2 = bpf_run_sk_reuseport(reuse, sk, prog, skb, NULL, hash);
                else
                        sk2 = run_bpf_filter(reuse, socks, prog, skb, hdr_len);

select_by_hash:
                /* no bpf or invalid bpf result: fall back to hash usage */
                if (!sk2)
                        sk2 = reuseport_select_sock_by_hash(reuse, hash, socks);
        }

out:
        rcu_read_unlock();
        return sk2;
}
EXPORT_SYMBOL(reuseport_select_sock);

/**
 *  reuseport_migrate_sock - Select a socket from an SO_REUSEPORT group.
 *  @sk: close()ed or shutdown()ed socket in the group.
 *  @migrating_sk: ESTABLISHED/SYN_RECV full socket in the accept queue or
 *    NEW_SYN_RECV request socket during 3WHS.
 *  @skb: skb to run through BPF filter.
 *  Returns a socket (with sk_refcnt +1) that should accept the child socket
 *  (or NULL on error).
 */
struct sock *reuseport_migrate_sock(struct sock *sk,
                                    struct sock *migrating_sk,
                                    struct sk_buff *skb)
{
        struct sock_reuseport *reuse;
        struct sock *nsk = NULL;
        bool allocated = false;
        struct bpf_prog *prog;
        u16 socks;
        u32 hash;

        rcu_read_lock();

        reuse = rcu_dereference(sk->sk_reuseport_cb);
        if (!reuse)
                goto out;

        socks = READ_ONCE(reuse->num_socks);
        if (unlikely(!socks))
                goto failure;

        /* paired with smp_wmb() in __reuseport_add_sock() */
        smp_rmb();

        hash = migrating_sk->sk_hash;
        prog = rcu_dereference(reuse->prog);
        if (!prog || prog->expected_attach_type != BPF_SK_REUSEPORT_SELECT_OR_MIGRATE) {
                if (READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_migrate_req))
                        goto select_by_hash;
                goto failure;
        }

        if (!skb) {
                skb = alloc_skb(0, GFP_ATOMIC);
                if (!skb)
                        goto failure;
                allocated = true;
        }

        nsk = bpf_run_sk_reuseport(reuse, sk, prog, skb, migrating_sk, hash);

        if (allocated)
                kfree_skb(skb);

select_by_hash:
        if (!nsk)
                nsk = reuseport_select_sock_by_hash(reuse, hash, socks);

        if (IS_ERR_OR_NULL(nsk) || unlikely(!refcount_inc_not_zero(&nsk->sk_refcnt))) {
                nsk = NULL;
                goto failure;
        }

out:
        rcu_read_unlock();
        return nsk;

failure:
        __NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMIGRATEREQFAILURE);
        goto out;
}
EXPORT_SYMBOL(reuseport_migrate_sock);

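/* Attach or replace the group's BPF selection program.  The old program,
 * if any, is freed after the new one is published.
 */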
int reuseport_attach_prog(struct sock *sk, struct bpf_prog *prog)
{
        struct sock_reuseport *reuse;
        struct bpf_prog *old_prog;

        if (sk_unhashed(sk)) {
                int err;

                if (!sk->sk_reuseport)
                        return -EINVAL;

                err = reuseport_alloc(sk, false);
                if (err)
                        return err;
        } else if (!rcu_access_pointer(sk->sk_reuseport_cb)) {
                /* The socket wasn't bound with SO_REUSEPORT */
                return -EINVAL;
        }

        spin_lock_bh(&reuseport_lock);
        reuse = rcu_dereference_protected(sk->sk_reuseport_cb,
                                          lockdep_is_held(&reuseport_lock));
        old_prog = rcu_dereference_protected(reuse->prog,
                                             lockdep_is_held(&reuseport_lock));
        rcu_assign_pointer(reuse->prog, prog);
        spin_unlock_bh(&reuseport_lock);

        sk_reuseport_prog_free(old_prog);
        return 0;
}
EXPORT_SYMBOL(reuseport_attach_prog);

int reuseport_detach_prog(struct sock *sk)
{
        struct sock_reuseport *reuse;
        struct bpf_prog *old_prog;

        old_prog = NULL;
        spin_lock_bh(&reuseport_lock);
        reuse = rcu_dereference_protected(sk->sk_reuseport_cb,
                                          lockdep_is_held(&reuseport_lock));

        /* reuse must be checked after acquiring the reuseport_lock
         * because reuseport_grow() can detach a closed sk.
         */
        if (!reuse) {
                spin_unlock_bh(&reuseport_lock);
                return sk->sk_reuseport ? -ENOENT : -EINVAL;
        }

        if (sk_unhashed(sk) && reuse->num_closed_socks) {
                spin_unlock_bh(&reuseport_lock);
                return -ENOENT;
        }

        old_prog = rcu_replace_pointer(reuse->prog, old_prog,
                                       lockdep_is_held(&reuseport_lock));
        spin_unlock_bh(&reuseport_lock);

        if (!old_prog)
                return -ENOENT;

        sk_reuseport_prog_free(old_prog);
        return 0;
}
EXPORT_SYMBOL(reuseport_detach_prog);