// SPDX-License-Identifier: GPL-2.0-only
/*
 * Pluggable TCP congestion control support and NewReno
 * congestion control.
 * Based on ideas from I/O scheduler support and Web100.
 *
 * Copyright (C) 2005 Stephen Hemminger <shemminger@osdl.org>
 */

#define pr_fmt(fmt) "TCP: " fmt

#include <linux/module.h>
#include <linux/mm.h>
#include <linux/types.h>
#include <linux/list.h>
#include <linux/gfp.h>
#include <linux/jhash.h>
#include <net/tcp.h>
#include <trace/events/tcp.h>

static DEFINE_SPINLOCK(tcp_cong_list_lock);
static LIST_HEAD(tcp_cong_list);

/* Simple linear search, don't expect many entries! */
struct tcp_congestion_ops *tcp_ca_find(const char *name)
{
    struct tcp_congestion_ops *e;

    list_for_each_entry_rcu(e, &tcp_cong_list, list) {
        if (strcmp(e->name, name) == 0)
            return e;
    }

    return NULL;
}

void tcp_set_ca_state(struct sock *sk, const u8 ca_state)
{
    struct inet_connection_sock *icsk = inet_csk(sk);

    trace_tcp_cong_state_set(sk, ca_state);

    if (icsk->icsk_ca_ops->set_state)
        icsk->icsk_ca_ops->set_state(sk, ca_state);
    icsk->icsk_ca_state = ca_state;
}

/* Must be called with rcu lock held */
static struct tcp_congestion_ops *tcp_ca_find_autoload(struct net *net,
                               const char *name)
{
    struct tcp_congestion_ops *ca = tcp_ca_find(name);

#ifdef CONFIG_MODULES
    if (!ca && capable(CAP_NET_ADMIN)) {
        rcu_read_unlock();
        request_module("tcp_%s", name);
        rcu_read_lock();
        ca = tcp_ca_find(name);
    }
#endif
    return ca;
}

/* Simple linear search, not much in here. */
struct tcp_congestion_ops *tcp_ca_find_key(u32 key)
{
    struct tcp_congestion_ops *e;

    list_for_each_entry_rcu(e, &tcp_cong_list, list) {
        if (e->key == key)
            return e;
    }

    return NULL;
}

/*
 * Attach new congestion control algorithm to the list
 * of available options.
 */
int tcp_register_congestion_control(struct tcp_congestion_ops *ca)
{
    int ret = 0;

    /* all algorithms must implement these */
    if (!ca->ssthresh || !ca->undo_cwnd ||
        !(ca->cong_avoid || ca->cong_control)) {
        pr_err("%s does not implement required ops\n", ca->name);
        return -EINVAL;
    }

    ca->key = jhash(ca->name, sizeof(ca->name), strlen(ca->name));

    spin_lock(&tcp_cong_list_lock);
    if (ca->key == TCP_CA_UNSPEC || tcp_ca_find_key(ca->key)) {
        pr_notice("%s already registered or non-unique key\n",
              ca->name);
        ret = -EEXIST;
    } else {
        list_add_tail_rcu(&ca->list, &tcp_cong_list);
        pr_debug("%s registered\n", ca->name);
    }
    spin_unlock(&tcp_cong_list_lock);

    return ret;
}
EXPORT_SYMBOL_GPL(tcp_register_congestion_control);

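/*
 * Example (not part of this file): a minimal, hypothetical congestion
 * control module registering itself through the API above, in the same
 * pattern real modules such as tcp_cubic use. The tcp_example_* names
 * are illustrative only; this sketch simply delegates to the Reno
 * helpers exported further down, which satisfies the required-ops check
 * in tcp_register_congestion_control().
 */
#if 0
static u32 tcp_example_ssthresh(struct sock *sk)
{
    return tcp_reno_ssthresh(sk);           /* halve cwnd on loss */
}

static u32 tcp_example_undo_cwnd(struct sock *sk)
{
    return tcp_reno_undo_cwnd(sk);          /* restore pre-loss cwnd */
}

static void tcp_example_cong_avoid(struct sock *sk, u32 ack, u32 acked)
{
    tcp_reno_cong_avoid(sk, ack, acked);    /* Reno AIMD growth */
}

static struct tcp_congestion_ops tcp_example = {
    .name       = "example",                /* selectable by this name */
    .owner      = THIS_MODULE,
    .ssthresh   = tcp_example_ssthresh,
    .undo_cwnd  = tcp_example_undo_cwnd,
    .cong_avoid = tcp_example_cong_avoid,
};

static int __init tcp_example_register(void)
{
    return tcp_register_congestion_control(&tcp_example);
}

static void __exit tcp_example_unregister(void)
{
    tcp_unregister_congestion_control(&tcp_example);
}

module_init(tcp_example_register);
module_exit(tcp_example_unregister);
#endif
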
/*
 * Remove congestion control algorithm, called from
 * the module's remove function.  Module ref counts are used
 * to ensure that this can't be done until all sockets using
 * that method are closed.
 */
void tcp_unregister_congestion_control(struct tcp_congestion_ops *ca)
{
    spin_lock(&tcp_cong_list_lock);
    list_del_rcu(&ca->list);
    spin_unlock(&tcp_cong_list_lock);

    /* Wait for outstanding readers to complete before the
     * module gets removed entirely.
     *
     * A try_module_get() should fail by now, as our module is
     * in the "going" state since no refs are held anymore and
     * the module_exit() handler is being called.
     */
    synchronize_rcu();
}
EXPORT_SYMBOL_GPL(tcp_unregister_congestion_control);

u32 tcp_ca_get_key_by_name(struct net *net, const char *name, bool *ecn_ca)
{
    const struct tcp_congestion_ops *ca;
    u32 key = TCP_CA_UNSPEC;

    might_sleep();

    rcu_read_lock();
    ca = tcp_ca_find_autoload(net, name);
    if (ca) {
        key = ca->key;
        *ecn_ca = ca->flags & TCP_CONG_NEEDS_ECN;
    }
    rcu_read_unlock();

    return key;
}

char *tcp_ca_get_name_by_key(u32 key, char *buffer)
{
    const struct tcp_congestion_ops *ca;
    char *ret = NULL;

    rcu_read_lock();
    ca = tcp_ca_find_key(key);
    if (ca)
        ret = strncpy(buffer, ca->name,
                  TCP_CA_NAME_MAX);
    rcu_read_unlock();

    return ret;
}

/* Assign choice of congestion control. */
void tcp_assign_congestion_control(struct sock *sk)
{
    struct net *net = sock_net(sk);
    struct inet_connection_sock *icsk = inet_csk(sk);
    const struct tcp_congestion_ops *ca;

    rcu_read_lock();
    ca = rcu_dereference(net->ipv4.tcp_congestion_control);
    if (unlikely(!bpf_try_module_get(ca, ca->owner)))
        ca = &tcp_reno;
    icsk->icsk_ca_ops = ca;
    rcu_read_unlock();

    memset(icsk->icsk_ca_priv, 0, sizeof(icsk->icsk_ca_priv));
    if (ca->flags & TCP_CONG_NEEDS_ECN)
        INET_ECN_xmit(sk);
    else
        INET_ECN_dontxmit(sk);
}

void tcp_init_congestion_control(struct sock *sk)
{
    struct inet_connection_sock *icsk = inet_csk(sk);

    tcp_sk(sk)->prior_ssthresh = 0;
    if (icsk->icsk_ca_ops->init)
        icsk->icsk_ca_ops->init(sk);
    if (tcp_ca_needs_ecn(sk))
        INET_ECN_xmit(sk);
    else
        INET_ECN_dontxmit(sk);
    icsk->icsk_ca_initialized = 1;
}

static void tcp_reinit_congestion_control(struct sock *sk,
                      const struct tcp_congestion_ops *ca)
{
    struct inet_connection_sock *icsk = inet_csk(sk);

    tcp_cleanup_congestion_control(sk);
    icsk->icsk_ca_ops = ca;
    icsk->icsk_ca_setsockopt = 1;
    memset(icsk->icsk_ca_priv, 0, sizeof(icsk->icsk_ca_priv));

    if (ca->flags & TCP_CONG_NEEDS_ECN)
        INET_ECN_xmit(sk);
    else
        INET_ECN_dontxmit(sk);

    if (!((1 << sk->sk_state) & (TCPF_CLOSE | TCPF_LISTEN)))
        tcp_init_congestion_control(sk);
}

/* Manage refcounts on socket close. */
void tcp_cleanup_congestion_control(struct sock *sk)
{
    struct inet_connection_sock *icsk = inet_csk(sk);

    if (icsk->icsk_ca_ops->release)
        icsk->icsk_ca_ops->release(sk);
    bpf_module_put(icsk->icsk_ca_ops, icsk->icsk_ca_ops->owner);
}

/* Used by sysctl to change default congestion control */
int tcp_set_default_congestion_control(struct net *net, const char *name)
{
    struct tcp_congestion_ops *ca;
    const struct tcp_congestion_ops *prev;
    int ret;

    rcu_read_lock();
    ca = tcp_ca_find_autoload(net, name);
    if (!ca) {
        ret = -ENOENT;
    } else if (!bpf_try_module_get(ca, ca->owner)) {
        ret = -EBUSY;
    } else if (!net_eq(net, &init_net) &&
            !(ca->flags & TCP_CONG_NON_RESTRICTED)) {
        /* Only init netns can set default to a restricted algorithm */
        ret = -EPERM;
    } else {
        prev = xchg(&net->ipv4.tcp_congestion_control, ca);
        if (prev)
            bpf_module_put(prev, prev->owner);

        ca->flags |= TCP_CONG_NON_RESTRICTED;
        ret = 0;
    }
    rcu_read_unlock();

    return ret;
}

/* Set default value from kernel configuration at bootup */
static int __init tcp_congestion_default(void)
{
    return tcp_set_default_congestion_control(&init_net,
                          CONFIG_DEFAULT_TCP_CONG);
}
late_initcall(tcp_congestion_default);

/* Build string with list of available congestion control values */
void tcp_get_available_congestion_control(char *buf, size_t maxlen)
{
    struct tcp_congestion_ops *ca;
    size_t offs = 0;

    rcu_read_lock();
    list_for_each_entry_rcu(ca, &tcp_cong_list, list) {
        offs += snprintf(buf + offs, maxlen - offs,
                 "%s%s",
                 offs == 0 ? "" : " ", ca->name);

        if (WARN_ON_ONCE(offs >= maxlen))
            break;
    }
    rcu_read_unlock();
}

/* Get current default congestion control */
void tcp_get_default_congestion_control(struct net *net, char *name)
{
    const struct tcp_congestion_ops *ca;

    rcu_read_lock();
    ca = rcu_dereference(net->ipv4.tcp_congestion_control);
    strncpy(name, ca->name, TCP_CA_NAME_MAX);
    rcu_read_unlock();
}

/* Build list of non-restricted congestion control values */
void tcp_get_allowed_congestion_control(char *buf, size_t maxlen)
{
    struct tcp_congestion_ops *ca;
    size_t offs = 0;

    *buf = '\0';
    rcu_read_lock();
    list_for_each_entry_rcu(ca, &tcp_cong_list, list) {
        if (!(ca->flags & TCP_CONG_NON_RESTRICTED))
            continue;
        offs += snprintf(buf + offs, maxlen - offs,
                 "%s%s",
                 offs == 0 ? "" : " ", ca->name);

        if (WARN_ON_ONCE(offs >= maxlen))
            break;
    }
    rcu_read_unlock();
}

/* Change list of non-restricted congestion control */
int tcp_set_allowed_congestion_control(char *val)
{
    struct tcp_congestion_ops *ca;
    char *saved_clone, *clone, *name;
    int ret = 0;

    saved_clone = clone = kstrdup(val, GFP_USER);
    if (!clone)
        return -ENOMEM;

    spin_lock(&tcp_cong_list_lock);
    /* pass 1 check for bad entries */
    while ((name = strsep(&clone, " ")) && *name) {
        ca = tcp_ca_find(name);
        if (!ca) {
            ret = -ENOENT;
            goto out;
        }
    }

    /* pass 2 clear old values */
    list_for_each_entry_rcu(ca, &tcp_cong_list, list)
        ca->flags &= ~TCP_CONG_NON_RESTRICTED;

    /* pass 3 mark as allowed */
    while ((name = strsep(&val, " ")) && *name) {
        ca = tcp_ca_find(name);
        WARN_ON(!ca);
        if (ca)
            ca->flags |= TCP_CONG_NON_RESTRICTED;
    }
out:
    spin_unlock(&tcp_cong_list_lock);
    kfree(saved_clone);

    return ret;
}

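/*
 * Example (not part of this file): the parser above consumes the
 * space-separated list written to the tcp_allowed_congestion_control
 * sysctl. A hedged userspace sketch of setting it via procfs; error
 * handling is minimal and the algorithm names are illustrative, and
 * each name must already be registered or the write fails with ENOENT.
 */
#if 0
#include <fcntl.h>
#include <string.h>
#include <unistd.h>

int set_allowed_cc(void)
{
    const char list[] = "reno cubic";   /* illustrative names */
    int fd = open("/proc/sys/net/ipv4/tcp_allowed_congestion_control",
                  O_WRONLY);

    if (fd < 0)
        return -1;
    if (write(fd, list, strlen(list)) < 0) {
        close(fd);
        return -1;
    }
    return close(fd);
}
#endif
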
/* Change congestion control for socket. If load is false, then it is the
 * responsibility of the caller to call tcp_init_congestion_control or
 * tcp_reinit_congestion_control (if the current congestion control was
 * already initialized).
 */
int tcp_set_congestion_control(struct sock *sk, const char *name, bool load,
                   bool cap_net_admin)
{
    struct inet_connection_sock *icsk = inet_csk(sk);
    const struct tcp_congestion_ops *ca;
    int err = 0;

    if (icsk->icsk_ca_dst_locked)
        return -EPERM;

    rcu_read_lock();
    if (!load)
        ca = tcp_ca_find(name);
    else
        ca = tcp_ca_find_autoload(sock_net(sk), name);

    /* No change: asking for the existing value */
    if (ca == icsk->icsk_ca_ops) {
        icsk->icsk_ca_setsockopt = 1;
        goto out;
    }

    if (!ca)
        err = -ENOENT;
    else if (!((ca->flags & TCP_CONG_NON_RESTRICTED) || cap_net_admin))
        err = -EPERM;
    else if (!bpf_try_module_get(ca, ca->owner))
        err = -EBUSY;
    else
        tcp_reinit_congestion_control(sk, ca);
 out:
    rcu_read_unlock();
    return err;
}

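/*
 * Example (not part of this file): the per-socket path above is what
 * backs the TCP_CONGESTION socket option. A minimal userspace sketch;
 * "cubic" is illustrative and must name a registered (or, with
 * CAP_NET_ADMIN, autoloadable) algorithm, otherwise setsockopt()
 * fails with ENOENT.
 */
#if 0
#include <netinet/in.h>
#include <netinet/tcp.h>
#include <string.h>
#include <sys/socket.h>

int set_socket_cc(int fd)
{
    const char name[] = "cubic";

    /* Select the congestion control algorithm for this socket. */
    return setsockopt(fd, IPPROTO_TCP, TCP_CONGESTION,
                      name, strlen(name));
}
#endif
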
/* Slow start is used when the congestion window is no greater than the slow
 * start threshold. We base it on RFC 2581 and also handle stretch ACKs
 * properly. We do not implement RFC 3465 Appropriate Byte Counting (ABC) per
 * se but something better;) a packet is only considered (s)acked in its
 * entirety to defend against the ACK attacks described in the RFC. Slow start
 * processes a stretch ACK of degree N as if N acks of degree 1 are received
 * back to back, except ABC caps N to 2. Slow start exits when cwnd grows over
 * ssthresh and returns the leftover acks to adjust cwnd in congestion
 * avoidance mode.
 */
u32 tcp_slow_start(struct tcp_sock *tp, u32 acked)
{
    u32 cwnd = min(tcp_snd_cwnd(tp) + acked, tp->snd_ssthresh);

    acked -= cwnd - tcp_snd_cwnd(tp);
    tcp_snd_cwnd_set(tp, min(cwnd, tp->snd_cwnd_clamp));

    return acked;
}
EXPORT_SYMBOL_GPL(tcp_slow_start);

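/*
 * Worked example for tcp_slow_start() (values illustrative): with
 * snd_cwnd = 8, snd_ssthresh = 10 and a stretch ACK covering
 * acked = 5 packets, cwnd is first capped at min(8 + 5, 10) = 10, so
 * slow start consumes 10 - 8 = 2 of the acked packets and returns the
 * remaining 5 - 2 = 3 for the congestion avoidance phase below.
 */
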
/* In theory this is tp->snd_cwnd += 1 / tp->snd_cwnd (or alternative w),
 * for every packet that was ACKed.
 */
void tcp_cong_avoid_ai(struct tcp_sock *tp, u32 w, u32 acked)
{
    /* If credits accumulated at a higher w, apply them gently now. */
    if (tp->snd_cwnd_cnt >= w) {
        tp->snd_cwnd_cnt = 0;
        tcp_snd_cwnd_set(tp, tcp_snd_cwnd(tp) + 1);
    }

    tp->snd_cwnd_cnt += acked;
    if (tp->snd_cwnd_cnt >= w) {
        u32 delta = tp->snd_cwnd_cnt / w;

        tp->snd_cwnd_cnt -= delta * w;
        tcp_snd_cwnd_set(tp, tcp_snd_cwnd(tp) + delta);
    }
    tcp_snd_cwnd_set(tp, min(tcp_snd_cwnd(tp), tp->snd_cwnd_clamp));
}
EXPORT_SYMBOL_GPL(tcp_cong_avoid_ai);

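/*
 * Worked example for tcp_cong_avoid_ai() (values illustrative): with
 * w = snd_cwnd = 10, snd_cwnd_cnt = 0 and acked = 25, snd_cwnd_cnt
 * grows to 25, so delta = 25 / 10 = 2, cwnd becomes 12, and
 * snd_cwnd_cnt keeps the 25 - 2 * 10 = 5 leftover credits. This
 * approximates cwnd += 1/cwnd per ACKed packet even across stretch
 * ACKs.
 */
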
/*
 * TCP Reno congestion control
 * This is a special case, used as a fallback as well.
 */
/* This is Jacobson's slow start and congestion avoidance.
 * SIGCOMM '88, p. 328.
 */
void tcp_reno_cong_avoid(struct sock *sk, u32 ack, u32 acked)
{
    struct tcp_sock *tp = tcp_sk(sk);

    if (!tcp_is_cwnd_limited(sk))
        return;

    /* In "safe" area, increase. */
    if (tcp_in_slow_start(tp)) {
        acked = tcp_slow_start(tp, acked);
        if (!acked)
            return;
    }
    /* In dangerous area, increase slowly. */
    tcp_cong_avoid_ai(tp, tcp_snd_cwnd(tp), acked);
}
EXPORT_SYMBOL_GPL(tcp_reno_cong_avoid);

/* Slow start threshold is half the congestion window (min 2) */
u32 tcp_reno_ssthresh(struct sock *sk)
{
    const struct tcp_sock *tp = tcp_sk(sk);

    return max(tcp_snd_cwnd(tp) >> 1U, 2U);
}
EXPORT_SYMBOL_GPL(tcp_reno_ssthresh);

u32 tcp_reno_undo_cwnd(struct sock *sk)
{
    const struct tcp_sock *tp = tcp_sk(sk);

    return max(tcp_snd_cwnd(tp), tp->prior_cwnd);
}
EXPORT_SYMBOL_GPL(tcp_reno_undo_cwnd);

struct tcp_congestion_ops tcp_reno = {
    .flags      = TCP_CONG_NON_RESTRICTED,
    .name       = "reno",
    .owner      = THIS_MODULE,
    .ssthresh   = tcp_reno_ssthresh,
    .cong_avoid = tcp_reno_cong_avoid,
    .undo_cwnd  = tcp_reno_undo_cwnd,
};