/*
 * Pluggable TCP congestion control support and newReno
 * congestion control.
 */
#define pr_fmt(fmt) "TCP: " fmt

#include <linux/module.h>
#include <linux/mm.h>
#include <linux/types.h>
#include <linux/list.h>
#include <linux/gfp.h>
#include <linux/jhash.h>
#include <net/tcp.h>
#include <trace/events/tcp.h>

static DEFINE_SPINLOCK(tcp_cong_list_lock);
static LIST_HEAD(tcp_cong_list);

/* Simple linear search, don't expect many entries! */
struct tcp_congestion_ops *tcp_ca_find(const char *name)
{
        struct tcp_congestion_ops *e;

        list_for_each_entry_rcu(e, &tcp_cong_list, list) {
                if (strcmp(e->name, name) == 0)
                        return e;
        }

        return NULL;
}

/* Update the congestion state of a socket, notifying the current
 * congestion control module first.
 */
void tcp_set_ca_state(struct sock *sk, const u8 ca_state)
{
        struct inet_connection_sock *icsk = inet_csk(sk);

        trace_tcp_cong_state_set(sk, ca_state);

        if (icsk->icsk_ca_ops->set_state)
                icsk->icsk_ca_ops->set_state(sk, ca_state);
        icsk->icsk_ca_state = ca_state;
}

/* Must be called with rcu lock held */
static struct tcp_congestion_ops *tcp_ca_find_autoload(struct net *net,
                                                       const char *name)
{
        struct tcp_congestion_ops *ca = tcp_ca_find(name);

#ifdef CONFIG_MODULES
        if (!ca && capable(CAP_NET_ADMIN)) {
                rcu_read_unlock();
                request_module("tcp_%s", name);
                rcu_read_lock();
                ca = tcp_ca_find(name);
        }
#endif
        return ca;
}

/* Simple linear search, not much in here. */
struct tcp_congestion_ops *tcp_ca_find_key(u32 key)
{
        struct tcp_congestion_ops *e;

        list_for_each_entry_rcu(e, &tcp_cong_list, list) {
                if (e->key == key)
                        return e;
        }

        return NULL;
}

/*
 * Attach new congestion control algorithm to the list
 * of available options.
 */
int tcp_register_congestion_control(struct tcp_congestion_ops *ca)
{
        int ret = 0;

        /* All algorithms must implement ssthresh and undo_cwnd, plus
         * either cong_avoid or cong_control.
         */
        if (!ca->ssthresh || !ca->undo_cwnd ||
            !(ca->cong_avoid || ca->cong_control)) {
                pr_err("%s does not implement required ops\n", ca->name);
                return -EINVAL;
        }

        ca->key = jhash(ca->name, sizeof(ca->name), strlen(ca->name));

        spin_lock(&tcp_cong_list_lock);
        if (ca->key == TCP_CA_UNSPEC || tcp_ca_find_key(ca->key)) {
                pr_notice("%s already registered or non-unique key\n",
                          ca->name);
                ret = -EEXIST;
        } else {
                list_add_tail_rcu(&ca->list, &tcp_cong_list);
                pr_debug("%s registered\n", ca->name);
        }
        spin_unlock(&tcp_cong_list_lock);

        return ret;
}
EXPORT_SYMBOL_GPL(tcp_register_congestion_control);
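
/* For illustration only: a minimal congestion control module would
 * register itself roughly like this (the "example" names below are
 * hypothetical sketches, not part of this file):
 *
 *	static struct tcp_congestion_ops tcp_example __read_mostly = {
 *		.name		= "example",
 *		.owner		= THIS_MODULE,
 *		.ssthresh	= tcp_reno_ssthresh,
 *		.cong_avoid	= tcp_reno_cong_avoid,
 *		.undo_cwnd	= tcp_reno_undo_cwnd,
 *	};
 *
 *	static int __init tcp_example_register(void)
 *	{
 *		return tcp_register_congestion_control(&tcp_example);
 *	}
 *
 *	static void __exit tcp_example_unregister(void)
 *	{
 *		tcp_unregister_congestion_control(&tcp_example);
 *	}
 *
 *	module_init(tcp_example_register);
 *	module_exit(tcp_example_unregister);
 */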

/*
 * Remove congestion control algorithm, called from
 * the module's remove function.  Module ref counts are used
 * to ensure that this can't be done till all sockets using
 * that method are closed.
 */
void tcp_unregister_congestion_control(struct tcp_congestion_ops *ca)
{
        spin_lock(&tcp_cong_list_lock);
        list_del_rcu(&ca->list);
        spin_unlock(&tcp_cong_list_lock);

        /* Wait for outstanding readers to complete before the
         * module or struct can go away entirely.
         */
        synchronize_rcu();
}
EXPORT_SYMBOL_GPL(tcp_unregister_congestion_control);

/* Look up the key of a congestion control by name, loading the module
 * if necessary and permitted.  May sleep (module load).
 */
u32 tcp_ca_get_key_by_name(struct net *net, const char *name, bool *ecn_ca)
{
        const struct tcp_congestion_ops *ca;
        u32 key = TCP_CA_UNSPEC;

        might_sleep();

        rcu_read_lock();
        ca = tcp_ca_find_autoload(net, name);
        if (ca) {
                key = ca->key;
                *ecn_ca = ca->flags & TCP_CONG_NEEDS_ECN;
        }
        rcu_read_unlock();

        return key;
}

/* Copy the name of the congestion control identified by @key into @buffer. */
char *tcp_ca_get_name_by_key(u32 key, char *buffer)
{
        const struct tcp_congestion_ops *ca;
        char *ret = NULL;

        rcu_read_lock();
        ca = tcp_ca_find_key(key);
        if (ca)
                ret = strncpy(buffer, ca->name, TCP_CA_NAME_MAX);
        rcu_read_unlock();

        return ret;
}

/* Assign choice of congestion control. */
void tcp_assign_congestion_control(struct sock *sk)
{
        struct net *net = sock_net(sk);
        struct inet_connection_sock *icsk = inet_csk(sk);
        const struct tcp_congestion_ops *ca;

        rcu_read_lock();
        ca = rcu_dereference(net->ipv4.tcp_congestion_control);
        if (unlikely(!bpf_try_module_get(ca, ca->owner)))
                ca = &tcp_reno;
        icsk->icsk_ca_ops = ca;
        rcu_read_unlock();

        memset(icsk->icsk_ca_priv, 0, sizeof(icsk->icsk_ca_priv));
        if (ca->flags & TCP_CONG_NEEDS_ECN)
                INET_ECN_xmit(sk);
        else
                INET_ECN_dontxmit(sk);
}

void tcp_init_congestion_control(struct sock *sk)
{
        struct inet_connection_sock *icsk = inet_csk(sk);

        tcp_sk(sk)->prior_ssthresh = 0;
        if (icsk->icsk_ca_ops->init)
                icsk->icsk_ca_ops->init(sk);
        if (tcp_ca_needs_ecn(sk))
                INET_ECN_xmit(sk);
        else
                INET_ECN_dontxmit(sk);
        icsk->icsk_ca_initialized = 1;
}

static void tcp_reinit_congestion_control(struct sock *sk,
                                          const struct tcp_congestion_ops *ca)
{
        struct inet_connection_sock *icsk = inet_csk(sk);

        tcp_cleanup_congestion_control(sk);
        icsk->icsk_ca_ops = ca;
        icsk->icsk_ca_setsockopt = 1;
        memset(icsk->icsk_ca_priv, 0, sizeof(icsk->icsk_ca_priv));

        if (ca->flags & TCP_CONG_NEEDS_ECN)
                INET_ECN_xmit(sk);
        else
                INET_ECN_dontxmit(sk);

        if (!((1 << sk->sk_state) & (TCPF_CLOSE | TCPF_LISTEN)))
                tcp_init_congestion_control(sk);
}

/* Manage refcounts on socket close. */
void tcp_cleanup_congestion_control(struct sock *sk)
{
        struct inet_connection_sock *icsk = inet_csk(sk);

        if (icsk->icsk_ca_ops->release)
                icsk->icsk_ca_ops->release(sk);
        bpf_module_put(icsk->icsk_ca_ops, icsk->icsk_ca_ops->owner);
}

/* Used by sysctl to change default congestion control */
int tcp_set_default_congestion_control(struct net *net, const char *name)
{
        struct tcp_congestion_ops *ca;
        const struct tcp_congestion_ops *prev;
        int ret;

        rcu_read_lock();
        ca = tcp_ca_find_autoload(net, name);
        if (!ca) {
                ret = -ENOENT;
        } else if (!bpf_try_module_get(ca, ca->owner)) {
                ret = -EBUSY;
        } else if (!net_eq(net, &init_net) &&
                   !(ca->flags & TCP_CONG_NON_RESTRICTED)) {
                /* Only the init netns may set a restricted algorithm as default */
                ret = -EPERM;
        } else {
                prev = xchg(&net->ipv4.tcp_congestion_control, ca);
                if (prev)
                        bpf_module_put(prev, prev->owner);

                ca->flags |= TCP_CONG_NON_RESTRICTED;
                ret = 0;
        }
        rcu_read_unlock();

        return ret;
}
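
/* Illustration: this is the function behind the per-netns
 * net.ipv4.tcp_congestion_control sysctl, e.g.
 *
 *	sysctl -w net.ipv4.tcp_congestion_control=cubic
 *
 * ends up here with name "cubic" for that netns.
 */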

/* Set default value from kernel configuration */
static int __init tcp_congestion_default(void)
{
        return tcp_set_default_congestion_control(&init_net,
                                                  CONFIG_DEFAULT_TCP_CONG);
}
late_initcall(tcp_congestion_default);

/* Build string with list of available congestion control values */
void tcp_get_available_congestion_control(char *buf, size_t maxlen)
{
        struct tcp_congestion_ops *ca;
        size_t offs = 0;

        rcu_read_lock();
        list_for_each_entry_rcu(ca, &tcp_cong_list, list) {
                offs += snprintf(buf + offs, maxlen - offs,
                                 "%s%s",
                                 offs == 0 ? "" : " ", ca->name);

                if (WARN_ON_ONCE(offs >= maxlen))
                        break;
        }
        rcu_read_unlock();
}

/* Get current default congestion control */
void tcp_get_default_congestion_control(struct net *net, char *name)
{
        const struct tcp_congestion_ops *ca;

        rcu_read_lock();
        ca = rcu_dereference(net->ipv4.tcp_congestion_control);
        strncpy(name, ca->name, TCP_CA_NAME_MAX);
        rcu_read_unlock();
}

/* Build list of non-restricted congestion control values */
void tcp_get_allowed_congestion_control(char *buf, size_t maxlen)
{
        struct tcp_congestion_ops *ca;
        size_t offs = 0;

        *buf = '\0';
        rcu_read_lock();
        list_for_each_entry_rcu(ca, &tcp_cong_list, list) {
                if (!(ca->flags & TCP_CONG_NON_RESTRICTED))
                        continue;
                offs += snprintf(buf + offs, maxlen - offs,
                                 "%s%s",
                                 offs == 0 ? "" : " ", ca->name);

                if (WARN_ON_ONCE(offs >= maxlen))
                        break;
        }
        rcu_read_unlock();
}

/* Change list of non-restricted congestion control */
int tcp_set_allowed_congestion_control(char *val)
{
        struct tcp_congestion_ops *ca;
        char *saved_clone, *clone, *name;
        int ret = 0;

        saved_clone = clone = kstrdup(val, GFP_USER);
        if (!clone)
                return -ENOMEM;

        spin_lock(&tcp_cong_list_lock);
        /* pass 1: check for bad entries */
        while ((name = strsep(&clone, " ")) && *name) {
                ca = tcp_ca_find(name);
                if (!ca) {
                        ret = -ENOENT;
                        goto out;
                }
        }

        /* pass 2: clear old values */
        list_for_each_entry_rcu(ca, &tcp_cong_list, list)
                ca->flags &= ~TCP_CONG_NON_RESTRICTED;

        /* pass 3: mark as allowed */
        while ((name = strsep(&val, " ")) && *name) {
                ca = tcp_ca_find(name);
                WARN_ON(!ca);
                if (ca)
                        ca->flags |= TCP_CONG_NON_RESTRICTED;
        }
out:
        spin_unlock(&tcp_cong_list_lock);
        kfree(saved_clone);

        return ret;
}
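
/* Illustration: the allowed list above backs the
 * net.ipv4.tcp_allowed_congestion_control sysctl, e.g.
 *
 *	sysctl -w net.ipv4.tcp_allowed_congestion_control="reno cubic"
 *
 * lets unprivileged sockets pick only "reno" or "cubic".
 */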

/* Change congestion control for socket.  If load is false, then it is the
 * responsibility of the caller to call tcp_init_congestion_control or
 * tcp_reinit_congestion_control (if the current congestion control was
 * already initialized).
 */
int tcp_set_congestion_control(struct sock *sk, const char *name, bool load,
                               bool cap_net_admin)
{
        struct inet_connection_sock *icsk = inet_csk(sk);
        const struct tcp_congestion_ops *ca;
        int err = 0;

        if (icsk->icsk_ca_dst_locked)
                return -EPERM;

        rcu_read_lock();
        if (!load)
                ca = tcp_ca_find(name);
        else
                ca = tcp_ca_find_autoload(sock_net(sk), name);

        /* No change asking for existing value */
        if (ca == icsk->icsk_ca_ops) {
                icsk->icsk_ca_setsockopt = 1;
                goto out;
        }

        if (!ca)
                err = -ENOENT;
        else if (!((ca->flags & TCP_CONG_NON_RESTRICTED) || cap_net_admin))
                err = -EPERM;
        else if (!bpf_try_module_get(ca, ca->owner))
                err = -EBUSY;
        else
                tcp_reinit_congestion_control(sk, ca);
out:
        rcu_read_unlock();
        return err;
}
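
/* Illustration: the per-socket path above is what the TCP_CONGESTION
 * socket option uses.  From user space (sketch):
 *
 *	char name[] = "cubic";
 *
 *	setsockopt(fd, IPPROTO_TCP, TCP_CONGESTION, name, strlen(name));
 *
 * Picking an algorithm outside the allowed list requires CAP_NET_ADMIN.
 */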

/* Slow start is used while the congestion window is no greater than the
 * slow start threshold, per RFC 2581, and handles stretch ACKs: a stretch
 * ACK of degree N is processed as if N acks of degree 1 were received back
 * to back.  Slow start exits when cwnd grows over ssthresh and returns the
 * leftover acks so they can adjust cwnd in congestion avoidance mode.
 */
u32 tcp_slow_start(struct tcp_sock *tp, u32 acked)
{
        u32 cwnd = min(tcp_snd_cwnd(tp) + acked, tp->snd_ssthresh);

        acked -= cwnd - tcp_snd_cwnd(tp);
        tcp_snd_cwnd_set(tp, min(cwnd, tp->snd_cwnd_clamp));

        return acked;
}
EXPORT_SYMBOL_GPL(tcp_slow_start);
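
/* Worked example: with snd_cwnd = 8, snd_ssthresh = 10 and acked = 5,
 * cwnd is capped at 10, the first 2 acks are consumed by slow start and
 * the remaining 3 are returned for congestion avoidance.
 */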

/* In theory this is tp->snd_cwnd += 1 / tp->snd_cwnd (or alternative w),
 * for every packet that was ACKed.
 */
void tcp_cong_avoid_ai(struct tcp_sock *tp, u32 w, u32 acked)
{
        /* If credits accumulated at a higher w, apply them gently now. */
        if (tp->snd_cwnd_cnt >= w) {
                tp->snd_cwnd_cnt = 0;
                tcp_snd_cwnd_set(tp, tcp_snd_cwnd(tp) + 1);
        }

        tp->snd_cwnd_cnt += acked;
        if (tp->snd_cwnd_cnt >= w) {
                u32 delta = tp->snd_cwnd_cnt / w;

                tp->snd_cwnd_cnt -= delta * w;
                tcp_snd_cwnd_set(tp, tcp_snd_cwnd(tp) + delta);
        }
        tcp_snd_cwnd_set(tp, min(tcp_snd_cwnd(tp), tp->snd_cwnd_clamp));
}
EXPORT_SYMBOL_GPL(tcp_cong_avoid_ai);
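
/* Worked example: with w = 10, snd_cwnd_cnt = 0 and acked = 25, the counter
 * reaches 25, so delta = 2: cwnd grows by 2 and the remaining 5 credits stay
 * in snd_cwnd_cnt for the next call.
 */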

/*
 * TCP Reno congestion control
 * This is special case used for fallback as well.
 */
/* This is Jacobson's slow start and congestion avoidance.
 * SIGCOMM '88, p. 328.
 */
void tcp_reno_cong_avoid(struct sock *sk, u32 ack, u32 acked)
{
        struct tcp_sock *tp = tcp_sk(sk);

        if (!tcp_is_cwnd_limited(sk))
                return;

        /* In "safe" area, increase. */
        if (tcp_in_slow_start(tp)) {
                acked = tcp_slow_start(tp, acked);
                if (!acked)
                        return;
        }
        /* In dangerous area, increase slowly. */
        tcp_cong_avoid_ai(tp, tcp_snd_cwnd(tp), acked);
}
EXPORT_SYMBOL_GPL(tcp_reno_cong_avoid);

/* Slow start threshold is half the congestion window (min 2) */
u32 tcp_reno_ssthresh(struct sock *sk)
{
        const struct tcp_sock *tp = tcp_sk(sk);

        return max(tcp_snd_cwnd(tp) >> 1U, 2U);
}
EXPORT_SYMBOL_GPL(tcp_reno_ssthresh);

u32 tcp_reno_undo_cwnd(struct sock *sk)
{
        const struct tcp_sock *tp = tcp_sk(sk);

        return max(tcp_snd_cwnd(tp), tp->prior_cwnd);
}
EXPORT_SYMBOL_GPL(tcp_reno_undo_cwnd);

struct tcp_congestion_ops tcp_reno = {
        .flags          = TCP_CONG_NON_RESTRICTED,
        .name           = "reno",
        .owner          = THIS_MODULE,
        .ssthresh       = tcp_reno_ssthresh,
        .cong_avoid     = tcp_reno_cong_avoid,
        .undo_cwnd      = tcp_reno_undo_cwnd,
};