// SPDX-License-Identifier: GPL-2.0-only
/*
 * TCP CUBIC: Binary Increase Congestion control for TCP v2.3
 *
 * This is the implementation of CUBIC TCP described in
 * Sangtae Ha, Injong Rhee and Lisong Xu,
 * "CUBIC: A New TCP-Friendly High-Speed TCP Variant",
 * ACM SIGOPS Operating System Review, July 2008.
 *
 * CUBIC grows the congestion window as a cubic function of the time
 * elapsed since the last loss event, independently of RTT, and tracks
 * an estimated Reno window as a lower bound (TCP friendliness).
 * Slow start can be terminated early by hybrid slow start (HyStart),
 * which watches ACK train lengths and delay increases.
 */
#include <linux/mm.h>
#include <linux/btf.h>
#include <linux/btf_ids.h>
#include <linux/module.h>
#include <linux/math64.h>
#include <net/tcp.h>

#define BICTCP_BETA_SCALE    1024	/* Scale factor for beta calculation
					 * max_cwnd = snd_cwnd * beta
					 */
#define	BICTCP_HZ		10	/* BIC HZ 2^10 = 1024 */

/* Two methods of hybrid slow start */
#define HYSTART_ACK_TRAIN	0x1
#define HYSTART_DELAY		0x2

/* Number of delay samples for detecting the increase of delay */
#define HYSTART_MIN_SAMPLES	8
#define HYSTART_DELAY_MIN	(4000U)		/* 4 ms */
#define HYSTART_DELAY_MAX	(16000U)	/* 16 ms */
#define HYSTART_DELAY_THRESH(x)	clamp(x, HYSTART_DELAY_MIN, HYSTART_DELAY_MAX)
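
/* Worked example (illustrative): hystart_update() below uses the
 * delay-increase threshold HYSTART_DELAY_THRESH(delay_min >> 3), i.e.
 * one eighth of the minimum RTT clamped to [4 ms, 16 ms].  With
 * delay_min = 20000 us (20 ms) this gives clamp(2500, 4000, 16000)
 * = 4000 us on top of delay_min.
 */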

static int fast_convergence __read_mostly = 1;
static int beta __read_mostly = 717;	/* = 717/1024 (BICTCP_BETA_SCALE) */
static int initial_ssthresh __read_mostly;
static int bic_scale __read_mostly = 41;
static int tcp_friendliness __read_mostly = 1;

static int hystart __read_mostly = 1;
static int hystart_detect __read_mostly = HYSTART_ACK_TRAIN | HYSTART_DELAY;
static int hystart_low_window __read_mostly = 16;
static int hystart_ack_delta_us __read_mostly = 2000;

/* Scaling constants derived from the parameters above at module load */
static u32 cube_rtt_scale __read_mostly;
static u32 beta_scale __read_mostly;
static u64 cube_factor __read_mostly;

module_param(fast_convergence, int, 0644);
MODULE_PARM_DESC(fast_convergence, "turn on/off fast convergence");
module_param(beta, int, 0644);
MODULE_PARM_DESC(beta, "beta for multiplicative decrease");
module_param(initial_ssthresh, int, 0644);
MODULE_PARM_DESC(initial_ssthresh, "initial value of slow start threshold");
module_param(bic_scale, int, 0444);
MODULE_PARM_DESC(bic_scale, "scale (scaled by 1024) value for bic function (bic_scale/1024)");
module_param(tcp_friendliness, int, 0644);
MODULE_PARM_DESC(tcp_friendliness, "turn on/off tcp friendliness");
module_param(hystart, int, 0644);
MODULE_PARM_DESC(hystart, "turn on/off hybrid slow start algorithm");
module_param(hystart_detect, int, 0644);
MODULE_PARM_DESC(hystart_detect, "hybrid slow start detection mechanisms"
		 " 1: packet-train 2: delay 3: both packet-train and delay");
module_param(hystart_low_window, int, 0644);
MODULE_PARM_DESC(hystart_low_window, "lower bound cwnd for hybrid slow start");
module_param(hystart_ack_delta_us, int, 0644);
MODULE_PARM_DESC(hystart_ack_delta_us, "spacing between ack's indicating train (usecs)");

/* BIC TCP Parameters */
struct bictcp {
	u32	cnt;		/* increase cwnd by 1 after ACKs */
	u32	last_max_cwnd;	/* last maximum snd_cwnd */
	u32	last_cwnd;	/* the last snd_cwnd */
	u32	last_time;	/* time when updated last_cwnd */
	u32	bic_origin_point;/* origin point of bic function */
	u32	bic_K;		/* time to origin point
				 * from the beginning of the current epoch
				 */
	u32	delay_min;	/* min delay (usec) */
	u32	epoch_start;	/* beginning of an epoch */
	u32	ack_cnt;	/* number of acks */
	u32	tcp_cwnd;	/* estimated tcp cwnd */
	u16	unused;
	u8	sample_cnt;	/* number of samples to decide curr_rtt */
	u8	found;		/* the exit point is found? */
	u32	round_start;	/* beginning of each round */
	u32	end_seq;	/* end_seq of the round */
	u32	last_ack;	/* last time when the ACK spacing is close */
	u32	curr_rtt;	/* the minimum rtt of current round */
};
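
/* Note: struct bictcp lives in the congestion-control private area of
 * struct inet_connection_sock; cubictcp_register() below verifies with
 * BUILD_BUG_ON() that it fits in ICSK_CA_PRIV_SIZE.
 */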

static inline void bictcp_reset(struct bictcp *ca)
{
	/* Zero all fields before the 'unused' marker; the HyStart round
	 * state after it is kept (it is re-armed by bictcp_hystart_reset),
	 * except 'found' which is cleared explicitly.
	 */
	memset(ca, 0, offsetof(struct bictcp, unused));
	ca->found = 0;
}

static inline u32 bictcp_clock_us(const struct sock *sk)
{
	return tcp_sk(sk)->tcp_mstamp;	/* usec resolution timestamp */
}

static inline void bictcp_hystart_reset(struct sock *sk)
{
	struct tcp_sock *tp = tcp_sk(sk);
	struct bictcp *ca = inet_csk_ca(sk);

	ca->round_start = ca->last_ack = bictcp_clock_us(sk);
	ca->end_seq = tp->snd_nxt;	/* the current round ends here */
	ca->curr_rtt = ~0U;
	ca->sample_cnt = 0;
}

static void cubictcp_init(struct sock *sk)
{
	struct bictcp *ca = inet_csk_ca(sk);

	bictcp_reset(ca);

	if (hystart)
		bictcp_hystart_reset(sk);

	if (!hystart && initial_ssthresh)
		tcp_sk(sk)->snd_ssthresh = initial_ssthresh;
}

static void cubictcp_cwnd_event(struct sock *sk, enum tcp_ca_event event)
{
	if (event == CA_EVENT_TX_START) {
		struct bictcp *ca = inet_csk_ca(sk);
		u32 now = tcp_jiffies32;
		s32 delta;

		delta = now - tcp_sk(sk)->lsndtime;

		/* We were application limited (idle) for a while.
		 * Shift epoch_start to keep cwnd growth to cubic curve.
		 */
		if (ca->epoch_start && delta > 0) {
			ca->epoch_start += delta;
			if (after(ca->epoch_start, now))
				ca->epoch_start = now;
		}
		return;
	}
}

/* Calculate the cubic root of x using a table lookup followed by one
 * Newton-Raphson iteration.
 * Avg err ~= 0.195%
 */
static u32 cubic_root(u64 a)
{
	u32 x, b, shift;

	/* cbrt(x) MSB values for x MSB values in [0..63].
	 * Precomputed then refined by hand - Willy Tarreau
	 *
	 * For x in [0..63],
	 *   v = cbrt(x << 18) - 1
	 *   cbrt(x) = (v[x] + 10) >> 6
	 */
	static const u8 v[] = {
		/* 0x00 */    0,   54,   54,   54,  118,  118,  118,  118,
		/* 0x08 */  123,  129,  134,  138,  143,  147,  151,  156,
		/* 0x10 */  157,  161,  164,  168,  170,  173,  176,  179,
		/* 0x18 */  181,  185,  187,  190,  192,  194,  197,  199,
		/* 0x20 */  200,  202,  204,  206,  209,  211,  213,  215,
		/* 0x28 */  217,  219,  221,  222,  224,  225,  227,  229,
		/* 0x30 */  231,  232,  234,  236,  237,  239,  240,  242,
		/* 0x38 */  244,  245,  246,  248,  250,  251,  252,  254,
	};

	b = fls64(a);
	if (b < 7) {
		/* a in [0..63] */
		return ((u32)v[(u32)a] + 35) >> 6;
	}

	b = ((b * 84) >> 8) - 1;
	shift = (a >> (b * 3));

	/* first estimate from the lookup table */
	x = ((u32)(((u32)v[shift] + 10) << b)) >> 6;

	/* Newton-Raphson iteration
	 *                         2
	 * x    = ( 2 * x  +  a / x  ) / 3
	 *  k+1          k         k
	 */
	x = (2 * x + (u32)div64_u64(a, (u64)x * (u64)(x - 1)));
	x = ((x * 341) >> 10);	/* 341/1024 ~= 1/3 */
	return x;
}
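
/* Illustrative checks (not part of the hot path): cubic_root(8) takes
 * the small-input branch, (v[8] + 35) >> 6 = 158 >> 6 = 2.  For
 * cubic_root(64), the table gives the estimate x = 4 and the
 * Newton-Raphson step keeps 4, matching cbrt(64).
 */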

/* Compute congestion window to use. */
static inline void bictcp_update(struct bictcp *ca, u32 cwnd, u32 acked)
{
	u32 delta, bic_target, max_cnt;
	u64 offs, t;

	ca->ack_cnt += acked;	/* count the number of ACKed packets */

	if (ca->last_cwnd == cwnd &&
	    (s32)(tcp_jiffies32 - ca->last_time) <= HZ / 32)	/* 31.25 ms */
		return;

	/* The CUBIC function can update ca->cnt at most once per jiffy.
	 * On all cwnd reduction events, ca->epoch_start is set to 0,
	 * which will force a recalculation of ca->cnt.
	 */
	if (ca->epoch_start && tcp_jiffies32 == ca->last_time)
		goto tcp_friendliness;

	ca->last_cwnd = cwnd;
	ca->last_time = tcp_jiffies32;

	if (ca->epoch_start == 0) {	/* record the beginning of an epoch */
		ca->epoch_start = tcp_jiffies32;
		ca->ack_cnt = acked;	/* start counting */
		ca->tcp_cwnd = cwnd;	/* sync with cubic */

		if (ca->last_max_cwnd <= cwnd) {
			ca->bic_K = 0;
			ca->bic_origin_point = cwnd;
		} else {
			/* Compute new K based on
			 * (wmax - cwnd) * (srtt >> 3 / HZ) / c
			 */
			ca->bic_K = cubic_root(cube_factor
					       * (ca->last_max_cwnd - cwnd));
			ca->bic_origin_point = ca->last_max_cwnd;
		}
	}

	/* cubic function - calc */
	/* calculate c * time^3 / rtt,
	 *  while considering overflow in calculation of time^3
	 * (so time^3 should be less than 2^(64-10-3*BICTCP_HZ))
	 *
	 * Time is based on BICTCP_HZ = 10, i.e. 2^10 units per second.
	 *
	 *  c = bic_scale >> 10
	 *  rtt = 100ms
	 *
	 * the following code has been designed and tested for
	 * cwnd < 1 million packets
	 * RTT < 100 seconds
	 */

	t = (s32)(tcp_jiffies32 - ca->epoch_start);
	t += usecs_to_jiffies(ca->delay_min);
	/* change the unit from HZ to bictcp_HZ */
	t <<= BICTCP_HZ;
	do_div(t, HZ);

	if (t < ca->bic_K)		/* |t - K| */
		offs = ca->bic_K - t;
	else
		offs = t - ca->bic_K;

	/* c/rtt * (t-K)^3 */
	delta = (cube_rtt_scale * offs * offs * offs) >> (10+3*BICTCP_HZ);
	if (t < ca->bic_K)				/* below origin */
		bic_target = ca->bic_origin_point - delta;
	else						/* above origin */
		bic_target = ca->bic_origin_point + delta;

	/* cubic function - calc bictcp_cnt */
	if (bic_target > cwnd) {
		ca->cnt = cwnd / (bic_target - cwnd);
	} else {
		ca->cnt = 100 * cwnd;			/* very small increment */
	}

	/* The initial growth of cubic function may be too conservative
	 * when the available bandwidth is still unknown.
	 */
	if (ca->last_max_cwnd == 0 && ca->cnt > 20)
		ca->cnt = 20;	/* increase cwnd 5% per RTT */

tcp_friendliness:
	/* TCP Friendly: grow an estimated Reno window in parallel and
	 * never be slower than it.
	 */
	if (tcp_friendliness) {
		u32 scale = beta_scale;

		delta = (cwnd * scale) >> 3;
		while (ca->ack_cnt > delta) {		/* update tcp cwnd */
			ca->ack_cnt -= delta;
			ca->tcp_cwnd++;
		}

		if (ca->tcp_cwnd > cwnd) {	/* if bic is slower than tcp */
			delta = ca->tcp_cwnd - cwnd;
			max_cnt = cwnd / delta;
			if (ca->cnt > max_cnt)
				ca->cnt = max_cnt;
		}
	}

	/* The maximum rate of cwnd increase CUBIC allows is 1 packet per
	 * 2 packets ACKed, meaning cwnd grows at most 1.5x per RTT.
	 */
	ca->cnt = max(ca->cnt, 2U);
}
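
/* Worked example (illustrative numbers): with last_max_cwnd = 100 and
 * cwnd = 70 after a beta = 717/1024 (~0.7) reduction, the new epoch
 * starts with bic_origin_point = 100 and
 *   K = cubic_root(cube_factor * 30) / 2^BICTCP_HZ ~= 4.2 seconds,
 * which matches the paper's K = cbrt(W_max * (1 - beta) / C) with
 * C = 0.4.  The window then follows W(t) = C * (t - K)^3 + W_max:
 * concave up to W_max, then convex probing beyond it.
 */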

static void cubictcp_cong_avoid(struct sock *sk, u32 ack, u32 acked)
{
	struct tcp_sock *tp = tcp_sk(sk);
	struct bictcp *ca = inet_csk_ca(sk);

	if (!tcp_is_cwnd_limited(sk))
		return;

	if (tcp_in_slow_start(tp)) {
		acked = tcp_slow_start(tp, acked);
		if (!acked)
			return;
	}
	bictcp_update(ca, tcp_snd_cwnd(tp), acked);
	tcp_cong_avoid_ai(tp, ca->cnt, acked);
}

static u32 cubictcp_recalc_ssthresh(struct sock *sk)
{
	const struct tcp_sock *tp = tcp_sk(sk);
	struct bictcp *ca = inet_csk_ca(sk);

	ca->epoch_start = 0;	/* end of epoch */

	/* Wmax and fast convergence */
	if (tcp_snd_cwnd(tp) < ca->last_max_cwnd && fast_convergence)
		ca->last_max_cwnd = (tcp_snd_cwnd(tp) * (BICTCP_BETA_SCALE + beta))
			/ (2 * BICTCP_BETA_SCALE);
	else
		ca->last_max_cwnd = tcp_snd_cwnd(tp);

	return max((tcp_snd_cwnd(tp) * beta) / BICTCP_BETA_SCALE, 2U);
}
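
/* With the default beta = 717, ssthresh is cut to ~70% of cwnd
 * (717/1024 ~= 0.7).  Under fast convergence, a flow that loses again
 * before regaining its previous W_max remembers a further reduced
 * last_max_cwnd of cwnd * (1024 + 717) / 2048 ~= 0.85 * cwnd,
 * releasing bandwidth to newer flows.
 */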

static void cubictcp_state(struct sock *sk, u8 new_state)
{
	if (new_state == TCP_CA_Loss) {
		bictcp_reset(inet_csk_ca(sk));
		bictcp_hystart_reset(sk);
	}
}

/* Account for TSO/GRO delays.
 * Otherwise short RTT flows could get too small ssthresh, since during
 * slow start we begin with small TSO packets and ca->delay_min would
 * account for long TSO packets at receiver, and long GRO aggregation
 * would add delays on top of that.  Estimate the extra ACK spacing as
 * the time needed to send four full-size GSO packets at the current
 * pacing rate, capped at 1 ms.
 */
static u32 hystart_ack_delay(const struct sock *sk)
{
	unsigned long rate;

	rate = READ_ONCE(sk->sk_pacing_rate);
	if (!rate)
		return 0;
	return min_t(u64, USEC_PER_MSEC,
		     div64_ul((u64)sk->sk_gso_max_size * 4 * USEC_PER_SEC, rate));
}
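
/* Illustrative: at a pacing rate of 1.25e9 bytes/sec (10 Gbit/s) with
 * sk_gso_max_size = 65536, this returns 65536 * 4 * USEC_PER_SEC /
 * 1.25e9 ~= 210 usec; slower flows hit the 1 ms cap.
 */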

static void hystart_update(struct sock *sk, u32 delay)
{
	struct tcp_sock *tp = tcp_sk(sk);
	struct bictcp *ca = inet_csk_ca(sk);
	u32 threshold;

	if (after(tp->snd_una, ca->end_seq))
		bictcp_hystart_reset(sk);

	if (hystart_detect & HYSTART_ACK_TRAIN) {
		u32 now = bictcp_clock_us(sk);

		/* first detection parameter - ack-train detection */
		if ((s32)(now - ca->last_ack) <= hystart_ack_delta_us) {
			ca->last_ack = now;

			threshold = ca->delay_min + hystart_ack_delay(sk);

			/* Hystart ack train triggers if we get ack past
			 * ca->delay_min/2.
			 * Pacing might have delayed packets up to RTT/2
			 * during slow start.
			 */
			if (sk->sk_pacing_status == SK_PACING_NONE)
				threshold >>= 1;

			if ((s32)(now - ca->round_start) > threshold) {
				ca->found = 1;
				pr_debug("hystart_ack_train (%u > %u) delay_min %u (+ ack_delay %u) cwnd %u\n",
					 now - ca->round_start, threshold,
					 ca->delay_min, hystart_ack_delay(sk), tcp_snd_cwnd(tp));
				NET_INC_STATS(sock_net(sk),
					      LINUX_MIB_TCPHYSTARTTRAINDETECT);
				NET_ADD_STATS(sock_net(sk),
					      LINUX_MIB_TCPHYSTARTTRAINCWND,
					      tcp_snd_cwnd(tp));
				tp->snd_ssthresh = tcp_snd_cwnd(tp);
			}
		}
	}

	if (hystart_detect & HYSTART_DELAY) {
		/* obtain the minimum delay of more than sampling packets */
		if (ca->curr_rtt > delay)
			ca->curr_rtt = delay;
		if (ca->sample_cnt < HYSTART_MIN_SAMPLES) {
			ca->sample_cnt++;
		} else {
			if (ca->curr_rtt > ca->delay_min +
			    HYSTART_DELAY_THRESH(ca->delay_min >> 3)) {
				ca->found = 1;
				NET_INC_STATS(sock_net(sk),
					      LINUX_MIB_TCPHYSTARTDELAYDETECT);
				NET_ADD_STATS(sock_net(sk),
					      LINUX_MIB_TCPHYSTARTDELAYCWND,
					      tcp_snd_cwnd(tp));
				tp->snd_ssthresh = tcp_snd_cwnd(tp);
			}
		}
	}
}

static void cubictcp_acked(struct sock *sk, const struct ack_sample *sample)
{
	const struct tcp_sock *tp = tcp_sk(sk);
	struct bictcp *ca = inet_csk_ca(sk);
	u32 delay;

	/* Some calls are for duplicates without timestamps */
	if (sample->rtt_us < 0)
		return;

	/* Discard delay samples right after fast recovery */
	if (ca->epoch_start && (s32)(tcp_jiffies32 - ca->epoch_start) < HZ)
		return;

	delay = sample->rtt_us;
	if (delay == 0)
		delay = 1;

	/* first time call or link delay decreases */
	if (ca->delay_min == 0 || ca->delay_min > delay)
		ca->delay_min = delay;

	/* hystart triggers when cwnd is larger than some threshold */
	if (!ca->found && tcp_in_slow_start(tp) && hystart &&
	    tcp_snd_cwnd(tp) >= hystart_low_window)
		hystart_update(sk, delay);
}

static struct tcp_congestion_ops cubictcp __read_mostly = {
	.init		= cubictcp_init,
	.ssthresh	= cubictcp_recalc_ssthresh,
	.cong_avoid	= cubictcp_cong_avoid,
	.set_state	= cubictcp_state,
	.undo_cwnd	= tcp_reno_undo_cwnd,
	.cwnd_event	= cubictcp_cwnd_event,
	.pkts_acked	= cubictcp_acked,
	.owner		= THIS_MODULE,
	.name		= "cubic",
};
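
/* Expose the cubic callbacks as BTF kfuncs so that BPF struct_ops
 * congestion controls (e.g. a bpf_cubic variant) can call into the
 * native implementation.  The list is guarded to x86 + DYNAMIC_FTRACE,
 * where BPF trampolines for such calls are supported.
 */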

BTF_SET8_START(tcp_cubic_check_kfunc_ids)
#ifdef CONFIG_X86
#ifdef CONFIG_DYNAMIC_FTRACE
BTF_ID_FLAGS(func, cubictcp_init)
BTF_ID_FLAGS(func, cubictcp_recalc_ssthresh)
BTF_ID_FLAGS(func, cubictcp_cong_avoid)
BTF_ID_FLAGS(func, cubictcp_state)
BTF_ID_FLAGS(func, cubictcp_cwnd_event)
BTF_ID_FLAGS(func, cubictcp_acked)
#endif
#endif
BTF_SET8_END(tcp_cubic_check_kfunc_ids)

static const struct btf_kfunc_id_set tcp_cubic_kfunc_set = {
	.owner = THIS_MODULE,
	.set   = &tcp_cubic_check_kfunc_ids,
};

static int __init cubictcp_register(void)
{
	int ret;

	BUILD_BUG_ON(sizeof(struct bictcp) > ICSK_CA_PRIV_SIZE);

	/* Precompute a bunch of the scaling factors that are used per-packet
	 * based on SRTT of 100ms
	 */

	beta_scale = 8*(BICTCP_BETA_SCALE+beta) / 3
		/ (BICTCP_BETA_SCALE - beta);

	cube_rtt_scale = (bic_scale * 10);	/* 1024*c/rtt */

	/* calculate the "K" for (wmax-cwnd) = c/rtt * K^3
	 *  so K = cubic_root( (wmax-cwnd)*rtt/c )
	 * the unit of K is bictcp_HZ=2^10, not HZ
	 *
	 *  c = bic_scale >> 10
	 *  rtt = 100ms
	 *
	 * the following code has been designed and tested for
	 * cwnd < 1 million packets
	 * RTT < 100 seconds
	 */

	/* 1/c * 2^2*bictcp_HZ * srtt */
	cube_factor = 1ull << (10+3*BICTCP_HZ); /* 2^40 */

	/* divide by bic_scale and by constant Srtt (100ms) */
	do_div(cube_factor, bic_scale * 10);

	ret = register_btf_kfunc_id_set(BPF_PROG_TYPE_STRUCT_OPS, &tcp_cubic_kfunc_set);
	if (ret < 0)
		return ret;
	return tcp_register_congestion_control(&cubictcp);
}
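
/* With the module defaults (bic_scale = 41, beta = 717), the constants
 * above come out to (illustrative):
 *   beta_scale     = 8 * 1741 / 3 / 307 = 15
 *   cube_rtt_scale = 410
 *   cube_factor    = 2^40 / 410 ~= 2.68e9
 */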

static void __exit cubictcp_unregister(void)
{
	tcp_unregister_congestion_control(&cubictcp);
}

module_init(cubictcp_register);
module_exit(cubictcp_unregister);

MODULE_AUTHOR("Sangtae Ha, Stephen Hemminger");
MODULE_LICENSE("GPL");
MODULE_DESCRIPTION("CUBIC TCP");
MODULE_VERSION("2.3");