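/* BPF (struct_ops) implementation of the CUBIC TCP congestion control
 * algorithm with the HyStart slow-start exit heuristic. ACK-path timing is
 * done in microseconds (tcp_mstamp); cwnd-growth bookkeeping uses
 * tcp_jiffies32 scaled by USEC_PER_JIFFY, with HZ taken from CONFIG_HZ.
 */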
#include <linux/bpf.h>
#include <linux/stddef.h>
#include <linux/tcp.h>
#include "bpf_tcp_helpers.h"

char _license[] SEC("license") = "GPL";

#define clamp(val, lo, hi) min((typeof(val))max(val, lo), hi)

#define BICTCP_BETA_SCALE 1024	/* scale factor for the multiplicative-decrease beta */

#define BICTCP_HZ 10		/* BIC time unit: 2^10 ticks per second */

/* Two detection methods of hybrid slow start (HyStart) */
#define HYSTART_ACK_TRAIN 0x1
#define HYSTART_DELAY 0x2

/* Number of delay samples needed before checking for a delay increase */
#define HYSTART_MIN_SAMPLES 8
#define HYSTART_DELAY_MIN (4000U)	/* 4 ms */
#define HYSTART_DELAY_MAX (16000U)	/* 16 ms */
#define HYSTART_DELAY_THRESH(x) clamp(x, HYSTART_DELAY_MIN, HYSTART_DELAY_MAX)

static int fast_convergence = 1;
static const int beta = 717;	/* = 717/1024, i.e. roughly 0.7 */
static int initial_ssthresh;
static const int bic_scale = 41;
static int tcp_friendliness = 1;

static int hystart = 1;
static int hystart_detect = HYSTART_ACK_TRAIN | HYSTART_DELAY;
static int hystart_low_window = 16;	/* minimum cwnd before HyStart runs */
static int hystart_ack_delta_us = 2000;	/* 2 ms */

/* Precomputed scaling factors used by bictcp_update() */
static const __u32 cube_rtt_scale = (bic_scale * 10);
static const __u32 beta_scale = 8*(BICTCP_BETA_SCALE+beta) / 3
				/ (BICTCP_BETA_SCALE - beta);
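
/* Factor used to compute K, the time for the cubic curve to climb back to
 * Wmax: K = cubic_root(cube_factor * (Wmax - cwnd)), with K expressed in
 * units of 2^-BICTCP_HZ seconds.
 */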
static const __u64 cube_factor = (__u64)(1ull << (10+3*BICTCP_HZ))
				/ (bic_scale * 10);
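
/* BIC/CUBIC per-connection state, stored in inet_csk_ca(sk) */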
struct bictcp {
	__u32 cnt;		/* increase cwnd by 1 after this many ACKs */
	__u32 last_max_cwnd;	/* last maximum snd_cwnd (Wmax) */
	__u32 last_cwnd;	/* the last snd_cwnd */
	__u32 last_time;	/* time when last_cwnd was updated */
	__u32 bic_origin_point;	/* origin point (plateau) of the cubic function */
	__u32 bic_K;		/* time to the origin point from epoch start */

	__u32 delay_min;	/* minimum observed delay (usec) */
	__u32 epoch_start;	/* beginning of the current epoch (jiffies) */
	__u32 ack_cnt;		/* packets ACKed during the current epoch */
	__u32 tcp_cwnd;		/* estimated Reno-equivalent cwnd */
	__u16 unused;
	__u8 sample_cnt;	/* RTT samples taken in the current round */
	__u8 found;		/* slow-start exit point found? */
	__u32 round_start;	/* start of the current HyStart round (usec) */
	__u32 end_seq;		/* snd_nxt at the start of the round */
	__u32 last_ack;		/* last time the ACK spacing was close (usec) */
	__u32 curr_rtt;		/* minimum RTT of the current round (usec) */
};

static inline void bictcp_reset(struct bictcp *ca)
{
	ca->cnt = 0;
	ca->last_max_cwnd = 0;
	ca->last_cwnd = 0;
	ca->last_time = 0;
	ca->bic_origin_point = 0;
	ca->bic_K = 0;
	ca->delay_min = 0;
	ca->epoch_start = 0;
	ca->ack_cnt = 0;
	ca->tcp_cwnd = 0;
	ca->found = 0;
}

/* HZ is resolved from the kernel's CONFIG_HZ via the BPF .kconfig section */
extern unsigned long CONFIG_HZ __kconfig;
#define HZ CONFIG_HZ
#define USEC_PER_MSEC 1000UL
#define USEC_PER_SEC 1000000UL
#define USEC_PER_JIFFY (USEC_PER_SEC / HZ)

static __always_inline __u64 div64_u64(__u64 dividend, __u64 divisor)
{
	return dividend / divisor;
}

#define div64_ul div64_u64

#define BITS_PER_U64 (sizeof(__u64) * 8)

/* fls64(): 1-based index of the most significant set bit; 0 if x == 0 */
static __always_inline int fls64(__u64 x)
{
	int num = BITS_PER_U64 - 1;

	if (x == 0)
		return 0;

	if (!(x & (~0ull << (BITS_PER_U64-32)))) {
		num -= 32;
		x <<= 32;
	}
	if (!(x & (~0ull << (BITS_PER_U64-16)))) {
		num -= 16;
		x <<= 16;
	}
	if (!(x & (~0ull << (BITS_PER_U64-8)))) {
		num -= 8;
		x <<= 8;
	}
	if (!(x & (~0ull << (BITS_PER_U64-4)))) {
		num -= 4;
		x <<= 4;
	}
	if (!(x & (~0ull << (BITS_PER_U64-2)))) {
		num -= 2;
		x <<= 2;
	}
	if (!(x & (~0ull << (BITS_PER_U64-1))))
		num -= 1;

	return num + 1;
}

/* Most recent usec timestamp taken by the TCP stack */
static __always_inline __u32 bictcp_clock_us(const struct sock *sk)
{
	return tcp_sk(sk)->tcp_mstamp;
}

/* Start a new HyStart round: reset ACK-train and per-round RTT sampling */
static __always_inline void bictcp_hystart_reset(struct sock *sk)
{
	struct tcp_sock *tp = tcp_sk(sk);
	struct bictcp *ca = inet_csk_ca(sk);

	ca->round_start = ca->last_ack = bictcp_clock_us(sk);
	ca->end_seq = tp->snd_nxt;
	ca->curr_rtt = ~0U;
	ca->sample_cnt = 0;
}
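
/* .init: set up per-connection state when the socket starts using this
 * congestion control.
 */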
SEC("struct_ops/bpf_cubic_init")
void BPF_PROG(bpf_cubic_init, struct sock *sk)
{
	struct bictcp *ca = inet_csk_ca(sk);

	bictcp_reset(ca);

	if (hystart)
		bictcp_hystart_reset(sk);

	if (!hystart && initial_ssthresh)
		tcp_sk(sk)->snd_ssthresh = initial_ssthresh;
}

SEC("struct_ops/bpf_cubic_cwnd_event")
void BPF_PROG(bpf_cubic_cwnd_event, struct sock *sk, enum tcp_ca_event event)
{
	if (event == CA_EVENT_TX_START) {
		struct bictcp *ca = inet_csk_ca(sk);
		__u32 now = tcp_jiffies32;
		__s32 delta;

		delta = now - tcp_sk(sk)->lsndtime;

		/* The connection was application limited (idle) for a while:
		 * shift epoch_start forward so the idle time does not count
		 * as time spent on the cubic curve.
		 */
		if (ca->epoch_start && delta > 0) {
			ca->epoch_start += delta;
			if (after(ca->epoch_start, now))
				ca->epoch_start = now;
		}
		return;
	}
}

/* Lookup table used by cubic_root() to form the initial cube-root estimate */
static const __u8 v[] = {
	  0,  54,  54,  54, 118, 118, 118, 118,
	123, 129, 134, 138, 143, 147, 151, 156,
	157, 161, 164, 168, 170, 173, 176, 179,
	181, 185, 187, 190, 192, 194, 197, 199,
	200, 202, 204, 206, 209, 211, 213, 215,
	217, 219, 221, 222, 224, 225, 227, 229,
	231, 232, 234, 236, 237, 239, 240, 242,
	244, 245, 246, 248, 250, 251, 252, 254,
};
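
/* Calculate the cubic root of a 64-bit value: table lookup on the leading
 * bits, followed by one Newton-Raphson refinement step.
 */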
static __always_inline __u32 cubic_root(__u64 a)
{
	__u32 x, b, shift;

	if (a < 64) {
		/* small values come straight from the table */
		return ((__u32)v[(__u32)a] + 35) >> 6;
	}

	b = fls64(a);
	b = ((b * 84) >> 8) - 1;
	shift = (a >> (b * 3));

	/* keeps the index into v[] provably bounded for the BPF verifier */
	if (shift >= 64)
		return 0;

	x = ((__u32)(((__u32)v[shift] + 10) << b)) >> 6;

	/* One Newton-Raphson iteration:
	 *   x(k+1) = (2 * x(k) + a / x(k)^2) / 3
	 * where 341/1024 approximates 1/3.
	 */
	x = (2 * x + (__u32)div64_u64(a, (__u64)x * (__u64)(x - 1)));
	x = ((x * 341) >> 10);
	return x;
}
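
/* Core of congestion avoidance: compute ca->cnt, the number of ACKs after
 * which cwnd is increased by one, by comparing cwnd with the cubic target
 * W(t) = origin_point +/- C*|t - K|^3.
 */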
static __always_inline void bictcp_update(struct bictcp *ca, __u32 cwnd,
					  __u32 acked)
{
	__u32 delta, bic_target, max_cnt;
	__u64 offs, t;

	ca->ack_cnt += acked;	/* count the number of ACKed packets */

	if (ca->last_cwnd == cwnd &&
	    (__s32)(tcp_jiffies32 - ca->last_time) <= HZ / 32)
		return;

	/* ca->cnt is recalculated at most once per jiffy while an epoch is
	 * in progress; cwnd reductions clear ca->epoch_start, which forces
	 * a recalculation below.
	 */
	if (ca->epoch_start && tcp_jiffies32 == ca->last_time)
		goto tcp_friendliness;

	ca->last_cwnd = cwnd;
	ca->last_time = tcp_jiffies32;

	if (ca->epoch_start == 0) {
		ca->epoch_start = tcp_jiffies32;	/* record the beginning */
		ca->ack_cnt = acked;			/* start counting */
		ca->tcp_cwnd = cwnd;			/* sync with cubic */

		if (ca->last_max_cwnd <= cwnd) {
			ca->bic_K = 0;
			ca->bic_origin_point = cwnd;
		} else {
			/* K is the time for the cubic curve to climb back
			 * from the current cwnd to last_max_cwnd (Wmax).
			 */
			ca->bic_K = cubic_root(cube_factor
					       * (ca->last_max_cwnd - cwnd));
			ca->bic_origin_point = ca->last_max_cwnd;
		}
	}

	/* Cubic function, in fixed point: t is the time since epoch_start
	 * plus delay_min (so the target is computed one RTT ahead),
	 * converted to units of 2^-BICTCP_HZ seconds.
	 */
	t = (__s32)(tcp_jiffies32 - ca->epoch_start) * USEC_PER_JIFFY;
	t += ca->delay_min;
	/* change the unit from usec to bictcp_HZ */
	t <<= BICTCP_HZ;
	t /= USEC_PER_SEC;

	if (t < ca->bic_K)		/* |t - K| */
		offs = ca->bic_K - t;
	else
		offs = t - ca->bic_K;

	/* delta = C * |t - K|^3, the distance from the origin point */
	delta = (cube_rtt_scale * offs * offs * offs) >> (10+3*BICTCP_HZ);
	if (t < ca->bic_K)		/* below origin */
		bic_target = ca->bic_origin_point - delta;
	else				/* above origin */
		bic_target = ca->bic_origin_point + delta;

	/* derive ca->cnt from the gap between cwnd and the cubic target */
	if (bic_target > cwnd) {
		ca->cnt = cwnd / (bic_target - cwnd);
	} else {
		ca->cnt = 100 * cwnd;	/* very small increment */
	}

	/* The initial growth of the cubic function may be too conservative
	 * when the available bandwidth is still unknown.
	 */
	if (ca->last_max_cwnd == 0 && ca->cnt > 20)
		ca->cnt = 20;	/* increase cwnd by at most 5% per RTT */

tcp_friendliness:
	/* TCP-friendly region: track an estimate of the cwnd Reno would
	 * have, and never grow more slowly than that.
	 */
	if (tcp_friendliness) {
		__u32 scale = beta_scale;
		__u32 n;

		/* update the Reno-equivalent cwnd estimate */
		delta = (cwnd * scale) >> 3;
		if (ca->ack_cnt > delta && delta) {
			n = ca->ack_cnt / delta;
			ca->ack_cnt -= n * delta;
			ca->tcp_cwnd += n;
		}

		if (ca->tcp_cwnd > cwnd) {	/* if bic is slower than tcp */
			delta = ca->tcp_cwnd - cwnd;
			max_cnt = cwnd / delta;
			if (ca->cnt > max_cnt)
				ca->cnt = max_cnt;
		}
	}

	/* cwnd never grows faster than 1 packet per 2 packets ACKed,
	 * i.e. at most 1.5x per RTT.
	 */
	ca->cnt = max(ca->cnt, 2U);
}

void BPF_STRUCT_OPS(bpf_cubic_cong_avoid, struct sock *sk, __u32 ack, __u32 acked)
{
	struct tcp_sock *tp = tcp_sk(sk);
	struct bictcp *ca = inet_csk_ca(sk);

	if (!tcp_is_cwnd_limited(sk))
		return;

	if (tcp_in_slow_start(tp)) {
		if (hystart && after(ack, ca->end_seq))
			bictcp_hystart_reset(sk);
		acked = tcp_slow_start(tp, acked);
		if (!acked)
			return;
	}
	bictcp_update(ca, tp->snd_cwnd, acked);
	tcp_cong_avoid_ai(tp, ca->cnt, acked);
}

__u32 BPF_STRUCT_OPS(bpf_cubic_recalc_ssthresh, struct sock *sk)
{
	const struct tcp_sock *tp = tcp_sk(sk);
	struct bictcp *ca = inet_csk_ca(sk);

	ca->epoch_start = 0;	/* end of epoch */

	/* Wmax and fast convergence: if this loss happened below the
	 * previous Wmax, shrink Wmax to release bandwidth to newer flows.
	 */
	if (tp->snd_cwnd < ca->last_max_cwnd && fast_convergence)
		ca->last_max_cwnd = (tp->snd_cwnd * (BICTCP_BETA_SCALE + beta))
			/ (2 * BICTCP_BETA_SCALE);
	else
		ca->last_max_cwnd = tp->snd_cwnd;

	return max((tp->snd_cwnd * beta) / BICTCP_BETA_SCALE, 2U);
}

void BPF_STRUCT_OPS(bpf_cubic_state, struct sock *sk, __u8 new_state)
{
	if (new_state == TCP_CA_Loss) {
		bictcp_reset(inet_csk_ca(sk));
		bictcp_hystart_reset(sk);
	}
}

#define GSO_MAX_SIZE 65536

/* Account for TSO/GRO aggregation delay: allow for the time it takes to
 * send up to 4 maximum-size GSO packets at the current pacing rate,
 * capped at 1 ms, so short-RTT flows do not exit slow start too early.
 */
static __always_inline __u32 hystart_ack_delay(struct sock *sk)
{
	unsigned long rate;

	rate = sk->sk_pacing_rate;
	if (!rate)
		return 0;
	return min((__u64)USEC_PER_MSEC,
		   div64_ul((__u64)GSO_MAX_SIZE * 4 * USEC_PER_SEC, rate));
}
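
/* HyStart: look for the point to leave slow start, either because the ACK
 * train in this round has grown longer than the threshold, or because the
 * round's minimum RTT has risen noticeably above delay_min.
 */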
static __always_inline void hystart_update(struct sock *sk, __u32 delay)
{
	struct tcp_sock *tp = tcp_sk(sk);
	struct bictcp *ca = inet_csk_ca(sk);
	__u32 threshold;

	if (hystart_detect & HYSTART_ACK_TRAIN) {
		__u32 now = bictcp_clock_us(sk);

		/* first detection parameter - ACK-train detection */
		if ((__s32)(now - ca->last_ack) <= hystart_ack_delta_us) {
			ca->last_ack = now;

			threshold = ca->delay_min + hystart_ack_delay(sk);

			/* Without pacing, ACKs are expected within the first
			 * half of the RTT, so halve the threshold; with
			 * pacing, packets may be spread over the whole RTT.
			 */
			if (sk->sk_pacing_status == SK_PACING_NONE)
				threshold >>= 1;

			if ((__s32)(now - ca->round_start) > threshold) {
				ca->found = 1;
				tp->snd_ssthresh = tp->snd_cwnd;
			}
		}
	}

	if (hystart_detect & HYSTART_DELAY) {
		/* track the minimum RTT observed in this round */
		if (ca->curr_rtt > delay)
			ca->curr_rtt = delay;
		if (ca->sample_cnt < HYSTART_MIN_SAMPLES) {
			ca->sample_cnt++;
		} else {
			/* enough samples: exit slow start if the round's
			 * minimum RTT exceeds delay_min by the threshold
			 */
			if (ca->curr_rtt > ca->delay_min +
			    HYSTART_DELAY_THRESH(ca->delay_min >> 3)) {
				ca->found = 1;
				tp->snd_ssthresh = tp->snd_cwnd;
			}
		}
	}
}
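
/* .pkts_acked: maintain delay_min from RTT samples and run the HyStart
 * checks while still in slow start.
 */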
void BPF_STRUCT_OPS(bpf_cubic_acked, struct sock *sk,
		    const struct ack_sample *sample)
{
	const struct tcp_sock *tp = tcp_sk(sk);
	struct bictcp *ca = inet_csk_ca(sk);
	__u32 delay;

	/* Some calls are for duplicates without timestamps */
	if (sample->rtt_us < 0)
		return;

	/* Discard delay samples right after fast recovery */
	if (ca->epoch_start && (__s32)(tcp_jiffies32 - ca->epoch_start) < HZ)
		return;

	delay = sample->rtt_us;
	if (delay == 0)
		delay = 1;

	/* first call, or the link delay decreased */
	if (ca->delay_min == 0 || ca->delay_min > delay)
		ca->delay_min = delay;

	/* HyStart only runs in slow start, once cwnd is past the low window */
	if (!ca->found && tcp_in_slow_start(tp) && hystart &&
	    tp->snd_cwnd >= hystart_low_window)
		hystart_update(sk, delay);
}

extern __u32 tcp_reno_undo_cwnd(struct sock *sk) __ksym;

__u32 BPF_STRUCT_OPS(bpf_cubic_undo_cwnd, struct sock *sk)
{
	return tcp_reno_undo_cwnd(sk);
}
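
/* struct_ops map registering this congestion control as "bpf_cubic" */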
SEC(".struct_ops")
struct tcp_congestion_ops cubic = {
	.init		= (void *)bpf_cubic_init,
	.ssthresh	= (void *)bpf_cubic_recalc_ssthresh,
	.cong_avoid	= (void *)bpf_cubic_cong_avoid,
	.set_state	= (void *)bpf_cubic_state,
	.undo_cwnd	= (void *)bpf_cubic_undo_cwnd,
	.cwnd_event	= (void *)bpf_cubic_cwnd_event,
	.pkts_acked	= (void *)bpf_cubic_acked,
	.name		= "bpf_cubic",
};