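/* Bottleneck Bandwidth and RTT (BBR) congestion control
 *
 * BBR computes the sending rate from a model of the network path that it
 * builds from the delivery rate (throughput) and round-trip time sampled on
 * each ACK. In a nutshell, on each ACK BBR updates:
 *
 *	bottleneck_bw = windowed_max(delivered / elapsed, bbr_bw_rtts rounds)
 *	min_rtt       = windowed_min(rtt, bbr_min_rtt_win_sec seconds)
 *
 * and then sets:
 *
 *	pacing_rate = pacing_gain * bottleneck_bw
 *	cwnd        = max(cwnd_gain * bottleneck_bw * min_rtt, bbr_cwnd_min_target)
 *
 * The core algorithm does not react directly to packet losses or delays,
 * although it adjusts its cwnd during loss recovery and lowers its rate if it
 * estimates the flow is being rate-limited by a traffic policer.
 *
 * State machine: a flow starts in BBR_STARTUP and ramps up its sending rate
 * quickly; when it estimates the pipe is full it enters BBR_DRAIN to drain the
 * queue it created. In steady state it alternates between BBR_PROBE_BW (gain
 * cycling around the estimated bandwidth) and occasional short BBR_PROBE_RTT
 * episodes that cut inflight to a few packets to re-measure min_rtt.
 *
 * NOTE: BBR depends on pacing: it requests TCP-internal pacing below (via
 * sk_pacing_status) unless a pacing-capable qdisc such as fq is in use.
 */
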
#include <linux/btf.h>
#include <linux/btf_ids.h>
#include <linux/module.h>
#include <net/tcp.h>
#include <linux/inet_diag.h>
#include <linux/inet.h>
#include <linux/random.h>
#include <linux/win_minmax.h>

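/* Bandwidth is tracked internally in packets per microsecond, left-shifted by
 * BW_SCALE (i.e. in units of 1/BW_UNIT pkt/uS) so that integer arithmetic
 * keeps enough precision at low rates. Gains and other dimensionless factors
 * are fixed-point values shifted by BBR_SCALE, so BBR_UNIT represents 1.0.
 */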
#define BW_SCALE 24
#define BW_UNIT (1 << BW_SCALE)

#define BBR_SCALE 8
#define BBR_UNIT (1 << BBR_SCALE)

/* BBR has the following modes for deciding how fast to send: */
enum bbr_mode {
	BBR_STARTUP,	/* ramp up sending rate rapidly to fill pipe */
	BBR_DRAIN,	/* drain any queue created during startup */
	BBR_PROBE_BW,	/* discover, share bw: pace around estimated bw */
	BBR_PROBE_RTT,	/* cut inflight to min to probe min_rtt */
};

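/* BBR congestion control block */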
struct bbr {
	u32	min_rtt_us;		/* min RTT in min_rtt_win_sec window */
	u32	min_rtt_stamp;		/* timestamp of min_rtt_us */
	u32	probe_rtt_done_stamp;	/* end time for BBR_PROBE_RTT mode */
	struct minmax bw;	/* max recent delivery rate in pkts/uS << 24 */
	u32	rtt_cnt;	/* count of packet-timed rounds elapsed */
	u32	next_rtt_delivered;	/* tp->delivered at end of round */
	u64	cycle_mstamp;	/* time of this cycle phase start */
	u32	mode:3,			/* current bbr_mode in state machine */
		prev_ca_state:3,	/* CA state on previous ACK */
		packet_conservation:1,	/* use packet conservation? */
		round_start:1,		/* start of packet-timed tx->ack round? */
		idle_restart:1,		/* restarting after idle? */
		probe_rtt_round_done:1,	/* a BBR_PROBE_RTT round at 4 pkts? */
		unused:13,
		lt_is_sampling:1,	/* taking long-term ("LT") samples now? */
		lt_rtt_cnt:7,		/* round trips in long-term interval */
		lt_use_bw:1;		/* use lt_bw as our bw estimate? */
	u32	lt_bw;			/* LT est delivery rate in pkts/uS << 24 */
	u32	lt_last_delivered;	/* LT interval start: tp->delivered */
	u32	lt_last_stamp;		/* LT interval start time, in ms */
	u32	lt_last_lost;		/* LT interval start: tp->lost */
	u32	pacing_gain:10,	/* current gain for setting pacing rate */
		cwnd_gain:10,	/* current gain for setting cwnd */
		full_bw_reached:1,	/* reached full bw in Startup? */
		full_bw_cnt:2,	/* number of rounds without large bw gains */
		cycle_idx:3,	/* current index in pacing_gain cycle array */
		has_seen_rtt:1,	/* have we seen an RTT sample yet? */
		unused_b:5;
	u32	prior_cwnd;	/* prior cwnd upon entering loss recovery */
	u32	full_bw;	/* recent bw, to estimate if pipe is full */

	/* For tracking ACK aggregation: */
	u64	ack_epoch_mstamp;	/* start of ACK sampling epoch */
	u16	extra_acked[2];		/* max excess data ACKed in epoch */
	u32	ack_epoch_acked:20,	/* packets (S)ACKed in sampling epoch */
		extra_acked_win_rtts:5,	/* age of extra_acked, in round trips */
		extra_acked_win_idx:1,	/* current index in extra_acked array */
		unused_c:6;
};

#define CYCLE_LEN	8	/* number of phases in a pacing gain cycle */

/* Window length of the max bw filter, in rounds: */
static const int bbr_bw_rtts = CYCLE_LEN + 2;
/* Window length of the min_rtt filter, in seconds: */
static const u32 bbr_min_rtt_win_sec = 10;
/* Minimum time (in ms) spent at bbr_cwnd_min_target in BBR_PROBE_RTT mode: */
static const u32 bbr_probe_rtt_mode_ms = 200;
/* Skip TSO batching below the following bandwidth (bits/sec): */
static const int bbr_min_tso_rate = 1200000;

/* Pace at ~1% below the estimated bw, on average, to reduce the queue at the
 * bottleneck: aiming slightly below the estimate helps drive the network
 * toward low queues and low latency while maintaining high utilization.
 */
static const int bbr_pacing_margin_percent = 1;

/* Use a gain of 2/ln(2) in STARTUP: the smallest pacing gain that still lets
 * a paced flow double its delivery rate each round trip, i.e. send as many
 * packets per RTT as an un-paced, slow-starting Reno or CUBIC flow would:
 */
static const int bbr_high_gain = BBR_UNIT * 2885 / 1000 + 1;
/* The drain gain of 1/high_gain typically drains the queue created in
 * BBR_STARTUP within a single round:
 */
static const int bbr_drain_gain = BBR_UNIT * 1000 / 2885;
/* The steady-state cwnd gain of 2 tolerates delayed/stretched ACKs: */
static const int bbr_cwnd_gain = BBR_UNIT * 2;
/* The pacing_gain values for the PROBE_BW gain cycle, to discover/share bw: */
static const int bbr_pacing_gain[] = {
	BBR_UNIT * 5 / 4,	/* probe for more available bw */
	BBR_UNIT * 3 / 4,	/* drain queue and/or yield bw to other flows */
	BBR_UNIT, BBR_UNIT, BBR_UNIT,	/* cruise at 1.0*bw to utilize pipe, */
	BBR_UNIT, BBR_UNIT, BBR_UNIT	/* without creating excess queue */
};
/* Randomize the starting gain cycling phase over N phases: */
static const u32 bbr_cycle_rand = 7;

/* Try to keep at least this many packets in flight, if things go smoothly.
 * With ACK-every-other-packet delayed ACKs, a window of 4 packets still
 * allows two outstanding 2-packet sequences to keep the pipe busy:
 */
static const u32 bbr_cwnd_min_target = 4;

/* If bw has grown by at least this factor (1.25x) in a round, there may be
 * more bw available, so keep probing in STARTUP:
 */
static const u32 bbr_full_bw_thresh = BBR_UNIT * 5 / 4;
/* But after this many rounds without significant bw growth, estimate the
 * pipe is full:
 */
static const u32 bbr_full_bw_cnt = 3;

/* "Long-term" ("LT") bandwidth estimator parameters, used to detect traffic
 * policers. Minimum length of a sampling interval, in round trips:
 */
static const u32 bbr_lt_intvl_min_rtts = 4;
/* If lost/delivered ratio exceeds ~20% (50/256), the interval is "lossy": */
static const u32 bbr_lt_loss_thresh = 50;
/* If two intervals have a bw ratio <= 1/8, their bw is "consistent": */
static const u32 bbr_lt_bw_ratio = BBR_UNIT / 8;
/* If two intervals differ by <= 4 Kbit/sec (500 bytes/sec), bw is "consistent": */
static const u32 bbr_lt_bw_diff = 4000 / 8;
/* If we estimate we're policed, use lt_bw for this many round trips: */
static const u32 bbr_lt_bw_max_rtts = 48;

/* Gain factor for adding extra_acked to target cwnd: */
static const int bbr_extra_acked_gain = BBR_UNIT;
/* Window length of the extra_acked filter, in round trips: */
static const u32 bbr_extra_acked_win_rtts = 5;
/* Max allowed value of ack_epoch_acked, after which the sampling epoch resets: */
static const u32 bbr_ack_epoch_acked_reset_thresh = 1U << 20;
/* Time period for clamping cwnd increments due to ack aggregation: */
static const u32 bbr_extra_acked_max_us = 100 * 1000;

static void bbr_check_probe_rtt_done(struct sock *sk);

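/* Do we estimate that STARTUP filled the pipe? */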
static bool bbr_full_bw_reached(const struct sock *sk)
{
	const struct bbr *bbr = inet_csk_ca(sk);

	return bbr->full_bw_reached;
}

/* Return the windowed max recent bandwidth sample, in pkts/uS << BW_SCALE. */
static u32 bbr_max_bw(const struct sock *sk)
{
	struct bbr *bbr = inet_csk_ca(sk);

	return minmax_get(&bbr->bw);
}

/* Return the estimated bandwidth of the path, in pkts/uS << BW_SCALE. */
static u32 bbr_bw(const struct sock *sk)
{
	struct bbr *bbr = inet_csk_ca(sk);

	return bbr->lt_use_bw ? bbr->lt_bw : bbr_max_bw(sk);
}

/* Return maximum extra acked in past k-2k round trips,
 * where k = bbr_extra_acked_win_rtts.
 */
static u16 bbr_extra_acked(const struct sock *sk)
{
	struct bbr *bbr = inet_csk_ca(sk);

	return max(bbr->extra_acked[0], bbr->extra_acked[1]);
}

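/* Convert a bw sample (pkts/uS << BW_SCALE) and a gain factor to a rate in
 * bytes per second, leaving the small pacing margin. The multiplications and
 * shifts are ordered to keep intermediate values within u64 range for
 * realistic bandwidths and gains.
 */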
static u64 bbr_rate_bytes_per_sec(struct sock *sk, u64 rate, int gain)
{
	unsigned int mss = tcp_sk(sk)->mss_cache;

	rate *= mss;
	rate *= gain;
	rate >>= BBR_SCALE;
	rate *= USEC_PER_SEC / 100 * (100 - bbr_pacing_margin_percent);
	return rate >> BW_SCALE;
}

/* Convert a BBR bw and gain factor to a pacing rate in bytes per second,
 * capped at the socket's maximum pacing rate.
 */
static unsigned long bbr_bw_to_pacing_rate(struct sock *sk, u32 bw, int gain)
{
	u64 rate = bw;

	rate = bbr_rate_bytes_per_sec(sk, rate, gain);
	rate = min_t(u64, rate, sk->sk_max_pacing_rate);
	return rate;
}

/* Initialize pacing rate to: high_gain * init_cwnd / RTT. */
static void bbr_init_pacing_rate_from_rtt(struct sock *sk)
{
	struct tcp_sock *tp = tcp_sk(sk);
	struct bbr *bbr = inet_csk_ca(sk);
	u64 bw;
	u32 rtt_us;

	if (tp->srtt_us) {		/* any RTT sample yet? */
		rtt_us = max(tp->srtt_us >> 3, 1U);
		bbr->has_seen_rtt = 1;
	} else {			/* no RTT sample yet */
		rtt_us = USEC_PER_MSEC;	/* use nominal default RTT */
	}
	bw = (u64)tcp_snd_cwnd(tp) * BW_UNIT;
	do_div(bw, rtt_us);
	sk->sk_pacing_rate = bbr_bw_to_pacing_rate(sk, bw, bbr_high_gain);
}

/* Pace using the current bw estimate and a gain factor. */
static void bbr_set_pacing_rate(struct sock *sk, u32 bw, int gain)
{
	struct tcp_sock *tp = tcp_sk(sk);
	struct bbr *bbr = inet_csk_ca(sk);
	unsigned long rate = bbr_bw_to_pacing_rate(sk, bw, gain);

	if (unlikely(!bbr->has_seen_rtt && tp->srtt_us))
		bbr_init_pacing_rate_from_rtt(sk);
	if (bbr_full_bw_reached(sk) || rate > sk->sk_pacing_rate)
		sk->sk_pacing_rate = rate;
}

/* override sysctl_tcp_min_tso_segs; bbr_min_tso_rate is in bits/sec, so >> 3
 * converts it to bytes/sec for comparison with sk_pacing_rate.
 */
static u32 bbr_min_tso_segs(struct sock *sk)
{
	return sk->sk_pacing_rate < (bbr_min_tso_rate >> 3) ? 1 : 2;
}

static u32 bbr_tso_segs_goal(struct sock *sk)
{
	struct tcp_sock *tp = tcp_sk(sk);
	u32 segs, bytes;

	/* Sort of tcp_tso_autosize() but ignoring
	 * driver provided sk_gso_max_size.
	 */
	bytes = min_t(unsigned long,
		      sk->sk_pacing_rate >> READ_ONCE(sk->sk_pacing_shift),
		      GSO_LEGACY_MAX_SIZE - 1 - MAX_TCP_HEADER);
	segs = max_t(u32, bytes / tp->mss_cache, bbr_min_tso_segs(sk));

	return min(segs, 0x7FU);
}

/* Save "last known good" cwnd so we can restore it after losses or PROBE_RTT */
static void bbr_save_cwnd(struct sock *sk)
{
	struct tcp_sock *tp = tcp_sk(sk);
	struct bbr *bbr = inet_csk_ca(sk);

	if (bbr->prev_ca_state < TCP_CA_Recovery && bbr->mode != BBR_PROBE_RTT)
		bbr->prior_cwnd = tcp_snd_cwnd(tp);  /* this cwnd is good enough */
	else  /* loss recovery or BBR_PROBE_RTT have temporarily cut cwnd */
		bbr->prior_cwnd = max(bbr->prior_cwnd, tcp_snd_cwnd(tp));
}

static void bbr_cwnd_event(struct sock *sk, enum tcp_ca_event event)
{
	struct tcp_sock *tp = tcp_sk(sk);
	struct bbr *bbr = inet_csk_ca(sk);

	if (event == CA_EVENT_TX_START && tp->app_limited) {
		bbr->idle_restart = 1;
		bbr->ack_epoch_mstamp = tp->tcp_mstamp;
		bbr->ack_epoch_acked = 0;
		/* Avoid pointless buffer overflows: pace at the estimated bw
		 * if we don't need more speed (we're restarting from idle and
		 * app-limited).
		 */
		if (bbr->mode == BBR_PROBE_BW)
			bbr_set_pacing_rate(sk, bbr_bw(sk), BBR_UNIT);
		else if (bbr->mode == BBR_PROBE_RTT)
			bbr_check_probe_rtt_done(sk);
	}
}

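/* Calculate the bandwidth-delay product (BDP) from the min RTT and the
 * estimated bottleneck bandwidth:
 *
 *	bdp = ceil(bw * min_rtt * gain)
 *
 * The gain controls the amount of queueing allowed. A smaller gain builds a
 * smaller queue but is more vulnerable to noise in RTT measurements (e.g.
 * delayed ACKs or ACK compression), which can cause under-estimation.
 */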
static u32 bbr_bdp(struct sock *sk, u32 bw, int gain)
{
	struct bbr *bbr = inet_csk_ca(sk);
	u32 bdp;
	u64 w;

	/* If we've never had a valid RTT sample, min_rtt_us is still at its
	 * initial ~0U value (e.g. an RTO cut cwnd before any RTT was taken),
	 * so cap cwnd at the default initial window and slow-start from there.
	 */
	if (unlikely(bbr->min_rtt_us == ~0U))	 /* no valid RTT samples yet? */
		return TCP_INIT_CWND;  /* be safe: cap at default initial cwnd */

	w = (u64)bw * bbr->min_rtt_us;

	/* Apply the gain, remove the BW_SCALE shift, and round the value up
	 * to avoid a negative feedback loop from systematic rounding down.
	 */
	bdp = (((w * gain) >> BBR_SCALE) + BW_UNIT - 1) / BW_UNIT;

	return bdp;
}

/* To achieve full performance in high-speed paths, we budget enough cwnd to
 * fit full-sized skbs in-flight on both end hosts to fully utilize the path:
 *   - one skb in the sending host qdisc,
 *   - one skb in the sending host TSO/GSO engine, and
 *   - one skb being received by the receiver's LRO/GRO/delayed-ACK engine.
 * At low rates this does not bloat cwnd because tso_segs_goal is then 1.
 */
static u32 bbr_quantization_budget(struct sock *sk, u32 cwnd)
{
	struct bbr *bbr = inet_csk_ca(sk);

	/* Allow enough full-sized skbs in flight to utilize end systems. */
	cwnd += 3 * bbr_tso_segs_goal(sk);

	/* Reduce delayed ACKs by rounding up cwnd to the next even number. */
	cwnd = (cwnd + 1) & ~1U;

	/* Ensure gain cycling gets inflight above BDP even for small BDPs. */
	if (bbr->mode == BBR_PROBE_BW && bbr->cycle_idx == 0)
		cwnd += 2;

	return cwnd;
}

/* Find the inflight target based on min RTT and the estimated bottleneck bw. */
static u32 bbr_inflight(struct sock *sk, u32 bw, int gain)
{
	u32 inflight;

	inflight = bbr_bdp(sk, bw, gain);
	inflight = bbr_quantization_budget(sk, inflight);

	return inflight;
}

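/* With departure-time pacing at lower layers (e.g. fq) and TSQ, there is often
 * less data "in the network" than "in flight": several skbs may be queued in
 * the pacing layer with pre-scheduled earliest departure times (EDT). So we
 * estimate how many of our packets will still be in the network at the EDT of
 * the next skb to be scheduled:
 *
 *	in_network_at_edt = inflight_at_edt - (EDT - now) * bw
 *
 * When increasing inflight we want to know whether transmitting the EDT skb
 * will push inflight above the target, so inflight_at_edt includes
 * bbr_tso_segs_goal() for that skb; when decreasing inflight we want to know
 * whether inflight will sink too low once the queued-up skbs drain, so we do
 * not include them.
 */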
static u32 bbr_packets_in_net_at_edt(struct sock *sk, u32 inflight_now)
{
	struct tcp_sock *tp = tcp_sk(sk);
	struct bbr *bbr = inet_csk_ca(sk);
	u64 now_ns, edt_ns, interval_us;
	u32 interval_delivered, inflight_at_edt;

	now_ns = tp->tcp_clock_cache;
	edt_ns = max(tp->tcp_wstamp_ns, now_ns);
	interval_us = div_u64(edt_ns - now_ns, NSEC_PER_USEC);
	interval_delivered = (u64)bbr_bw(sk) * interval_us >> BW_SCALE;
	inflight_at_edt = inflight_now;
	if (bbr->pacing_gain > BBR_UNIT)		/* increasing inflight */
		inflight_at_edt += bbr_tso_segs_goal(sk);  /* include EDT skb */
	if (interval_delivered >= inflight_at_edt)
		return 0;
	return inflight_at_edt - interval_delivered;
}

/* Find the cwnd increment based on the estimate of ACK aggregation. */
static u32 bbr_ack_aggregation_cwnd(struct sock *sk)
{
	u32 max_aggr_cwnd, aggr_cwnd = 0;

	if (bbr_extra_acked_gain && bbr_full_bw_reached(sk)) {
		max_aggr_cwnd = ((u64)bbr_bw(sk) * bbr_extra_acked_max_us)
				/ BW_UNIT;
		aggr_cwnd = (bbr_extra_acked_gain * bbr_extra_acked(sk))
			     >> BBR_SCALE;
		aggr_cwnd = min(aggr_cwnd, max_aggr_cwnd);
	}

	return aggr_cwnd;
}

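/* An optimization in BBR to reduce losses: on the first round of recovery we
 * follow the packet conservation principle: send P packets per P packets
 * acked. After that we slow-start and send at most 2*P packets per P packets
 * acked. After recovery finishes, or upon undo, we restore the cwnd we had
 * when recovery started (capped by the target cwnd based on estimated BDP).
 */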
static bool bbr_set_cwnd_to_recover_or_restore(
	struct sock *sk, const struct rate_sample *rs, u32 acked, u32 *new_cwnd)
{
	struct tcp_sock *tp = tcp_sk(sk);
	struct bbr *bbr = inet_csk_ca(sk);
	u8 prev_state = bbr->prev_ca_state, state = inet_csk(sk)->icsk_ca_state;
	u32 cwnd = tcp_snd_cwnd(tp);

	/* An ACK for P pkts should release at most 2*P packets. We do this
	 * in two steps. First, here we deduct the number of lost packets.
	 * Then, in bbr_set_cwnd() we slow start up toward the target cwnd.
	 */
	if (rs->losses > 0)
		cwnd = max_t(s32, cwnd - rs->losses, 1);

	if (state == TCP_CA_Recovery && prev_state != TCP_CA_Recovery) {
		/* Starting 1st round of Recovery, so do packet conservation. */
		bbr->packet_conservation = 1;
		bbr->next_rtt_delivered = tp->delivered;  /* start round now */
		/* Cut unused cwnd from app behavior, TSQ, or TSO deferral: */
		cwnd = tcp_packets_in_flight(tp) + acked;
	} else if (prev_state >= TCP_CA_Recovery && state < TCP_CA_Recovery) {
		/* Exiting loss recovery; restore cwnd saved before recovery. */
		cwnd = max(cwnd, bbr->prior_cwnd);
		bbr->packet_conservation = 0;
	}
	bbr->prev_ca_state = state;

	if (bbr->packet_conservation) {
		*new_cwnd = max(cwnd, tcp_packets_in_flight(tp) + acked);
		return true;	/* yes, using packet conservation */
	}
	*new_cwnd = cwnd;
	return false;
}

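/* Slow-start up toward target cwnd (if bw estimate is growing, or packet loss
 * has drawn us down below target), or snap down to target if we're above it.
 */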
static void bbr_set_cwnd(struct sock *sk, const struct rate_sample *rs,
			 u32 acked, u32 bw, int gain)
{
	struct tcp_sock *tp = tcp_sk(sk);
	struct bbr *bbr = inet_csk_ca(sk);
	u32 cwnd = tcp_snd_cwnd(tp), target_cwnd = 0;

	if (!acked)
		goto done;  /* no packet fully ACKed; just apply caps */

	if (bbr_set_cwnd_to_recover_or_restore(sk, rs, acked, &cwnd))
		goto done;

	target_cwnd = bbr_bdp(sk, bw, gain);

	/* Increment the cwnd to account for excess ACKed data that seems
	 * due to aggregation (of data and/or ACKs) visible in the ACK stream.
	 */
	target_cwnd += bbr_ack_aggregation_cwnd(sk);
	target_cwnd = bbr_quantization_budget(sk, target_cwnd);

	/* If we're below target cwnd, slow start cwnd toward target cwnd. */
	if (bbr_full_bw_reached(sk))  /* only cut cwnd if we filled the pipe */
		cwnd = min(cwnd + acked, target_cwnd);
	else if (cwnd < target_cwnd || tp->delivered < TCP_INIT_CWND)
		cwnd = cwnd + acked;
	cwnd = max(cwnd, bbr_cwnd_min_target);

done:
	tcp_snd_cwnd_set(tp, min(cwnd, tp->snd_cwnd_clamp));	/* apply global cap */
	if (bbr->mode == BBR_PROBE_RTT)  /* drain queue, refresh min_rtt */
		tcp_snd_cwnd_set(tp, min(tcp_snd_cwnd(tp), bbr_cwnd_min_target));
}

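/* End the current gain-cycle phase if it's time and/or we hit the phase's
 * in-flight target.
 */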
static bool bbr_is_next_cycle_phase(struct sock *sk,
				    const struct rate_sample *rs)
{
	struct tcp_sock *tp = tcp_sk(sk);
	struct bbr *bbr = inet_csk_ca(sk);
	bool is_full_length =
		tcp_stamp_us_delta(tp->delivered_mstamp, bbr->cycle_mstamp) >
		bbr->min_rtt_us;
	u32 inflight, bw;

	/* A pacing_gain of 1.0 paces at the estimated bw to try to fully
	 * use the pipe without increasing the queue.
	 */
	if (bbr->pacing_gain == BBR_UNIT)
		return is_full_length;		/* just use wall clock time */

	inflight = bbr_packets_in_net_at_edt(sk, rs->prior_in_flight);
	bw = bbr_max_bw(sk);

	/* A pacing_gain > 1.0 probes for bw by trying to raise inflight to at
	 * least pacing_gain*BDP; this may take more than min_rtt if min_rtt
	 * is small (e.g. on a LAN). We do not persist if packets are lost,
	 * since a path with small buffers may not hold that much.
	 */
	if (bbr->pacing_gain > BBR_UNIT)
		return is_full_length &&
			(rs->losses ||  /* perhaps pacing_gain*BDP won't fit */
			 inflight >= bbr_inflight(sk, bw, bbr->pacing_gain));

	/* A pacing_gain < 1.0 tries to drain the extra queue we added if bw
	 * probing didn't find more bw. If inflight falls to match BDP then we
	 * estimate the queue is drained; persisting would underutilize the pipe.
	 */
	return is_full_length ||
		inflight <= bbr_inflight(sk, bw, BBR_UNIT);
}

static void bbr_advance_cycle_phase(struct sock *sk)
{
	struct tcp_sock *tp = tcp_sk(sk);
	struct bbr *bbr = inet_csk_ca(sk);

	bbr->cycle_idx = (bbr->cycle_idx + 1) & (CYCLE_LEN - 1);
	bbr->cycle_mstamp = tp->delivered_mstamp;
}

/* Gain cycling: cycle pacing gain to converge to a fair share of available bw. */
static void bbr_update_cycle_phase(struct sock *sk,
				   const struct rate_sample *rs)
{
	struct bbr *bbr = inet_csk_ca(sk);

	if (bbr->mode == BBR_PROBE_BW && bbr_is_next_cycle_phase(sk, rs))
		bbr_advance_cycle_phase(sk);
}

static void bbr_reset_startup_mode(struct sock *sk)
{
	struct bbr *bbr = inet_csk_ca(sk);

	bbr->mode = BBR_STARTUP;
}

static void bbr_reset_probe_bw_mode(struct sock *sk)
{
	struct bbr *bbr = inet_csk_ca(sk);

	bbr->mode = BBR_PROBE_BW;
	bbr->cycle_idx = CYCLE_LEN - 1 - prandom_u32_max(bbr_cycle_rand);
	bbr_advance_cycle_phase(sk);	/* flip to next phase of gain cycle */
}

static void bbr_reset_mode(struct sock *sk)
{
	if (!bbr_full_bw_reached(sk))
		bbr_reset_startup_mode(sk);
	else
		bbr_reset_probe_bw_mode(sk);
}


/* Start a new long-term sampling interval. */
static void bbr_reset_lt_bw_sampling_interval(struct sock *sk)
{
	struct tcp_sock *tp = tcp_sk(sk);
	struct bbr *bbr = inet_csk_ca(sk);

	bbr->lt_last_stamp = div_u64(tp->delivered_mstamp, USEC_PER_MSEC);
	bbr->lt_last_delivered = tp->delivered;
	bbr->lt_last_lost = tp->lost;
	bbr->lt_rtt_cnt = 0;
}

/* Completely reset long-term bandwidth sampling. */
static void bbr_reset_lt_bw_sampling(struct sock *sk)
{
	struct bbr *bbr = inet_csk_ca(sk);

	bbr->lt_bw = 0;
	bbr->lt_use_bw = 0;
	bbr->lt_is_sampling = false;
	bbr_reset_lt_bw_sampling_interval(sk);
}

/* Long-term bw sampling interval is done. Estimate whether we're policed. */
static void bbr_lt_bw_interval_done(struct sock *sk, u32 bw)
{
	struct bbr *bbr = inet_csk_ca(sk);
	u32 diff;

	if (bbr->lt_bw) {  /* do we have bw from a previous interval? */
		/* Is new bw close to the lt_bw from the previous interval? */
		diff = abs(bw - bbr->lt_bw);
		if ((diff * BBR_UNIT <= bbr_lt_bw_ratio * bbr->lt_bw) ||
		    (bbr_rate_bytes_per_sec(sk, diff, BBR_UNIT) <=
		     bbr_lt_bw_diff)) {
			/* All criteria are met; estimate we're policed. */
			bbr->lt_bw = (bw + bbr->lt_bw) >> 1;  /* avg 2 intvls */
			bbr->lt_use_bw = 1;
			bbr->pacing_gain = BBR_UNIT;  /* try to avoid drops */
			bbr->lt_rtt_cnt = 0;
			return;
		}
	}
	bbr->lt_bw = bw;
	bbr_reset_lt_bw_sampling_interval(sk);
}

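/* Token-bucket traffic policers are common. BBR detects token-bucket policers
 * and explicitly models their policed rate, to reduce unnecessary losses. We
 * estimate that we're policed if we see 2 consecutive sampling intervals with
 * consistent throughput and high packet loss. If we think we're being policed,
 * set lt_bw to the "long-term" average delivery rate from those 2 intervals.
 */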
static void bbr_lt_bw_sampling(struct sock *sk, const struct rate_sample *rs)
{
	struct tcp_sock *tp = tcp_sk(sk);
	struct bbr *bbr = inet_csk_ca(sk);
	u32 lost, delivered;
	u64 bw;
	u32 t;

	if (bbr->lt_use_bw) {	/* already using long-term rate, lt_bw? */
		if (bbr->mode == BBR_PROBE_BW && bbr->round_start &&
		    ++bbr->lt_rtt_cnt >= bbr_lt_bw_max_rtts) {
			bbr_reset_lt_bw_sampling(sk);    /* stop using lt_bw */
			bbr_reset_probe_bw_mode(sk);  /* restart gain cycling */
		}
		return;
	}

	/* Wait for the first loss before sampling, to let the policer exhaust
	 * its tokens and estimate the steady-state rate allowed by the policer.
	 * Starting samples earlier includes bursts that over-estimate the bw.
	 */
	if (!bbr->lt_is_sampling) {
		if (!rs->losses)
			return;
		bbr_reset_lt_bw_sampling_interval(sk);
		bbr->lt_is_sampling = true;
	}

	/* To avoid underestimates, reset sampling if we run out of data. */
	if (rs->is_app_limited) {
		bbr_reset_lt_bw_sampling(sk);
		return;
	}

	if (bbr->round_start)
		bbr->lt_rtt_cnt++;	/* count round trips in this interval */
	if (bbr->lt_rtt_cnt < bbr_lt_intvl_min_rtts)
		return;		/* sampling interval needs to be longer */
	if (bbr->lt_rtt_cnt > 4 * bbr_lt_intvl_min_rtts) {
		bbr_reset_lt_bw_sampling(sk);  /* interval is too long */
		return;
	}

	/* End sampling interval when a packet is lost, so we estimate the
	 * policer tokens were exhausted. Stopping the sampling before the
	 * tokens are exhausted under-estimates the policed rate.
	 */
	if (!rs->losses)
		return;

	/* Calculate packets lost and delivered in the sampling interval. */
	lost = tp->lost - bbr->lt_last_lost;
	delivered = tp->delivered - bbr->lt_last_delivered;
	/* Is loss rate (lost/delivered) >= lt_loss_thresh? If not, wait. */
	if (!delivered || (lost << BBR_SCALE) < bbr_lt_loss_thresh * delivered)
		return;

	/* Find the average delivery rate in this sampling interval. */
	t = div_u64(tp->delivered_mstamp, USEC_PER_MSEC) - bbr->lt_last_stamp;
	if ((s32)t < 1)
		return;		/* interval is less than one ms, so wait */
	/* Check if we can multiply without overflow: */
	if (t >= ~0U / USEC_PER_MSEC) {
		bbr_reset_lt_bw_sampling(sk);  /* interval too long; reset */
		return;
	}
	t *= USEC_PER_MSEC;
	bw = (u64)delivered * BW_UNIT;
	do_div(bw, t);
	bbr_lt_bw_interval_done(sk, bw);
}

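/* Estimate the bandwidth based on how fast packets are delivered. */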
static void bbr_update_bw(struct sock *sk, const struct rate_sample *rs)
{
	struct tcp_sock *tp = tcp_sk(sk);
	struct bbr *bbr = inet_csk_ca(sk);
	u64 bw;

	bbr->round_start = 0;
	if (rs->delivered < 0 || rs->interval_us <= 0)
		return; /* Not a valid observation */

	/* See if we've reached the next RTT */
	if (!before(rs->prior_delivered, bbr->next_rtt_delivered)) {
		bbr->next_rtt_delivered = tp->delivered;
		bbr->rtt_cnt++;
		bbr->round_start = 1;
		bbr->packet_conservation = 0;
	}

	bbr_lt_bw_sampling(sk, rs);

	/* Divide delivered by the interval to find a (lower bound) bottleneck
	 * bandwidth sample. Delivered is in packets and interval_us in uS and
	 * the ratio will be <<1 for most connections, so delivered is scaled
	 * up by BW_UNIT first.
	 */
	bw = div64_long((u64)rs->delivered * BW_UNIT, rs->interval_us);

	/* If this sample is application-limited, it is likely to have a very
	 * low delivered count that represents application behavior rather
	 * than the available network rate. Such a sample could drag down the
	 * estimated bw, causing needless slow-down. Thus, to continue to send
	 * at the last measured network rate, we filter out app-limited
	 * samples unless they describe the path bw at least as well as our
	 * bw model.
	 */
	if (!rs->is_app_limited || bw >= bbr_max_bw(sk)) {
		/* Incorporate the new sample into our max bw filter. */
		minmax_running_max(&bbr->bw, bbr_bw_rtts, bbr->rtt_cnt, bw);
	}
}

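/* Estimate the windowed max degree of ACK aggregation. This is used to
 * provision extra in-flight data to keep sending during inter-ACK silences.
 *
 * The degree of aggregation is estimated as the extra data acked beyond what
 * was expected from the bw estimate:
 *
 *	max_extra_acked = "max recent excess data ACKed beyond max_bw * interval"
 *	cwnd += max_extra_acked
 *
 * Max extra_acked is clamped by the cwnd and by bw * bbr_extra_acked_max_us
 * (100 ms). The max filter is an approximate sliding window of 5-10
 * (packet-timed) round trips.
 */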
static void bbr_update_ack_aggregation(struct sock *sk,
				       const struct rate_sample *rs)
{
	u32 epoch_us, expected_acked, extra_acked;
	struct bbr *bbr = inet_csk_ca(sk);
	struct tcp_sock *tp = tcp_sk(sk);

	if (!bbr_extra_acked_gain || rs->acked_sacked <= 0 ||
	    rs->delivered < 0 || rs->interval_us <= 0)
		return;

	if (bbr->round_start) {
		bbr->extra_acked_win_rtts = min(0x1F,
						bbr->extra_acked_win_rtts + 1);
		if (bbr->extra_acked_win_rtts >= bbr_extra_acked_win_rtts) {
			bbr->extra_acked_win_rtts = 0;
			bbr->extra_acked_win_idx = bbr->extra_acked_win_idx ?
						   0 : 1;
			bbr->extra_acked[bbr->extra_acked_win_idx] = 0;
		}
	}

	/* Compute how many packets we expected to be delivered over this epoch. */
	epoch_us = tcp_stamp_us_delta(tp->delivered_mstamp,
				      bbr->ack_epoch_mstamp);
	expected_acked = ((u64)bbr_bw(sk) * epoch_us) / BW_UNIT;

	/* Reset the aggregation epoch if the ACK rate is below the expected
	 * rate, or if a very large number of ACKs has accumulated since the
	 * epoch started (so the epoch is potentially quite old).
	 */
	if (bbr->ack_epoch_acked <= expected_acked ||
	    (bbr->ack_epoch_acked + rs->acked_sacked >=
	     bbr_ack_epoch_acked_reset_thresh)) {
		bbr->ack_epoch_acked = 0;
		bbr->ack_epoch_mstamp = tp->delivered_mstamp;
		expected_acked = 0;
	}

	/* Compute excess data delivered, beyond what was expected. */
	bbr->ack_epoch_acked = min_t(u32, 0xFFFFF,
				     bbr->ack_epoch_acked + rs->acked_sacked);
	extra_acked = bbr->ack_epoch_acked - expected_acked;
	extra_acked = min(extra_acked, tcp_snd_cwnd(tp));
	if (extra_acked > bbr->extra_acked[bbr->extra_acked_win_idx])
		bbr->extra_acked[bbr->extra_acked_win_idx] = extra_acked;
}

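/* Estimate when the pipe is full, using the change in delivery rate: BBR
 * estimates that STARTUP filled the pipe if the estimated bw hasn't grown by
 * at least 25% (bbr_full_bw_thresh) after bbr_full_bw_cnt = 3 non-app-limited
 * rounds. Three rounds give receive-window autotuning and transient
 * cross-traffic or noise a chance to resolve before we conclude the pipe is
 * full.
 */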
static void bbr_check_full_bw_reached(struct sock *sk,
				      const struct rate_sample *rs)
{
	struct bbr *bbr = inet_csk_ca(sk);
	u32 bw_thresh;

	if (bbr_full_bw_reached(sk) || !bbr->round_start || rs->is_app_limited)
		return;

	bw_thresh = (u64)bbr->full_bw * bbr_full_bw_thresh >> BBR_SCALE;
	if (bbr_max_bw(sk) >= bw_thresh) {
		bbr->full_bw = bbr_max_bw(sk);
		bbr->full_bw_cnt = 0;
		return;
	}
	++bbr->full_bw_cnt;
	bbr->full_bw_reached = bbr->full_bw_cnt >= bbr_full_bw_cnt;
}

/* If the pipe is probably full, drain the queue and then enter steady-state. */
static void bbr_check_drain(struct sock *sk, const struct rate_sample *rs)
{
	struct bbr *bbr = inet_csk_ca(sk);

	if (bbr->mode == BBR_STARTUP && bbr_full_bw_reached(sk)) {
		bbr->mode = BBR_DRAIN;	/* drain the queue we created */
		tcp_sk(sk)->snd_ssthresh =
				bbr_inflight(sk, bbr_max_bw(sk), BBR_UNIT);
	}	/* fall through to check if in-flight is already small: */
	if (bbr->mode == BBR_DRAIN &&
	    bbr_packets_in_net_at_edt(sk, tcp_packets_in_flight(tcp_sk(sk))) <=
	    bbr_inflight(sk, bbr_max_bw(sk), BBR_UNIT))
		bbr_reset_probe_bw_mode(sk);  /* we estimate queue is drained */
}

static void bbr_check_probe_rtt_done(struct sock *sk)
{
	struct tcp_sock *tp = tcp_sk(sk);
	struct bbr *bbr = inet_csk_ca(sk);

	if (!(bbr->probe_rtt_done_stamp &&
	      after(tcp_jiffies32, bbr->probe_rtt_done_stamp)))
		return;

	bbr->min_rtt_stamp = tcp_jiffies32;  /* wait a while until PROBE_RTT */
	tcp_snd_cwnd_set(tp, max(tcp_snd_cwnd(tp), bbr->prior_cwnd));
	bbr_reset_mode(sk);
}

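/* The goal of PROBE_RTT mode is to have BBR flows cooperatively and
 * periodically drain the bottleneck queue, to converge to measure the true
 * min_rtt (unloaded propagation delay). This keeps queues small (reducing
 * queuing delay) and packet loss low, and helps flows share bandwidth fairly.
 *
 * The min_rtt filter window is bbr_min_rtt_win_sec = 10 seconds. When the
 * min_rtt estimate expires, we enter PROBE_RTT and cap the cwnd at
 * bbr_cwnd_min_target = 4 packets. After at least bbr_probe_rtt_mode_ms =
 * 200 ms and at least one packet-timed round trip at that small flight size,
 * we leave PROBE_RTT and re-enter the previous mode. The 200 ms dwell time
 * bounds the cost of PROBE_RTT's cwnd capping to roughly 2% (200 ms / 10 s).
 */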
static void bbr_update_min_rtt(struct sock *sk, const struct rate_sample *rs)
{
	struct tcp_sock *tp = tcp_sk(sk);
	struct bbr *bbr = inet_csk_ca(sk);
	bool filter_expired;

	/* Track the min RTT seen in the min_rtt_win_sec filter window: */
	filter_expired = after(tcp_jiffies32,
			       bbr->min_rtt_stamp + bbr_min_rtt_win_sec * HZ);
	if (rs->rtt_us >= 0 &&
	    (rs->rtt_us < bbr->min_rtt_us ||
	     (filter_expired && !rs->is_ack_delayed))) {
		bbr->min_rtt_us = rs->rtt_us;
		bbr->min_rtt_stamp = tcp_jiffies32;
	}

	if (bbr_probe_rtt_mode_ms > 0 && filter_expired &&
	    !bbr->idle_restart && bbr->mode != BBR_PROBE_RTT) {
		bbr->mode = BBR_PROBE_RTT;  /* dip, drain queue */
		bbr_save_cwnd(sk);  /* note cwnd so we can restore it */
		bbr->probe_rtt_done_stamp = 0;
	}

	if (bbr->mode == BBR_PROBE_RTT) {
		/* Ignore low rate samples during this mode. */
		tp->app_limited =
			(tp->delivered + tcp_packets_in_flight(tp)) ? : 1;
		/* Maintain min packets in flight for max(200 ms, 1 round). */
		if (!bbr->probe_rtt_done_stamp &&
		    tcp_packets_in_flight(tp) <= bbr_cwnd_min_target) {
			bbr->probe_rtt_done_stamp = tcp_jiffies32 +
				msecs_to_jiffies(bbr_probe_rtt_mode_ms);
			bbr->probe_rtt_round_done = 0;
			bbr->next_rtt_delivered = tp->delivered;
		} else if (bbr->probe_rtt_done_stamp) {
			if (bbr->round_start)
				bbr->probe_rtt_round_done = 1;
			if (bbr->probe_rtt_round_done)
				bbr_check_probe_rtt_done(sk);
		}
	}
	/* Restart after idle ends only once we process a new S/ACK for data */
	if (rs->delivered > 0)
		bbr->idle_restart = 0;
}

static void bbr_update_gains(struct sock *sk)
{
	struct bbr *bbr = inet_csk_ca(sk);

	switch (bbr->mode) {
	case BBR_STARTUP:
		bbr->pacing_gain = bbr_high_gain;
		bbr->cwnd_gain = bbr_high_gain;
		break;
	case BBR_DRAIN:
		bbr->pacing_gain = bbr_drain_gain;
		bbr->cwnd_gain = bbr_high_gain;
		break;
	case BBR_PROBE_BW:
		bbr->pacing_gain = (bbr->lt_use_bw ?
				    BBR_UNIT :
				    bbr_pacing_gain[bbr->cycle_idx]);
		bbr->cwnd_gain = bbr_cwnd_gain;
		break;
	case BBR_PROBE_RTT:
		bbr->pacing_gain = BBR_UNIT;
		bbr->cwnd_gain = BBR_UNIT;
		break;
	default:
		WARN_ONCE(1, "BBR bad mode: %u\n", bbr->mode);
		break;
	}
}

static void bbr_update_model(struct sock *sk, const struct rate_sample *rs)
{
	bbr_update_bw(sk, rs);
	bbr_update_ack_aggregation(sk, rs);
	bbr_update_cycle_phase(sk, rs);
	bbr_check_full_bw_reached(sk, rs);
	bbr_check_drain(sk, rs);
	bbr_update_min_rtt(sk, rs);
	bbr_update_gains(sk);
}

static void bbr_main(struct sock *sk, const struct rate_sample *rs)
{
	struct bbr *bbr = inet_csk_ca(sk);
	u32 bw;

	bbr_update_model(sk, rs);

	bw = bbr_bw(sk);
	bbr_set_pacing_rate(sk, bw, bbr->pacing_gain);
	bbr_set_cwnd(sk, rs, rs->acked_sacked, bw, bbr->cwnd_gain);
}

static void bbr_init(struct sock *sk)
{
	struct tcp_sock *tp = tcp_sk(sk);
	struct bbr *bbr = inet_csk_ca(sk);

	bbr->prior_cwnd = 0;
	tp->snd_ssthresh = TCP_INFINITE_SSTHRESH;
	bbr->rtt_cnt = 0;
	bbr->next_rtt_delivered = tp->delivered;
	bbr->prev_ca_state = TCP_CA_Open;
	bbr->packet_conservation = 0;

	bbr->probe_rtt_done_stamp = 0;
	bbr->probe_rtt_round_done = 0;
	bbr->min_rtt_us = tcp_min_rtt(tp);
	bbr->min_rtt_stamp = tcp_jiffies32;

	minmax_reset(&bbr->bw, bbr->rtt_cnt, 0);  /* init max bw to 0 */

	bbr->has_seen_rtt = 0;
	bbr_init_pacing_rate_from_rtt(sk);

	bbr->round_start = 0;
	bbr->idle_restart = 0;
	bbr->full_bw_reached = 0;
	bbr->full_bw = 0;
	bbr->full_bw_cnt = 0;
	bbr->cycle_mstamp = 0;
	bbr->cycle_idx = 0;
	bbr_reset_lt_bw_sampling(sk);
	bbr_reset_startup_mode(sk);

	bbr->ack_epoch_mstamp = tp->tcp_mstamp;
	bbr->ack_epoch_acked = 0;
	bbr->extra_acked_win_rtts = 0;
	bbr->extra_acked_win_idx = 0;
	bbr->extra_acked[0] = 0;
	bbr->extra_acked[1] = 0;

	cmpxchg(&sk->sk_pacing_status, SK_PACING_NONE, SK_PACING_NEEDED);
}

static u32 bbr_sndbuf_expand(struct sock *sk)
{
	/* Provision 3 * cwnd since BBR may slow-start even during recovery. */
	return 3;
}

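/* In theory BBR does not need to undo the cwnd, since it does not always
 * reduce cwnd on losses (see bbr_main()). Keep it for now.
 */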
static u32 bbr_undo_cwnd(struct sock *sk)
{
	struct bbr *bbr = inet_csk_ca(sk);

	bbr->full_bw = 0;   /* spurious slow-down; reset full pipe detection */
	bbr->full_bw_cnt = 0;
	bbr_reset_lt_bw_sampling(sk);
	return tcp_snd_cwnd(tcp_sk(sk));
}

/* Entering loss recovery, so save cwnd for when we exit or undo recovery. */
static u32 bbr_ssthresh(struct sock *sk)
{
	bbr_save_cwnd(sk);
	return tcp_sk(sk)->snd_ssthresh;
}

static size_t bbr_get_info(struct sock *sk, u32 ext, int *attr,
			   union tcp_cc_info *info)
{
	if (ext & (1 << (INET_DIAG_BBRINFO - 1)) ||
	    ext & (1 << (INET_DIAG_VEGASINFO - 1))) {
		struct tcp_sock *tp = tcp_sk(sk);
		struct bbr *bbr = inet_csk_ca(sk);
		u64 bw = bbr_bw(sk);

		bw = bw * tp->mss_cache * USEC_PER_SEC >> BW_SCALE;
		memset(&info->bbr, 0, sizeof(info->bbr));
		info->bbr.bbr_bw_lo = (u32)bw;
		info->bbr.bbr_bw_hi = (u32)(bw >> 32);
		info->bbr.bbr_min_rtt = bbr->min_rtt_us;
		info->bbr.bbr_pacing_gain = bbr->pacing_gain;
		info->bbr.bbr_cwnd_gain = bbr->cwnd_gain;
		*attr = INET_DIAG_BBRINFO;
		return sizeof(info->bbr);
	}
	return 0;
}

static void bbr_set_state(struct sock *sk, u8 new_state)
{
	struct bbr *bbr = inet_csk_ca(sk);

	if (new_state == TCP_CA_Loss) {
		struct rate_sample rs = { .losses = 1 };

		bbr->prev_ca_state = TCP_CA_Loss;
		bbr->full_bw = 0;
		bbr->round_start = 1;	/* treat RTO like end of a round */
		bbr_lt_bw_sampling(sk, &rs);
	}
}

static struct tcp_congestion_ops tcp_bbr_cong_ops __read_mostly = {
	.flags		= TCP_CONG_NON_RESTRICTED,
	.name		= "bbr",
	.owner		= THIS_MODULE,
	.init		= bbr_init,
	.cong_control	= bbr_main,
	.sndbuf_expand	= bbr_sndbuf_expand,
	.undo_cwnd	= bbr_undo_cwnd,
	.cwnd_event	= bbr_cwnd_event,
	.ssthresh	= bbr_ssthresh,
	.min_tso_segs	= bbr_min_tso_segs,
	.get_info	= bbr_get_info,
	.set_state	= bbr_set_state,
};

BTF_SET8_START(tcp_bbr_check_kfunc_ids)
#ifdef CONFIG_X86
#ifdef CONFIG_DYNAMIC_FTRACE
BTF_ID_FLAGS(func, bbr_init)
BTF_ID_FLAGS(func, bbr_main)
BTF_ID_FLAGS(func, bbr_sndbuf_expand)
BTF_ID_FLAGS(func, bbr_undo_cwnd)
BTF_ID_FLAGS(func, bbr_cwnd_event)
BTF_ID_FLAGS(func, bbr_ssthresh)
BTF_ID_FLAGS(func, bbr_min_tso_segs)
BTF_ID_FLAGS(func, bbr_set_state)
#endif
#endif
BTF_SET8_END(tcp_bbr_check_kfunc_ids)

static const struct btf_kfunc_id_set tcp_bbr_kfunc_set = {
	.owner = THIS_MODULE,
	.set   = &tcp_bbr_check_kfunc_ids,
};

static int __init bbr_register(void)
{
	int ret;

	BUILD_BUG_ON(sizeof(struct bbr) > ICSK_CA_PRIV_SIZE);

	ret = register_btf_kfunc_id_set(BPF_PROG_TYPE_STRUCT_OPS, &tcp_bbr_kfunc_set);
	if (ret < 0)
		return ret;
	return tcp_register_congestion_control(&tcp_bbr_cong_ops);
}

static void __exit bbr_unregister(void)
{
	tcp_unregister_congestion_control(&tcp_bbr_cong_ops);
}

module_init(bbr_register);
module_exit(bbr_unregister);

MODULE_AUTHOR("Van Jacobson <vanj@google.com>");
MODULE_AUTHOR("Neal Cardwell <ncardwell@google.com>");
MODULE_AUTHOR("Yuchung Cheng <ycheng@google.com>");
MODULE_AUTHOR("Soheil Hassas Yeganeh <soheil@google.com>");
MODULE_LICENSE("Dual BSD/GPL");
MODULE_DESCRIPTION("TCP BBR (Bottleneck Bandwidth and RTT)");