// SPDX-License-Identifier: GPL-2.0-only

/* WARNING: This implementation is not necessarily the same
 * as tcp_cubic.c.  The purpose is mainly for testing
 * the kernel BPF logic.
 *
 * Highlights:
 * 1. CONFIG_HZ .kconfig map is used.
 * 2. In bictcp_update(), calculation is changed to use usec
 *    resolution (i.e. USEC_PER_JIFFY) instead of using jiffies.
 *    Thus, usecs_to_jiffies() is not used in the bpf_cubic.c.
 * 3. In bictcp_update() [under tcp_friendliness], the original
 *    "while (ca->ack_cnt > delta)" loop is changed to the equivalent
 *    "ca->ack_cnt / delta" operation.
 */

#include <linux/bpf.h>
#include <linux/stddef.h>
#include <linux/tcp.h>
#include "bpf_tcp_helpers.h"

char _license[] SEC("license") = "GPL";

#define clamp(val, lo, hi) min((typeof(val))max(val, lo), hi)

#define BICTCP_BETA_SCALE    1024   /* Scale factor for beta calculation
                                     * max_cwnd = snd_cwnd * beta
                                     */
#define BICTCP_HZ       10  /* BIC HZ 2^10 = 1024 */

/* Two methods of hybrid slow start */
#define HYSTART_ACK_TRAIN   0x1
#define HYSTART_DELAY       0x2

/* Number of delay samples for detecting the increase of delay */
#define HYSTART_MIN_SAMPLES 8
#define HYSTART_DELAY_MIN   (4000U)  /* 4 ms */
#define HYSTART_DELAY_MAX   (16000U) /* 16 ms */
#define HYSTART_DELAY_THRESH(x) clamp(x, HYSTART_DELAY_MIN, HYSTART_DELAY_MAX)

static int fast_convergence = 1;
static const int beta = 717;    /* = 717/1024 (BICTCP_BETA_SCALE) */
static int initial_ssthresh;
static const int bic_scale = 41;
static int tcp_friendliness = 1;

static int hystart = 1;
static int hystart_detect = HYSTART_ACK_TRAIN | HYSTART_DELAY;
static int hystart_low_window = 16;
static int hystart_ack_delta_us = 2000;

static const __u32 cube_rtt_scale = (bic_scale * 10);   /* 1024*c/rtt */
static const __u32 beta_scale = 8*(BICTCP_BETA_SCALE+beta) / 3
                / (BICTCP_BETA_SCALE - beta);
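
/* With beta = 717, beta_scale = 8 * (1024 + 717) / 3 / (1024 - 717) = 15, so
 * the TCP-friendliness path in bictcp_update() credits the estimated Reno
 * cwnd with one increment per delta = (cwnd * 15) >> 3 ~= 1.875 * cwnd ACKs.
 */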
/* calculate the "K" for (wmax-cwnd) = c/rtt * K^3
 *  so K = cubic_root( (wmax-cwnd)*rtt/c )
 * the unit of K is bictcp_HZ=2^10, not HZ
 *
 *  c = bic_scale >> 10
 *  rtt = 100ms
 *
 * the following code has been designed and tested for
 * cwnd < 1 million packets
 * RTT < 100 seconds
 * HZ < 1,000,00  (corresponding to 10 nano-second)
 */

/* 1/c * 2^2*bictcp_HZ * srtt, 2^40 */
static const __u64 cube_factor = (__u64)(1ull << (10+3*BICTCP_HZ))
                / (bic_scale * 10);

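/* With bic_scale = 41 and BICTCP_HZ = 10, cube_factor = 2^40 / 410, roughly
 * 2.68e9.  bictcp_update() sets bic_K = cubic_root(cube_factor * (Wmax - cwnd)),
 * which yields K in 2^-BICTCP_HZ second units.
 */
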
/* BIC TCP Parameters */
struct bictcp {
    __u32   cnt;        /* increase cwnd by 1 after this many ACKs */
    __u32   last_max_cwnd;  /* last maximum snd_cwnd */
    __u32   last_cwnd;  /* the last snd_cwnd */
    __u32   last_time;  /* time when updated last_cwnd */
    __u32   bic_origin_point;/* origin point of bic function */
    __u32   bic_K;      /* time to origin point
                           from the beginning of the current epoch */
    __u32   delay_min;  /* min delay (usec) */
    __u32   epoch_start;    /* beginning of an epoch */
    __u32   ack_cnt;    /* number of acks */
    __u32   tcp_cwnd;   /* estimated tcp cwnd */
    __u16   unused;
    __u8    sample_cnt; /* number of samples to decide curr_rtt */
    __u8    found;      /* the exit point is found? */
    __u32   round_start;    /* beginning of each round */
    __u32   end_seq;    /* end_seq of the round */
    __u32   last_ack;   /* last time when the ACK spacing is close */
    __u32   curr_rtt;   /* the minimum rtt of current round */
};

static inline void bictcp_reset(struct bictcp *ca)
{
    ca->cnt = 0;
    ca->last_max_cwnd = 0;
    ca->last_cwnd = 0;
    ca->last_time = 0;
    ca->bic_origin_point = 0;
    ca->bic_K = 0;
    ca->delay_min = 0;
    ca->epoch_start = 0;
    ca->ack_cnt = 0;
    ca->tcp_cwnd = 0;
    ca->found = 0;
}

extern unsigned long CONFIG_HZ __kconfig;
#define HZ CONFIG_HZ
#define USEC_PER_MSEC   1000UL
#define USEC_PER_SEC    1000000UL
#define USEC_PER_JIFFY  (USEC_PER_SEC / HZ)

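/* The helpers below re-implement div64_u64() and fls64() locally; the
 * in-kernel versions are not callable from a BPF program, and plain 64-bit
 * C division is fine here since the BPF instruction set has a 64-bit divide.
 */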
static __always_inline __u64 div64_u64(__u64 dividend, __u64 divisor)
{
    return dividend / divisor;
}

#define div64_ul div64_u64

#define BITS_PER_U64 (sizeof(__u64) * 8)
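/* fls64() returns the 1-based index of the most significant set bit,
 * e.g. fls64(0) == 0, fls64(1) == 1, fls64(1ULL << 63) == 64.
 */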
static __always_inline int fls64(__u64 x)
{
    int num = BITS_PER_U64 - 1;

    if (x == 0)
        return 0;

    if (!(x & (~0ull << (BITS_PER_U64-32)))) {
        num -= 32;
        x <<= 32;
    }
    if (!(x & (~0ull << (BITS_PER_U64-16)))) {
        num -= 16;
        x <<= 16;
    }
    if (!(x & (~0ull << (BITS_PER_U64-8)))) {
        num -= 8;
        x <<= 8;
    }
    if (!(x & (~0ull << (BITS_PER_U64-4)))) {
        num -= 4;
        x <<= 4;
    }
    if (!(x & (~0ull << (BITS_PER_U64-2)))) {
        num -= 2;
        x <<= 2;
    }
    if (!(x & (~0ull << (BITS_PER_U64-1))))
        num -= 1;

    return num + 1;
}

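/* tcp_mstamp is the usec timestamp the kernel keeps for the most recently
 * sent/received packet; hystart below uses it as its "now" clock.
 */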
static __always_inline __u32 bictcp_clock_us(const struct sock *sk)
{
    return tcp_sk(sk)->tcp_mstamp;
}

static __always_inline void bictcp_hystart_reset(struct sock *sk)
{
    struct tcp_sock *tp = tcp_sk(sk);
    struct bictcp *ca = inet_csk_ca(sk);

    ca->round_start = ca->last_ack = bictcp_clock_us(sk);
    ca->end_seq = tp->snd_nxt;
    ca->curr_rtt = ~0U;
    ca->sample_cnt = 0;
}

/* "struct_ops/" prefix is a requirement */
SEC("struct_ops/bpf_cubic_init")
void BPF_PROG(bpf_cubic_init, struct sock *sk)
{
    struct bictcp *ca = inet_csk_ca(sk);

    bictcp_reset(ca);

    if (hystart)
        bictcp_hystart_reset(sk);

    if (!hystart && initial_ssthresh)
        tcp_sk(sk)->snd_ssthresh = initial_ssthresh;
}

/* "struct_ops" prefix is a requirement */
SEC("struct_ops/bpf_cubic_cwnd_event")
void BPF_PROG(bpf_cubic_cwnd_event, struct sock *sk, enum tcp_ca_event event)
{
    if (event == CA_EVENT_TX_START) {
        struct bictcp *ca = inet_csk_ca(sk);
        __u32 now = tcp_jiffies32;
        __s32 delta;

        delta = now - tcp_sk(sk)->lsndtime;

        /* We were application limited (idle) for a while.
         * Shift epoch_start to keep cwnd growth to cubic curve.
         */
        if (ca->epoch_start && delta > 0) {
            ca->epoch_start += delta;
            if (after(ca->epoch_start, now))
                ca->epoch_start = now;
        }
        return;
    }
}

/*
 * cbrt(x) MSB values for x MSB values in [0..63].
 * Precomputed then refined by hand - Willy Tarreau
 *
 * For x in [0..63],
 *   v = cbrt(x << 18) - 1
 *   cbrt(x) = (v[x] + 10) >> 6
 */
static const __u8 v[] = {
    /* 0x00 */    0,   54,   54,   54,  118,  118,  118,  118,
    /* 0x08 */  123,  129,  134,  138,  143,  147,  151,  156,
    /* 0x10 */  157,  161,  164,  168,  170,  173,  176,  179,
    /* 0x18 */  181,  185,  187,  190,  192,  194,  197,  199,
    /* 0x20 */  200,  202,  204,  206,  209,  211,  213,  215,
    /* 0x28 */  217,  219,  221,  222,  224,  225,  227,  229,
    /* 0x30 */  231,  232,  234,  236,  237,  239,  240,  242,
    /* 0x38 */  244,  245,  246,  248,  250,  251,  252,  254,
};

/* calculate the cubic root of x using a table lookup followed by one
 * Newton-Raphson iteration.
 * Avg err ~= 0.195%
 */
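/* Sanity examples: cubic_root(27) == 3 via the small-value table path, and
 * cubic_root(1728) == 12 via the table estimate plus one Newton step.
 */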
static __always_inline __u32 cubic_root(__u64 a)
{
    __u32 x, b, shift;

    if (a < 64) {
        /* a in [0..63] */
        return ((__u32)v[(__u32)a] + 35) >> 6;
    }

    b = fls64(a);
    b = ((b * 84) >> 8) - 1;
    shift = (a >> (b * 3));

    /* needed for the verifier's bounds check on v */
    if (shift >= 64)
        return 0;

    x = ((__u32)(((__u32)v[shift] + 10) << b)) >> 6;

    /*
     * Newton-Raphson iteration
     *                         2
     * x    = ( 2 * x  +  a / x  ) / 3
     *  k+1          k         k
     */
    x = (2 * x + (__u32)div64_u64(a, (__u64)x * (__u64)(x - 1)));
    x = ((x * 341) >> 10);
    return x;
}

/*
 * Compute congestion window to use.
 */
static __always_inline void bictcp_update(struct bictcp *ca, __u32 cwnd,
                      __u32 acked)
{
    __u32 delta, bic_target, max_cnt;
    __u64 offs, t;

    ca->ack_cnt += acked;   /* count the number of ACKed packets */

    if (ca->last_cwnd == cwnd &&
        (__s32)(tcp_jiffies32 - ca->last_time) <= HZ / 32)
        return;

    /* The CUBIC function can update ca->cnt at most once per jiffy.
     * On all cwnd reduction events, ca->epoch_start is set to 0,
     * which will force a recalculation of ca->cnt.
     */
    if (ca->epoch_start && tcp_jiffies32 == ca->last_time)
        goto tcp_friendliness;

    ca->last_cwnd = cwnd;
    ca->last_time = tcp_jiffies32;

    if (ca->epoch_start == 0) {
        ca->epoch_start = tcp_jiffies32;    /* record beginning */
        ca->ack_cnt = acked;            /* start counting */
        ca->tcp_cwnd = cwnd;            /* sync with cubic */

        if (ca->last_max_cwnd <= cwnd) {
            ca->bic_K = 0;
            ca->bic_origin_point = cwnd;
        } else {
            /* Compute new K based on
             * (wmax-cwnd) * (srtt>>3 / HZ) / c * 2^(3*bictcp_HZ)
             */
            ca->bic_K = cubic_root(cube_factor
                           * (ca->last_max_cwnd - cwnd));
            ca->bic_origin_point = ca->last_max_cwnd;
        }
    }

    /* cubic function - calc */
    /* calculate c * time^3 / rtt,
     *  while considering overflow in calculation of time^3
     * (so time^3 is done by using 64 bit)
     * and without the support of division of 64bit numbers
     * (so all divisions are done by using 32 bit)
     *  also NOTE the unit of those variables
     *    time  = (t - K) / 2^bictcp_HZ
     *    c = bic_scale >> 10
     * rtt  = (srtt >> 3) / HZ
     * !!! The following code does not have overflow problems,
     * if the cwnd < 1 million packets !!!
     */

    t = (__s32)(tcp_jiffies32 - ca->epoch_start) * USEC_PER_JIFFY;
    t += ca->delay_min;
    /* change the unit from usec to bictcp_HZ */
    t <<= BICTCP_HZ;
    t /= USEC_PER_SEC;
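    /* t now holds (time since epoch_start + delay_min) in 2^-BICTCP_HZ second
     * units, e.g. 105000 usec -> (105000 << 10) / 1000000 = 107.
     */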

    if (t < ca->bic_K)      /* t - K */
        offs = ca->bic_K - t;
    else
        offs = t - ca->bic_K;

    /* c/rtt * (t-K)^3 */
    delta = (cube_rtt_scale * offs * offs * offs) >> (10+3*BICTCP_HZ);
    if (t < ca->bic_K)                            /* below origin */
        bic_target = ca->bic_origin_point - delta;
    else                                          /* above origin */
        bic_target = ca->bic_origin_point + delta;

    /* cubic function - calc bictcp_cnt */
    if (bic_target > cwnd) {
        ca->cnt = cwnd / (bic_target - cwnd);
    } else {
        ca->cnt = 100 * cwnd;              /* very small increment */
    }

    /*
     * The initial growth of cubic function may be too conservative
     * when the available bandwidth is still unknown.
     */
    if (ca->last_max_cwnd == 0 && ca->cnt > 20)
        ca->cnt = 20;   /* increase cwnd 5% per RTT */

tcp_friendliness:
    /* TCP Friendly */
    if (tcp_friendliness) {
        __u32 scale = beta_scale;
        __u32 n;

        /* update tcp cwnd */
        delta = (cwnd * scale) >> 3;
        if (ca->ack_cnt > delta && delta) {
            n = ca->ack_cnt / delta;
            ca->ack_cnt -= n * delta;
            ca->tcp_cwnd += n;
        }

        if (ca->tcp_cwnd > cwnd) {  /* if bic is slower than tcp */
            delta = ca->tcp_cwnd - cwnd;
            max_cnt = cwnd / delta;
            if (ca->cnt > max_cnt)
                ca->cnt = max_cnt;
        }
    }

    /* The maximum rate of cwnd increase CUBIC allows is 1 packet per
     * 2 packets ACKed, meaning cwnd grows at 1.5x per RTT.
     */
    ca->cnt = max(ca->cnt, 2U);
}

/* Or simply use the BPF_STRUCT_OPS to avoid the SEC boilerplate. */
void BPF_STRUCT_OPS(bpf_cubic_cong_avoid, struct sock *sk, __u32 ack, __u32 acked)
{
    struct tcp_sock *tp = tcp_sk(sk);
    struct bictcp *ca = inet_csk_ca(sk);

    if (!tcp_is_cwnd_limited(sk))
        return;

    if (tcp_in_slow_start(tp)) {
        if (hystart && after(ack, ca->end_seq))
            bictcp_hystart_reset(sk);
        acked = tcp_slow_start(tp, acked);
        if (!acked)
            return;
    }
    bictcp_update(ca, tp->snd_cwnd, acked);
    tcp_cong_avoid_ai(tp, ca->cnt, acked);
}

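/* On a loss event this ssthresh callback shrinks cwnd to
 * beta/BICTCP_BETA_SCALE = 717/1024 ~= 0.7 of its value, and with
 * fast_convergence it remembers last_max_cwnd as
 * (1024 + 717) / 2048 ~= 0.85 * cwnd so competing flows converge faster.
 */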
__u32 BPF_STRUCT_OPS(bpf_cubic_recalc_ssthresh, struct sock *sk)
{
    const struct tcp_sock *tp = tcp_sk(sk);
    struct bictcp *ca = inet_csk_ca(sk);

    ca->epoch_start = 0;    /* end of epoch */

    /* Wmax and fast convergence */
    if (tp->snd_cwnd < ca->last_max_cwnd && fast_convergence)
        ca->last_max_cwnd = (tp->snd_cwnd * (BICTCP_BETA_SCALE + beta))
            / (2 * BICTCP_BETA_SCALE);
    else
        ca->last_max_cwnd = tp->snd_cwnd;

    return max((tp->snd_cwnd * beta) / BICTCP_BETA_SCALE, 2U);
}

void BPF_STRUCT_OPS(bpf_cubic_state, struct sock *sk, __u8 new_state)
{
    if (new_state == TCP_CA_Loss) {
        bictcp_reset(inet_csk_ca(sk));
        bictcp_hystart_reset(sk);
    }
}

#define GSO_MAX_SIZE        65536

/* Account for TSO/GRO delays.
 * Otherwise short RTT flows could get too small ssthresh, since during
 * slow start we begin with small TSO packets and ca->delay_min would
 * not account for long aggregation delay when TSO packets get bigger.
 * Ideally even with a very small RTT we would like to have at least one
 * TSO packet being sent and received by GRO, and another one in qdisc layer.
 * We apply another 100% factor because @rate is doubled at this point.
 * We cap the cushion to 1ms.
 */
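/* For example, at sk_pacing_rate = 10^9 bytes/sec the cushion below is
 * 65536 * 4 * USEC_PER_SEC / rate ~= 262 usec; below roughly 2.6 * 10^8
 * bytes/sec the 1 ms (USEC_PER_MSEC) cap applies.
 */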
static __always_inline __u32 hystart_ack_delay(struct sock *sk)
{
    unsigned long rate;

    rate = sk->sk_pacing_rate;
    if (!rate)
        return 0;
    return min((__u64)USEC_PER_MSEC,
           div64_ul((__u64)GSO_MAX_SIZE * 4 * USEC_PER_SEC, rate));
}

static __always_inline void hystart_update(struct sock *sk, __u32 delay)
{
    struct tcp_sock *tp = tcp_sk(sk);
    struct bictcp *ca = inet_csk_ca(sk);
    __u32 threshold;

    if (hystart_detect & HYSTART_ACK_TRAIN) {
        __u32 now = bictcp_clock_us(sk);

        /* first detection parameter - ack-train detection */
        if ((__s32)(now - ca->last_ack) <= hystart_ack_delta_us) {
            ca->last_ack = now;

            threshold = ca->delay_min + hystart_ack_delay(sk);

            /* Hystart ack train triggers if we get ack past
             * ca->delay_min/2.
             * Pacing might have delayed packets up to RTT/2
             * during slow start.
             */
            if (sk->sk_pacing_status == SK_PACING_NONE)
                threshold >>= 1;

            if ((__s32)(now - ca->round_start) > threshold) {
                ca->found = 1;
                tp->snd_ssthresh = tp->snd_cwnd;
            }
        }
    }

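    /* Worked example: with delay_min = 20000 usec the allowance below is
     * HYSTART_DELAY_THRESH(20000 >> 3) = clamp(2500, 4000, 16000) = 4000,
     * so slow start exits once the round's min RTT exceeds 24000 usec.
     */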
    if (hystart_detect & HYSTART_DELAY) {
        /* track the minimum delay over at least HYSTART_MIN_SAMPLES packets */
        if (ca->curr_rtt > delay)
            ca->curr_rtt = delay;
        if (ca->sample_cnt < HYSTART_MIN_SAMPLES) {
            ca->sample_cnt++;
        } else {
            if (ca->curr_rtt > ca->delay_min +
                HYSTART_DELAY_THRESH(ca->delay_min >> 3)) {
                ca->found = 1;
                tp->snd_ssthresh = tp->snd_cwnd;
            }
        }
    }
}

void BPF_STRUCT_OPS(bpf_cubic_acked, struct sock *sk,
            const struct ack_sample *sample)
{
    const struct tcp_sock *tp = tcp_sk(sk);
    struct bictcp *ca = inet_csk_ca(sk);
    __u32 delay;

    /* Some calls are for duplicates without timestamps */
    if (sample->rtt_us < 0)
        return;

    /* Discard delay samples right after fast recovery */
    if (ca->epoch_start && (__s32)(tcp_jiffies32 - ca->epoch_start) < HZ)
        return;

    delay = sample->rtt_us;
    if (delay == 0)
        delay = 1;

    /* first time call or link delay decreases */
    if (ca->delay_min == 0 || ca->delay_min > delay)
        ca->delay_min = delay;

    /* hystart triggers when cwnd is larger than some threshold */
    if (!ca->found && tcp_in_slow_start(tp) && hystart &&
        tp->snd_cwnd >= hystart_low_window)
        hystart_update(sk, delay);
}

extern __u32 tcp_reno_undo_cwnd(struct sock *sk) __ksym;

__u32 BPF_STRUCT_OPS(bpf_cubic_undo_cwnd, struct sock *sk)
{
    return tcp_reno_undo_cwnd(sk);
}

SEC(".struct_ops")
struct tcp_congestion_ops cubic = {
    .init       = (void *)bpf_cubic_init,
    .ssthresh   = (void *)bpf_cubic_recalc_ssthresh,
    .cong_avoid = (void *)bpf_cubic_cong_avoid,
    .set_state  = (void *)bpf_cubic_state,
    .undo_cwnd  = (void *)bpf_cubic_undo_cwnd,
    .cwnd_event = (void *)bpf_cubic_cwnd_event,
    .pkts_acked = (void *)bpf_cubic_acked,
    .name       = "bpf_cubic",
};
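
/* Userspace attach sketch (illustrative, not part of this file; assumes a
 * bpftool-generated skeleton "bpf_cubic.skel.h" -- the skeleton and map
 * names are assumptions based on the source above):
 *
 *   #include "bpf_cubic.skel.h"
 *
 *   struct bpf_cubic *skel = bpf_cubic__open_and_load();
 *   struct bpf_link *link = bpf_map__attach_struct_ops(skel->maps.cubic);
 *
 * Once registered, the CA can be selected per socket with
 *   setsockopt(fd, IPPROTO_TCP, TCP_CONGESTION, "bpf_cubic", 9);
 */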