net/ipv4/tcp_nv.c

0001 // SPDX-License-Identifier: GPL-2.0-only
0002 /*
0003  * TCP NV: TCP with Congestion Avoidance
0004  *
0005  * TCP-NV is a successor of TCP-Vegas that has been developed to
0006  * deal with the issues that occur in modern networks.
0007  * Like TCP-Vegas, TCP-NV supports true congestion avoidance,
0008  * the ability to detect congestion before packet losses occur.
0009  * When congestion (queue buildup) starts to occur, TCP-NV
0010  * predicts what the cwnd size should be for the current
0011  * throughput and it reduces the cwnd proportionally to
0012  * the difference between the current cwnd and the predicted cwnd.
0013  *
0014  * NV is only recommended for traffic within a data center, and when
0015  * all the flows are NV (at least those within the data center). This
0016  * is due to the inherent unfairness between flows using losses to
0017  * detect congestion (congestion control) and those that use queue
0018  * buildup to detect congestion (congestion avoidance).
0019  *
0020  * Note: High NIC coalescence values may lower the performance of NV
0021  * due to the increased noise in RTT values. In particular, we have
0022  * seen issues with rx-frames values greater than 8.
0023  *
0024  * TODO:
0025  * 1) Add mechanism to deal with reverse congestion.
0026  */
0027
0028 #include <linux/module.h>
0029 #include <linux/math64.h>
0030 #include <net/tcp.h>
0031 #include <linux/inet_diag.h>
0032
0033 /* TCP NV parameters
0034  *
0035  * nv_pad       Max number of queued packets allowed in network
0036  * nv_pad_buffer    Do not grow cwnd if this close to nv_pad
0037  * nv_reset_period  How often (in seconds) to reset min_rtt
0038  * nv_min_cwnd      Don't decrease cwnd below this if there are no losses
0039  * nv_cong_dec_mult Decrease cwnd by X% (30%) of congestion when detected
0040  * nv_ssthresh_factor   On congestion set ssthresh to this * <desired cwnd> / 8
0041  * nv_rtt_factor    RTT averaging factor
0042  * nv_loss_dec_factor   Decrease cwnd to this (80%) when losses occur
0043  * nv_dec_eval_min_calls    Wait this many RTT measurements before dec cwnd
0044  * nv_inc_eval_min_calls    Wait this many RTT measurements before inc cwnd
0045  * nv_ssthresh_eval_min_calls   Wait this many RTT measurements before stopping
0046  *              slow-start due to congestion
0047  * nv_stop_rtt_cnt  Only grow cwnd for this many RTTs after non-congestion
0048  * nv_rtt_min_cnt   Wait this many RTTs before making congestion decision
0049  * nv_cwnd_growth_rate_neg
0050  * nv_cwnd_growth_rate_pos
0051  *  How quickly to double growth rate (not rate) of cwnd when not
0052  *  congested. One value (nv_cwnd_growth_rate_neg) for when
0053  *  rate < 1 pkt/RTT (after losses). The other (nv_cwnd_growth_rate_pos)
0054  *  otherwise.
0055  */
0056
0057 static int nv_pad __read_mostly = 10;
0058 static int nv_pad_buffer __read_mostly = 2;
0059 static int nv_reset_period __read_mostly = 5; /* in seconds */
0060 static int nv_min_cwnd __read_mostly = 2;
0061 static int nv_cong_dec_mult __read_mostly = 30 * 128 / 100; /* = 30% */
0062 static int nv_ssthresh_factor __read_mostly = 8; /* = 1 */
0063 static int nv_rtt_factor __read_mostly = 128; /* = 1/2*old + 1/2*new */
0064 static int nv_loss_dec_factor __read_mostly = 819; /* => 80% */
0065 static int nv_cwnd_growth_rate_neg __read_mostly = 8;
0066 static int nv_cwnd_growth_rate_pos __read_mostly; /* 0 => fixed like Reno */
0067 static int nv_dec_eval_min_calls __read_mostly = 60;
0068 static int nv_inc_eval_min_calls __read_mostly = 20;
0069 static int nv_ssthresh_eval_min_calls __read_mostly = 30;
0070 static int nv_stop_rtt_cnt __read_mostly = 10;
0071 static int nv_rtt_min_cnt __read_mostly = 2;
0072
0073 module_param(nv_pad, int, 0644);
0074 MODULE_PARM_DESC(nv_pad, "max queued packets allowed in network");
0075 module_param(nv_reset_period, int, 0644);
0076 MODULE_PARM_DESC(nv_reset_period, "nv_min_rtt reset period (secs)");
0077 module_param(nv_min_cwnd, int, 0644);
0078 MODULE_PARM_DESC(nv_min_cwnd, "NV will not decrease cwnd below this value"
0079          " without losses");
0080
0081 /* TCP NV Parameters */
0082 struct tcpnv {
0083     unsigned long nv_min_rtt_reset_jiffies;  /* when to switch to
0084                           * nv_min_rtt_new */
0085     s8  cwnd_growth_factor; /* Current cwnd growth factor,
0086                  * < 0 => less than 1 packet/RTT */
0087     u8  available8;
0088     u16 available16;
0089     u8  nv_allow_cwnd_growth:1, /* whether cwnd can grow */
0090         nv_reset:1,     /* whether to reset values */
0091         nv_catchup:1;       /* whether we are growing because
0092                      * of temporary cwnd decrease */
0093     u8  nv_eval_call_cnt;   /* call count since last eval */
0094     u8  nv_min_cwnd;    /* nv won't make a ca decision if cwnd is
0095                  * smaller than this. It may grow to handle
0096                  * TSO, LRO and interrupt coalescence because
0097                  * with these a small cwnd cannot saturate
0098                  * the link. Note that this is different from
0099                  * the file local nv_min_cwnd */
0100     u8  nv_rtt_cnt;     /* RTTs without making ca decision */;
0101     u32 nv_last_rtt;    /* last rtt */
0102     u32 nv_min_rtt;     /* active min rtt. Used to determine slope */
0103     u32 nv_min_rtt_new; /* min rtt for future use */
0104     u32 nv_base_rtt;        /* If non-zero it represents the threshold for
0105                  * congestion */
0106     u32 nv_lower_bound_rtt; /* Used in conjunction with nv_base_rtt. It is
0107                  * set to 80% of nv_base_rtt. It helps reduce
0108                  * unfairness between flows */
0109     u32 nv_rtt_max_rate;    /* max rate seen during current RTT */
0110     u32 nv_rtt_start_seq;   /* current RTT ends when packet arrives
0111                  * acking beyond nv_rtt_start_seq */
0112     u32 nv_last_snd_una;    /* Previous value of tp->snd_una. It is
0113                  * used to determine bytes acked since last
0114                  * call to bictcp_acked */
0115     u32 nv_no_cong_cnt; /* Consecutive no congestion decisions */
0116 };
0117
/* Sentinel stored in nv_min_rtt / nv_min_rtt_new before any sample */
#define NV_INIT_RTT         U32_MAX
/* Starting value and floor for the per-connection ca->nv_min_cwnd */
#define NV_MIN_CWND         4
/* Step by which ca->nv_min_cwnd is raised in tcpnv_acked() */
#define NV_MIN_CWND_GROW    2
/* ca->nv_min_cwnd is never raised above NV_TSO_CWND_BOUND + 1 */
#define NV_TSO_CWND_BOUND   80
0122
0123 static inline void tcpnv_reset(struct tcpnv *ca, struct sock *sk)
0124 {
0125     struct tcp_sock *tp = tcp_sk(sk);
0126
0127     ca->nv_reset = 0;
0128     ca->nv_no_cong_cnt = 0;
0129     ca->nv_rtt_cnt = 0;
0130     ca->nv_last_rtt = 0;
0131     ca->nv_rtt_max_rate = 0;
0132     ca->nv_rtt_start_seq = tp->snd_una;
0133     ca->nv_eval_call_cnt = 0;
0134     ca->nv_last_snd_una = tp->snd_una;
0135 }
0136
0137 static void tcpnv_init(struct sock *sk)
0138 {
0139     struct tcpnv *ca = inet_csk_ca(sk);
0140     int base_rtt;
0141
0142     tcpnv_reset(ca, sk);
0143
0144     /* See if base_rtt is available from socket_ops bpf program.
0145      * It is meant to be used in environments, such as communication
0146      * within a datacenter, where we have reasonable estimates of
0147      * RTTs
0148      */
0149     base_rtt = tcp_call_bpf(sk, BPF_SOCK_OPS_BASE_RTT, 0, NULL);
0150     if (base_rtt > 0) {
0151         ca->nv_base_rtt = base_rtt;
0152         ca->nv_lower_bound_rtt = (base_rtt * 205) >> 8; /* 80% */
0153     } else {
0154         ca->nv_base_rtt = 0;
0155         ca->nv_lower_bound_rtt = 0;
0156     }
0157
0158     ca->nv_allow_cwnd_growth = 1;
0159     ca->nv_min_rtt_reset_jiffies = jiffies + 2 * HZ;
0160     ca->nv_min_rtt = NV_INIT_RTT;
0161     ca->nv_min_rtt_new = NV_INIT_RTT;
0162     ca->nv_min_cwnd = NV_MIN_CWND;
0163     ca->nv_catchup = 0;
0164     ca->cwnd_growth_factor = 0;
0165 }
0166
0167 /* If provided, apply upper (base_rtt) and lower (lower_bound_rtt)
0168  * bounds to RTT.
0169  */
0170 inline u32 nv_get_bounded_rtt(struct tcpnv *ca, u32 val)
0171 {
0172     if (ca->nv_lower_bound_rtt > 0 && val < ca->nv_lower_bound_rtt)
0173         return ca->nv_lower_bound_rtt;
0174     else if (ca->nv_base_rtt > 0 && val > ca->nv_base_rtt)
0175         return ca->nv_base_rtt;
0176     else
0177         return val;
0178 }
0179
0180 static void tcpnv_cong_avoid(struct sock *sk, u32 ack, u32 acked)
0181 {
0182     struct tcp_sock *tp = tcp_sk(sk);
0183     struct tcpnv *ca = inet_csk_ca(sk);
0184     u32 cnt;
0185
0186     if (!tcp_is_cwnd_limited(sk))
0187         return;
0188
0189     /* Only grow cwnd if NV has not detected congestion */
0190     if (!ca->nv_allow_cwnd_growth)
0191         return;
0192
0193     if (tcp_in_slow_start(tp)) {
0194         acked = tcp_slow_start(tp, acked);
0195         if (!acked)
0196             return;
0197     }
0198
0199     if (ca->cwnd_growth_factor < 0) {
0200         cnt = tcp_snd_cwnd(tp) << -ca->cwnd_growth_factor;
0201         tcp_cong_avoid_ai(tp, cnt, acked);
0202     } else {
0203         cnt = max(4U, tcp_snd_cwnd(tp) >> ca->cwnd_growth_factor);
0204         tcp_cong_avoid_ai(tp, cnt, acked);
0205     }
0206 }
0207
0208 static u32 tcpnv_recalc_ssthresh(struct sock *sk)
0209 {
0210     const struct tcp_sock *tp = tcp_sk(sk);
0211
0212     return max((tcp_snd_cwnd(tp) * nv_loss_dec_factor) >> 10, 2U);
0213 }
0214
0215 static void tcpnv_state(struct sock *sk, u8 new_state)
0216 {
0217     struct tcpnv *ca = inet_csk_ca(sk);
0218
0219     if (new_state == TCP_CA_Open && ca->nv_reset) {
0220         tcpnv_reset(ca, sk);
0221     } else if (new_state == TCP_CA_Loss || new_state == TCP_CA_CWR ||
0222         new_state == TCP_CA_Recovery) {
0223         ca->nv_reset = 1;
0224         ca->nv_allow_cwnd_growth = 0;
0225         if (new_state == TCP_CA_Loss) {
0226             /* Reset cwnd growth factor to Reno value */
0227             if (ca->cwnd_growth_factor > 0)
0228                 ca->cwnd_growth_factor = 0;
0229             /* Decrease growth rate if allowed */
0230             if (nv_cwnd_growth_rate_neg > 0 &&
0231                 ca->cwnd_growth_factor > -8)
0232                 ca->cwnd_growth_factor--;
0233         }
0234     }
0235 }
0236
0237 /* Do congestion avoidance calculations for TCP-NV
0238  */
0239 static void tcpnv_acked(struct sock *sk, const struct ack_sample *sample)
0240 {
0241     const struct inet_connection_sock *icsk = inet_csk(sk);
0242     struct tcp_sock *tp = tcp_sk(sk);
0243     struct tcpnv *ca = inet_csk_ca(sk);
0244     unsigned long now = jiffies;
0245     u64 rate64;
0246     u32 rate, max_win, cwnd_by_slope;
0247     u32 avg_rtt;
0248     u32 bytes_acked = 0;
0249
0250     /* Some calls are for duplicates without timetamps */
0251     if (sample->rtt_us < 0)
0252         return;
0253
0254     /* If not in TCP_CA_Open or TCP_CA_Disorder states, skip. */
0255     if (icsk->icsk_ca_state != TCP_CA_Open &&
0256         icsk->icsk_ca_state != TCP_CA_Disorder)
0257         return;
0258
0259     /* Stop cwnd growth if we were in catch up mode */
0260     if (ca->nv_catchup && tcp_snd_cwnd(tp) >= nv_min_cwnd) {
0261         ca->nv_catchup = 0;
0262         ca->nv_allow_cwnd_growth = 0;
0263     }
0264
0265     bytes_acked = tp->snd_una - ca->nv_last_snd_una;
0266     ca->nv_last_snd_una = tp->snd_una;
0267
0268     if (sample->in_flight == 0)
0269         return;
0270
0271     /* Calculate moving average of RTT */
0272     if (nv_rtt_factor > 0) {
0273         if (ca->nv_last_rtt > 0) {
0274             avg_rtt = (((u64)sample->rtt_us) * nv_rtt_factor +
0275                    ((u64)ca->nv_last_rtt)
0276                    * (256 - nv_rtt_factor)) >> 8;
0277         } else {
0278             avg_rtt = sample->rtt_us;
0279             ca->nv_min_rtt = avg_rtt << 1;
0280         }
0281         ca->nv_last_rtt = avg_rtt;
0282     } else {
0283         avg_rtt = sample->rtt_us;
0284     }
0285
0286     /* rate in 100's bits per second */
0287     rate64 = ((u64)sample->in_flight) * 80000;
0288     do_div(rate64, avg_rtt ?: 1);
0289     rate = (u32)rate64;
0290
0291     /* Remember the maximum rate seen during this RTT
0292      * Note: It may be more than one RTT. This function should be
0293      *       called at least nv_dec_eval_min_calls times.
0294      */
0295     if (ca->nv_rtt_max_rate < rate)
0296         ca->nv_rtt_max_rate = rate;
0297
0298     /* We have valid information, increment counter */
0299     if (ca->nv_eval_call_cnt < 255)
0300         ca->nv_eval_call_cnt++;
0301
0302     /* Apply bounds to rtt. Only used to update min_rtt */
0303     avg_rtt = nv_get_bounded_rtt(ca, avg_rtt);
0304
0305     /* update min rtt if necessary */
0306     if (avg_rtt < ca->nv_min_rtt)
0307         ca->nv_min_rtt = avg_rtt;
0308
0309     /* update future min_rtt if necessary */
0310     if (avg_rtt < ca->nv_min_rtt_new)
0311         ca->nv_min_rtt_new = avg_rtt;
0312
0313     /* nv_min_rtt is updated with the minimum (possibley averaged) rtt
0314      * seen in the last sysctl_tcp_nv_reset_period seconds (i.e. a
0315      * warm reset). This new nv_min_rtt will be continued to be updated
0316      * and be used for another sysctl_tcp_nv_reset_period seconds,
0317      * when it will be updated again.
0318      * In practice we introduce some randomness, so the actual period used
0319      * is chosen randomly from the range:
0320      *   [sysctl_tcp_nv_reset_period*3/4, sysctl_tcp_nv_reset_period*5/4)
0321      */
0322     if (time_after_eq(now, ca->nv_min_rtt_reset_jiffies)) {
0323         unsigned char rand;
0324
0325         ca->nv_min_rtt = ca->nv_min_rtt_new;
0326         ca->nv_min_rtt_new = NV_INIT_RTT;
0327         get_random_bytes(&rand, 1);
0328         ca->nv_min_rtt_reset_jiffies =
0329             now + ((nv_reset_period * (384 + rand) * HZ) >> 9);
0330         /* Every so often we decrease ca->nv_min_cwnd in case previous
0331          *  value is no longer accurate.
0332          */
0333         ca->nv_min_cwnd = max(ca->nv_min_cwnd / 2, NV_MIN_CWND);
0334     }
0335
0336     /* Once per RTT check if we need to do congestion avoidance */
0337     if (before(ca->nv_rtt_start_seq, tp->snd_una)) {
0338         ca->nv_rtt_start_seq = tp->snd_nxt;
0339         if (ca->nv_rtt_cnt < 0xff)
0340             /* Increase counter for RTTs without CA decision */
0341             ca->nv_rtt_cnt++;
0342
0343         /* If this function is only called once within an RTT
0344          * the cwnd is probably too small (in some cases due to
0345          * tso, lro or interrupt coalescence), so we increase
0346          * ca->nv_min_cwnd.
0347          */
0348         if (ca->nv_eval_call_cnt == 1 &&
0349             bytes_acked >= (ca->nv_min_cwnd - 1) * tp->mss_cache &&
0350             ca->nv_min_cwnd < (NV_TSO_CWND_BOUND + 1)) {
0351             ca->nv_min_cwnd = min(ca->nv_min_cwnd
0352                           + NV_MIN_CWND_GROW,
0353                           NV_TSO_CWND_BOUND + 1);
0354             ca->nv_rtt_start_seq = tp->snd_nxt +
0355                 ca->nv_min_cwnd * tp->mss_cache;
0356             ca->nv_eval_call_cnt = 0;
0357             ca->nv_allow_cwnd_growth = 1;
0358             return;
0359         }
0360
0361         /* Find the ideal cwnd for current rate from slope
0362          * slope = 80000.0 * mss / nv_min_rtt
0363          * cwnd_by_slope = nv_rtt_max_rate / slope
0364          */
0365         cwnd_by_slope = (u32)
0366             div64_u64(((u64)ca->nv_rtt_max_rate) * ca->nv_min_rtt,
0367                   80000ULL * tp->mss_cache);
0368         max_win = cwnd_by_slope + nv_pad;
0369
0370         /* If cwnd > max_win, decrease cwnd
0371          * if cwnd < max_win, grow cwnd
0372          * else leave the same
0373          */
0374         if (tcp_snd_cwnd(tp) > max_win) {
0375             /* there is congestion, check that it is ok
0376              * to make a CA decision
0377              * 1. We should have at least nv_dec_eval_min_calls
0378              *    data points before making a CA  decision
0379              * 2. We only make a congesion decision after
0380              *    nv_rtt_min_cnt RTTs
0381              */
0382             if (ca->nv_rtt_cnt < nv_rtt_min_cnt) {
0383                 return;
0384             } else if (tp->snd_ssthresh == TCP_INFINITE_SSTHRESH) {
0385                 if (ca->nv_eval_call_cnt <
0386                     nv_ssthresh_eval_min_calls)
0387                     return;
0388                 /* otherwise we will decrease cwnd */
0389             } else if (ca->nv_eval_call_cnt <
0390                    nv_dec_eval_min_calls) {
0391                 if (ca->nv_allow_cwnd_growth &&
0392                     ca->nv_rtt_cnt > nv_stop_rtt_cnt)
0393                     ca->nv_allow_cwnd_growth = 0;
0394                 return;
0395             }
0396
0397             /* We have enough data to determine we are congested */
0398             ca->nv_allow_cwnd_growth = 0;
0399             tp->snd_ssthresh =
0400                 (nv_ssthresh_factor * max_win) >> 3;
0401             if (tcp_snd_cwnd(tp) - max_win > 2) {
0402                 /* gap > 2, we do exponential cwnd decrease */
0403                 int dec;
0404
0405                 dec = max(2U, ((tcp_snd_cwnd(tp) - max_win) *
0406                            nv_cong_dec_mult) >> 7);
0407                 tcp_snd_cwnd_set(tp, tcp_snd_cwnd(tp) - dec);
0408             } else if (nv_cong_dec_mult > 0) {
0409                 tcp_snd_cwnd_set(tp, max_win);
0410             }
0411             if (ca->cwnd_growth_factor > 0)
0412                 ca->cwnd_growth_factor = 0;
0413             ca->nv_no_cong_cnt = 0;
0414         } else if (tcp_snd_cwnd(tp) <= max_win - nv_pad_buffer) {
0415             /* There is no congestion, grow cwnd if allowed*/
0416             if (ca->nv_eval_call_cnt < nv_inc_eval_min_calls)
0417                 return;
0418
0419             ca->nv_allow_cwnd_growth = 1;
0420             ca->nv_no_cong_cnt++;
0421             if (ca->cwnd_growth_factor < 0 &&
0422                 nv_cwnd_growth_rate_neg > 0 &&
0423                 ca->nv_no_cong_cnt > nv_cwnd_growth_rate_neg) {
0424                 ca->cwnd_growth_factor++;
0425                 ca->nv_no_cong_cnt = 0;
0426             } else if (ca->cwnd_growth_factor >= 0 &&
0427                    nv_cwnd_growth_rate_pos > 0 &&
0428                    ca->nv_no_cong_cnt >
0429                    nv_cwnd_growth_rate_pos) {
0430                 ca->cwnd_growth_factor++;
0431                 ca->nv_no_cong_cnt = 0;
0432             }
0433         } else {
0434             /* cwnd is in-between, so do nothing */
0435             return;
0436         }
0437
0438         /* update state */
0439         ca->nv_eval_call_cnt = 0;
0440         ca->nv_rtt_cnt = 0;
0441         ca->nv_rtt_max_rate = 0;
0442
0443         /* Don't want to make cwnd < nv_min_cwnd
0444          * (it wasn't before, if it is now is because nv
0445          *  decreased it).
0446          */
0447         if (tcp_snd_cwnd(tp) < nv_min_cwnd)
0448             tcp_snd_cwnd_set(tp, nv_min_cwnd);
0449     }
0450 }
0451
0452 /* Extract info for Tcp socket info provided via netlink */
0453 static size_t tcpnv_get_info(struct sock *sk, u32 ext, int *attr,
0454                  union tcp_cc_info *info)
0455 {
0456     const struct tcpnv *ca = inet_csk_ca(sk);
0457
0458     if (ext & (1 << (INET_DIAG_VEGASINFO - 1))) {
0459         info->vegas.tcpv_enabled = 1;
0460         info->vegas.tcpv_rttcnt = ca->nv_rtt_cnt;
0461         info->vegas.tcpv_rtt = ca->nv_last_rtt;
0462         info->vegas.tcpv_minrtt = ca->nv_min_rtt;
0463
0464         *attr = INET_DIAG_VEGASINFO;
0465         return sizeof(struct tcpvegas_info);
0466     }
0467     return 0;
0468 }
0469
0470 static struct tcp_congestion_ops tcpnv __read_mostly = {
0471     .init       = tcpnv_init,
0472     .ssthresh   = tcpnv_recalc_ssthresh,
0473     .cong_avoid = tcpnv_cong_avoid,
0474     .set_state  = tcpnv_state,
0475     .undo_cwnd  = tcp_reno_undo_cwnd,
0476     .pkts_acked     = tcpnv_acked,
0477     .get_info   = tcpnv_get_info,
0478
0479     .owner      = THIS_MODULE,
0480     .name       = "nv",
0481 };
0482
0483 static int __init tcpnv_register(void)
0484 {
0485     BUILD_BUG_ON(sizeof(struct tcpnv) > ICSK_CA_PRIV_SIZE);
0486
0487     return tcp_register_congestion_control(&tcpnv);
0488 }
0489
0490 static void __exit tcpnv_unregister(void)
0491 {
0492     tcp_unregister_congestion_control(&tcpnv);
0493 }
0494
0495 module_init(tcpnv_register);
0496 module_exit(tcpnv_unregister);
0497
0498 MODULE_AUTHOR("Lawrence Brakmo");
0499 MODULE_LICENSE("GPL");
0500 MODULE_DESCRIPTION("TCP NV");
0501 MODULE_VERSION("1.0");