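/*
 * CAIA Delay-Gradient (CDG) congestion control
 *
 * CDG infers congestion from the gradients of the per-round minimum and
 * maximum RTT, and backs off probabilistically before losses occur.  The
 * algorithm is described in:
 *
 *   D.A. Hayes and G. Armitage. "Revisiting TCP congestion control using
 *   delay gradients." IFIP Networking, 2011.
 *
 * Coexistence heuristics (shadow window and ineffectual backoff detection)
 * can be disabled with use_shadow=0 and use_ineff=0 for scavenger-style
 * (less-than-best-effort) traffic.  The remaining knobs are described with
 * their module parameter declarations below.
 */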
#include <linux/kernel.h>
#include <linux/random.h>
#include <linux/module.h>
#include <linux/sched/clock.h>

#include <net/tcp.h>

#define HYSTART_ACK_TRAIN	1
#define HYSTART_DELAY		2

static int window __read_mostly = 8;
static unsigned int backoff_beta __read_mostly = 0.7071 * 1024; /* sqrt 0.5 */
static unsigned int backoff_factor __read_mostly = 42;
static unsigned int hystart_detect __read_mostly = 3;
static unsigned int use_ineff __read_mostly = 5;
static bool use_shadow __read_mostly = true;
static bool use_tolerance __read_mostly;

module_param(window, int, 0444);
MODULE_PARM_DESC(window, "gradient window size (power of two <= 256)");
module_param(backoff_beta, uint, 0644);
MODULE_PARM_DESC(backoff_beta, "backoff beta (0-1024)");
module_param(backoff_factor, uint, 0644);
MODULE_PARM_DESC(backoff_factor, "backoff probability scale factor");
module_param(hystart_detect, uint, 0644);
MODULE_PARM_DESC(hystart_detect, "use Hybrid Slow start "
		 "(0: disabled, 1: ACK train, 2: delay threshold, 3: both)");
module_param(use_ineff, uint, 0644);
MODULE_PARM_DESC(use_ineff, "use ineffectual backoff detection (threshold)");
module_param(use_shadow, bool, 0644);
MODULE_PARM_DESC(use_shadow, "use shadow window heuristic");
module_param(use_tolerance, bool, 0644);
MODULE_PARM_DESC(use_tolerance, "use loss tolerance heuristic");

struct cdg_minmax {
	union {
		struct {
			s32 min;
			s32 max;
		};
		u64 v64;
	};
};

enum cdg_state {
	CDG_UNKNOWN = 0,
	CDG_NONFULL = 1,
	CDG_FULL = 2,
	CDG_BACKOFF = 3,
};

struct cdg {
	struct cdg_minmax rtt;
	struct cdg_minmax rtt_prev;
	struct cdg_minmax *gradients;
	struct cdg_minmax gsum;
	bool gfilled;
	u8 tail;
	u8 state;
	u8 delack;
	u32 rtt_seq;
	u32 shadow_wnd;
	u16 backoff_cnt;
	u16 sample_cnt;
	s32 delay_min;
	u32 last_ack;
	u32 round_start;
};

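/**
 * nexp_u32 - negative base-e exponential
 * @ux: x in units of micro
 *
 * Returns exp(ux * -1e-6) * U32_MAX.
 */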
static u32 __pure nexp_u32(u32 ux)
{
	static const u16 v[] = {
		/* exp(-x)*65536-1 for x = 0, 0.000256, 0.000512, ... */
		65535,
		65518, 65501, 65468, 65401, 65267, 65001, 64470, 63422,
		61378, 57484, 50423, 38795, 22965, 8047,  987,   14,
	};
	u32 msb = ux >> 8;
	u32 res;
	int i;

	/* Cut off when ux >= 2^24 (actual result is <= 222/U32_MAX). */
	if (msb > U16_MAX)
		return 0;

	/* Scale the first eight bits linearly: */
	res = U32_MAX - (ux & 0xff) * (U32_MAX / 1000000);

	/* Obtain e^-(x + y + ...) by computing e^-x * e^-y * ...,
	 * one table entry per set bit:
	 */
	for (i = 1; msb; i++, msb >>= 1) {
		u32 y = v[i & -(msb & 1)] + U32_C(1);

		res = ((u64)res * y) >> 16;
	}

	return res;
}

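/* Hybrid Slow start (HyStart), as in tcp_cubic: leave slow start early when
 * either (a) a train of closely spaced ACKs spans more than half of the
 * minimum observed delay, or (b) the per-round minimum RTT exceeds delay_min
 * by more than 1/8.
 */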
static void tcp_cdg_hystart_update(struct sock *sk)
{
	struct cdg *ca = inet_csk_ca(sk);
	struct tcp_sock *tp = tcp_sk(sk);

	ca->delay_min = min_not_zero(ca->delay_min, ca->rtt.min);
	if (ca->delay_min == 0)
		return;

	if (hystart_detect & HYSTART_ACK_TRAIN) {
		u32 now_us = tp->tcp_mstamp;

		if (ca->last_ack == 0 || !tcp_is_cwnd_limited(sk)) {
			ca->last_ack = now_us;
			ca->round_start = now_us;
		} else if (before(now_us, ca->last_ack + 3000)) {
			u32 base_owd = max(ca->delay_min / 2U, 125U);

			ca->last_ack = now_us;
			if (after(now_us, ca->round_start + base_owd)) {
				NET_INC_STATS(sock_net(sk),
					      LINUX_MIB_TCPHYSTARTTRAINDETECT);
				NET_ADD_STATS(sock_net(sk),
					      LINUX_MIB_TCPHYSTARTTRAINCWND,
					      tcp_snd_cwnd(tp));
				tp->snd_ssthresh = tcp_snd_cwnd(tp);
				return;
			}
		}
	}

	if (hystart_detect & HYSTART_DELAY) {
		if (ca->sample_cnt < 8) {
			ca->sample_cnt++;
		} else {
			s32 thresh = max(ca->delay_min + ca->delay_min / 8U,
					 125U);

			if (ca->rtt.min > thresh) {
				NET_INC_STATS(sock_net(sk),
					      LINUX_MIB_TCPHYSTARTDELAYDETECT);
				NET_ADD_STATS(sock_net(sk),
					      LINUX_MIB_TCPHYSTARTDELAYCWND,
					      tcp_snd_cwnd(tp));
				tp->snd_ssthresh = tcp_snd_cwnd(tp);
			}
		}
	}
}

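/* Compute the smoothed delay gradient: the change in the per-round minimum
 * and maximum RTT, summed over a moving window of the last 'window' rounds.
 */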
static s32 tcp_cdg_grad(struct cdg *ca)
{
	s32 gmin = ca->rtt.min - ca->rtt_prev.min;
	s32 gmax = ca->rtt.max - ca->rtt_prev.max;
	s32 grad;

	if (ca->gradients) {
		ca->gsum.min += gmin - ca->gradients[ca->tail].min;
		ca->gsum.max += gmax - ca->gradients[ca->tail].max;
		ca->gradients[ca->tail].min = gmin;
		ca->gradients[ca->tail].max = gmax;
		ca->tail = (ca->tail + 1) & (window - 1);
		gmin = ca->gsum.min;
		gmax = ca->gsum.max;
	}

	/* An increase in the minimum RTT signals a growing queue; fall back
	 * to the maximum gradient when the minimum does not increase.
	 */
	grad = gmin > 0 ? gmin : gmax;

	/* Extrapolate missing values in the gradient window: */
	if (!ca->gfilled) {
		if (!ca->gradients && window > 1)
			grad *= window; /* Memory allocation failed. */
		else if (ca->tail == 0)
			ca->gfilled = true;
		else
			grad = (grad * window) / (int)ca->tail;
	}

	/* Backoff was effectual: */
	if (gmin <= -32 || gmax <= -32)
		ca->backoff_cnt = 0;

	if (use_tolerance) {
		/* Reduce small variations to zero: */
		gmin = DIV_ROUND_CLOSEST(gmin, 64);
		gmax = DIV_ROUND_CLOSEST(gmax, 64);

		if (gmin > 0 && gmax <= 0)
			ca->state = CDG_FULL;
		else if ((gmin > 0 && gmax > 0) || gmax < 0)
			ca->state = CDG_NONFULL;
	}
	return grad;
}

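/* Back off with probability 1 - exp(-grad * backoff_factor / 1e6): the larger
 * the positive delay gradient, the more likely we are to reduce cwnd.
 */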
static bool tcp_cdg_backoff(struct sock *sk, u32 grad)
{
	struct cdg *ca = inet_csk_ca(sk);
	struct tcp_sock *tp = tcp_sk(sk);

	if (prandom_u32() <= nexp_u32(grad * backoff_factor))
		return false;

	if (use_ineff) {
		/* Ineffectual backoff detection: stop backing off if repeated
		 * backoffs have not reduced the delay (the counter is reset
		 * in tcp_cdg_grad() once the gradient turns negative).
		 */
		ca->backoff_cnt++;
		if (ca->backoff_cnt > use_ineff)
			return false;
	}

	ca->shadow_wnd = max(ca->shadow_wnd, tcp_snd_cwnd(tp));
	ca->state = CDG_BACKOFF;
	tcp_enter_cwr(sk);
	return true;
}

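/* Once per RTT round: compute the delay gradient and possibly back off;
 * otherwise grow cwnd Reno-style and mirror the increase in the shadow
 * window.
 */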
static void tcp_cdg_cong_avoid(struct sock *sk, u32 ack, u32 acked)
{
	struct cdg *ca = inet_csk_ca(sk);
	struct tcp_sock *tp = tcp_sk(sk);
	u32 prior_snd_cwnd;
	u32 incr;

	if (tcp_in_slow_start(tp) && hystart_detect)
		tcp_cdg_hystart_update(sk);

	if (after(ack, ca->rtt_seq) && ca->rtt.v64) {
		s32 grad = 0;

		if (ca->rtt_prev.v64)
			grad = tcp_cdg_grad(ca);
		ca->rtt_seq = tp->snd_nxt;
		ca->rtt_prev = ca->rtt;
		ca->rtt.v64 = 0;
		ca->last_ack = 0;
		ca->sample_cnt = 0;

		if (grad > 0 && tcp_cdg_backoff(sk, grad))
			return;
	}

	if (!tcp_is_cwnd_limited(sk)) {
		ca->shadow_wnd = min(ca->shadow_wnd, tcp_snd_cwnd(tp));
		return;
	}

	prior_snd_cwnd = tcp_snd_cwnd(tp);
	tcp_reno_cong_avoid(sk, ack, acked);

	incr = tcp_snd_cwnd(tp) - prior_snd_cwnd;
	ca->shadow_wnd = max(ca->shadow_wnd, ca->shadow_wnd + incr);
}

static void tcp_cdg_acked(struct sock *sk, const struct ack_sample *sample)
{
	struct cdg *ca = inet_csk_ca(sk);
	struct tcp_sock *tp = tcp_sk(sk);

	if (sample->rtt_us <= 0)
		return;

	/* Heuristic for filtering delayed ACKs: an ACK covering a single
	 * packet, arriving while the peer has recently been acknowledging
	 * multiple packets at a time, is presumed delayed and may carry an
	 * inflated RTT sample.
	 */
	if (tp->sacked_out == 0) {
		if (sample->pkts_acked == 1 && ca->delack) {
			/* Use a presumed-delayed ACK only to lower an
			 * already-recorded minimum RTT:
			 */
			ca->rtt.min = min(ca->rtt.min, sample->rtt_us);
			ca->delack--;
			return;
		} else if (sample->pkts_acked > 1 && ca->delack < 5) {
			ca->delack++;
		}
	}

	ca->rtt.min = min_not_zero(ca->rtt.min, sample->rtt_us);
	ca->rtt.max = max(ca->rtt.max, sample->rtt_us);
}

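/* On loss or backoff: after a delay-gradient backoff, reduce cwnd by
 * backoff_beta/1024; with loss tolerance enabled and a non-full queue, treat
 * the loss as unrelated to congestion and keep cwnd; otherwise halve cwnd as
 * Reno would, but not below the (halved) shadow window when use_shadow is
 * enabled.
 */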
static u32 tcp_cdg_ssthresh(struct sock *sk)
{
	struct cdg *ca = inet_csk_ca(sk);
	struct tcp_sock *tp = tcp_sk(sk);

	if (ca->state == CDG_BACKOFF)
		return max(2U, (tcp_snd_cwnd(tp) * min(1024U, backoff_beta)) >> 10);

	if (ca->state == CDG_NONFULL && use_tolerance)
		return tcp_snd_cwnd(tp);

	ca->shadow_wnd = min(ca->shadow_wnd >> 1, tcp_snd_cwnd(tp));
	if (use_shadow)
		return max3(2U, ca->shadow_wnd, tcp_snd_cwnd(tp) >> 1);
	return max(2U, tcp_snd_cwnd(tp) >> 1);
}

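/* Reset state when the connection restarts after idle, and start a fresh
 * measurement round when congestion-window reduction completes.
 */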
static void tcp_cdg_cwnd_event(struct sock *sk, const enum tcp_ca_event ev)
{
	struct cdg *ca = inet_csk_ca(sk);
	struct tcp_sock *tp = tcp_sk(sk);
	struct cdg_minmax *gradients;

	switch (ev) {
	case CA_EVENT_CWND_RESTART:
		gradients = ca->gradients;
		if (gradients)
			memset(gradients, 0, window * sizeof(gradients[0]));
		memset(ca, 0, sizeof(*ca));

		ca->gradients = gradients;
		ca->rtt_seq = tp->snd_nxt;
		ca->shadow_wnd = tcp_snd_cwnd(tp);
		break;
	case CA_EVENT_COMPLETE_CWR:
		ca->state = CDG_UNKNOWN;
		ca->rtt_seq = tp->snd_nxt;
		ca->rtt_prev = ca->rtt;
		ca->rtt.v64 = 0;
		break;
	default:
		break;
	}
}

static void tcp_cdg_init(struct sock *sk)
{
	struct cdg *ca = inet_csk_ca(sk);
	struct tcp_sock *tp = tcp_sk(sk);

	/* We silently fall back to window = 1 if allocation fails. */
	if (window > 1)
		ca->gradients = kcalloc(window, sizeof(ca->gradients[0]),
					GFP_NOWAIT | __GFP_NOWARN);
	ca->rtt_seq = tp->snd_nxt;
	ca->shadow_wnd = tcp_snd_cwnd(tp);
}

static void tcp_cdg_release(struct sock *sk)
{
	struct cdg *ca = inet_csk_ca(sk);

	kfree(ca->gradients);
}

static struct tcp_congestion_ops tcp_cdg __read_mostly = {
	.cong_avoid = tcp_cdg_cong_avoid,
	.cwnd_event = tcp_cdg_cwnd_event,
	.pkts_acked = tcp_cdg_acked,
	.undo_cwnd = tcp_reno_undo_cwnd,
	.ssthresh = tcp_cdg_ssthresh,
	.release = tcp_cdg_release,
	.init = tcp_cdg_init,
	.owner = THIS_MODULE,
	.name = "cdg",
};

static int __init tcp_cdg_register(void)
{
	if (backoff_beta > 1024 || window < 1 || window > 256)
		return -ERANGE;
	if (!is_power_of_2(window))
		return -EINVAL;

	BUILD_BUG_ON(sizeof(struct cdg) > ICSK_CA_PRIV_SIZE);
	/* Propagate registration failure instead of discarding it. */
	return tcp_register_congestion_control(&tcp_cdg);
}

static void __exit tcp_cdg_unregister(void)
{
	tcp_unregister_congestion_control(&tcp_cdg);
}

module_init(tcp_cdg_register);
module_exit(tcp_cdg_unregister);
MODULE_AUTHOR("Kenneth Klette Jonassen");
MODULE_LICENSE("GPL");
MODULE_DESCRIPTION("TCP CDG");