0001
0002
0003
0004
0005
0006
0007
0008
0009
0010
0011
0012
0013
0014
0015
0016
0017
0018
0019
0020
0021
0022
0023
0024
0025
0026
0027
0028
0029
0030
0031
0032
0033
0034
0035
0036
0037
0038
0039 #include <linux/btf.h>
0040 #include <linux/btf_ids.h>
0041 #include <linux/module.h>
0042 #include <linux/mm.h>
0043 #include <net/tcp.h>
0044 #include <linux/inet_diag.h>
0045 #include "tcp_dctcp.h"
0046
0047 #define DCTCP_MAX_ALPHA 1024U
0048
0049 struct dctcp {
0050 u32 old_delivered;
0051 u32 old_delivered_ce;
0052 u32 prior_rcv_nxt;
0053 u32 dctcp_alpha;
0054 u32 next_seq;
0055 u32 ce_state;
0056 u32 loss_cwnd;
0057 };
0058
0059 static unsigned int dctcp_shift_g __read_mostly = 4;
0060 module_param(dctcp_shift_g, uint, 0644);
0061 MODULE_PARM_DESC(dctcp_shift_g, "parameter g for updating dctcp_alpha");
0062
0063 static unsigned int dctcp_alpha_on_init __read_mostly = DCTCP_MAX_ALPHA;
0064 module_param(dctcp_alpha_on_init, uint, 0644);
0065 MODULE_PARM_DESC(dctcp_alpha_on_init, "parameter for initial alpha value");
0066
0067 static struct tcp_congestion_ops dctcp_reno;
0068
0069 static void dctcp_reset(const struct tcp_sock *tp, struct dctcp *ca)
0070 {
0071 ca->next_seq = tp->snd_nxt;
0072
0073 ca->old_delivered = tp->delivered;
0074 ca->old_delivered_ce = tp->delivered_ce;
0075 }
0076
0077 static void dctcp_init(struct sock *sk)
0078 {
0079 const struct tcp_sock *tp = tcp_sk(sk);
0080
0081 if ((tp->ecn_flags & TCP_ECN_OK) ||
0082 (sk->sk_state == TCP_LISTEN ||
0083 sk->sk_state == TCP_CLOSE)) {
0084 struct dctcp *ca = inet_csk_ca(sk);
0085
0086 ca->prior_rcv_nxt = tp->rcv_nxt;
0087
0088 ca->dctcp_alpha = min(dctcp_alpha_on_init, DCTCP_MAX_ALPHA);
0089
0090 ca->loss_cwnd = 0;
0091 ca->ce_state = 0;
0092
0093 dctcp_reset(tp, ca);
0094 return;
0095 }
0096
0097
0098
0099
0100 inet_csk(sk)->icsk_ca_ops = &dctcp_reno;
0101 INET_ECN_dontxmit(sk);
0102 }
0103
0104 static u32 dctcp_ssthresh(struct sock *sk)
0105 {
0106 struct dctcp *ca = inet_csk_ca(sk);
0107 struct tcp_sock *tp = tcp_sk(sk);
0108
0109 ca->loss_cwnd = tcp_snd_cwnd(tp);
0110 return max(tcp_snd_cwnd(tp) - ((tcp_snd_cwnd(tp) * ca->dctcp_alpha) >> 11U), 2U);
0111 }
0112
0113 static void dctcp_update_alpha(struct sock *sk, u32 flags)
0114 {
0115 const struct tcp_sock *tp = tcp_sk(sk);
0116 struct dctcp *ca = inet_csk_ca(sk);
0117
0118
0119 if (!before(tp->snd_una, ca->next_seq)) {
0120 u32 delivered_ce = tp->delivered_ce - ca->old_delivered_ce;
0121 u32 alpha = ca->dctcp_alpha;
0122
0123
0124
0125 alpha -= min_not_zero(alpha, alpha >> dctcp_shift_g);
0126 if (delivered_ce) {
0127 u32 delivered = tp->delivered - ca->old_delivered;
0128
0129
0130
0131
0132 delivered_ce <<= (10 - dctcp_shift_g);
0133 delivered_ce /= max(1U, delivered);
0134
0135 alpha = min(alpha + delivered_ce, DCTCP_MAX_ALPHA);
0136 }
0137
0138
0139
0140
0141 WRITE_ONCE(ca->dctcp_alpha, alpha);
0142 dctcp_reset(tp, ca);
0143 }
0144 }
0145
0146 static void dctcp_react_to_loss(struct sock *sk)
0147 {
0148 struct dctcp *ca = inet_csk_ca(sk);
0149 struct tcp_sock *tp = tcp_sk(sk);
0150
0151 ca->loss_cwnd = tcp_snd_cwnd(tp);
0152 tp->snd_ssthresh = max(tcp_snd_cwnd(tp) >> 1U, 2U);
0153 }
0154
0155 static void dctcp_state(struct sock *sk, u8 new_state)
0156 {
0157 if (new_state == TCP_CA_Recovery &&
0158 new_state != inet_csk(sk)->icsk_ca_state)
0159 dctcp_react_to_loss(sk);
0160
0161
0162
0163 }
0164
0165 static void dctcp_cwnd_event(struct sock *sk, enum tcp_ca_event ev)
0166 {
0167 struct dctcp *ca = inet_csk_ca(sk);
0168
0169 switch (ev) {
0170 case CA_EVENT_ECN_IS_CE:
0171 case CA_EVENT_ECN_NO_CE:
0172 dctcp_ece_ack_update(sk, ev, &ca->prior_rcv_nxt, &ca->ce_state);
0173 break;
0174 case CA_EVENT_LOSS:
0175 dctcp_react_to_loss(sk);
0176 break;
0177 default:
0178
0179 break;
0180 }
0181 }
0182
0183 static size_t dctcp_get_info(struct sock *sk, u32 ext, int *attr,
0184 union tcp_cc_info *info)
0185 {
0186 const struct dctcp *ca = inet_csk_ca(sk);
0187 const struct tcp_sock *tp = tcp_sk(sk);
0188
0189
0190
0191
0192 if (ext & (1 << (INET_DIAG_DCTCPINFO - 1)) ||
0193 ext & (1 << (INET_DIAG_VEGASINFO - 1))) {
0194 memset(&info->dctcp, 0, sizeof(info->dctcp));
0195 if (inet_csk(sk)->icsk_ca_ops != &dctcp_reno) {
0196 info->dctcp.dctcp_enabled = 1;
0197 info->dctcp.dctcp_ce_state = (u16) ca->ce_state;
0198 info->dctcp.dctcp_alpha = ca->dctcp_alpha;
0199 info->dctcp.dctcp_ab_ecn = tp->mss_cache *
0200 (tp->delivered_ce - ca->old_delivered_ce);
0201 info->dctcp.dctcp_ab_tot = tp->mss_cache *
0202 (tp->delivered - ca->old_delivered);
0203 }
0204
0205 *attr = INET_DIAG_DCTCPINFO;
0206 return sizeof(info->dctcp);
0207 }
0208 return 0;
0209 }
0210
0211 static u32 dctcp_cwnd_undo(struct sock *sk)
0212 {
0213 const struct dctcp *ca = inet_csk_ca(sk);
0214 struct tcp_sock *tp = tcp_sk(sk);
0215
0216 return max(tcp_snd_cwnd(tp), ca->loss_cwnd);
0217 }
0218
0219 static struct tcp_congestion_ops dctcp __read_mostly = {
0220 .init = dctcp_init,
0221 .in_ack_event = dctcp_update_alpha,
0222 .cwnd_event = dctcp_cwnd_event,
0223 .ssthresh = dctcp_ssthresh,
0224 .cong_avoid = tcp_reno_cong_avoid,
0225 .undo_cwnd = dctcp_cwnd_undo,
0226 .set_state = dctcp_state,
0227 .get_info = dctcp_get_info,
0228 .flags = TCP_CONG_NEEDS_ECN,
0229 .owner = THIS_MODULE,
0230 .name = "dctcp",
0231 };
0232
0233 static struct tcp_congestion_ops dctcp_reno __read_mostly = {
0234 .ssthresh = tcp_reno_ssthresh,
0235 .cong_avoid = tcp_reno_cong_avoid,
0236 .undo_cwnd = tcp_reno_undo_cwnd,
0237 .get_info = dctcp_get_info,
0238 .owner = THIS_MODULE,
0239 .name = "dctcp-reno",
0240 };
0241
0242 BTF_SET8_START(tcp_dctcp_check_kfunc_ids)
0243 #ifdef CONFIG_X86
0244 #ifdef CONFIG_DYNAMIC_FTRACE
0245 BTF_ID_FLAGS(func, dctcp_init)
0246 BTF_ID_FLAGS(func, dctcp_update_alpha)
0247 BTF_ID_FLAGS(func, dctcp_cwnd_event)
0248 BTF_ID_FLAGS(func, dctcp_ssthresh)
0249 BTF_ID_FLAGS(func, dctcp_cwnd_undo)
0250 BTF_ID_FLAGS(func, dctcp_state)
0251 #endif
0252 #endif
0253 BTF_SET8_END(tcp_dctcp_check_kfunc_ids)
0254
0255 static const struct btf_kfunc_id_set tcp_dctcp_kfunc_set = {
0256 .owner = THIS_MODULE,
0257 .set = &tcp_dctcp_check_kfunc_ids,
0258 };
0259
0260 static int __init dctcp_register(void)
0261 {
0262 int ret;
0263
0264 BUILD_BUG_ON(sizeof(struct dctcp) > ICSK_CA_PRIV_SIZE);
0265
0266 ret = register_btf_kfunc_id_set(BPF_PROG_TYPE_STRUCT_OPS, &tcp_dctcp_kfunc_set);
0267 if (ret < 0)
0268 return ret;
0269 return tcp_register_congestion_control(&dctcp);
0270 }
0271
0272 static void __exit dctcp_unregister(void)
0273 {
0274 tcp_unregister_congestion_control(&dctcp);
0275 }
0276
0277 module_init(dctcp_register);
0278 module_exit(dctcp_unregister);
0279
0280 MODULE_AUTHOR("Daniel Borkmann <dborkman@redhat.com>");
0281 MODULE_AUTHOR("Florian Westphal <fw@strlen.de>");
0282 MODULE_AUTHOR("Glenn Judd <glenn.judd@morganstanley.com>");
0283
0284 MODULE_LICENSE("GPL v2");
0285 MODULE_DESCRIPTION("DataCenter TCP (DCTCP)");