From 4012ef7754cf29ce27d6fd728437830dea9915fd Mon Sep 17 00:00:00 2001 From: Richard Scheffenegger Date: Wed, 31 Aug 2022 15:01:25 +0200 Subject: [PATCH] tcp: Functional implementation of Accurate ECN The AccECN handshake and TCP header flags are supported, no support yet for the AccECN option. This minimalistic implementation is sufficient to support DCTCP while dramatically cutting the number of ACKs, and provide ECN response from the receiver to the CC modules. Reviewed By: #transport, #manpages, rrs, pauamma Sponsored by: NetApp, Inc. Differential Revision: https://reviews.freebsd.org/D21011 --- share/man/man4/tcp.4 | 7 + sys/netinet/tcp_ecn.c | 326 +++++++++++++++++++++++++++++++--- sys/netinet/tcp_ecn.h | 1 + sys/netinet/tcp_output.c | 2 +- sys/netinet/tcp_stacks/rack.c | 8 +- sys/netinet/tcp_var.h | 4 +- 6 files changed, 316 insertions(+), 32 deletions(-) diff --git a/share/man/man4/tcp.4 b/share/man/man4/tcp.4 index cdb58c7cbacc..753184418b1f 100644 --- a/share/man/man4/tcp.4 +++ b/share/man/man4/tcp.4 @@ -495,6 +495,13 @@ Outgoing connections will request ECN. Allow incoming connections to request ECN. Outgoing connections will not request ECN. (default) +.It 3 +Negotiate on incoming connection for Accurate ECN, ECN, or no ECN. +Outgoing connections will request Accurate ECN and fall back to +ECN depending on the capabilities of the server. +.It 4 +Negotiate on incoming connection for Accurate ECN, ECN, or no ECN. +Outgoing connections will not request ECN. 
.El .It Va ecn.maxretries Number of retries (SYN or SYN/ACK retransmits) before disabling ECN on a diff --git a/sys/netinet/tcp_ecn.c b/sys/netinet/tcp_ecn.c index 20f84a0c9d98..05591c2de2f7 100644 --- a/sys/netinet/tcp_ecn.c +++ b/sys/netinet/tcp_ecn.c @@ -109,12 +109,91 @@ __FBSDID("$FreeBSD$"); void tcp_ecn_input_syn_sent(struct tcpcb *tp, uint16_t thflags, int iptos) { - thflags &= (TH_CWR|TH_ECE); - if (((thflags & (TH_CWR | TH_ECE)) == TH_ECE) && - V_tcp_do_ecn) { - tp->t_flags2 |= TF2_ECN_PERMIT; - TCPSTAT_INC(tcps_ecn_shs); + if (V_tcp_do_ecn == 0) + return; + if ((V_tcp_do_ecn == 1) || + (V_tcp_do_ecn == 2)) { + /* RFC3168 ECN handling */ + if ((thflags & (TH_CWR | TH_ECE)) == (0 | TH_ECE)) { + tp->t_flags2 |= TF2_ECN_PERMIT; + TCPSTAT_INC(tcps_ecn_shs); + } + } else + /* decoding Accurate ECN according to the table in section 3.1.1 */ + if ((V_tcp_do_ecn == 3) || + (V_tcp_do_ecn == 4)) { + /* + * on the SYN,ACK, process the AccECN + * flags indicating the state in which + * the SYN was delivered. + * Reactions to Path ECN mangling can + * come here. 
+ */ + switch (thflags & (TH_AE | TH_CWR | TH_ECE)) { + /* RFC3168 SYN */ + case (0|0|TH_ECE): + tp->t_flags2 |= TF2_ECN_PERMIT; + TCPSTAT_INC(tcps_ecn_shs); + break; + /* non-ECT SYN */ + case (0|TH_CWR|0): + tp->t_flags2 |= TF2_ACE_PERMIT; + tp->t_scep = 5; + TCPSTAT_INC(tcps_ecn_shs); + TCPSTAT_INC(tcps_ace_nect); + break; + /* ECT0 SYN */ + case (TH_AE|0|0): + tp->t_flags2 |= TF2_ACE_PERMIT; + tp->t_scep = 5; + TCPSTAT_INC(tcps_ecn_shs); + TCPSTAT_INC(tcps_ace_ect0); + break; + /* ECT1 SYN */ + case (0|TH_CWR|TH_ECE): + tp->t_flags2 |= TF2_ACE_PERMIT; + tp->t_scep = 5; + TCPSTAT_INC(tcps_ecn_shs); + TCPSTAT_INC(tcps_ace_ect1); + break; + /* CE SYN */ + case (TH_AE|TH_CWR|0): + tp->t_flags2 |= TF2_ACE_PERMIT; + tp->t_scep = 6; + /* + * reduce the IW to 2 MSS (to + * account for delayed acks) if + * the SYN,ACK was CE marked + */ + tp->snd_cwnd = 2 * tcp_maxseg(tp); + TCPSTAT_INC(tcps_ecn_shs); + TCPSTAT_INC(tcps_ace_nect); + break; + default: + break; + } + /* + * Set the AccECN Codepoints on + * the outgoing ACK to the ECN + * state of the SYN,ACK + * according to table 3 in the + * AccECN draft + */ + switch (iptos & IPTOS_ECN_MASK) { + case (IPTOS_ECN_NOTECT): + tp->t_rcep = 0b010; + break; + case (IPTOS_ECN_ECT0): + tp->t_rcep = 0b100; + break; + case (IPTOS_ECN_ECT1): + tp->t_rcep = 0b011; + break; + case (IPTOS_ECN_CE): + tp->t_rcep = 0b110; + break; + } } } @@ -128,13 +207,53 @@ tcp_ecn_input_parallel_syn(struct tcpcb *tp, uint16_t thflags, int iptos) return; if (V_tcp_do_ecn == 0) return; - if ((V_tcp_do_ecn == 1) || (V_tcp_do_ecn == 2)) { + if ((V_tcp_do_ecn == 1) || + (V_tcp_do_ecn == 2)) { /* RFC3168 ECN handling */ if ((thflags & (TH_CWR | TH_ECE)) == (TH_CWR | TH_ECE)) { tp->t_flags2 |= TF2_ECN_PERMIT; tp->t_flags2 |= TF2_ECN_SND_ECE; TCPSTAT_INC(tcps_ecn_shs); } + } else + if ((V_tcp_do_ecn == 3) || + (V_tcp_do_ecn == 4)) { + /* AccECN handling */ + switch (thflags & (TH_AE | TH_CWR | TH_ECE)) { + default: + case (0|0|0): + break; + case (0|TH_CWR|TH_ECE): + 
+ tp->t_flags2 |= TF2_ECN_PERMIT; + tp->t_flags2 |= TF2_ECN_SND_ECE; + TCPSTAT_INC(tcps_ecn_shs); + break; + case (TH_AE|TH_CWR|TH_ECE): + tp->t_flags2 |= TF2_ACE_PERMIT; + TCPSTAT_INC(tcps_ecn_shs); + /* + * Set the AccECN Codepoints on + * the outgoing segment to the ECN + * state of the received SYN + * according to table 3 in the + * AccECN draft + */ + switch (iptos & IPTOS_ECN_MASK) { + case (IPTOS_ECN_NOTECT): + tp->t_rcep = 0b010; + break; + case (IPTOS_ECN_ECT0): + tp->t_rcep = 0b100; + break; + case (IPTOS_ECN_ECT1): + tp->t_rcep = 0b011; + break; + case (IPTOS_ECN_CE): + tp->t_rcep = 0b110; + break; + } + break; + } } } @@ -146,7 +265,7 @@ tcp_ecn_input_segment(struct tcpcb *tp, uint16_t thflags, int iptos) { int delta_ace = 0; - if (tp->t_flags2 & TF2_ECN_PERMIT) { + if (tp->t_flags2 & (TF2_ECN_PERMIT | TF2_ACE_PERMIT)) { switch (iptos & IPTOS_ECN_MASK) { case IPTOS_ECN_CE: TCPSTAT_INC(tcps_ecn_ce); @@ -159,15 +278,52 @@ tcp_ecn_input_segment(struct tcpcb *tp, uint16_t thflags, int iptos) break; } - /* RFC3168 ECN handling */ - if (thflags & TH_ECE) - delta_ace = 1; - if (thflags & TH_CWR) { - tp->t_flags2 &= ~TF2_ECN_SND_ECE; - tp->t_flags |= TF_ACKNOW; + if (tp->t_flags2 & TF2_ACE_PERMIT) { + if ((iptos & IPTOS_ECN_MASK) == IPTOS_ECN_CE) + tp->t_rcep += 1; + if (tp->t_flags2 & TF2_ECN_PERMIT) { + delta_ace = (tcp_ecn_get_ace(thflags) + 8 - + (tp->t_scep & 0x07)) & 0x07; + tp->t_scep += delta_ace; + } else { + /* + * process the final ACK of the 3WHS + * see table 3 in draft-ietf-tcpm-accurate-ecn + */ + switch (tcp_ecn_get_ace(thflags)) { + case 0b010: + /* nonECT SYN or SYN,ACK */ + /* Fallthrough */ + case 0b011: + /* ECT1 SYN or SYN,ACK */ + /* Fallthrough */ + case 0b100: + /* ECT0 SYN or SYN,ACK */ + tp->t_scep = 5; + break; + case 0b110: + /* CE SYN or SYN,ACK */ + tp->t_scep = 6; + tp->snd_cwnd = 2 * tcp_maxseg(tp); + break; + default: + /* mangled AccECN handshake */ + tp->t_scep = 5; + break; + } + tp->t_flags2 |= TF2_ECN_PERMIT; + } + } else { + /* RFC3168 ECN 
handling */ + if (thflags & TH_ECE) + delta_ace = 1; + if (thflags & TH_CWR) { + tp->t_flags2 &= ~TF2_ECN_SND_ECE; + tp->t_flags |= TF_ACKNOW; + } + if ((iptos & IPTOS_ECN_MASK) == IPTOS_ECN_CE) + tp->t_flags2 |= TF2_ECN_SND_ECE; } - if ((iptos & IPTOS_ECN_MASK) == IPTOS_ECN_CE) - tp->t_flags2 |= TF2_ECN_SND_ECE; /* Process a packet differently from RFC3168. */ cc_ecnpkt_handler_flags(tp, thflags, iptos); @@ -184,6 +340,8 @@ tcp_ecn_output_syn_sent(struct tcpcb *tp) { uint16_t thflags = 0; + if (V_tcp_do_ecn == 0) + return thflags; if (V_tcp_do_ecn == 1) { /* Send a RFC3168 ECN setup packet */ if (tp->t_rxtshift >= 1) { @@ -191,6 +349,14 @@ tcp_ecn_output_syn_sent(struct tcpcb *tp) thflags = TH_ECE|TH_CWR; } else thflags = TH_ECE|TH_CWR; + } else + if (V_tcp_do_ecn == 3) { + /* Send an Accurate ECN setup packet */ + if (tp->t_rxtshift >= 1) { + if (tp->t_rxtshift <= V_tcp_ecn_maxretries) + thflags = TH_ECE|TH_CWR|TH_AE; + } else + thflags = TH_ECE|TH_CWR|TH_AE; } return thflags; @@ -215,6 +381,7 @@ tcp_ecn_output_established(struct tcpcb *tp, uint16_t *thflags, int len, bool rx newdata = (len > 0 && SEQ_GEQ(tp->snd_nxt, tp->snd_max) && !rxmit && !((tp->t_flags & TF_FORCEDATA) && len == 1)); + /* RFC3168 ECN marking, only new data segments */ if (newdata) { ipecn = IPTOS_ECN_ECT0; TCPSTAT_INC(tcps_ecn_ect0); @@ -222,13 +389,35 @@ tcp_ecn_output_established(struct tcpcb *tp, uint16_t *thflags, int len, bool rx /* * Reply with proper ECN notifications. 
*/ - if (newdata && - (tp->t_flags2 & TF2_ECN_SND_CWR)) { - *thflags |= TH_CWR; - tp->t_flags2 &= ~TF2_ECN_SND_CWR; + if (tp->t_flags2 & TF2_ACE_PERMIT) { + *thflags &= ~(TH_AE|TH_CWR|TH_ECE); + if (tp->t_rcep & 0x01) + *thflags |= TH_ECE; + if (tp->t_rcep & 0x02) + *thflags |= TH_CWR; + if (tp->t_rcep & 0x04) + *thflags |= TH_AE; + if (!(tp->t_flags2 & TF2_ECN_PERMIT)) { + /* + * here we process the final + * ACK of the 3WHS + */ + if (tp->t_rcep == 0b110) { + tp->t_rcep = 6; + } else { + tp->t_rcep = 5; + } + tp->t_flags2 |= TF2_ECN_PERMIT; + } + } else { + if (newdata && + (tp->t_flags2 & TF2_ECN_SND_CWR)) { + *thflags |= TH_CWR; + tp->t_flags2 &= ~TF2_ECN_SND_CWR; + } + if (tp->t_flags2 & TF2_ECN_SND_ECE) + *thflags |= TH_ECE; } - if (tp->t_flags2 & TF2_ECN_SND_ECE) - *thflags |= TH_ECE; return ipecn; } @@ -245,6 +434,20 @@ tcp_ecn_syncache_socket(struct tcpcb *tp, struct syncache *sc) case SCF_ECN: tp->t_flags2 |= TF2_ECN_PERMIT; break; + case SCF_ACE_N: + /* Fallthrough */ + case SCF_ACE_0: + /* Fallthrough */ + case SCF_ACE_1: + tp->t_flags2 |= TF2_ACE_PERMIT; + tp->t_scep = 5; + tp->t_rcep = 5; + break; + case SCF_ACE_CE: + tp->t_flags2 |= TF2_ACE_PERMIT; + tp->t_scep = 6; + tp->t_rcep = 6; + break; /* undefined SCF codepoint */ default: break; @@ -261,15 +464,54 @@ tcp_ecn_syncache_add(uint16_t thflags, int iptos) { int scflags = 0; - switch (thflags & (TH_CWR|TH_ECE)) { + switch (thflags & (TH_AE|TH_CWR|TH_ECE)) { /* no ECN */ - case (0|0): + case (0|0|0): break; /* legacy ECN */ - case (TH_CWR|TH_ECE): + case (0|TH_CWR|TH_ECE): scflags = SCF_ECN; break; + /* Accurate ECN */ + case (TH_AE|TH_CWR|TH_ECE): + if ((V_tcp_do_ecn == 3) || + (V_tcp_do_ecn == 4)) { + switch (iptos & IPTOS_ECN_MASK) { + case IPTOS_ECN_CE: + scflags = SCF_ACE_CE; + break; + case IPTOS_ECN_ECT0: + scflags = SCF_ACE_0; + break; + case IPTOS_ECN_ECT1: + scflags = SCF_ACE_1; + break; + case IPTOS_ECN_NOTECT: + scflags = SCF_ACE_N; + break; + } + } else + scflags = SCF_ECN; + break; + 
/* Default Case (section 3.1.2) */ default: + if ((V_tcp_do_ecn == 3) || + (V_tcp_do_ecn == 4)) { + switch (iptos & IPTOS_ECN_MASK) { + case IPTOS_ECN_CE: + scflags = SCF_ACE_CE; + break; + case IPTOS_ECN_ECT0: + scflags = SCF_ACE_0; + break; + case IPTOS_ECN_ECT1: + scflags = SCF_ACE_1; + break; + case IPTOS_ECN_NOTECT: + scflags = SCF_ACE_N; + break; + } + } break; } return scflags; @@ -286,9 +528,29 @@ tcp_ecn_syncache_respond(uint16_t thflags, struct syncache *sc) (sc->sc_flags & SCF_ECN_MASK)) { switch (sc->sc_flags & SCF_ECN_MASK) { case SCF_ECN: - thflags |= (0 | TH_ECE); + thflags |= (0 | 0 | TH_ECE); TCPSTAT_INC(tcps_ecn_shs); break; + case SCF_ACE_N: + thflags |= (0 | TH_CWR | 0); + TCPSTAT_INC(tcps_ecn_shs); + TCPSTAT_INC(tcps_ace_nect); + break; + case SCF_ACE_0: + thflags |= (TH_AE | 0 | 0); + TCPSTAT_INC(tcps_ecn_shs); + TCPSTAT_INC(tcps_ace_ect0); + break; + case SCF_ACE_1: + thflags |= (0 | TH_ECE | TH_CWR); + TCPSTAT_INC(tcps_ecn_shs); + TCPSTAT_INC(tcps_ace_ect1); + break; + case SCF_ACE_CE: + thflags |= (TH_AE | TH_CWR | 0); + TCPSTAT_INC(tcps_ecn_shs); + TCPSTAT_INC(tcps_ace_ce); + break; /* undefined SCF codepoint */ default: break; @@ -296,3 +558,17 @@ tcp_ecn_syncache_respond(uint16_t thflags, struct syncache *sc) } return thflags; } + +int +tcp_ecn_get_ace(uint16_t thflags) +{ + int ace = 0; + + if (thflags & TH_ECE) + ace += 1; + if (thflags & TH_CWR) + ace += 2; + if (thflags & TH_AE) + ace += 4; + return ace; +} diff --git a/sys/netinet/tcp_ecn.h b/sys/netinet/tcp_ecn.h index 38ac2f398e54..deade12b75d1 100644 --- a/sys/netinet/tcp_ecn.h +++ b/sys/netinet/tcp_ecn.h @@ -49,6 +49,7 @@ int tcp_ecn_output_established(struct tcpcb *, uint16_t *, int, bool); void tcp_ecn_syncache_socket(struct tcpcb *, struct syncache *); int tcp_ecn_syncache_add(uint16_t, int); uint16_t tcp_ecn_syncache_respond(uint16_t, struct syncache *); +int tcp_ecn_get_ace(uint16_t); #endif /* _KERNEL */ diff --git a/sys/netinet/tcp_output.c b/sys/netinet/tcp_output.c 
index 4aa2f3664c55..0630e288c20f 100644 --- a/sys/netinet/tcp_output.c +++ b/sys/netinet/tcp_output.c @@ -1209,7 +1209,7 @@ tcp_default_output(struct tcpcb *tp) } /* Also handle parallel SYN for ECN */ if ((TCPS_HAVERCVDSYN(tp->t_state)) && - (tp->t_flags2 & TF2_ECN_PERMIT)) { + (tp->t_flags2 & (TF2_ECN_PERMIT | TF2_ACE_PERMIT))) { int ect = tcp_ecn_output_established(tp, &flags, len, sack_rxmit); if ((tp->t_state == TCPS_SYN_RECEIVED) && (tp->t_flags2 & TF2_ECN_SND_ECE)) diff --git a/sys/netinet/tcp_stacks/rack.c b/sys/netinet/tcp_stacks/rack.c index ea370fe9247c..47bcd3b76d90 100644 --- a/sys/netinet/tcp_stacks/rack.c +++ b/sys/netinet/tcp_stacks/rack.c @@ -15883,7 +15883,7 @@ rack_fast_rsm_output(struct tcpcb *tp, struct tcp_rack *rack, struct rack_sendma } m->m_pkthdr.rcvif = (struct ifnet *)0; if (TCPS_HAVERCVDSYN(tp->t_state) && - (tp->t_flags2 & TF2_ECN_PERMIT)) { + (tp->t_flags2 & (TF2_ECN_PERMIT | TF2_ACE_PERMIT))) { int ect = tcp_ecn_output_established(tp, &flags, len, true); if ((tp->t_state == TCPS_SYN_RECEIVED) && (tp->t_flags2 & TF2_ECN_SND_ECE)) @@ -16362,7 +16362,7 @@ rack_fast_output(struct tcpcb *tp, struct tcp_rack *rack, uint64_t ts_val, } m->m_pkthdr.rcvif = (struct ifnet *)0; if (TCPS_HAVERCVDSYN(tp->t_state) && - (tp->t_flags2 & TF2_ECN_PERMIT)) { + (tp->t_flags2 & (TF2_ECN_PERMIT | TF2_ACE_PERMIT))) { int ect = tcp_ecn_output_established(tp, &flags, len, false); if ((tp->t_state == TCPS_SYN_RECEIVED) && (tp->t_flags2 & TF2_ECN_SND_ECE)) @@ -18487,7 +18487,7 @@ rack_output(struct tcpcb *tp) } /* Also handle parallel SYN for ECN */ if (TCPS_HAVERCVDSYN(tp->t_state) && - (tp->t_flags2 & TF2_ECN_PERMIT)) { + (tp->t_flags2 & (TF2_ECN_PERMIT | TF2_ACE_PERMIT))) { int ect = tcp_ecn_output_established(tp, &flags, len, sack_rxmit); if ((tp->t_state == TCPS_SYN_RECEIVED) && (tp->t_flags2 & TF2_ECN_SND_ECE)) @@ -20489,7 +20489,7 @@ rack_fill_info(struct tcpcb *tp, struct tcp_info *ti) ti->tcpi_snd_wscale = tp->snd_scale; ti->tcpi_rcv_wscale = 
tp->rcv_scale; } - if (tp->t_flags2 & TF2_ECN_PERMIT) + if (tp->t_flags2 & (TF2_ECN_PERMIT | TF2_ACE_PERMIT)) ti->tcpi_options |= TCPI_OPT_ECN; if (tp->t_flags & TF_FASTOPEN) ti->tcpi_options |= TCPI_OPT_TFO; diff --git a/sys/netinet/tcp_var.h b/sys/netinet/tcp_var.h index d80bbef1c51d..20bf72dcd9d0 100644 --- a/sys/netinet/tcp_var.h +++ b/sys/netinet/tcp_var.h @@ -249,8 +249,8 @@ struct tcpcb { int t_dupacks; /* consecutive dup acks recd */ int t_lognum; /* Number of log entries */ int t_loglimit; /* Maximum number of log entries */ - uint32_t r_cep; /* Number of received CE marked packets */ - uint32_t s_cep; /* Synced number of delivered CE packets */ + uint32_t t_rcep; /* Number of received CE marked packets */ + uint32_t t_scep; /* Synced number of delivered CE packets */ int64_t t_pacing_rate; /* bytes / sec, -1 => unlimited */ struct tcp_log_stailq t_logs; /* Log buffer */ struct tcp_log_id_node *t_lin;