tcp: Functional implementation of Accurate ECN

The AccECN handshake and TCP header flags are supported;
there is no support yet for the AccECN option. This minimalistic
implementation is sufficient to support DCTCP while
dramatically cutting the number of ACKs, and provide ECN
response from the receiver to the CC modules.

Reviewed By:		#transport, #manpages, rrs, pauamma
Sponsored by:		NetApp, Inc.
Differential Revision:	https://reviews.freebsd.org/D21011
This commit is contained in:
Richard Scheffenegger 2022-08-31 15:01:25 +02:00
parent c21b7b55be
commit 4012ef7754
6 changed files with 316 additions and 32 deletions

View File

@ -495,6 +495,13 @@ Outgoing connections will request ECN.
Allow incoming connections to request ECN.
Outgoing connections will not request ECN.
(default)
.It 3
Negotiate on incoming connection for Accurate ECN, ECN, or no ECN.
Outgoing connections will request Accurate ECN and fall back to
ECN depending on the capabilities of the server.
.It 4
Negotiate on incoming connection for Accurate ECN, ECN, or no ECN.
Outgoing connections will not request ECN.
.El
.It Va ecn.maxretries
Number of retries (SYN or SYN/ACK retransmits) before disabling ECN on a

View File

@ -109,12 +109,91 @@ __FBSDID("$FreeBSD$");
void
tcp_ecn_input_syn_sent(struct tcpcb *tp, uint16_t thflags, int iptos)
{
thflags &= (TH_CWR|TH_ECE);
if (((thflags & (TH_CWR | TH_ECE)) == TH_ECE) &&
V_tcp_do_ecn) {
tp->t_flags2 |= TF2_ECN_PERMIT;
TCPSTAT_INC(tcps_ecn_shs);
if (V_tcp_do_ecn == 0)
return;
if ((V_tcp_do_ecn == 1) ||
(V_tcp_do_ecn == 2)) {
/* RFC3168 ECN handling */
if ((thflags & (TH_CWR | TH_ECE)) == (0 | TH_ECE)) {
tp->t_flags2 |= TF2_ECN_PERMIT;
TCPSTAT_INC(tcps_ecn_shs);
}
} else
/* decoding Accurate ECN according to table in section 3.1.1 */
if ((V_tcp_do_ecn == 3) ||
(V_tcp_do_ecn == 4)) {
/*
* on the SYN,ACK, process the AccECN
* flags indicating the state the SYN
* was delivered.
* Reactions to Path ECN mangling can
* come here.
*/
switch (thflags & (TH_AE | TH_CWR | TH_ECE)) {
/* RFC3168 SYN */
case (0|0|TH_ECE):
tp->t_flags2 |= TF2_ECN_PERMIT;
TCPSTAT_INC(tcps_ecn_shs);
break;
/* non-ECT SYN */
case (0|TH_CWR|0):
tp->t_flags2 |= TF2_ACE_PERMIT;
tp->t_scep = 5;
TCPSTAT_INC(tcps_ecn_shs);
TCPSTAT_INC(tcps_ace_nect);
break;
/* ECT0 SYN */
case (TH_AE|0|0):
tp->t_flags2 |= TF2_ACE_PERMIT;
tp->t_scep = 5;
TCPSTAT_INC(tcps_ecn_shs);
TCPSTAT_INC(tcps_ace_ect0);
break;
/* ECT1 SYN */
case (0|TH_CWR|TH_ECE):
tp->t_flags2 |= TF2_ACE_PERMIT;
tp->t_scep = 5;
TCPSTAT_INC(tcps_ecn_shs);
TCPSTAT_INC(tcps_ace_ect1);
break;
/* CE SYN */
case (TH_AE|TH_CWR|0):
tp->t_flags2 |= TF2_ACE_PERMIT;
tp->t_scep = 6;
/*
* reduce the IW to 2 MSS (to
* account for delayed acks) if
* the SYN,ACK was CE marked
*/
tp->snd_cwnd = 2 * tcp_maxseg(tp);
TCPSTAT_INC(tcps_ecn_shs);
TCPSTAT_INC(tcps_ace_nect);
break;
default:
break;
}
/*
* Set the AccECN Codepoints on
* the outgoing <ACK> to the ECN
* state of the <SYN,ACK>
* according to table 3 in the
* AccECN draft
*/
switch (iptos & IPTOS_ECN_MASK) {
case (IPTOS_ECN_NOTECT):
tp->t_rcep = 0b010;
break;
case (IPTOS_ECN_ECT0):
tp->t_rcep = 0b100;
break;
case (IPTOS_ECN_ECT1):
tp->t_rcep = 0b011;
break;
case (IPTOS_ECN_CE):
tp->t_rcep = 0b110;
break;
}
}
}
@ -128,13 +207,53 @@ tcp_ecn_input_parallel_syn(struct tcpcb *tp, uint16_t thflags, int iptos)
return;
if (V_tcp_do_ecn == 0)
return;
if ((V_tcp_do_ecn == 1) || (V_tcp_do_ecn == 2)) {
if ((V_tcp_do_ecn == 1) ||
(V_tcp_do_ecn == 2)) {
/* RFC3168 ECN handling */
if ((thflags & (TH_CWR | TH_ECE)) == (TH_CWR | TH_ECE)) {
tp->t_flags2 |= TF2_ECN_PERMIT;
tp->t_flags2 |= TF2_ECN_SND_ECE;
TCPSTAT_INC(tcps_ecn_shs);
}
} else
if ((V_tcp_do_ecn == 3) ||
(V_tcp_do_ecn == 4)) {
/* AccECN handling */
switch (thflags & (TH_AE | TH_CWR | TH_ECE)) {
default:
case (0|0|0):
break;
case (0|TH_CWR|TH_ECE):
tp->t_flags2 |= TF2_ECN_PERMIT;
tp->t_flags2 |= TF2_ECN_SND_ECE;
TCPSTAT_INC(tcps_ecn_shs);
break;
case (TH_AE|TH_CWR|TH_ECE):
tp->t_flags2 |= TF2_ACE_PERMIT;
TCPSTAT_INC(tcps_ecn_shs);
/*
* Set the AccECN Codepoints on
* the outgoing <ACK> to the ECN
* state of the <SYN,ACK>
* according to table 3 in the
* AccECN draft
*/
switch (iptos & IPTOS_ECN_MASK) {
case (IPTOS_ECN_NOTECT):
tp->t_rcep = 0b010;
break;
case (IPTOS_ECN_ECT0):
tp->t_rcep = 0b100;
break;
case (IPTOS_ECN_ECT1):
tp->t_rcep = 0b011;
break;
case (IPTOS_ECN_CE):
tp->t_rcep = 0b110;
break;
}
break;
}
}
}
@ -146,7 +265,7 @@ tcp_ecn_input_segment(struct tcpcb *tp, uint16_t thflags, int iptos)
{
int delta_ace = 0;
if (tp->t_flags2 & TF2_ECN_PERMIT) {
if (tp->t_flags2 & (TF2_ECN_PERMIT | TF2_ACE_PERMIT)) {
switch (iptos & IPTOS_ECN_MASK) {
case IPTOS_ECN_CE:
TCPSTAT_INC(tcps_ecn_ce);
@ -159,15 +278,52 @@ tcp_ecn_input_segment(struct tcpcb *tp, uint16_t thflags, int iptos)
break;
}
/* RFC3168 ECN handling */
if (thflags & TH_ECE)
delta_ace = 1;
if (thflags & TH_CWR) {
tp->t_flags2 &= ~TF2_ECN_SND_ECE;
tp->t_flags |= TF_ACKNOW;
if (tp->t_flags2 & TF2_ACE_PERMIT) {
if ((iptos & IPTOS_ECN_MASK) == IPTOS_ECN_CE)
tp->t_rcep += 1;
if (tp->t_flags2 & TF2_ECN_PERMIT) {
delta_ace = (tcp_ecn_get_ace(thflags) + 8 -
(tp->t_scep & 0x07)) & 0x07;
tp->t_scep += delta_ace;
} else {
/*
* process the final ACK of the 3WHS
* see table 3 in draft-ietf-tcpm-accurate-ecn
*/
switch (tcp_ecn_get_ace(thflags)) {
case 0b010:
/* nonECT SYN or SYN,ACK */
/* Fallthrough */
case 0b011:
/* ECT1 SYN or SYN,ACK */
/* Fallthrough */
case 0b100:
/* ECT0 SYN or SYN,ACK */
tp->t_scep = 5;
break;
case 0b110:
/* CE SYN or SYN,ACK */
tp->t_scep = 6;
tp->snd_cwnd = 2 * tcp_maxseg(tp);
break;
default:
/* mangled AccECN handshake */
tp->t_scep = 5;
break;
}
tp->t_flags2 |= TF2_ECN_PERMIT;
}
} else {
/* RFC3168 ECN handling */
if (thflags & TH_ECE)
delta_ace = 1;
if (thflags & TH_CWR) {
tp->t_flags2 &= ~TF2_ECN_SND_ECE;
tp->t_flags |= TF_ACKNOW;
}
if ((iptos & IPTOS_ECN_MASK) == IPTOS_ECN_CE)
tp->t_flags2 |= TF2_ECN_SND_ECE;
}
if ((iptos & IPTOS_ECN_MASK) == IPTOS_ECN_CE)
tp->t_flags2 |= TF2_ECN_SND_ECE;
/* Process a packet differently from RFC3168. */
cc_ecnpkt_handler_flags(tp, thflags, iptos);
@ -184,6 +340,8 @@ tcp_ecn_output_syn_sent(struct tcpcb *tp)
{
uint16_t thflags = 0;
if (V_tcp_do_ecn == 0)
return thflags;
if (V_tcp_do_ecn == 1) {
/* Send a RFC3168 ECN setup <SYN> packet */
if (tp->t_rxtshift >= 1) {
@ -191,6 +349,14 @@ tcp_ecn_output_syn_sent(struct tcpcb *tp)
thflags = TH_ECE|TH_CWR;
} else
thflags = TH_ECE|TH_CWR;
} else
if (V_tcp_do_ecn == 3) {
/* Send an Accurate ECN setup <SYN> packet */
if (tp->t_rxtshift >= 1) {
if (tp->t_rxtshift <= V_tcp_ecn_maxretries)
thflags = TH_ECE|TH_CWR|TH_AE;
} else
thflags = TH_ECE|TH_CWR|TH_AE;
}
return thflags;
@ -215,6 +381,7 @@ tcp_ecn_output_established(struct tcpcb *tp, uint16_t *thflags, int len, bool rx
newdata = (len > 0 && SEQ_GEQ(tp->snd_nxt, tp->snd_max) &&
!rxmit &&
!((tp->t_flags & TF_FORCEDATA) && len == 1));
/* RFC3168 ECN marking, only new data segments */
if (newdata) {
ipecn = IPTOS_ECN_ECT0;
TCPSTAT_INC(tcps_ecn_ect0);
@ -222,13 +389,35 @@ tcp_ecn_output_established(struct tcpcb *tp, uint16_t *thflags, int len, bool rx
/*
* Reply with proper ECN notifications.
*/
if (newdata &&
(tp->t_flags2 & TF2_ECN_SND_CWR)) {
*thflags |= TH_CWR;
tp->t_flags2 &= ~TF2_ECN_SND_CWR;
if (tp->t_flags2 & TF2_ACE_PERMIT) {
*thflags &= ~(TH_AE|TH_CWR|TH_ECE);
if (tp->t_rcep & 0x01)
*thflags |= TH_ECE;
if (tp->t_rcep & 0x02)
*thflags |= TH_CWR;
if (tp->t_rcep & 0x04)
*thflags |= TH_AE;
if (!(tp->t_flags2 & TF2_ECN_PERMIT)) {
/*
* here we process the final
* ACK of the 3WHS
*/
if (tp->t_rcep == 0b110) {
tp->t_rcep = 6;
} else {
tp->t_rcep = 5;
}
tp->t_flags2 |= TF2_ECN_PERMIT;
}
} else {
if (newdata &&
(tp->t_flags2 & TF2_ECN_SND_CWR)) {
*thflags |= TH_CWR;
tp->t_flags2 &= ~TF2_ECN_SND_CWR;
}
if (tp->t_flags2 & TF2_ECN_SND_ECE)
*thflags |= TH_ECE;
}
if (tp->t_flags2 & TF2_ECN_SND_ECE)
*thflags |= TH_ECE;
return ipecn;
}
@ -245,6 +434,20 @@ tcp_ecn_syncache_socket(struct tcpcb *tp, struct syncache *sc)
case SCF_ECN:
tp->t_flags2 |= TF2_ECN_PERMIT;
break;
case SCF_ACE_N:
/* Fallthrough */
case SCF_ACE_0:
/* Fallthrough */
case SCF_ACE_1:
tp->t_flags2 |= TF2_ACE_PERMIT;
tp->t_scep = 5;
tp->t_rcep = 5;
break;
case SCF_ACE_CE:
tp->t_flags2 |= TF2_ACE_PERMIT;
tp->t_scep = 6;
tp->t_rcep = 6;
break;
/* undefined SCF codepoint */
default:
break;
@ -261,15 +464,54 @@ tcp_ecn_syncache_add(uint16_t thflags, int iptos)
{
int scflags = 0;
switch (thflags & (TH_CWR|TH_ECE)) {
switch (thflags & (TH_AE|TH_CWR|TH_ECE)) {
/* no ECN */
case (0|0):
case (0|0|0):
break;
/* legacy ECN */
case (TH_CWR|TH_ECE):
case (0|TH_CWR|TH_ECE):
scflags = SCF_ECN;
break;
/* Accurate ECN */
case (TH_AE|TH_CWR|TH_ECE):
if ((V_tcp_do_ecn == 3) ||
(V_tcp_do_ecn == 4)) {
switch (iptos & IPTOS_ECN_MASK) {
case IPTOS_ECN_CE:
scflags = SCF_ACE_CE;
break;
case IPTOS_ECN_ECT0:
scflags = SCF_ACE_0;
break;
case IPTOS_ECN_ECT1:
scflags = SCF_ACE_1;
break;
case IPTOS_ECN_NOTECT:
scflags = SCF_ACE_N;
break;
}
} else
scflags = SCF_ECN;
break;
/* Default Case (section 3.1.2) */
default:
if ((V_tcp_do_ecn == 3) ||
(V_tcp_do_ecn == 4)) {
switch (iptos & IPTOS_ECN_MASK) {
case IPTOS_ECN_CE:
scflags = SCF_ACE_CE;
break;
case IPTOS_ECN_ECT0:
scflags = SCF_ACE_0;
break;
case IPTOS_ECN_ECT1:
scflags = SCF_ACE_1;
break;
case IPTOS_ECN_NOTECT:
scflags = SCF_ACE_N;
break;
}
}
break;
}
return scflags;
@ -286,9 +528,29 @@ tcp_ecn_syncache_respond(uint16_t thflags, struct syncache *sc)
(sc->sc_flags & SCF_ECN_MASK)) {
switch (sc->sc_flags & SCF_ECN_MASK) {
case SCF_ECN:
thflags |= (0 | TH_ECE);
thflags |= (0 | 0 | TH_ECE);
TCPSTAT_INC(tcps_ecn_shs);
break;
case SCF_ACE_N:
thflags |= (0 | TH_CWR | 0);
TCPSTAT_INC(tcps_ecn_shs);
TCPSTAT_INC(tcps_ace_nect);
break;
case SCF_ACE_0:
thflags |= (TH_AE | 0 | 0);
TCPSTAT_INC(tcps_ecn_shs);
TCPSTAT_INC(tcps_ace_ect0);
break;
case SCF_ACE_1:
thflags |= (0 | TH_ECE | TH_CWR);
TCPSTAT_INC(tcps_ecn_shs);
TCPSTAT_INC(tcps_ace_ect1);
break;
case SCF_ACE_CE:
thflags |= (TH_AE | TH_CWR | 0);
TCPSTAT_INC(tcps_ecn_shs);
TCPSTAT_INC(tcps_ace_ce);
break;
/* undefined SCF codepoint */
default:
break;
@ -296,3 +558,17 @@ tcp_ecn_syncache_respond(uint16_t thflags, struct syncache *sc)
}
return thflags;
}
/*
 * Assemble the 3-bit ACE value carried in the TCP header flags of a
 * segment: TH_AE contributes 4, TH_CWR contributes 2 and TH_ECE
 * contributes 1.  All other bits of thflags are ignored.
 */
int
tcp_ecn_get_ace(uint16_t thflags)
{
	const int ae_bit = (thflags & TH_AE) ? 4 : 0;
	const int cwr_bit = (thflags & TH_CWR) ? 2 : 0;
	const int ece_bit = (thflags & TH_ECE) ? 1 : 0;

	return ae_bit | cwr_bit | ece_bit;
}

View File

@ -49,6 +49,7 @@ int tcp_ecn_output_established(struct tcpcb *, uint16_t *, int, bool);
void tcp_ecn_syncache_socket(struct tcpcb *, struct syncache *);
int tcp_ecn_syncache_add(uint16_t, int);
uint16_t tcp_ecn_syncache_respond(uint16_t, struct syncache *);
int tcp_ecn_get_ace(uint16_t);
#endif /* _KERNEL */

View File

@ -1209,7 +1209,7 @@ tcp_default_output(struct tcpcb *tp)
}
/* Also handle parallel SYN for ECN */
if ((TCPS_HAVERCVDSYN(tp->t_state)) &&
(tp->t_flags2 & TF2_ECN_PERMIT)) {
(tp->t_flags2 & (TF2_ECN_PERMIT | TF2_ACE_PERMIT))) {
int ect = tcp_ecn_output_established(tp, &flags, len, sack_rxmit);
if ((tp->t_state == TCPS_SYN_RECEIVED) &&
(tp->t_flags2 & TF2_ECN_SND_ECE))

View File

@ -15883,7 +15883,7 @@ rack_fast_rsm_output(struct tcpcb *tp, struct tcp_rack *rack, struct rack_sendma
}
m->m_pkthdr.rcvif = (struct ifnet *)0;
if (TCPS_HAVERCVDSYN(tp->t_state) &&
(tp->t_flags2 & TF2_ECN_PERMIT)) {
(tp->t_flags2 & (TF2_ECN_PERMIT | TF2_ACE_PERMIT))) {
int ect = tcp_ecn_output_established(tp, &flags, len, true);
if ((tp->t_state == TCPS_SYN_RECEIVED) &&
(tp->t_flags2 & TF2_ECN_SND_ECE))
@ -16362,7 +16362,7 @@ rack_fast_output(struct tcpcb *tp, struct tcp_rack *rack, uint64_t ts_val,
}
m->m_pkthdr.rcvif = (struct ifnet *)0;
if (TCPS_HAVERCVDSYN(tp->t_state) &&
(tp->t_flags2 & TF2_ECN_PERMIT)) {
(tp->t_flags2 & (TF2_ECN_PERMIT | TF2_ACE_PERMIT))) {
int ect = tcp_ecn_output_established(tp, &flags, len, false);
if ((tp->t_state == TCPS_SYN_RECEIVED) &&
(tp->t_flags2 & TF2_ECN_SND_ECE))
@ -18487,7 +18487,7 @@ rack_output(struct tcpcb *tp)
}
/* Also handle parallel SYN for ECN */
if (TCPS_HAVERCVDSYN(tp->t_state) &&
(tp->t_flags2 & TF2_ECN_PERMIT)) {
(tp->t_flags2 & (TF2_ECN_PERMIT | TF2_ACE_PERMIT))) {
int ect = tcp_ecn_output_established(tp, &flags, len, sack_rxmit);
if ((tp->t_state == TCPS_SYN_RECEIVED) &&
(tp->t_flags2 & TF2_ECN_SND_ECE))
@ -20489,7 +20489,7 @@ rack_fill_info(struct tcpcb *tp, struct tcp_info *ti)
ti->tcpi_snd_wscale = tp->snd_scale;
ti->tcpi_rcv_wscale = tp->rcv_scale;
}
if (tp->t_flags2 & TF2_ECN_PERMIT)
if (tp->t_flags2 & (TF2_ECN_PERMIT | TF2_ACE_PERMIT))
ti->tcpi_options |= TCPI_OPT_ECN;
if (tp->t_flags & TF_FASTOPEN)
ti->tcpi_options |= TCPI_OPT_TFO;

View File

@ -249,8 +249,8 @@ struct tcpcb {
int t_dupacks; /* consecutive dup acks recd */
int t_lognum; /* Number of log entries */
int t_loglimit; /* Maximum number of log entries */
uint32_t r_cep; /* Number of received CE marked packets */
uint32_t s_cep; /* Synced number of delivered CE packets */
uint32_t t_rcep; /* Number of received CE marked packets */
uint32_t t_scep; /* Synced number of delivered CE packets */
int64_t t_pacing_rate; /* bytes / sec, -1 => unlimited */
struct tcp_log_stailq t_logs; /* Log buffer */
struct tcp_log_id_node *t_lin;