MFp4 (//depot/projects/tcpecn/):

TCP ECN support. Merge of my GSoC 2006 work for NetBSD.
  TCP ECN is defined in RFC 3168.

Partly reviewed by:	dwmalone, silby
Obtained from:		NetBSD
This commit is contained in:
Rui Paulo 2008-07-31 15:10:09 +00:00
parent 6d9e8f2b3a
commit f2512ba12a
Notes: svn2git 2020-12-20 02:59:44 +00:00
svn path=/head/; revision=181056
5 changed files with 155 additions and 14 deletions

View File

@ -128,6 +128,14 @@ SYSCTL_INT(_net_inet_tcp, OID_AUTO, rfc3390, CTLFLAG_RW,
&tcp_do_rfc3390, 0,
"Enable RFC 3390 (Increasing TCP's Initial Congestion Window)");
/*
 * TCP ECN tunables.  Both variables are exported read/write through the
 * "ecn" sysctl node created below under _net_inet_tcp.
 */
/* Non-zero enables ECN negotiation; initialised to 0 (disabled). */
int tcp_do_ecn = 0;
/*
 * Upper bound compared against t_rxtshift in tcp_output(): retransmitted
 * SYNs keep the ECE|CWR bits only while t_rxtshift <= this value.
 */
int tcp_ecn_maxretries = 1;
SYSCTL_NODE(_net_inet_tcp, OID_AUTO, ecn, CTLFLAG_RW, 0, "TCP ECN");
SYSCTL_INT(_net_inet_tcp_ecn, OID_AUTO, enable, CTLFLAG_RW,
&tcp_do_ecn, 0, "TCP ECN support");
SYSCTL_INT(_net_inet_tcp_ecn, OID_AUTO, maxretries, CTLFLAG_RW,
&tcp_ecn_maxretries, 0, "Max retries before giving up on ECN");
static int tcp_insecure_rst = 0;
SYSCTL_INT(_net_inet_tcp, OID_AUTO, insecure_rst, CTLFLAG_RW,
&tcp_insecure_rst, 0,
@ -152,13 +160,31 @@ struct inpcbinfo tcbinfo;
static void tcp_dooptions(struct tcpopt *, u_char *, int, int);
static void tcp_do_segment(struct mbuf *, struct tcphdr *,
struct socket *, struct tcpcb *, int, int);
struct socket *, struct tcpcb *, int, int, uint8_t);
static void tcp_dropwithreset(struct mbuf *, struct tcphdr *,
struct tcpcb *, int, int);
static void tcp_pulloutofband(struct socket *,
struct tcphdr *, struct mbuf *, int);
static void tcp_xmit_timer(struct tcpcb *, int);
static void tcp_newreno_partial_ack(struct tcpcb *, struct tcphdr *);
static void inline
tcp_congestion_exp(struct tcpcb *);
/*
 * Shrink the sender's slow-start threshold in response to a congestion
 * indication and put the connection into fast recovery.
 *
 * The new ssthresh is half of min(snd_wnd, snd_cwnd), rounded down to a
 * whole number of segments and never less than two segments.  snd_recover
 * is set to snd_max.  If ECN was negotiated (TF_ECN_PERMIT), TF_ECN_SND_CWR
 * is raised so that tcp_output() sets TH_CWR on an outgoing segment.
 */
static void inline
tcp_congestion_exp(struct tcpcb *tp)
{
	u_int nsegs;

	/* Half the usable window, expressed in full-sized segments. */
	nsegs = min(tp->snd_wnd, tp->snd_cwnd) / 2 / tp->t_maxseg;

	/* Enforce the two-segment floor while converting back to bytes. */
	tp->snd_ssthresh = (nsegs < 2 ? 2 : nsegs) * tp->t_maxseg;

	ENTER_FASTRECOVERY(tp);
	tp->snd_recover = tp->snd_max;

	/* Only an ECN-capable connection owes the peer a CWR. */
	if ((tp->t_flags & TF_ECN_PERMIT) != 0) {
		tp->t_flags |= TF_ECN_SND_CWR;
	}
}
/* Neighbor Discovery, Neighbor Unreachability Detection Upper layer hint. */
#ifdef INET6
@ -238,6 +264,7 @@ tcp_input(struct mbuf *m, int off0)
int drop_hdrlen;
int thflags;
int rstreason = 0; /* For badport_bandlim accounting purposes */
uint8_t iptos;
#ifdef IPFIREWALL_FORWARD
struct m_tag *fwd_tag;
#endif
@ -347,6 +374,13 @@ tcp_input(struct mbuf *m, int off0)
ip->ip_v = IPVERSION;
}
#ifdef INET6
if (isipv6)
iptos = (ntohl(ip6->ip6_flow) >> 20) & 0xff;
else
#endif
iptos = ip->ip_tos;
/*
* Check that TCP offset makes sense,
* pull out TCP options and adjust length. XXX
@ -643,7 +677,8 @@ tcp_input(struct mbuf *m, int off0)
* contains. tcp_do_segment() consumes
* the mbuf chain and unlocks the inpcb.
*/
tcp_do_segment(m, th, so, tp, drop_hdrlen, tlen);
tcp_do_segment(m, th, so, tp, drop_hdrlen, tlen,
iptos);
INP_INFO_UNLOCK_ASSERT(&tcbinfo);
return;
}
@ -843,7 +878,7 @@ tcp_input(struct mbuf *m, int off0)
* state. tcp_do_segment() always consumes the mbuf chain, unlocks
* the inpcb, and unlocks pcbinfo.
*/
tcp_do_segment(m, th, so, tp, drop_hdrlen, tlen);
tcp_do_segment(m, th, so, tp, drop_hdrlen, tlen, iptos);
INP_INFO_UNLOCK_ASSERT(&tcbinfo);
return;
@ -867,7 +902,7 @@ tcp_input(struct mbuf *m, int off0)
static void
tcp_do_segment(struct mbuf *m, struct tcphdr *th, struct socket *so,
struct tcpcb *tp, int drop_hdrlen, int tlen)
struct tcpcb *tp, int drop_hdrlen, int tlen, uint8_t iptos)
{
int thflags, acked, ourfinisacked, needoutput = 0;
int headlocked = 1;
@ -909,6 +944,37 @@ tcp_do_segment(struct mbuf *m, struct tcphdr *th, struct socket *so,
*/
tiwin = th->th_win << tp->snd_scale;
/*
* TCP ECN processing.
*/
if (tp->t_flags & TF_ECN_PERMIT) {
switch (iptos & IPTOS_ECN_MASK) {
case IPTOS_ECN_CE:
tp->t_flags |= TF_ECN_SND_ECE;
tcpstat.tcps_ecn_ce++;
break;
case IPTOS_ECN_ECT0:
tcpstat.tcps_ecn_ect0++;
break;
case IPTOS_ECN_ECT1:
tcpstat.tcps_ecn_ect1++;
break;
}
if (thflags & TH_CWR)
tp->t_flags &= ~TF_ECN_SND_ECE;
/*
* Congestion experienced.
* Ignore if we are already trying to recover.
*/
if ((thflags & TH_ECE) &&
SEQ_LEQ(th->th_ack, tp->snd_recover)) {
tcpstat.tcps_ecn_rcwnd++;
tcp_congestion_exp(tp);
}
}
/*
* Parse options on any incoming segment.
*/
@ -1254,6 +1320,8 @@ tcp_do_segment(struct mbuf *m, struct tcphdr *th, struct socket *so,
* Otherwise this is an acceptable SYN segment
* initialize tp->rcv_nxt and tp->irs
* if seg contains ack then advance tp->snd_una
* if seg contains an ECE and ECN support is enabled, the stream
* is ECN capable.
* if SYN has been acked change to ESTABLISHED else SYN_RCVD state
* arrange for segment to be acked (eventually)
* continue processing rest of data/controls, beginning with URG
@ -1298,6 +1366,12 @@ tcp_do_segment(struct mbuf *m, struct tcphdr *th, struct socket *so,
tcp_delacktime);
else
tp->t_flags |= TF_ACKNOW;
if ((thflags & TH_ECE) && tcp_do_ecn) {
tp->t_flags |= TF_ECN_PERMIT;
tcpstat.tcps_ecn_shs++;
}
/*
* Received <SYN,ACK> in SYN_SENT[*] state.
* Transitions:
@ -1759,6 +1833,9 @@ tcp_do_segment(struct mbuf *m, struct tcphdr *th, struct socket *so,
* so bump cwnd by the amount in the receiver
* to keep a constant cwnd packets in the
* network.
*
* When using TCP ECN, notify the peer that
* we reduced the cwnd.
*/
if (!tcp_timer_active(tp, TT_REXMT) ||
th->th_ack != tp->snd_una)
@ -1790,7 +1867,6 @@ tcp_do_segment(struct mbuf *m, struct tcphdr *th, struct socket *so,
goto drop;
} else if (tp->t_dupacks == tcprexmtthresh) {
tcp_seq onxt = tp->snd_nxt;
u_int win;
/*
* If we're doing sack, check to
@ -1804,20 +1880,15 @@ tcp_do_segment(struct mbuf *m, struct tcphdr *th, struct socket *so,
tp->t_dupacks = 0;
break;
}
} else if (tcp_do_newreno) {
} else if (tcp_do_newreno ||
tcp_do_ecn) {
if (SEQ_LEQ(th->th_ack,
tp->snd_recover)) {
tp->t_dupacks = 0;
break;
}
}
win = min(tp->snd_wnd, tp->snd_cwnd) /
2 / tp->t_maxseg;
if (win < 2)
win = 2;
tp->snd_ssthresh = win * tp->t_maxseg;
ENTER_FASTRECOVERY(tp);
tp->snd_recover = tp->snd_max;
tcp_congestion_exp(tp);
tcp_timer_activate(tp, TT_REXMT, 0);
tp->t_rtttime = 0;
if (tp->t_flags & TF_SACK_PERMIT) {

View File

@ -876,6 +876,49 @@ tcp_output(struct tcpcb *tp)
if (flags & TH_FIN && tp->t_flags & TF_SENTFIN &&
tp->snd_nxt == tp->snd_max)
tp->snd_nxt--;
/*
* If we are starting a connection, send ECN setup
* SYN packet. If we are on a retransmit, we may
* resend those bits a number of times as per
* RFC 3168.
*/
if (tp->t_state == TCPS_SYN_SENT && tcp_do_ecn) {
if (tp->t_rxtshift >= 1) {
if (tp->t_rxtshift <= tcp_ecn_maxretries)
flags |= TH_ECE|TH_CWR;
} else
flags |= TH_ECE|TH_CWR;
}
if (tp->t_state == TCPS_ESTABLISHED &&
(tp->t_flags & TF_ECN_PERMIT)) {
/*
* If the peer has ECN, mark data packets with
* ECN capable transmission (ECT).
* Ignore pure ack packets, retransmissions and window probes.
*/
if (len > 0 && SEQ_GEQ(tp->snd_nxt, tp->snd_max) &&
!((tp->t_flags & TF_FORCEDATA) && len == 1)) {
#ifdef INET6
if (isipv6)
ip6->ip6_flow |= htonl(IPTOS_ECN_ECT0 << 20);
else
#endif
ip->ip_tos |= IPTOS_ECN_ECT0;
tcpstat.tcps_ecn_ect0++;
}
/*
* Reply with proper ECN notifications.
*/
if (tp->t_flags & TF_ECN_SND_CWR) {
flags |= TH_CWR;
tp->t_flags &= ~TF_ECN_SND_CWR;
}
if (tp->t_flags & TF_ECN_SND_ECE)
flags |= TH_ECE;
}
/*
* If we are doing retransmissions, then snd_nxt will
* not reflect the first unsent octet. For ACK only

View File

@ -129,7 +129,7 @@ struct syncache {
u_int8_t sc_ip_tos; /* IPv4 TOS */
u_int8_t sc_requested_s_scale:4,
sc_requested_r_scale:4;
u_int8_t sc_flags;
u_int16_t sc_flags;
#define SCF_NOOPT 0x01 /* no TCP options */
#define SCF_WINSCALE 0x02 /* negotiated window scaling */
#define SCF_TIMESTAMP 0x04 /* negotiated timestamps */
@ -137,6 +137,7 @@ struct syncache {
#define SCF_UNREACH 0x10 /* icmp unreachable received */
#define SCF_SIGNATURE 0x20 /* send MD5 digests */
#define SCF_SACK 0x80 /* send SACK option */
#define SCF_ECN 0x100 /* send ECN setup packet */
#ifndef TCP_OFFLOAD_DISABLE
struct toe_usrreqs *sc_tu; /* TOE operations */
void *sc_toepcb; /* TOE protocol block */
@ -807,6 +808,9 @@ syncache_socket(struct syncache *sc, struct socket *lso, struct mbuf *m)
tp->t_flags |= TF_SACK_PERMIT;
}
if (sc->sc_flags & SCF_ECN)
tp->t_flags |= TF_ECN_PERMIT;
/*
* Set up MSS and get cached values from tcp_hostcache.
* This might overwrite some of the defaults we just set.
@ -1231,6 +1235,8 @@ _syncache_add(struct in_conninfo *inc, struct tcpopt *to, struct tcphdr *th,
sc->sc_peer_mss = to->to_mss; /* peer mss may be zero */
if (noopt)
sc->sc_flags |= SCF_NOOPT;
if ((th->th_flags & (TH_ECE|TH_CWR)) && tcp_do_ecn)
sc->sc_flags |= SCF_ECN;
if (tcp_syncookies) {
syncookie_generate(sch, sc, &flowtmp);
@ -1369,6 +1375,11 @@ syncache_respond(struct syncache *sc)
th->th_win = htons(sc->sc_wnd);
th->th_urp = 0;
if (sc->sc_flags & SCF_ECN) {
th->th_flags |= TH_ECE;
tcpstat.tcps_ecn_shs++;
}
/* Tack on the TCP options. */
if ((sc->sc_flags & SCF_NOOPT) == 0) {
to.to_flags = 0;

View File

@ -1754,6 +1754,10 @@ db_print_tflags(u_int t_flags)
db_printf("%sTF_TSO", comma ? ", " : "");
comma = 1;
}
if (t_flags & TF_ECN_PERMIT) {
db_printf("%sTF_ECN_PERMIT", comma ? ", " : "");
comma = 1;
}
}
static void

View File

@ -124,6 +124,9 @@ struct tcpcb {
#define TF_FORCEDATA 0x800000 /* force out a byte */
#define TF_TSO 0x1000000 /* TSO enabled on this connection */
#define TF_TOE 0x2000000 /* this connection is offloaded */
#define TF_ECN_PERMIT 0x4000000 /* connection ECN-ready */
#define TF_ECN_SND_CWR 0x8000000 /* ECN CWR in queue */
#define TF_ECN_SND_ECE 0x10000000 /* ECN ECE in queue */
tcp_seq snd_una; /* send unacknowledged */
tcp_seq snd_max; /* highest sequence number sent;
@ -433,6 +436,13 @@ struct tcpstat {
u_long tcps_sack_rcv_blocks; /* SACK blocks (options) received */
u_long tcps_sack_send_blocks; /* SACK blocks (options) sent */
u_long tcps_sack_sboverflow; /* times scoreboard overflowed */
/* ECN related stats */
u_long tcps_ecn_ce; /* ECN Congestion Experienced */
u_long tcps_ecn_ect0; /* ECN Capable Transport */
u_long tcps_ecn_ect1; /* ECN Capable Transport */
u_long tcps_ecn_shs; /* ECN successful handshakes */
u_long tcps_ecn_rcwnd; /* # times ECN reduced the cwnd */
};
/*
@ -509,6 +519,8 @@ extern int ss_fltsz_local;
extern int tcp_do_sack; /* SACK enabled/disabled */
extern int tcp_sc_rst_sock_fail; /* RST on sock alloc failure */
extern int tcp_do_ecn; /* TCP ECN enabled/disabled */
extern int tcp_ecn_maxretries; /* max SYN retransmits still carrying ECN setup bits */
int tcp_addoptions(struct tcpopt *, u_char *);
struct tcpcb *