From d5deb43d0fd1476cec4cf2c44126170317fbe5fe Mon Sep 17 00:00:00 2001 From: lstewart Date: Thu, 15 Jan 2009 06:44:22 +0000 Subject: [PATCH] Add TCP Appropriate Byte Counting (RFC 3465) support to kernel. The new behaviour is on by default, and can be disabled by setting the net.inet.tcp.rfc3465 sysctl to 0 to obtain previous behaviour. The patch changes struct tcpcb in sys/netinet/tcp_var.h which breaks the ABI. Bump __FreeBSD_version to 800061 accordingly. User space tools that rely on the size of struct tcpcb (e.g. sockstat) need to be recompiled. Reviewed by: rpaulo, gnn Approved by: gnn, kmacy (mentors) Sponsored by: FreeBSD Foundation --- UPDATING | 6 ++++ sys/netinet/tcp_input.c | 70 +++++++++++++++++++++++++++++++++++------ sys/netinet/tcp_subr.c | 2 ++ sys/netinet/tcp_timer.c | 1 + sys/netinet/tcp_var.h | 1 + sys/netinet/vinet.h | 4 +++ sys/sys/param.h | 2 +- 7 files changed, 75 insertions(+), 11 deletions(-) diff --git a/UPDATING b/UPDATING index 70c2265d1f33..b6225f3f7f43 100644 --- a/UPDATING +++ b/UPDATING @@ -22,6 +22,12 @@ NOTE TO PEOPLE WHO THINK THAT FreeBSD 8.x IS SLOW: to maximize performance. (To disable malloc debugging, run ln -s aj /etc/malloc.conf.) +20090115: + TCP Appropriate Byte Counting (RFC 3465) support added to kernel. + New field in struct tcpcb breaks ABI, so bump __FreeBSD_version to + 800061. User space tools that rely on the size of struct tcpcb in + tcp_var.h (e.g. sockstat) need to be recompiled. + 20081225: ng_tty(4) module updated to match the new TTY subsystem. Due to API change, user-level applications must be updated. 
diff --git a/sys/netinet/tcp_input.c b/sys/netinet/tcp_input.c index 30b3fde2f61a..d3b91b642a49 100644 --- a/sys/netinet/tcp_input.c +++ b/sys/netinet/tcp_input.c @@ -117,6 +117,8 @@ int tcp_insecure_rst; int tcp_do_autorcvbuf; int tcp_autorcvbuf_inc; int tcp_autorcvbuf_max; +int tcp_do_rfc3465; +int tcp_abc_l_var; #endif SYSCTL_V_STRUCT(V_NET, vnet_inet, _net_inet_tcp, TCPCTL_STATS, stats, @@ -144,6 +146,13 @@ SYSCTL_V_INT(V_NET, vnet_inet, _net_inet_tcp, OID_AUTO, rfc3390, CTLFLAG_RW, tcp_do_rfc3390, 0, "Enable RFC 3390 (Increasing TCP's Initial Congestion Window)"); +SYSCTL_V_INT(V_NET, vnet_inet, _net_inet_tcp, OID_AUTO, rfc3465, CTLFLAG_RW, + tcp_do_rfc3465, 0, + "Enable RFC 3465 (Appropriate Byte Counting)"); +SYSCTL_V_INT(V_NET, vnet_inet, _net_inet_tcp, OID_AUTO, abc_l_var, CTLFLAG_RW, + tcp_abc_l_var, 2, + "Cap the max cwnd increment during slow-start to this number of segments"); + SYSCTL_NODE(_net_inet_tcp, OID_AUTO, ecn, CTLFLAG_RW, 0, "TCP ECN"); SYSCTL_V_INT(V_NET, vnet_inet, _net_inet_tcp_ecn, OID_AUTO, enable, CTLFLAG_RW, tcp_do_ecn, 0, "TCP ECN support"); @@ -2293,20 +2302,59 @@ process_ACK: /* * When new data is acked, open the congestion window. - * If the window gives us less than ssthresh packets - * in flight, open exponentially (maxseg per packet). - * Otherwise open linearly: maxseg per window - * (maxseg^2 / cwnd per packet). - * If cwnd > maxseg^2, fix the cwnd increment at 1 byte - * to avoid capping cwnd (as suggested in RFC2581). + * Method depends on which congestion control state we're + * in (slow start or cong avoid) and if ABC (RFC 3465) is + * enabled. + * + * slow start: cwnd <= ssthresh + * cong avoid: cwnd > ssthresh + * + * slow start and ABC (RFC 3465): + * Grow cwnd exponentially by the amount of data + * ACKed capping the max increment per ACK to + * (abc_l_var * maxseg) bytes. + * + * slow start without ABC (RFC 2581): + * Grow cwnd exponentially by maxseg per ACK. 
+ * + * cong avoid and ABC (RFC 3465): + * Grow cwnd linearly by maxseg per RTT for each + * cwnd worth of ACKed data. + * + * cong avoid without ABC (RFC 2581): + * Grow cwnd linearly by approximately maxseg per RTT using + * maxseg^2 / cwnd per ACK as the increment. + * If cwnd > maxseg^2, fix the cwnd increment at 1 byte to + * avoid capping cwnd. */ if ((!V_tcp_do_newreno && !(tp->t_flags & TF_SACK_PERMIT)) || !IN_FASTRECOVERY(tp)) { u_int cw = tp->snd_cwnd; u_int incr = tp->t_maxseg; - if (cw > tp->snd_ssthresh) - incr = max((incr * incr / cw), 1); - tp->snd_cwnd = min(cw+incr, TCP_MAXWIN<<tp->snd_scale); + /* In congestion avoidance? */ + if (cw > tp->snd_ssthresh) { + if (V_tcp_do_rfc3465) { + tp->t_bytes_acked += acked; + if (tp->t_bytes_acked >= tp->snd_cwnd) + tp->t_bytes_acked -= cw; + else + incr = 0; + } + else + incr = max((incr * incr / cw), 1); + /* + * In slow-start with ABC enabled and no RTO in sight? + * (Must not use abc_l_var > 1 if slow starting after an + * RTO. On RTO, snd_nxt = snd_una, so the snd_nxt == + * snd_max check is sufficient to handle this). + */ + } else if (V_tcp_do_rfc3465 && + tp->snd_nxt == tp->snd_max) + incr = min(acked, + V_tcp_abc_l_var * tp->t_maxseg); + /* ABC is on by default, so (incr == 0) frequently. 
*/ + if (incr > 0) + tp->snd_cwnd = min(cw+incr, TCP_MAXWIN<<tp->snd_scale); } SOCKBUF_LOCK(&so->so_snd); if (acked > so->so_snd.sb_cc) { @@ -2328,8 +2376,10 @@ process_ACK: tp->snd_recover = th->th_ack - 1; if ((V_tcp_do_newreno || (tp->t_flags & TF_SACK_PERMIT)) && IN_FASTRECOVERY(tp) && - SEQ_GEQ(th->th_ack, tp->snd_recover)) + SEQ_GEQ(th->th_ack, tp->snd_recover)) { EXIT_FASTRECOVERY(tp); + tp->t_bytes_acked = 0; + } tp->snd_una = th->th_ack; if (tp->t_flags & TF_SACK_PERMIT) { if (SEQ_GT(tp->snd_una, tp->snd_recover)) diff --git a/sys/netinet/tcp_subr.c b/sys/netinet/tcp_subr.c index 9cb941ab7b91..53fc8827d0ec 100644 --- a/sys/netinet/tcp_subr.c +++ b/sys/netinet/tcp_subr.c @@ -316,6 +316,8 @@ tcp_init(void) V_tcp_do_autorcvbuf = 1; V_tcp_autorcvbuf_inc = 16*1024; V_tcp_autorcvbuf_max = 256*1024; + V_tcp_do_rfc3465 = 1; + V_tcp_abc_l_var = 2; V_tcp_mssdflt = TCP_MSS; #ifdef INET6 diff --git a/sys/netinet/tcp_timer.c b/sys/netinet/tcp_timer.c index acce92f19307..6963d9c46397 100644 --- a/sys/netinet/tcp_timer.c +++ b/sys/netinet/tcp_timer.c @@ -587,6 +587,7 @@ tcp_timer_rexmt(void * xtp) tp->t_dupacks = 0; } EXIT_FASTRECOVERY(tp); + tp->t_bytes_acked = 0; (void) tcp_output(tp); out: diff --git a/sys/netinet/tcp_var.h b/sys/netinet/tcp_var.h index a4392cb989b6..306514a5aee9 100644 --- a/sys/netinet/tcp_var.h +++ b/sys/netinet/tcp_var.h @@ -189,6 +189,7 @@ struct tcpcb { void *t_pspare[3]; /* toe usrreqs / toepcb * / congestion algo / vimage / 1 general use */ struct toe_usrreqs *t_tu; /* offload operations vector */ void *t_toe; /* TOE pcb pointer */ + int t_bytes_acked; /* # bytes acked during current RTT */ }; /* diff --git a/sys/netinet/vinet.h b/sys/netinet/vinet.h index 449334eaf39a..618afaa50124 100644 --- a/sys/netinet/vinet.h +++ b/sys/netinet/vinet.h @@ -127,6 +127,8 @@ struct vnet_inet { int _drop_synfin; int _tcp_do_rfc3042; int _tcp_do_rfc3390; + int _tcp_do_rfc3465; + int _tcp_abc_l_var; int _tcp_do_ecn; int _tcp_ecn_maxretries; int _tcp_insecure_rst; @@ 
-291,6 +293,7 @@ extern struct vnet_inet vnet_inet_0; #define V_subnetsarelocal VNET_INET(subnetsarelocal) #define V_tcb VNET_INET(tcb) #define V_tcbinfo VNET_INET(tcbinfo) +#define V_tcp_abc_l_var VNET_INET(tcp_abc_l_var) #define V_tcp_autorcvbuf_inc VNET_INET(tcp_autorcvbuf_inc) #define V_tcp_autorcvbuf_max VNET_INET(tcp_autorcvbuf_max) #define V_tcp_autosndbuf_inc VNET_INET(tcp_autosndbuf_inc) @@ -303,6 +306,7 @@ extern struct vnet_inet vnet_inet_0; #define V_tcp_do_rfc1323 VNET_INET(tcp_do_rfc1323) #define V_tcp_do_rfc3042 VNET_INET(tcp_do_rfc3042) #define V_tcp_do_rfc3390 VNET_INET(tcp_do_rfc3390) +#define V_tcp_do_rfc3465 VNET_INET(tcp_do_rfc3465) #define V_tcp_do_sack VNET_INET(tcp_do_sack) #define V_tcp_do_tso VNET_INET(tcp_do_tso) #define V_tcp_ecn_maxretries VNET_INET(tcp_ecn_maxretries) diff --git a/sys/sys/param.h b/sys/sys/param.h index fad4d518a077..770067e32a68 100644 --- a/sys/sys/param.h +++ b/sys/sys/param.h @@ -57,7 +57,7 @@ * is created, otherwise 1. */ #undef __FreeBSD_version -#define __FreeBSD_version 800061 /* Master, propagated to newvers */ +#define __FreeBSD_version 800061 /* Master, propagated to newvers */ #ifndef LOCORE #include <sys/types.h>