Auto sizing TCP socket buffers.

Normally the socket buffers are static (either derived from global defaults or set with setsockopt) and do not adapt to real network conditions. One of two things then happens: a) your socket buffers are too small and you can't reach the full potential of the network between both hosts; or b) your socket buffers are too big and you waste a lot of kernel memory on data just sitting around. With automatically sized TCP send and receive socket buffers we can start with a small buffer and quickly grow it in parallel with the TCP congestion window to match real network conditions. FreeBSD has a default 32K send socket buffer. This supports a maximal transfer rate of only slightly more than 2Mbit/s on a 100ms RTT trans-continental link, or just above 1Mbit/s at 200ms. With TCP send buffer auto scaling and the default values below it supports 20Mbit/s at 100ms and 10Mbit/s at 200ms. That's an improvement by a factor of 10, i.e. 1000% of the previous rate. The receive side looks slightly better with a default buffer size of 64K. New sysctls are: net.inet.tcp.sendbuf_auto=1 (enabled); net.inet.tcp.sendbuf_inc=8192 (8K, step size); net.inet.tcp.sendbuf_max=262144 (256K, growth limit); net.inet.tcp.recvbuf_auto=1 (enabled); net.inet.tcp.recvbuf_inc=16384 (16K, step size); net.inet.tcp.recvbuf_max=262144 (256K, growth limit). Tested by: many (on HEAD and RELENG_6). Approved by: re. MFC after: 1 month.
svn path=/head/; revision=166405
2007-02-01 18:32:13 +00:00 · 2007-02-01 18:32:13 +00:00 · 6741ecf595 · 2020-12-20 02:59:44 +00:00
commit 6741ecf595
parent 6a37f331d7
5 changed files with 236 additions and 10 deletions
--- a/sys/netinet/tcp_input.c
+++ b/sys/netinet/tcp_input.c
@ -161,6 +161,18 @@ SYSCTL_INT(_net_inet_tcp_reass, OID_AUTO, overflows, CTLFLAG_RD,
 	   &tcp_reass_overflows, 0,
 	   "Global number of TCP Segment Reassembly Queue Overflows");

+int	tcp_do_autorcvbuf = 1;
+SYSCTL_INT(_net_inet_tcp, OID_AUTO, recvbuf_auto, CTLFLAG_RW,
+	   &tcp_do_autorcvbuf, 0, "Enable automatic receive buffer sizing");
+
+int	tcp_autorcvbuf_inc = 16*1024;
+SYSCTL_INT(_net_inet_tcp, OID_AUTO, recvbuf_inc, CTLFLAG_RW,
+	   &tcp_autorcvbuf_inc, 0, "Incrementor step size of automatic receive buffer");
+
+int	tcp_autorcvbuf_max = 256*1024;
+SYSCTL_INT(_net_inet_tcp, OID_AUTO, recvbuf_max, CTLFLAG_RW,
+	   &tcp_autorcvbuf_max, 0, "Max size of automatic receive buffer");
+
 struct inpcbhead tcb;
 #define	tcb6	tcb  /* for KAME src sync over BSD*'s */
 struct inpcbinfo tcbinfo;
@ -1295,6 +1307,8 @@ tcp_input(m, off0)
 		} else if (th->th_ack == tp->snd_una &&
 		    LIST_EMPTY(&tp->t_segq) &&
 		    tlen <= sbspace(&so->so_rcv)) {
+			int newsize = 0;	/* automatic sockbuf scaling */
+
 			KASSERT(headlocked, ("headlocked"));
 			INP_INFO_WUNLOCK(&tcbinfo);
 			headlocked = 0;
@ -1321,18 +1335,78 @@ tcp_input(m, off0)
 			tcpstat.tcps_rcvpack++;
 			tcpstat.tcps_rcvbyte += tlen;
 			ND6_HINT(tp);	/* some progress has been done */
-			/*
 #ifdef TCPDEBUG
 			if (so->so_options & SO_DEBUG)
 				tcp_trace(TA_INPUT, ostate, tp,
 				    (void *)tcp_saveipgen, &tcp_savetcp, 0);
 #endif
-			 * Add data to socket buffer.
-			 */
+		/*
+		 * Automatic sizing of receive socket buffer.  Often the
+		 * receive buffer size is not optimally adjusted to the actual
+		 * network conditions at hand (delay bandwidth product).  Setting
+		 * the buffer size too small limits throughput on links with high
+		 * bandwidth and high delay (e.g. trans-continental/oceanic links).
+		 *
+		 * On the receive side the socket buffer memory is only rarely
+		 * used to any significant extent.  This allows us to be much
+		 * more aggressive in scaling the receive socket buffer.  For
+		 * the case that the buffer space is actually used to a large
+		 * extent and we run out of kernel memory we can simply drop
+		 * the new segments; TCP on the sender will just retransmit it
+		 * later.  Setting the buffer size too big may only consume too
+		 * much kernel memory if the application doesn't read() from
+		 * the socket or packet loss or reordering makes use of the
+		 * reassembly queue.
+		 *
+		 * The criteria to step up the receive buffer one notch are:
+		 *  1. the number of bytes received during the time it takes
+		 *     one timestamp to be reflected back to us (the RTT);
+		 *  2. received bytes per RTT exceed seven eighths of the
+		 *     current socket buffer size;
+		 *  3. receive buffer size has not hit maximal automatic size;
+		 *
+		 * This algorithm does one step per RTT at most and only if
+		 * we receive a bulk stream w/o packet losses or reorderings.
+		 * Shrinking the buffer during idle times is not necessary as
+		 * it doesn't consume any memory when idle.
+		 *
+		 * TODO: Only step up if the application is actually serving
+		 * the buffer to better manage the socket buffer resources.
+		 */
+			if (tcp_do_autorcvbuf &&
+			    to.to_tsecr &&
+			    (so->so_rcv.sb_flags & SB_AUTOSIZE)) {
+				if (to.to_tsecr > tp->rfbuf_ts &&
+				    to.to_tsecr - tp->rfbuf_ts < hz) {
+					if (tp->rfbuf_cnt >
+					    (so->so_rcv.sb_hiwat / 8 * 7) &&
+					    so->so_rcv.sb_hiwat <
+					    tcp_autorcvbuf_max) {
+						newsize =
+						    min(so->so_rcv.sb_hiwat +
+						    tcp_autorcvbuf_inc,
+						    tcp_autorcvbuf_max);
+					}
+					/* Start over with next RTT. */
+					tp->rfbuf_ts = 0;
+					tp->rfbuf_cnt = 0;
+				} else
+					tp->rfbuf_cnt += tlen;	/* add up */
+			}
+
+			/* Add data to socket buffer. */
 			SOCKBUF_LOCK(&so->so_rcv);
 			if (so->so_rcv.sb_state & SBS_CANTRCVMORE) {
 				m_freem(m);
 			} else {
+				/*
+				 * Set new socket buffer size.
+				 * Give up when limit is reached.
+				 */
+				if (newsize)
+					if (!sbreserve_locked(&so->so_rcv,
+					    newsize, so, curthread))
+						so->so_rcv.sb_flags &= ~SB_AUTOSIZE;
 				m_adj(m, drop_hdrlen);	/* delayed header drop */
 				sbappendstream_locked(&so->so_rcv, m);
 			}
@ -1361,6 +1435,10 @@ tcp_input(m, off0)
 	tp->rcv_wnd = imax(win, (int)(tp->rcv_adv - tp->rcv_nxt));
 	}

+	/* Reset receive buffer auto scaling when not in bulk receive mode. */
+	tp->rfbuf_ts = 0;
+	tp->rfbuf_cnt = 0;
+
 	switch (tp->t_state) {

 	/*
--- a/sys/netinet/tcp_output.c
+++ b/sys/netinet/tcp_output.c
@ -110,6 +110,19 @@ int	tcp_do_tso = 1;
 SYSCTL_INT(_net_inet_tcp, OID_AUTO, tso, CTLFLAG_RW,
 	&tcp_do_tso, 0, "Enable TCP Segmentation Offload");

+int	tcp_do_autosndbuf = 1;
+SYSCTL_INT(_net_inet_tcp, OID_AUTO, sendbuf_auto, CTLFLAG_RW,
+	&tcp_do_autosndbuf, 0, "Enable automatic send buffer sizing");
+
+int	tcp_autosndbuf_inc = 8*1024;
+SYSCTL_INT(_net_inet_tcp, OID_AUTO, sendbuf_inc, CTLFLAG_RW,
+	&tcp_autosndbuf_inc, 0, "Incrementor step size of automatic send buffer");
+
+int	tcp_autosndbuf_max = 256*1024;
+SYSCTL_INT(_net_inet_tcp, OID_AUTO, sendbuf_max, CTLFLAG_RW,
+	&tcp_autosndbuf_max, 0, "Max size of automatic send buffer");
+
+
 /*
 * Tcp output routine: figure out what should be sent and send it.
 */
@ -380,11 +393,60 @@ tcp_output(struct tcpcb *tp)
 		}
 	}

+	/* len will be >= 0 after this point. */
+	KASSERT(len >= 0, ("%s: len < 0", __func__));
+
 	/*
-	 * len will be >= 0 after this point.  Truncate to the maximum
-	 * segment length or enable TCP Segmentation Offloading (if supported
-	 * by hardware) and ensure that FIN is removed if the length no longer
-	 * contains the last data byte.
+	 * Automatic sizing of send socket buffer.  Often the send buffer
+	 * size is not optimally adjusted to the actual network conditions
+	 * at hand (delay bandwidth product).  Setting the buffer size too
+	 * small limits throughput on links with high bandwidth and high
+	 * delay (eg. trans-continental/oceanic links).  Setting the
+	 * buffer size too big consumes too much real kernel memory,
+	 * especially with many connections on busy servers.
+	 *
+	 * The criteria to step up the send buffer one notch are:
+	 *  1. receive window of remote host is larger than send buffer
+	 *     (with a fudge factor of 5/4th);
+	 *  2. send buffer is filled to 7/8th with data (so we actually
+	 *     have data to make use of it);
+	 *  3. send buffer fill has not hit maximal automatic size;
+	 *  4. our send window (slow start and congestion controlled) is
+	 *     larger than sent but unacknowledged data in send buffer.
+	 *
+	 * The remote host receive window scaling factor may limit the
+	 * growing of the send buffer before it reaches its allowed
+	 * maximum.
+	 *
+	 * It scales directly with slow start or congestion window
+	 * and does at most one step per received ACK.  This fast
+	 * scaling has the drawback of growing the send buffer beyond
+	 * what is strictly necessary to make full use of a given
+	 * delay*bandwidth product.  However, testing has shown this not
+	 * to be much of a problem.  At worst we are trading wasting
+	 * of available bandwidth (the non-use of it) for wasting some
+	 * socket buffer memory.
+	 *
+	 * TODO: Shrink send buffer during idle periods together
+	 * with congestion window.  Requires another timer.  Has to
+	 * wait for upcoming tcp timer rewrite.
+	 */
+	if (tcp_do_autosndbuf && so->so_snd.sb_flags & SB_AUTOSIZE) {
+		if ((tp->snd_wnd / 4 * 5) >= so->so_snd.sb_hiwat &&
+		    so->so_snd.sb_cc >= (so->so_snd.sb_hiwat / 8 * 7) &&
+		    so->so_snd.sb_cc < tcp_autosndbuf_max &&
+		    sendwin >= (so->so_snd.sb_cc - (tp->snd_nxt - tp->snd_una))) {
+			if (!sbreserve_locked(&so->so_snd,
+			    min(so->so_snd.sb_hiwat + tcp_autosndbuf_inc,
+			     tcp_autosndbuf_max), so, curthread))
+				so->so_snd.sb_flags &= ~SB_AUTOSIZE;
+		}
+	}
+
+	/*
+	 * Truncate to the maximum segment length or enable TCP Segmentation
+	 * Offloading (if supported by hardware) and ensure that FIN is removed
+	 * if the length no longer contains the last data byte.
 	 *
 	 * TSO may only be used if we are in a pure bulk sending state.  The
 	 * presence of TCP-MD5, SACK retransmits, SACK advertizements and
@ -606,6 +668,10 @@ tcp_output(struct tcpcb *tp)
 		optlen += TCPOLEN_TSTAMP_APPA;
 	}

+	/* Set receive buffer autosizing timestamp. */
+	if (tp->rfbuf_ts == 0 && (so->so_rcv.sb_flags & SB_AUTOSIZE))
+		tp->rfbuf_ts = ticks;
+
 #ifdef TCP_SIGNATURE
 #ifdef INET6
 	if (!isipv6)
--- a/sys/netinet/tcp_reass.c
+++ b/sys/netinet/tcp_reass.c
@ -161,6 +161,18 @@ SYSCTL_INT(_net_inet_tcp_reass, OID_AUTO, overflows, CTLFLAG_RD,
 	   &tcp_reass_overflows, 0,
 	   "Global number of TCP Segment Reassembly Queue Overflows");

+int	tcp_do_autorcvbuf = 1;
+SYSCTL_INT(_net_inet_tcp, OID_AUTO, recvbuf_auto, CTLFLAG_RW,
+	   &tcp_do_autorcvbuf, 0, "Enable automatic receive buffer sizing");
+
+int	tcp_autorcvbuf_inc = 16*1024;
+SYSCTL_INT(_net_inet_tcp, OID_AUTO, recvbuf_inc, CTLFLAG_RW,
+	   &tcp_autorcvbuf_inc, 0, "Incrementor step size of automatic receive buffer");
+
+int	tcp_autorcvbuf_max = 256*1024;
+SYSCTL_INT(_net_inet_tcp, OID_AUTO, recvbuf_max, CTLFLAG_RW,
+	   &tcp_autorcvbuf_max, 0, "Max size of automatic receive buffer");
+
 struct inpcbhead tcb;
 #define	tcb6	tcb  /* for KAME src sync over BSD*'s */
 struct inpcbinfo tcbinfo;
@ -1295,6 +1307,8 @@ tcp_input(m, off0)
 		} else if (th->th_ack == tp->snd_una &&
 		    LIST_EMPTY(&tp->t_segq) &&
 		    tlen <= sbspace(&so->so_rcv)) {
+			int newsize = 0;	/* automatic sockbuf scaling */
+
 			KASSERT(headlocked, ("headlocked"));
 			INP_INFO_WUNLOCK(&tcbinfo);
 			headlocked = 0;
@ -1321,18 +1335,78 @@ tcp_input(m, off0)
 			tcpstat.tcps_rcvpack++;
 			tcpstat.tcps_rcvbyte += tlen;
 			ND6_HINT(tp);	/* some progress has been done */
-			/*
 #ifdef TCPDEBUG
 			if (so->so_options & SO_DEBUG)
 				tcp_trace(TA_INPUT, ostate, tp,
 				    (void *)tcp_saveipgen, &tcp_savetcp, 0);
 #endif
-			 * Add data to socket buffer.
-			 */
+		/*
+		 * Automatic sizing of receive socket buffer.  Often the
+		 * receive buffer size is not optimally adjusted to the actual
+		 * network conditions at hand (delay bandwidth product).  Setting
+		 * the buffer size too small limits throughput on links with high
+		 * bandwidth and high delay (e.g. trans-continental/oceanic links).
+		 *
+		 * On the receive side the socket buffer memory is only rarely
+		 * used to any significant extent.  This allows us to be much
+		 * more aggressive in scaling the receive socket buffer.  For
+		 * the case that the buffer space is actually used to a large
+		 * extent and we run out of kernel memory we can simply drop
+		 * the new segments; TCP on the sender will just retransmit it
+		 * later.  Setting the buffer size too big may only consume too
+		 * much kernel memory if the application doesn't read() from
+		 * the socket or packet loss or reordering makes use of the
+		 * reassembly queue.
+		 *
+		 * The criteria to step up the receive buffer one notch are:
+		 *  1. the number of bytes received during the time it takes
+		 *     one timestamp to be reflected back to us (the RTT);
+		 *  2. received bytes per RTT exceed seven eighths of the
+		 *     current socket buffer size;
+		 *  3. receive buffer size has not hit maximal automatic size;
+		 *
+		 * This algorithm does one step per RTT at most and only if
+		 * we receive a bulk stream w/o packet losses or reorderings.
+		 * Shrinking the buffer during idle times is not necessary as
+		 * it doesn't consume any memory when idle.
+		 *
+		 * TODO: Only step up if the application is actually serving
+		 * the buffer to better manage the socket buffer resources.
+		 */
+			if (tcp_do_autorcvbuf &&
+			    to.to_tsecr &&
+			    (so->so_rcv.sb_flags & SB_AUTOSIZE)) {
+				if (to.to_tsecr > tp->rfbuf_ts &&
+				    to.to_tsecr - tp->rfbuf_ts < hz) {
+					if (tp->rfbuf_cnt >
+					    (so->so_rcv.sb_hiwat / 8 * 7) &&
+					    so->so_rcv.sb_hiwat <
+					    tcp_autorcvbuf_max) {
+						newsize =
+						    min(so->so_rcv.sb_hiwat +
+						    tcp_autorcvbuf_inc,
+						    tcp_autorcvbuf_max);
+					}
+					/* Start over with next RTT. */
+					tp->rfbuf_ts = 0;
+					tp->rfbuf_cnt = 0;
+				} else
+					tp->rfbuf_cnt += tlen;	/* add up */
+			}
+
+			/* Add data to socket buffer. */
 			SOCKBUF_LOCK(&so->so_rcv);
 			if (so->so_rcv.sb_state & SBS_CANTRCVMORE) {
 				m_freem(m);
 			} else {
+				/*
+				 * Set new socket buffer size.
+				 * Give up when limit is reached.
+				 */
+				if (newsize)
+					if (!sbreserve_locked(&so->so_rcv,
+					    newsize, so, curthread))
+						so->so_rcv.sb_flags &= ~SB_AUTOSIZE;
 				m_adj(m, drop_hdrlen);	/* delayed header drop */
 				sbappendstream_locked(&so->so_rcv, m);
 			}
@ -1361,6 +1435,10 @@ tcp_input(m, off0)
 	tp->rcv_wnd = imax(win, (int)(tp->rcv_adv - tp->rcv_nxt));
 	}

+	/* Reset receive buffer auto scaling when not in bulk receive mode. */
+	tp->rfbuf_ts = 0;
+	tp->rfbuf_cnt = 0;
+
 	switch (tp->t_state) {

 	/*
--- a/sys/netinet/tcp_usrreq.c
+++ b/sys/netinet/tcp_usrreq.c
@ -1446,6 +1446,8 @@ tcp_attach(so)
 		if (error)
 			return (error);
 	}
+	so->so_rcv.sb_flags |= SB_AUTOSIZE;
+	so->so_snd.sb_flags |= SB_AUTOSIZE;
 	INP_INFO_WLOCK(&tcbinfo);
 	error = in_pcballoc(so, &tcbinfo);
 	if (error) {
--- a/sys/netinet/tcp_var.h
+++ b/sys/netinet/tcp_var.h
@ -202,6 +202,8 @@ struct tcpcb {
 					   episode starts at this seq number */
 	struct sackhint	sackhint;	/* SACK scoreboard hint */
 	int	t_rttlow;		/* smallest observerved RTT */
+	u_int32_t	rfbuf_ts;	/* recv buffer autoscaling timestamp */
+	int	rfbuf_cnt;		/* recv buffer autoscaling byte count */
 };

 #define IN_FASTRECOVERY(tp)	(tp->t_flags & TF_FASTRECOVERY)