Implement TCP bandwidth delay product window limiting, similar to (but
not meant to duplicate) TCP/Vegas.  Add four sysctls and default the
implementation to 'off'.

net.inet.tcp.inflight_enable	enable algorithm (defaults to 0=off)
net.inet.tcp.inflight_debug	debugging (defaults to 1=on)
net.inet.tcp.inflight_min	minimum window limit
net.inet.tcp.inflight_max	maximum window limit

MFC after: 1 week
svn path=/head/; revision=102017
2002-08-17 18:26:02 +00:00 · 2002-08-17 18:26:02 +00:00 · 1fcc99b5de · 2020-12-20 02:59:44 +00:00
commit 1fcc99b5de
parent fecfd395b0
7 changed files with 338 additions and 0 deletions
--- a/sys/netinet/tcp_input.c
+++ b/sys/netinet/tcp_input.c
@ -991,6 +991,7 @@ tcp_input(m, off0)
 					    SEQ_GT(th->th_ack, tp->t_rtseq))
 					tcp_xmit_timer(tp,
 							ticks - tp->t_rtttime);
+				tcp_xmit_bandwidth_limit(tp, th->th_ack);
 				acked = th->th_ack - tp->snd_una;
 				tcpstat.tcps_rcvackpack++;
 				tcpstat.tcps_rcvackbyte += acked;
@ -1810,6 +1811,7 @@ tcp_input(m, off0)
 			tcp_xmit_timer(tp, ticks - to.to_tsecr + 1);
 		else if (tp->t_rtttime && SEQ_GT(th->th_ack, tp->t_rtseq))
 			tcp_xmit_timer(tp, ticks - tp->t_rtttime);
+		tcp_xmit_bandwidth_limit(tp, th->th_ack);

 		/*
 		 * If all outstanding data is acked, stop retransmit
@ -2438,6 +2440,8 @@ tcp_xmit_timer(tp, rtt)
 		delta -= tp->t_rttvar >> (TCP_RTTVAR_SHIFT - TCP_DELTA_SHIFT);
 		if ((tp->t_rttvar += delta) <= 0)
 			tp->t_rttvar = 1;
+		if (tp->t_rttbest > tp->t_srtt + tp->t_rttvar)
+		    tp->t_rttbest = tp->t_srtt + tp->t_rttvar;
 	} else {
 		/*
 		 * No rtt measurement yet - use the unsmoothed rtt.
@ -2446,6 +2450,7 @@ tcp_xmit_timer(tp, rtt)
 		 */
 		tp->t_srtt = rtt << TCP_RTT_SHIFT;
 		tp->t_rttvar = rtt << (TCP_RTTVAR_SHIFT - 1);
+		tp->t_rttbest = tp->t_srtt + tp->t_rttvar;
 	}
 	tp->t_rtttime = 0;
 	tp->t_rxtshift = 0;
@ -2573,6 +2578,7 @@ tcp_mss(tp, offer)
 		if (rt->rt_rmx.rmx_locks & RTV_RTT)
 			tp->t_rttmin = rtt / (RTM_RTTUNIT / hz);
 		tp->t_srtt = rtt / (RTM_RTTUNIT / (hz * TCP_RTT_SCALE));
+		tp->t_rttbest = tp->t_srtt + TCP_RTT_SCALE;
 		tcpstat.tcps_usedrtt++;
 		if (rt->rt_rmx.rmx_rttvar) {
 			tp->t_rttvar = rt->rt_rmx.rmx_rttvar /
--- a/sys/netinet/tcp_output.c
+++ b/sys/netinet/tcp_output.c
@ -168,6 +168,7 @@ tcp_output(struct tcpcb *tp)
 	sendalot = 0;
 	off = tp->snd_nxt - tp->snd_una;
 	win = min(tp->snd_wnd, tp->snd_cwnd);
+	win = min(win, tp->snd_bwnd);

 	flags = tcp_outflags[tp->t_state];
 	/*
--- a/sys/netinet/tcp_reass.c
+++ b/sys/netinet/tcp_reass.c
@ -991,6 +991,7 @@ tcp_input(m, off0)
 					    SEQ_GT(th->th_ack, tp->t_rtseq))
 					tcp_xmit_timer(tp,
 							ticks - tp->t_rtttime);
+				tcp_xmit_bandwidth_limit(tp, th->th_ack);
 				acked = th->th_ack - tp->snd_una;
 				tcpstat.tcps_rcvackpack++;
 				tcpstat.tcps_rcvackbyte += acked;
@ -1810,6 +1811,7 @@ tcp_input(m, off0)
 			tcp_xmit_timer(tp, ticks - to.to_tsecr + 1);
 		else if (tp->t_rtttime && SEQ_GT(th->th_ack, tp->t_rtseq))
 			tcp_xmit_timer(tp, ticks - tp->t_rtttime);
+		tcp_xmit_bandwidth_limit(tp, th->th_ack);

 		/*
 		 * If all outstanding data is acked, stop retransmit
@ -2438,6 +2440,8 @@ tcp_xmit_timer(tp, rtt)
 		delta -= tp->t_rttvar >> (TCP_RTTVAR_SHIFT - TCP_DELTA_SHIFT);
 		if ((tp->t_rttvar += delta) <= 0)
 			tp->t_rttvar = 1;
+		if (tp->t_rttbest > tp->t_srtt + tp->t_rttvar)
+		    tp->t_rttbest = tp->t_srtt + tp->t_rttvar;
 	} else {
 		/*
 		 * No rtt measurement yet - use the unsmoothed rtt.
@ -2446,6 +2450,7 @@ tcp_xmit_timer(tp, rtt)
 		 */
 		tp->t_srtt = rtt << TCP_RTT_SHIFT;
 		tp->t_rttvar = rtt << (TCP_RTTVAR_SHIFT - 1);
+		tp->t_rttbest = tp->t_srtt + tp->t_rttvar;
 	}
 	tp->t_rtttime = 0;
 	tp->t_rxtshift = 0;
@ -2573,6 +2578,7 @@ tcp_mss(tp, offer)
 		if (rt->rt_rmx.rmx_locks & RTV_RTT)
 			tp->t_rttmin = rtt / (RTM_RTTUNIT / hz);
 		tp->t_srtt = rtt / (RTM_RTTUNIT / (hz * TCP_RTT_SCALE));
+		tp->t_rttbest = tp->t_srtt + TCP_RTT_SCALE;
 		tcpstat.tcps_usedrtt++;
 		if (rt->rt_rmx.rmx_rttvar) {
 			tp->t_rttvar = rt->rt_rmx.rmx_rttvar /
--- a/sys/netinet/tcp_subr.c
+++ b/sys/netinet/tcp_subr.c
@ -146,6 +146,27 @@ static int	tcp_isn_reseed_interval = 0;
 SYSCTL_INT(_net_inet_tcp, OID_AUTO, isn_reseed_interval, CTLFLAG_RW,
    &tcp_isn_reseed_interval, 0, "Seconds between reseeding of ISN secret");

+/*
+ * TCP bandwidth limiting sysctls (net.inet.tcp.inflight_*).  Note that the
+ * default lower bound of 1024 exists only for debugging.  A good production
+ * default would be something like 6100.
+ */
+static int	tcp_inflight_enable = 0;	/* 0: tcp_xmit_bandwidth_limit() leaves snd_bwnd at its maximum */
+SYSCTL_INT(_net_inet_tcp, OID_AUTO, inflight_enable, CTLFLAG_RW,
+    &tcp_inflight_enable, 0, "Enable automatic TCP inflight data limiting");
+
+static int	tcp_inflight_debug = 1;	/* > 0: printf at most once per (hz / value) ticks */
+SYSCTL_INT(_net_inet_tcp, OID_AUTO, inflight_debug, CTLFLAG_RW,
+    &tcp_inflight_debug, 0, "Debug TCP inflight calculations");
+
+static int	tcp_inflight_min = 1024;	/* floor applied to the computed snd_bwnd */
+SYSCTL_INT(_net_inet_tcp, OID_AUTO, inflight_min, CTLFLAG_RW,
+    &tcp_inflight_min, 0, "Lower-bound for TCP inflight window");
+
+static int	tcp_inflight_max = TCP_MAXWIN << TCP_MAX_WINSHIFT;	/* ceiling; same value snd_bwnd starts at */
+SYSCTL_INT(_net_inet_tcp, OID_AUTO, inflight_max, CTLFLAG_RW,
+    &tcp_inflight_max, 0, "Upper-bound for TCP inflight window");
+
 static void	tcp_cleartaocache(void);
 static struct inpcb *tcp_notify(struct inpcb *, int);

@ -566,8 +587,10 @@ tcp_newtcpcb(inp)
 	tp->t_rttmin = tcp_rexmit_min;
 	tp->t_rxtcur = TCPTV_RTOBASE;
 	tp->snd_cwnd = TCP_MAXWIN << TCP_MAX_WINSHIFT;
+	tp->snd_bwnd = TCP_MAXWIN << TCP_MAX_WINSHIFT;
 	tp->snd_ssthresh = TCP_MAXWIN << TCP_MAX_WINSHIFT;
 	tp->t_rcvtime = ticks;
+	tp->t_bw_rtttime = ticks;
        /*
 	 * IPv4 TTL initialization is necessary for an IPv6 socket as well,
 	 * because the socket may be bound to an IPv6 wildcard address,
@ -1531,3 +1554,138 @@ static void
 tcp_cleartaocache()
 {
 }
+
+/*
+ * TCP BANDWIDTH DELAY PRODUCT WINDOW LIMITING
+ *
+ * This code attempts to calculate the bandwidth-delay product as a
+ * means of determining the optimal window size to maximize bandwidth,
+ * minimize RTT, and avoid the over-allocation of buffers on interfaces and
+ * routers.  This code also does a fairly good job keeping RTTs in check
+ * across slow links like modems.  We implement an algorithm which is very
+ * similar to (but not meant to be) TCP/Vegas.  The code operates on the
+ * transmitter side of a TCP connection and so only affects the transmit
+ * side of the connection.
+ *
+ * BACKGROUND:  TCP makes no provision for the management of buffer space
+ * at the end points or at the intermediate routers and switches.  A TCP 
+ * stream, whether using NewReno or not, will eventually buffer as
+ * many packets as it is able and the only reason this typically works is
+ * due to the fairly small default buffers made available for a connection
+ * (typically 16K or 32K).  As machines use larger windows and/or window
+ * scaling it is now fairly easy for even a single TCP connection to blow-out
+ * all available buffer space not only on the local interface, but on 
+ * intermediate routers and switches as well.  NewReno makes a misguided
+ * attempt to 'solve' this problem by waiting for an actual failure to occur,
+ * then backing off, then steadily increasing the window again until another
+ * failure occurs, ad-infinitum.  This results in terrible oscillation that
+ * is only made worse as network loads increase and the idea of intentionally
+ * blowing out network buffers is, frankly, a terrible way to manage network
+ * resources.
+ *
+ * It is far better to limit the transmit window prior to the failure
+ * condition being achieved.  There are two general ways to do this:  First
+ * you can 'scan' through different transmit window sizes and locate the
+ * point where the RTT stops increasing, indicating that you have filled the
+ * pipe, then scan backwards until you note that RTT stops decreasing, then
+ * repeat ad-infinitum.  This method works in principle but has severe
+ * implementation issues due to RTT variances, timer granularity, and
+ * instability in the algorithm which can lead to many false positives and
+ * create oscillations as well as interact badly with other TCP streams
+ * implementing the same algorithm.
+ *
+ * The second method is to limit the window to the bandwidth delay product
+ * of the link.  This is the method we implement.  RTT variances and our
+ * own manipulation of the congestion window, bwnd, can potentially 
+ * destabilize the algorithm.  For this reason we have to stabilize the
+ * elements used to calculate the window.  We do this by using the minimum
+ * observed RTT, the long term average of the observed bandwidth, and
+ * by adding two segments worth of slop.  It isn't perfect but it is able
+ * to react to changing conditions and gives us a very stable basis on
+ * which to extend the algorithm.
+ */
+void
+tcp_xmit_bandwidth_limit(struct tcpcb *tp, tcp_seq ack_seq)
+{
+	u_long bw;		/* bandwidth estimate: acked seq space * hz / ticks */
+	u_long bwnd;		/* candidate value for tp->snd_bwnd */
+	int save_ticks;		/* snapshot of 'ticks' for this call */
+
+	/*
+	 * If inflight_enable is turned off in the middle of a tcp connection,
+	 * reset snd_bwnd to its initial value and clear the bandwidth average.
+	 */
+	if (tcp_inflight_enable == 0) {
+		tp->snd_bwnd = TCP_MAXWIN << TCP_MAX_WINSHIFT;
+		tp->snd_bandwidth = 0;
+		return;
+	}
+
+	/*
+	 * Figure out the bandwidth.  Due to the tick granularity this
+	 * is a very rough number and it MUST be averaged over a fairly
+	 * long period of time.  XXX we need to take into account a link
+	 * that is not using all available bandwidth, but for now our
+	 * slop will ramp us up if this case occurs and the bandwidth later
+	 * increases.
+	 */
+	save_ticks = ticks;
+	if ((u_int)(save_ticks - tp->t_bw_rtttime) < 1)
+		return;		/* no tick elapsed; divisor below would be 0 */
+
+	bw = (int64_t)(ack_seq - tp->t_bw_rtseq) * hz / 
+	    (save_ticks - tp->t_bw_rtttime);
+	tp->t_bw_rtttime = save_ticks;
+	tp->t_bw_rtseq = ack_seq;
+	if (tp->t_bw_rtttime == 0)	/* NOTE(review): tests the value just stored (save_ticks) */
+		return;
+	bw = ((int64_t)tp->snd_bandwidth * 15 + bw) >> 4;	/* 15/16 old average + 1/16 new sample */
+
+	tp->snd_bandwidth = bw;
+
+	/*
+	 * Calculate the semi-static bandwidth delay product, plus two maximal
+	 * segments.  The additional slop puts us squarely in the sweet
+	 * spot and also handles the bandwidth run-up case.  Without the
+	 * slop we could be locking ourselves into a lower bandwidth.
+	 *
+	 * Situations Handled:
+	 *	(1) Prevents over-queueing of packets on LANs, especially on
+	 *	    high speed LANs, allowing larger TCP buffers to be
+	 *	    specified, and also does a good job preventing 
+	 *	    over-queueing of packets over choke points like modems
+	 *	    (at least for the transmit side).
+	 *
+	 *	(2) Is able to handle changing network loads (bandwidth
+	 *	    drops so bwnd drops, bandwidth increases so bwnd
+	 *	    increases).
+	 *
+	 *	(3) Theoretically should stabilize in the face of multiple
+	 *	    connections implementing the same algorithm (this may need
+	 *	    a little work).
+	 */
+#define USERTT	((tp->t_srtt + tp->t_rttbest) / 2)	/* mean of srtt and rttbest; NOTE(review): not #undef'd in this function */
+	bwnd = (int64_t)bw * USERTT / (hz << TCP_RTT_SHIFT) + 2 * tp->t_maxseg;
+
+	if (tcp_inflight_debug > 0) {
+		static int ltime;	/* tick of last debug print; one copy for all calls */
+		if ((u_int)(ticks - ltime) >= hz / tcp_inflight_debug) {
+			ltime = ticks;
+			printf("%p bw %ld rttbest %d srtt %d bwnd %ld\n",
+			    tp,
+			    bw,		/* NOTE(review): u_long, printed via %ld */
+			    tp->t_rttbest,	/* NOTE(review): u_int, printed via %d */
+			    tp->t_srtt,
+			    bwnd	/* NOTE(review): u_long, printed via %ld */
+			);
+		}
+	}
+	if ((long)bwnd < tcp_inflight_min)
+		bwnd = tcp_inflight_min;	/* sysctl floor */
+	if (bwnd > tcp_inflight_max)
+		bwnd = tcp_inflight_max;	/* sysctl ceiling */
+	if ((long)bwnd < tp->t_maxseg * 2)
+		bwnd = tp->t_maxseg * 2;	/* applied last, so this floor overrides the ceiling */
+	tp->snd_bwnd = bwnd;
+}
+
--- a/sys/netinet/tcp_timewait.c
+++ b/sys/netinet/tcp_timewait.c
@ -146,6 +146,27 @@ static int	tcp_isn_reseed_interval = 0;
 SYSCTL_INT(_net_inet_tcp, OID_AUTO, isn_reseed_interval, CTLFLAG_RW,
    &tcp_isn_reseed_interval, 0, "Seconds between reseeding of ISN secret");

+/*
+ * TCP bandwidth limiting sysctls (net.inet.tcp.inflight_*).  Note that the
+ * default lower bound of 1024 exists only for debugging.  A good production
+ * default would be something like 6100.
+ */
+static int	tcp_inflight_enable = 0;	/* 0: tcp_xmit_bandwidth_limit() leaves snd_bwnd at its maximum */
+SYSCTL_INT(_net_inet_tcp, OID_AUTO, inflight_enable, CTLFLAG_RW,
+    &tcp_inflight_enable, 0, "Enable automatic TCP inflight data limiting");
+
+static int	tcp_inflight_debug = 1;	/* > 0: printf at most once per (hz / value) ticks */
+SYSCTL_INT(_net_inet_tcp, OID_AUTO, inflight_debug, CTLFLAG_RW,
+    &tcp_inflight_debug, 0, "Debug TCP inflight calculations");
+
+static int	tcp_inflight_min = 1024;	/* floor applied to the computed snd_bwnd */
+SYSCTL_INT(_net_inet_tcp, OID_AUTO, inflight_min, CTLFLAG_RW,
+    &tcp_inflight_min, 0, "Lower-bound for TCP inflight window");
+
+static int	tcp_inflight_max = TCP_MAXWIN << TCP_MAX_WINSHIFT;	/* ceiling; same value snd_bwnd starts at */
+SYSCTL_INT(_net_inet_tcp, OID_AUTO, inflight_max, CTLFLAG_RW,
+    &tcp_inflight_max, 0, "Upper-bound for TCP inflight window");
+
 static void	tcp_cleartaocache(void);
 static struct inpcb *tcp_notify(struct inpcb *, int);

@ -566,8 +587,10 @@ tcp_newtcpcb(inp)
 	tp->t_rttmin = tcp_rexmit_min;
 	tp->t_rxtcur = TCPTV_RTOBASE;
 	tp->snd_cwnd = TCP_MAXWIN << TCP_MAX_WINSHIFT;
+	tp->snd_bwnd = TCP_MAXWIN << TCP_MAX_WINSHIFT;
 	tp->snd_ssthresh = TCP_MAXWIN << TCP_MAX_WINSHIFT;
 	tp->t_rcvtime = ticks;
+	tp->t_bw_rtttime = ticks;
        /*
 	 * IPv4 TTL initialization is necessary for an IPv6 socket as well,
 	 * because the socket may be bound to an IPv6 wildcard address,
@ -1531,3 +1554,138 @@ static void
 tcp_cleartaocache()
 {
 }
+
+/*
+ * TCP BANDWIDTH DELAY PRODUCT WINDOW LIMITING
+ *
+ * This code attempts to calculate the bandwidth-delay product as a
+ * means of determining the optimal window size to maximize bandwidth,
+ * minimize RTT, and avoid the over-allocation of buffers on interfaces and
+ * routers.  This code also does a fairly good job keeping RTTs in check
+ * across slow links like modems.  We implement an algorithm which is very
+ * similar to (but not meant to be) TCP/Vegas.  The code operates on the
+ * transmitter side of a TCP connection and so only affects the transmit
+ * side of the connection.
+ *
+ * BACKGROUND:  TCP makes no provision for the management of buffer space
+ * at the end points or at the intermediate routers and switches.  A TCP 
+ * stream, whether using NewReno or not, will eventually buffer as
+ * many packets as it is able and the only reason this typically works is
+ * due to the fairly small default buffers made available for a connection
+ * (typically 16K or 32K).  As machines use larger windows and/or window
+ * scaling it is now fairly easy for even a single TCP connection to blow-out
+ * all available buffer space not only on the local interface, but on 
+ * intermediate routers and switches as well.  NewReno makes a misguided
+ * attempt to 'solve' this problem by waiting for an actual failure to occur,
+ * then backing off, then steadily increasing the window again until another
+ * failure occurs, ad-infinitum.  This results in terrible oscillation that
+ * is only made worse as network loads increase and the idea of intentionally
+ * blowing out network buffers is, frankly, a terrible way to manage network
+ * resources.
+ *
+ * It is far better to limit the transmit window prior to the failure
+ * condition being achieved.  There are two general ways to do this:  First
+ * you can 'scan' through different transmit window sizes and locate the
+ * point where the RTT stops increasing, indicating that you have filled the
+ * pipe, then scan backwards until you note that RTT stops decreasing, then
+ * repeat ad-infinitum.  This method works in principle but has severe
+ * implementation issues due to RTT variances, timer granularity, and
+ * instability in the algorithm which can lead to many false positives and
+ * create oscillations as well as interact badly with other TCP streams
+ * implementing the same algorithm.
+ *
+ * The second method is to limit the window to the bandwidth delay product
+ * of the link.  This is the method we implement.  RTT variances and our
+ * own manipulation of the congestion window, bwnd, can potentially 
+ * destabilize the algorithm.  For this reason we have to stabilize the
+ * elements used to calculate the window.  We do this by using the minimum
+ * observed RTT, the long term average of the observed bandwidth, and
+ * by adding two segments worth of slop.  It isn't perfect but it is able
+ * to react to changing conditions and gives us a very stable basis on
+ * which to extend the algorithm.
+ */
+void
+tcp_xmit_bandwidth_limit(struct tcpcb *tp, tcp_seq ack_seq)
+{
+	u_long bw;		/* bandwidth estimate: acked seq space * hz / ticks */
+	u_long bwnd;		/* candidate value for tp->snd_bwnd */
+	int save_ticks;		/* snapshot of 'ticks' for this call */
+
+	/*
+	 * If inflight_enable is turned off in the middle of a tcp connection,
+	 * reset snd_bwnd to its initial value and clear the bandwidth average.
+	 */
+	if (tcp_inflight_enable == 0) {
+		tp->snd_bwnd = TCP_MAXWIN << TCP_MAX_WINSHIFT;
+		tp->snd_bandwidth = 0;
+		return;
+	}
+
+	/*
+	 * Figure out the bandwidth.  Due to the tick granularity this
+	 * is a very rough number and it MUST be averaged over a fairly
+	 * long period of time.  XXX we need to take into account a link
+	 * that is not using all available bandwidth, but for now our
+	 * slop will ramp us up if this case occurs and the bandwidth later
+	 * increases.
+	 */
+	save_ticks = ticks;
+	if ((u_int)(save_ticks - tp->t_bw_rtttime) < 1)
+		return;		/* no tick elapsed; divisor below would be 0 */
+
+	bw = (int64_t)(ack_seq - tp->t_bw_rtseq) * hz / 
+	    (save_ticks - tp->t_bw_rtttime);
+	tp->t_bw_rtttime = save_ticks;
+	tp->t_bw_rtseq = ack_seq;
+	if (tp->t_bw_rtttime == 0)	/* NOTE(review): tests the value just stored (save_ticks) */
+		return;
+	bw = ((int64_t)tp->snd_bandwidth * 15 + bw) >> 4;	/* 15/16 old average + 1/16 new sample */
+
+	tp->snd_bandwidth = bw;
+
+	/*
+	 * Calculate the semi-static bandwidth delay product, plus two maximal
+	 * segments.  The additional slop puts us squarely in the sweet
+	 * spot and also handles the bandwidth run-up case.  Without the
+	 * slop we could be locking ourselves into a lower bandwidth.
+	 *
+	 * Situations Handled:
+	 *	(1) Prevents over-queueing of packets on LANs, especially on
+	 *	    high speed LANs, allowing larger TCP buffers to be
+	 *	    specified, and also does a good job preventing 
+	 *	    over-queueing of packets over choke points like modems
+	 *	    (at least for the transmit side).
+	 *
+	 *	(2) Is able to handle changing network loads (bandwidth
+	 *	    drops so bwnd drops, bandwidth increases so bwnd
+	 *	    increases).
+	 *
+	 *	(3) Theoretically should stabilize in the face of multiple
+	 *	    connections implementing the same algorithm (this may need
+	 *	    a little work).
+	 */
+#define USERTT	((tp->t_srtt + tp->t_rttbest) / 2)	/* mean of srtt and rttbest; NOTE(review): not #undef'd in this function */
+	bwnd = (int64_t)bw * USERTT / (hz << TCP_RTT_SHIFT) + 2 * tp->t_maxseg;
+
+	if (tcp_inflight_debug > 0) {
+		static int ltime;	/* tick of last debug print; one copy for all calls */
+		if ((u_int)(ticks - ltime) >= hz / tcp_inflight_debug) {
+			ltime = ticks;
+			printf("%p bw %ld rttbest %d srtt %d bwnd %ld\n",
+			    tp,
+			    bw,		/* NOTE(review): u_long, printed via %ld */
+			    tp->t_rttbest,	/* NOTE(review): u_int, printed via %d */
+			    tp->t_srtt,
+			    bwnd	/* NOTE(review): u_long, printed via %ld */
+			);
+		}
+	}
+	if ((long)bwnd < tcp_inflight_min)
+		bwnd = tcp_inflight_min;	/* sysctl floor */
+	if (bwnd > tcp_inflight_max)
+		bwnd = tcp_inflight_max;	/* sysctl ceiling */
+	if ((long)bwnd < tp->t_maxseg * 2)
+		bwnd = tp->t_maxseg * 2;	/* applied last, so this floor overrides the ceiling */
+	tp->snd_bwnd = bwnd;
+}
+
--- a/sys/netinet/tcp_usrreq.c
+++ b/sys/netinet/tcp_usrreq.c
@ -875,6 +875,7 @@ tcp_connect(tp, nam, td)
 	tp->t_state = TCPS_SYN_SENT;
 	callout_reset(tp->tt_keep, tcp_keepinit, tcp_timer_keep, tp);
 	tp->iss = tcp_new_isn(tp);
+	tp->t_bw_rtseq = tp->iss;
 	tcp_sendseqinit(tp);

 	/*
@ -961,6 +962,7 @@ tcp6_connect(tp, nam, td)
 	tp->t_state = TCPS_SYN_SENT;
 	callout_reset(tp->tt_keep, tcp_keepinit, tcp_timer_keep, tp);
 	tp->iss = tcp_new_isn(tp);
+	tp->t_bw_rtseq = tp->iss;
 	tcp_sendseqinit(tp);

 	/*
--- a/sys/netinet/tcp_var.h
+++ b/sys/netinet/tcp_var.h
@ -124,10 +124,12 @@ struct tcpcb {

 	u_long	snd_wnd;		/* send window */
 	u_long	snd_cwnd;		/* congestion-controlled window */
+	u_long	snd_bwnd;		/* bandwidth-controlled window */
 	u_long	snd_ssthresh;		/* snd_cwnd size threshold for
 					 * for slow start exponential to
 					 * linear switch
 					 */
+	u_long	snd_bandwidth;		/* calculated bandwidth or 0 */
 	tcp_seq	snd_recover;		/* for use in fast recovery */

 	u_int	t_maxopd;		/* mss plus options */
@ -137,6 +139,9 @@ struct tcpcb {
 	int	t_rtttime;		/* round trip time */
 	tcp_seq	t_rtseq;		/* sequence number being timed */

+	int	t_bw_rtttime;		/* used for bandwidth calculation */
+	tcp_seq	t_bw_rtseq;		/* used for bandwidth calculation */
+
 	int	t_rxtcur;		/* current retransmit value (ticks) */
 	u_int	t_maxseg;		/* maximum segment size */
 	int	t_srtt;			/* smoothed round-trip time */
@ -144,6 +149,7 @@ struct tcpcb {

 	int	t_rxtshift;		/* log(2) of rexmt exp. backoff */
 	u_int	t_rttmin;		/* minimum rtt allowed */
+	u_int	t_rttbest;		/* best rtt we've seen */
 	u_long	t_rttupdated;		/* number of times rtt sampled */
 	u_long	max_sndwnd;		/* largest window peer has offered */

@ -473,6 +479,7 @@ void	 tcp_fillheaders(struct tcpcb *, void *, void *);
 struct tcpcb *
 	 tcp_timers(struct tcpcb *, int);
 void	 tcp_trace(int, int, struct tcpcb *, void *, struct tcphdr *, int);
+void	 tcp_xmit_bandwidth_limit(struct tcpcb *tp, tcp_seq ack_seq);
 void	 syncache_init(void);
 void	 syncache_unreach(struct in_conninfo *, struct tcphdr *);
 int	 syncache_expand(struct in_conninfo *, struct tcphdr *,