Remove the TCP inflight bandwidth limiter as announced in r211315

to make way for the pluggable congestion control framework.  It is
the task of the congestion control algorithm to set the congestion
window and amount of inflight data without external interference.

In 'struct tcpcb' the variables previously used by the inflight
limiter are renamed to spares to keep the ABI intact and to have
some more space for future extensions.

In 'struct tcp_info' the variable 'tcpi_snd_bwnd' is not removed to
preserve the ABI.  It is always set to 0.

In siftr.c in 'struct pkt_node' the variable 'snd_bwnd' is not removed
to preserve the ABI.  It is always set to 0.

These unused variables in the various structures may be reused in the
future or garbage collected before the next release or at some other
point when an ABI change happens anyway for other reasons.

No MFC is planned.  The inflight bandwidth limiter stays disabled by
default in the other branches but remains available.
This commit is contained in:
Andre Oppermann 2010-09-16 21:06:45 +00:00
parent 9ed03f0231
commit 1c18314d17
8 changed files with 16 additions and 235 deletions

View File

@ -193,7 +193,7 @@ struct pkt_node {
u_long snd_wnd;
/* Receive Window (bytes). */
u_long rcv_wnd;
/* Bandwidth Controlled Window (bytes). */
/* Unused (was: Bandwidth Controlled Window (bytes)). */
u_long snd_bwnd;
/* Slow Start Threshold (bytes). */
u_long snd_ssthresh;
@ -775,7 +775,7 @@ siftr_siftdata(struct pkt_node *pn, struct inpcb *inp, struct tcpcb *tp,
pn->snd_cwnd = tp->snd_cwnd;
pn->snd_wnd = tp->snd_wnd;
pn->rcv_wnd = tp->rcv_wnd;
pn->snd_bwnd = tp->snd_bwnd;
pn->snd_bwnd = 0; /* Unused, kept for compat. */
pn->snd_ssthresh = tp->snd_ssthresh;
pn->snd_scale = tp->snd_scale;
pn->rcv_scale = tp->rcv_scale;

View File

@ -221,7 +221,7 @@ struct tcp_info {
/* FreeBSD extensions to tcp_info. */
u_int32_t tcpi_snd_wnd; /* Advertised send window. */
u_int32_t tcpi_snd_bwnd; /* Bandwidth send window. */
u_int32_t tcpi_snd_bwnd; /* No longer used. */
u_int32_t tcpi_snd_nxt; /* Next egress seqno */
u_int32_t tcpi_rcv_nxt; /* Next ingress seqno */
u_int32_t tcpi_toe_tid; /* HWTID for TOE endpoints */

View File

@ -1321,7 +1321,6 @@ tcp_do_segment(struct mbuf *m, struct tcphdr *th, struct socket *so,
tcp_xmit_timer(tp,
ticks - tp->t_rtttime);
}
tcp_xmit_bandwidth_limit(tp, th->th_ack);
acked = th->th_ack - tp->snd_una;
TCPSTAT_INC(tcps_rcvackpack);
TCPSTAT_ADD(tcps_rcvackbyte, acked);
@ -2278,7 +2277,6 @@ process_ACK:
tp->t_rttlow = ticks - tp->t_rtttime;
tcp_xmit_timer(tp, ticks - tp->t_rtttime);
}
tcp_xmit_bandwidth_limit(tp, th->th_ack);
/*
* If all outstanding data is acked, stop retransmit
@ -3328,8 +3326,6 @@ tcp_mss(struct tcpcb *tp, int offer)
tp->snd_ssthresh = max(2 * mss, metrics.rmx_ssthresh);
TCPSTAT_INC(tcps_usedssthresh);
}
if (metrics.rmx_bandwidth)
tp->snd_bandwidth = metrics.rmx_bandwidth;
/*
* Set the slow-start flight size depending on whether this

View File

@ -225,7 +225,6 @@ again:
tso = 0;
off = tp->snd_nxt - tp->snd_una;
sendwin = min(tp->snd_wnd, tp->snd_cwnd);
sendwin = min(sendwin, tp->snd_bwnd);
flags = tcp_outflags[tp->t_state];
/*

View File

@ -160,14 +160,6 @@ SYSCTL_VNET_PROC(_net_inet_tcp, TCPCTL_V6MSSDFLT, v6mssdflt,
"Default TCP Maximum Segment Size for IPv6");
#endif
static int
vnet_sysctl_msec_to_ticks(SYSCTL_HANDLER_ARGS)
{
VNET_SYSCTL_ARG(req, arg1);
return (sysctl_msec_to_ticks(oidp, arg1, arg2, req));
}
/*
* Minimum MSS we accept and use. This prevents DoS attacks where
* we are forced to a ridiculous low MSS like 20 and send hundreds
@ -213,50 +205,6 @@ SYSCTL_VNET_INT(_net_inet_tcp, OID_AUTO, isn_reseed_interval, CTLFLAG_RW,
&VNET_NAME(tcp_isn_reseed_interval), 0,
"Seconds between reseeding of ISN secret");
/*
* TCP bandwidth limiting sysctls. Note that the default lower bound of
* 1024 exists only for debugging. A good production default would be
* something like 6100.
*/
SYSCTL_NODE(_net_inet_tcp, OID_AUTO, inflight, CTLFLAG_RW, 0,
"TCP inflight data limiting");
static VNET_DEFINE(int, tcp_inflight_enable) = 0;
#define V_tcp_inflight_enable VNET(tcp_inflight_enable)
SYSCTL_VNET_INT(_net_inet_tcp_inflight, OID_AUTO, enable, CTLFLAG_RW,
&VNET_NAME(tcp_inflight_enable), 0,
"Enable automatic TCP inflight data limiting");
static int tcp_inflight_debug = 0;
SYSCTL_INT(_net_inet_tcp_inflight, OID_AUTO, debug, CTLFLAG_RW,
&tcp_inflight_debug, 0,
"Debug TCP inflight calculations");
static VNET_DEFINE(int, tcp_inflight_rttthresh);
#define V_tcp_inflight_rttthresh VNET(tcp_inflight_rttthresh)
SYSCTL_VNET_PROC(_net_inet_tcp_inflight, OID_AUTO, rttthresh,
CTLTYPE_INT|CTLFLAG_RW, &VNET_NAME(tcp_inflight_rttthresh), 0,
vnet_sysctl_msec_to_ticks, "I",
"RTT threshold below which inflight will deactivate itself");
static VNET_DEFINE(int, tcp_inflight_min) = 6144;
#define V_tcp_inflight_min VNET(tcp_inflight_min)
SYSCTL_VNET_INT(_net_inet_tcp_inflight, OID_AUTO, min, CTLFLAG_RW,
&VNET_NAME(tcp_inflight_min), 0,
"Lower-bound for TCP inflight window");
static VNET_DEFINE(int, tcp_inflight_max) = TCP_MAXWIN << TCP_MAX_WINSHIFT;
#define V_tcp_inflight_max VNET(tcp_inflight_max)
SYSCTL_VNET_INT(_net_inet_tcp_inflight, OID_AUTO, max, CTLFLAG_RW,
&VNET_NAME(tcp_inflight_max), 0,
"Upper-bound for TCP inflight window");
static VNET_DEFINE(int, tcp_inflight_stab) = 20;
#define V_tcp_inflight_stab VNET(tcp_inflight_stab)
SYSCTL_VNET_INT(_net_inet_tcp_inflight, OID_AUTO, stab, CTLFLAG_RW,
&VNET_NAME(tcp_inflight_stab), 0,
"Inflight Algorithm Stabilization 20 = 2 packets");
#ifdef TCP_SORECEIVE_STREAM
static int tcp_soreceive_stream = 0;
SYSCTL_INT(_net_inet_tcp, OID_AUTO, soreceive_stream, CTLFLAG_RDTUN,
@ -338,8 +286,6 @@ tcp_init(void)
in_pcbinfo_init(&V_tcbinfo, "tcp", &V_tcb, hashsize, hashsize,
"tcp_inpcb", tcp_inpcb_init, NULL, UMA_ZONE_NOFREE);
V_tcp_inflight_rttthresh = TCPTV_INFLIGHT_RTTTHRESH;
/*
* These have to be type stable for the benefit of the timers.
*/
@ -728,10 +674,8 @@ tcp_newtcpcb(struct inpcb *inp)
tp->t_rttmin = tcp_rexmit_min;
tp->t_rxtcur = TCPTV_RTOBASE;
tp->snd_cwnd = TCP_MAXWIN << TCP_MAX_WINSHIFT;
tp->snd_bwnd = TCP_MAXWIN << TCP_MAX_WINSHIFT;
tp->snd_ssthresh = TCP_MAXWIN << TCP_MAX_WINSHIFT;
tp->t_rcvtime = ticks;
tp->t_bw_rtttime = ticks;
/*
* IPv4 TTL initialization is necessary for an IPv6 socket as well,
* because the socket may be bound to an IPv6 wildcard address,
@ -849,8 +793,6 @@ tcp_discardcb(struct tcpcb *tp)
metrics.rmx_rtt = tp->t_srtt;
metrics.rmx_rttvar = tp->t_rttvar;
/* XXX: This wraps if the pipe is more than 4 Gbit per second */
metrics.rmx_bandwidth = tp->snd_bandwidth;
metrics.rmx_cwnd = tp->snd_cwnd;
metrics.rmx_sendpipe = 0;
metrics.rmx_recvpipe = 0;
@ -1773,154 +1715,6 @@ ipsec_hdrsiz_tcp(struct tcpcb *tp)
}
#endif /* IPSEC */
/*
* TCP BANDWIDTH DELAY PRODUCT WINDOW LIMITING
*
* This code attempts to calculate the bandwidth-delay product as a
* means of determining the optimal window size to maximize bandwidth,
* minimize RTT, and avoid the over-allocation of buffers on interfaces and
* routers. This code also does a fairly good job keeping RTTs in check
* across slow links like modems. We implement an algorithm which is very
* similar (but not meant to be) TCP/Vegas. The code operates on the
* transmitter side of a TCP connection and so only effects the transmit
* side of the connection.
*
* BACKGROUND: TCP makes no provision for the management of buffer space
* at the end points or at the intermediate routers and switches. A TCP
* stream, whether using NewReno or not, will eventually buffer as
* many packets as it is able and the only reason this typically works is
* due to the fairly small default buffers made available for a connection
* (typicaly 16K or 32K). As machines use larger windows and/or window
* scaling it is now fairly easy for even a single TCP connection to blow-out
* all available buffer space not only on the local interface, but on
* intermediate routers and switches as well. NewReno makes a misguided
* attempt to 'solve' this problem by waiting for an actual failure to occur,
* then backing off, then steadily increasing the window again until another
* failure occurs, ad-infinitum. This results in terrible oscillation that
* is only made worse as network loads increase and the idea of intentionally
* blowing out network buffers is, frankly, a terrible way to manage network
* resources.
*
* It is far better to limit the transmit window prior to the failure
* condition being achieved. There are two general ways to do this: First
* you can 'scan' through different transmit window sizes and locate the
* point where the RTT stops increasing, indicating that you have filled the
* pipe, then scan backwards until you note that RTT stops decreasing, then
* repeat ad-infinitum. This method works in principle but has severe
* implementation issues due to RTT variances, timer granularity, and
* instability in the algorithm which can lead to many false positives and
* create oscillations as well as interact badly with other TCP streams
* implementing the same algorithm.
*
* The second method is to limit the window to the bandwidth delay product
* of the link. This is the method we implement. RTT variances and our
* own manipulation of the congestion window, bwnd, can potentially
* destabilize the algorithm. For this reason we have to stabilize the
* elements used to calculate the window. We do this by using the minimum
* observed RTT, the long term average of the observed bandwidth, and
* by adding two segments worth of slop. It isn't perfect but it is able
* to react to changing conditions and gives us a very stable basis on
* which to extend the algorithm.
*/
void
tcp_xmit_bandwidth_limit(struct tcpcb *tp, tcp_seq ack_seq)
{
u_long bw;
u_long bwnd;
int save_ticks;
INP_WLOCK_ASSERT(tp->t_inpcb);
/*
* If inflight_enable is disabled in the middle of a tcp connection,
* make sure snd_bwnd is effectively disabled.
*/
if (V_tcp_inflight_enable == 0 ||
tp->t_rttlow < V_tcp_inflight_rttthresh) {
tp->snd_bwnd = TCP_MAXWIN << TCP_MAX_WINSHIFT;
tp->snd_bandwidth = 0;
return;
}
/*
* Figure out the bandwidth. Due to the tick granularity this
* is a very rough number and it MUST be averaged over a fairly
* long period of time. XXX we need to take into account a link
* that is not using all available bandwidth, but for now our
* slop will ramp us up if this case occurs and the bandwidth later
* increases.
*
* Note: if ticks rollover 'bw' may wind up negative. We must
* effectively reset t_bw_rtttime for this case.
*/
save_ticks = ticks;
if ((u_int)(save_ticks - tp->t_bw_rtttime) < 1)
return;
bw = (int64_t)(ack_seq - tp->t_bw_rtseq) * hz /
(save_ticks - tp->t_bw_rtttime);
tp->t_bw_rtttime = save_ticks;
tp->t_bw_rtseq = ack_seq;
if (tp->t_bw_rtttime == 0 || (int)bw < 0)
return;
bw = ((int64_t)tp->snd_bandwidth * 15 + bw) >> 4;
tp->snd_bandwidth = bw;
/*
* Calculate the semi-static bandwidth delay product, plus two maximal
* segments. The additional slop puts us squarely in the sweet
* spot and also handles the bandwidth run-up case and stabilization.
* Without the slop we could be locking ourselves into a lower
* bandwidth.
*
* Situations Handled:
* (1) Prevents over-queueing of packets on LANs, especially on
* high speed LANs, allowing larger TCP buffers to be
* specified, and also does a good job preventing
* over-queueing of packets over choke points like modems
* (at least for the transmit side).
*
* (2) Is able to handle changing network loads (bandwidth
* drops so bwnd drops, bandwidth increases so bwnd
* increases).
*
* (3) Theoretically should stabilize in the face of multiple
* connections implementing the same algorithm (this may need
* a little work).
*
* (4) Stability value (defaults to 20 = 2 maximal packets) can
* be adjusted with a sysctl but typically only needs to be
* on very slow connections. A value no smaller then 5
* should be used, but only reduce this default if you have
* no other choice.
*/
#define USERTT ((tp->t_srtt + tp->t_rttbest) / 2)
bwnd = (int64_t)bw * USERTT / (hz << TCP_RTT_SHIFT) + V_tcp_inflight_stab * tp->t_maxseg / 10;
#undef USERTT
if (tcp_inflight_debug > 0) {
static int ltime;
if ((u_int)(ticks - ltime) >= hz / tcp_inflight_debug) {
ltime = ticks;
printf("%p bw %ld rttbest %d srtt %d bwnd %ld\n",
tp,
bw,
tp->t_rttbest,
tp->t_srtt,
bwnd
);
}
}
if ((long)bwnd < V_tcp_inflight_min)
bwnd = V_tcp_inflight_min;
if (bwnd > V_tcp_inflight_max)
bwnd = V_tcp_inflight_max;
if ((long)bwnd < tp->t_maxseg * 2)
bwnd = tp->t_maxseg * 2;
tp->snd_bwnd = bwnd;
}
#ifdef TCP_SIGNATURE
/*
* Callback function invoked by m_apply() to digest TCP segment data

View File

@ -86,9 +86,6 @@
#define TCPTV_KEEPINTVL ( 75*hz) /* default probe interval */
#define TCPTV_KEEPCNT 8 /* max probes before drop */
#define TCPTV_INFLIGHT_RTTTHRESH (10*hz/1000) /* below which inflight
disengages, in msec */
#define TCPTV_FINWAIT2_TIMEOUT (60*hz) /* FIN_WAIT_2 timeout if no receiver */
/*

View File

@ -1105,7 +1105,6 @@ tcp_connect(struct tcpcb *tp, struct sockaddr *nam, struct thread *td)
tp->t_state = TCPS_SYN_SENT;
tcp_timer_activate(tp, TT_KEEP, tcp_keepinit);
tp->iss = tcp_new_isn(tp);
tp->t_bw_rtseq = tp->iss;
tcp_sendseqinit(tp);
return 0;
@ -1168,7 +1167,6 @@ tcp6_connect(struct tcpcb *tp, struct sockaddr *nam, struct thread *td)
tp->t_state = TCPS_SYN_SENT;
tcp_timer_activate(tp, TT_KEEP, tcp_keepinit);
tp->iss = tcp_new_isn(tp);
tp->t_bw_rtseq = tp->iss;
tcp_sendseqinit(tp);
return 0;
@ -1214,7 +1212,7 @@ tcp_fill_info(struct tcpcb *tp, struct tcp_info *ti)
ti->tcpi_rcv_space = tp->rcv_wnd;
ti->tcpi_rcv_nxt = tp->rcv_nxt;
ti->tcpi_snd_wnd = tp->snd_wnd;
ti->tcpi_snd_bwnd = tp->snd_bwnd;
ti->tcpi_snd_bwnd = 0; /* Unused, kept for compat. */
ti->tcpi_snd_nxt = tp->snd_nxt;
ti->tcpi_snd_mss = tp->t_maxseg;
ti->tcpi_rcv_mss = tp->t_maxseg;
@ -1795,26 +1793,24 @@ db_print_tcpcb(struct tcpcb *tp, const char *name, int indent)
tp->rcv_adv, tp->rcv_wnd, tp->rcv_up);
db_print_indent(indent);
db_printf("snd_wnd: %lu snd_cwnd: %lu snd_bwnd: %lu\n",
tp->snd_wnd, tp->snd_cwnd, tp->snd_bwnd);
db_printf("snd_wnd: %lu snd_cwnd: %lu\n",
tp->snd_wnd, tp->snd_cwnd);
db_print_indent(indent);
db_printf("snd_ssthresh: %lu snd_bandwidth: %lu snd_recover: "
"0x%08x\n", tp->snd_ssthresh, tp->snd_bandwidth,
tp->snd_recover);
db_printf("snd_ssthresh: %lu snd_recover: "
"0x%08x\n", tp->snd_ssthresh, tp->snd_recover);
db_print_indent(indent);
db_printf("t_maxopd: %u t_rcvtime: %u t_startime: %u\n",
tp->t_maxopd, tp->t_rcvtime, tp->t_starttime);
db_print_indent(indent);
db_printf("t_rttime: %u t_rtsq: 0x%08x t_bw_rtttime: %u\n",
tp->t_rtttime, tp->t_rtseq, tp->t_bw_rtttime);
db_printf("t_rttime: %u t_rtsq: 0x%08x\n",
tp->t_rtttime, tp->t_rtseq);
db_print_indent(indent);
db_printf("t_bw_rtseq: 0x%08x t_rxtcur: %d t_maxseg: %u "
"t_srtt: %d\n", tp->t_bw_rtseq, tp->t_rxtcur, tp->t_maxseg,
tp->t_srtt);
db_printf("t_rxtcur: %d t_maxseg: %u t_srtt: %d\n",
tp->t_rxtcur, tp->t_maxseg, tp->t_srtt);
db_print_indent(indent);
db_printf("t_rttvar: %d t_rxtshift: %d t_rttmin: %u "

View File

@ -135,12 +135,12 @@ struct tcpcb {
u_long snd_wnd; /* send window */
u_long snd_cwnd; /* congestion-controlled window */
u_long snd_bwnd; /* bandwidth-controlled window */
u_long snd_spare1; /* unused */
u_long snd_ssthresh; /* snd_cwnd size threshold for
* for slow start exponential to
* linear switch
*/
u_long snd_bandwidth; /* calculated bandwidth or 0 */
u_long snd_spare2; /* unused */
tcp_seq snd_recover; /* for use in NewReno Fast Recovery */
u_int t_maxopd; /* mss plus options */
@ -150,8 +150,8 @@ struct tcpcb {
u_int t_rtttime; /* RTT measurement start time */
tcp_seq t_rtseq; /* sequence number being timed */
u_int t_bw_rtttime; /* used for bandwidth calculation */
tcp_seq t_bw_rtseq; /* used for bandwidth calculation */
u_int t_bw_spare1; /* unused */
tcp_seq t_bw_spare2; /* unused */
int t_rxtcur; /* current retransmit value (ticks) */
u_int t_maxseg; /* maximum segment size */
@ -654,7 +654,6 @@ void tcpip_fillheaders(struct inpcb *, void *, void *);
void tcp_timer_activate(struct tcpcb *, int, u_int);
int tcp_timer_active(struct tcpcb *, int);
void tcp_trace(short, short, struct tcpcb *, void *, struct tcphdr *, int);
void tcp_xmit_bandwidth_limit(struct tcpcb *tp, tcp_seq ack_seq);
/*
* All tcp_hc_* functions are IPv4 and IPv6 (via in_conninfo)
*/