From 1c18314d17c6ce5596c814db1ca20c9b55f0df62 Mon Sep 17 00:00:00 2001 From: Andre Oppermann Date: Thu, 16 Sep 2010 21:06:45 +0000 Subject: [PATCH] Remove the TCP inflight bandwidth limiter as announced in r211315 to give way for the pluggable congestion control framework. It is the task of the congestion control algorithm to set the congestion window and amount of inflight data without external interference. In 'struct tcpcb' the variables previously used by the inflight limiter are renamed to spares to keep the ABI intact and to have some more space for future extensions. In 'struct tcp_info' the variable 'tcpi_snd_bwnd' is not removed to preserve the ABI. It is always set to 0. In siftr.c in 'struct pkt_node' the variable 'snd_bwnd' is not removed to preserve the ABI. It is always set to 0. These unused variable in the various structures may be reused in the future or garbage collected before the next release or at some other point when an ABI change happens anyway for other reasons. No MFC is planned. The inflight bandwidth limiter stays disabled by default in the other branches but remains available. --- sys/netinet/siftr.c | 4 +- sys/netinet/tcp.h | 2 +- sys/netinet/tcp_input.c | 4 - sys/netinet/tcp_output.c | 1 - sys/netinet/tcp_subr.c | 206 --------------------------------------- sys/netinet/tcp_timer.h | 3 - sys/netinet/tcp_usrreq.c | 22 ++--- sys/netinet/tcp_var.h | 9 +- 8 files changed, 16 insertions(+), 235 deletions(-) diff --git a/sys/netinet/siftr.c b/sys/netinet/siftr.c index 34f363636cc9..6097ad48a758 100644 --- a/sys/netinet/siftr.c +++ b/sys/netinet/siftr.c @@ -193,7 +193,7 @@ struct pkt_node { u_long snd_wnd; /* Receive Window (bytes). */ u_long rcv_wnd; - /* Bandwidth Controlled Window (bytes). */ + /* Unused (was: Bandwidth Controlled Window (bytes)). */ u_long snd_bwnd; /* Slow Start Threshold (bytes). */ u_long snd_ssthresh; @@ -775,7 +775,7 @@ siftr_siftdata(struct pkt_node *pn, struct inpcb *inp, struct tcpcb *tp, pn->snd_cwnd = tp->snd_cwnd; pn->snd_wnd = tp->snd_wnd; pn->rcv_wnd = tp->rcv_wnd; - pn->snd_bwnd = tp->snd_bwnd; + pn->snd_bwnd = 0; /* Unused, kept for compat. */ pn->snd_ssthresh = tp->snd_ssthresh; pn->snd_scale = tp->snd_scale; pn->rcv_scale = tp->rcv_scale; diff --git a/sys/netinet/tcp.h b/sys/netinet/tcp.h index 27d45aac2f5a..62a89f7addac 100644 --- a/sys/netinet/tcp.h +++ b/sys/netinet/tcp.h @@ -221,7 +221,7 @@ struct tcp_info { /* FreeBSD extensions to tcp_info. */ u_int32_t tcpi_snd_wnd; /* Advertised send window. */ - u_int32_t tcpi_snd_bwnd; /* Bandwidth send window. */ + u_int32_t tcpi_snd_bwnd; /* No longer used. */ u_int32_t tcpi_snd_nxt; /* Next egress seqno */ u_int32_t tcpi_rcv_nxt; /* Next ingress seqno */ u_int32_t tcpi_toe_tid; /* HWTID for TOE endpoints */ diff --git a/sys/netinet/tcp_input.c b/sys/netinet/tcp_input.c index e4bddb90525a..22a2ea4a89c7 100644 --- a/sys/netinet/tcp_input.c +++ b/sys/netinet/tcp_input.c @@ -1321,7 +1321,6 @@ tcp_do_segment(struct mbuf *m, struct tcphdr *th, struct socket *so, tcp_xmit_timer(tp, ticks - tp->t_rtttime); } - tcp_xmit_bandwidth_limit(tp, th->th_ack); acked = th->th_ack - tp->snd_una; TCPSTAT_INC(tcps_rcvackpack); TCPSTAT_ADD(tcps_rcvackbyte, acked); @@ -2278,7 +2277,6 @@ process_ACK: tp->t_rttlow = ticks - tp->t_rtttime; tcp_xmit_timer(tp, ticks - tp->t_rtttime); } - tcp_xmit_bandwidth_limit(tp, th->th_ack); /* * If all outstanding data is acked, stop retransmit @@ -3328,8 +3326,6 @@ tcp_mss(struct tcpcb *tp, int offer) tp->snd_ssthresh = max(2 * mss, metrics.rmx_ssthresh); TCPSTAT_INC(tcps_usedssthresh); } - if (metrics.rmx_bandwidth) - tp->snd_bandwidth = metrics.rmx_bandwidth; /* * Set the slow-start flight size depending on whether this diff --git a/sys/netinet/tcp_output.c b/sys/netinet/tcp_output.c index 50d0ee6a4cd2..94b48fc474a8 100644 --- a/sys/netinet/tcp_output.c +++ b/sys/netinet/tcp_output.c @@ -225,7 +225,6 @@ again: tso = 0; off = tp->snd_nxt - tp->snd_una; sendwin = min(tp->snd_wnd, tp->snd_cwnd); - sendwin = min(sendwin, tp->snd_bwnd); flags = tcp_outflags[tp->t_state]; /* diff --git a/sys/netinet/tcp_subr.c b/sys/netinet/tcp_subr.c index b537fb94958d..d19a91a1698f 100644 --- a/sys/netinet/tcp_subr.c +++ b/sys/netinet/tcp_subr.c @@ -160,14 +160,6 @@ SYSCTL_VNET_PROC(_net_inet_tcp, TCPCTL_V6MSSDFLT, v6mssdflt, "Default TCP Maximum Segment Size for IPv6"); #endif -static int -vnet_sysctl_msec_to_ticks(SYSCTL_HANDLER_ARGS) -{ - - VNET_SYSCTL_ARG(req, arg1); - return (sysctl_msec_to_ticks(oidp, arg1, arg2, req)); -} - /* * Minimum MSS we accept and use. This prevents DoS attacks where * we are forced to a ridiculous low MSS like 20 and send hundreds @@ -213,50 +205,6 @@ SYSCTL_VNET_INT(_net_inet_tcp, OID_AUTO, isn_reseed_interval, CTLFLAG_RW, &VNET_NAME(tcp_isn_reseed_interval), 0, "Seconds between reseeding of ISN secret"); -/* - * TCP bandwidth limiting sysctls. Note that the default lower bound of - * 1024 exists only for debugging. A good production default would be - * something like 6100. - */ -SYSCTL_NODE(_net_inet_tcp, OID_AUTO, inflight, CTLFLAG_RW, 0, - "TCP inflight data limiting"); - -static VNET_DEFINE(int, tcp_inflight_enable) = 0; -#define V_tcp_inflight_enable VNET(tcp_inflight_enable) -SYSCTL_VNET_INT(_net_inet_tcp_inflight, OID_AUTO, enable, CTLFLAG_RW, - &VNET_NAME(tcp_inflight_enable), 0, - "Enable automatic TCP inflight data limiting"); - -static int tcp_inflight_debug = 0; -SYSCTL_INT(_net_inet_tcp_inflight, OID_AUTO, debug, CTLFLAG_RW, - &tcp_inflight_debug, 0, - "Debug TCP inflight calculations"); - -static VNET_DEFINE(int, tcp_inflight_rttthresh); -#define V_tcp_inflight_rttthresh VNET(tcp_inflight_rttthresh) -SYSCTL_VNET_PROC(_net_inet_tcp_inflight, OID_AUTO, rttthresh, - CTLTYPE_INT|CTLFLAG_RW, &VNET_NAME(tcp_inflight_rttthresh), 0, - vnet_sysctl_msec_to_ticks, "I", - "RTT threshold below which inflight will deactivate itself"); - -static VNET_DEFINE(int, tcp_inflight_min) = 6144; -#define V_tcp_inflight_min VNET(tcp_inflight_min) -SYSCTL_VNET_INT(_net_inet_tcp_inflight, OID_AUTO, min, CTLFLAG_RW, - &VNET_NAME(tcp_inflight_min), 0, - "Lower-bound for TCP inflight window"); - -static VNET_DEFINE(int, tcp_inflight_max) = TCP_MAXWIN << TCP_MAX_WINSHIFT; -#define V_tcp_inflight_max VNET(tcp_inflight_max) -SYSCTL_VNET_INT(_net_inet_tcp_inflight, OID_AUTO, max, CTLFLAG_RW, - &VNET_NAME(tcp_inflight_max), 0, - "Upper-bound for TCP inflight window"); - -static VNET_DEFINE(int, tcp_inflight_stab) = 20; -#define V_tcp_inflight_stab VNET(tcp_inflight_stab) -SYSCTL_VNET_INT(_net_inet_tcp_inflight, OID_AUTO, stab, CTLFLAG_RW, - &VNET_NAME(tcp_inflight_stab), 0, - "Inflight Algorithm Stabilization 20 = 2 packets"); - #ifdef TCP_SORECEIVE_STREAM static int tcp_soreceive_stream = 0; SYSCTL_INT(_net_inet_tcp, OID_AUTO, soreceive_stream, CTLFLAG_RDTUN, @@ -338,8 +286,6 @@ tcp_init(void) in_pcbinfo_init(&V_tcbinfo, "tcp", &V_tcb, hashsize, hashsize, "tcp_inpcb", tcp_inpcb_init, NULL, UMA_ZONE_NOFREE); - V_tcp_inflight_rttthresh = TCPTV_INFLIGHT_RTTTHRESH; - /* * These have to be type stable for the benefit of the timers. */ @@ -728,10 +674,8 @@ tcp_newtcpcb(struct inpcb *inp) tp->t_rttmin = tcp_rexmit_min; tp->t_rxtcur = TCPTV_RTOBASE; tp->snd_cwnd = TCP_MAXWIN << TCP_MAX_WINSHIFT; - tp->snd_bwnd = TCP_MAXWIN << TCP_MAX_WINSHIFT; tp->snd_ssthresh = TCP_MAXWIN << TCP_MAX_WINSHIFT; tp->t_rcvtime = ticks; - tp->t_bw_rtttime = ticks; /* * IPv4 TTL initialization is necessary for an IPv6 socket as well, * because the socket may be bound to an IPv6 wildcard address, @@ -849,8 +793,6 @@ tcp_discardcb(struct tcpcb *tp) metrics.rmx_rtt = tp->t_srtt; metrics.rmx_rttvar = tp->t_rttvar; - /* XXX: This wraps if the pipe is more than 4 Gbit per second */ - metrics.rmx_bandwidth = tp->snd_bandwidth; metrics.rmx_cwnd = tp->snd_cwnd; metrics.rmx_sendpipe = 0; metrics.rmx_recvpipe = 0; @@ -1773,154 +1715,6 @@ ipsec_hdrsiz_tcp(struct tcpcb *tp) } #endif /* IPSEC */ -/* - * TCP BANDWIDTH DELAY PRODUCT WINDOW LIMITING - * - * This code attempts to calculate the bandwidth-delay product as a - * means of determining the optimal window size to maximize bandwidth, - * minimize RTT, and avoid the over-allocation of buffers on interfaces and - * routers. This code also does a fairly good job keeping RTTs in check - * across slow links like modems. We implement an algorithm which is very - * similar (but not meant to be) TCP/Vegas. The code operates on the - * transmitter side of a TCP connection and so only effects the transmit - * side of the connection. - * - * BACKGROUND: TCP makes no provision for the management of buffer space - * at the end points or at the intermediate routers and switches. A TCP - * stream, whether using NewReno or not, will eventually buffer as - * many packets as it is able and the only reason this typically works is - * due to the fairly small default buffers made available for a connection - * (typicaly 16K or 32K). As machines use larger windows and/or window - * scaling it is now fairly easy for even a single TCP connection to blow-out - * all available buffer space not only on the local interface, but on - * intermediate routers and switches as well. NewReno makes a misguided - * attempt to 'solve' this problem by waiting for an actual failure to occur, - * then backing off, then steadily increasing the window again until another - * failure occurs, ad-infinitum. This results in terrible oscillation that - * is only made worse as network loads increase and the idea of intentionally - * blowing out network buffers is, frankly, a terrible way to manage network - * resources. - * - * It is far better to limit the transmit window prior to the failure - * condition being achieved. There are two general ways to do this: First - * you can 'scan' through different transmit window sizes and locate the - * point where the RTT stops increasing, indicating that you have filled the - * pipe, then scan backwards until you note that RTT stops decreasing, then - * repeat ad-infinitum. This method works in principle but has severe - * implementation issues due to RTT variances, timer granularity, and - * instability in the algorithm which can lead to many false positives and - * create oscillations as well as interact badly with other TCP streams - * implementing the same algorithm. - * - * The second method is to limit the window to the bandwidth delay product - * of the link. This is the method we implement. RTT variances and our - * own manipulation of the congestion window, bwnd, can potentially - * destabilize the algorithm. For this reason we have to stabilize the - * elements used to calculate the window. We do this by using the minimum - * observed RTT, the long term average of the observed bandwidth, and - * by adding two segments worth of slop. It isn't perfect but it is able - * to react to changing conditions and gives us a very stable basis on - * which to extend the algorithm. - */ -void -tcp_xmit_bandwidth_limit(struct tcpcb *tp, tcp_seq ack_seq) -{ - u_long bw; - u_long bwnd; - int save_ticks; - - INP_WLOCK_ASSERT(tp->t_inpcb); - - /* - * If inflight_enable is disabled in the middle of a tcp connection, - * make sure snd_bwnd is effectively disabled. - */ - if (V_tcp_inflight_enable == 0 || - tp->t_rttlow < V_tcp_inflight_rttthresh) { - tp->snd_bwnd = TCP_MAXWIN << TCP_MAX_WINSHIFT; - tp->snd_bandwidth = 0; - return; - } - - /* - * Figure out the bandwidth. Due to the tick granularity this - * is a very rough number and it MUST be averaged over a fairly - * long period of time. XXX we need to take into account a link - * that is not using all available bandwidth, but for now our - * slop will ramp us up if this case occurs and the bandwidth later - * increases. - * - * Note: if ticks rollover 'bw' may wind up negative. We must - * effectively reset t_bw_rtttime for this case. - */ - save_ticks = ticks; - if ((u_int)(save_ticks - tp->t_bw_rtttime) < 1) - return; - - bw = (int64_t)(ack_seq - tp->t_bw_rtseq) * hz / - (save_ticks - tp->t_bw_rtttime); - tp->t_bw_rtttime = save_ticks; - tp->t_bw_rtseq = ack_seq; - if (tp->t_bw_rtttime == 0 || (int)bw < 0) - return; - bw = ((int64_t)tp->snd_bandwidth * 15 + bw) >> 4; - - tp->snd_bandwidth = bw; - - /* - * Calculate the semi-static bandwidth delay product, plus two maximal - * segments. The additional slop puts us squarely in the sweet - * spot and also handles the bandwidth run-up case and stabilization. - * Without the slop we could be locking ourselves into a lower - * bandwidth. - * - * Situations Handled: - * (1) Prevents over-queueing of packets on LANs, especially on - * high speed LANs, allowing larger TCP buffers to be - * specified, and also does a good job preventing - * over-queueing of packets over choke points like modems - * (at least for the transmit side). - * - * (2) Is able to handle changing network loads (bandwidth - * drops so bwnd drops, bandwidth increases so bwnd - * increases). - * - * (3) Theoretically should stabilize in the face of multiple - * connections implementing the same algorithm (this may need - * a little work). - * - * (4) Stability value (defaults to 20 = 2 maximal packets) can - * be adjusted with a sysctl but typically only needs to be - * on very slow connections. A value no smaller then 5 - * should be used, but only reduce this default if you have - * no other choice. - */ -#define USERTT ((tp->t_srtt + tp->t_rttbest) / 2) - bwnd = (int64_t)bw * USERTT / (hz << TCP_RTT_SHIFT) + V_tcp_inflight_stab * tp->t_maxseg / 10; -#undef USERTT - - if (tcp_inflight_debug > 0) { - static int ltime; - if ((u_int)(ticks - ltime) >= hz / tcp_inflight_debug) { - ltime = ticks; - printf("%p bw %ld rttbest %d srtt %d bwnd %ld\n", - tp, - bw, - tp->t_rttbest, - tp->t_srtt, - bwnd - ); - } - } - if ((long)bwnd < V_tcp_inflight_min) - bwnd = V_tcp_inflight_min; - if (bwnd > V_tcp_inflight_max) - bwnd = V_tcp_inflight_max; - if ((long)bwnd < tp->t_maxseg * 2) - bwnd = tp->t_maxseg * 2; - tp->snd_bwnd = bwnd; -} - #ifdef TCP_SIGNATURE /* * Callback function invoked by m_apply() to digest TCP segment data diff --git a/sys/netinet/tcp_timer.h b/sys/netinet/tcp_timer.h index 1ab0b7b29386..4bfcdf65c155 100644 --- a/sys/netinet/tcp_timer.h +++ b/sys/netinet/tcp_timer.h @@ -86,9 +86,6 @@ #define TCPTV_KEEPINTVL ( 75*hz) /* default probe interval */ #define TCPTV_KEEPCNT 8 /* max probes before drop */ -#define TCPTV_INFLIGHT_RTTTHRESH (10*hz/1000) /* below which inflight - disengages, in msec */ - #define TCPTV_FINWAIT2_TIMEOUT (60*hz) /* FIN_WAIT_2 timeout if no receiver */ /* diff --git a/sys/netinet/tcp_usrreq.c b/sys/netinet/tcp_usrreq.c index 2e61c3125c4d..f35890bee192 100644 --- a/sys/netinet/tcp_usrreq.c +++ b/sys/netinet/tcp_usrreq.c @@ -1105,7 +1105,6 @@ tcp_connect(struct tcpcb *tp, struct sockaddr *nam, struct thread *td) tp->t_state = TCPS_SYN_SENT; tcp_timer_activate(tp, TT_KEEP, tcp_keepinit); tp->iss = tcp_new_isn(tp); - tp->t_bw_rtseq = tp->iss; tcp_sendseqinit(tp); return 0; @@ -1168,7 +1167,6 @@ tcp6_connect(struct tcpcb *tp, struct sockaddr *nam, struct thread *td) tp->t_state = TCPS_SYN_SENT; tcp_timer_activate(tp, TT_KEEP, tcp_keepinit); tp->iss = tcp_new_isn(tp); - tp->t_bw_rtseq = tp->iss; tcp_sendseqinit(tp); return 0; @@ -1214,7 +1212,7 @@ tcp_fill_info(struct tcpcb *tp, struct tcp_info *ti) ti->tcpi_rcv_space = tp->rcv_wnd; ti->tcpi_rcv_nxt = tp->rcv_nxt; ti->tcpi_snd_wnd = tp->snd_wnd; - ti->tcpi_snd_bwnd = tp->snd_bwnd; + ti->tcpi_snd_bwnd = 0; /* Unused, kept for compat. */ ti->tcpi_snd_nxt = tp->snd_nxt; ti->tcpi_snd_mss = tp->t_maxseg; ti->tcpi_rcv_mss = tp->t_maxseg; @@ -1795,26 +1793,24 @@ db_print_tcpcb(struct tcpcb *tp, const char *name, int indent) tp->rcv_adv, tp->rcv_wnd, tp->rcv_up); db_print_indent(indent); - db_printf("snd_wnd: %lu snd_cwnd: %lu snd_bwnd: %lu\n", - tp->snd_wnd, tp->snd_cwnd, tp->snd_bwnd); + db_printf("snd_wnd: %lu snd_cwnd: %lu\n", + tp->snd_wnd, tp->snd_cwnd); db_print_indent(indent); - db_printf("snd_ssthresh: %lu snd_bandwidth: %lu snd_recover: " - "0x%08x\n", tp->snd_ssthresh, tp->snd_bandwidth, - tp->snd_recover); + db_printf("snd_ssthresh: %lu snd_recover: " + "0x%08x\n", tp->snd_ssthresh, tp->snd_recover); db_print_indent(indent); db_printf("t_maxopd: %u t_rcvtime: %u t_startime: %u\n", tp->t_maxopd, tp->t_rcvtime, tp->t_starttime); db_print_indent(indent); - db_printf("t_rttime: %u t_rtsq: 0x%08x t_bw_rtttime: %u\n", - tp->t_rtttime, tp->t_rtseq, tp->t_bw_rtttime); + db_printf("t_rttime: %u t_rtsq: 0x%08x\n", + tp->t_rtttime, tp->t_rtseq); db_print_indent(indent); - db_printf("t_bw_rtseq: 0x%08x t_rxtcur: %d t_maxseg: %u " - "t_srtt: %d\n", tp->t_bw_rtseq, tp->t_rxtcur, tp->t_maxseg, - tp->t_srtt); + db_printf("t_rxtcur: %d t_maxseg: %u t_srtt: %d\n", + tp->t_rxtcur, tp->t_maxseg, tp->t_srtt); db_print_indent(indent); db_printf("t_rttvar: %d t_rxtshift: %d t_rttmin: %u " diff --git a/sys/netinet/tcp_var.h b/sys/netinet/tcp_var.h index b753e105ee50..2b7abcaf0950 100644 --- a/sys/netinet/tcp_var.h +++ b/sys/netinet/tcp_var.h @@ -135,12 +135,12 @@ struct tcpcb { u_long snd_wnd; /* send window */ u_long snd_cwnd; /* congestion-controlled window */ - u_long snd_bwnd; /* bandwidth-controlled window */ + u_long snd_spare1; /* unused */ u_long snd_ssthresh; /* snd_cwnd size threshold for * for slow start exponential to * linear switch */ - u_long snd_bandwidth; /* calculated bandwidth or 0 */ + u_long snd_spare2; /* unused */ tcp_seq snd_recover; /* for use in NewReno Fast Recovery */ u_int t_maxopd; /* mss plus options */ @@ -150,8 +150,8 @@ struct tcpcb { u_int t_rtttime; /* RTT measurement start time */ tcp_seq t_rtseq; /* sequence number being timed */ - u_int t_bw_rtttime; /* used for bandwidth calculation */ - tcp_seq t_bw_rtseq; /* used for bandwidth calculation */ + u_int t_bw_spare1; /* unused */ + tcp_seq t_bw_spare2; /* unused */ int t_rxtcur; /* current retransmit value (ticks) */ u_int t_maxseg; /* maximum segment size */ @@ -654,7 +654,6 @@ void tcpip_fillheaders(struct inpcb *, void *, void *); void tcp_timer_activate(struct tcpcb *, int, u_int); int tcp_timer_active(struct tcpcb *, int); void tcp_trace(short, short, struct tcpcb *, void *, struct tcphdr *, int); -void tcp_xmit_bandwidth_limit(struct tcpcb *tp, tcp_seq ack_seq); /* * All tcp_hc_* functions are IPv4 and IPv6 (via in_conninfo) */