Use estimated RTT for receive buffer auto resizing instead of timestamps

Switched from using timestamps to RTT estimates when performing TCP receive
buffer auto resizing, as not all hosts support / enable TCP timestamps.

Disabled reset of receive buffer auto scaling when not in bulk receive mode,
which gives an extra 20% performance increase.

Also extracted auto resizing to a common method shared between standard and
fastpath modules.

With this change, AWS S3 downloads at ~17ms latency on a 1Gbps connection jump
from ~3MB/s to ~100MB/s using the default settings.

Reviewed by:    lstewart, gnn
MFC after:      2 weeks
Relnotes:       Yes
Sponsored by:   Multiplay
Differential Revision:  https://reviews.freebsd.org/D9668
This commit is contained in:
Steven Hartland 2017-04-10 08:19:35 +00:00
parent f3ac3a6e6f
commit e44c1887fd
6 changed files with 82 additions and 124 deletions

View File

@ -132,6 +132,14 @@ SDT_PROBE_DEFINE6_XLATE(tcp, , , state__change,
"void *", "void *",
"int", "tcplsinfo_t *");
/*
 * Definition of the receive__autoresize probe in the tcp provider.
 *
 * The probe is fired from tcp_autorcvbuf() as
 *	TCP_PROBE6(receive__autoresize, NULL, tp, m, tp, th, newsize);
 * so the six arguments are, in order: NULL, the tcpcb, the mbuf, the
 * tcpcb again, the TCP header and the int newsize value (0 when no
 * larger receive buffer size was selected).
 *
 * NOTE(review): the strings below are presumably (native type,
 * translated type) pairs, one pair per argument, as is usual for the
 * SDT_PROBE_DEFINE*_XLATE macros -- confirm against SDT(9).
 */
SDT_PROBE_DEFINE6_XLATE(tcp, , , receive__autoresize,
"void *", "void *",
"struct tcpcb *", "csinfo_t *",
"struct mbuf *", "ipinfo_t *",
"struct tcpcb *", "tcpsinfo_t *" ,
"struct tcphdr *", "tcpinfoh_t *",
"int", "int");
SDT_PROBE_DEFINE5_XLATE(udp, , , receive,
"void *", "pktinfo_t *",
"struct inpcb *", "csinfo_t *",

View File

@ -65,6 +65,7 @@ SDT_PROBE_DECLARE(tcp, , , debug__input);
SDT_PROBE_DECLARE(tcp, , , debug__output);
SDT_PROBE_DECLARE(tcp, , , debug__user);
SDT_PROBE_DECLARE(tcp, , , debug__drop);
SDT_PROBE_DECLARE(tcp, , , receive__autoresize);
SDT_PROBE_DECLARE(udp, , , receive);
SDT_PROBE_DECLARE(udp, , , send);

View File

@ -1486,6 +1486,68 @@ tcp_input(struct mbuf **mp, int *offp, int proto)
return (IPPROTO_DONE);
}
/*
 * Automatic sizing of the receive socket buffer.
 *
 * A socket buffer that is too small for the delay bandwidth product of
 * the path limits throughput on links with high bandwidth and high
 * delay (eg. trans-continental/oceanic links).
 *
 * On the receive side the socket buffer memory is only rarely used to
 * any significant extent, which allows the receive buffer to be scaled
 * aggressively.  Should the space actually fill up and kernel memory
 * run out, new segments can simply be dropped; TCP on the sender will
 * retransmit them later.  An oversized buffer only costs kernel memory
 * if the application doesn't read() from the socket, or if packet loss
 * or reordering makes use of the reassembly queue.
 *
 * Bytes are accumulated in tp->rfbuf_cnt over a measurement interval
 * that starts when tp->rfbuf_ts is set (tcp_output() arms it whenever
 * it is zero and SB_AUTOSIZE is set) and ends once more than one
 * smoothed RTT (tp->t_srtt >> TCP_RTT_SHIFT) has elapsed since then.
 *
 * A step up of V_tcp_autorcvbuf_inc is proposed at the end of an
 * interval when:
 *  1. autosizing is enabled (V_tcp_do_autorcvbuf) and the socket still
 *     has SB_AUTOSIZE set.  Setting SO_RCVBUF clears SB_AUTOSIZE.
 *  2. an RTT estimate exists (tp->t_srtt != 0) and the interval has
 *     been started (tp->rfbuf_ts != 0);
 *  3. the bytes counted during the interval exceed seven eighths of
 *     the current receive buffer high water mark;
 *  4. the receive buffer has not hit V_tcp_autorcvbuf_max.
 *
 * At most one step is proposed per interval.  Nothing is ever shrunk
 * here.  The receive__autoresize probe fires at the end of every
 * interval, whether or not a step is proposed.
 *
 * Returns the proposed new buffer size (capped at
 * V_tcp_autorcvbuf_max), or 0 when no change is proposed.  The socket
 * buffer itself is not modified here; acting on the value is left to
 * the caller.
 *
 * TODO: Only step up if the application is actually serving
 * the buffer to better manage the socket buffer resources.
 */
int
tcp_autorcvbuf(struct mbuf *m, struct tcphdr *th, struct socket *so,
    struct tcpcb *tp, int tlen)
{
	int grow_to;

	/*
	 * Autosizing not applicable, or still inside the current
	 * measurement interval: just account for this segment.
	 */
	if (!V_tcp_do_autorcvbuf ||
	    (so->so_rcv.sb_flags & SB_AUTOSIZE) == 0 ||
	    tp->t_srtt == 0 || tp->rfbuf_ts == 0 ||
	    TCP_TS_TO_TICKS(tcp_ts_getticks() - tp->rfbuf_ts) <=
	    (tp->t_srtt >> TCP_RTT_SHIFT)) {
		tp->rfbuf_cnt += tlen;
		return (0);
	}

	/* Interval is over; decide whether the buffer should grow. */
	grow_to = 0;
	if (tp->rfbuf_cnt > (so->so_rcv.sb_hiwat / 8 * 7) &&
	    so->so_rcv.sb_hiwat < V_tcp_autorcvbuf_max) {
		grow_to = min(so->so_rcv.sb_hiwat + V_tcp_autorcvbuf_inc,
		    V_tcp_autorcvbuf_max);
	}

	TCP_PROBE6(receive__autoresize, NULL, tp, m, tp, th, grow_to);

	/* Restart the measurement with the next interval. */
	tp->rfbuf_ts = 0;
	tp->rfbuf_cnt = 0;

	return (grow_to);
}
void
tcp_do_segment(struct mbuf *m, struct tcphdr *th, struct socket *so,
struct tcpcb *tp, int drop_hdrlen, int tlen, uint8_t iptos,
@ -1849,62 +1911,7 @@ tcp_do_segment(struct mbuf *m, struct tcphdr *th, struct socket *so,
#endif
TCP_PROBE3(debug__input, tp, th, m);
/*
* Automatic sizing of receive socket buffer. Often the send
* buffer size is not optimally adjusted to the actual network
* conditions at hand (delay bandwidth product). Setting the
* buffer size too small limits throughput on links with high
* bandwidth and high delay (eg. trans-continental/oceanic links).
*
* On the receive side the socket buffer memory is only rarely
* used to any significant extent. This allows us to be much
* more aggressive in scaling the receive socket buffer. For
* the case that the buffer space is actually used to a large
* extent and we run out of kernel memory we can simply drop
* the new segments; TCP on the sender will just retransmit it
* later. Setting the buffer size too big may only consume too
* much kernel memory if the application doesn't read() from
* the socket or packet loss or reordering makes use of the
* reassembly queue.
*
* The criteria to step up the receive buffer one notch are:
* 1. Application has not set receive buffer size with
* SO_RCVBUF. Setting SO_RCVBUF clears SB_AUTOSIZE.
* 2. the number of bytes received during the time it takes
* one timestamp to be reflected back to us (the RTT);
* 3. received bytes per RTT is within seven eighth of the
* current socket buffer size;
* 4. receive buffer size has not hit maximal automatic size;
*
* This algorithm does one step per RTT at most and only if
* we receive a bulk stream w/o packet losses or reorderings.
* Shrinking the buffer during idle times is not necessary as
* it doesn't consume any memory when idle.
*
* TODO: Only step up if the application is actually serving
* the buffer to better manage the socket buffer resources.
*/
if (V_tcp_do_autorcvbuf &&
(to.to_flags & TOF_TS) &&
to.to_tsecr &&
(so->so_rcv.sb_flags & SB_AUTOSIZE)) {
if (TSTMP_GT(to.to_tsecr, tp->rfbuf_ts) &&
to.to_tsecr - tp->rfbuf_ts < hz) {
if (tp->rfbuf_cnt >
(so->so_rcv.sb_hiwat / 8 * 7) &&
so->so_rcv.sb_hiwat <
V_tcp_autorcvbuf_max) {
newsize =
min(so->so_rcv.sb_hiwat +
V_tcp_autorcvbuf_inc,
V_tcp_autorcvbuf_max);
}
/* Start over with next RTT. */
tp->rfbuf_ts = 0;
tp->rfbuf_cnt = 0;
} else
tp->rfbuf_cnt += tlen; /* add up */
}
newsize = tcp_autorcvbuf(m, th, so, tp, tlen);
/* Add data to socket buffer. */
SOCKBUF_LOCK(&so->so_rcv);
@ -1945,10 +1952,6 @@ tcp_do_segment(struct mbuf *m, struct tcphdr *th, struct socket *so,
win = 0;
tp->rcv_wnd = imax(win, (int)(tp->rcv_adv - tp->rcv_nxt));
/* Reset receive buffer auto scaling when not in bulk receive mode. */
tp->rfbuf_ts = 0;
tp->rfbuf_cnt = 0;
switch (tp->t_state) {
/*

View File

@ -831,11 +831,13 @@ tcp_output(struct tcpcb *tp)
to.to_tsval = tcp_ts_getticks() + tp->ts_offset;
to.to_tsecr = tp->ts_recent;
to.to_flags |= TOF_TS;
/* Set receive buffer autosizing timestamp. */
if (tp->rfbuf_ts == 0 &&
(so->so_rcv.sb_flags & SB_AUTOSIZE))
tp->rfbuf_ts = tcp_ts_getticks();
}
/* Set receive buffer autosizing timestamp. */
if (tp->rfbuf_ts == 0 &&
(so->so_rcv.sb_flags & SB_AUTOSIZE))
tp->rfbuf_ts = tcp_ts_getticks();
/* Selective ACK's. */
if (tp->t_flags & TF_SACK_PERMIT) {
if (flags & TH_SYN)

View File

@ -399,62 +399,8 @@ tcp_do_fastnewdata(struct mbuf *m, struct tcphdr *th, struct socket *so,
(void *)tcp_saveipgen, &tcp_savetcp, 0);
#endif
TCP_PROBE3(debug__input, tp, th, m);
/*
* Automatic sizing of receive socket buffer. Often the send
* buffer size is not optimally adjusted to the actual network
* conditions at hand (delay bandwidth product). Setting the
* buffer size too small limits throughput on links with high
* bandwidth and high delay (eg. trans-continental/oceanic links).
*
* On the receive side the socket buffer memory is only rarely
* used to any significant extent. This allows us to be much
* more aggressive in scaling the receive socket buffer. For
* the case that the buffer space is actually used to a large
* extent and we run out of kernel memory we can simply drop
* the new segments; TCP on the sender will just retransmit it
* later. Setting the buffer size too big may only consume too
* much kernel memory if the application doesn't read() from
* the socket or packet loss or reordering makes use of the
* reassembly queue.
*
* The criteria to step up the receive buffer one notch are:
* 1. Application has not set receive buffer size with
* SO_RCVBUF. Setting SO_RCVBUF clears SB_AUTOSIZE.
* 2. the number of bytes received during the time it takes
* one timestamp to be reflected back to us (the RTT);
* 3. received bytes per RTT is within seven eighth of the
* current socket buffer size;
* 4. receive buffer size has not hit maximal automatic size;
*
* This algorithm does one step per RTT at most and only if
* we receive a bulk stream w/o packet losses or reorderings.
* Shrinking the buffer during idle times is not necessary as
* it doesn't consume any memory when idle.
*
* TODO: Only step up if the application is actually serving
* the buffer to better manage the socket buffer resources.
*/
if (V_tcp_do_autorcvbuf &&
(to->to_flags & TOF_TS) &&
to->to_tsecr &&
(so->so_rcv.sb_flags & SB_AUTOSIZE)) {
if (TSTMP_GT(to->to_tsecr, tp->rfbuf_ts) &&
to->to_tsecr - tp->rfbuf_ts < hz) {
if (tp->rfbuf_cnt >
(so->so_rcv.sb_hiwat / 8 * 7) &&
so->so_rcv.sb_hiwat <
V_tcp_autorcvbuf_max) {
newsize =
min(so->so_rcv.sb_hiwat +
V_tcp_autorcvbuf_inc,
V_tcp_autorcvbuf_max);
}
/* Start over with next RTT. */
tp->rfbuf_ts = 0;
tp->rfbuf_cnt = 0;
} else
tp->rfbuf_cnt += tlen; /* add up */
}
newsize = tcp_autorcvbuf(m, th, so, tp, tlen);
/* Add data to socket buffer. */
SOCKBUF_LOCK(&so->so_rcv);
@ -532,10 +478,6 @@ tcp_do_slowpath(struct mbuf *m, struct tcphdr *th, struct socket *so,
win = 0;
tp->rcv_wnd = imax(win, (int)(tp->rcv_adv - tp->rcv_nxt));
/* Reset receive buffer auto scaling when not in bulk receive mode. */
tp->rfbuf_ts = 0;
tp->rfbuf_cnt = 0;
switch (tp->t_state) {
/*

View File

@ -778,6 +778,8 @@ void hhook_run_tcp_est_in(struct tcpcb *tp,
#endif
int tcp_input(struct mbuf **, int *, int);
int tcp_autorcvbuf(struct mbuf *, struct tcphdr *, struct socket *,
struct tcpcb *, int);
void tcp_do_segment(struct mbuf *, struct tcphdr *,
struct socket *, struct tcpcb *, int, int, uint8_t,
int);