Update the SACK loss recovery to RFC 6675, with the following new features:
- improved pipe calculation which does not degrade under heavy loss
- engaging in Loss Recovery earlier under adverse conditions
- Rescue Retransmission in case some of the trailing packets of a request got lost

All of the above changes are toggled with the sysctl "rfc6675_pipe" (disabled by default).

Reviewers: #transport, tuexen, lstewart, slavash, jtl, hselasky, kib, rgrimes, chengc_netapp.com, thj, #manpages, kbowling, #netapp, rscheff
Reviewed By: #transport
Subscribers: imp, melifaro
MFC after: 2 weeks
Sponsored by: NetApp, Inc.
Differential Revision: https://reviews.freebsd.org/D18985
This commit is contained in:
parent
afcb3c4cb4
commit
3c40e1d52c
@ -34,7 +34,7 @@
|
||||
.\" From: @(#)tcp.4 8.1 (Berkeley) 6/5/93
|
||||
.\" $FreeBSD$
|
||||
.\"
|
||||
.Dd February 11, 2021
|
||||
.Dd February 13, 2021
|
||||
.Dt TCP 4
|
||||
.Os
|
||||
.Sh NAME
|
||||
@ -560,6 +560,14 @@ high losses leading to RTO, but reduces PRR effectiveness in more common setting
|
||||
.It Va rfc6675_pipe
|
||||
Calculate the bytes in flight using the algorithm described in RFC 6675, and
|
||||
is also an improvement when Proportional Rate Reduction is enabled.
|
||||
Also enables two other mechanisms from RFC 6675.
|
||||
Rescue Retransmission helps timely loss recovery, when the trailing segments
|
||||
of a transmission are lost, while no additional data is ready to be sent.
|
||||
In case a partial ACK without a SACK block is received during SACK loss
|
||||
recovery, the trailing segment is immediately resent, rather than waiting
|
||||
for a Retransmission timeout.
|
||||
SACK loss recovery is also engaged once two segments plus one byte are
|
||||
SACKed - even if no traditional duplicate ACKs were seen.
|
||||
.It Va rfc3042
|
||||
Enable the Limited Transmit algorithm as described in RFC 3042.
|
||||
It helps avoid timeouts on lossy links and also when the congestion window
|
||||
|
@ -1501,6 +1501,7 @@ tcp_do_segment(struct mbuf *m, struct tcphdr *th, struct socket *so,
|
||||
struct mbuf *mfree;
|
||||
struct tcpopt to;
|
||||
int tfo_syn;
|
||||
u_int maxseg;
|
||||
|
||||
#ifdef TCPDEBUG
|
||||
/*
|
||||
@ -2502,8 +2503,6 @@ tcp_do_segment(struct mbuf *m, struct tcphdr *th, struct socket *so,
|
||||
#endif
|
||||
|
||||
if (SEQ_LEQ(th->th_ack, tp->snd_una)) {
|
||||
u_int maxseg;
|
||||
|
||||
maxseg = tcp_maxseg(tp);
|
||||
if (tlen == 0 &&
|
||||
(tiwin == tp->snd_wnd ||
|
||||
@ -2644,7 +2643,21 @@ tcp_do_segment(struct mbuf *m, struct tcphdr *th, struct socket *so,
|
||||
tp->snd_cwnd += maxseg;
|
||||
(void) tp->t_fb->tfb_tcp_output(tp);
|
||||
goto drop;
|
||||
} else if (tp->t_dupacks == tcprexmtthresh) {
|
||||
} else if (tp->t_dupacks == tcprexmtthresh ||
|
||||
(tp->t_flags & TF_SACK_PERMIT &&
|
||||
V_tcp_do_rfc6675_pipe &&
|
||||
tp->sackhint.sacked_bytes >
|
||||
(tcprexmtthresh - 1) * maxseg)) {
|
||||
enter_recovery:
|
||||
/*
|
||||
* Above is the RFC6675 trigger condition of
|
||||
* more than (dupthresh-1)*maxseg sacked data.
|
||||
* If the count of holes in the
|
||||
* scoreboard is >= dupthresh, we could
|
||||
* also enter loss recovery, but don't
|
||||
* have that value readily available.
|
||||
*/
|
||||
tp->t_dupacks = tcprexmtthresh;
|
||||
tcp_seq onxt = tp->snd_nxt;
|
||||
|
||||
/*
|
||||
@ -2689,6 +2702,8 @@ tcp_do_segment(struct mbuf *m, struct tcphdr *th, struct socket *so,
|
||||
tp->snd_recover = tp->snd_nxt;
|
||||
tp->snd_cwnd = maxseg;
|
||||
(void) tp->t_fb->tfb_tcp_output(tp);
|
||||
if (SEQ_GT(th->th_ack, tp->snd_una))
|
||||
goto resume_partialack;
|
||||
goto drop;
|
||||
}
|
||||
tp->snd_nxt = th->th_ack;
|
||||
@ -2775,10 +2790,19 @@ tcp_do_segment(struct mbuf *m, struct tcphdr *th, struct socket *so,
|
||||
*/
|
||||
if ((tp->t_flags & TF_SACK_PERMIT) &&
|
||||
(to.to_flags & TOF_SACK) &&
|
||||
sack_changed)
|
||||
sack_changed) {
|
||||
tp->t_dupacks++;
|
||||
/* limit overhead by setting maxseg last */
|
||||
if (!IN_FASTRECOVERY(tp->t_flags) &&
|
||||
(tp->sackhint.sacked_bytes >
|
||||
((tcprexmtthresh - 1) *
|
||||
(maxseg = tcp_maxseg(tp))))) {
|
||||
goto enter_recovery;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
resume_partialack:
|
||||
KASSERT(SEQ_GT(th->th_ack, tp->snd_una),
|
||||
("%s: th_ack <= snd_una", __func__));
|
||||
|
||||
@ -2789,7 +2813,7 @@ tcp_do_segment(struct mbuf *m, struct tcphdr *th, struct socket *so,
|
||||
if (IN_FASTRECOVERY(tp->t_flags)) {
|
||||
if (SEQ_LT(th->th_ack, tp->snd_recover)) {
|
||||
if (tp->t_flags & TF_SACK_PERMIT)
|
||||
if (V_tcp_do_prr)
|
||||
if (V_tcp_do_prr && to.to_flags & TOF_SACK)
|
||||
tcp_prr_partialack(tp, th);
|
||||
else
|
||||
tcp_sack_partialack(tp, th);
|
||||
|
@ -750,6 +750,16 @@ tcp_sack_doack(struct tcpcb *tp, struct tcpopt *to, tcp_seq th_ack)
|
||||
else
|
||||
sblkp--;
|
||||
}
|
||||
if (!(to->to_flags & TOF_SACK))
|
||||
/*
|
||||
* If this ACK did not contain any
|
||||
* SACK blocks, and only moved the
|
||||
* left edge right, it is a pure
|
||||
* cumulative ACK. Do not count
|
||||
* DupAck for this. Also required
|
||||
* for RFC6675 rescue retransmission.
|
||||
*/
|
||||
sack_changed = 0;
|
||||
tp->sackhint.delivered_data = delivered_data;
|
||||
tp->sackhint.sacked_bytes += delivered_data - left_edge_delta;
|
||||
KASSERT((delivered_data >= 0), ("delivered_data < 0"));
|
||||
@ -800,6 +810,31 @@ tcp_sack_partialack(struct tcpcb *tp, struct tcphdr *th)
|
||||
if (tp->snd_cwnd > tp->snd_ssthresh)
|
||||
tp->snd_cwnd = tp->snd_ssthresh;
|
||||
tp->t_flags |= TF_ACKNOW;
|
||||
/*
|
||||
* RFC6675 rescue retransmission
|
||||
* Add a hole between th_ack (snd_una is not yet set) and snd_max,
|
||||
* if this was a pure cumulative ACK and no data was sent beyond
|
||||
* recovery point. Since the data in the socket has not been freed
|
||||
* at this point, we check if the scoreboard is empty, and the ACK
|
||||
* delivered some new data, indicating a full ACK. Also, if the
|
||||
* recovery point is still at snd_max, we are probably application
|
||||
* limited. However, this inference might not always be true. The
|
||||
* rescue retransmission may rarely be slightly premature
|
||||
* compared to RFC6675.
|
||||
* The corresponding ACK+SACK will cause any further outstanding
|
||||
* segments to be retransmitted. This addresses a corner case, when
|
||||
* the trailing packets of a window are lost and no further data
|
||||
* is available for sending.
|
||||
*/
|
||||
if ((V_tcp_do_rfc6675_pipe) &&
|
||||
SEQ_LT(th->th_ack, tp->snd_recover) &&
|
||||
(tp->snd_recover == tp->snd_max) &&
|
||||
TAILQ_EMPTY(&tp->snd_holes) &&
|
||||
(tp->sackhint.delivered_data > 0)) {
|
||||
struct sackhole *hole;
|
||||
int maxseg = tcp_maxseg(tp);
|
||||
hole = tcp_sackhole_insert(tp, SEQ_MAX(th->th_ack, tp->snd_max - maxseg), tp->snd_max, NULL);
|
||||
}
|
||||
(void) tp->t_fb->tfb_tcp_output(tp);
|
||||
}
|
||||
|
||||
|
Loading…
Reference in New Issue
Block a user