tcp: SACK Lost Retransmission Detection (LRD)
Recover from excessive losses without reverting to a retransmission timeout (RTO). Disabled by default; enable with sysctl net.inet.tcp.do_lrd=1. Reviewed By: #transport, rrs, tuexen, #manpages. Sponsored by: Netapp, Inc. Differential Revision: https://reviews.freebsd.org/D28931
This commit is contained in:
parent
988b1bb0c5
commit
0471a8c734
@ -547,6 +547,13 @@ This gently reduces the congestion window during periods, where TCP is
|
||||
application limited and the network bandwidth is not utilized completely.
|
||||
That prevents self-inflicted packet losses once the application starts to
|
||||
transmit data at a higher speed.
|
||||
.It Va do_lrd
|
||||
Enable Lost Retransmission Detection for SACK-enabled sessions, disabled by
|
||||
default.
|
||||
Under severe congestion, a retransmission can be lost which then leads to a
|
||||
mandatory Retransmission Timeout (RTO), followed by slow-start.
|
||||
LRD will try to resend the repeatedly lost packet, preventing the time-consuming
|
||||
RTO and performance-reducing slow-start.
|
||||
.It Va do_prr
|
||||
Perform SACK loss recovery using the Proportional Rate Reduction (PRR) algorithm
|
||||
described in RFC6937.
|
||||
|
@ -199,6 +199,7 @@ struct tcphdr {
|
||||
#define TCP_PROC_ACCOUNTING 76 /* Do accounting on tcp cpu usage and counts */
|
||||
#define TCP_USE_CMP_ACKS 77 /* The transport can handle the Compressed mbuf acks */
|
||||
#define TCP_PERF_INFO 78 /* retrieve accounting counters */
|
||||
#define TCP_LRD 79 /* toggle Lost Retransmission Detection for A/B testing */
|
||||
#define TCP_KEEPINIT 128 /* N, time to establish connection */
|
||||
#define TCP_KEEPIDLE 256 /* L,N,X start keeplives after this period */
|
||||
#define TCP_KEEPINTVL 512 /* L,N interval between keepalives */
|
||||
|
@ -164,6 +164,11 @@ SYSCTL_INT(_net_inet_tcp, OID_AUTO, do_prr, CTLFLAG_VNET | CTLFLAG_RW,
|
||||
&VNET_NAME(tcp_do_prr), 1,
|
||||
"Enable Proportional Rate Reduction per RFC 6937");
|
||||
|
||||
VNET_DEFINE(int, tcp_do_lrd) = 0;
|
||||
SYSCTL_INT(_net_inet_tcp, OID_AUTO, do_lrd, CTLFLAG_VNET | CTLFLAG_RW,
|
||||
&VNET_NAME(tcp_do_lrd), 1,
|
||||
"Perform Lost Retransmission Detection");
|
||||
|
||||
VNET_DEFINE(int, tcp_do_newcwv) = 0;
|
||||
SYSCTL_INT(_net_inet_tcp, OID_AUTO, newcwv, CTLFLAG_VNET | CTLFLAG_RW,
|
||||
&VNET_NAME(tcp_do_newcwv), 0,
|
||||
@ -2523,9 +2528,12 @@ tcp_do_segment(struct mbuf *m, struct tcphdr *th, struct socket *so,
|
||||
}
|
||||
if ((tp->t_flags & TF_SACK_PERMIT) &&
|
||||
((to.to_flags & TOF_SACK) ||
|
||||
!TAILQ_EMPTY(&tp->snd_holes)))
|
||||
sack_changed = tcp_sack_doack(tp, &to, th->th_ack);
|
||||
else
|
||||
!TAILQ_EMPTY(&tp->snd_holes))) {
|
||||
if (((sack_changed = tcp_sack_doack(tp, &to, th->th_ack)) != 0) &&
|
||||
(tp->t_flags & TF_LRD)) {
|
||||
tcp_sack_lost_retransmission(tp, th);
|
||||
}
|
||||
} else
|
||||
/*
|
||||
* Reset the value so that previous (valid) value
|
||||
* from the last ack with SACK doesn't get used.
|
||||
|
@ -1264,6 +1264,14 @@ tcp_output(struct tcpcb *tp)
|
||||
} else {
|
||||
th->th_seq = htonl(p->rxmit);
|
||||
p->rxmit += len;
|
||||
/*
|
||||
* Lost Retransmission Detection
|
||||
* trigger resending of a (then
|
||||
* still existing) hole, when
|
||||
* fack acks recoverypoint.
|
||||
*/
|
||||
if ((tp->t_flags & TF_LRD) && SEQ_GEQ(p->rxmit, p->end))
|
||||
p->rxmit = tp->snd_recover;
|
||||
tp->sackhint.sack_bytes_rexmit += len;
|
||||
}
|
||||
if (IN_RECOVERY(tp->t_flags)) {
|
||||
|
@ -119,6 +119,7 @@ __FBSDID("$FreeBSD$");
|
||||
#include <netinet/tcp_var.h>
|
||||
#include <netinet6/tcp6_var.h>
|
||||
#include <netinet/tcpip.h>
|
||||
#include <netinet/cc/cc.h>
|
||||
#ifdef TCPDEBUG
|
||||
#include <netinet/tcp_debug.h>
|
||||
#endif /* TCPDEBUG */
|
||||
@ -730,7 +731,8 @@ tcp_sack_doack(struct tcpcb *tp, struct tcpopt *to, tcp_seq th_ack)
|
||||
cur = TAILQ_PREV(cur, sackhole_head, scblink);
|
||||
continue;
|
||||
}
|
||||
tp->sackhint.sack_bytes_rexmit -= (cur->rxmit - cur->start);
|
||||
tp->sackhint.sack_bytes_rexmit -=
|
||||
(SEQ_MIN(cur->rxmit, cur->end) - cur->start);
|
||||
KASSERT(tp->sackhint.sack_bytes_rexmit >= 0,
|
||||
("sackhint bytes rtx >= 0"));
|
||||
sack_changed = 1;
|
||||
@ -761,6 +763,8 @@ tcp_sack_doack(struct tcpcb *tp, struct tcpopt *to, tcp_seq th_ack)
|
||||
delivered_data += (cur->end - sblkp->start);
|
||||
cur->end = sblkp->start;
|
||||
cur->rxmit = SEQ_MIN(cur->rxmit, cur->end);
|
||||
if ((tp->t_flags & TF_LRD) && SEQ_GEQ(cur->rxmit, cur->end))
|
||||
cur->rxmit = tp->snd_recover;
|
||||
} else {
|
||||
/*
|
||||
* ACKs some data in middle of a hole; need
|
||||
@ -771,18 +775,21 @@ tcp_sack_doack(struct tcpcb *tp, struct tcpopt *to, tcp_seq th_ack)
|
||||
if (temp != NULL) {
|
||||
if (SEQ_GT(cur->rxmit, temp->rxmit)) {
|
||||
temp->rxmit = cur->rxmit;
|
||||
tp->sackhint.sack_bytes_rexmit
|
||||
+= (temp->rxmit
|
||||
- temp->start);
|
||||
tp->sackhint.sack_bytes_rexmit +=
|
||||
(SEQ_MIN(temp->rxmit,
|
||||
temp->end) - temp->start);
|
||||
}
|
||||
cur->end = sblkp->start;
|
||||
cur->rxmit = SEQ_MIN(cur->rxmit,
|
||||
cur->end);
|
||||
if ((tp->t_flags & TF_LRD) && SEQ_GEQ(cur->rxmit, cur->end))
|
||||
cur->rxmit = tp->snd_recover;
|
||||
delivered_data += (sblkp->end - sblkp->start);
|
||||
}
|
||||
}
|
||||
}
|
||||
tp->sackhint.sack_bytes_rexmit += (cur->rxmit - cur->start);
|
||||
tp->sackhint.sack_bytes_rexmit +=
|
||||
(SEQ_MIN(cur->rxmit, cur->end) - cur->start);
|
||||
/*
|
||||
* Testing sblkp->start against cur->start tells us whether
|
||||
* we're done with the sack block or the sack hole.
|
||||
@ -912,7 +919,7 @@ tcp_sack_output_debug(struct tcpcb *tp, int *sack_bytes_rexmt)
|
||||
*sack_bytes_rexmt += (p->rxmit - p->start);
|
||||
break;
|
||||
}
|
||||
*sack_bytes_rexmt += (p->rxmit - p->start);
|
||||
*sack_bytes_rexmt += (SEQ_MIN(p->rxmit, p->end) - p->start);
|
||||
}
|
||||
return (p);
|
||||
}
|
||||
@ -989,3 +996,57 @@ tcp_sack_adjust(struct tcpcb *tp)
|
||||
return;
|
||||
tp->snd_nxt = tp->snd_fack;
|
||||
}
|
||||
|
||||
/*
|
||||
* Lost Retransmission Detection
|
||||
* Check is FACK is beyond the rexmit of the leftmost hole.
|
||||
* If yes, we restart sending from still existing holes,
|
||||
* and adjust cwnd via the congestion control module.
|
||||
*/
|
||||
void
|
||||
tcp_sack_lost_retransmission(struct tcpcb *tp, struct tcphdr *th)
|
||||
{
|
||||
struct sackhole *temp;
|
||||
uint32_t prev_cwnd;
|
||||
if (IN_RECOVERY(tp->t_flags) &&
|
||||
SEQ_GT(tp->snd_fack, tp->snd_recover) &&
|
||||
((temp = TAILQ_FIRST(&tp->snd_holes)) != NULL) &&
|
||||
SEQ_GEQ(temp->rxmit, temp->end) &&
|
||||
SEQ_GEQ(tp->snd_fack, temp->rxmit)) {
|
||||
TCPSTAT_INC(tcps_sack_lostrexmt);
|
||||
/*
|
||||
* Start retransmissions from the first hole, and
|
||||
* subsequently all other remaining holes, including
|
||||
* those, which had been sent completely before.
|
||||
*/
|
||||
tp->sackhint.nexthole = temp;
|
||||
TAILQ_FOREACH(temp, &tp->snd_holes, scblink) {
|
||||
if (SEQ_GEQ(tp->snd_fack, temp->rxmit) &&
|
||||
SEQ_GEQ(temp->rxmit, temp->end))
|
||||
temp->rxmit = temp->start;
|
||||
}
|
||||
/*
|
||||
* Remember the old ssthresh, to deduct the beta factor used
|
||||
* by the CC module. Finally, set cwnd to ssthresh just
|
||||
* prior to invoking another cwnd reduction by the CC
|
||||
* module, to not shrink it excessively.
|
||||
*/
|
||||
prev_cwnd = tp->snd_cwnd;
|
||||
tp->snd_cwnd = tp->snd_ssthresh;
|
||||
/*
|
||||
* Formally exit recovery, and let the CC module adjust
|
||||
* ssthresh as intended.
|
||||
*/
|
||||
EXIT_RECOVERY(tp->t_flags);
|
||||
cc_cong_signal(tp, th, CC_NDUPACK);
|
||||
/*
|
||||
* For PRR, adjust recover_fs as if this new reduction
|
||||
* initialized this variable.
|
||||
* cwnd will be adjusted by SACK or PRR processing
|
||||
* subsequently, only set it to a safe value here.
|
||||
*/
|
||||
tp->snd_cwnd = tcp_maxseg(tp);
|
||||
tp->sackhint.recover_fs = (tp->snd_max - tp->snd_una) -
|
||||
tp->sackhint.recover_fs;
|
||||
}
|
||||
}
|
||||
|
@ -2171,6 +2171,8 @@ tcp_newtcpcb(struct inpcb *inp)
|
||||
if (V_tcp_perconn_stats_enable == 1)
|
||||
tp->t_stats = stats_blob_alloc(V_tcp_perconn_stats_dflt_tpl, 0);
|
||||
#endif
|
||||
if (V_tcp_do_lrd)
|
||||
tp->t_flags |= TF_LRD;
|
||||
return (tp); /* XXX */
|
||||
}
|
||||
|
||||
|
@ -2001,6 +2001,7 @@ tcp_default_ctloutput(struct socket *so, struct sockopt *sopt, struct inpcb *inp
|
||||
|
||||
case TCP_NODELAY:
|
||||
case TCP_NOOPT:
|
||||
case TCP_LRD:
|
||||
INP_WUNLOCK(inp);
|
||||
error = sooptcopyin(sopt, &optval, sizeof optval,
|
||||
sizeof optval);
|
||||
@ -2015,6 +2016,9 @@ tcp_default_ctloutput(struct socket *so, struct sockopt *sopt, struct inpcb *inp
|
||||
case TCP_NOOPT:
|
||||
opt = TF_NOOPT;
|
||||
break;
|
||||
case TCP_LRD:
|
||||
opt = TF_LRD;
|
||||
break;
|
||||
default:
|
||||
opt = 0; /* dead code to fool gcc */
|
||||
break;
|
||||
@ -2562,6 +2566,11 @@ tcp_default_ctloutput(struct socket *so, struct sockopt *sopt, struct inpcb *inp
|
||||
error = sooptcopyout(sopt, &optval, sizeof(optval));
|
||||
break;
|
||||
#endif
|
||||
case TCP_LRD:
|
||||
optval = tp->t_flags & TF_LRD;
|
||||
INP_WUNLOCK(inp);
|
||||
error = sooptcopyout(sopt, &optval, sizeof optval);
|
||||
break;
|
||||
default:
|
||||
INP_WUNLOCK(inp);
|
||||
error = ENOPROTOOPT;
|
||||
|
@ -410,7 +410,7 @@ TAILQ_HEAD(tcp_funchead, tcp_function);
|
||||
#define TF_TOE 0x02000000 /* this connection is offloaded */
|
||||
#define TF_WAKESOW 0x04000000 /* wake up send socket */
|
||||
#define TF_UNUSED1 0x08000000 /* unused */
|
||||
#define TF_UNUSED2 0x10000000 /* unused */
|
||||
#define TF_LRD 0x10000000 /* Lost Retransmission Detection */
|
||||
#define TF_CONGRECOVERY 0x20000000 /* congestion recovery mode */
|
||||
#define TF_WASCRECOVERY 0x40000000 /* was in congestion recovery */
|
||||
#define TF_FASTOPEN 0x80000000 /* TCP Fast Open indication */
|
||||
@ -673,6 +673,7 @@ struct tcpstat {
|
||||
uint64_t tcps_sack_rexmit_bytes; /* SACK rexmit bytes */
|
||||
uint64_t tcps_sack_rcv_blocks; /* SACK blocks (options) received */
|
||||
uint64_t tcps_sack_send_blocks; /* SACK blocks (options) sent */
|
||||
uint64_t tcps_sack_lostrexmt; /* SACK lost retransmission recovered */
|
||||
uint64_t tcps_sack_sboverflow; /* times scoreboard overflowed */
|
||||
|
||||
/* ECN related stats */
|
||||
@ -697,7 +698,7 @@ struct tcpstat {
|
||||
uint64_t tcps_tunneled_pkts; /* Packets encap's in UDP received */
|
||||
uint64_t tcps_tunneled_errs; /* Packets that had errors that were UDP encaped */
|
||||
|
||||
uint64_t _pad[10]; /* 6 UTO, 6 TBD */
|
||||
uint64_t _pad[9]; /* 6 UTO, 3 TBD */
|
||||
};
|
||||
|
||||
#define tcps_rcvmemdrop tcps_rcvreassfull /* compat */
|
||||
@ -859,6 +860,7 @@ VNET_DECLARE(int, tcp_delack_enabled);
|
||||
VNET_DECLARE(int, tcp_do_autorcvbuf);
|
||||
VNET_DECLARE(int, tcp_do_autosndbuf);
|
||||
VNET_DECLARE(int, tcp_do_ecn);
|
||||
VNET_DECLARE(int, tcp_do_lrd);
|
||||
VNET_DECLARE(int, tcp_do_prr);
|
||||
VNET_DECLARE(int, tcp_do_prr_conservative);
|
||||
VNET_DECLARE(int, tcp_do_newcwv);
|
||||
@ -893,6 +895,7 @@ VNET_DECLARE(int, tcp_udp_tunneling_port);
|
||||
VNET_DECLARE(struct inpcbhead, tcb);
|
||||
VNET_DECLARE(struct inpcbinfo, tcbinfo);
|
||||
|
||||
#define V_tcp_do_lrd VNET(tcp_do_lrd)
|
||||
#define V_tcp_do_prr VNET(tcp_do_prr)
|
||||
#define V_tcp_do_prr_conservative VNET(tcp_do_prr_conservative)
|
||||
#define V_tcp_do_newcwv VNET(tcp_do_newcwv)
|
||||
@ -1091,8 +1094,10 @@ void tcp_clean_sackreport(struct tcpcb *tp);
|
||||
void tcp_sack_adjust(struct tcpcb *tp);
|
||||
struct sackhole *tcp_sack_output(struct tcpcb *tp, int *sack_bytes_rexmt);
|
||||
void tcp_do_prr_ack(struct tcpcb *, struct tcphdr *, struct tcpopt *);
|
||||
void tcp_lost_retransmission(struct tcpcb *, struct tcphdr *);
|
||||
void tcp_sack_partialack(struct tcpcb *, struct tcphdr *);
|
||||
void tcp_free_sackholes(struct tcpcb *tp);
|
||||
void tcp_sack_lost_retransmission(struct tcpcb *, struct tcphdr *);
|
||||
int tcp_newreno(struct tcpcb *, struct tcphdr *);
|
||||
int tcp_compute_pipe(struct tcpcb *);
|
||||
uint32_t tcp_compute_initwnd(uint32_t);
|
||||
|
@ -809,6 +809,8 @@ tcp_stats(u_long off, const char *name, int af1 __unused, int proto __unused)
|
||||
"{N:/SACK option%s (SACK blocks) received}\n");
|
||||
p(tcps_sack_send_blocks, "\t{:sent-option-blocks/%ju} "
|
||||
"{N:/SACK option%s (SACK blocks) sent}\n");
|
||||
p(tcps_sack_lostrexmit, "\t{:lost-retransmissions/%ju} "
|
||||
"{N:/SACK retransmission%s lost}\n");
|
||||
p1a(tcps_sack_sboverflow, "\t{:scoreboard-overflows/%ju} "
|
||||
"{N:/SACK scoreboard overflow}\n");
|
||||
|
||||
|
Loading…
Reference in New Issue
Block a user