From 0471a8c7340274a5cd4b8c963965493f459c9662 Mon Sep 17 00:00:00 2001 From: Richard Scheffenegger Date: Mon, 10 May 2021 18:47:47 +0200 Subject: [PATCH] tcp: SACK Lost Retransmission Detection (LRD) Recover from excessive losses without reverting to a retransmission timeout (RTO). Disabled by default, enable with sysctl net.inet.tcp.do_lrd=1 Reviewed By: #transport, rrs, tuexen, #manpages Sponsored by: Netapp, Inc. Differential Revision: https://reviews.freebsd.org/D28931 --- share/man/man4/tcp.4 | 7 ++++ sys/netinet/tcp.h | 1 + sys/netinet/tcp_input.c | 14 ++++++-- sys/netinet/tcp_output.c | 8 +++++ sys/netinet/tcp_sack.c | 73 ++++++++++++++++++++++++++++++++++++---- sys/netinet/tcp_subr.c | 2 ++ sys/netinet/tcp_usrreq.c | 9 +++++ sys/netinet/tcp_var.h | 9 +++-- usr.bin/netstat/inet.c | 2 ++ 9 files changed, 114 insertions(+), 11 deletions(-) diff --git a/share/man/man4/tcp.4 b/share/man/man4/tcp.4 index d06630aa8d44..93d1e075e92d 100644 --- a/share/man/man4/tcp.4 +++ b/share/man/man4/tcp.4 @@ -547,6 +547,13 @@ This gently reduces the congestion window during periods, where TCP is application limited and the network bandwidth is not utilized completely. That prevents self-inflicted packet losses once the application starts to transmit data at a higher speed. +.It Va do_lrd +Enable Lost Retransmission Detection for SACK-enabled sessions, disabled by +default. +Under severe congestion, a retransmission can be lost which then leads to a +mandatory Retransmission Timeout (RTO), followed by slow-start. +LRD will try to resend the repeatedly lost packet, preventing the time-consuming +RTO and performance reducing slow-start. .It Va do_prr Perform SACK loss recovery using the Proportional Rate Reduction (PRR) algorithm described in RFC6937. 
diff --git a/sys/netinet/tcp.h b/sys/netinet/tcp.h index 50f0811a6517..7ba99df51ed3 100644 --- a/sys/netinet/tcp.h +++ b/sys/netinet/tcp.h @@ -199,6 +199,7 @@ struct tcphdr { #define TCP_PROC_ACCOUNTING 76 /* Do accounting on tcp cpu usage and counts */ #define TCP_USE_CMP_ACKS 77 /* The transport can handle the Compressed mbuf acks */ #define TCP_PERF_INFO 78 /* retrieve accounting counters */ +#define TCP_LRD 79 /* toggle Lost Retransmission Detection for A/B testing */ #define TCP_KEEPINIT 128 /* N, time to establish connection */ #define TCP_KEEPIDLE 256 /* L,N,X start keeplives after this period */ #define TCP_KEEPINTVL 512 /* L,N interval between keepalives */ diff --git a/sys/netinet/tcp_input.c b/sys/netinet/tcp_input.c index bfa95feb7eee..49db8cc63cb3 100644 --- a/sys/netinet/tcp_input.c +++ b/sys/netinet/tcp_input.c @@ -164,6 +164,11 @@ SYSCTL_INT(_net_inet_tcp, OID_AUTO, do_prr, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(tcp_do_prr), 1, "Enable Proportional Rate Reduction per RFC 6937"); +VNET_DEFINE(int, tcp_do_lrd) = 0; +SYSCTL_INT(_net_inet_tcp, OID_AUTO, do_lrd, CTLFLAG_VNET | CTLFLAG_RW, + &VNET_NAME(tcp_do_lrd), 1, + "Perform Lost Retransmission Detection"); + VNET_DEFINE(int, tcp_do_newcwv) = 0; SYSCTL_INT(_net_inet_tcp, OID_AUTO, newcwv, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(tcp_do_newcwv), 0, @@ -2523,9 +2528,12 @@ tcp_do_segment(struct mbuf *m, struct tcphdr *th, struct socket *so, } if ((tp->t_flags & TF_SACK_PERMIT) && ((to.to_flags & TOF_SACK) || - !TAILQ_EMPTY(&tp->snd_holes))) - sack_changed = tcp_sack_doack(tp, &to, th->th_ack); - else + !TAILQ_EMPTY(&tp->snd_holes))) { + if (((sack_changed = tcp_sack_doack(tp, &to, th->th_ack)) != 0) && + (tp->t_flags & TF_LRD)) { + tcp_sack_lost_retransmission(tp, th); + } + } else /* * Reset the value so that previous (valid) value * from the last ack with SACK doesn't get used. 
diff --git a/sys/netinet/tcp_output.c b/sys/netinet/tcp_output.c index 5bda2be14df0..20b9c0371122 100644 --- a/sys/netinet/tcp_output.c +++ b/sys/netinet/tcp_output.c @@ -1264,6 +1264,14 @@ tcp_output(struct tcpcb *tp) } else { th->th_seq = htonl(p->rxmit); p->rxmit += len; + /* + * Lost Retransmission Detection + * trigger resending of a (then + * still existing) hole, when + * fack acks recoverypoint. + */ + if ((tp->t_flags & TF_LRD) && SEQ_GEQ(p->rxmit, p->end)) + p->rxmit = tp->snd_recover; tp->sackhint.sack_bytes_rexmit += len; } if (IN_RECOVERY(tp->t_flags)) { diff --git a/sys/netinet/tcp_sack.c b/sys/netinet/tcp_sack.c index 96056b5efd4d..9753536926d5 100644 --- a/sys/netinet/tcp_sack.c +++ b/sys/netinet/tcp_sack.c @@ -119,6 +119,7 @@ __FBSDID("$FreeBSD$"); #include #include #include +#include #ifdef TCPDEBUG #include #endif /* TCPDEBUG */ @@ -730,7 +731,8 @@ tcp_sack_doack(struct tcpcb *tp, struct tcpopt *to, tcp_seq th_ack) cur = TAILQ_PREV(cur, sackhole_head, scblink); continue; } - tp->sackhint.sack_bytes_rexmit -= (cur->rxmit - cur->start); + tp->sackhint.sack_bytes_rexmit -= + (SEQ_MIN(cur->rxmit, cur->end) - cur->start); KASSERT(tp->sackhint.sack_bytes_rexmit >= 0, ("sackhint bytes rtx >= 0")); sack_changed = 1; @@ -761,6 +763,8 @@ tcp_sack_doack(struct tcpcb *tp, struct tcpopt *to, tcp_seq th_ack) delivered_data += (cur->end - sblkp->start); cur->end = sblkp->start; cur->rxmit = SEQ_MIN(cur->rxmit, cur->end); + if ((tp->t_flags & TF_LRD) && SEQ_GEQ(cur->rxmit, cur->end)) + cur->rxmit = tp->snd_recover; } else { /* * ACKs some data in middle of a hole; need @@ -771,18 +775,21 @@ tcp_sack_doack(struct tcpcb *tp, struct tcpopt *to, tcp_seq th_ack) if (temp != NULL) { if (SEQ_GT(cur->rxmit, temp->rxmit)) { temp->rxmit = cur->rxmit; - tp->sackhint.sack_bytes_rexmit - += (temp->rxmit - - temp->start); + tp->sackhint.sack_bytes_rexmit += + (SEQ_MIN(temp->rxmit, + temp->end) - temp->start); } cur->end = sblkp->start; cur->rxmit = SEQ_MIN(cur->rxmit, 
cur->end); + if ((tp->t_flags & TF_LRD) && SEQ_GEQ(cur->rxmit, cur->end)) + cur->rxmit = tp->snd_recover; delivered_data += (sblkp->end - sblkp->start); } } } - tp->sackhint.sack_bytes_rexmit += (cur->rxmit - cur->start); + tp->sackhint.sack_bytes_rexmit += + (SEQ_MIN(cur->rxmit, cur->end) - cur->start); /* * Testing sblkp->start against cur->start tells us whether * we're done with the sack block or the sack hole. @@ -912,7 +919,7 @@ tcp_sack_output_debug(struct tcpcb *tp, int *sack_bytes_rexmt) *sack_bytes_rexmt += (p->rxmit - p->start); break; } - *sack_bytes_rexmt += (p->rxmit - p->start); + *sack_bytes_rexmt += (SEQ_MIN(p->rxmit, p->end) - p->start); } return (p); } @@ -989,3 +996,57 @@ tcp_sack_adjust(struct tcpcb *tp) return; tp->snd_nxt = tp->snd_fack; } + +/* + * Lost Retransmission Detection + * Check if FACK is beyond the rexmit of the leftmost hole. + * If yes, we restart sending from still existing holes, + * and adjust cwnd via the congestion control module. + */ +void +tcp_sack_lost_retransmission(struct tcpcb *tp, struct tcphdr *th) +{ + struct sackhole *temp; + uint32_t prev_cwnd; + if (IN_RECOVERY(tp->t_flags) && + SEQ_GT(tp->snd_fack, tp->snd_recover) && + ((temp = TAILQ_FIRST(&tp->snd_holes)) != NULL) && + SEQ_GEQ(temp->rxmit, temp->end) && + SEQ_GEQ(tp->snd_fack, temp->rxmit)) { + TCPSTAT_INC(tcps_sack_lostrexmt); + /* + * Start retransmissions from the first hole, and + * subsequently all other remaining holes, including + * those, which had been sent completely before. + */ + tp->sackhint.nexthole = temp; + TAILQ_FOREACH(temp, &tp->snd_holes, scblink) { + if (SEQ_GEQ(tp->snd_fack, temp->rxmit) && + SEQ_GEQ(temp->rxmit, temp->end)) + temp->rxmit = temp->start; + } + /* + * Save the current cwnd in prev_cwnd. NOTE(review): prev_cwnd + * is never read again in this function. Then set cwnd to + * ssthresh just prior to invoking another cwnd reduction by + * the CC module, to not shrink it excessively.
+ */ + prev_cwnd = tp->snd_cwnd; + tp->snd_cwnd = tp->snd_ssthresh; + /* + * Formally exit recovery, and let the CC module adjust + * ssthresh as intended. + */ + EXIT_RECOVERY(tp->t_flags); + cc_cong_signal(tp, th, CC_NDUPACK); + /* + * For PRR, adjust recover_fs as if this new reduction + * initialized this variable. + * cwnd will be adjusted by SACK or PRR processing + * subsequently, only set it to a safe value here. + */ + tp->snd_cwnd = tcp_maxseg(tp); + tp->sackhint.recover_fs = (tp->snd_max - tp->snd_una) - + tp->sackhint.recover_fs; + } +} diff --git a/sys/netinet/tcp_subr.c b/sys/netinet/tcp_subr.c index 5f2997163471..c44f26f78a2f 100644 --- a/sys/netinet/tcp_subr.c +++ b/sys/netinet/tcp_subr.c @@ -2171,6 +2171,8 @@ tcp_newtcpcb(struct inpcb *inp) if (V_tcp_perconn_stats_enable == 1) tp->t_stats = stats_blob_alloc(V_tcp_perconn_stats_dflt_tpl, 0); #endif + if (V_tcp_do_lrd) + tp->t_flags |= TF_LRD; return (tp); /* XXX */ } diff --git a/sys/netinet/tcp_usrreq.c b/sys/netinet/tcp_usrreq.c index cbc36860bf32..061681ddc2bc 100644 --- a/sys/netinet/tcp_usrreq.c +++ b/sys/netinet/tcp_usrreq.c @@ -2001,6 +2001,7 @@ tcp_default_ctloutput(struct socket *so, struct sockopt *sopt, struct inpcb *inp case TCP_NODELAY: case TCP_NOOPT: + case TCP_LRD: INP_WUNLOCK(inp); error = sooptcopyin(sopt, &optval, sizeof optval, sizeof optval); @@ -2015,6 +2016,9 @@ tcp_default_ctloutput(struct socket *so, struct sockopt *sopt, struct inpcb *inp case TCP_NOOPT: opt = TF_NOOPT; break; + case TCP_LRD: + opt = TF_LRD; + break; default: opt = 0; /* dead code to fool gcc */ break; @@ -2562,6 +2566,11 @@ tcp_default_ctloutput(struct socket *so, struct sockopt *sopt, struct inpcb *inp error = sooptcopyout(sopt, &optval, sizeof(optval)); break; #endif + case TCP_LRD: + optval = tp->t_flags & TF_LRD; + INP_WUNLOCK(inp); + error = sooptcopyout(sopt, &optval, sizeof optval); + break; default: INP_WUNLOCK(inp); error = ENOPROTOOPT; diff --git a/sys/netinet/tcp_var.h b/sys/netinet/tcp_var.h
index 515362c6bf9e..b80746b1ede4 100644 --- a/sys/netinet/tcp_var.h +++ b/sys/netinet/tcp_var.h @@ -410,7 +410,7 @@ TAILQ_HEAD(tcp_funchead, tcp_function); #define TF_TOE 0x02000000 /* this connection is offloaded */ #define TF_WAKESOW 0x04000000 /* wake up send socket */ #define TF_UNUSED1 0x08000000 /* unused */ -#define TF_UNUSED2 0x10000000 /* unused */ +#define TF_LRD 0x10000000 /* Lost Retransmission Detection */ #define TF_CONGRECOVERY 0x20000000 /* congestion recovery mode */ #define TF_WASCRECOVERY 0x40000000 /* was in congestion recovery */ #define TF_FASTOPEN 0x80000000 /* TCP Fast Open indication */ @@ -673,6 +673,7 @@ struct tcpstat { uint64_t tcps_sack_rexmit_bytes; /* SACK rexmit bytes */ uint64_t tcps_sack_rcv_blocks; /* SACK blocks (options) received */ uint64_t tcps_sack_send_blocks; /* SACK blocks (options) sent */ + uint64_t tcps_sack_lostrexmt; /* SACK lost retransmission recovered */ uint64_t tcps_sack_sboverflow; /* times scoreboard overflowed */ /* ECN related stats */ @@ -697,7 +698,7 @@ struct tcpstat { uint64_t tcps_tunneled_pkts; /* Packets encap's in UDP received */ uint64_t tcps_tunneled_errs; /* Packets that had errors that were UDP encaped */ - uint64_t _pad[10]; /* 6 UTO, 6 TBD */ + uint64_t _pad[9]; /* 6 UTO, 3 TBD */ }; #define tcps_rcvmemdrop tcps_rcvreassfull /* compat */ @@ -859,6 +860,7 @@ VNET_DECLARE(int, tcp_delack_enabled); VNET_DECLARE(int, tcp_do_autorcvbuf); VNET_DECLARE(int, tcp_do_autosndbuf); VNET_DECLARE(int, tcp_do_ecn); +VNET_DECLARE(int, tcp_do_lrd); VNET_DECLARE(int, tcp_do_prr); VNET_DECLARE(int, tcp_do_prr_conservative); VNET_DECLARE(int, tcp_do_newcwv); @@ -893,6 +895,7 @@ VNET_DECLARE(int, tcp_udp_tunneling_port); VNET_DECLARE(struct inpcbhead, tcb); VNET_DECLARE(struct inpcbinfo, tcbinfo); +#define V_tcp_do_lrd VNET(tcp_do_lrd) #define V_tcp_do_prr VNET(tcp_do_prr) #define V_tcp_do_prr_conservative VNET(tcp_do_prr_conservative) #define V_tcp_do_newcwv VNET(tcp_do_newcwv) @@ -1091,8 +1094,10 @@ void 
tcp_clean_sackreport(struct tcpcb *tp); void tcp_sack_adjust(struct tcpcb *tp); struct sackhole *tcp_sack_output(struct tcpcb *tp, int *sack_bytes_rexmt); void tcp_do_prr_ack(struct tcpcb *, struct tcphdr *, struct tcpopt *); +void tcp_lost_retransmission(struct tcpcb *, struct tcphdr *); void tcp_sack_partialack(struct tcpcb *, struct tcphdr *); void tcp_free_sackholes(struct tcpcb *tp); +void tcp_sack_lost_retransmission(struct tcpcb *, struct tcphdr *); int tcp_newreno(struct tcpcb *, struct tcphdr *); int tcp_compute_pipe(struct tcpcb *); uint32_t tcp_compute_initwnd(uint32_t); diff --git a/usr.bin/netstat/inet.c b/usr.bin/netstat/inet.c index 49478c4a9247..45b107b42d39 100644 --- a/usr.bin/netstat/inet.c +++ b/usr.bin/netstat/inet.c @@ -809,6 +809,8 @@ tcp_stats(u_long off, const char *name, int af1 __unused, int proto __unused) "{N:/SACK option%s (SACK blocks) received}\n"); p(tcps_sack_send_blocks, "\t{:sent-option-blocks/%ju} " "{N:/SACK option%s (SACK blocks) sent}\n"); + p(tcps_sack_lostrexmt, "\t{:lost-retransmissions/%ju} " + "{N:/SACK retransmission%s lost}\n"); p1a(tcps_sack_sboverflow, "\t{:scoreboard-overflows/%ju} " "{N:/SACK scoreboard overflow}\n");