Prevent premature SACK block transmission during loss recovery

Under specific conditions, a window update can be sent with
outdated SACK information. Some clients react to this by
subsequently delaying loss recovery, making TCP perform very
poorly.

Reported by:	chengc_netapp.com
Reviewed by:	rrs, jtl
MFC after:	2 weeks
Sponsored by:	NetApp, Inc.
Differential Revision:	https://reviews.freebsd.org/D24237
This commit is contained in:
Richard Scheffenegger 2020-11-08 18:47:05 +00:00
parent 2d39824195
commit 4d0770f172
6 changed files with 74 additions and 31 deletions

View File

@ -1461,6 +1461,29 @@ tcp_autorcvbuf(struct mbuf *m, struct tcphdr *th, struct socket *so,
return (newsize);
}
void
tcp_handle_wakeup(struct tcpcb *tp, struct socket *so)
{
/*
* Since tp might be gone if the session entered
* the TIME_WAIT state before coming here, we need
* to check if the socket is still connected.
*/
if ((so->so_state & SS_ISCONNECTED) == 0)
return;
INP_LOCK_ASSERT(tp->t_inpcb);
if (tp->t_flags & TF_WAKESOR) {
tp->t_flags &= ~TF_WAKESOR;
SOCKBUF_UNLOCK_ASSERT(&so->so_rcv);
sorwakeup(so);
}
if (tp->t_flags & TF_WAKESOW) {
tp->t_flags &= ~TF_WAKESOW;
SOCKBUF_UNLOCK_ASSERT(&so->so_snd);
sowwakeup(so);
}
}
void
tcp_do_segment(struct mbuf *m, struct tcphdr *th, struct socket *so,
struct tcpcb *tp, int drop_hdrlen, int tlen, uint8_t iptos)
@ -1811,7 +1834,7 @@ tcp_do_segment(struct mbuf *m, struct tcphdr *th, struct socket *so,
else if (!tcp_timer_active(tp, TT_PERSIST))
tcp_timer_activate(tp, TT_REXMT,
tp->t_rxtcur);
sowwakeup(so);
tp->t_flags |= TF_WAKESOW;
if (sbavail(&so->so_snd))
(void) tp->t_fb->tfb_tcp_output(tp);
goto check_delack;
@ -1876,8 +1899,8 @@ tcp_do_segment(struct mbuf *m, struct tcphdr *th, struct socket *so,
m_adj(m, drop_hdrlen); /* delayed header drop */
sbappendstream_locked(&so->so_rcv, m, 0);
}
/* NB: sorwakeup_locked() does an implicit unlock. */
sorwakeup_locked(so);
SOCKBUF_UNLOCK(&so->so_rcv);
tp->t_flags |= TF_WAKESOR;
if (DELAY_ACK(tp, tlen)) {
tp->t_flags |= TF_DELACK;
} else {
@ -2811,8 +2834,8 @@ process_ACK:
tp->snd_wnd = 0;
ourfinisacked = 0;
}
/* NB: sowwakeup_locked() does an implicit unlock. */
sowwakeup_locked(so);
SOCKBUF_UNLOCK(&so->so_snd);
tp->t_flags |= TF_WAKESOW;
m_freem(mfree);
/* Detect una wraparound. */
if (!IN_RECOVERY(tp->t_flags) &&
@ -3033,8 +3056,8 @@ dodata: /* XXX */
m_freem(m);
else
sbappendstream_locked(&so->so_rcv, m, 0);
/* NB: sorwakeup_locked() does an implicit unlock. */
sorwakeup_locked(so);
SOCKBUF_UNLOCK(&so->so_rcv);
tp->t_flags |= TF_WAKESOR;
} else {
/*
* XXX: Due to the header drop above "th" is
@ -3101,6 +3124,8 @@ dodata: /* XXX */
if (thflags & TH_FIN) {
if (TCPS_HAVERCVDFIN(tp->t_state) == 0) {
socantrcvmore(so);
/* The socket upcall is handled by socantrcvmore. */
tp->t_flags &= ~TF_WAKESOR;
/*
* If connection is half-synchronized
* (ie NEEDSYN flag on) then delay ACK,
@ -3164,6 +3189,7 @@ check_delack:
tp->t_flags &= ~TF_DELACK;
tcp_timer_activate(tp, TT_DELACK, tcp_delacktime);
}
tcp_handle_wakeup(tp, so);
INP_WUNLOCK(tp->t_inpcb);
return;
@ -3197,6 +3223,7 @@ dropafterack:
TCP_PROBE3(debug__input, tp, th, m);
tp->t_flags |= TF_ACKNOW;
(void) tp->t_fb->tfb_tcp_output(tp);
tcp_handle_wakeup(tp, so);
INP_WUNLOCK(tp->t_inpcb);
m_freem(m);
return;
@ -3204,6 +3231,7 @@ dropafterack:
dropwithreset:
if (tp != NULL) {
tcp_dropwithreset(m, th, tp, tlen, rstreason);
tcp_handle_wakeup(tp, so);
INP_WUNLOCK(tp->t_inpcb);
} else
tcp_dropwithreset(m, th, NULL, tlen, rstreason);
@ -3219,8 +3247,10 @@ drop:
&tcp_savetcp, 0);
#endif
TCP_PROBE3(debug__input, tp, th, m);
if (tp != NULL)
if (tp != NULL) {
tcp_handle_wakeup(tp, so);
INP_WUNLOCK(tp->t_inpcb);
}
m_freem(m);
}

View File

@ -959,7 +959,8 @@ new_entry:
} else {
sbappendstream_locked(&so->so_rcv, m, 0);
}
sorwakeup_locked(so);
SOCKBUF_UNLOCK(&so->so_rcv);
tp->t_flags |= TF_WAKESOR;
return (flags);
}
if (tcp_new_limits) {
@ -1107,6 +1108,7 @@ present:
#ifdef TCP_REASS_LOGGING
tcp_reass_log_dump(tp);
#endif
sorwakeup_locked(so);
SOCKBUF_UNLOCK(&so->so_rcv);
tp->t_flags |= TF_WAKESOR;
return (flags);
}

View File

@ -7876,8 +7876,8 @@ bbr_process_ack(struct mbuf *m, struct tcphdr *th, struct socket *so,
acked_amount = min(acked, (int)sbavail(&so->so_snd));
tp->snd_wnd -= acked_amount;
mfree = sbcut_locked(&so->so_snd, acked_amount);
/* NB: sowwakeup_locked() does an implicit unlock. */
sowwakeup_locked(so);
SOCKBUF_UNLOCK(&so->so_snd);
tp->t_flags |= TF_WAKESOW;
m_freem(mfree);
if (SEQ_GT(th->th_ack, tp->snd_una)) {
bbr_collapse_rtt(tp, bbr, TCP_REXMTVAL(tp));
@ -8353,8 +8353,8 @@ bbr_process_data(struct mbuf *m, struct tcphdr *th, struct socket *so,
appended =
#endif
sbappendstream_locked(&so->so_rcv, m, 0);
/* NB: sorwakeup_locked() does an implicit unlock. */
sorwakeup_locked(so);
SOCKBUF_UNLOCK(&so->so_rcv);
tp->t_flags |= TF_WAKESOR;
#ifdef NETFLIX_SB_LIMITS
if (so->so_rcv.sb_shlim && appended != mcnt)
counter_fo_release(so->so_rcv.sb_shlim,
@ -8414,6 +8414,8 @@ bbr_process_data(struct mbuf *m, struct tcphdr *th, struct socket *so,
if (thflags & TH_FIN) {
if (TCPS_HAVERCVDFIN(tp->t_state) == 0) {
socantrcvmore(so);
/* The socket upcall is handled by socantrcvmore. */
tp->t_flags &= ~TF_WAKESOR;
/*
* If connection is half-synchronized (ie NEEDSYN
* flag on) then delay ACK, so it may be piggybacked
@ -8604,8 +8606,8 @@ bbr_do_fastnewdata(struct mbuf *m, struct tcphdr *th, struct socket *so,
sbappendstream_locked(&so->so_rcv, m, 0);
ctf_calc_rwin(so, tp);
}
/* NB: sorwakeup_locked() does an implicit unlock. */
sorwakeup_locked(so);
SOCKBUF_UNLOCK(&so->so_rcv);
tp->t_flags |= TF_WAKESOR;
#ifdef NETFLIX_SB_LIMITS
if (so->so_rcv.sb_shlim && mcnt != appended)
counter_fo_release(so->so_rcv.sb_shlim, mcnt - appended);
@ -8796,7 +8798,7 @@ bbr_fastack(struct mbuf *m, struct tcphdr *th, struct socket *so,
&tcp_savetcp, 0);
#endif
/* Wake up the socket if we have room to write more */
sowwakeup(so);
tp->t_flags |= TF_WAKESOW;
if (tp->snd_una == tp->snd_max) {
/* Nothing left outstanding */
bbr_log_progress_event(bbr, tp, ticks, PROGRESS_CLEAR, __LINE__);
@ -11740,8 +11742,10 @@ bbr_do_segment(struct mbuf *m, struct tcphdr *th, struct socket *so,
}
retval = bbr_do_segment_nounlock(m, th, so, tp,
drop_hdrlen, tlen, iptos, 0, &tv);
if (retval == 0)
if (retval == 0) {
tcp_handle_wakeup(tp, so);
INP_WUNLOCK(tp->t_inpcb);
}
}
/*

View File

@ -8344,8 +8344,8 @@ rack_process_ack(struct mbuf *m, struct tcphdr *th, struct socket *so,
*/
ourfinisacked = 1;
}
/* NB: sowwakeup_locked() does an implicit unlock. */
sowwakeup_locked(so);
SOCKBUF_UNLOCK(&so->so_snd);
tp->t_flags |= TF_WAKESOW;
m_freem(mfree);
if (rack->r_ctl.rc_early_recovery == 0) {
if (IN_RECOVERY(tp->t_flags)) {
@ -8665,8 +8665,8 @@ rack_process_data(struct mbuf *m, struct tcphdr *th, struct socket *so,
appended =
#endif
sbappendstream_locked(&so->so_rcv, m, 0);
/* NB: sorwakeup_locked() does an implicit unlock. */
sorwakeup_locked(so);
SOCKBUF_UNLOCK(&so->so_rcv);
tp->t_flags |= TF_WAKESOR;
#ifdef NETFLIX_SB_LIMITS
if (so->so_rcv.sb_shlim && appended != mcnt)
counter_fo_release(so->so_rcv.sb_shlim,
@ -8731,6 +8731,8 @@ rack_process_data(struct mbuf *m, struct tcphdr *th, struct socket *so,
if (thflags & TH_FIN) {
if (TCPS_HAVERCVDFIN(tp->t_state) == 0) {
socantrcvmore(so);
/* The socket upcall is handled by socantrcvmore. */
tp->t_flags &= ~TF_WAKESOR;
/*
* If connection is half-synchronized (ie NEEDSYN
* flag on) then delay ACK, so it may be piggybacked
@ -8922,8 +8924,8 @@ rack_do_fastnewdata(struct mbuf *m, struct tcphdr *th, struct socket *so,
sbappendstream_locked(&so->so_rcv, m, 0);
ctf_calc_rwin(so, tp);
}
/* NB: sorwakeup_locked() does an implicit unlock. */
sorwakeup_locked(so);
SOCKBUF_UNLOCK(&so->so_rcv);
tp->t_flags |= TF_WAKESOR;
#ifdef NETFLIX_SB_LIMITS
if (so->so_rcv.sb_shlim && mcnt != appended)
counter_fo_release(so->so_rcv.sb_shlim, mcnt - appended);
@ -9140,7 +9142,7 @@ rack_fastack(struct mbuf *m, struct tcphdr *th, struct socket *so,
rack_timer_cancel(tp, rack, rack->r_ctl.rc_rcvtime, __LINE__);
}
/* Wake up the socket if we have room to write more */
sowwakeup(so);
tp->t_flags |= TF_WAKESOW;
if (sbavail(&so->so_snd)) {
rack->r_wanted_output = 1;
}
@ -11188,8 +11190,10 @@ rack_do_segment(struct mbuf *m, struct tcphdr *th, struct socket *so,
tcp_get_usecs(&tv);
}
if(rack_do_segment_nounlock(m, th, so, tp,
drop_hdrlen, tlen, iptos, 0, &tv) == 0)
drop_hdrlen, tlen, iptos, 0, &tv) == 0) {
tcp_handle_wakeup(tp, so);
INP_WUNLOCK(tp->t_inpcb);
}
}
struct rack_sendmap *

View File

@ -458,6 +458,7 @@ ctf_do_queued_segments(struct socket *so, struct tcpcb *tp, int have_pkt)
/* We lost the tcpcb (maybe a RST came in)? */
return(1);
}
tcp_handle_wakeup(tp, so);
}
return (0);
}

View File

@ -381,7 +381,7 @@ TAILQ_HEAD(tcp_funchead, tcp_function);
#define TF_NEEDFIN 0x00000800 /* send FIN (implicit state) */
#define TF_NOPUSH 0x00001000 /* don't push */
#define TF_PREVVALID 0x00002000 /* saved values for bad rxmit valid */
#define TF_UNUSED1 0x00004000 /* unused */
#define TF_WAKESOR 0x00004000 /* wake up receive socket */
#define TF_GPUTINPROG 0x00008000 /* Goodput measurement in progress */
#define TF_MORETOCOME 0x00010000 /* More data to be appended to sock */
#define TF_LQ_OVERFLOW 0x00020000 /* listen queue overflow */
@ -393,9 +393,9 @@ TAILQ_HEAD(tcp_funchead, tcp_function);
#define TF_FORCEDATA 0x00800000 /* force out a byte */
#define TF_TSO 0x01000000 /* TSO enabled on this connection */
#define TF_TOE 0x02000000 /* this connection is offloaded */
#define TF_UNUSED3 0x04000000 /* unused */
#define TF_UNUSED4 0x08000000 /* unused */
#define TF_UNUSED5 0x10000000 /* unused */
#define TF_WAKESOW 0x04000000 /* wake up send socket */
#define TF_UNUSED1 0x08000000 /* unused */
#define TF_UNUSED2 0x10000000 /* unused */
#define TF_CONGRECOVERY 0x20000000 /* congestion recovery mode */
#define TF_WASCRECOVERY 0x40000000 /* was in congestion recovery */
#define TF_FASTOPEN 0x80000000 /* TCP Fast Open indication */
@ -931,7 +931,8 @@ char *tcp_log_addrs(struct in_conninfo *, struct tcphdr *, void *,
const void *);
char *tcp_log_vain(struct in_conninfo *, struct tcphdr *, void *,
const void *);
int tcp_reass(struct tcpcb *, struct tcphdr *, tcp_seq *, int *, struct mbuf *);
int tcp_reass(struct tcpcb *, struct tcphdr *, tcp_seq *, int *,
struct mbuf *);
void tcp_reass_global_init(void);
void tcp_reass_flush(struct tcpcb *);
void tcp_dooptions(struct tcpopt *, u_char *, int, int);
@ -955,6 +956,7 @@ void hhook_run_tcp_est_in(struct tcpcb *tp,
int tcp_input(struct mbuf **, int *, int);
int tcp_autorcvbuf(struct mbuf *, struct tcphdr *, struct socket *,
struct tcpcb *, int);
void tcp_handle_wakeup(struct tcpcb *, struct socket *);
void tcp_do_segment(struct mbuf *, struct tcphdr *,
struct socket *, struct tcpcb *, int, int, uint8_t);