[tcp] Keep socket buffer locked until upcall

r367492 would unlock the socket buffer before eventually calling the upcall.
This leads to problematic interaction with NFS kernel server/client components
(MP threads) accessing the socket buffer with potentially not correctly updated
state.

Reported by: rmacklem
Reviewed by: tuexen, #transport
Tested by: rmacklem, otis
MFC after: 2 weeks
Sponsored by: NetApp, Inc.
Differential Revision: https://reviews.freebsd.org/D29690
This commit is contained in:
Richard Scheffenegger 2021-05-21 11:00:53 +02:00
parent 500eb6dd80
commit 032bf749fd
6 changed files with 37 additions and 46 deletions

View File

@@ -1509,18 +1509,17 @@ tcp_handle_wakeup(struct tcpcb *tp, struct socket *so)
* the TIME_WAIT state before coming here, we need
* to check if the socket is still connected.
*/
if ((so->so_state & SS_ISCONNECTED) == 0)
if (tp == NULL) {
return;
}
if (so == NULL) {
return;
}
INP_LOCK_ASSERT(tp->t_inpcb);
if (tp->t_flags & TF_WAKESOR) {
tp->t_flags &= ~TF_WAKESOR;
SOCKBUF_UNLOCK_ASSERT(&so->so_rcv);
sorwakeup(so);
}
if (tp->t_flags & TF_WAKESOW) {
tp->t_flags &= ~TF_WAKESOW;
SOCKBUF_UNLOCK_ASSERT(&so->so_snd);
sowwakeup(so);
SOCKBUF_LOCK_ASSERT(&so->so_rcv);
sorwakeup_locked(so);
}
}
@@ -1898,7 +1897,7 @@ tcp_do_segment(struct mbuf *m, struct tcphdr *th, struct socket *so,
else if (!tcp_timer_active(tp, TT_PERSIST))
tcp_timer_activate(tp, TT_REXMT,
tp->t_rxtcur);
tp->t_flags |= TF_WAKESOW;
sowwakeup(so);
if (sbavail(&so->so_snd))
(void) tp->t_fb->tfb_tcp_output(tp);
goto check_delack;
@@ -1963,8 +1962,8 @@ tcp_do_segment(struct mbuf *m, struct tcphdr *th, struct socket *so,
m_adj(m, drop_hdrlen); /* delayed header drop */
sbappendstream_locked(&so->so_rcv, m, 0);
}
SOCKBUF_UNLOCK(&so->so_rcv);
tp->t_flags |= TF_WAKESOR;
/* NB: sorwakeup_locked() does an implicit unlock. */
sorwakeup_locked(so);
if (DELAY_ACK(tp, tlen)) {
tp->t_flags |= TF_DELACK;
} else {
@@ -2502,9 +2501,11 @@ tcp_do_segment(struct mbuf *m, struct tcphdr *th, struct socket *so,
* If segment contains data or ACK, will call tcp_reass()
* later; if not, do so now to pass queued data to user.
*/
if (tlen == 0 && (thflags & TH_FIN) == 0)
if (tlen == 0 && (thflags & TH_FIN) == 0) {
(void) tcp_reass(tp, (struct tcphdr *)0, NULL, 0,
(struct mbuf *)0);
tcp_handle_wakeup(tp, so);
}
tp->snd_wl1 = th->th_seq - 1;
/* FALLTHROUGH */
@@ -2959,8 +2960,8 @@ tcp_do_segment(struct mbuf *m, struct tcphdr *th, struct socket *so,
tp->snd_wnd = 0;
ourfinisacked = 0;
}
SOCKBUF_UNLOCK(&so->so_snd);
tp->t_flags |= TF_WAKESOW;
/* NB: sowwakeup_locked() does an implicit unlock. */
sowwakeup_locked(so);
m_freem(mfree);
/* Detect una wraparound. */
if (!IN_RECOVERY(tp->t_flags) &&
@@ -3181,7 +3182,6 @@ tcp_do_segment(struct mbuf *m, struct tcphdr *th, struct socket *so,
m_freem(m);
else
sbappendstream_locked(&so->so_rcv, m, 0);
SOCKBUF_UNLOCK(&so->so_rcv);
tp->t_flags |= TF_WAKESOR;
} else {
/*
@@ -3227,6 +3227,7 @@ tcp_do_segment(struct mbuf *m, struct tcphdr *th, struct socket *so,
save_start + tlen);
}
}
tcp_handle_wakeup(tp, so);
#if 0
/*
* Note the amount of data that peer has sent into
@@ -3250,9 +3251,8 @@ tcp_do_segment(struct mbuf *m, struct tcphdr *th, struct socket *so,
*/
if (thflags & TH_FIN) {
if (TCPS_HAVERCVDFIN(tp->t_state) == 0) {
socantrcvmore(so);
/* The socket upcall is handled by socantrcvmore. */
tp->t_flags &= ~TF_WAKESOR;
socantrcvmore(so);
/*
* If connection is half-synchronized
* (ie NEEDSYN flag on) then delay ACK,
@@ -3316,7 +3316,6 @@ tcp_do_segment(struct mbuf *m, struct tcphdr *th, struct socket *so,
tp->t_flags &= ~TF_DELACK;
tcp_timer_activate(tp, TT_DELACK, tcp_delacktime);
}
tcp_handle_wakeup(tp, so);
INP_WUNLOCK(tp->t_inpcb);
return;
@@ -3350,7 +3349,6 @@ tcp_do_segment(struct mbuf *m, struct tcphdr *th, struct socket *so,
TCP_PROBE3(debug__input, tp, th, m);
tp->t_flags |= TF_ACKNOW;
(void) tp->t_fb->tfb_tcp_output(tp);
tcp_handle_wakeup(tp, so);
INP_WUNLOCK(tp->t_inpcb);
m_freem(m);
return;
@@ -3358,7 +3356,6 @@ tcp_do_segment(struct mbuf *m, struct tcphdr *th, struct socket *so,
dropwithreset:
if (tp != NULL) {
tcp_dropwithreset(m, th, tp, tlen, rstreason);
tcp_handle_wakeup(tp, so);
INP_WUNLOCK(tp->t_inpcb);
} else
tcp_dropwithreset(m, th, NULL, tlen, rstreason);
@@ -3375,7 +3372,6 @@ tcp_do_segment(struct mbuf *m, struct tcphdr *th, struct socket *so,
#endif
TCP_PROBE3(debug__input, tp, th, m);
if (tp != NULL) {
tcp_handle_wakeup(tp, so);
INP_WUNLOCK(tp->t_inpcb);
}
m_freem(m);

View File

@@ -959,7 +959,6 @@ tcp_reass(struct tcpcb *tp, struct tcphdr *th, tcp_seq *seq_start,
} else {
sbappendstream_locked(&so->so_rcv, m, 0);
}
SOCKBUF_UNLOCK(&so->so_rcv);
tp->t_flags |= TF_WAKESOR;
return (flags);
}
@@ -1108,7 +1107,6 @@ tcp_reass(struct tcpcb *tp, struct tcphdr *th, tcp_seq *seq_start,
#ifdef TCP_REASS_LOGGING
tcp_reass_log_dump(tp);
#endif
SOCKBUF_UNLOCK(&so->so_rcv);
tp->t_flags |= TF_WAKESOR;
return (flags);
}

View File

@@ -7825,8 +7825,8 @@ bbr_process_ack(struct mbuf *m, struct tcphdr *th, struct socket *so,
acked_amount = min(acked, (int)sbavail(&so->so_snd));
tp->snd_wnd -= acked_amount;
mfree = sbcut_locked(&so->so_snd, acked_amount);
SOCKBUF_UNLOCK(&so->so_snd);
tp->t_flags |= TF_WAKESOW;
/* NB: sowwakeup_locked() does an implicit unlock. */
sowwakeup_locked(so);
m_freem(mfree);
if (SEQ_GT(th->th_ack, tp->snd_una)) {
bbr_collapse_rtt(tp, bbr, TCP_REXMTVAL(tp));
@@ -8302,7 +8302,6 @@ bbr_process_data(struct mbuf *m, struct tcphdr *th, struct socket *so,
appended =
#endif
sbappendstream_locked(&so->so_rcv, m, 0);
SOCKBUF_UNLOCK(&so->so_rcv);
tp->t_flags |= TF_WAKESOR;
#ifdef NETFLIX_SB_LIMITS
if (so->so_rcv.sb_shlim && appended != mcnt)
@@ -8353,6 +8352,7 @@ bbr_process_data(struct mbuf *m, struct tcphdr *th, struct socket *so,
save_start + tlen);
}
}
tcp_handle_wakeup(tp, so);
} else {
m_freem(m);
thflags &= ~TH_FIN;
@@ -8364,9 +8364,8 @@ bbr_process_data(struct mbuf *m, struct tcphdr *th, struct socket *so,
*/
if (thflags & TH_FIN) {
if (TCPS_HAVERCVDFIN(tp->t_state) == 0) {
socantrcvmore(so);
/* The socket upcall is handled by socantrcvmore. */
tp->t_flags &= ~TF_WAKESOR;
socantrcvmore(so);
/*
* If connection is half-synchronized (ie NEEDSYN
* flag on) then delay ACK, so it may be piggybacked
@@ -8557,8 +8556,8 @@ bbr_do_fastnewdata(struct mbuf *m, struct tcphdr *th, struct socket *so,
sbappendstream_locked(&so->so_rcv, m, 0);
ctf_calc_rwin(so, tp);
}
SOCKBUF_UNLOCK(&so->so_rcv);
tp->t_flags |= TF_WAKESOR;
/* NB: sorwakeup_locked() does an implicit unlock. */
sorwakeup_locked(so);
#ifdef NETFLIX_SB_LIMITS
if (so->so_rcv.sb_shlim && mcnt != appended)
counter_fo_release(so->so_rcv.sb_shlim, mcnt - appended);
@@ -8749,7 +8748,7 @@ bbr_fastack(struct mbuf *m, struct tcphdr *th, struct socket *so,
&tcp_savetcp, 0);
#endif
/* Wake up the socket if we have room to write more */
tp->t_flags |= TF_WAKESOW;
sowwakeup(so);
if (tp->snd_una == tp->snd_max) {
/* Nothing left outstanding */
bbr_log_progress_event(bbr, tp, ticks, PROGRESS_CLEAR, __LINE__);
@@ -9157,9 +9156,11 @@ bbr_do_syn_recv(struct mbuf *m, struct tcphdr *th, struct socket *so,
* If segment contains data or ACK, will call tcp_reass() later; if
* not, do so now to pass queued data to user.
*/
if (tlen == 0 && (thflags & TH_FIN) == 0)
if (tlen == 0 && (thflags & TH_FIN) == 0) {
(void)tcp_reass(tp, (struct tcphdr *)0, NULL, 0,
(struct mbuf *)0);
tcp_handle_wakeup(tp, so);
}
tp->snd_wl1 = th->th_seq - 1;
if (bbr_process_ack(m, th, so, tp, to, tiwin, tlen, &ourfinisacked, thflags, &ret_val)) {
return (ret_val);
@@ -11711,7 +11712,6 @@ bbr_do_segment(struct mbuf *m, struct tcphdr *th, struct socket *so,
retval = bbr_do_segment_nounlock(m, th, so, tp,
drop_hdrlen, tlen, iptos, 0, &tv);
if (retval == 0) {
tcp_handle_wakeup(tp, so);
INP_WUNLOCK(tp->t_inpcb);
}
}

View File

@@ -9857,8 +9857,8 @@ rack_process_ack(struct mbuf *m, struct tcphdr *th, struct socket *so,
if (acked_amount && sbavail(&so->so_snd))
rack_adjust_sendmap(rack, &so->so_snd, tp->snd_una);
rack_log_wakeup(tp,rack, &so->so_snd, acked, 2);
SOCKBUF_UNLOCK(&so->so_snd);
tp->t_flags |= TF_WAKESOW;
/* NB: sowwakeup_locked() does an implicit unlock. */
sowwakeup_locked(so);
m_freem(mfree);
if (SEQ_GT(tp->snd_una, tp->snd_recover))
tp->snd_recover = tp->snd_una;
@@ -10208,7 +10208,6 @@ rack_process_data(struct mbuf *m, struct tcphdr *th, struct socket *so,
sbappendstream_locked(&so->so_rcv, m, 0);
rack_log_wakeup(tp,rack, &so->so_rcv, tlen, 1);
SOCKBUF_UNLOCK(&so->so_rcv);
tp->t_flags |= TF_WAKESOR;
#ifdef NETFLIX_SB_LIMITS
if (so->so_rcv.sb_shlim && appended != mcnt)
@@ -10265,6 +10264,7 @@ rack_process_data(struct mbuf *m, struct tcphdr *th, struct socket *so,
save_start + tlen);
}
}
tcp_handle_wakeup(tp, so);
} else {
m_freem(m);
thflags &= ~TH_FIN;
@@ -10276,9 +10276,8 @@ rack_process_data(struct mbuf *m, struct tcphdr *th, struct socket *so,
*/
if (thflags & TH_FIN) {
if (TCPS_HAVERCVDFIN(tp->t_state) == 0) {
socantrcvmore(so);
/* The socket upcall is handled by socantrcvmore. */
tp->t_flags &= ~TF_WAKESOR;
socantrcvmore(so);
/*
* If connection is half-synchronized (ie NEEDSYN
* flag on) then delay ACK, so it may be piggybacked
@@ -10471,7 +10470,6 @@ rack_do_fastnewdata(struct mbuf *m, struct tcphdr *th, struct socket *so,
ctf_calc_rwin(so, tp);
}
rack_log_wakeup(tp,rack, &so->so_rcv, tlen, 1);
SOCKBUF_UNLOCK(&so->so_rcv);
tp->t_flags |= TF_WAKESOR;
#ifdef NETFLIX_SB_LIMITS
if (so->so_rcv.sb_shlim && mcnt != appended)
@@ -10631,8 +10629,7 @@ rack_fastack(struct mbuf *m, struct tcphdr *th, struct socket *so,
rack_adjust_sendmap(rack, &so->so_snd, tp->snd_una);
/* Wake up the socket if we have room to write more */
rack_log_wakeup(tp,rack, &so->so_snd, acked, 2);
SOCKBUF_UNLOCK(&so->so_snd);
tp->t_flags |= TF_WAKESOW;
sowwakeup(so);
m_freem(mfree);
tp->t_rxtshift = 0;
RACK_TCPT_RANGESET(tp->t_rxtcur, RACK_REXMTVAL(tp),
@@ -11070,9 +11067,11 @@ rack_do_syn_recv(struct mbuf *m, struct tcphdr *th, struct socket *so,
* If segment contains data or ACK, will call tcp_reass() later; if
* not, do so now to pass queued data to user.
*/
if (tlen == 0 && (thflags & TH_FIN) == 0)
if (tlen == 0 && (thflags & TH_FIN) == 0) {
(void) tcp_reass(tp, (struct tcphdr *)0, NULL, 0,
(struct mbuf *)0);
tcp_handle_wakeup(tp, so);
}
tp->snd_wl1 = th->th_seq - 1;
/* For syn-recv we need to possibly update the rtt */
if ((to->to_flags & TOF_TS) != 0 && to->to_tsecr) {
@@ -13155,8 +13154,7 @@ rack_do_compressed_ack_processing(struct tcpcb *tp, struct socket *so, struct mb
rack_adjust_sendmap(rack, &so->so_snd, tp->snd_una);
/* Wake up the socket if we have room to write more */
rack_log_wakeup(tp,rack, &so->so_snd, acked, 2);
SOCKBUF_UNLOCK(&so->so_snd);
tp->t_flags |= TF_WAKESOW;
sowwakeup(so);
m_freem(mfree);
}
/* update progress */

View File

@@ -533,7 +533,6 @@ ctf_do_queued_segments(struct socket *so, struct tcpcb *tp, int have_pkt)
/* We lost the tcpcb (maybe a RST came in)? */
return(1);
}
tcp_handle_wakeup(tp, so);
}
return (0);
}

View File

@@ -408,7 +408,7 @@ TAILQ_HEAD(tcp_funchead, tcp_function);
#define TF_FORCEDATA 0x00800000 /* force out a byte */
#define TF_TSO 0x01000000 /* TSO enabled on this connection */
#define TF_TOE 0x02000000 /* this connection is offloaded */
#define TF_WAKESOW 0x04000000 /* wake up send socket */
#define TF_UNUSED0 0x04000000 /* unused */
#define TF_UNUSED1 0x08000000 /* unused */
#define TF_LRD 0x10000000 /* Lost Retransmission Detection */
#define TF_CONGRECOVERY 0x20000000 /* congestion recovery mode */