From 5a334970e35b6bdcc605326bd8d66ab8fd604e57 Mon Sep 17 00:00:00 2001 From: np Date: Sat, 1 Jun 2019 03:03:48 +0000 Subject: [PATCH] cxgbe/t4_tom: adjust the hardware receive window to match changes to the receive sockbuf's high water mark. Calculate rx credits on the spot instead of tracking sbused/sb_cc and rx_credits in the toepcb. The previous method worked when the high water mark changed due to SB_AUTOSIZE but not when it was adjusted directly (for example, by the soreserve in nfsrvd_addsock). This fixes a connection hang while running iozone over an NFS mounted share where nfsd's TCP sockets are being handled by t4_tom. MFC after: 3 days Sponsored by: Chelsio Communications --- sys/dev/cxgbe/cxgbei/cxgbei.c | 1 - sys/dev/cxgbe/tom/t4_connect.c | 7 ++-- sys/dev/cxgbe/tom/t4_cpl_io.c | 67 ++++++++++------------------------ sys/dev/cxgbe/tom/t4_ddp.c | 14 +------ sys/dev/cxgbe/tom/t4_listen.c | 4 +- sys/dev/cxgbe/tom/t4_tls.c | 45 ++++++----------------- sys/dev/cxgbe/tom/t4_tom.h | 4 +- 7 files changed, 37 insertions(+), 105 deletions(-) diff --git a/sys/dev/cxgbe/cxgbei/cxgbei.c b/sys/dev/cxgbe/cxgbei/cxgbei.c index b35154fef0b9..392da36bb95c 100644 --- a/sys/dev/cxgbe/cxgbei/cxgbei.c +++ b/sys/dev/cxgbe/cxgbei/cxgbei.c @@ -398,7 +398,6 @@ do_rx_iscsi_ddp(struct sge_iq *iq, const struct rss_header *rss, struct mbuf *m) tp->t_rcvtime = ticks; /* update rx credits */ - toep->rx_credits += pdu_len; t4_rcvd(&toep->td->tod, tp); /* XXX: sc->tom_softc.tod */ so = inp->inp_socket; diff --git a/sys/dev/cxgbe/tom/t4_connect.c b/sys/dev/cxgbe/tom/t4_connect.c index 27fd2d9c845d..068bf38b6860 100644 --- a/sys/dev/cxgbe/tom/t4_connect.c +++ b/sys/dev/cxgbe/tom/t4_connect.c @@ -385,8 +385,7 @@ t4_connect(struct toedev *tod, struct socket *so, struct rtentry *rt, toep->vnet = so->so_vnet; set_ulp_mode(toep, select_ulp_mode(so, sc, &settings)); SOCKBUF_LOCK(&so->so_rcv); - /* opt0 rcv_bufsiz initially, assumes its normal meaning later */ - toep->rx_credits = 
min(select_rcv_wnd(so) >> 10, M_RCV_BUFSIZ); + toep->opt0_rcv_bufsize = min(select_rcv_wnd(so) >> 10, M_RCV_BUFSIZ); SOCKBUF_UNLOCK(&so->so_rcv); /* @@ -440,7 +439,7 @@ t4_connect(struct toedev *tod, struct socket *so, struct rtentry *rt, cpl->peer_ip_hi = *(uint64_t *)&inp->in6p_faddr.s6_addr[0]; cpl->peer_ip_lo = *(uint64_t *)&inp->in6p_faddr.s6_addr[8]; cpl->opt0 = calc_opt0(so, vi, toep->l2te, mtu_idx, rscale, - toep->rx_credits, toep->ulp_mode, &settings); + toep->opt0_rcv_bufsize, toep->ulp_mode, &settings); cpl->opt2 = calc_opt2a(so, toep, &settings); } else { struct cpl_act_open_req *cpl = wrtod(wr); @@ -469,7 +468,7 @@ t4_connect(struct toedev *tod, struct socket *so, struct rtentry *rt, inp_4tuple_get(inp, &cpl->local_ip, &cpl->local_port, &cpl->peer_ip, &cpl->peer_port); cpl->opt0 = calc_opt0(so, vi, toep->l2te, mtu_idx, rscale, - toep->rx_credits, toep->ulp_mode, &settings); + toep->opt0_rcv_bufsize, toep->ulp_mode, &settings); cpl->opt2 = calc_opt2a(so, toep, &settings); } diff --git a/sys/dev/cxgbe/tom/t4_cpl_io.c b/sys/dev/cxgbe/tom/t4_cpl_io.c index 45408730f973..27ef745c764c 100644 --- a/sys/dev/cxgbe/tom/t4_cpl_io.c +++ b/sys/dev/cxgbe/tom/t4_cpl_io.c @@ -399,20 +399,10 @@ make_established(struct toepcb *toep, uint32_t iss, uint32_t irs, uint16_t opt) tp->irs = irs; tcp_rcvseqinit(tp); - tp->rcv_wnd = toep->rx_credits << 10; + tp->rcv_wnd = toep->opt0_rcv_bufsize << 10; tp->rcv_adv += tp->rcv_wnd; tp->last_ack_sent = tp->rcv_nxt; - /* - * If we were unable to send all rx credits via opt0, save the remainder - * in rx_credits so that they can be handed over with the next credit - * update. 
- */ - SOCKBUF_LOCK(&so->so_rcv); - bufsize = select_rcv_wnd(so); - SOCKBUF_UNLOCK(&so->so_rcv); - toep->rx_credits = bufsize - tp->rcv_wnd; - tp->iss = iss; tcp_sendseqinit(tp); tp->snd_una = iss + 1; @@ -483,37 +473,29 @@ t4_rcvd_locked(struct toedev *tod, struct tcpcb *tp) struct socket *so = inp->inp_socket; struct sockbuf *sb = &so->so_rcv; struct toepcb *toep = tp->t_toe; - int credits; + int rx_credits; INP_WLOCK_ASSERT(inp); - SOCKBUF_LOCK_ASSERT(sb); - KASSERT(toep->sb_cc >= sbused(sb), - ("%s: sb %p has more data (%d) than last time (%d).", - __func__, sb, sbused(sb), toep->sb_cc)); - credits = toep->sb_cc - sbused(sb); - toep->sb_cc = sbused(sb); + rx_credits = sbspace(sb) > tp->rcv_wnd ? sbspace(sb) - tp->rcv_wnd : 0; if (toep->ulp_mode == ULP_MODE_TLS) { - if (toep->tls.rcv_over >= credits) { - toep->tls.rcv_over -= credits; - credits = 0; + if (toep->tls.rcv_over >= rx_credits) { + toep->tls.rcv_over -= rx_credits; + rx_credits = 0; } else { - credits -= toep->tls.rcv_over; + rx_credits -= toep->tls.rcv_over; toep->tls.rcv_over = 0; } } - toep->rx_credits += credits; - if (toep->rx_credits > 0 && - (tp->rcv_wnd <= 32 * 1024 || toep->rx_credits >= 64 * 1024 || - (toep->rx_credits >= 16 * 1024 && tp->rcv_wnd <= 128 * 1024) || - toep->sb_cc + tp->rcv_wnd < sb->sb_lowat)) { - - credits = send_rx_credits(sc, toep, toep->rx_credits); - toep->rx_credits -= credits; - tp->rcv_wnd += credits; - tp->rcv_adv += credits; + if (rx_credits > 0 && + (tp->rcv_wnd <= 32 * 1024 || rx_credits >= 64 * 1024 || + (rx_credits >= 16 * 1024 && tp->rcv_wnd <= 128 * 1024) || + sbused(sb) + tp->rcv_wnd < sb->sb_lowat)) { + rx_credits = send_rx_credits(sc, toep, rx_credits); + tp->rcv_wnd += rx_credits; + tp->rcv_adv += rx_credits; } else if (toep->flags & TPF_FORCE_CREDITS) send_rx_modulate(sc, toep); } @@ -1551,7 +1533,7 @@ do_rx_data(struct sge_iq *iq, const struct rss_header *rss, struct mbuf *m) struct socket *so; struct sockbuf *sb; struct epoch_tracker et; - int len; + int 
len, rx_credits; uint32_t ddp_placed = 0; if (__predict_false(toep->flags & TPF_SYNQE)) { @@ -1636,8 +1618,6 @@ do_rx_data(struct sge_iq *iq, const struct rss_header *rss, struct mbuf *m) if (!sbreserve_locked(sb, newsize, so, NULL)) sb->sb_flags &= ~SB_AUTOSIZE; - else - toep->rx_credits += newsize - hiwat; } if (toep->ulp_mode == ULP_MODE_TCPDDP) { @@ -1675,19 +1655,12 @@ do_rx_data(struct sge_iq *iq, const struct rss_header *rss, struct mbuf *m) } } - KASSERT(toep->sb_cc >= sbused(sb), - ("%s: sb %p has more data (%d) than last time (%d).", - __func__, sb, sbused(sb), toep->sb_cc)); - toep->rx_credits += toep->sb_cc - sbused(sb); sbappendstream_locked(sb, m, 0); - toep->sb_cc = sbused(sb); - if (toep->rx_credits > 0 && toep->sb_cc + tp->rcv_wnd < sb->sb_lowat) { - int credits; - - credits = send_rx_credits(sc, toep, toep->rx_credits); - toep->rx_credits -= credits; - tp->rcv_wnd += credits; - tp->rcv_adv += credits; + rx_credits = sbspace(sb) > tp->rcv_wnd ? sbspace(sb) - tp->rcv_wnd : 0; + if (rx_credits > 0 && sbused(sb) + tp->rcv_wnd < sb->sb_lowat) { + rx_credits = send_rx_credits(sc, toep, rx_credits); + tp->rcv_wnd += rx_credits; + tp->rcv_adv += rx_credits; } if (toep->ulp_mode == ULP_MODE_TCPDDP && toep->ddp.waiting_count > 0 && diff --git a/sys/dev/cxgbe/tom/t4_ddp.c b/sys/dev/cxgbe/tom/t4_ddp.c index 9660ae83c806..47062e634bbf 100644 --- a/sys/dev/cxgbe/tom/t4_ddp.c +++ b/sys/dev/cxgbe/tom/t4_ddp.c @@ -303,9 +303,6 @@ insert_ddp_data(struct toepcb *toep, uint32_t n) #ifndef USE_DDP_RX_FLOW_CONTROL KASSERT(tp->rcv_wnd >= n, ("%s: negative window size", __func__)); tp->rcv_wnd -= n; -#endif -#ifndef USE_DDP_RX_FLOW_CONTROL - toep->rx_credits += n; #endif CTR2(KTR_CXGBE, "%s: placed %u bytes before falling out of DDP", __func__, n); @@ -556,16 +553,10 @@ handle_ddp_data(struct toepcb *toep, __be32 ddp_report, __be32 rcv_nxt, int len) if (!sbreserve_locked(sb, newsize, so, NULL)) sb->sb_flags &= ~SB_AUTOSIZE; - else - toep->rx_credits += newsize - hiwat; } 
SOCKBUF_UNLOCK(sb); CURVNET_RESTORE(); -#ifndef USE_DDP_RX_FLOW_CONTROL - toep->rx_credits += len; -#endif - job->msgrcv = 1; if (db->cancel_pending) { /* @@ -714,12 +705,9 @@ handle_ddp_close(struct toepcb *toep, struct tcpcb *tp, __be32 rcv_nxt) INP_WLOCK_ASSERT(toep->inp); DDP_ASSERT_LOCKED(toep); - len = be32toh(rcv_nxt) - tp->rcv_nxt; + len = be32toh(rcv_nxt) - tp->rcv_nxt; tp->rcv_nxt += len; -#ifndef USE_DDP_RX_FLOW_CONTROL - toep->rx_credits += len; -#endif while (toep->ddp.active_count > 0) { MPASS(toep->ddp.active_id != -1); diff --git a/sys/dev/cxgbe/tom/t4_listen.c b/sys/dev/cxgbe/tom/t4_listen.c index 314d82dffff0..1e47d0123bb9 100644 --- a/sys/dev/cxgbe/tom/t4_listen.c +++ b/sys/dev/cxgbe/tom/t4_listen.c @@ -1400,7 +1400,6 @@ do_pass_accept_req(struct sge_iq *iq, const struct rss_header *rss, mtu_idx = find_best_mtu_idx(sc, &inc, &settings); rscale = cpl->tcpopt.wsf && V_tcp_do_rfc1323 ? select_rcv_wscale() : 0; - /* opt0 rcv_bufsiz initially, assumes its normal meaning later */ wnd = max(so->sol_sbrcv_hiwat, MIN_RCV_WND); wnd = min(wnd, MAX_RCV_WND); rx_credits = min(wnd >> 10, M_RCV_BUFSIZ); @@ -1552,8 +1551,7 @@ do_pass_establish(struct sge_iq *iq, const struct rss_header *rss, toep->l2te = &sc->l2t->l2tab[synqe->l2e_idx]; toep->vnet = lctx->vnet; set_ulp_mode(toep, synqe->ulp_mode); - /* opt0 rcv_bufsiz initially, assumes its normal meaning later */ - toep->rx_credits = synqe->rcv_bufsize; + toep->opt0_rcv_bufsize = synqe->rcv_bufsize; MPASS(be32toh(cpl->snd_isn) - 1 == synqe->iss); MPASS(be32toh(cpl->rcv_isn) - 1 == synqe->irs); diff --git a/sys/dev/cxgbe/tom/t4_tls.c b/sys/dev/cxgbe/tom/t4_tls.c index 5eca394aa880..d030f68e3bc8 100644 --- a/sys/dev/cxgbe/tom/t4_tls.c +++ b/sys/dev/cxgbe/tom/t4_tls.c @@ -1458,7 +1458,7 @@ do_rx_tls_cmp(struct sge_iq *iq, const struct rss_header *rss, struct mbuf *m) struct socket *so; struct sockbuf *sb; struct mbuf *tls_data; - int len, pdu_length, pdu_overhead, sb_length; + int len, pdu_length, rx_credits; 
KASSERT(toep->tid == tid, ("%s: toep tid/atid mismatch", __func__)); KASSERT(!(toep->flags & TPF_SYNQE), @@ -1562,24 +1562,10 @@ do_rx_tls_cmp(struct sge_iq *iq, const struct rss_header *rss, struct mbuf *m) } /* - * Not all of the bytes on the wire are included in the socket - * buffer (e.g. the MAC of the TLS record). However, those - * bytes are included in the TCP sequence space. To handle - * this, compute the delta for this TLS record in - * 'pdu_overhead' and treat those bytes as having already been - * "read" by the application for the purposes of expanding the - * window. The meat of the TLS record passed to the - * application ('sb_length') will still not be counted as - * "read" until userland actually reads the bytes. - * - * XXX: Some of the calculations below are probably still not - * really correct. + * Not all of the bytes on the wire are included in the socket buffer + * (e.g. the MAC of the TLS record). However, those bytes are included + * in the TCP sequence space. */ - sb_length = m->m_pkthdr.len; - pdu_overhead = pdu_length - sb_length; - toep->rx_credits += pdu_overhead; - tp->rcv_wnd += pdu_overhead; - tp->rcv_adv += pdu_overhead; /* receive buffer autosize */ MPASS(toep->vnet == so->so_vnet); @@ -1587,34 +1573,25 @@ do_rx_tls_cmp(struct sge_iq *iq, const struct rss_header *rss, struct mbuf *m) if (sb->sb_flags & SB_AUTOSIZE && V_tcp_do_autorcvbuf && sb->sb_hiwat < V_tcp_autorcvbuf_max && - sb_length > (sbspace(sb) / 8 * 7)) { + m->m_pkthdr.len > (sbspace(sb) / 8 * 7)) { unsigned int hiwat = sb->sb_hiwat; unsigned int newsize = min(hiwat + sc->tt.autorcvbuf_inc, V_tcp_autorcvbuf_max); if (!sbreserve_locked(sb, newsize, so, NULL)) sb->sb_flags &= ~SB_AUTOSIZE; - else - toep->rx_credits += newsize - hiwat; } - KASSERT(toep->sb_cc >= sbused(sb), - ("%s: sb %p has more data (%d) than last time (%d).", - __func__, sb, sbused(sb), toep->sb_cc)); - toep->rx_credits += toep->sb_cc - sbused(sb); sbappendstream_locked(sb, m, 0); - toep->sb_cc = 
sbused(sb); + rx_credits = sbspace(sb) > tp->rcv_wnd ? sbspace(sb) - tp->rcv_wnd : 0; #ifdef VERBOSE_TRACES - CTR5(KTR_CXGBE, "%s: tid %u PDU overhead %d rx_credits %u rcv_wnd %u", - __func__, tid, pdu_overhead, toep->rx_credits, tp->rcv_wnd); + CTR4(KTR_CXGBE, "%s: tid %u rx_credits %u rcv_wnd %u", + __func__, tid, rx_credits, tp->rcv_wnd); #endif - if (toep->rx_credits > 0 && toep->sb_cc + tp->rcv_wnd < sb->sb_lowat) { - int credits; - - credits = send_rx_credits(sc, toep, toep->rx_credits); - toep->rx_credits -= credits; - tp->rcv_wnd += credits; - tp->rcv_adv += credits; + if (rx_credits > 0 && sbused(sb) + tp->rcv_wnd < sb->sb_lowat) { + rx_credits = send_rx_credits(sc, toep, rx_credits); + tp->rcv_wnd += rx_credits; + tp->rcv_adv += rx_credits; } sorwakeup_locked(so); diff --git a/sys/dev/cxgbe/tom/t4_tom.h b/sys/dev/cxgbe/tom/t4_tom.h index f32091755b95..db9e50cd39e1 100644 --- a/sys/dev/cxgbe/tom/t4_tom.h +++ b/sys/dev/cxgbe/tom/t4_tom.h @@ -181,9 +181,7 @@ struct toepcb { u_int tx_nocompl; /* tx WR credits since last compl request */ u_int plen_nocompl; /* payload since last compl request */ - /* rx credit handling */ - u_int sb_cc; /* last noted value of so_rcv->sb_cc */ - int rx_credits; /* rx credits (in bytes) to be returned to hw */ + int opt0_rcv_bufsize; /* XXX: save full opt0/opt2 for later? */ u_int ulp_mode; /* ULP mode */ void *ulpcb;