cxgbe/t4_tom: adjust the hardware receive window to match changes to the
receive sockbuf's high water mark.

Calculate rx credits on the spot instead of tracking sbused/sb_cc and
rx_credits in the toepcb.  The previous method worked when the high
water mark changed due to SB_AUTOSIZE but not when it was adjusted
directly (for example, by the soreserve in nfsrvd_addsock).

This fixes a connection hang while running iozone over an NFS-mounted
share where nfsd's TCP sockets are being handled by t4_tom.

MFC after:	3 days
Sponsored by:	Chelsio Communications
This commit is contained in:
Navdeep Parhar 2019-06-01 03:03:48 +00:00
parent 4420fc895f
commit ebb8639822
7 changed files with 37 additions and 105 deletions

View File

@ -398,7 +398,6 @@ do_rx_iscsi_ddp(struct sge_iq *iq, const struct rss_header *rss, struct mbuf *m)
tp->t_rcvtime = ticks;
/* update rx credits */
toep->rx_credits += pdu_len;
t4_rcvd(&toep->td->tod, tp); /* XXX: sc->tom_softc.tod */
so = inp->inp_socket;

View File

@ -385,8 +385,7 @@ t4_connect(struct toedev *tod, struct socket *so, struct rtentry *rt,
toep->vnet = so->so_vnet;
set_ulp_mode(toep, select_ulp_mode(so, sc, &settings));
SOCKBUF_LOCK(&so->so_rcv);
/* opt0 rcv_bufsiz initially, assumes its normal meaning later */
toep->rx_credits = min(select_rcv_wnd(so) >> 10, M_RCV_BUFSIZ);
toep->opt0_rcv_bufsize = min(select_rcv_wnd(so) >> 10, M_RCV_BUFSIZ);
SOCKBUF_UNLOCK(&so->so_rcv);
/*
@ -440,7 +439,7 @@ t4_connect(struct toedev *tod, struct socket *so, struct rtentry *rt,
cpl->peer_ip_hi = *(uint64_t *)&inp->in6p_faddr.s6_addr[0];
cpl->peer_ip_lo = *(uint64_t *)&inp->in6p_faddr.s6_addr[8];
cpl->opt0 = calc_opt0(so, vi, toep->l2te, mtu_idx, rscale,
toep->rx_credits, toep->ulp_mode, &settings);
toep->opt0_rcv_bufsize, toep->ulp_mode, &settings);
cpl->opt2 = calc_opt2a(so, toep, &settings);
} else {
struct cpl_act_open_req *cpl = wrtod(wr);
@ -469,7 +468,7 @@ t4_connect(struct toedev *tod, struct socket *so, struct rtentry *rt,
inp_4tuple_get(inp, &cpl->local_ip, &cpl->local_port,
&cpl->peer_ip, &cpl->peer_port);
cpl->opt0 = calc_opt0(so, vi, toep->l2te, mtu_idx, rscale,
toep->rx_credits, toep->ulp_mode, &settings);
toep->opt0_rcv_bufsize, toep->ulp_mode, &settings);
cpl->opt2 = calc_opt2a(so, toep, &settings);
}

View File

@ -399,20 +399,10 @@ make_established(struct toepcb *toep, uint32_t iss, uint32_t irs, uint16_t opt)
tp->irs = irs;
tcp_rcvseqinit(tp);
tp->rcv_wnd = toep->rx_credits << 10;
tp->rcv_wnd = toep->opt0_rcv_bufsize << 10;
tp->rcv_adv += tp->rcv_wnd;
tp->last_ack_sent = tp->rcv_nxt;
/*
* If we were unable to send all rx credits via opt0, save the remainder
* in rx_credits so that they can be handed over with the next credit
* update.
*/
SOCKBUF_LOCK(&so->so_rcv);
bufsize = select_rcv_wnd(so);
SOCKBUF_UNLOCK(&so->so_rcv);
toep->rx_credits = bufsize - tp->rcv_wnd;
tp->iss = iss;
tcp_sendseqinit(tp);
tp->snd_una = iss + 1;
@ -483,37 +473,29 @@ t4_rcvd_locked(struct toedev *tod, struct tcpcb *tp)
struct socket *so = inp->inp_socket;
struct sockbuf *sb = &so->so_rcv;
struct toepcb *toep = tp->t_toe;
int credits;
int rx_credits;
INP_WLOCK_ASSERT(inp);
SOCKBUF_LOCK_ASSERT(sb);
KASSERT(toep->sb_cc >= sbused(sb),
("%s: sb %p has more data (%d) than last time (%d).",
__func__, sb, sbused(sb), toep->sb_cc));
credits = toep->sb_cc - sbused(sb);
toep->sb_cc = sbused(sb);
rx_credits = sbspace(sb) > tp->rcv_wnd ? sbspace(sb) - tp->rcv_wnd : 0;
if (toep->ulp_mode == ULP_MODE_TLS) {
if (toep->tls.rcv_over >= credits) {
toep->tls.rcv_over -= credits;
credits = 0;
if (toep->tls.rcv_over >= rx_credits) {
toep->tls.rcv_over -= rx_credits;
rx_credits = 0;
} else {
credits -= toep->tls.rcv_over;
rx_credits -= toep->tls.rcv_over;
toep->tls.rcv_over = 0;
}
}
toep->rx_credits += credits;
if (toep->rx_credits > 0 &&
(tp->rcv_wnd <= 32 * 1024 || toep->rx_credits >= 64 * 1024 ||
(toep->rx_credits >= 16 * 1024 && tp->rcv_wnd <= 128 * 1024) ||
toep->sb_cc + tp->rcv_wnd < sb->sb_lowat)) {
credits = send_rx_credits(sc, toep, toep->rx_credits);
toep->rx_credits -= credits;
tp->rcv_wnd += credits;
tp->rcv_adv += credits;
if (rx_credits > 0 &&
(tp->rcv_wnd <= 32 * 1024 || rx_credits >= 64 * 1024 ||
(rx_credits >= 16 * 1024 && tp->rcv_wnd <= 128 * 1024) ||
sbused(sb) + tp->rcv_wnd < sb->sb_lowat)) {
rx_credits = send_rx_credits(sc, toep, rx_credits);
tp->rcv_wnd += rx_credits;
tp->rcv_adv += rx_credits;
} else if (toep->flags & TPF_FORCE_CREDITS)
send_rx_modulate(sc, toep);
}
@ -1551,7 +1533,7 @@ do_rx_data(struct sge_iq *iq, const struct rss_header *rss, struct mbuf *m)
struct socket *so;
struct sockbuf *sb;
struct epoch_tracker et;
int len;
int len, rx_credits;
uint32_t ddp_placed = 0;
if (__predict_false(toep->flags & TPF_SYNQE)) {
@ -1636,8 +1618,6 @@ do_rx_data(struct sge_iq *iq, const struct rss_header *rss, struct mbuf *m)
if (!sbreserve_locked(sb, newsize, so, NULL))
sb->sb_flags &= ~SB_AUTOSIZE;
else
toep->rx_credits += newsize - hiwat;
}
if (toep->ulp_mode == ULP_MODE_TCPDDP) {
@ -1675,19 +1655,12 @@ do_rx_data(struct sge_iq *iq, const struct rss_header *rss, struct mbuf *m)
}
}
KASSERT(toep->sb_cc >= sbused(sb),
("%s: sb %p has more data (%d) than last time (%d).",
__func__, sb, sbused(sb), toep->sb_cc));
toep->rx_credits += toep->sb_cc - sbused(sb);
sbappendstream_locked(sb, m, 0);
toep->sb_cc = sbused(sb);
if (toep->rx_credits > 0 && toep->sb_cc + tp->rcv_wnd < sb->sb_lowat) {
int credits;
credits = send_rx_credits(sc, toep, toep->rx_credits);
toep->rx_credits -= credits;
tp->rcv_wnd += credits;
tp->rcv_adv += credits;
rx_credits = sbspace(sb) > tp->rcv_wnd ? sbspace(sb) - tp->rcv_wnd : 0;
if (rx_credits > 0 && sbused(sb) + tp->rcv_wnd < sb->sb_lowat) {
rx_credits = send_rx_credits(sc, toep, rx_credits);
tp->rcv_wnd += rx_credits;
tp->rcv_adv += rx_credits;
}
if (toep->ulp_mode == ULP_MODE_TCPDDP && toep->ddp.waiting_count > 0 &&

View File

@ -303,9 +303,6 @@ insert_ddp_data(struct toepcb *toep, uint32_t n)
#ifndef USE_DDP_RX_FLOW_CONTROL
KASSERT(tp->rcv_wnd >= n, ("%s: negative window size", __func__));
tp->rcv_wnd -= n;
#endif
#ifndef USE_DDP_RX_FLOW_CONTROL
toep->rx_credits += n;
#endif
CTR2(KTR_CXGBE, "%s: placed %u bytes before falling out of DDP",
__func__, n);
@ -556,16 +553,10 @@ handle_ddp_data(struct toepcb *toep, __be32 ddp_report, __be32 rcv_nxt, int len)
if (!sbreserve_locked(sb, newsize, so, NULL))
sb->sb_flags &= ~SB_AUTOSIZE;
else
toep->rx_credits += newsize - hiwat;
}
SOCKBUF_UNLOCK(sb);
CURVNET_RESTORE();
#ifndef USE_DDP_RX_FLOW_CONTROL
toep->rx_credits += len;
#endif
job->msgrcv = 1;
if (db->cancel_pending) {
/*
@ -714,12 +705,9 @@ handle_ddp_close(struct toepcb *toep, struct tcpcb *tp, __be32 rcv_nxt)
INP_WLOCK_ASSERT(toep->inp);
DDP_ASSERT_LOCKED(toep);
len = be32toh(rcv_nxt) - tp->rcv_nxt;
len = be32toh(rcv_nxt) - tp->rcv_nxt;
tp->rcv_nxt += len;
#ifndef USE_DDP_RX_FLOW_CONTROL
toep->rx_credits += len;
#endif
while (toep->ddp.active_count > 0) {
MPASS(toep->ddp.active_id != -1);

View File

@ -1400,7 +1400,6 @@ do_pass_accept_req(struct sge_iq *iq, const struct rss_header *rss,
mtu_idx = find_best_mtu_idx(sc, &inc, &settings);
rscale = cpl->tcpopt.wsf && V_tcp_do_rfc1323 ? select_rcv_wscale() : 0;
/* opt0 rcv_bufsiz initially, assumes its normal meaning later */
wnd = max(so->sol_sbrcv_hiwat, MIN_RCV_WND);
wnd = min(wnd, MAX_RCV_WND);
rx_credits = min(wnd >> 10, M_RCV_BUFSIZ);
@ -1552,8 +1551,7 @@ do_pass_establish(struct sge_iq *iq, const struct rss_header *rss,
toep->l2te = &sc->l2t->l2tab[synqe->l2e_idx];
toep->vnet = lctx->vnet;
set_ulp_mode(toep, synqe->ulp_mode);
/* opt0 rcv_bufsiz initially, assumes its normal meaning later */
toep->rx_credits = synqe->rcv_bufsize;
toep->opt0_rcv_bufsize = synqe->rcv_bufsize;
MPASS(be32toh(cpl->snd_isn) - 1 == synqe->iss);
MPASS(be32toh(cpl->rcv_isn) - 1 == synqe->irs);

View File

@ -1458,7 +1458,7 @@ do_rx_tls_cmp(struct sge_iq *iq, const struct rss_header *rss, struct mbuf *m)
struct socket *so;
struct sockbuf *sb;
struct mbuf *tls_data;
int len, pdu_length, pdu_overhead, sb_length;
int len, pdu_length, rx_credits;
KASSERT(toep->tid == tid, ("%s: toep tid/atid mismatch", __func__));
KASSERT(!(toep->flags & TPF_SYNQE),
@ -1562,24 +1562,10 @@ do_rx_tls_cmp(struct sge_iq *iq, const struct rss_header *rss, struct mbuf *m)
}
/*
* Not all of the bytes on the wire are included in the socket
* buffer (e.g. the MAC of the TLS record). However, those
* bytes are included in the TCP sequence space. To handle
* this, compute the delta for this TLS record in
* 'pdu_overhead' and treat those bytes as having already been
* "read" by the application for the purposes of expanding the
* window. The meat of the TLS record passed to the
* application ('sb_length') will still not be counted as
* "read" until userland actually reads the bytes.
*
* XXX: Some of the calculations below are probably still not
* really correct.
* Not all of the bytes on the wire are included in the socket buffer
* (e.g. the MAC of the TLS record). However, those bytes are included
* in the TCP sequence space.
*/
sb_length = m->m_pkthdr.len;
pdu_overhead = pdu_length - sb_length;
toep->rx_credits += pdu_overhead;
tp->rcv_wnd += pdu_overhead;
tp->rcv_adv += pdu_overhead;
/* receive buffer autosize */
MPASS(toep->vnet == so->so_vnet);
@ -1587,34 +1573,25 @@ do_rx_tls_cmp(struct sge_iq *iq, const struct rss_header *rss, struct mbuf *m)
if (sb->sb_flags & SB_AUTOSIZE &&
V_tcp_do_autorcvbuf &&
sb->sb_hiwat < V_tcp_autorcvbuf_max &&
sb_length > (sbspace(sb) / 8 * 7)) {
m->m_pkthdr.len > (sbspace(sb) / 8 * 7)) {
unsigned int hiwat = sb->sb_hiwat;
unsigned int newsize = min(hiwat + sc->tt.autorcvbuf_inc,
V_tcp_autorcvbuf_max);
if (!sbreserve_locked(sb, newsize, so, NULL))
sb->sb_flags &= ~SB_AUTOSIZE;
else
toep->rx_credits += newsize - hiwat;
}
KASSERT(toep->sb_cc >= sbused(sb),
("%s: sb %p has more data (%d) than last time (%d).",
__func__, sb, sbused(sb), toep->sb_cc));
toep->rx_credits += toep->sb_cc - sbused(sb);
sbappendstream_locked(sb, m, 0);
toep->sb_cc = sbused(sb);
rx_credits = sbspace(sb) > tp->rcv_wnd ? sbspace(sb) - tp->rcv_wnd : 0;
#ifdef VERBOSE_TRACES
CTR5(KTR_CXGBE, "%s: tid %u PDU overhead %d rx_credits %u rcv_wnd %u",
__func__, tid, pdu_overhead, toep->rx_credits, tp->rcv_wnd);
__func__, tid, pdu_overhead, rx_credits, tp->rcv_wnd);
#endif
if (toep->rx_credits > 0 && toep->sb_cc + tp->rcv_wnd < sb->sb_lowat) {
int credits;
credits = send_rx_credits(sc, toep, toep->rx_credits);
toep->rx_credits -= credits;
tp->rcv_wnd += credits;
tp->rcv_adv += credits;
if (rx_credits > 0 && sbused(sb) + tp->rcv_wnd < sb->sb_lowat) {
rx_credits = send_rx_credits(sc, toep, rx_credits);
tp->rcv_wnd += rx_credits;
tp->rcv_adv += rx_credits;
}
sorwakeup_locked(so);

View File

@ -181,9 +181,7 @@ struct toepcb {
u_int tx_nocompl; /* tx WR credits since last compl request */
u_int plen_nocompl; /* payload since last compl request */
/* rx credit handling */
u_int sb_cc; /* last noted value of so_rcv->sb_cc */
int rx_credits; /* rx credits (in bytes) to be returned to hw */
int opt0_rcv_bufsize; /* XXX: save full opt0/opt2 for later? */
u_int ulp_mode; /* ULP mode */
void *ulpcb;