cxgbe/t4_tom: adjust the hardware receive window to match changes to the receive sockbuf's high water mark.

Calculate rx credits on the spot instead of tracking sbused/sb_cc and rx_credits in the toepcb. The previous method worked when the high water mark changed due to SB_AUTOSIZE but not when it was adjusted directly (for example, by the soreserve in nfsrvd_addsock).

This fixes a connection hang while running iozone over an NFS mounted share where nfsd's TCP sockets are being handled by t4_tom.

MFC after: 3 days
Sponsored by: Chelsio Communications
This commit is contained in:
parent
c50c113c55
commit
5a334970e3
@ -398,7 +398,6 @@ do_rx_iscsi_ddp(struct sge_iq *iq, const struct rss_header *rss, struct mbuf *m)
|
||||
tp->t_rcvtime = ticks;
|
||||
|
||||
/* update rx credits */
|
||||
toep->rx_credits += pdu_len;
|
||||
t4_rcvd(&toep->td->tod, tp); /* XXX: sc->tom_softc.tod */
|
||||
|
||||
so = inp->inp_socket;
|
||||
|
@ -385,8 +385,7 @@ t4_connect(struct toedev *tod, struct socket *so, struct rtentry *rt,
|
||||
toep->vnet = so->so_vnet;
|
||||
set_ulp_mode(toep, select_ulp_mode(so, sc, &settings));
|
||||
SOCKBUF_LOCK(&so->so_rcv);
|
||||
/* opt0 rcv_bufsiz initially, assumes its normal meaning later */
|
||||
toep->rx_credits = min(select_rcv_wnd(so) >> 10, M_RCV_BUFSIZ);
|
||||
toep->opt0_rcv_bufsize = min(select_rcv_wnd(so) >> 10, M_RCV_BUFSIZ);
|
||||
SOCKBUF_UNLOCK(&so->so_rcv);
|
||||
|
||||
/*
|
||||
@ -440,7 +439,7 @@ t4_connect(struct toedev *tod, struct socket *so, struct rtentry *rt,
|
||||
cpl->peer_ip_hi = *(uint64_t *)&inp->in6p_faddr.s6_addr[0];
|
||||
cpl->peer_ip_lo = *(uint64_t *)&inp->in6p_faddr.s6_addr[8];
|
||||
cpl->opt0 = calc_opt0(so, vi, toep->l2te, mtu_idx, rscale,
|
||||
toep->rx_credits, toep->ulp_mode, &settings);
|
||||
toep->opt0_rcv_bufsize, toep->ulp_mode, &settings);
|
||||
cpl->opt2 = calc_opt2a(so, toep, &settings);
|
||||
} else {
|
||||
struct cpl_act_open_req *cpl = wrtod(wr);
|
||||
@ -469,7 +468,7 @@ t4_connect(struct toedev *tod, struct socket *so, struct rtentry *rt,
|
||||
inp_4tuple_get(inp, &cpl->local_ip, &cpl->local_port,
|
||||
&cpl->peer_ip, &cpl->peer_port);
|
||||
cpl->opt0 = calc_opt0(so, vi, toep->l2te, mtu_idx, rscale,
|
||||
toep->rx_credits, toep->ulp_mode, &settings);
|
||||
toep->opt0_rcv_bufsize, toep->ulp_mode, &settings);
|
||||
cpl->opt2 = calc_opt2a(so, toep, &settings);
|
||||
}
|
||||
|
||||
|
@ -399,20 +399,10 @@ make_established(struct toepcb *toep, uint32_t iss, uint32_t irs, uint16_t opt)
|
||||
|
||||
tp->irs = irs;
|
||||
tcp_rcvseqinit(tp);
|
||||
tp->rcv_wnd = toep->rx_credits << 10;
|
||||
tp->rcv_wnd = toep->opt0_rcv_bufsize << 10;
|
||||
tp->rcv_adv += tp->rcv_wnd;
|
||||
tp->last_ack_sent = tp->rcv_nxt;
|
||||
|
||||
/*
|
||||
* If we were unable to send all rx credits via opt0, save the remainder
|
||||
* in rx_credits so that they can be handed over with the next credit
|
||||
* update.
|
||||
*/
|
||||
SOCKBUF_LOCK(&so->so_rcv);
|
||||
bufsize = select_rcv_wnd(so);
|
||||
SOCKBUF_UNLOCK(&so->so_rcv);
|
||||
toep->rx_credits = bufsize - tp->rcv_wnd;
|
||||
|
||||
tp->iss = iss;
|
||||
tcp_sendseqinit(tp);
|
||||
tp->snd_una = iss + 1;
|
||||
@ -483,37 +473,29 @@ t4_rcvd_locked(struct toedev *tod, struct tcpcb *tp)
|
||||
struct socket *so = inp->inp_socket;
|
||||
struct sockbuf *sb = &so->so_rcv;
|
||||
struct toepcb *toep = tp->t_toe;
|
||||
int credits;
|
||||
int rx_credits;
|
||||
|
||||
INP_WLOCK_ASSERT(inp);
|
||||
|
||||
SOCKBUF_LOCK_ASSERT(sb);
|
||||
KASSERT(toep->sb_cc >= sbused(sb),
|
||||
("%s: sb %p has more data (%d) than last time (%d).",
|
||||
__func__, sb, sbused(sb), toep->sb_cc));
|
||||
|
||||
credits = toep->sb_cc - sbused(sb);
|
||||
toep->sb_cc = sbused(sb);
|
||||
rx_credits = sbspace(sb) > tp->rcv_wnd ? sbspace(sb) - tp->rcv_wnd : 0;
|
||||
if (toep->ulp_mode == ULP_MODE_TLS) {
|
||||
if (toep->tls.rcv_over >= credits) {
|
||||
toep->tls.rcv_over -= credits;
|
||||
credits = 0;
|
||||
if (toep->tls.rcv_over >= rx_credits) {
|
||||
toep->tls.rcv_over -= rx_credits;
|
||||
rx_credits = 0;
|
||||
} else {
|
||||
credits -= toep->tls.rcv_over;
|
||||
rx_credits -= toep->tls.rcv_over;
|
||||
toep->tls.rcv_over = 0;
|
||||
}
|
||||
}
|
||||
toep->rx_credits += credits;
|
||||
|
||||
if (toep->rx_credits > 0 &&
|
||||
(tp->rcv_wnd <= 32 * 1024 || toep->rx_credits >= 64 * 1024 ||
|
||||
(toep->rx_credits >= 16 * 1024 && tp->rcv_wnd <= 128 * 1024) ||
|
||||
toep->sb_cc + tp->rcv_wnd < sb->sb_lowat)) {
|
||||
|
||||
credits = send_rx_credits(sc, toep, toep->rx_credits);
|
||||
toep->rx_credits -= credits;
|
||||
tp->rcv_wnd += credits;
|
||||
tp->rcv_adv += credits;
|
||||
if (rx_credits > 0 &&
|
||||
(tp->rcv_wnd <= 32 * 1024 || rx_credits >= 64 * 1024 ||
|
||||
(rx_credits >= 16 * 1024 && tp->rcv_wnd <= 128 * 1024) ||
|
||||
sbused(sb) + tp->rcv_wnd < sb->sb_lowat)) {
|
||||
rx_credits = send_rx_credits(sc, toep, rx_credits);
|
||||
tp->rcv_wnd += rx_credits;
|
||||
tp->rcv_adv += rx_credits;
|
||||
} else if (toep->flags & TPF_FORCE_CREDITS)
|
||||
send_rx_modulate(sc, toep);
|
||||
}
|
||||
@ -1551,7 +1533,7 @@ do_rx_data(struct sge_iq *iq, const struct rss_header *rss, struct mbuf *m)
|
||||
struct socket *so;
|
||||
struct sockbuf *sb;
|
||||
struct epoch_tracker et;
|
||||
int len;
|
||||
int len, rx_credits;
|
||||
uint32_t ddp_placed = 0;
|
||||
|
||||
if (__predict_false(toep->flags & TPF_SYNQE)) {
|
||||
@ -1636,8 +1618,6 @@ do_rx_data(struct sge_iq *iq, const struct rss_header *rss, struct mbuf *m)
|
||||
|
||||
if (!sbreserve_locked(sb, newsize, so, NULL))
|
||||
sb->sb_flags &= ~SB_AUTOSIZE;
|
||||
else
|
||||
toep->rx_credits += newsize - hiwat;
|
||||
}
|
||||
|
||||
if (toep->ulp_mode == ULP_MODE_TCPDDP) {
|
||||
@ -1675,19 +1655,12 @@ do_rx_data(struct sge_iq *iq, const struct rss_header *rss, struct mbuf *m)
|
||||
}
|
||||
}
|
||||
|
||||
KASSERT(toep->sb_cc >= sbused(sb),
|
||||
("%s: sb %p has more data (%d) than last time (%d).",
|
||||
__func__, sb, sbused(sb), toep->sb_cc));
|
||||
toep->rx_credits += toep->sb_cc - sbused(sb);
|
||||
sbappendstream_locked(sb, m, 0);
|
||||
toep->sb_cc = sbused(sb);
|
||||
if (toep->rx_credits > 0 && toep->sb_cc + tp->rcv_wnd < sb->sb_lowat) {
|
||||
int credits;
|
||||
|
||||
credits = send_rx_credits(sc, toep, toep->rx_credits);
|
||||
toep->rx_credits -= credits;
|
||||
tp->rcv_wnd += credits;
|
||||
tp->rcv_adv += credits;
|
||||
rx_credits = sbspace(sb) > tp->rcv_wnd ? sbspace(sb) - tp->rcv_wnd : 0;
|
||||
if (rx_credits > 0 && sbused(sb) + tp->rcv_wnd < sb->sb_lowat) {
|
||||
rx_credits = send_rx_credits(sc, toep, rx_credits);
|
||||
tp->rcv_wnd += rx_credits;
|
||||
tp->rcv_adv += rx_credits;
|
||||
}
|
||||
|
||||
if (toep->ulp_mode == ULP_MODE_TCPDDP && toep->ddp.waiting_count > 0 &&
|
||||
|
@ -303,9 +303,6 @@ insert_ddp_data(struct toepcb *toep, uint32_t n)
|
||||
#ifndef USE_DDP_RX_FLOW_CONTROL
|
||||
KASSERT(tp->rcv_wnd >= n, ("%s: negative window size", __func__));
|
||||
tp->rcv_wnd -= n;
|
||||
#endif
|
||||
#ifndef USE_DDP_RX_FLOW_CONTROL
|
||||
toep->rx_credits += n;
|
||||
#endif
|
||||
CTR2(KTR_CXGBE, "%s: placed %u bytes before falling out of DDP",
|
||||
__func__, n);
|
||||
@ -556,16 +553,10 @@ handle_ddp_data(struct toepcb *toep, __be32 ddp_report, __be32 rcv_nxt, int len)
|
||||
|
||||
if (!sbreserve_locked(sb, newsize, so, NULL))
|
||||
sb->sb_flags &= ~SB_AUTOSIZE;
|
||||
else
|
||||
toep->rx_credits += newsize - hiwat;
|
||||
}
|
||||
SOCKBUF_UNLOCK(sb);
|
||||
CURVNET_RESTORE();
|
||||
|
||||
#ifndef USE_DDP_RX_FLOW_CONTROL
|
||||
toep->rx_credits += len;
|
||||
#endif
|
||||
|
||||
job->msgrcv = 1;
|
||||
if (db->cancel_pending) {
|
||||
/*
|
||||
@ -714,12 +705,9 @@ handle_ddp_close(struct toepcb *toep, struct tcpcb *tp, __be32 rcv_nxt)
|
||||
|
||||
INP_WLOCK_ASSERT(toep->inp);
|
||||
DDP_ASSERT_LOCKED(toep);
|
||||
len = be32toh(rcv_nxt) - tp->rcv_nxt;
|
||||
|
||||
len = be32toh(rcv_nxt) - tp->rcv_nxt;
|
||||
tp->rcv_nxt += len;
|
||||
#ifndef USE_DDP_RX_FLOW_CONTROL
|
||||
toep->rx_credits += len;
|
||||
#endif
|
||||
|
||||
while (toep->ddp.active_count > 0) {
|
||||
MPASS(toep->ddp.active_id != -1);
|
||||
|
@ -1400,7 +1400,6 @@ do_pass_accept_req(struct sge_iq *iq, const struct rss_header *rss,
|
||||
|
||||
mtu_idx = find_best_mtu_idx(sc, &inc, &settings);
|
||||
rscale = cpl->tcpopt.wsf && V_tcp_do_rfc1323 ? select_rcv_wscale() : 0;
|
||||
/* opt0 rcv_bufsiz initially, assumes its normal meaning later */
|
||||
wnd = max(so->sol_sbrcv_hiwat, MIN_RCV_WND);
|
||||
wnd = min(wnd, MAX_RCV_WND);
|
||||
rx_credits = min(wnd >> 10, M_RCV_BUFSIZ);
|
||||
@ -1552,8 +1551,7 @@ do_pass_establish(struct sge_iq *iq, const struct rss_header *rss,
|
||||
toep->l2te = &sc->l2t->l2tab[synqe->l2e_idx];
|
||||
toep->vnet = lctx->vnet;
|
||||
set_ulp_mode(toep, synqe->ulp_mode);
|
||||
/* opt0 rcv_bufsiz initially, assumes its normal meaning later */
|
||||
toep->rx_credits = synqe->rcv_bufsize;
|
||||
toep->opt0_rcv_bufsize = synqe->rcv_bufsize;
|
||||
|
||||
MPASS(be32toh(cpl->snd_isn) - 1 == synqe->iss);
|
||||
MPASS(be32toh(cpl->rcv_isn) - 1 == synqe->irs);
|
||||
|
@ -1458,7 +1458,7 @@ do_rx_tls_cmp(struct sge_iq *iq, const struct rss_header *rss, struct mbuf *m)
|
||||
struct socket *so;
|
||||
struct sockbuf *sb;
|
||||
struct mbuf *tls_data;
|
||||
int len, pdu_length, pdu_overhead, sb_length;
|
||||
int len, pdu_length, rx_credits;
|
||||
|
||||
KASSERT(toep->tid == tid, ("%s: toep tid/atid mismatch", __func__));
|
||||
KASSERT(!(toep->flags & TPF_SYNQE),
|
||||
@ -1562,24 +1562,10 @@ do_rx_tls_cmp(struct sge_iq *iq, const struct rss_header *rss, struct mbuf *m)
|
||||
}
|
||||
|
||||
/*
|
||||
* Not all of the bytes on the wire are included in the socket
|
||||
* buffer (e.g. the MAC of the TLS record). However, those
|
||||
* bytes are included in the TCP sequence space. To handle
|
||||
* this, compute the delta for this TLS record in
|
||||
* 'pdu_overhead' and treat those bytes as having already been
|
||||
* "read" by the application for the purposes of expanding the
|
||||
* window. The meat of the TLS record passed to the
|
||||
* application ('sb_length') will still not be counted as
|
||||
* "read" until userland actually reads the bytes.
|
||||
*
|
||||
* XXX: Some of the calculations below are probably still not
|
||||
* really correct.
|
||||
* Not all of the bytes on the wire are included in the socket buffer
|
||||
* (e.g. the MAC of the TLS record). However, those bytes are included
|
||||
* in the TCP sequence space.
|
||||
*/
|
||||
sb_length = m->m_pkthdr.len;
|
||||
pdu_overhead = pdu_length - sb_length;
|
||||
toep->rx_credits += pdu_overhead;
|
||||
tp->rcv_wnd += pdu_overhead;
|
||||
tp->rcv_adv += pdu_overhead;
|
||||
|
||||
/* receive buffer autosize */
|
||||
MPASS(toep->vnet == so->so_vnet);
|
||||
@ -1587,34 +1573,25 @@ do_rx_tls_cmp(struct sge_iq *iq, const struct rss_header *rss, struct mbuf *m)
|
||||
if (sb->sb_flags & SB_AUTOSIZE &&
|
||||
V_tcp_do_autorcvbuf &&
|
||||
sb->sb_hiwat < V_tcp_autorcvbuf_max &&
|
||||
sb_length > (sbspace(sb) / 8 * 7)) {
|
||||
m->m_pkthdr.len > (sbspace(sb) / 8 * 7)) {
|
||||
unsigned int hiwat = sb->sb_hiwat;
|
||||
unsigned int newsize = min(hiwat + sc->tt.autorcvbuf_inc,
|
||||
V_tcp_autorcvbuf_max);
|
||||
|
||||
if (!sbreserve_locked(sb, newsize, so, NULL))
|
||||
sb->sb_flags &= ~SB_AUTOSIZE;
|
||||
else
|
||||
toep->rx_credits += newsize - hiwat;
|
||||
}
|
||||
|
||||
KASSERT(toep->sb_cc >= sbused(sb),
|
||||
("%s: sb %p has more data (%d) than last time (%d).",
|
||||
__func__, sb, sbused(sb), toep->sb_cc));
|
||||
toep->rx_credits += toep->sb_cc - sbused(sb);
|
||||
sbappendstream_locked(sb, m, 0);
|
||||
toep->sb_cc = sbused(sb);
|
||||
rx_credits = sbspace(sb) > tp->rcv_wnd ? sbspace(sb) - tp->rcv_wnd : 0;
|
||||
#ifdef VERBOSE_TRACES
|
||||
CTR5(KTR_CXGBE, "%s: tid %u PDU overhead %d rx_credits %u rcv_wnd %u",
|
||||
__func__, tid, pdu_overhead, toep->rx_credits, tp->rcv_wnd);
|
||||
__func__, tid, pdu_overhead, rx_credits, tp->rcv_wnd);
|
||||
#endif
|
||||
if (toep->rx_credits > 0 && toep->sb_cc + tp->rcv_wnd < sb->sb_lowat) {
|
||||
int credits;
|
||||
|
||||
credits = send_rx_credits(sc, toep, toep->rx_credits);
|
||||
toep->rx_credits -= credits;
|
||||
tp->rcv_wnd += credits;
|
||||
tp->rcv_adv += credits;
|
||||
if (rx_credits > 0 && sbused(sb) + tp->rcv_wnd < sb->sb_lowat) {
|
||||
rx_credits = send_rx_credits(sc, toep, rx_credits);
|
||||
tp->rcv_wnd += rx_credits;
|
||||
tp->rcv_adv += rx_credits;
|
||||
}
|
||||
|
||||
sorwakeup_locked(so);
|
||||
|
@ -181,9 +181,7 @@ struct toepcb {
|
||||
u_int tx_nocompl; /* tx WR credits since last compl request */
|
||||
u_int plen_nocompl; /* payload since last compl request */
|
||||
|
||||
/* rx credit handling */
|
||||
u_int sb_cc; /* last noted value of so_rcv->sb_cc */
|
||||
int rx_credits; /* rx credits (in bytes) to be returned to hw */
|
||||
int opt0_rcv_bufsize; /* XXX: save full opt0/opt2 for later? */
|
||||
|
||||
u_int ulp_mode; /* ULP mode */
|
||||
void *ulpcb;
|
||||
|
Loading…
Reference in New Issue
Block a user