From 61e02298cea3bee31604640e53da1567271dfa57 Mon Sep 17 00:00:00 2001 From: Navdeep Parhar Date: Mon, 22 Apr 2019 17:48:10 +0000 Subject: [PATCH] cxgbe/t4_tom: Add a "TCB history" feature that samples hardware state for a tid and maintains a running history of some interesting events. Service TCP_INFO queries from the history when the tid is being tracked there. --- sys/dev/cxgbe/common/t4_msg.h | 3 +- sys/dev/cxgbe/tom/t4_tom.c | 454 ++++++++++++++++++++++++++++++---- sys/dev/cxgbe/tom/t4_tom.h | 31 +++ 3 files changed, 432 insertions(+), 56 deletions(-) diff --git a/sys/dev/cxgbe/common/t4_msg.h b/sys/dev/cxgbe/common/t4_msg.h index f616ab91fbac..2a402228d691 100644 --- a/sys/dev/cxgbe/common/t4_msg.h +++ b/sys/dev/cxgbe/common/t4_msg.h @@ -923,7 +923,8 @@ struct cpl_get_tcb { WR_HDR; union opcode_tid ot; __be16 reply_ctrl; - __be16 cookie; + __u8 rsvd; + __u8 cookie; }; /* cpl_get_tcb.reply_ctrl fields */ diff --git a/sys/dev/cxgbe/tom/t4_tom.c b/sys/dev/cxgbe/tom/t4_tom.c index f4f0534d9190..cc055edd3130 100644 --- a/sys/dev/cxgbe/tom/t4_tom.c +++ b/sys/dev/cxgbe/tom/t4_tom.c @@ -386,28 +386,379 @@ t4_ctloutput(struct toedev *tod, struct tcpcb *tp, int dir, int name) } } -static inline int -get_tcb_bit(u_char *tcb, int bit) +static inline uint64_t +get_tcb_tflags(const uint64_t *tcb) { - int ix, shift; - ix = 127 - (bit >> 3); - shift = bit & 0x7; - - return ((tcb[ix] >> shift) & 1); + return ((be64toh(tcb[14]) << 32) | (be64toh(tcb[15]) >> 32)); } -static inline uint64_t -get_tcb_bits(u_char *tcb, int hi, int lo) +static inline uint32_t +get_tcb_field(const uint64_t *tcb, u_int word, uint32_t mask, u_int shift) { - uint64_t rc = 0; +#define LAST_WORD ((TCB_SIZE / 4) - 1) + uint64_t t1, t2; + int flit_idx; - while (hi >= lo) { - rc = (rc << 1) | get_tcb_bit(tcb, hi); - --hi; + MPASS(mask != 0); + MPASS(word <= LAST_WORD); + MPASS(shift < 32); + + flit_idx = (LAST_WORD - word) / 2; + if (word & 0x1) + shift += 32; + t1 = be64toh(tcb[flit_idx]) >> shift; + t2 = 0; + if (fls(mask) > 64 - shift) { + /* + * Will spill over into the next logical flit, which is the flit + * before this one. The flit_idx before this one must be valid. + */ + MPASS(flit_idx > 0); + t2 = be64toh(tcb[flit_idx - 1]) << (64 - shift); + } + return ((t2 | t1) & mask); +#undef LAST_WORD +} +#define GET_TCB_FIELD(tcb, F) \ + get_tcb_field(tcb, W_TCB_##F, M_TCB_##F, S_TCB_##F) + +/* + * Issues a CPL_GET_TCB to read the entire TCB for the tid. + */ +static int +send_get_tcb(struct adapter *sc, u_int tid) +{ + struct cpl_get_tcb *cpl; + struct wrq_cookie cookie; + + MPASS(tid < sc->tids.ntids); + + cpl = start_wrq_wr(&sc->sge.ctrlq[0], howmany(sizeof(*cpl), 16), + &cookie); + if (__predict_false(cpl == NULL)) + return (ENOMEM); + bzero(cpl, sizeof(*cpl)); + INIT_TP_WR(cpl, tid); + OPCODE_TID(cpl) = htobe32(MK_OPCODE_TID(CPL_GET_TCB, tid)); + cpl->reply_ctrl = htobe16(V_REPLY_CHAN(0) | + V_QUEUENO(sc->sge.ofld_rxq[0].iq.cntxt_id)); + cpl->cookie = 0xff; + commit_wrq_wr(&sc->sge.ctrlq[0], cpl, &cookie); + + return (0); +} + +static struct tcb_histent * +alloc_tcb_histent(struct adapter *sc, u_int tid, int flags) +{ + struct tcb_histent *te; + + MPASS(flags == M_NOWAIT || flags == M_WAITOK); + + te = malloc(sizeof(*te), M_CXGBE, M_ZERO | flags); + if (te == NULL) + return (NULL); + mtx_init(&te->te_lock, "TCB entry", NULL, MTX_DEF); + callout_init_mtx(&te->te_callout, &te->te_lock, 0); + te->te_adapter = sc; + te->te_tid = tid; + + return (te); +} + +static void +free_tcb_histent(struct tcb_histent *te) +{ + + mtx_destroy(&te->te_lock); + free(te, M_CXGBE); +} + +/* + * Start tracking the tid in the TCB history. + */ +int +add_tid_to_history(struct adapter *sc, u_int tid) +{ + struct tcb_histent *te = NULL; + struct tom_data *td = sc->tom_softc; + int rc; + + MPASS(tid < sc->tids.ntids); + + if (td->tcb_history == NULL) + return (ENXIO); + + rw_wlock(&td->tcb_history_lock); + if (td->tcb_history[tid] != NULL) { + rc = EEXIST; + goto done; + } + te = alloc_tcb_histent(sc, tid, M_NOWAIT); + if (te == NULL) { + rc = ENOMEM; + goto done; + } + mtx_lock(&te->te_lock); + rc = send_get_tcb(sc, tid); + if (rc == 0) { + te->te_flags |= TE_RPL_PENDING; + td->tcb_history[tid] = te; + } else { + free(te, M_CXGBE); + } + mtx_unlock(&te->te_lock); +done: + rw_wunlock(&td->tcb_history_lock); + return (rc); +} + +static void +remove_tcb_histent(struct tcb_histent *te) +{ + struct adapter *sc = te->te_adapter; + struct tom_data *td = sc->tom_softc; + + rw_assert(&td->tcb_history_lock, RA_WLOCKED); + mtx_assert(&te->te_lock, MA_OWNED); + MPASS(td->tcb_history[te->te_tid] == te); + + td->tcb_history[te->te_tid] = NULL; + free_tcb_histent(te); + rw_wunlock(&td->tcb_history_lock); +} + +static inline struct tcb_histent * +lookup_tcb_histent(struct adapter *sc, u_int tid, bool addrem) +{ + struct tcb_histent *te; + struct tom_data *td = sc->tom_softc; + + MPASS(tid < sc->tids.ntids); + + if (addrem) + rw_wlock(&td->tcb_history_lock); + else + rw_rlock(&td->tcb_history_lock); + te = td->tcb_history[tid]; + if (te != NULL) { + mtx_lock(&te->te_lock); + return (te); /* with both locks held */ + } + if (addrem) + rw_wunlock(&td->tcb_history_lock); + else + rw_runlock(&td->tcb_history_lock); + + return (te); +} + +static inline void +release_tcb_histent(struct tcb_histent *te) +{ + struct adapter *sc = te->te_adapter; + struct tom_data *td = sc->tom_softc; + + mtx_assert(&te->te_lock, MA_OWNED); + mtx_unlock(&te->te_lock); + rw_assert(&td->tcb_history_lock, RA_RLOCKED); + rw_runlock(&td->tcb_history_lock); +} + +static void +request_tcb(void *arg) +{ + struct tcb_histent *te = arg; + + mtx_assert(&te->te_lock, MA_OWNED); + + /* Noone else is supposed to update the histent. */ + MPASS(!(te->te_flags & TE_RPL_PENDING)); + if (send_get_tcb(te->te_adapter, te->te_tid) == 0) + te->te_flags |= TE_RPL_PENDING; + else + callout_schedule(&te->te_callout, hz / 100); +} + +static void +update_tcb_histent(struct tcb_histent *te, const uint64_t *tcb) +{ + struct tom_data *td = te->te_adapter->tom_softc; + uint64_t tflags = get_tcb_tflags(tcb); + uint8_t sample = 0; + + if (GET_TCB_FIELD(tcb, SND_MAX_RAW) != GET_TCB_FIELD(tcb, SND_UNA_RAW)) { + if (GET_TCB_FIELD(tcb, T_RXTSHIFT) != 0) + sample |= TS_RTO; + if (GET_TCB_FIELD(tcb, T_DUPACKS) != 0) + sample |= TS_DUPACKS; + if (GET_TCB_FIELD(tcb, T_DUPACKS) >= td->dupack_threshold) + sample |= TS_FASTREXMT; } - return (rc); + if (GET_TCB_FIELD(tcb, SND_MAX_RAW) != 0) { + uint32_t snd_wnd; + + sample |= TS_SND_BACKLOGGED; /* for whatever reason. */ + + snd_wnd = GET_TCB_FIELD(tcb, RCV_ADV); + if (tflags & V_TF_RECV_SCALE(1)) + snd_wnd <<= GET_TCB_FIELD(tcb, RCV_SCALE); + if (GET_TCB_FIELD(tcb, SND_CWND) < snd_wnd) + sample |= TS_CWND_LIMITED; /* maybe due to CWND */ + } + + if (tflags & V_TF_CCTRL_ECN(1)) { + + /* + * CE marker on incoming IP hdr, echoing ECE back in the TCP + * hdr. Indicates congestion somewhere on the way from the peer + * to this node. + */ + if (tflags & V_TF_CCTRL_ECE(1)) + sample |= TS_ECN_ECE; + + /* + * ECE seen and CWR sent (or about to be sent). Might indicate + * congestion on the way to the peer. This node is reducing its + * congestion window in response. + */ + if (tflags & (V_TF_CCTRL_CWR(1) | V_TF_CCTRL_RFR(1))) + sample |= TS_ECN_CWR; + } + + te->te_sample[te->te_pidx] = sample; + if (++te->te_pidx == nitems(te->te_sample)) + te->te_pidx = 0; + memcpy(te->te_tcb, tcb, TCB_SIZE); + te->te_flags |= TE_ACTIVE; +} + +static int +do_get_tcb_rpl(struct sge_iq *iq, const struct rss_header *rss, struct mbuf *m) +{ + struct adapter *sc = iq->adapter; + const struct cpl_get_tcb_rpl *cpl = mtod(m, const void *); + const uint64_t *tcb = (const uint64_t *)(const void *)(cpl + 1); + struct tcb_histent *te; + const u_int tid = GET_TID(cpl); + bool remove; + + remove = GET_TCB_FIELD(tcb, T_STATE) == TCPS_CLOSED; + te = lookup_tcb_histent(sc, tid, remove); + if (te == NULL) { + /* Not in the history. Who issued the GET_TCB for this? */ + device_printf(sc->dev, "tcb %u: flags 0x%016jx, state %u, " + "srtt %u, sscale %u, rscale %u, cookie 0x%x\n", tid, + (uintmax_t)get_tcb_tflags(tcb), GET_TCB_FIELD(tcb, T_STATE), + GET_TCB_FIELD(tcb, T_SRTT), GET_TCB_FIELD(tcb, SND_SCALE), + GET_TCB_FIELD(tcb, RCV_SCALE), cpl->cookie); + goto done; + } + + MPASS(te->te_flags & TE_RPL_PENDING); + te->te_flags &= ~TE_RPL_PENDING; + if (remove) { + remove_tcb_histent(te); + } else { + update_tcb_histent(te, tcb); + callout_reset(&te->te_callout, hz / 10, request_tcb, te); + release_tcb_histent(te); + } +done: + m_freem(m); + return (0); +} + +static void +fill_tcp_info_from_tcb(struct adapter *sc, uint64_t *tcb, struct tcp_info *ti) +{ + uint32_t v; + + ti->tcpi_state = GET_TCB_FIELD(tcb, T_STATE); + + v = GET_TCB_FIELD(tcb, T_SRTT); + ti->tcpi_rtt = tcp_ticks_to_us(sc, v); + + v = GET_TCB_FIELD(tcb, T_RTTVAR); + ti->tcpi_rttvar = tcp_ticks_to_us(sc, v); + + ti->tcpi_snd_ssthresh = GET_TCB_FIELD(tcb, SND_SSTHRESH); + ti->tcpi_snd_cwnd = GET_TCB_FIELD(tcb, SND_CWND); + ti->tcpi_rcv_nxt = GET_TCB_FIELD(tcb, RCV_NXT); + + v = GET_TCB_FIELD(tcb, TX_MAX); + ti->tcpi_snd_nxt = v - GET_TCB_FIELD(tcb, SND_NXT_RAW); + + /* Receive window being advertised by us. */ + ti->tcpi_rcv_wscale = GET_TCB_FIELD(tcb, SND_SCALE); /* Yes, SND. */ + ti->tcpi_rcv_space = GET_TCB_FIELD(tcb, RCV_WND); + + /* Send window */ + ti->tcpi_snd_wscale = GET_TCB_FIELD(tcb, RCV_SCALE); /* Yes, RCV. */ + ti->tcpi_snd_wnd = GET_TCB_FIELD(tcb, RCV_ADV); + if (get_tcb_tflags(tcb) & V_TF_RECV_SCALE(1)) + ti->tcpi_snd_wnd <<= ti->tcpi_snd_wscale; + else + ti->tcpi_snd_wscale = 0; + +} + +static void +fill_tcp_info_from_history(struct adapter *sc, struct tcb_histent *te, + struct tcp_info *ti) +{ + + fill_tcp_info_from_tcb(sc, te->te_tcb, ti); +} + +/* + * Reads the TCB for the given tid using a memory window and copies it to 'buf' + * in the same format as CPL_GET_TCB_RPL. + */ +static void +read_tcb_using_memwin(struct adapter *sc, u_int tid, uint64_t *buf) +{ + int i, j, k, rc; + uint32_t addr; + u_char *tcb, tmp; + + MPASS(tid < sc->tids.ntids); + + addr = t4_read_reg(sc, A_TP_CMM_TCB_BASE) + tid * TCB_SIZE; + rc = read_via_memwin(sc, 2, addr, (uint32_t *)buf, TCB_SIZE); + if (rc != 0) + return; + + tcb = (u_char *)buf; + for (i = 0, j = TCB_SIZE - 16; i < j; i += 16, j -= 16) { + for (k = 0; k < 16; k++) { + tmp = tcb[i + k]; + tcb[i + k] = tcb[j + k]; + tcb[j + k] = tmp; + } + } +} + +static void +fill_tcp_info(struct adapter *sc, u_int tid, struct tcp_info *ti) +{ + uint64_t tcb[TCB_SIZE / sizeof(uint64_t)]; + struct tcb_histent *te; + + ti->tcpi_toe_tid = tid; + te = lookup_tcb_histent(sc, tid, false); + if (te != NULL) { + fill_tcp_info_from_history(sc, te, ti); + release_tcb_histent(te); + } else { + if (!(sc->debug_flags & DF_DISABLE_TCB_CACHE)) { + /* XXX: tell firmware to flush TCB cache. */ + } + read_tcb_using_memwin(sc, tid, tcb); + fill_tcp_info_from_tcb(sc, tcb, ti); + } } /* @@ -417,53 +768,13 @@ get_tcb_bits(u_char *tcb, int hi, int lo) static void t4_tcp_info(struct toedev *tod, struct tcpcb *tp, struct tcp_info *ti) { - int i, j, k, rc; struct adapter *sc = tod->tod_softc; struct toepcb *toep = tp->t_toe; - uint32_t addr, v; - uint32_t buf[TCB_SIZE / sizeof(uint32_t)]; - u_char *tcb, tmp; INP_WLOCK_ASSERT(tp->t_inpcb); MPASS(ti != NULL); - ti->tcpi_toe_tid = toep->tid; - - addr = t4_read_reg(sc, A_TP_CMM_TCB_BASE) + toep->tid * TCB_SIZE; - rc = read_via_memwin(sc, 2, addr, &buf[0], TCB_SIZE); - if (rc != 0) - return; - - tcb = (u_char *)&buf[0]; - for (i = 0, j = TCB_SIZE - 16; i < j; i += 16, j -= 16) { - for (k = 0; k < 16; k++) { - tmp = tcb[i + k]; - tcb[i + k] = tcb[j + k]; - tcb[j + k] = tmp; - } - } - - ti->tcpi_state = get_tcb_bits(tcb, 115, 112); - - v = get_tcb_bits(tcb, 271, 256); - ti->tcpi_rtt = tcp_ticks_to_us(sc, v); - - v = get_tcb_bits(tcb, 287, 272); - ti->tcpi_rttvar = tcp_ticks_to_us(sc, v); - - ti->tcpi_snd_ssthresh = get_tcb_bits(tcb, 487, 460); - ti->tcpi_snd_cwnd = get_tcb_bits(tcb, 459, 432); - ti->tcpi_rcv_nxt = get_tcb_bits(tcb, 553, 522); - - ti->tcpi_snd_nxt = get_tcb_bits(tcb, 319, 288) - - get_tcb_bits(tcb, 375, 348); - - /* Receive window being advertised by us. */ - ti->tcpi_rcv_space = get_tcb_bits(tcb, 581, 554); - - /* Send window ceiling. */ - v = get_tcb_bits(tcb, 159, 144) << get_tcb_bits(tcb, 131, 128); - ti->tcpi_snd_wnd = min(v, ti->tcpi_snd_cwnd); + fill_tcp_info(sc, toep->tid, ti); } /* @@ -807,6 +1118,35 @@ alloc_tid_tabs(struct tid_info *t) return (rc); } +static inline void +alloc_tcb_history(struct adapter *sc, struct tom_data *td) +{ + + if (sc->tids.ntids == 0 || sc->tids.ntids > 1024) + return; + rw_init(&td->tcb_history_lock, "TCB history"); + td->tcb_history = malloc(sc->tids.ntids * sizeof(*td->tcb_history), + M_CXGBE, M_ZERO | M_NOWAIT); + td->dupack_threshold = G_DUPACKTHRESH(t4_read_reg(sc, A_TP_PARA_REG0)); +} + +static inline void +free_tcb_history(struct adapter *sc, struct tom_data *td) +{ +#ifdef INVARIANTS + int i; + + if (td->tcb_history != NULL) { + for (i = 0; i < sc->tids.ntids; i++) { + MPASS(td->tcb_history[i] == NULL); + } + } +#endif + free(td->tcb_history, M_CXGBE); + if (rw_initialized(&td->tcb_history_lock)) + rw_destroy(&td->tcb_history_lock); +} + static void free_tom_data(struct adapter *sc, struct tom_data *td) { @@ -830,6 +1170,7 @@ free_tom_data(struct adapter *sc, struct tom_data *td) if (mtx_initialized(&td->toep_list_lock)) mtx_destroy(&td->toep_list_lock); + free_tcb_history(sc, td); free_tid_tabs(&sc->tids); free(td, M_CXGBE); } @@ -1097,6 +1438,8 @@ t4_tom_activate(struct adapter *sc) t4_set_reg_field(sc, A_ULP_RX_TDDP_TAGMASK, V_TDDPTAGMASK(M_TDDPTAGMASK), td->pr.pr_tag_mask); + alloc_tcb_history(sc, td); + /* toedev ops */ tod = &td->tod; init_toedev(tod); @@ -1214,6 +1557,7 @@ t4_tom_mod_load(void) struct protosw *tcp_protosw, *tcp6_protosw; /* CPL handlers */ + t4_register_cpl_handler(CPL_GET_TCB_RPL, do_get_tcb_rpl); t4_register_shared_cpl_handler(CPL_L2T_WRITE_RPL, do_l2t_write_rpl2, CPL_COOKIE_TOM); t4_init_connect_cpl_handlers(); diff --git a/sys/dev/cxgbe/tom/t4_tom.h b/sys/dev/cxgbe/tom/t4_tom.h index 9a9346f3932d..f32091755b95 100644 --- a/sys/dev/cxgbe/tom/t4_tom.h +++ b/sys/dev/cxgbe/tom/t4_tom.h @@ -33,6 +33,7 @@ #ifndef __T4_TOM_H__ #define __T4_TOM_H__ #include +#include "common/t4_hw.h" #include "tom/t4_tls.h" #define LISTEN_HASH_SIZE 32 @@ -254,6 +255,31 @@ struct listen_ctx { struct clip_entry *ce; }; +/* tcb_histent flags */ +#define TE_RPL_PENDING 1 +#define TE_ACTIVE 2 + +/* bits in one 8b tcb_histent sample. */ +#define TS_RTO (1 << 0) +#define TS_DUPACKS (1 << 1) +#define TS_FASTREXMT (1 << 2) +#define TS_SND_BACKLOGGED (1 << 3) +#define TS_CWND_LIMITED (1 << 4) +#define TS_ECN_ECE (1 << 5) +#define TS_ECN_CWR (1 << 6) +#define TS_RESERVED (1 << 7) /* Unused. */ + +struct tcb_histent { + struct mtx te_lock; + struct callout te_callout; + uint64_t te_tcb[TCB_SIZE / sizeof(uint64_t)]; + struct adapter *te_adapter; + u_int te_flags; + u_int te_tid; + uint8_t te_pidx; + uint8_t te_sample[100]; +}; + struct tom_data { struct toedev tod; @@ -268,6 +294,10 @@ struct tom_data { struct ppod_region pr; + struct rwlock tcb_history_lock __aligned(CACHE_LINE_SIZE); + struct tcb_histent **tcb_history; + int dupack_threshold; + /* WRs that will not be sent to the chip because L2 resolution failed */ struct mtx unsent_wr_lock; STAILQ_HEAD(, wrqe) unsent_wr_list; @@ -326,6 +356,7 @@ int select_ulp_mode(struct socket *, struct adapter *, struct offload_settings *); void set_ulp_mode(struct toepcb *, int); int negative_advice(int); +int add_tid_to_history(struct adapter *, u_int); /* t4_connect.c */ void t4_init_connect_cpl_handlers(void);