cxgbe/t4_tom: Add a "TCB history" feature that samples hardware state

for a tid and maintains a running history of some interesting events.

Service TCP_INFO queries from the history when the tid is being tracked
there.
This commit is contained in:
np 2019-04-22 17:48:10 +00:00
parent 638f399f3c
commit 642cf308aa
3 changed files with 432 additions and 56 deletions

View File

@ -923,7 +923,8 @@ struct cpl_get_tcb {
WR_HDR;
union opcode_tid ot;
__be16 reply_ctrl;
__be16 cookie;
__u8 rsvd;
__u8 cookie;
};
/* cpl_get_tcb.reply_ctrl fields */

View File

@ -386,28 +386,379 @@ t4_ctloutput(struct toedev *tod, struct tcpcb *tp, int dir, int name)
}
}
static inline int
get_tcb_bit(u_char *tcb, int bit)
static inline uint64_t
get_tcb_tflags(const uint64_t *tcb)
{
int ix, shift;
ix = 127 - (bit >> 3);
shift = bit & 0x7;
return ((tcb[ix] >> shift) & 1);
return ((be64toh(tcb[14]) << 32) | (be64toh(tcb[15]) >> 32));
}
static inline uint64_t
get_tcb_bits(u_char *tcb, int hi, int lo)
static inline uint32_t
get_tcb_field(const uint64_t *tcb, u_int word, uint32_t mask, u_int shift)
{
uint64_t rc = 0;
#define LAST_WORD ((TCB_SIZE / 4) - 1)
uint64_t t1, t2;
int flit_idx;
while (hi >= lo) {
rc = (rc << 1) | get_tcb_bit(tcb, hi);
--hi;
MPASS(mask != 0);
MPASS(word <= LAST_WORD);
MPASS(shift < 32);
flit_idx = (LAST_WORD - word) / 2;
if (word & 0x1)
shift += 32;
t1 = be64toh(tcb[flit_idx]) >> shift;
t2 = 0;
if (fls(mask) > 64 - shift) {
/*
* Will spill over into the next logical flit, which is the flit
* before this one. The flit_idx before this one must be valid.
*/
MPASS(flit_idx > 0);
t2 = be64toh(tcb[flit_idx - 1]) << (64 - shift);
}
return ((t2 | t1) & mask);
#undef LAST_WORD
}
#define GET_TCB_FIELD(tcb, F) \
get_tcb_field(tcb, W_TCB_##F, M_TCB_##F, S_TCB_##F)
/*
* Issues a CPL_GET_TCB to read the entire TCB for the tid.
*/
static int
send_get_tcb(struct adapter *sc, u_int tid)
{
struct cpl_get_tcb *cpl;
struct wrq_cookie cookie;
MPASS(tid < sc->tids.ntids);
cpl = start_wrq_wr(&sc->sge.ctrlq[0], howmany(sizeof(*cpl), 16),
&cookie);
if (__predict_false(cpl == NULL))
return (ENOMEM);
bzero(cpl, sizeof(*cpl));
INIT_TP_WR(cpl, tid);
OPCODE_TID(cpl) = htobe32(MK_OPCODE_TID(CPL_GET_TCB, tid));
cpl->reply_ctrl = htobe16(V_REPLY_CHAN(0) |
V_QUEUENO(sc->sge.ofld_rxq[0].iq.cntxt_id));
cpl->cookie = 0xff;
commit_wrq_wr(&sc->sge.ctrlq[0], cpl, &cookie);
return (0);
}
static struct tcb_histent *
alloc_tcb_histent(struct adapter *sc, u_int tid, int flags)
{
struct tcb_histent *te;
MPASS(flags == M_NOWAIT || flags == M_WAITOK);
te = malloc(sizeof(*te), M_CXGBE, M_ZERO | flags);
if (te == NULL)
return (NULL);
mtx_init(&te->te_lock, "TCB entry", NULL, MTX_DEF);
callout_init_mtx(&te->te_callout, &te->te_lock, 0);
te->te_adapter = sc;
te->te_tid = tid;
return (te);
}
static void
free_tcb_histent(struct tcb_histent *te)
{
mtx_destroy(&te->te_lock);
free(te, M_CXGBE);
}
/*
* Start tracking the tid in the TCB history.
*/
int
add_tid_to_history(struct adapter *sc, u_int tid)
{
struct tcb_histent *te = NULL;
struct tom_data *td = sc->tom_softc;
int rc;
MPASS(tid < sc->tids.ntids);
if (td->tcb_history == NULL)
return (ENXIO);
rw_wlock(&td->tcb_history_lock);
if (td->tcb_history[tid] != NULL) {
rc = EEXIST;
goto done;
}
te = alloc_tcb_histent(sc, tid, M_NOWAIT);
if (te == NULL) {
rc = ENOMEM;
goto done;
}
mtx_lock(&te->te_lock);
rc = send_get_tcb(sc, tid);
if (rc == 0) {
te->te_flags |= TE_RPL_PENDING;
td->tcb_history[tid] = te;
} else {
free(te, M_CXGBE);
}
mtx_unlock(&te->te_lock);
done:
rw_wunlock(&td->tcb_history_lock);
return (rc);
}
static void
remove_tcb_histent(struct tcb_histent *te)
{
struct adapter *sc = te->te_adapter;
struct tom_data *td = sc->tom_softc;
rw_assert(&td->tcb_history_lock, RA_WLOCKED);
mtx_assert(&te->te_lock, MA_OWNED);
MPASS(td->tcb_history[te->te_tid] == te);
td->tcb_history[te->te_tid] = NULL;
free_tcb_histent(te);
rw_wunlock(&td->tcb_history_lock);
}
static inline struct tcb_histent *
lookup_tcb_histent(struct adapter *sc, u_int tid, bool addrem)
{
struct tcb_histent *te;
struct tom_data *td = sc->tom_softc;
MPASS(tid < sc->tids.ntids);
if (addrem)
rw_wlock(&td->tcb_history_lock);
else
rw_rlock(&td->tcb_history_lock);
te = td->tcb_history[tid];
if (te != NULL) {
mtx_lock(&te->te_lock);
return (te); /* with both locks held */
}
if (addrem)
rw_wunlock(&td->tcb_history_lock);
else
rw_runlock(&td->tcb_history_lock);
return (te);
}
static inline void
release_tcb_histent(struct tcb_histent *te)
{
struct adapter *sc = te->te_adapter;
struct tom_data *td = sc->tom_softc;
mtx_assert(&te->te_lock, MA_OWNED);
mtx_unlock(&te->te_lock);
rw_assert(&td->tcb_history_lock, RA_RLOCKED);
rw_runlock(&td->tcb_history_lock);
}
static void
request_tcb(void *arg)
{
struct tcb_histent *te = arg;
mtx_assert(&te->te_lock, MA_OWNED);
/* Noone else is supposed to update the histent. */
MPASS(!(te->te_flags & TE_RPL_PENDING));
if (send_get_tcb(te->te_adapter, te->te_tid) == 0)
te->te_flags |= TE_RPL_PENDING;
else
callout_schedule(&te->te_callout, hz / 100);
}
static void
update_tcb_histent(struct tcb_histent *te, const uint64_t *tcb)
{
struct tom_data *td = te->te_adapter->tom_softc;
uint64_t tflags = get_tcb_tflags(tcb);
uint8_t sample = 0;
if (GET_TCB_FIELD(tcb, SND_MAX_RAW) != GET_TCB_FIELD(tcb, SND_UNA_RAW)) {
if (GET_TCB_FIELD(tcb, T_RXTSHIFT) != 0)
sample |= TS_RTO;
if (GET_TCB_FIELD(tcb, T_DUPACKS) != 0)
sample |= TS_DUPACKS;
if (GET_TCB_FIELD(tcb, T_DUPACKS) >= td->dupack_threshold)
sample |= TS_FASTREXMT;
}
return (rc);
if (GET_TCB_FIELD(tcb, SND_MAX_RAW) != 0) {
uint32_t snd_wnd;
sample |= TS_SND_BACKLOGGED; /* for whatever reason. */
snd_wnd = GET_TCB_FIELD(tcb, RCV_ADV);
if (tflags & V_TF_RECV_SCALE(1))
snd_wnd <<= GET_TCB_FIELD(tcb, RCV_SCALE);
if (GET_TCB_FIELD(tcb, SND_CWND) < snd_wnd)
sample |= TS_CWND_LIMITED; /* maybe due to CWND */
}
if (tflags & V_TF_CCTRL_ECN(1)) {
/*
* CE marker on incoming IP hdr, echoing ECE back in the TCP
* hdr. Indicates congestion somewhere on the way from the peer
* to this node.
*/
if (tflags & V_TF_CCTRL_ECE(1))
sample |= TS_ECN_ECE;
/*
* ECE seen and CWR sent (or about to be sent). Might indicate
* congestion on the way to the peer. This node is reducing its
* congestion window in response.
*/
if (tflags & (V_TF_CCTRL_CWR(1) | V_TF_CCTRL_RFR(1)))
sample |= TS_ECN_CWR;
}
te->te_sample[te->te_pidx] = sample;
if (++te->te_pidx == nitems(te->te_sample))
te->te_pidx = 0;
memcpy(te->te_tcb, tcb, TCB_SIZE);
te->te_flags |= TE_ACTIVE;
}
static int
do_get_tcb_rpl(struct sge_iq *iq, const struct rss_header *rss, struct mbuf *m)
{
struct adapter *sc = iq->adapter;
const struct cpl_get_tcb_rpl *cpl = mtod(m, const void *);
const uint64_t *tcb = (const uint64_t *)(const void *)(cpl + 1);
struct tcb_histent *te;
const u_int tid = GET_TID(cpl);
bool remove;
remove = GET_TCB_FIELD(tcb, T_STATE) == TCPS_CLOSED;
te = lookup_tcb_histent(sc, tid, remove);
if (te == NULL) {
/* Not in the history. Who issued the GET_TCB for this? */
device_printf(sc->dev, "tcb %u: flags 0x%016jx, state %u, "
"srtt %u, sscale %u, rscale %u, cookie 0x%x\n", tid,
(uintmax_t)get_tcb_tflags(tcb), GET_TCB_FIELD(tcb, T_STATE),
GET_TCB_FIELD(tcb, T_SRTT), GET_TCB_FIELD(tcb, SND_SCALE),
GET_TCB_FIELD(tcb, RCV_SCALE), cpl->cookie);
goto done;
}
MPASS(te->te_flags & TE_RPL_PENDING);
te->te_flags &= ~TE_RPL_PENDING;
if (remove) {
remove_tcb_histent(te);
} else {
update_tcb_histent(te, tcb);
callout_reset(&te->te_callout, hz / 10, request_tcb, te);
release_tcb_histent(te);
}
done:
m_freem(m);
return (0);
}
static void
fill_tcp_info_from_tcb(struct adapter *sc, uint64_t *tcb, struct tcp_info *ti)
{
uint32_t v;
ti->tcpi_state = GET_TCB_FIELD(tcb, T_STATE);
v = GET_TCB_FIELD(tcb, T_SRTT);
ti->tcpi_rtt = tcp_ticks_to_us(sc, v);
v = GET_TCB_FIELD(tcb, T_RTTVAR);
ti->tcpi_rttvar = tcp_ticks_to_us(sc, v);
ti->tcpi_snd_ssthresh = GET_TCB_FIELD(tcb, SND_SSTHRESH);
ti->tcpi_snd_cwnd = GET_TCB_FIELD(tcb, SND_CWND);
ti->tcpi_rcv_nxt = GET_TCB_FIELD(tcb, RCV_NXT);
v = GET_TCB_FIELD(tcb, TX_MAX);
ti->tcpi_snd_nxt = v - GET_TCB_FIELD(tcb, SND_NXT_RAW);
/* Receive window being advertised by us. */
ti->tcpi_rcv_wscale = GET_TCB_FIELD(tcb, SND_SCALE); /* Yes, SND. */
ti->tcpi_rcv_space = GET_TCB_FIELD(tcb, RCV_WND);
/* Send window */
ti->tcpi_snd_wscale = GET_TCB_FIELD(tcb, RCV_SCALE); /* Yes, RCV. */
ti->tcpi_snd_wnd = GET_TCB_FIELD(tcb, RCV_ADV);
if (get_tcb_tflags(tcb) & V_TF_RECV_SCALE(1))
ti->tcpi_snd_wnd <<= ti->tcpi_snd_wscale;
else
ti->tcpi_snd_wscale = 0;
}
static void
fill_tcp_info_from_history(struct adapter *sc, struct tcb_histent *te,
struct tcp_info *ti)
{
fill_tcp_info_from_tcb(sc, te->te_tcb, ti);
}
/*
* Reads the TCB for the given tid using a memory window and copies it to 'buf'
* in the same format as CPL_GET_TCB_RPL.
*/
static void
read_tcb_using_memwin(struct adapter *sc, u_int tid, uint64_t *buf)
{
int i, j, k, rc;
uint32_t addr;
u_char *tcb, tmp;
MPASS(tid < sc->tids.ntids);
addr = t4_read_reg(sc, A_TP_CMM_TCB_BASE) + tid * TCB_SIZE;
rc = read_via_memwin(sc, 2, addr, (uint32_t *)buf, TCB_SIZE);
if (rc != 0)
return;
tcb = (u_char *)buf;
for (i = 0, j = TCB_SIZE - 16; i < j; i += 16, j -= 16) {
for (k = 0; k < 16; k++) {
tmp = tcb[i + k];
tcb[i + k] = tcb[j + k];
tcb[j + k] = tmp;
}
}
}
static void
fill_tcp_info(struct adapter *sc, u_int tid, struct tcp_info *ti)
{
uint64_t tcb[TCB_SIZE / sizeof(uint64_t)];
struct tcb_histent *te;
ti->tcpi_toe_tid = tid;
te = lookup_tcb_histent(sc, tid, false);
if (te != NULL) {
fill_tcp_info_from_history(sc, te, ti);
release_tcb_histent(te);
} else {
if (!(sc->debug_flags & DF_DISABLE_TCB_CACHE)) {
/* XXX: tell firmware to flush TCB cache. */
}
read_tcb_using_memwin(sc, tid, tcb);
fill_tcp_info_from_tcb(sc, tcb, ti);
}
}
/*
@ -417,53 +768,13 @@ get_tcb_bits(u_char *tcb, int hi, int lo)
static void
t4_tcp_info(struct toedev *tod, struct tcpcb *tp, struct tcp_info *ti)
{
int i, j, k, rc;
struct adapter *sc = tod->tod_softc;
struct toepcb *toep = tp->t_toe;
uint32_t addr, v;
uint32_t buf[TCB_SIZE / sizeof(uint32_t)];
u_char *tcb, tmp;
INP_WLOCK_ASSERT(tp->t_inpcb);
MPASS(ti != NULL);
ti->tcpi_toe_tid = toep->tid;
addr = t4_read_reg(sc, A_TP_CMM_TCB_BASE) + toep->tid * TCB_SIZE;
rc = read_via_memwin(sc, 2, addr, &buf[0], TCB_SIZE);
if (rc != 0)
return;
tcb = (u_char *)&buf[0];
for (i = 0, j = TCB_SIZE - 16; i < j; i += 16, j -= 16) {
for (k = 0; k < 16; k++) {
tmp = tcb[i + k];
tcb[i + k] = tcb[j + k];
tcb[j + k] = tmp;
}
}
ti->tcpi_state = get_tcb_bits(tcb, 115, 112);
v = get_tcb_bits(tcb, 271, 256);
ti->tcpi_rtt = tcp_ticks_to_us(sc, v);
v = get_tcb_bits(tcb, 287, 272);
ti->tcpi_rttvar = tcp_ticks_to_us(sc, v);
ti->tcpi_snd_ssthresh = get_tcb_bits(tcb, 487, 460);
ti->tcpi_snd_cwnd = get_tcb_bits(tcb, 459, 432);
ti->tcpi_rcv_nxt = get_tcb_bits(tcb, 553, 522);
ti->tcpi_snd_nxt = get_tcb_bits(tcb, 319, 288) -
get_tcb_bits(tcb, 375, 348);
/* Receive window being advertised by us. */
ti->tcpi_rcv_space = get_tcb_bits(tcb, 581, 554);
/* Send window ceiling. */
v = get_tcb_bits(tcb, 159, 144) << get_tcb_bits(tcb, 131, 128);
ti->tcpi_snd_wnd = min(v, ti->tcpi_snd_cwnd);
fill_tcp_info(sc, toep->tid, ti);
}
/*
@ -807,6 +1118,35 @@ alloc_tid_tabs(struct tid_info *t)
return (rc);
}
static inline void
alloc_tcb_history(struct adapter *sc, struct tom_data *td)
{
if (sc->tids.ntids == 0 || sc->tids.ntids > 1024)
return;
rw_init(&td->tcb_history_lock, "TCB history");
td->tcb_history = malloc(sc->tids.ntids * sizeof(*td->tcb_history),
M_CXGBE, M_ZERO | M_NOWAIT);
td->dupack_threshold = G_DUPACKTHRESH(t4_read_reg(sc, A_TP_PARA_REG0));
}
static inline void
free_tcb_history(struct adapter *sc, struct tom_data *td)
{
#ifdef INVARIANTS
int i;
if (td->tcb_history != NULL) {
for (i = 0; i < sc->tids.ntids; i++) {
MPASS(td->tcb_history[i] == NULL);
}
}
#endif
free(td->tcb_history, M_CXGBE);
if (rw_initialized(&td->tcb_history_lock))
rw_destroy(&td->tcb_history_lock);
}
static void
free_tom_data(struct adapter *sc, struct tom_data *td)
{
@ -830,6 +1170,7 @@ free_tom_data(struct adapter *sc, struct tom_data *td)
if (mtx_initialized(&td->toep_list_lock))
mtx_destroy(&td->toep_list_lock);
free_tcb_history(sc, td);
free_tid_tabs(&sc->tids);
free(td, M_CXGBE);
}
@ -1097,6 +1438,8 @@ t4_tom_activate(struct adapter *sc)
t4_set_reg_field(sc, A_ULP_RX_TDDP_TAGMASK,
V_TDDPTAGMASK(M_TDDPTAGMASK), td->pr.pr_tag_mask);
alloc_tcb_history(sc, td);
/* toedev ops */
tod = &td->tod;
init_toedev(tod);
@ -1214,6 +1557,7 @@ t4_tom_mod_load(void)
struct protosw *tcp_protosw, *tcp6_protosw;
/* CPL handlers */
t4_register_cpl_handler(CPL_GET_TCB_RPL, do_get_tcb_rpl);
t4_register_shared_cpl_handler(CPL_L2T_WRITE_RPL, do_l2t_write_rpl2,
CPL_COOKIE_TOM);
t4_init_connect_cpl_handlers();

View File

@ -33,6 +33,7 @@
#ifndef __T4_TOM_H__
#define __T4_TOM_H__
#include <sys/vmem.h>
#include "common/t4_hw.h"
#include "tom/t4_tls.h"
#define LISTEN_HASH_SIZE 32
@ -254,6 +255,31 @@ struct listen_ctx {
struct clip_entry *ce;
};
/* tcb_histent flags */
#define TE_RPL_PENDING 1
#define TE_ACTIVE 2
/* bits in one 8b tcb_histent sample. */
#define TS_RTO (1 << 0)
#define TS_DUPACKS (1 << 1)
#define TS_FASTREXMT (1 << 2)
#define TS_SND_BACKLOGGED (1 << 3)
#define TS_CWND_LIMITED (1 << 4)
#define TS_ECN_ECE (1 << 5)
#define TS_ECN_CWR (1 << 6)
#define TS_RESERVED (1 << 7) /* Unused. */
struct tcb_histent {
struct mtx te_lock;
struct callout te_callout;
uint64_t te_tcb[TCB_SIZE / sizeof(uint64_t)];
struct adapter *te_adapter;
u_int te_flags;
u_int te_tid;
uint8_t te_pidx;
uint8_t te_sample[100];
};
struct tom_data {
struct toedev tod;
@ -268,6 +294,10 @@ struct tom_data {
struct ppod_region pr;
struct rwlock tcb_history_lock __aligned(CACHE_LINE_SIZE);
struct tcb_histent **tcb_history;
int dupack_threshold;
/* WRs that will not be sent to the chip because L2 resolution failed */
struct mtx unsent_wr_lock;
STAILQ_HEAD(, wrqe) unsent_wr_list;
@ -326,6 +356,7 @@ int select_ulp_mode(struct socket *, struct adapter *,
struct offload_settings *);
void set_ulp_mode(struct toepcb *, int);
int negative_advice(int);
int add_tid_to_history(struct adapter *, u_int);
/* t4_connect.c */
void t4_init_connect_cpl_handlers(void);