- Simplify mb_free_ext_fast

- increase asserts for mbuf accounting
- track outstanding mbufs (maps very closely to leaked)
- actually only create one thread per port if !multiq
    Oddly enough this fixes the use after free

- move txq_segs to stack in t3_encap
- add checks that pidx doesn't move past cidx
- simplify mbuf free logic in collapse mbufs routine
This commit is contained in:
Kip Macy 2008-01-15 08:08:09 +00:00
parent 3c58f6ddbd
commit 139edb19d9
5 changed files with 91 additions and 40 deletions

View File

@ -298,8 +298,7 @@ struct sge_txq {
unsigned long txq_frees;
struct mtx lock;
struct sg_ent txq_sgl[TX_MAX_SEGS / 2 + 1];
bus_dma_segment_t txq_segs[TX_MAX_SEGS];
#define TXQ_NAME_LEN 32
#define TXQ_NAME_LEN 32
char lockbuf[TXQ_NAME_LEN];
};

View File

@ -422,6 +422,7 @@ cxgb_pcpu_start_(struct sge_qset *qs, struct mbuf *immpkt, int tx_flush)
txq = &qs->txq[TXQ_ETH];
mtx_assert(&txq->lock, MA_OWNED);
KASSERT(qs->idx == 0, ("invalid qs %d", qs->idx));
retry:
if (!pi->link_config.link_ok)
@ -667,13 +668,14 @@ cxgb_pcpu_startup_threads(struct adapter *sc)
#else
nqsets = 1;
#endif
for (j = 0; j < pi->nqsets; ++j) {
for (j = 0; j < nqsets; ++j) {
struct sge_qset *qs;
qs = &sc->sge.qs[pi->first_qset + j];
qs->port = pi;
qs->qs_cpuid = ((pi->first_qset + j) % mp_ncpus);
device_printf(sc->dev, "starting thread for %d\n", qs->qs_cpuid);
device_printf(sc->dev, "starting thread for %d\n",
qs->qs_cpuid);
kproc_create(cxgb_pcpu_start_proc, qs, &p,
RFNOWAIT, 0, "cxgbsp");
@ -686,11 +688,18 @@ void
cxgb_pcpu_shutdown_threads(struct adapter *sc)
{
int i, j;
int nqsets;
#ifdef IFNET_MULTIQUEUE
nqsets = pi->nqsets;
#else
nqsets = 1;
#endif
for (i = 0; i < sc->params.nports; i++) {
struct port_info *pi = &sc->port[i];
int first = pi->first_qset;
for (j = 0; j < pi->nqsets; j++) {
for (j = 0; j < nqsets; j++) {
struct sge_qset *qs = &sc->sge.qs[first + j];
qs->qs_flags |= QS_EXITING;

View File

@ -927,7 +927,17 @@ txq_prod(struct sge_txq *txq, unsigned int ndesc, struct txq_state *txqs)
txq->unacked &= 7;
txqs->pidx = txq->pidx;
txq->pidx += ndesc;
#ifdef INVARIANTS
if (((txqs->pidx > txq->cidx) &&
(txq->pidx < txqs->pidx) &&
(txq->pidx >= txq->cidx)) ||
((txqs->pidx < txq->cidx) &&
(txq->pidx >= txq-> cidx)) ||
((txqs->pidx < txq->cidx) &&
(txq->cidx < txqs->pidx)))
panic("txqs->pidx=%d txq->pidx=%d txq->cidx=%d",
txqs->pidx, txq->pidx, txq->cidx);
#endif
if (txq->pidx >= txq->size) {
txq->pidx -= txq->size;
txq->gen ^= 1;
@ -1205,23 +1215,23 @@ t3_encap(struct sge_qset *qs, struct mbuf **m, int count)
struct work_request_hdr *wrp;
struct tx_sw_desc *txsd;
struct sg_ent *sgp, *sgl;
bus_dma_segment_t *segs;
uint32_t wr_hi, wr_lo, sgl_flits;
bus_dma_segment_t segs[TX_MAX_SEGS];
struct tx_desc *txd;
struct mbuf_vec *mv;
struct mbuf_iovec *mi;
DPRINTF("t3_encap cpu=%d ", curcpu);
KASSERT(qs->idx == 0, ("invalid qs %d", qs->idx));
mi = NULL;
pi = qs->port;
sc = pi->adapter;
txq = &qs->txq[TXQ_ETH];
txsd = &txq->sdesc[txq->pidx];
txd = &txq->desc[txq->pidx];
txsd = &txq->sdesc[txq->pidx];
sgl = txq->txq_sgl;
segs = txq->txq_segs;
m0 = *m;
DPRINTF("t3_encap port_id=%d qsidx=%d ", pi->port_id, pi->first_qset);
@ -1240,6 +1250,8 @@ t3_encap(struct sge_qset *qs, struct mbuf **m, int count)
#endif
KASSERT(txsd->mi.mi_base == NULL, ("overwrting valid entry mi_base==%p",
txsd->mi.mi_base));
if (cxgb_debug)
printf("uipc_mvec PIO_LEN=%ld\n", PIO_LEN);
if (count > 1) {
panic("count > 1 not support in CVS\n");
@ -1253,7 +1265,7 @@ t3_encap(struct sge_qset *qs, struct mbuf **m, int count)
}
KASSERT(m0->m_pkthdr.len, ("empty packet nsegs=%d count=%d", nsegs, count));
if (m0->m_pkthdr.len > PIO_LEN) {
if (!(m0->m_pkthdr.len <= PIO_LEN)) {
mi_collapse_mbuf(&txsd->mi, m0);
mi = &txsd->mi;
}
@ -1393,8 +1405,11 @@ t3_encap(struct sge_qset *qs, struct mbuf **m, int count)
write_wr_hdr_sgl(ndesc, txd, &txqs, txq, sgl, flits, sgl_flits, wr_hi, wr_lo);
check_ring_tx_db(pi->adapter, txq);
if ((m0->m_type == MT_DATA) && ((m0->m_flags & (M_EXT|M_NOFREE)) == M_EXT)) {
if ((m0->m_type == MT_DATA) &&
((m0->m_flags & (M_EXT|M_NOFREE)) == M_EXT) &&
(m0->m_ext.ext_type != EXT_PACKET)) {
m0->m_flags &= ~M_EXT ;
mbufs_outstanding--;
m_free(m0);
}
@ -1797,6 +1812,7 @@ t3_free_tx_desc(struct sge_txq *q, int reclaimable)
cidx = q->cidx;
txsd = &q->sdesc[cidx];
DPRINTF("reclaiming %d WR\n", reclaimable);
mtx_assert(&q->lock, MA_OWNED);
while (reclaimable--) {
DPRINTF("cidx=%d d=%p\n", cidx, txsd);
if (txsd->mi.mi_base != NULL) {
@ -2208,6 +2224,7 @@ t3_sge_alloc_qset(adapter_t *sc, u_int id, int nports, int irq_vec_idx,
}
init_qset_cntxt(q, id);
q->idx = id;
if ((ret = alloc_ring(sc, p->fl_size, sizeof(struct rx_desc),
sizeof(struct rx_sw_desc), &q->fl[0].phys_addr,
@ -3203,7 +3220,11 @@ t3_add_attach_sysctls(adapter_t *sc)
SYSCTL_ADD_INT(ctx, children, OID_AUTO,
"ext_freed",
CTLFLAG_RD, &cxgb_ext_freed,
0, "#times a cluster was freed through ext_free");
0, "#times a cluster was freed through ext_free");
SYSCTL_ADD_INT(ctx, children, OID_AUTO,
"mbufs_outstanding",
CTLFLAG_RD, &mbufs_outstanding,
0, "#mbufs in flight in the driver");
}

View File

@ -45,6 +45,7 @@ void cxgb_cache_refill(void);
extern int cxgb_cached_allocations;
extern int cxgb_cached;
extern int cxgb_ext_freed;
extern int mbufs_outstanding;
#define mtomv(m) ((struct mbuf_vec *)((m)->m_pktdat))
#define M_IOVEC 0x100000 /* mbuf immediate data area is used for cluster ptrs */
@ -162,7 +163,7 @@ m_explode(struct mbuf *m)
static __inline void
busdma_map_mbuf_fast(struct mbuf *m, bus_dma_segment_t *seg)
{
seg->ds_addr = pmap_kextract((vm_offset_t)m->m_data);
seg->ds_addr = pmap_kextract(mtod(m, vm_offset_t));
seg->ds_len = m->m_len;
}
@ -229,11 +230,17 @@ m_free_iovec(struct mbuf *m, int type)
static __inline void
m_freem_iovec(struct mbuf_iovec *mi)
{
struct mbuf *m;
struct mbuf *m = (struct mbuf *)mi->mi_base;
switch (mi->mi_type) {
case EXT_MBUF:
m_free_fast((struct mbuf *)mi->mi_base);
#ifdef PIO_LEN
KASSERT(m->m_pkthdr.len > PIO_LEN, ("freeing PIO buf"));
#endif
KASSERT((mi->mi_flags & M_NOFREE) == 0, ("no free set on mbuf"));
KASSERT(m->m_next == NULL, ("freeing chain"));
mbufs_outstanding--;
m_free_fast(m);
break;
case EXT_IOVEC:
case EXT_CLIOVEC:

View File

@ -75,6 +75,7 @@ extern uint32_t mb_free_vec_free;
uma_zone_t zone_miovec;
static int mi_inited = 0;
int mbufs_outstanding = 0;
void
mi_init(void)
@ -198,12 +199,18 @@ busdma_map_sg_collapse(struct mbuf **m, bus_dma_segment_t *segs, int *nsegs)
struct mbuf *marray[TX_MAX_SEGS];
int i, type, seg_count, defragged = 0, err = 0;
struct mbuf_vec *mv;
int skipped, freed, outstanding;
KASSERT(n->m_pkthdr.len, ("packet has zero header len"));
if (n->m_flags & M_PKTHDR && !SLIST_EMPTY(&n->m_pkthdr.tags))
m_tag_delete_chain(n, NULL);
if (cxgb_debug)
printf("cxgb_sge PIO_LEN=%ld\n", PIO_LEN);
if (n->m_pkthdr.len <= PIO_LEN)
return (0);
retry:
@ -211,16 +218,21 @@ busdma_map_sg_collapse(struct mbuf **m, bus_dma_segment_t *segs, int *nsegs)
if (n->m_next == NULL) {
busdma_map_mbuf_fast(n, segs);
*nsegs = 1;
if ((n->m_flags & M_NOFREE) == 0)
mbufs_outstanding++;
return (0);
}
skipped = freed = outstanding = 0;
while (n && seg_count < TX_MAX_SEGS) {
marray[seg_count] = n;
/*
* firmware doesn't like empty segments
*/
if (__predict_true(n->m_len != 0))
if (__predict_true(n->m_len != 0))
seg_count++;
else
skipped++;
n = n->m_next;
}
@ -265,24 +277,35 @@ busdma_map_sg_collapse(struct mbuf **m, bus_dma_segment_t *segs, int *nsegs)
}
n = *m;
while (n) {
if (n->m_ext.ext_type == EXT_PACKET)
if (n->m_len == 0)
/* do nothing - free if mbuf or cluster */;
else if (((n->m_flags & M_EXT) == 0) ||
((n->m_flags & M_EXT) && (n->m_ext.ext_type == EXT_PACKET)) ||
(n->m_flags & M_NOFREE))
goto skip;
else if (n->m_len == 0)
/* do nothing */;
else if ((n->m_flags & (M_EXT|M_NOFREE)) == M_EXT)
n->m_flags &= ~M_EXT;
else
goto skip;
mhead = n->m_next;
m_free(n);
n = mhead;
freed++;
continue;
skip:
/*
* is an immediate mbuf or is from the packet zone
*/
outstanding++;
n = n->m_next;
}
*nsegs = seg_count;
*m = m0;
DPRINTF("pktlen=%d m0=%p *m=%p m=%p\n", m0->m_pkthdr.len, m0, *m, m);
mbufs_outstanding += outstanding;
KASSERT(outstanding + freed == skipped + seg_count,
("outstanding=%d freed=%d skipped=%d seg_count=%d",
outstanding, freed, skipped, seg_count));
return (0);
err_out:
m_freem(*m);
@ -325,6 +348,7 @@ busdma_map_sg_vec(struct mbuf **m, struct mbuf **mret, bus_dma_segment_t *segs,
(*mp)->m_next = (*mp)->m_nextpkt = NULL;
if (((*mp)->m_flags & (M_EXT|M_NOFREE)) == M_EXT) {
(*mp)->m_flags &= ~M_EXT;
mbufs_outstanding--;
m_free(*mp);
}
}
@ -336,29 +360,17 @@ busdma_map_sg_vec(struct mbuf **m, struct mbuf **mret, bus_dma_segment_t *segs,
void
mb_free_ext_fast(struct mbuf_iovec *mi, int type, int idx)
{
u_int cnt;
int dofree;
caddr_t cl;
/* Account for lazy ref count assign. */
dofree = (mi->mi_refcnt == NULL);
/*
* This is tricky. We need to make sure to decrement the
* refcount in a safe way but to also clean up if we're the
* last reference. This method seems to do it without race.
*/
while (dofree == 0) {
cnt = *(mi->mi_refcnt);
if (cnt == 1) {
dofree = 1;
break;
}
if (atomic_cmpset_int(mi->mi_refcnt, cnt, cnt - 1)) {
if (cnt == 1)
dofree = 1;
break;
}
if (dofree == 0) {
KASSERT(mi->mi_type != EXT_MBUF,
("refcnt must be null for mbuf"));
if (*(mi->mi_refcnt) == 1 ||
atomic_fetchadd_int(mi->mi_refcnt, -1) == 1)
dofree = 1;
}
if (dofree == 0)
return;
@ -366,6 +378,8 @@ mb_free_ext_fast(struct mbuf_iovec *mi, int type, int idx)
cl = mi->mi_base;
switch (type) {
case EXT_MBUF:
KASSERT((mi->mi_flags & M_NOFREE) == 0, ("no free set on mbuf"));
mbufs_outstanding--;
m_free_fast((struct mbuf *)cl);
break;
case EXT_CLUSTER:
@ -395,6 +409,7 @@ mb_free_ext_fast(struct mbuf_iovec *mi, int type, int idx)
mi->mi_ext.ext_args);
break;
case EXT_PACKET:
mbufs_outstanding--;
if (*(mi->mi_refcnt) == 0)
*(mi->mi_refcnt) = 1;
uma_zfree(zone_pack, mi->mi_mbuf);