This fixes an out-of-order problem with several

of the newer drivers. The basic problem was
that the driver was pulling the mbuf off the
drbr ring and then, when sending with xmit(), encountering
a full transmit ring. Thus the lower layer
xmit() function would return an error, and the
drivers would then append the data back on to the ring.
For TCP this is a horrible scenario sure to bring
on a fast-retransmit.

The fix is to use drbr_peek() to pull the data pointer
but not remove it from the ring. If it fails then
we either call the new drbr_putback or drbr_advance
method. Advance moves it forward (we do this sometimes
when the xmit() function frees the mbuf). When
we succeed we always call advance. The
putback will always copy the mbuf back to the top
of the ring. Note that the putback *cannot* be used
with a drbr_dequeue() only with drbr_peek(). We most
of the time, in putback, would not need to copy it
back since most likely the mbuf is still the same, but
sometimes xmit() functions will change the mbuf via
a pullup or other call. So the optimal case for
the single consumer is to always copy it back. If
we ever do a multiple_consumer (for lagg?) we
will need a test and atomic in the putback, possibly
a separate putback_mc() in the ring buf.

Reviewed by:	jhb@freebsd.org, jlv@freebsd.org
This commit is contained in:
Randall Stewart 2013-02-07 15:20:54 +00:00
parent 9be7626dee
commit ded5ea6a25
9 changed files with 189 additions and 83 deletions

View File

@ -9506,24 +9506,15 @@ bxe_tx_mq_start_locked(struct ifnet *ifp,
BXE_FP_LOCK_ASSERT(fp);
if (m == NULL) {
/* No new work, check for pending frames. */
next = drbr_dequeue(ifp, fp->br);
} else if (drbr_needs_enqueue(ifp, fp->br)) {
/* Both new and pending work, maintain packet order. */
if (m != NULL) {
rc = drbr_enqueue(ifp, fp->br, m);
if (rc != 0) {
fp->tx_soft_errors++;
goto bxe_tx_mq_start_locked_exit;
}
next = drbr_dequeue(ifp, fp->br);
} else
/* New work only, nothing pending. */
next = m;
}
/* Keep adding entries while there are frames to send. */
while (next != NULL) {
while ((next = drbr_peek(ifp, fp->br)) != NULL) {
/* The transmit mbuf now belongs to us, keep track of it. */
fp->tx_mbuf_alloc++;
@ -9537,23 +9528,22 @@ bxe_tx_mq_start_locked(struct ifnet *ifp,
if (__predict_false(rc != 0)) {
fp->tx_encap_failures++;
/* Very Bad Frames(tm) may have been dropped. */
if (next != NULL) {
if (next == NULL) {
drbr_advance(ifp, fp->br);
} else {
drbr_putback(ifp, fp->br, next);
/*
* Mark the TX queue as full and save
* the frame.
*/
ifp->if_drv_flags |= IFF_DRV_OACTIVE;
fp->tx_frame_deferred++;
/* This may reorder frame. */
rc = drbr_enqueue(ifp, fp->br, next);
fp->tx_mbuf_alloc--;
}
/* Stop looking for more work. */
break;
}
drbr_advance(ifp, fp->br);
/* The transmit frame was enqueued successfully. */
tx_count++;
@ -9574,8 +9564,6 @@ bxe_tx_mq_start_locked(struct ifnet *ifp,
ifp->if_drv_flags &= ~IFF_DRV_OACTIVE;
break;
}
next = drbr_dequeue(ifp, fp->br);
}
/* No TX packets were dequeued. */

View File

@ -905,22 +905,24 @@ em_mq_start_locked(struct ifnet *ifp, struct tx_ring *txr, struct mbuf *m)
}
enq = 0;
if (m == NULL) {
next = drbr_dequeue(ifp, txr->br);
} else if (drbr_needs_enqueue(ifp, txr->br)) {
if ((err = drbr_enqueue(ifp, txr->br, m)) != 0)
if (m != NULL) {
err = drbr_enqueue(ifp, txr->br, m);
if (err) {
return (err);
next = drbr_dequeue(ifp, txr->br);
} else
next = m;
}
}
/* Process the queue */
while (next != NULL) {
while ((next = drbr_peek(ifp, txr->br)) != NULL) {
if ((err = em_xmit(txr, &next)) != 0) {
if (next != NULL)
err = drbr_enqueue(ifp, txr->br, next);
break;
if (next == NULL) {
drbr_advance(ifp, txr->br);
} else {
drbr_putback(ifp, txr->br, next);
}
break;
}
drbr_advance(ifp, txr->br);
enq++;
ifp->if_obytes += next->m_pkthdr.len;
if (next->m_flags & M_MCAST)
@ -928,7 +930,6 @@ em_mq_start_locked(struct ifnet *ifp, struct tx_ring *txr, struct mbuf *m)
ETHER_BPF_MTAP(ifp, next);
if ((ifp->if_drv_flags & IFF_DRV_RUNNING) == 0)
break;
next = drbr_dequeue(ifp, txr->br);
}
if (enq > 0) {

View File

@ -350,6 +350,16 @@ TUNABLE_INT("hw.igb.max_interrupt_rate", &igb_max_interrupt_rate);
SYSCTL_INT(_hw_igb, OID_AUTO, max_interrupt_rate, CTLFLAG_RDTUN,
&igb_max_interrupt_rate, 0, "Maximum interrupts per second");
#if __FreeBSD_version >= 800000
/*
** Tuneable number of buffers in the buf-ring (drbr_xxx)
*/
static int igb_buf_ring_size = IGB_BR_SIZE;
TUNABLE_INT("hw.igb.buf_ring_size", &igb_buf_ring_size);
SYSCTL_INT(_hw_igb, OID_AUTO, buf_ring_size, CTLFLAG_RDTUN,
&igb_buf_ring_size, 0, "Size of the bufring");
#endif
/*
** Header split causes the packet header to
** be dma'd to a separate mbuf from the payload.
@ -965,12 +975,13 @@ igb_mq_start(struct ifnet *ifp, struct mbuf *m)
** out-of-order delivery, but
** settle for it if that fails
*/
if (m)
if (m != NULL)
drbr_enqueue(ifp, txr->br, m);
err = igb_mq_start_locked(ifp, txr);
IGB_TX_UNLOCK(txr);
} else {
err = drbr_enqueue(ifp, txr->br, m);
if (m != NULL)
err = drbr_enqueue(ifp, txr->br, m);
taskqueue_enqueue(que->tq, &txr->txq_task);
}
@ -994,12 +1005,22 @@ igb_mq_start_locked(struct ifnet *ifp, struct tx_ring *txr)
enq = 0;
/* Process the queue */
while ((next = drbr_dequeue(ifp, txr->br)) != NULL) {
while ((next = drbr_peek(ifp, txr->br)) != NULL) {
if ((err = igb_xmit(txr, &next)) != 0) {
if (next != NULL)
err = drbr_enqueue(ifp, txr->br, next);
if (next == NULL) {
/* It was freed, move forward */
drbr_advance(ifp, txr->br);
} else {
/*
* Still have one left, it may not be
* the same since the transmit function
* may have changed it.
*/
drbr_putback(ifp, txr->br, next);
}
break;
}
drbr_advance(ifp, txr->br);
enq++;
ifp->if_obytes += next->m_pkthdr.len;
if (next->m_flags & M_MCAST)
@ -3301,7 +3322,7 @@ igb_allocate_queues(struct adapter *adapter)
}
#if __FreeBSD_version >= 800000
/* Allocate a buf ring */
txr->br = buf_ring_alloc(IGB_BR_SIZE, M_DEVBUF,
txr->br = buf_ring_alloc(igb_buf_ring_size, M_DEVBUF,
M_WAITOK, &txr->tx_mtx);
#endif
}

View File

@ -832,22 +832,24 @@ ixgbe_mq_start_locked(struct ifnet *ifp, struct tx_ring *txr, struct mbuf *m)
}
enqueued = 0;
if (m == NULL) {
next = drbr_dequeue(ifp, txr->br);
} else if (drbr_needs_enqueue(ifp, txr->br)) {
if ((err = drbr_enqueue(ifp, txr->br, m)) != 0)
if (m != NULL) {
err = drbr_enqueue(ifp, txr->br, m);
if (err) {
return (err);
next = drbr_dequeue(ifp, txr->br);
} else
next = m;
}
}
/* Process the queue */
while (next != NULL) {
while ((next = drbr_peek(ifp, txr->br)) != NULL) {
if ((err = ixgbe_xmit(txr, &next)) != 0) {
if (next != NULL)
err = drbr_enqueue(ifp, txr->br, next);
if (next == NULL) {
drbr_advance(ifp, txr->br);
} else {
drbr_putback(ifp, txr->br, next);
}
break;
}
drbr_advance(ifp, txr->br);
enqueued++;
/* Send a copy of the frame to the BPF listener */
ETHER_BPF_MTAP(ifp, next);
@ -855,7 +857,6 @@ ixgbe_mq_start_locked(struct ifnet *ifp, struct tx_ring *txr, struct mbuf *m)
break;
if (txr->tx_avail < IXGBE_TX_OP_THRESHOLD)
ixgbe_txeof(txr);
next = drbr_dequeue(ifp, txr->br);
}
if (enqueued > 0) {

View File

@ -620,22 +620,23 @@ ixv_mq_start_locked(struct ifnet *ifp, struct tx_ring *txr, struct mbuf *m)
ixv_txeof(txr);
enqueued = 0;
if (m == NULL) {
next = drbr_dequeue(ifp, txr->br);
} else if (drbr_needs_enqueue(ifp, txr->br)) {
if ((err = drbr_enqueue(ifp, txr->br, m)) != 0)
if (m != NULL) {
err = drbr_enqueue(ifp, txr->br, m);
if (err) {
return (err);
next = drbr_dequeue(ifp, txr->br);
} else
next = m;
}
}
/* Process the queue */
while (next != NULL) {
while ((next = drbr_peek(ifp, txr->br)) != NULL) {
if ((err = ixv_xmit(txr, &next)) != 0) {
if (next != NULL)
err = drbr_enqueue(ifp, txr->br, next);
if (next == NULL) {
drbr_advance(ifp, txr->br);
} else {
drbr_putback(ifp, txr->br, next);
}
break;
}
drbr_advance(ifp, txr->br);
enqueued++;
ifp->if_obytes += next->m_pkthdr.len;
if (next->m_flags & M_MCAST)
@ -648,7 +649,6 @@ ixv_mq_start_locked(struct ifnet *ifp, struct tx_ring *txr, struct mbuf *m)
ifp->if_drv_flags |= IFF_DRV_OACTIVE;
break;
}
next = drbr_dequeue(ifp, txr->br);
}
if (enqueued > 0) {

View File

@ -1166,29 +1166,27 @@ oce_multiq_transmit(struct ifnet *ifp, struct mbuf *m, struct oce_wq *wq)
return status;
}
if (m == NULL)
next = drbr_dequeue(ifp, br);
else if (drbr_needs_enqueue(ifp, br)) {
if (m != NULL) {
if ((status = drbr_enqueue(ifp, br, m)) != 0)
return status;
next = drbr_dequeue(ifp, br);
} else
next = m;
while (next != NULL) {
}
while ((next = drbr_peek(ifp, br)) != NULL) {
if (oce_tx(sc, &next, queue_index)) {
if (next != NULL) {
if (next == NULL) {
drbr_advance(ifp, br);
} else {
drbr_putback(ifp, br, next);
wq->tx_stats.tx_stops ++;
ifp->if_drv_flags |= IFF_DRV_OACTIVE;
status = drbr_enqueue(ifp, br, next);
}
break;
}
drbr_advance(ifp, br);
ifp->if_obytes += next->m_pkthdr.len;
if (next->m_flags & M_MCAST)
ifp->if_omcasts++;
ETHER_BPF_MTAP(ifp, next);
next = drbr_dequeue(ifp, br);
}
return status;

View File

@ -621,6 +621,45 @@ drbr_enqueue(struct ifnet *ifp, struct buf_ring *br, struct mbuf *m)
return (error);
}
static __inline void
drbr_putback(struct ifnet *ifp, struct buf_ring *br, struct mbuf *new)
{
	/*
	 * Return an mbuf to the head of the ring after a failed
	 * transmit attempt.  The head slot is overwritten with 'new'
	 * because the xmit path may have replaced the mbuf (pullup,
	 * defrag, etc.).  Only valid after drbr_peek() — never after
	 * drbr_dequeue() — since the head slot must still be occupied.
	 */
#ifdef ALTQ
	if (ifp != NULL && ALTQ_IS_ENABLED(&ifp->if_snd)) {
		/*
		 * In the ALTQ case drbr_peek() actually dequeued the
		 * mbuf, so prepend it back onto the interface queue.
		 */
		IFQ_DRV_PREPEND(&ifp->if_snd, new);
		return;
	}
#endif
	buf_ring_putback_sc(br, new);
}
static __inline struct mbuf *
drbr_peek(struct ifnet *ifp, struct buf_ring *br)
{
	/*
	 * Return the mbuf at the head of the ring without removing it,
	 * or NULL if the ring is empty.  The caller must follow up with
	 * either drbr_advance() (on successful transmit, or when xmit()
	 * freed the mbuf) or drbr_putback() (transmit ring full).
	 */
#ifdef ALTQ
	struct mbuf *m;
	if (ifp != NULL && ALTQ_IS_ENABLED(&ifp->if_snd)) {
		/*
		 * Pull it off like a dequeue
		 * since drbr_advance() does nothing
		 * for altq and drbr_putback() will
		 * use the old prepend function.
		 */
		IFQ_DEQUEUE(&ifp->if_snd, m);
		return (m);
	}
#endif
	return(buf_ring_peek(br));
}
static __inline void
drbr_flush(struct ifnet *ifp, struct buf_ring *br)
{
@ -648,7 +687,7 @@ drbr_dequeue(struct ifnet *ifp, struct buf_ring *br)
#ifdef ALTQ
struct mbuf *m;
if (ALTQ_IS_ENABLED(&ifp->if_snd)) {
if (ifp != NULL && ALTQ_IS_ENABLED(&ifp->if_snd)) {
IFQ_DEQUEUE(&ifp->if_snd, m);
return (m);
}
@ -656,6 +695,18 @@ drbr_dequeue(struct ifnet *ifp, struct buf_ring *br)
return (buf_ring_dequeue_sc(br));
}
static __inline void
drbr_advance(struct ifnet *ifp, struct buf_ring *br)
{
	/*
	 * Consume the head slot previously returned by drbr_peek(),
	 * committing the dequeue after a successful transmit (or after
	 * xmit() freed the mbuf).
	 */
#ifdef ALTQ
	/*
	 * Nothing to do here: in the ALTQ case drbr_peek() already
	 * dequeued the mbuf, so there is no head slot to consume.
	 */
	if (ifp != NULL && ALTQ_IS_ENABLED(&ifp->if_snd))
		return;
#endif
	/*
	 * ISO C does not permit returning a (void) expression from a
	 * void function, so call the ring primitive as a statement.
	 */
	buf_ring_advance_sc(br);
}
static __inline struct mbuf *
drbr_dequeue_cond(struct ifnet *ifp, struct buf_ring *br,
int (*func) (struct mbuf *, void *), void *arg)

View File

@ -931,22 +931,21 @@ mlx4_en_transmit_locked(struct ifnet *dev, int tx_ind, struct mbuf *m)
}
enqueued = 0;
if (m == NULL) {
next = drbr_dequeue(dev, ring->br);
} else if (drbr_needs_enqueue(dev, ring->br)) {
if (m != NULL) {
if ((err = drbr_enqueue(dev, ring->br, m)) != 0)
return (err);
next = drbr_dequeue(dev, ring->br);
} else
next = m;
}
/* Process the queue */
while (next != NULL) {
while ((next = drbr_peek(ifp, ring->br)) != NULL) {
if ((err = mlx4_en_xmit(dev, tx_ind, &next)) != 0) {
if (next != NULL)
err = drbr_enqueue(dev, ring->br, next);
if (next == NULL) {
drbr_advance(ifp, ring->br);
} else {
drbr_putback(ifp, ring->br, next);
}
break;
}
drbr_advance(ifp, ring->br);
enqueued++;
dev->if_obytes += next->m_pkthdr.len;
if (next->m_flags & M_MCAST)
@ -955,7 +954,6 @@ mlx4_en_transmit_locked(struct ifnet *dev, int tx_ind, struct mbuf *m)
ETHER_BPF_MTAP(dev, next);
if ((dev->if_drv_flags & IFF_DRV_RUNNING) == 0)
break;
next = drbr_dequeue(dev, ring->br);
}
if (enqueued > 0)

View File

@ -207,6 +207,54 @@ buf_ring_dequeue_sc(struct buf_ring *br)
return (buf);
}
/*
 * Single-consumer advance after a peek: commit the dequeue of the
 * head entry previously returned by buf_ring_peek().  Must only be
 * used where the consumer side is protected by a lock, e.g. a
 * network driver's tx queue lock.
 */
static __inline void
buf_ring_advance_sc(struct buf_ring *br)
{
	uint32_t cons_head, cons_next;
	uint32_t prod_tail;
	cons_head = br->br_cons_head;
	prod_tail = br->br_prod_tail;
	cons_next = (cons_head + 1) & br->br_cons_mask;
	/* Empty ring: nothing was peeked, so nothing to consume. */
	if (cons_head == prod_tail)
		return;
	br->br_cons_head = cons_next;
#ifdef DEBUG_BUFRING
	/* Scrub the consumed slot so stale pointers are caught early. */
	br->br_ring[cons_head] = NULL;
#endif
	br->br_cons_tail = cons_next;
}
/*
 * Used to return a buffer (most likely already there)
 * to the top of the ring.  The caller should *not*
 * have used any dequeue to pull it out of the ring
 * but instead should have used the peek() function.
 * This is normally used where the transmit queue
 * of a driver is full, and an mbuf must be returned.
 * Most likely what's in the ring buffer is what
 * is being put back (since it was not removed), but
 * sometimes the lower transmit function may have
 * done a pullup or other operation that will have
 * changed it.  As an optimization we always store it
 * back (since jhb says the store is probably cheaper
 * than a compare-and-branch); if we ever do a
 * multi-consumer version we will need the compare
 * and an atomic.
 */
static __inline void
buf_ring_putback_sc(struct buf_ring *br, void *new)
{
	/* The head slot must still be occupied: putback without a peek
	 * (or on an empty ring) indicates a caller bug. */
	KASSERT(br->br_cons_head != br->br_prod_tail,
		("Buf-Ring has none in putback"));
	br->br_ring[br->br_cons_head] = new;
}
/*
* return a pointer to the first entry in the ring
* without modifying it, or NULL if the ring is empty