cxgbe(4): changes in the Tx path to help increase tx coalescing.

- Ask the firmware for the number of frames that can be stuffed into one
  work request (see the sketch after this list).

- Modify mp_ring to increase the likelihood of tx coalescing when there
  are just one or two threads that are doing most of the tx.  Add teeth
  to the abdication mechanism by pushing the consumer lock into mp_ring.
  This reduces the likelihood that a consumer will get stuck with all
  the work even though it is above its budget.

- Add support for coalesced tx WR to the VF driver.  This, with the
  changes above, results in a 7x improvement in the tx pps of the VF
  driver for some common cases.  The firmware vets the L2 headers
  submitted by the VF driver and it's a big win if the checks are
  performed for a batch of packets and not each one individually.
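
The query itself is a one-liner per driver; the following is a condensed copy
of the get_params__post_init() hunks further down this page (PF and VF
respectively), shown only for orientation.  Firmware that does not know the
parameter falls back to a limit of 15 (PF) or 14 (VF) packets per work
request, as in the hunks below.

    /* PF driver */
    param[0] = FW_PARAM_PFVF(MAX_PKTS_PER_ETH_TX_PKTS_WR);
    rc = -t4_query_params(sc, sc->mbox, sc->pf, 0, 1, param, val);
    if (rc == 0)
            sc->params.max_pkts_per_eth_tx_pkts_wr = val[0];
    else
            sc->params.max_pkts_per_eth_tx_pkts_wr = 15;

    /* VF driver */
    param = FW_PARAM_PFVF(MAX_PKTS_PER_ETH_TX_PKTS_WR);
    rc = -t4vf_query_params(sc, 1, &param, &val);
    if (rc == 0)
            sc->params.max_pkts_per_eth_tx_pkts_wr = val;
    else
            sc->params.max_pkts_per_eth_tx_pkts_wr = 14;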

Reviewed by:	jhb@
MFC after:	2 weeks
Sponsored by:	Chelsio Communications
Differential Revision:	https://reviews.freebsd.org/D25454
Navdeep Parhar 2020-07-03 04:44:23 +00:00
parent 2749666d01
commit d735920d33
Notes: svn2git 2020-12-20 02:59:44 +00:00
svn path=/head/; revision=362905
7 changed files with 802 additions and 349 deletions


@@ -550,6 +550,23 @@ struct sge_fl {
struct mp_ring;
struct txpkts {
uint8_t wr_type; /* type 0 or type 1 */
uint8_t npkt; /* # of packets in this work request */
uint8_t len16; /* # of 16B pieces used by this work request */
uint8_t score; /* 1-10. coalescing attempted if score > 3 */
uint8_t max_npkt; /* maximum number of packets allowed */
uint16_t plen; /* total payload (sum of all packets) */
/* straight from fw_eth_tx_pkts_vm_wr. */
__u8 ethmacdst[6];
__u8 ethmacsrc[6];
__be16 ethtype;
__be16 vlantci;
struct mbuf *mb[15];
};
/* txq: SGE egress queue + what's needed for Ethernet NIC */
struct sge_txq {
struct sge_eq eq; /* MUST be first */
@@ -560,6 +577,7 @@ struct sge_txq {
struct sglist *gl;
__be32 cpl_ctrl0; /* for convenience */
int tc_idx; /* traffic class */
struct txpkts txp;
struct task tx_reclaim_task;
/* stats for common events first */
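
The new struct txpkts doubles as a per-queue staging area (txq->txp): frames
are parked there across calls into the tx drain routine until a batch can be
flushed in a single work request.  Roughly, the contract between eth_tx() and
the add_to_txpkts_vf()/add_to_txpkts_pf() helpers added later in this diff is
as follows (a paraphrase of those hunks, not new code):

    rc = add_to_txpkts_vf(sc, txq, m0, avail, &send);  /* or _pf() on the PF */
    if (send) {
            /* txq->txp holds a finished batch: flush it now, as a txpkts
               WR if it has more than one frame or a plain txpkt WR
               otherwise. */
    }
    if (rc == 0) {
            /* m0 was absorbed into txq->txp and will go out with a batch. */
    } else if (rc == EAGAIN) {
            /* m0 could not join the (just flushed) batch; it starts a new one. */
    } else {
            /* m0 is unsuitable for coalescing and gets its own WR. */
    }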


@@ -389,6 +389,7 @@ struct adapter_params {
bool ulptx_memwrite_dsgl; /* use of T5 DSGL allowed */
bool fr_nsmr_tpte_wr_support; /* FW support for FR_NSMR_TPTE_WR */
bool viid_smt_extn_support; /* FW returns vin, vfvld & smt index? */
unsigned int max_pkts_per_eth_tx_pkts_wr;
};
#define CHELSIO_T4 0x4


@@ -2191,7 +2191,7 @@ cxgbe_transmit(struct ifnet *ifp, struct mbuf *m)
vi->rsrv_noflowq);
items[0] = m;
rc = mp_ring_enqueue(txq->r, items, 1, 4096);
rc = mp_ring_enqueue(txq->r, items, 1, 256);
if (__predict_false(rc != 0))
m_freem(m);
@@ -2212,7 +2212,7 @@ cxgbe_qflush(struct ifnet *ifp)
txq->eq.flags |= EQ_QFLUSH;
TXQ_UNLOCK(txq);
while (!mp_ring_is_idle(txq->r)) {
mp_ring_check_drainage(txq->r, 0);
mp_ring_check_drainage(txq->r, 4096);
pause("qflush", 1);
}
TXQ_LOCK(txq);
@@ -2261,7 +2261,7 @@ vi_get_counter(struct ifnet *ifp, ift_counter c)
struct sge_txq *txq;
for_each_txq(vi, i, txq)
drops += counter_u64_fetch(txq->r->drops);
drops += counter_u64_fetch(txq->r->dropped);
}
return (drops);
@@ -2326,7 +2326,7 @@ cxgbe_get_counter(struct ifnet *ifp, ift_counter c)
struct sge_txq *txq;
for_each_txq(vi, i, txq)
drops += counter_u64_fetch(txq->r->drops);
drops += counter_u64_fetch(txq->r->dropped);
}
return (drops);
@@ -4457,6 +4457,13 @@ get_params__post_init(struct adapter *sc)
else
sc->params.fr_nsmr_tpte_wr_support = false;
param[0] = FW_PARAM_PFVF(MAX_PKTS_PER_ETH_TX_PKTS_WR);
rc = -t4_query_params(sc, sc->mbox, sc->pf, 0, 1, param, val);
if (rc == 0)
sc->params.max_pkts_per_eth_tx_pkts_wr = val[0];
else
sc->params.max_pkts_per_eth_tx_pkts_wr = 15;
/* get capabilites */
bzero(&caps, sizeof(caps));
caps.op_to_write = htobe32(V_FW_CMD_OP(FW_CAPS_CONFIG_CMD) |
@@ -5965,7 +5972,7 @@ quiesce_txq(struct adapter *sc, struct sge_txq *txq)
/* Wait for the mp_ring to empty. */
while (!mp_ring_is_idle(txq->r)) {
mp_ring_check_drainage(txq->r, 0);
mp_ring_check_drainage(txq->r, 4096);
pause("rquiesce", 1);
}


@@ -34,6 +34,8 @@ __FBSDID("$FreeBSD$");
#include <sys/counter.h>
#include <sys/lock.h>
#include <sys/malloc.h>
#include <sys/mutex.h>
#include <sys/sysctl.h>
#include <machine/cpu.h>
#include "t4_mp_ring.h"
@@ -43,6 +45,23 @@ __FBSDID("$FreeBSD$");
#define atomic_cmpset_rel_64 atomic_cmpset_64
#endif
/*
* mp_ring handles multiple threads (producers) enqueueing data to a tx queue.
* The thread that is writing the hardware descriptors is the consumer and it
* runs with the consumer lock held. A producer becomes the consumer if there
isn't one already. The consumer runs with the flags set to BUSY and
* consumes everything (IDLE or COALESCING) or gets STALLED. If it is running
* over its budget it sets flags to TOO_BUSY. A producer that observes a
* TOO_BUSY consumer will become the new consumer by setting flags to
* TAKING_OVER. The original consumer stops and sets the flags back to BUSY for
* the new consumer.
*
* COALESCING is the same as IDLE except there are items being held in the hope
* that they can be coalesced with items that follow. The driver must arrange
* for a tx update or some other event that transmits all the held items in a
* timely manner if nothing else is enqueued.
*/
union ring_state {
struct {
uint16_t pidx_head;
@@ -54,11 +73,19 @@ union ring_state {
};
enum {
IDLE = 0, /* consumer ran to completion, nothing more to do. */
IDLE = 0, /* tx is all caught up, nothing to do. */
COALESCING, /* IDLE, but tx frames are being held for coalescing */
BUSY, /* consumer is running already, or will be shortly. */
TOO_BUSY, /* consumer is running and is beyond its budget */
TAKING_OVER, /* new consumer taking over from a TOO_BUSY consumer */
STALLED, /* consumer stopped due to lack of resources. */
ABDICATED, /* consumer stopped even though there was work to be
done because it wants another thread to take over. */
};
enum {
C_FAST = 0,
C_2,
C_3,
C_TAKEOVER,
};
static inline uint16_t
@@ -83,90 +110,101 @@ increment_idx(struct mp_ring *r, uint16_t idx, uint16_t n)
return (x > n ? idx + n : n - x);
}
/* Consumer is about to update the ring's state to s */
static inline uint16_t
state_to_flags(union ring_state s, int abdicate)
{
if (s.cidx == s.pidx_tail)
return (IDLE);
else if (abdicate && s.pidx_tail != s.pidx_head)
return (ABDICATED);
return (BUSY);
}
/*
* Caller passes in a state, with a guarantee that there is work to do and that
* all items up to the pidx_tail in the state are visible.
* Consumer. Called with the consumer lock held and a guarantee that there is
* work to do.
*/
static void
drain_ring(struct mp_ring *r, union ring_state os, uint16_t prev, int budget)
drain_ring(struct mp_ring *r, int budget)
{
union ring_state ns;
union ring_state os, ns;
int n, pending, total;
uint16_t cidx = os.cidx;
uint16_t pidx = os.pidx_tail;
uint16_t cidx;
uint16_t pidx;
bool coalescing;
mtx_assert(r->cons_lock, MA_OWNED);
os.state = atomic_load_acq_64(&r->state);
MPASS(os.flags == BUSY);
cidx = os.cidx;
pidx = os.pidx_tail;
MPASS(cidx != pidx);
if (prev == IDLE)
counter_u64_add(r->starts, 1);
pending = 0;
total = 0;
while (cidx != pidx) {
/* Items from cidx to pidx are available for consumption. */
n = r->drain(r, cidx, pidx);
n = r->drain(r, cidx, pidx, &coalescing);
if (n == 0) {
critical_enter();
os.state = r->state;
os.state = atomic_load_64(&r->state);
do {
ns.state = os.state;
ns.cidx = cidx;
ns.flags = STALLED;
MPASS(os.flags == BUSY ||
os.flags == TOO_BUSY ||
os.flags == TAKING_OVER);
if (os.flags == TAKING_OVER)
ns.flags = BUSY;
else
ns.flags = STALLED;
} while (atomic_fcmpset_64(&r->state, &os.state,
ns.state) == 0);
critical_exit();
if (prev != STALLED)
if (os.flags == TAKING_OVER)
counter_u64_add(r->abdications, 1);
else if (ns.flags == STALLED)
counter_u64_add(r->stalls, 1);
else if (total > 0) {
counter_u64_add(r->restarts, 1);
counter_u64_add(r->stalls, 1);
}
break;
}
cidx = increment_idx(r, cidx, n);
pending += n;
total += n;
counter_u64_add(r->consumed, n);
/*
* We update the cidx only if we've caught up with the pidx, the
* real cidx is getting too far ahead of the one visible to
* everyone else, or we have exceeded our budget.
*/
if (cidx != pidx && pending < 64 && total < budget)
continue;
critical_enter();
os.state = r->state;
os.state = atomic_load_64(&r->state);
do {
MPASS(os.flags == BUSY || os.flags == TOO_BUSY ||
os.flags == TAKING_OVER);
ns.state = os.state;
ns.cidx = cidx;
ns.flags = state_to_flags(ns, total >= budget);
} while (atomic_fcmpset_acq_64(&r->state, &os.state, ns.state) == 0);
critical_exit();
if (ns.flags == ABDICATED)
counter_u64_add(r->abdications, 1);
if (ns.flags != BUSY) {
/* Wrong loop exit if we're going to stall. */
MPASS(ns.flags != STALLED);
if (prev == STALLED) {
MPASS(total > 0);
counter_u64_add(r->restarts, 1);
if (__predict_false(os.flags == TAKING_OVER)) {
MPASS(total >= budget);
ns.flags = BUSY;
continue;
}
if (cidx == os.pidx_tail) {
ns.flags = coalescing ? COALESCING : IDLE;
continue;
}
if (total >= budget) {
ns.flags = TOO_BUSY;
continue;
}
MPASS(os.flags == BUSY);
if (pending < 32)
break;
} while (atomic_fcmpset_acq_64(&r->state, &os.state, ns.state) == 0);
if (__predict_false(os.flags == TAKING_OVER)) {
MPASS(ns.flags == BUSY);
counter_u64_add(r->abdications, 1);
break;
}
if (ns.flags == IDLE || ns.flags == COALESCING) {
MPASS(ns.pidx_tail == cidx);
if (ns.pidx_head != ns.pidx_tail)
counter_u64_add(r->cons_idle2, 1);
else
counter_u64_add(r->cons_idle, 1);
break;
}
@@ -177,13 +215,55 @@ drain_ring(struct mp_ring *r, union ring_state os, uint16_t prev, int budget)
pidx = ns.pidx_tail;
pending = 0;
}
#ifdef INVARIANTS
if (os.flags == TAKING_OVER)
MPASS(ns.flags == BUSY);
else {
MPASS(ns.flags == IDLE || ns.flags == COALESCING ||
ns.flags == STALLED);
}
#endif
}
static void
drain_txpkts(struct mp_ring *r, union ring_state os, int budget)
{
union ring_state ns;
uint16_t cidx = os.cidx;
uint16_t pidx = os.pidx_tail;
bool coalescing;
mtx_assert(r->cons_lock, MA_OWNED);
MPASS(os.flags == BUSY);
MPASS(cidx == pidx);
r->drain(r, cidx, pidx, &coalescing);
MPASS(coalescing == false);
critical_enter();
os.state = atomic_load_64(&r->state);
do {
ns.state = os.state;
MPASS(os.flags == BUSY);
MPASS(os.cidx == cidx);
if (ns.cidx == ns.pidx_tail)
ns.flags = IDLE;
else
ns.flags = BUSY;
} while (atomic_fcmpset_acq_64(&r->state, &os.state, ns.state) == 0);
critical_exit();
if (ns.flags == BUSY)
drain_ring(r, budget);
}
int
mp_ring_alloc(struct mp_ring **pr, int size, void *cookie, ring_drain_t drain,
ring_can_drain_t can_drain, struct malloc_type *mt, int flags)
ring_can_drain_t can_drain, struct malloc_type *mt, struct mtx *lck,
int flags)
{
struct mp_ring *r;
int i;
/* All idx are 16b so size can be 65536 at most */
if (pr == NULL || size < 2 || size > 65536 || drain == NULL ||
@@ -201,43 +281,59 @@ mp_ring_alloc(struct mp_ring **pr, int size, void *cookie, ring_drain_t drain,
r->mt = mt;
r->drain = drain;
r->can_drain = can_drain;
r->enqueues = counter_u64_alloc(flags);
r->drops = counter_u64_alloc(flags);
r->starts = counter_u64_alloc(flags);
r->stalls = counter_u64_alloc(flags);
r->restarts = counter_u64_alloc(flags);
r->abdications = counter_u64_alloc(flags);
if (r->enqueues == NULL || r->drops == NULL || r->starts == NULL ||
r->stalls == NULL || r->restarts == NULL ||
r->abdications == NULL) {
mp_ring_free(r);
return (ENOMEM);
r->cons_lock = lck;
if ((r->dropped = counter_u64_alloc(flags)) == NULL)
goto failed;
for (i = 0; i < nitems(r->consumer); i++) {
if ((r->consumer[i] = counter_u64_alloc(flags)) == NULL)
goto failed;
}
if ((r->not_consumer = counter_u64_alloc(flags)) == NULL)
goto failed;
if ((r->abdications = counter_u64_alloc(flags)) == NULL)
goto failed;
if ((r->stalls = counter_u64_alloc(flags)) == NULL)
goto failed;
if ((r->consumed = counter_u64_alloc(flags)) == NULL)
goto failed;
if ((r->cons_idle = counter_u64_alloc(flags)) == NULL)
goto failed;
if ((r->cons_idle2 = counter_u64_alloc(flags)) == NULL)
goto failed;
*pr = r;
return (0);
failed:
mp_ring_free(r);
return (ENOMEM);
}
void
mp_ring_free(struct mp_ring *r)
{
int i;
if (r == NULL)
return;
if (r->enqueues != NULL)
counter_u64_free(r->enqueues);
if (r->drops != NULL)
counter_u64_free(r->drops);
if (r->starts != NULL)
counter_u64_free(r->starts);
if (r->stalls != NULL)
counter_u64_free(r->stalls);
if (r->restarts != NULL)
counter_u64_free(r->restarts);
if (r->dropped != NULL)
counter_u64_free(r->dropped);
for (i = 0; i < nitems(r->consumer); i++) {
if (r->consumer[i] != NULL)
counter_u64_free(r->consumer[i]);
}
if (r->not_consumer != NULL)
counter_u64_free(r->not_consumer);
if (r->abdications != NULL)
counter_u64_free(r->abdications);
if (r->stalls != NULL)
counter_u64_free(r->stalls);
if (r->consumed != NULL)
counter_u64_free(r->consumed);
if (r->cons_idle != NULL)
counter_u64_free(r->cons_idle);
if (r->cons_idle2 != NULL)
counter_u64_free(r->cons_idle2);
free(r, r->mt);
}
@@ -252,7 +348,8 @@ mp_ring_enqueue(struct mp_ring *r, void **items, int n, int budget)
{
union ring_state os, ns;
uint16_t pidx_start, pidx_stop;
int i;
int i, nospc, cons;
bool consumer;
MPASS(items != NULL);
MPASS(n > 0);
@@ -261,26 +358,70 @@ mp_ring_enqueue(struct mp_ring *r, void **items, int n, int budget)
* Reserve room for the new items. Our reservation, if successful, is
* from 'pidx_start' to 'pidx_stop'.
*/
os.state = r->state;
nospc = 0;
os.state = atomic_load_64(&r->state);
for (;;) {
if (n >= space_available(r, os)) {
counter_u64_add(r->drops, n);
for (;;) {
if (__predict_true(space_available(r, os) >= n))
break;
/* Not enough room in the ring. */
MPASS(os.flags != IDLE);
MPASS(os.flags != COALESCING);
if (__predict_false(++nospc > 100)) {
counter_u64_add(r->dropped, n);
return (ENOBUFS);
}
if (os.flags == STALLED)
mp_ring_check_drainage(r, 0);
return (ENOBUFS);
mp_ring_check_drainage(r, 64);
else
cpu_spinwait();
os.state = atomic_load_64(&r->state);
}
/* There is room in the ring. */
cons = -1;
ns.state = os.state;
ns.pidx_head = increment_idx(r, os.pidx_head, n);
if (os.flags == IDLE || os.flags == COALESCING) {
MPASS(os.pidx_tail == os.cidx);
if (os.pidx_head == os.pidx_tail) {
cons = C_FAST;
ns.pidx_tail = increment_idx(r, os.pidx_tail, n);
} else
cons = C_2;
ns.flags = BUSY;
} else if (os.flags == TOO_BUSY) {
cons = C_TAKEOVER;
ns.flags = TAKING_OVER;
}
critical_enter();
if (atomic_fcmpset_64(&r->state, &os.state, ns.state))
break;
critical_exit();
cpu_spinwait();
}
};
pidx_start = os.pidx_head;
pidx_stop = ns.pidx_head;
if (cons == C_FAST) {
i = pidx_start;
do {
r->items[i] = *items++;
if (__predict_false(++i == r->size))
i = 0;
} while (i != pidx_stop);
critical_exit();
counter_u64_add(r->consumer[C_FAST], 1);
mtx_lock(r->cons_lock);
drain_ring(r, budget);
mtx_unlock(r->cons_lock);
return (0);
}
/*
* Wait for other producers who got in ahead of us to enqueue their
* items, one producer at a time. It is our turn when the ring's
@@ -288,7 +429,7 @@ mp_ring_enqueue(struct mp_ring *r, void **items, int n, int budget)
*/
while (ns.pidx_tail != pidx_start) {
cpu_spinwait();
ns.state = r->state;
ns.state = atomic_load_64(&r->state);
}
/* Now it is our turn to fill up the area we reserved earlier. */
@@ -303,21 +444,33 @@ mp_ring_enqueue(struct mp_ring *r, void **items, int n, int budget)
* Update the ring's pidx_tail. The release style atomic guarantees
* that the items are visible to any thread that sees the updated pidx.
*/
os.state = r->state;
os.state = atomic_load_64(&r->state);
do {
consumer = false;
ns.state = os.state;
ns.pidx_tail = pidx_stop;
ns.flags = BUSY;
if (os.flags == IDLE || os.flags == COALESCING ||
(os.flags == STALLED && r->can_drain(r))) {
MPASS(cons == -1);
consumer = true;
ns.flags = BUSY;
}
} while (atomic_fcmpset_rel_64(&r->state, &os.state, ns.state) == 0);
critical_exit();
counter_u64_add(r->enqueues, n);
/*
* Turn into a consumer if some other thread isn't active as a consumer
* already.
*/
if (os.flags != BUSY)
drain_ring(r, ns, os.flags, budget);
if (cons == -1) {
if (consumer)
cons = C_3;
else {
counter_u64_add(r->not_consumer, 1);
return (0);
}
}
MPASS(cons > C_FAST && cons < nitems(r->consumer));
counter_u64_add(r->consumer[cons], 1);
mtx_lock(r->cons_lock);
drain_ring(r, budget);
mtx_unlock(r->cons_lock);
return (0);
}
@@ -327,46 +480,96 @@ mp_ring_check_drainage(struct mp_ring *r, int budget)
{
union ring_state os, ns;
os.state = r->state;
if (os.flags != STALLED || os.pidx_head != os.pidx_tail ||
r->can_drain(r) == 0)
return;
MPASS(os.cidx != os.pidx_tail); /* implied by STALLED */
ns.state = os.state;
ns.flags = BUSY;
/*
* The acquire style atomic guarantees visibility of items associated
* with the pidx that we read here.
*/
if (!atomic_cmpset_acq_64(&r->state, os.state, ns.state))
return;
drain_ring(r, ns, os.flags, budget);
os.state = atomic_load_64(&r->state);
if (os.flags == STALLED && r->can_drain(r)) {
MPASS(os.cidx != os.pidx_tail); /* implied by STALLED */
ns.state = os.state;
ns.flags = BUSY;
if (atomic_cmpset_acq_64(&r->state, os.state, ns.state)) {
mtx_lock(r->cons_lock);
drain_ring(r, budget);
mtx_unlock(r->cons_lock);
}
} else if (os.flags == COALESCING) {
MPASS(os.cidx == os.pidx_tail);
ns.state = os.state;
ns.flags = BUSY;
if (atomic_cmpset_acq_64(&r->state, os.state, ns.state)) {
mtx_lock(r->cons_lock);
drain_txpkts(r, ns, budget);
mtx_unlock(r->cons_lock);
}
}
}
void
mp_ring_reset_stats(struct mp_ring *r)
{
int i;
counter_u64_zero(r->enqueues);
counter_u64_zero(r->drops);
counter_u64_zero(r->starts);
counter_u64_zero(r->stalls);
counter_u64_zero(r->restarts);
counter_u64_zero(r->dropped);
for (i = 0; i < nitems(r->consumer); i++)
counter_u64_zero(r->consumer[i]);
counter_u64_zero(r->not_consumer);
counter_u64_zero(r->abdications);
counter_u64_zero(r->stalls);
counter_u64_zero(r->consumed);
counter_u64_zero(r->cons_idle);
counter_u64_zero(r->cons_idle2);
}
int
bool
mp_ring_is_idle(struct mp_ring *r)
{
union ring_state s;
s.state = r->state;
s.state = atomic_load_64(&r->state);
if (s.pidx_head == s.pidx_tail && s.pidx_tail == s.cidx &&
s.flags == IDLE)
return (1);
return (true);
return (0);
return (false);
}
void
mp_ring_sysctls(struct mp_ring *r, struct sysctl_ctx_list *ctx,
struct sysctl_oid_list *children)
{
struct sysctl_oid *oid;
oid = SYSCTL_ADD_NODE(ctx, children, OID_AUTO, "mp_ring", CTLFLAG_RD |
CTLFLAG_MPSAFE, NULL, "mp_ring statistics");
children = SYSCTL_CHILDREN(oid);
SYSCTL_ADD_U64(ctx, children, OID_AUTO, "state", CTLFLAG_RD,
__DEVOLATILE(uint64_t *, &r->state), 0, "ring state");
SYSCTL_ADD_COUNTER_U64(ctx, children, OID_AUTO, "dropped", CTLFLAG_RD,
&r->dropped, "# of items dropped");
SYSCTL_ADD_COUNTER_U64(ctx, children, OID_AUTO, "consumed",
CTLFLAG_RD, &r->consumed, "# of items consumed");
SYSCTL_ADD_COUNTER_U64(ctx, children, OID_AUTO, "fast_consumer",
CTLFLAG_RD, &r->consumer[C_FAST],
"# of times producer became consumer (fast)");
SYSCTL_ADD_COUNTER_U64(ctx, children, OID_AUTO, "consumer2",
CTLFLAG_RD, &r->consumer[C_2],
"# of times producer became consumer (2)");
SYSCTL_ADD_COUNTER_U64(ctx, children, OID_AUTO, "consumer3",
CTLFLAG_RD, &r->consumer[C_3],
"# of times producer became consumer (3)");
SYSCTL_ADD_COUNTER_U64(ctx, children, OID_AUTO, "takeovers",
CTLFLAG_RD, &r->consumer[C_TAKEOVER],
"# of times producer took over from another consumer.");
SYSCTL_ADD_COUNTER_U64(ctx, children, OID_AUTO, "not_consumer",
CTLFLAG_RD, &r->not_consumer,
"# of times producer did not become consumer");
SYSCTL_ADD_COUNTER_U64(ctx, children, OID_AUTO, "abdications",
CTLFLAG_RD, &r->abdications, "# of consumer abdications");
SYSCTL_ADD_COUNTER_U64(ctx, children, OID_AUTO, "stalls",
CTLFLAG_RD, &r->stalls, "# of consumer stalls");
SYSCTL_ADD_COUNTER_U64(ctx, children, OID_AUTO, "cons_idle",
CTLFLAG_RD, &r->cons_idle,
"# of times consumer ran fully to completion");
SYSCTL_ADD_COUNTER_U64(ctx, children, OID_AUTO, "cons_idle2",
CTLFLAG_RD, &r->cons_idle2,
"# of times consumer idled when another enqueue was in progress");
}


@@ -36,33 +36,38 @@
#endif
struct mp_ring;
typedef u_int (*ring_drain_t)(struct mp_ring *, u_int, u_int);
typedef u_int (*ring_drain_t)(struct mp_ring *, u_int, u_int, bool *);
typedef u_int (*ring_can_drain_t)(struct mp_ring *);
struct mp_ring {
volatile uint64_t state __aligned(CACHE_LINE_SIZE);
struct malloc_type * mt;
int size __aligned(CACHE_LINE_SIZE);
void * cookie;
struct malloc_type * mt;
ring_drain_t drain;
ring_can_drain_t can_drain; /* cheap, may be unreliable */
counter_u64_t enqueues;
counter_u64_t drops;
counter_u64_t starts;
counter_u64_t stalls;
counter_u64_t restarts; /* recovered after stalling */
struct mtx * cons_lock;
counter_u64_t dropped;
counter_u64_t consumer[4];
counter_u64_t not_consumer;
counter_u64_t abdications;
counter_u64_t consumed;
counter_u64_t cons_idle;
counter_u64_t cons_idle2;
counter_u64_t stalls;
void * volatile items[] __aligned(CACHE_LINE_SIZE);
};
int mp_ring_alloc(struct mp_ring **, int, void *, ring_drain_t,
ring_can_drain_t, struct malloc_type *, int);
ring_can_drain_t, struct malloc_type *, struct mtx *, int);
void mp_ring_free(struct mp_ring *);
int mp_ring_enqueue(struct mp_ring *, void **, int, int);
void mp_ring_check_drainage(struct mp_ring *, int);
void mp_ring_reset_stats(struct mp_ring *);
int mp_ring_is_idle(struct mp_ring *);
bool mp_ring_is_idle(struct mp_ring *);
void mp_ring_sysctls(struct mp_ring *, struct sysctl_ctx_list *,
struct sysctl_oid_list *);
#endif
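
Pulling the revised interface together (an orientation sketch condensed from
how alloc_txq() and eth_tx() elsewhere in this diff use it, not a separate
API): the ring now borrows the caller's mutex as the consumer lock and holds
it around every call into the drain callback, and the callback reports
through its new bool out-parameter whether it returned with frames still
parked for coalescing, which leaves the ring in COALESCING rather than IDLE.

    /* At txq creation time (alloc_txq): */
    rc = mp_ring_alloc(&txq->r, eq->sidx, txq, eth_tx, can_resume_eth_tx,
        M_CXGBE, &eq->eq_lock, M_WAITOK);

    /* The drain callback, called with that lock (the txq lock) held: */
    static u_int eth_tx(struct mp_ring *r, u_int cidx, u_int pidx,
        bool *coalescing);

Roughly, the state transitions implemented by mp_ring_enqueue(), drain_ring(),
and mp_ring_check_drainage() above are:

    IDLE/COALESCING -> BUSY       a producer enqueues and becomes the consumer
    BUSY -> TOO_BUSY              the consumer ran past its budget
    TOO_BUSY -> TAKING_OVER       a later producer volunteers to take over
    TAKING_OVER -> BUSY           the old consumer steps aside
    BUSY/TOO_BUSY -> STALLED      the drain callback could make no progress
    STALLED/COALESCING -> BUSY    mp_ring_check_drainage() restarts the ring
    BUSY -> IDLE or COALESCING    everything consumed (or parked for later)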


@@ -203,19 +203,6 @@ static int lro_mbufs = 0;
SYSCTL_INT(_hw_cxgbe, OID_AUTO, lro_mbufs, CTLFLAG_RDTUN, &lro_mbufs, 0,
"Enable presorting of LRO frames");
struct txpkts {
u_int wr_type; /* type 0 or type 1 */
u_int npkt; /* # of packets in this work request */
u_int plen; /* total payload (sum of all packets) */
u_int len16; /* # of 16B pieces used by this work request */
};
/* A packet's SGL. This + m_pkthdr has all info needed for tx */
struct sgl {
struct sglist sg;
struct sglist_seg seg[TX_SGL_SEGS];
};
static int service_iq(struct sge_iq *, int);
static int service_iq_fl(struct sge_iq *, int);
static struct mbuf *get_fl_payload(struct adapter *, struct sge_fl *, uint32_t);
@@ -284,14 +271,16 @@ static inline u_int txpkt_vm_len16(u_int, u_int);
static inline u_int txpkts0_len16(u_int);
static inline u_int txpkts1_len16(void);
static u_int write_raw_wr(struct sge_txq *, void *, struct mbuf *, u_int);
static u_int write_txpkt_wr(struct adapter *, struct sge_txq *,
struct fw_eth_tx_pkt_wr *, struct mbuf *, u_int);
static u_int write_txpkt_wr(struct adapter *, struct sge_txq *, struct mbuf *,
u_int);
static u_int write_txpkt_vm_wr(struct adapter *, struct sge_txq *,
struct fw_eth_tx_pkt_vm_wr *, struct mbuf *, u_int);
static int try_txpkts(struct mbuf *, struct mbuf *, struct txpkts *, u_int);
static int add_to_txpkts(struct mbuf *, struct txpkts *, u_int);
static u_int write_txpkts_wr(struct adapter *, struct sge_txq *,
struct fw_eth_tx_pkts_wr *, struct mbuf *, const struct txpkts *, u_int);
struct mbuf *);
static int add_to_txpkts_vf(struct adapter *, struct sge_txq *, struct mbuf *,
int, bool *);
static int add_to_txpkts_pf(struct adapter *, struct sge_txq *, struct mbuf *,
int, bool *);
static u_int write_txpkts_wr(struct adapter *, struct sge_txq *);
static u_int write_txpkts_vm_wr(struct adapter *, struct sge_txq *);
static void write_gl_to_txd(struct sge_txq *, struct mbuf *, caddr_t *, int);
static inline void copy_to_txd(struct sge_eq *, caddr_t, caddr_t *, int);
static inline void ring_eq_db(struct adapter *, struct sge_eq *, u_int);
@@ -2839,7 +2828,7 @@ can_resume_eth_tx(struct mp_ring *r)
return (total_available_tx_desc(eq) > eq->sidx / 8);
}
static inline int
static inline bool
cannot_use_txpkts(struct mbuf *m)
{
/* maybe put a GL limit too, to avoid silliness? */
@@ -2855,8 +2844,9 @@ discard_tx(struct sge_eq *eq)
}
static inline int
wr_can_update_eq(struct fw_eth_tx_pkts_wr *wr)
wr_can_update_eq(void *p)
{
struct fw_eth_tx_pkts_wr *wr = p;
switch (G_FW_WR_OP(be32toh(wr->op_pkd))) {
case FW_ULPTX_WR:
@@ -2864,159 +2854,232 @@ wr_can_update_eq(struct fw_eth_tx_pkts_wr *wr)
case FW_ETH_TX_PKTS_WR:
case FW_ETH_TX_PKTS2_WR:
case FW_ETH_TX_PKT_VM_WR:
case FW_ETH_TX_PKTS_VM_WR:
return (1);
default:
return (0);
}
}
static inline void
set_txupdate_flags(struct sge_txq *txq, u_int avail,
struct fw_eth_tx_pkt_wr *wr)
{
struct sge_eq *eq = &txq->eq;
struct txpkts *txp = &txq->txp;
if ((txp->npkt > 0 || avail < eq->sidx / 2) &&
atomic_cmpset_int(&eq->equiq, 0, 1)) {
wr->equiq_to_len16 |= htobe32(F_FW_WR_EQUEQ | F_FW_WR_EQUIQ);
eq->equeqidx = eq->pidx;
} else if (IDXDIFF(eq->pidx, eq->equeqidx, eq->sidx) >= 32) {
wr->equiq_to_len16 |= htobe32(F_FW_WR_EQUEQ);
eq->equeqidx = eq->pidx;
}
}
/*
* r->items[cidx] to r->items[pidx], with a wraparound at r->size, are ready to
* be consumed. Return the actual number consumed. 0 indicates a stall.
*/
static u_int
eth_tx(struct mp_ring *r, u_int cidx, u_int pidx)
eth_tx(struct mp_ring *r, u_int cidx, u_int pidx, bool *coalescing)
{
struct sge_txq *txq = r->cookie;
struct sge_eq *eq = &txq->eq;
struct ifnet *ifp = txq->ifp;
struct sge_eq *eq = &txq->eq;
struct txpkts *txp = &txq->txp;
struct vi_info *vi = ifp->if_softc;
struct adapter *sc = vi->adapter;
u_int total, remaining; /* # of packets */
u_int available, dbdiff; /* # of hardware descriptors */
u_int n, next_cidx;
struct mbuf *m0, *tail;
struct txpkts txp;
struct fw_eth_tx_pkts_wr *wr; /* any fw WR struct will do */
u_int n, avail, dbdiff; /* # of hardware descriptors */
int i, rc;
struct mbuf *m0;
bool snd;
void *wr; /* start of the last WR written to the ring */
TXQ_LOCK_ASSERT_OWNED(txq);
remaining = IDXDIFF(pidx, cidx, r->size);
MPASS(remaining > 0); /* Must not be called without work to do. */
total = 0;
TXQ_LOCK(txq);
if (__predict_false(discard_tx(eq))) {
for (i = 0; i < txp->npkt; i++)
m_freem(txp->mb[i]);
txp->npkt = 0;
while (cidx != pidx) {
m0 = r->items[cidx];
m_freem(m0);
if (++cidx == r->size)
cidx = 0;
}
reclaim_tx_descs(txq, 2048);
total = remaining;
goto done;
reclaim_tx_descs(txq, eq->sidx);
*coalescing = false;
return (remaining); /* emptied */
}
/* How many hardware descriptors do we have readily available. */
if (eq->pidx == eq->cidx)
available = eq->sidx - 1;
else
available = IDXDIFF(eq->cidx, eq->pidx, eq->sidx) - 1;
dbdiff = IDXDIFF(eq->pidx, eq->dbidx, eq->sidx);
if (eq->pidx == eq->cidx) {
avail = eq->sidx - 1;
if (txp->score++ >= 5)
txp->score = 5; /* tx is completely idle, reset. */
} else
avail = IDXDIFF(eq->cidx, eq->pidx, eq->sidx) - 1;
total = 0;
if (remaining == 0) {
if (txp->score-- == 1) /* egr_update had to drain txpkts */
txp->score = 1;
goto send_txpkts;
}
dbdiff = 0;
MPASS(remaining > 0);
while (remaining > 0) {
m0 = r->items[cidx];
M_ASSERTPKTHDR(m0);
MPASS(m0->m_nextpkt == NULL);
if (available < tx_len16_to_desc(mbuf_len16(m0))) {
available += reclaim_tx_descs(txq, 64);
if (available < tx_len16_to_desc(mbuf_len16(m0)))
break; /* out of descriptors */
if (avail < 2 * SGE_MAX_WR_NDESC)
avail += reclaim_tx_descs(txq, 64);
if (txp->npkt > 0 || remaining > 1 || txp->score > 3 ||
atomic_load_int(&txq->eq.equiq) != 0) {
if (sc->flags & IS_VF)
rc = add_to_txpkts_vf(sc, txq, m0, avail, &snd);
else
rc = add_to_txpkts_pf(sc, txq, m0, avail, &snd);
} else {
snd = false;
rc = EINVAL;
}
if (snd) {
MPASS(txp->npkt > 0);
for (i = 0; i < txp->npkt; i++)
ETHER_BPF_MTAP(ifp, txp->mb[i]);
if (txp->npkt > 1) {
if (txp->score++ >= 10)
txp->score = 10;
MPASS(avail >= tx_len16_to_desc(txp->len16));
if (sc->flags & IS_VF)
n = write_txpkts_vm_wr(sc, txq);
else
n = write_txpkts_wr(sc, txq);
} else {
MPASS(avail >=
tx_len16_to_desc(mbuf_len16(txp->mb[0])));
if (sc->flags & IS_VF)
n = write_txpkt_vm_wr(sc, txq,
txp->mb[0]);
else
n = write_txpkt_wr(sc, txq, txp->mb[0],
avail);
}
MPASS(n <= SGE_MAX_WR_NDESC);
avail -= n;
dbdiff += n;
wr = &eq->desc[eq->pidx];
IDXINCR(eq->pidx, n, eq->sidx);
txp->npkt = 0; /* emptied */
}
if (rc == 0) {
/* m0 was coalesced into txq->txpkts. */
goto next_mbuf;
}
if (rc == EAGAIN) {
/*
* m0 is suitable for tx coalescing but could not be
* combined with the existing txq->txpkts, which has now
* been transmitted. Start a new txpkts with m0.
*/
MPASS(snd);
MPASS(txp->npkt == 0);
continue;
}
next_cidx = cidx + 1;
if (__predict_false(next_cidx == r->size))
next_cidx = 0;
wr = (void *)&eq->desc[eq->pidx];
MPASS(rc != 0 && rc != EAGAIN);
MPASS(txp->npkt == 0);
wr = &eq->desc[eq->pidx];
if (mbuf_cflags(m0) & MC_RAW_WR) {
total++;
remaining--;
n = write_raw_wr(txq, (void *)wr, m0, available);
n = write_raw_wr(txq, wr, m0, avail);
#ifdef KERN_TLS
} else if (mbuf_cflags(m0) & MC_TLS) {
total++;
remaining--;
ETHER_BPF_MTAP(ifp, m0);
n = t6_ktls_write_wr(txq,(void *)wr, m0,
mbuf_nsegs(m0), available);
n = t6_ktls_write_wr(txq, wr, m0, mbuf_nsegs(m0),
avail);
#endif
} else if (sc->flags & IS_VF) {
total++;
remaining--;
ETHER_BPF_MTAP(ifp, m0);
n = write_txpkt_vm_wr(sc, txq, (void *)wr, m0,
available);
} else if (remaining > 1 &&
try_txpkts(m0, r->items[next_cidx], &txp, available) == 0) {
/* pkts at cidx, next_cidx should both be in txp. */
MPASS(txp.npkt == 2);
tail = r->items[next_cidx];
MPASS(tail->m_nextpkt == NULL);
ETHER_BPF_MTAP(ifp, m0);
ETHER_BPF_MTAP(ifp, tail);
m0->m_nextpkt = tail;
if (__predict_false(++next_cidx == r->size))
next_cidx = 0;
while (next_cidx != pidx) {
if (add_to_txpkts(r->items[next_cidx], &txp,
available) != 0)
break;
tail->m_nextpkt = r->items[next_cidx];
tail = tail->m_nextpkt;
ETHER_BPF_MTAP(ifp, tail);
if (__predict_false(++next_cidx == r->size))
next_cidx = 0;
}
n = write_txpkts_wr(sc, txq, wr, m0, &txp, available);
total += txp.npkt;
remaining -= txp.npkt;
} else {
total++;
remaining--;
ETHER_BPF_MTAP(ifp, m0);
n = write_txpkt_wr(sc, txq, (void *)wr, m0, available);
n = tx_len16_to_desc(mbuf_len16(m0));
if (__predict_false(avail < n)) {
avail += reclaim_tx_descs(txq, 32);
if (avail < n)
break; /* out of descriptors */
}
if (sc->flags & IS_VF)
n = write_txpkt_vm_wr(sc, txq, m0);
else
n = write_txpkt_wr(sc, txq, m0, avail);
}
MPASS(n >= 1 && n <= available);
MPASS(n >= 1 && n <= avail);
if (!(mbuf_cflags(m0) & MC_TLS))
MPASS(n <= SGE_MAX_WR_NDESC);
available -= n;
avail -= n;
dbdiff += n;
IDXINCR(eq->pidx, n, eq->sidx);
if (wr_can_update_eq(wr)) {
if (total_available_tx_desc(eq) < eq->sidx / 4 &&
atomic_cmpset_int(&eq->equiq, 0, 1)) {
wr->equiq_to_len16 |= htobe32(F_FW_WR_EQUIQ |
F_FW_WR_EQUEQ);
eq->equeqidx = eq->pidx;
} else if (IDXDIFF(eq->pidx, eq->equeqidx, eq->sidx) >=
32) {
wr->equiq_to_len16 |= htobe32(F_FW_WR_EQUEQ);
eq->equeqidx = eq->pidx;
}
}
if (dbdiff >= 16 && remaining >= 4) {
if (dbdiff >= 512 / EQ_ESIZE) { /* X_FETCHBURSTMAX_512B */
if (wr_can_update_eq(wr))
set_txupdate_flags(txq, avail, wr);
ring_eq_db(sc, eq, dbdiff);
available += reclaim_tx_descs(txq, 4 * dbdiff);
avail += reclaim_tx_descs(txq, 32);
dbdiff = 0;
}
cidx = next_cidx;
next_mbuf:
total++;
remaining--;
if (__predict_false(++cidx == r->size))
cidx = 0;
}
if (dbdiff != 0) {
if (wr_can_update_eq(wr))
set_txupdate_flags(txq, avail, wr);
ring_eq_db(sc, eq, dbdiff);
reclaim_tx_descs(txq, 32);
} else if (eq->pidx == eq->cidx && txp->npkt > 0 &&
atomic_load_int(&txq->eq.equiq) == 0) {
/*
* If nothing was submitted to the chip for tx (it was coalesced
* into txpkts instead) and there is no tx update outstanding
* then we need to send txpkts now.
*/
send_txpkts:
MPASS(txp->npkt > 0);
for (i = 0; i < txp->npkt; i++)
ETHER_BPF_MTAP(ifp, txp->mb[i]);
if (txp->npkt > 1) {
MPASS(avail >= tx_len16_to_desc(txp->len16));
if (sc->flags & IS_VF)
n = write_txpkts_vm_wr(sc, txq);
else
n = write_txpkts_wr(sc, txq);
} else {
MPASS(avail >=
tx_len16_to_desc(mbuf_len16(txp->mb[0])));
if (sc->flags & IS_VF)
n = write_txpkt_vm_wr(sc, txq, txp->mb[0]);
else
n = write_txpkt_wr(sc, txq, txp->mb[0], avail);
}
MPASS(n <= SGE_MAX_WR_NDESC);
wr = &eq->desc[eq->pidx];
IDXINCR(eq->pidx, n, eq->sidx);
txp->npkt = 0; /* emptied */
MPASS(wr_can_update_eq(wr));
set_txupdate_flags(txq, avail - n, wr);
ring_eq_db(sc, eq, n);
reclaim_tx_descs(txq, 32);
}
done:
TXQ_UNLOCK(txq);
*coalescing = txp->npkt > 0;
return (total);
}
@@ -4106,11 +4169,12 @@ alloc_txq(struct vi_info *vi, struct sge_txq *txq, int idx,
struct port_info *pi = vi->pi;
struct adapter *sc = pi->adapter;
struct sge_eq *eq = &txq->eq;
struct txpkts *txp;
char name[16];
struct sysctl_oid_list *children = SYSCTL_CHILDREN(oid);
rc = mp_ring_alloc(&txq->r, eq->sidx, txq, eth_tx, can_resume_eth_tx,
M_CXGBE, M_WAITOK);
M_CXGBE, &eq->eq_lock, M_WAITOK);
if (rc != 0) {
device_printf(sc->dev, "failed to allocate mp_ring: %d\n", rc);
return (rc);
@@ -4147,6 +4211,12 @@ alloc_txq(struct vi_info *vi, struct sge_txq *txq, int idx,
txq->sdesc = malloc(eq->sidx * sizeof(struct tx_sdesc), M_CXGBE,
M_ZERO | M_WAITOK);
txp = &txq->txp;
txp->score = 5;
MPASS(nitems(txp->mb) >= sc->params.max_pkts_per_eth_tx_pkts_wr);
txq->txp.max_npkt = min(nitems(txp->mb),
sc->params.max_pkts_per_eth_tx_pkts_wr);
snprintf(name, sizeof(name), "%d", idx);
oid = SYSCTL_ADD_NODE(&vi->ctx, children, OID_AUTO, name,
CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, "tx queue");
@@ -4242,25 +4312,7 @@ alloc_txq(struct vi_info *vi, struct sge_txq *txq, int idx,
"# of NIC TLS sessions using AES-GCM");
}
#endif
SYSCTL_ADD_COUNTER_U64(&vi->ctx, children, OID_AUTO, "r_enqueues",
CTLFLAG_RD, &txq->r->enqueues,
"# of enqueues to the mp_ring for this queue");
SYSCTL_ADD_COUNTER_U64(&vi->ctx, children, OID_AUTO, "r_drops",
CTLFLAG_RD, &txq->r->drops,
"# of drops in the mp_ring for this queue");
SYSCTL_ADD_COUNTER_U64(&vi->ctx, children, OID_AUTO, "r_starts",
CTLFLAG_RD, &txq->r->starts,
"# of normal consumer starts in the mp_ring for this queue");
SYSCTL_ADD_COUNTER_U64(&vi->ctx, children, OID_AUTO, "r_stalls",
CTLFLAG_RD, &txq->r->stalls,
"# of consumer stalls in the mp_ring for this queue");
SYSCTL_ADD_COUNTER_U64(&vi->ctx, children, OID_AUTO, "r_restarts",
CTLFLAG_RD, &txq->r->restarts,
"# of consumer restarts in the mp_ring for this queue");
SYSCTL_ADD_COUNTER_U64(&vi->ctx, children, OID_AUTO, "r_abdications",
CTLFLAG_RD, &txq->r->abdications,
"# of consumer abdications in the mp_ring for this queue");
mp_ring_sysctls(txq->r, &vi->ctx, children);
return (0);
}
@@ -4655,10 +4707,10 @@ csum_to_ctrl(struct adapter *sc, struct mbuf *m)
* The return value is the # of hardware descriptors used.
*/
static u_int
write_txpkt_vm_wr(struct adapter *sc, struct sge_txq *txq,
struct fw_eth_tx_pkt_vm_wr *wr, struct mbuf *m0, u_int available)
write_txpkt_vm_wr(struct adapter *sc, struct sge_txq *txq, struct mbuf *m0)
{
struct sge_eq *eq = &txq->eq;
struct sge_eq *eq;
struct fw_eth_tx_pkt_vm_wr *wr;
struct tx_sdesc *txsd;
struct cpl_tx_pkt_core *cpl;
uint32_t ctrl; /* used in many unrelated places */
@@ -4668,7 +4720,6 @@ write_txpkt_vm_wr(struct adapter *sc, struct sge_txq *txq,
TXQ_LOCK_ASSERT_OWNED(txq);
M_ASSERTPKTHDR(m0);
MPASS(available > 0 && available < eq->sidx);
len16 = mbuf_len16(m0);
nsegs = mbuf_nsegs(m0);
@@ -4677,10 +4728,10 @@ write_txpkt_vm_wr(struct adapter *sc, struct sge_txq *txq,
if (needs_tso(m0))
ctrl += sizeof(struct cpl_tx_pkt_lso_core);
ndesc = tx_len16_to_desc(len16);
MPASS(ndesc <= available);
/* Firmware work request header */
MPASS(wr == (void *)&eq->desc[eq->pidx]);
eq = &txq->eq;
wr = (void *)&eq->desc[eq->pidx];
wr->op_immdlen = htobe32(V_FW_WR_OP(FW_ETH_TX_PKT_VM_WR) |
V_FW_ETH_TX_PKT_WR_IMMDLEN(ctrl));
@@ -4760,7 +4811,6 @@ write_txpkt_vm_wr(struct adapter *sc, struct sge_txq *txq,
} else
write_gl_to_txd(txq, m0, &dst, eq->sidx - ndesc < eq->pidx);
txq->sgl_wrs++;
txq->txpkt_wrs++;
txsd = &txq->sdesc[eq->pidx];
@@ -4811,10 +4861,11 @@ write_raw_wr(struct sge_txq *txq, void *wr, struct mbuf *m0, u_int available)
* The return value is the # of hardware descriptors used.
*/
static u_int
write_txpkt_wr(struct adapter *sc, struct sge_txq *txq,
struct fw_eth_tx_pkt_wr *wr, struct mbuf *m0, u_int available)
write_txpkt_wr(struct adapter *sc, struct sge_txq *txq, struct mbuf *m0,
u_int available)
{
struct sge_eq *eq = &txq->eq;
struct sge_eq *eq;
struct fw_eth_tx_pkt_wr *wr;
struct tx_sdesc *txsd;
struct cpl_tx_pkt_core *cpl;
uint32_t ctrl; /* used in many unrelated places */
@@ -4824,7 +4875,6 @@ write_txpkt_wr(struct adapter *sc, struct sge_txq *txq,
TXQ_LOCK_ASSERT_OWNED(txq);
M_ASSERTPKTHDR(m0);
MPASS(available > 0 && available < eq->sidx);
len16 = mbuf_len16(m0);
nsegs = mbuf_nsegs(m0);
@@ -4844,7 +4894,8 @@ write_txpkt_wr(struct adapter *sc, struct sge_txq *txq,
MPASS(ndesc <= available);
/* Firmware work request header */
MPASS(wr == (void *)&eq->desc[eq->pidx]);
eq = &txq->eq;
wr = (void *)&eq->desc[eq->pidx];
wr->op_immdlen = htobe32(V_FW_WR_OP(FW_ETH_TX_PKT_WR) |
V_FW_ETH_TX_PKT_WR_IMMDLEN(ctrl));
@@ -4927,71 +4978,151 @@ write_txpkt_wr(struct adapter *sc, struct sge_txq *txq,
return (ndesc);
}
static int
try_txpkts(struct mbuf *m, struct mbuf *n, struct txpkts *txp, u_int available)
static inline bool
cmp_l2hdr(struct txpkts *txp, struct mbuf *m)
{
u_int needed, nsegs1, nsegs2, l1, l2;
int len;
if (cannot_use_txpkts(m) || cannot_use_txpkts(n))
return (1);
MPASS(txp->npkt > 0);
MPASS(m->m_len >= 16); /* type1 implies 1 GL with all of the frame. */
nsegs1 = mbuf_nsegs(m);
nsegs2 = mbuf_nsegs(n);
if (nsegs1 + nsegs2 == 2) {
txp->wr_type = 1;
l1 = l2 = txpkts1_len16();
} else {
txp->wr_type = 0;
l1 = txpkts0_len16(nsegs1);
l2 = txpkts0_len16(nsegs2);
if (txp->ethtype == be16toh(ETHERTYPE_VLAN))
len = sizeof(struct ether_vlan_header);
else
len = sizeof(struct ether_header);
return (memcmp(m->m_data, &txp->ethmacdst[0], len) != 0);
}
static inline void
save_l2hdr(struct txpkts *txp, struct mbuf *m)
{
MPASS(m->m_len >= 16); /* type1 implies 1 GL with all of the frame. */
memcpy(&txp->ethmacdst[0], mtod(m, const void *), 16);
}
static int
add_to_txpkts_vf(struct adapter *sc, struct sge_txq *txq, struct mbuf *m,
int avail, bool *send)
{
struct txpkts *txp = &txq->txp;
MPASS(sc->flags & IS_VF);
/* Cannot have TSO and coalesce at the same time. */
if (cannot_use_txpkts(m)) {
cannot_coalesce:
*send = txp->npkt > 0;
return (EINVAL);
}
txp->len16 = howmany(sizeof(struct fw_eth_tx_pkts_wr), 16) + l1 + l2;
needed = tx_len16_to_desc(txp->len16);
if (needed > SGE_MAX_WR_NDESC || needed > available)
return (1);
txp->plen = m->m_pkthdr.len + n->m_pkthdr.len;
if (txp->plen > 65535)
return (1);
/* VF allows coalescing of type 1 (1 GL) only */
if (mbuf_nsegs(m) > 1)
goto cannot_coalesce;
txp->npkt = 2;
set_mbuf_len16(m, l1);
set_mbuf_len16(n, l2);
*send = false;
if (txp->npkt > 0) {
MPASS(tx_len16_to_desc(txp->len16) <= avail);
MPASS(txp->npkt < txp->max_npkt);
MPASS(txp->wr_type == 1); /* VF supports type 1 only */
if (tx_len16_to_desc(txp->len16 + txpkts1_len16()) > avail) {
retry_after_send:
*send = true;
return (EAGAIN);
}
if (m->m_pkthdr.len + txp->plen > 65535)
goto retry_after_send;
if (cmp_l2hdr(txp, m))
goto retry_after_send;
txp->len16 += txpkts1_len16();
txp->plen += m->m_pkthdr.len;
txp->mb[txp->npkt++] = m;
if (txp->npkt == txp->max_npkt)
*send = true;
} else {
txp->len16 = howmany(sizeof(struct fw_eth_tx_pkts_vm_wr), 16) +
txpkts1_len16();
if (tx_len16_to_desc(txp->len16) > avail)
goto cannot_coalesce;
txp->npkt = 1;
txp->wr_type = 1;
txp->plen = m->m_pkthdr.len;
txp->mb[0] = m;
save_l2hdr(txp, m);
}
return (0);
}
static int
add_to_txpkts(struct mbuf *m, struct txpkts *txp, u_int available)
add_to_txpkts_pf(struct adapter *sc, struct sge_txq *txq, struct mbuf *m,
int avail, bool *send)
{
u_int plen, len16, needed, nsegs;
struct txpkts *txp = &txq->txp;
int nsegs;
MPASS(txp->wr_type == 0 || txp->wr_type == 1);
MPASS(!(sc->flags & IS_VF));
if (cannot_use_txpkts(m))
return (1);
/* Cannot have TSO and coalesce at the same time. */
if (cannot_use_txpkts(m)) {
cannot_coalesce:
*send = txp->npkt > 0;
return (EINVAL);
}
*send = false;
nsegs = mbuf_nsegs(m);
if (txp->wr_type == 1 && nsegs != 1)
return (1);
if (txp->npkt == 0) {
if (m->m_pkthdr.len > 65535)
goto cannot_coalesce;
if (nsegs > 1) {
txp->wr_type = 0;
txp->len16 =
howmany(sizeof(struct fw_eth_tx_pkts_wr), 16) +
txpkts0_len16(nsegs);
} else {
txp->wr_type = 1;
txp->len16 =
howmany(sizeof(struct fw_eth_tx_pkts_wr), 16) +
txpkts1_len16();
}
if (tx_len16_to_desc(txp->len16) > avail)
goto cannot_coalesce;
txp->npkt = 1;
txp->plen = m->m_pkthdr.len;
txp->mb[0] = m;
} else {
MPASS(tx_len16_to_desc(txp->len16) <= avail);
MPASS(txp->npkt < txp->max_npkt);
plen = txp->plen + m->m_pkthdr.len;
if (plen > 65535)
return (1);
if (m->m_pkthdr.len + txp->plen > 65535) {
retry_after_send:
*send = true;
return (EAGAIN);
}
if (txp->wr_type == 0)
len16 = txpkts0_len16(nsegs);
else
len16 = txpkts1_len16();
needed = tx_len16_to_desc(txp->len16 + len16);
if (needed > SGE_MAX_WR_NDESC || needed > available)
return (1);
txp->npkt++;
txp->plen = plen;
txp->len16 += len16;
set_mbuf_len16(m, len16);
MPASS(txp->wr_type == 0 || txp->wr_type == 1);
if (txp->wr_type == 0) {
if (tx_len16_to_desc(txp->len16 +
txpkts0_len16(nsegs)) > min(avail, SGE_MAX_WR_NDESC))
goto retry_after_send;
txp->len16 += txpkts0_len16(nsegs);
} else {
if (nsegs != 1)
goto retry_after_send;
if (tx_len16_to_desc(txp->len16 + txpkts1_len16()) >
avail)
goto retry_after_send;
txp->len16 += txpkts1_len16();
}
txp->plen += m->m_pkthdr.len;
txp->mb[txp->npkt++] = m;
if (txp->npkt == txp->max_npkt)
*send = true;
}
return (0);
}
@@ -5003,34 +5134,25 @@ add_to_txpkts(struct mbuf *m, struct txpkts *txp, u_int available)
* The return value is the # of hardware descriptors used.
*/
static u_int
write_txpkts_wr(struct adapter *sc, struct sge_txq *txq,
struct fw_eth_tx_pkts_wr *wr, struct mbuf *m0, const struct txpkts *txp,
u_int available)
write_txpkts_wr(struct adapter *sc, struct sge_txq *txq)
{
const struct txpkts *txp = &txq->txp;
struct sge_eq *eq = &txq->eq;
struct fw_eth_tx_pkts_wr *wr;
struct tx_sdesc *txsd;
struct cpl_tx_pkt_core *cpl;
uint32_t ctrl;
uint64_t ctrl1;
int ndesc, checkwrap;
struct mbuf *m;
int ndesc, i, checkwrap;
struct mbuf *m, *last;
void *flitp;
TXQ_LOCK_ASSERT_OWNED(txq);
MPASS(txp->npkt > 0);
MPASS(txp->plen < 65536);
MPASS(m0 != NULL);
MPASS(m0->m_nextpkt != NULL);
MPASS(txp->len16 <= howmany(SGE_MAX_WR_LEN, 16));
MPASS(available > 0 && available < eq->sidx);
ndesc = tx_len16_to_desc(txp->len16);
MPASS(ndesc <= available);
MPASS(wr == (void *)&eq->desc[eq->pidx]);
wr = (void *)&eq->desc[eq->pidx];
wr->op_pkd = htobe32(V_FW_WR_OP(FW_ETH_TX_PKTS_WR));
ctrl = V_FW_WR_LEN16(txp->len16);
wr->equiq_to_len16 = htobe32(ctrl);
wr->equiq_to_len16 = htobe32(V_FW_WR_LEN16(txp->len16));
wr->plen = htobe16(txp->plen);
wr->npkt = txp->npkt;
wr->r3 = 0;
@@ -5042,8 +5164,11 @@ write_txpkts_wr(struct adapter *sc, struct sge_txq *txq,
* set then we know the WR is going to wrap around somewhere. We'll
* check for that at appropriate points.
*/
ndesc = tx_len16_to_desc(txp->len16);
last = NULL;
checkwrap = eq->sidx - ndesc < eq->pidx;
for (m = m0; m != NULL; m = m->m_nextpkt) {
for (i = 0; i < txp->npkt; i++) {
m = txp->mb[i];
if (txp->wr_type == 0) {
struct ulp_txpkt *ulpmc;
struct ulptx_idata *ulpsc;
@@ -5052,7 +5177,7 @@ write_txpkts_wr(struct adapter *sc, struct sge_txq *txq,
ulpmc = flitp;
ulpmc->cmd_dest = htobe32(V_ULPTX_CMD(ULP_TX_PKT) |
V_ULP_TXPKT_DEST(0) | V_ULP_TXPKT_FID(eq->iqid));
ulpmc->len = htobe32(mbuf_len16(m));
ulpmc->len = htobe32(txpkts0_len16(mbuf_nsegs(m)));
/* ULP subcommand */
ulpsc = (void *)(ulpmc + 1);
@@ -5093,8 +5218,12 @@ write_txpkts_wr(struct adapter *sc, struct sge_txq *txq,
write_gl_to_txd(txq, m, (caddr_t *)(&flitp), checkwrap);
if (last != NULL)
last->m_nextpkt = m;
last = m;
}
txq->sgl_wrs++;
if (txp->wr_type == 0) {
txq->txpkts0_pkts += txp->npkt;
txq->txpkts0_wrs++;
@@ -5104,7 +5233,87 @@ write_txpkts_wr(struct adapter *sc, struct sge_txq *txq,
}
txsd = &txq->sdesc[eq->pidx];
txsd->m = m0;
txsd->m = txp->mb[0];
txsd->desc_used = ndesc;
return (ndesc);
}
static u_int
write_txpkts_vm_wr(struct adapter *sc, struct sge_txq *txq)
{
const struct txpkts *txp = &txq->txp;
struct sge_eq *eq = &txq->eq;
struct fw_eth_tx_pkts_vm_wr *wr;
struct tx_sdesc *txsd;
struct cpl_tx_pkt_core *cpl;
uint64_t ctrl1;
int ndesc, i;
struct mbuf *m, *last;
void *flitp;
TXQ_LOCK_ASSERT_OWNED(txq);
MPASS(txp->npkt > 0);
MPASS(txp->wr_type == 1); /* VF supports type 1 only */
MPASS(txp->mb[0] != NULL);
MPASS(txp->len16 <= howmany(SGE_MAX_WR_LEN, 16));
wr = (void *)&eq->desc[eq->pidx];
wr->op_pkd = htobe32(V_FW_WR_OP(FW_ETH_TX_PKTS_VM_WR));
wr->equiq_to_len16 = htobe32(V_FW_WR_LEN16(txp->len16));
wr->r3 = 0;
wr->plen = htobe16(txp->plen);
wr->npkt = txp->npkt;
wr->r4 = 0;
memcpy(&wr->ethmacdst[0], &txp->ethmacdst[0], 16);
flitp = wr + 1;
/*
* At this point we are 32B into a hardware descriptor. Each mbuf in
* the WR will take 32B so we check for the end of the descriptor ring
* before writing odd mbufs (mb[1], 3, 5, ..)
*/
ndesc = tx_len16_to_desc(txp->len16);
last = NULL;
for (i = 0; i < txp->npkt; i++) {
m = txp->mb[i];
if (i & 1 && (uintptr_t)flitp == (uintptr_t)&eq->desc[eq->sidx])
flitp = &eq->desc[0];
cpl = flitp;
/* Checksum offload */
ctrl1 = csum_to_ctrl(sc, m);
if (ctrl1 != (F_TXPKT_IPCSUM_DIS | F_TXPKT_L4CSUM_DIS))
txq->txcsum++; /* some hardware assistance provided */
/* VLAN tag insertion */
if (needs_vlan_insertion(m)) {
ctrl1 |= F_TXPKT_VLAN_VLD |
V_TXPKT_VLAN(m->m_pkthdr.ether_vtag);
txq->vlan_insertion++;
}
/* CPL header */
cpl->ctrl0 = txq->cpl_ctrl0;
cpl->pack = 0;
cpl->len = htobe16(m->m_pkthdr.len);
cpl->ctrl1 = htobe64(ctrl1);
flitp = cpl + 1;
MPASS(mbuf_nsegs(m) == 1);
write_gl_to_txd(txq, m, (caddr_t *)(&flitp), 0);
if (last != NULL)
last->m_nextpkt = m;
last = m;
}
txq->sgl_wrs++;
txq->txpkts1_pkts += txp->npkt;
txq->txpkts1_wrs++;
txsd = &txq->sdesc[eq->pidx];
txsd->m = txp->mb[0];
txsd->desc_used = ndesc;
return (ndesc);
@@ -5444,8 +5653,10 @@ handle_eth_egr_update(struct adapter *sc, struct sge_eq *eq)
MPASS((eq->flags & EQ_TYPEMASK) == EQ_ETH);
atomic_readandclear_int(&eq->equiq);
mp_ring_check_drainage(txq->r, 0);
taskqueue_enqueue(sc->tq[eq->tx_chan], &txq->tx_reclaim_task);
if (mp_ring_is_idle(txq->r))
taskqueue_enqueue(sc->tq[eq->tx_chan], &txq->tx_reclaim_task);
else
mp_ring_check_drainage(txq->r, 64);
}
static int
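
The score heuristic that gates coalescing is spread across several hunks above
(the struct txpkts field, alloc_txq(), and eth_tx()); read together, it works
roughly like this (a summary of those hunks, not separate documentation):

    /*
     * txp->score is 1..10 and starts at 5 (alloc_txq):
     *   - it moves back toward 5 whenever the hardware queue is found
     *     completely idle,
     *   - it drops toward 1 when eth_tx() runs only to flush held frames
     *     (an egress update had to drain txpkts),
     *   - it climbs toward 10 every time a multi-frame WR is written.
     * A frame is considered for coalescing when a batch is already in
     * progress, more frames are queued behind it, score > 3, or a tx
     * update is still outstanding; otherwise it gets its own WR.
     */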


@@ -231,6 +231,7 @@ static int
get_params__post_init(struct adapter *sc)
{
int rc;
uint32_t param, val;
rc = -t4vf_get_sge_params(sc);
if (rc != 0) {
@@ -282,6 +283,13 @@ get_params__post_init(struct adapter *sc)
}
sc->params.portvec = sc->params.vfres.pmask;
param = FW_PARAM_PFVF(MAX_PKTS_PER_ETH_TX_PKTS_WR);
rc = -t4vf_query_params(sc, 1, &param, &val);
if (rc == 0)
sc->params.max_pkts_per_eth_tx_pkts_wr = val;
else
sc->params.max_pkts_per_eth_tx_pkts_wr = 14;
return (0);
}