diff --git a/sys/dev/ath/if_ath_tx_edma.c b/sys/dev/ath/if_ath_tx_edma.c
index 44835c53f475..c9720199fc6f 100644
--- a/sys/dev/ath/if_ath_tx_edma.c
+++ b/sys/dev/ath/if_ath_tx_edma.c
@@ -138,14 +138,145 @@ MALLOC_DECLARE(M_ATHDEV);
 
 static void ath_edma_tx_processq(struct ath_softc *sc, int dosched);
 
+#ifdef	ATH_DEBUG_ALQ
+/* Post a TX FIFO push record (fields converted with htobe32) to the ALQ. */
+static void
+ath_tx_alq_edma_push(struct ath_softc *sc, int txq, int nframes,
+    int fifo_depth, int frame_cnt)
+{
+	struct if_ath_alq_tx_fifo_push rec;
+
+	rec.frame_cnt = htobe32(frame_cnt);
+	rec.fifo_depth = htobe32(fifo_depth);
+	rec.nframes = htobe32(nframes);
+	rec.txq = htobe32(txq);
+
+	if_ath_alq_post(&sc->sc_alq, ATH_ALQ_TX_FIFO_PUSH, sizeof(rec),
+	    (const char *)&rec);
+}
+#endif	/* ATH_DEBUG_ALQ */
+
+/*
+ * Dequeue up to 'limit' frames from the head of the software TX queue,
+ * chain their descriptors together and hand the chain to the hardware
+ * as a single TX FIFO entry.  The caller must hold the TXQ lock.
+ * Returns without side effects if the FIFO is full or nothing is queued.
+ */
+static void
+ath_tx_edma_push_staging_list(struct ath_softc *sc, struct ath_txq *txq,
+    int limit)
+{
+	struct ath_buf *bf, *bf_last;
+	struct ath_buf *bfi, *bfp;
+	int i, sqdepth;
+	TAILQ_HEAD(axq_q_f_s, ath_buf)  sq;
+
+	ATH_TXQ_LOCK_ASSERT(txq);
+
+	/* Don't bother doing any work if the FIFO is full. */
+	if (txq->axq_fifo_depth >= HAL_TXFIFO_DEPTH)
+		return;
+
+	if (TAILQ_EMPTY(&txq->axq_q))
+		return;
+
+	TAILQ_INIT(&sq);
+
+	/*
+	 * First pass - dequeue up to 'limit' entries from the head of
+	 * the txq and append them, in order, to the staging list.
+	 */
+	sqdepth = 0;
+	for (i = 0; i < limit; i++) {
+		/* Grab the head entry */
+		bf = ATH_TXQ_FIRST(txq);
+		if (bf == NULL)
+			break;
+		ATH_TXQ_REMOVE(txq, bf, bf_list);
+
+		/* Queue it into our staging list */
+		TAILQ_INSERT_TAIL(&sq, bf, bf_list);
+		sqdepth++;
+	}
+
+	/* Nothing dequeued (non-positive 'limit'): no head buffer to push. */
+	if (sqdepth == 0)
+		return;
+
+	/*
+	 * We now have a staging list of up to 'limit' frames from the
+	 * txq.  Wrap it up as its own descriptor chain and pass that to
+	 * the hardware as one FIFO entry.  Note the TAILQ_LAST() head
+	 * name must be the tag 'sq' was declared with.
+	 */
+	bf = TAILQ_FIRST(&sq);
+	bf_last = TAILQ_LAST(&sq, axq_q_f_s);
+
+	/* Tag the first/last buffer appropriately. */
+	bf->bf_flags |= ATH_BUF_FIFOPTR;
+	bf_last->bf_flags |= ATH_BUF_FIFOEND;
+
+	/*
+	 * Walk the staging list and point each buffer's last descriptor
+	 * at the next buffer's descriptor address.
+	 */
+	bfp = NULL;
+	TAILQ_FOREACH(bfi, &sq, bf_list) {
+		if (bfp != NULL) {
+			ath_hal_settxdesclink(sc->sc_ah, bfp->bf_lastds,
+			    bfi->bf_daddr);
+		}
+		bfp = bfi;
+	}
+
+	i = 0;
+	TAILQ_FOREACH(bfi, &sq, bf_list) {
+#ifdef	ATH_DEBUG
+		if (sc->sc_debug & ATH_DEBUG_XMIT_DESC)
+			ath_printtxbuf(sc, bfi, txq->axq_qnum, i, 0);
+#endif/* ATH_DEBUG */
+#ifdef	ATH_DEBUG_ALQ
+		if (if_ath_alq_checkdebug(&sc->sc_alq, ATH_ALQ_EDMA_TXDESC))
+			ath_tx_alq_post(sc, bfi);
+#endif /* ATH_DEBUG_ALQ */
+		i++;
+	}
+
+	/*
+	 * Push this set of frames onto the tail of the FIFO queue.  We
+	 * don't adjust the aggregate count, only the depth counter(s).
+	 * NOTE(review): the link pointer was also meant to be blanked
+	 * here, but nothing below does so - confirm it is done elsewhere.
+	 */
+	TAILQ_CONCAT(&txq->fifo.axq_q, &sq, bf_list);
+	/* Bump total queue tracking in FIFO queue */
+	txq->fifo.axq_depth += sqdepth;
+
+	/* Bump FIFO queue */
+	txq->axq_fifo_depth++;
+	DPRINTF(sc, ATH_DEBUG_XMIT,
+	    "%s: queued %d packets; depth=%d, fifo depth=%d\n",
+	    __func__, sqdepth, txq->fifo.axq_depth, txq->axq_fifo_depth);
+
+	/* Push the first entry into the hardware */
+	ath_hal_puttxbuf(sc->sc_ah, txq->axq_qnum, bf->bf_daddr);
+
+	/* Push start on the DMA if it's not already started */
+	ath_hal_txstart(sc->sc_ah, txq->axq_qnum);
+
+#ifdef	ATH_DEBUG_ALQ
+	ath_tx_alq_edma_push(sc, txq->axq_qnum, sqdepth,
+	    txq->axq_fifo_depth,
+	    txq->fifo.axq_depth);
+#endif /* ATH_DEBUG_ALQ */
+}
+
 /*
  * Push some frames into the TX FIFO if we have space.
  */
 static void
 ath_edma_tx_fifo_fill(struct ath_softc *sc, struct ath_txq *txq)
 {
-	struct ath_buf *bf, *bf_last;
-	int i = 0;
 
 	ATH_TXQ_LOCK_ASSERT(txq);
 
@@ -153,64 +284,40 @@ ath_edma_tx_fifo_fill(struct ath_softc *sc, struct ath_txq *txq)
 	    __func__,
 	    txq->axq_qnum);
 
-	TAILQ_FOREACH(bf, &txq->axq_q, bf_list) {
-		if (txq->axq_fifo_depth >= HAL_TXFIFO_DEPTH)
-			break;
-
-		/*
-		 * We have space in the FIFO - so let's push a frame
-		 * into it.
-		 */
-
-		/*
-		 * Remove it from the normal list
-		 */
-		ATH_TXQ_REMOVE(txq, bf, bf_list);
-
-		/*
-		 * XXX for now, we only dequeue a frame at a time, so
-		 * that's only one buffer.  Later on when we just
-		 * push this staging _list_ into the queue, we'll
-		 * set bf_last to the end pointer in the list.
-		 */
-		bf_last = bf;
-		DPRINTF(sc, ATH_DEBUG_TX_PROC,
-		    "%s: Q%d: depth=%d; pushing %p->%p\n",
-		    __func__,
-		    txq->axq_qnum,
-		    txq->axq_fifo_depth,
-		    bf,
-		    bf_last);
-
-		/*
-		 * Append it to the FIFO staging list
-		 */
-		ATH_TXQ_INSERT_TAIL(&txq->fifo, bf, bf_list);
-
-		/*
-		 * Set fifo start / fifo end flags appropriately
-		 *
-		 */
-		bf->bf_flags |= ATH_BUF_FIFOPTR;
-		bf_last->bf_flags |= ATH_BUF_FIFOEND;
-
-		/*
-		 * Push _into_ the FIFO.
-		 */
-		ath_hal_puttxbuf(sc->sc_ah, txq->axq_qnum, bf->bf_daddr);
-#ifdef	ATH_DEBUG
-		if (sc->sc_debug & ATH_DEBUG_XMIT_DESC)
-			ath_printtxbuf(sc, bf, txq->axq_qnum, i, 0);
-#endif/* ATH_DEBUG */
-#ifdef	ATH_DEBUG_ALQ
-		if (if_ath_alq_checkdebug(&sc->sc_alq, ATH_ALQ_EDMA_TXDESC))
-			ath_tx_alq_post(sc, bf);
-#endif /* ATH_DEBUG_ALQ */
-		txq->axq_fifo_depth++;
-		i++;
-	}
-	if (i > 0)
-		ath_hal_txstart(sc->sc_ah, txq->axq_qnum);
+	/*
+	 * For now, push up to 4 frames per TX FIFO slot.
+	 * If more are in the hardware queue then they'll
+	 * get populated when we try to send another frame
+	 * or complete a frame - so at most there'll be
+	 * 32 non-AMPDU frames per TXQ.
+	 *
+	 * Note that the hardware staging queue will limit
+	 * how many frames in total we will have pushed into
+	 * here.
+	 *
+	 * Later on, we'll want to push fewer frames into
+	 * the TX FIFO since we don't want to necessarily
+	 * fill tens or hundreds of milliseconds of potential
+	 * frames.
+	 *
+	 * However, we need more frames right now because of
+	 * how the MAC implements the frame scheduling policy.
+	 * It only ungates a single FIFO entry at a time,
+	 * and will run that until CHNTIME expires or the
+	 * end of that FIFO entry descriptor list is reached.
+	 * So for TDMA we suffer a big performance penalty -
+	 * single TX FIFO entries mean the MAC only sends out
+	 * one frame per DBA event, which turned out on average
+	 * 6ms per TX frame.
+	 *
+	 * So, for aggregates it's okay - it'll push two at a
+	 * time and this will just do them more efficiently.
+	 * For non-aggregates it'll do 4 at a time, up to the
+	 * non-aggr limit (non_aggr, which is 32.)  They should
+	 * be time based rather than a hard count, but I also
+	 * do need sleep.
+	 */
+	ath_tx_edma_push_staging_list(sc, txq, 4);
 }
 
 /*