diff --git a/sys/dev/ath/if_ath_tx.c b/sys/dev/ath/if_ath_tx.c
index e6a31e5d62dd..0a248bedc5a6 100644
--- a/sys/dev/ath/if_ath_tx.c
+++ b/sys/dev/ath/if_ath_tx.c
@@ -3129,7 +3129,21 @@ ath_tx_swq(struct ath_softc *sc, struct ieee80211_node *ni,
 		ATH_TID_INSERT_TAIL(atid, bf, bf_list);
 		/* XXX sched? */
 	} else if (ath_tx_ampdu_running(sc, an, tid)) {
-		/* AMPDU running, attempt direct dispatch if possible */
+		/*
+		 * AMPDU running, queue a single frame if the hardware queue
+		 * isn't busy.
+		 *
+		 * If the hardware queue is busy sending an aggregate frame,
+		 * then just hold off so we can queue more aggregate frames.
+		 *
+		 * Otherwise we may end up with single frames leaking through
+		 * because we are dispatching them too quickly.
+		 *
+		 * TODO: maybe we should treat this as two policies - minimise
+		 * latency, or maximise throughput.  Then for BE/BK we can
+		 * maximise throughput, and VO/VI (if AMPDU is enabled!)
+		 * minimise latency.
+		 */
 
 		/*
 		 * Always queue the frame to the tail of the list.
@@ -3138,18 +3152,18 @@ ath_tx_swq(struct ath_softc *sc, struct ieee80211_node *ni,
 
 		/*
 		 * If the hardware queue isn't busy, direct dispatch
-		 * the head frame in the list.  Don't schedule the
-		 * TID - let it build some more frames first?
+		 * the head frame in the list.
 		 *
-		 * When running A-MPDU, always just check the hardware
-		 * queue depth against the aggregate frame limit.
-		 * We don't want to burst a large number of single frames
-		 * out to the hardware; we want to aggressively hold back.
+		 * Note: if we're, say, configured to do ADDBA but not A-MPDU,
+		 * then maybe we still want to queue two non-aggregate frames
+		 * to the hardware.  (Again with the per-TID policy
+		 * configuration.)
 		 *
 		 * Otherwise, schedule the TID.
 		 */
 		/* XXX TXQ locking */
-		if (txq->axq_depth + txq->fifo.axq_depth < sc->sc_hwq_limit_aggr) {
+		if (txq->axq_depth + txq->fifo.axq_depth == 0) {
+
 			bf = ATH_TID_FIRST(atid);
 			ATH_TID_REMOVE(atid, bf, bf_list);
 
@@ -5615,19 +5629,40 @@ ath_txq_sched(struct ath_softc *sc, struct ath_txq *txq)
 	ATH_TX_LOCK_ASSERT(sc);
 
 	/*
-	 * Don't schedule if the hardware queue is busy.
-	 * This (hopefully) gives some more time to aggregate
-	 * some packets in the aggregation queue.
+	 * For non-EDMA chips, aggr frames that have been built are
+	 * in axq_aggr_depth, whether they've been scheduled or not.
+	 * There's no FIFO, so txq->axq_depth is what's been scheduled
+	 * to the hardware.
 	 *
-	 * XXX It doesn't stop a parallel sender from sneaking
-	 * in transmitting a frame!
+	 * For EDMA chips, we do it in two stages.  The existing code
+	 * builds a list of frames to go to the hardware and the EDMA
+	 * code turns it into a single entry to push into the FIFO.
+	 * That way we don't take up one packet per FIFO slot.
+	 * We do push one aggregate per FIFO slot though, just to keep
+	 * things simple.
+	 *
+	 * The FIFO depth is what's in the hardware; the txq->axq_depth
+	 * is what's been scheduled to the FIFO.
+	 *
+	 * fifo.axq_depth is the number of frames (or aggregates) pushed
+	 * into the EDMA FIFO.  For multi-frame lists, this is the number
+	 * of frames pushed in.
+	 * axq_fifo_depth is the number of FIFO slots currently busy.
 	 */
-	/* XXX TXQ locking */
-	if (txq->axq_aggr_depth + txq->fifo.axq_depth >= sc->sc_hwq_limit_aggr) {
+
+	/* For EDMA and non-EDMA, check built/scheduled against aggr limit */
+	if (txq->axq_aggr_depth >= sc->sc_hwq_limit_aggr) {
 		sc->sc_aggr_stats.aggr_sched_nopkt++;
 		return;
 	}
-	if (txq->axq_depth >= sc->sc_hwq_limit_nonaggr) {
+
+	/*
+	 * For non-EDMA chips, axq_depth is the "what's scheduled to
+	 * the hardware list".  For EDMA it's "What's built for the hardware"
+	 * and fifo.axq_depth is how many frames have been dispatched
+	 * already to the hardware.
+	 */
+	if (txq->axq_depth + txq->fifo.axq_depth >= sc->sc_hwq_limit_nonaggr) {
 		sc->sc_aggr_stats.aggr_sched_nopkt++;
 		return;
 	}
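
Note: the behavioural change in the ath_tx_swq() hunks above is that, with A-MPDU running, a frame is only direct-dispatched when the hardware queue is completely idle, rather than merely under sc_hwq_limit_aggr. A minimal standalone sketch of that gate, using hypothetical stand-in types and field names rather than the real ath(4) structures, might look like this:

#include <stdbool.h>

/* Stand-in for the queue depth fields consulted by the patched check. */
struct txq_sketch {
	int axq_depth;		/* frames on the hardware list */
	int fifo_axq_depth;	/* frames/aggregates pushed into the EDMA FIFO */
};

/*
 * Old behaviour: allow direct dispatch while the hardware queue is below
 * the aggregate limit, which lets single frames keep trickling out and
 * starves the aggregation path.
 */
static bool
may_direct_dispatch_old(const struct txq_sketch *txq, int hwq_limit_aggr)
{
	return (txq->axq_depth + txq->fifo_axq_depth < hwq_limit_aggr);
}

/*
 * New behaviour: only bypass the software TID queue when the hardware
 * queue is completely idle, so queued frames can be formed into
 * aggregates instead of leaking out one at a time.
 */
static bool
may_direct_dispatch_new(const struct txq_sketch *txq)
{
	return (txq->axq_depth + txq->fifo_axq_depth == 0);
}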
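
Similarly, the ath_txq_sched() hunk splits the early-exit test in two: aggregates that have been built (axq_aggr_depth) are compared against the aggregate limit, while everything handed towards the hardware (axq_depth plus the EDMA FIFO contents) is compared against the non-aggregate limit. A sketch of that pair of checks, again with hypothetical stand-in names:

#include <stdbool.h>

/* Stand-in depth counters and limits; not the real ath(4) structures. */
struct sched_txq_sketch {
	int axq_depth;		/* built (EDMA) or scheduled (non-EDMA) frames */
	int axq_aggr_depth;	/* aggregates built, scheduled or not */
	int fifo_axq_depth;	/* entries already pushed into the EDMA FIFO */
};

/*
 * Return true when the scheduler should bail out early, mirroring the
 * two "aggr_sched_nopkt" checks in the patched ath_txq_sched().
 */
static bool
txq_sched_blocked(const struct sched_txq_sketch *txq,
    int hwq_limit_aggr, int hwq_limit_nonaggr)
{
	/* Built/scheduled aggregates against the aggregate limit. */
	if (txq->axq_aggr_depth >= hwq_limit_aggr)
		return (true);

	/* Everything heading to the hardware against the non-aggr limit. */
	if (txq->axq_depth + txq->fifo_axq_depth >= hwq_limit_nonaggr)
		return (true);

	return (false);
}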