From 4f5ec72aa42401b2a7663df78d9d5c440542daec Mon Sep 17 00:00:00 2001
From: Adrian Chadd <adrian@FreeBSD.org>
Date: Tue, 21 Jun 2016 15:38:20 +0000
Subject: [PATCH] [ath] fix TX throughput for EDMA chips by pushing more into
 the TX FIFO.

It turns out that getting decent performance requires stacking the TX
FIFO a little more aggressively.

* Ensure that when we complete a frame, we attempt to push a new frame
  into the FIFO so TX is kept as active as it needs to be
* Be more aggressive about batching non-aggregate frames into a single
  TX FIFO slot.  This "fixes" TDMA performance (since we only get one
  TX FIFO slot ungated per DMA beacon alert) but it does this by pushing
  a whole lot of work into the TX FIFO slot.

I'm not /entirely/ pleased by this solution, but it does fix a whole bunch
of corner-case issues on the transmit side and fix TDMA whilst I'm at it.
I'll go revisit transmit packet scheduling in ath(4) after the FreeBSD 11 release.

Tested:

* AR9380, STA mode
* AR9580, hostap mode
* AR9380, TDMA client mode

Approved by:	re (hrs)
---
 sys/dev/ath/if_ath_tx_edma.c | 90 ++++++++++++++++++++++++++++--------
 1 file changed, 70 insertions(+), 20 deletions(-)

diff --git a/sys/dev/ath/if_ath_tx_edma.c b/sys/dev/ath/if_ath_tx_edma.c
index 4cad0ce5cefe..486c06a8a6ba 100644
--- a/sys/dev/ath/if_ath_tx_edma.c
+++ b/sys/dev/ath/if_ath_tx_edma.c
@@ -156,6 +156,17 @@ ath_tx_alq_edma_push(struct ath_softc *sc, int txq, int nframes,
 }
 #endif	/* ATH_DEBUG_ALQ */
 
+/*
+ * XXX TODO: push an aggregate as a single FIFO slot, even though
+ * it may not meet the TXOP for say, DBA-gated traffic in TDMA mode.
+ *
+ * The TX completion code handles a TX FIFO slot having multiple frames,
+ * aggregate or otherwise, but it may just make things easier to deal
+ * with.
+ *
+ * XXX TODO: track the number of aggregate subframes and put that in the
+ * push alq message.
+ */
 static void
 ath_tx_edma_push_staging_list(struct ath_softc *sc, struct ath_txq *txq,
     int limit)
@@ -274,6 +285,8 @@ ath_tx_edma_push_staging_list(struct ath_softc *sc, struct ath_txq *txq,
 #endif /* ATH_DEBUG_ALQ */
 }
 
+#define	TX_BATCH_SIZE	32
+
 /*
  * Push some frames into the TX FIFO if we have space.
  */
@@ -320,7 +333,50 @@ ath_edma_tx_fifo_fill(struct ath_softc *sc, struct ath_txq *txq)
 	 * be time based rather than a hard count, but I also
 	 * do need sleep.
 	 */
-	ath_tx_edma_push_staging_list(sc, txq, 4);
+
+	/*
+	 * Do some basic, basic batching to the hardware
+	 * queue.
+	 *
+	 * If we have at least TX_BATCH_SIZE / 2 entries in the staging
+	 * queue, then let's try to push up to TX_BATCH_SIZE in one hit.
+	 *
+	 * Ensure we don't push more than TX_BATCH_SIZE worth
+	 * in, otherwise we end up draining 8 slots worth of
+	 * 32 frames into the hardware queue and then we don't
+	 * attempt to push more frames in until we empty the
+	 * FIFO.
+	 */
+	if (txq->axq_depth >= TX_BATCH_SIZE / 2 &&
+	    txq->fifo.axq_depth <= TX_BATCH_SIZE) {
+		ath_tx_edma_push_staging_list(sc, txq, TX_BATCH_SIZE);
+	}
+
+	/*
+	 * Aggregate check: if we have fewer than two FIFO slots
+	 * busy and we have some aggregate frames, queue them.
+	 *
+	 * Now, ideally we'd just check to see if the scheduler
+	 * has given us aggregate frames and push them into the FIFO
+	 * as individual slots, as honestly we should just be pushing
+	 * a single aggregate in as one FIFO slot.
+	 *
+	 * Let's do that next once I know this works.
+	 */
+	else if (txq->axq_aggr_depth > 0 && txq->axq_fifo_depth < 2)
+		ath_tx_edma_push_staging_list(sc, txq, TX_BATCH_SIZE);
+
+	/*
+	 * Otherwise we have fewer frames staged than the batch threshold.
+	 *
+	 * If the TXFIFO isn't empty, let's wait until we've
+	 * finished sending the FIFO.
+	 *
+	 * If the TXFIFO is empty, then
+	 */
+	else if (txq->axq_fifo_depth == 0) {
+		ath_tx_edma_push_staging_list(sc, txq, TX_BATCH_SIZE);
+	}
 }
 
 /*
@@ -478,13 +534,6 @@ ath_edma_xmit_handoff_hw(struct ath_softc *sc, struct ath_txq *txq,
 	/* Push and update frame stats */
 	ATH_TXQ_INSERT_TAIL(txq, bf, bf_list);
 
-	/* For now, set the link pointer in the last descriptor
-	 * to be NULL.
-	 *
-	 * Later on, when it comes time to handling multiple descriptors
-	 * in one FIFO push, we can link descriptors together this way.
-	 */
-
 	/*
 	 * Finally, call the FIFO schedule routine to schedule some
 	 * frames to the FIFO.
@@ -722,6 +771,7 @@ ath_edma_tx_processq(struct ath_softc *sc, int dosched)
 	struct ieee80211_node *ni;
 	int nacked = 0;
 	int idx;
+	int i;
 
 #ifdef	ATH_DEBUG
 	/* XXX */
@@ -927,28 +977,28 @@ ath_edma_tx_processq(struct ath_softc *sc, int dosched)
 		/* Handle frame completion and rate control update */
 		ath_tx_process_buf_completion(sc, txq, &ts, bf);
 
-		/* bf is invalid at this point */
-
-		/*
-		 * Now that there's space in the FIFO, let's push some
-		 * more frames into it.
-		 */
-		ATH_TXQ_LOCK(txq);
-		if (dosched)
-			ath_edma_tx_fifo_fill(sc, txq);
-		ATH_TXQ_UNLOCK(txq);
+		/* NB: bf is invalid at this point */
 	}
 
 	sc->sc_wd_timer = 0;
 
-	/* Kick software scheduler */
 	/*
 	 * XXX It's inefficient to do this if the FIFO queue is full,
 	 * but there's no easy way right now to only populate
 	 * the txq task for _one_ TXQ.  This should be fixed.
 	 */
-	if (dosched)
+	if (dosched) {
+		/* Attempt to schedule more hardware frames to the TX FIFO */
+		for (i = 0; i < HAL_NUM_TX_QUEUES; i++) {
+			if (ATH_TXQ_SETUP(sc, i)) {
+				ATH_TXQ_LOCK(&sc->sc_txq[i]);
+				ath_edma_tx_fifo_fill(sc, &sc->sc_txq[i]);
+				ATH_TXQ_UNLOCK(&sc->sc_txq[i]);
+			}
+		}
+		/* Kick software scheduler */
 		ath_tx_swq_kick(sc);
+	}
 }
 
 static void