Implement a separate hardware queue threshold for aggregate and non-aggregate traffic.

When transmitting non-aggregate traffic, we need to keep the hardware
busy whilst transmitting, or small bursts of txdone/TX latency will
kill us.

This restores non-aggregate iperf performance, especially when doing
TDMA.
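The burst problem is easy to quantify. Below is a back-of-envelope sketch in C; the PHY rate, frame size, and interrupt latency are assumed numbers for illustration, not measurements from this commit:

/*
 * Back-of-envelope sketch (all numbers are assumptions): with only two
 * small frames allowed on the hardware queue, the queue drains long
 * before the txdone interrupt path can refill it.
 */
#include <stdio.h>

int
main(void)
{
	double rate_bps = 130e6;	/* assumed PHY rate: 130 Mbit/s */
	double frame_bits = 1500 * 8;	/* typical non-aggregate frame */
	int qdepth = 2;			/* old limit, tuned for A-MPDU */
	double refill_s = 1e-3;		/* assumed txdone+requeue latency */
	double drain_s = qdepth * frame_bits / rate_bps;

	printf("queue drains in %.2f ms, refill takes %.2f ms\n",
	    drain_s * 1e3, refill_s * 1e3);
	printf("hardware duty cycle ~ %.0f%%\n",
	    100 * drain_s / (drain_s + refill_s));
	return (0);
}

Under those assumptions the radio idles roughly five sixths of the time, which is why a deeper threshold for non-aggregate traffic restores iperf performance.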

Tested:

* AR5416<->AR5416, TDMA
* AR5416 STA <-> AR9280 AP
commit e1aea355e7 (parent c283b0b0ae)
Author: adrian
Date:   2013-05-21 18:13:57 +00:00

5 changed files with 65 additions and 12 deletions

sys/dev/ath/if_ath.c

@@ -839,7 +839,8 @@ ath_attach(u_int16_t devid, struct ath_softc *sc)
 	/*
 	 * Initial aggregation settings.
 	 */
-	sc->sc_hwq_limit = ATH_AGGR_MIN_QDEPTH;
+	sc->sc_hwq_limit_aggr = ATH_AGGR_MIN_QDEPTH;
+	sc->sc_hwq_limit_nonaggr = ATH_NONAGGR_MIN_QDEPTH;
 	sc->sc_tid_hwq_lo = ATH_AGGR_SCHED_LOW;
 	sc->sc_tid_hwq_hi = ATH_AGGR_SCHED_HIGH;
 	sc->sc_aggr_limit = ATH_AGGR_MAXSIZE;

sys/dev/ath/if_ath_sysctl.c

@@ -722,8 +722,11 @@ ath_sysctlattach(struct ath_softc *sc)
 	    "mask of error frames to pass when monitoring");
 	SYSCTL_ADD_INT(ctx, SYSCTL_CHILDREN(tree), OID_AUTO,
-	    "hwq_limit", CTLFLAG_RW, &sc->sc_hwq_limit, 0,
-	    "Hardware queue depth before software-queuing TX frames");
+	    "hwq_limit_nonaggr", CTLFLAG_RW, &sc->sc_hwq_limit_nonaggr, 0,
+	    "Hardware non-AMPDU queue depth before software-queuing TX frames");
+	SYSCTL_ADD_INT(ctx, SYSCTL_CHILDREN(tree), OID_AUTO,
+	    "hwq_limit_aggr", CTLFLAG_RW, &sc->sc_hwq_limit_aggr, 0,
+	    "Hardware AMPDU queue depth before software-queuing TX frames");
 	SYSCTL_ADD_INT(ctx, SYSCTL_CHILDREN(tree), OID_AUTO,
 	    "tid_hwq_lo", CTLFLAG_RW, &sc->sc_tid_hwq_lo, 0,
 	    "");

sys/dev/ath/if_ath_tx.c

@@ -3108,9 +3108,15 @@ ath_tx_swq(struct ath_softc *sc, struct ieee80211_node *ni,
 		 * the head frame in the list.  Don't schedule the
 		 * TID - let it build some more frames first?
 		 *
+		 * When running A-MPDU, always just check the hardware
+		 * queue depth against the aggregate frame limit.
+		 * We don't want to burst a large number of single frames
+		 * out to the hardware; we want to aggressively hold back.
+		 *
 		 * Otherwise, schedule the TID.
 		 */
-		if (txq->axq_depth + txq->fifo.axq_depth < sc->sc_hwq_limit) {
+		/* XXX TXQ locking */
+		if (txq->axq_depth + txq->fifo.axq_depth < sc->sc_hwq_limit_aggr) {
 			bf = ATH_TID_FIRST(atid);
 			ATH_TID_REMOVE(atid, bf, bf_list);
@@ -3134,7 +3140,22 @@ ath_tx_swq(struct ath_softc *sc, struct ieee80211_node *ni,
 			ath_tx_tid_sched(sc, atid);
 		}
-	} else if (txq->axq_depth + txq->fifo.axq_depth < sc->sc_hwq_limit) {
+	/*
+	 * If we're not doing A-MPDU, be prepared to direct dispatch
+	 * up to both limits if possible.  This particular corner
+	 * case may end up with packet starvation between aggregate
+	 * traffic and non-aggregate traffic: we want to ensure
+	 * that non-aggregate stations get a few frames queued to the
+	 * hardware before the aggregate station(s) get their chance.
+	 *
+	 * So if you only ever see a couple of frames direct dispatched
+	 * to the hardware from a non-AMPDU client, check both here
+	 * and in the software queue dispatcher to ensure that those
+	 * non-AMPDU stations get a fair chance to transmit.
+	 */
+	/* XXX TXQ locking */
+	} else if ((txq->axq_depth + txq->fifo.axq_depth < sc->sc_hwq_limit_nonaggr) &&
+	    (txq->axq_aggr_depth < sc->sc_hwq_limit_aggr)) {
 		/* AMPDU not running, attempt direct dispatch */
 		DPRINTF(sc, ATH_DEBUG_SW_TX, "%s: xmit_normal\n", __func__);
 		/* See if clrdmask needs to be set */
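The two dispatch gates are easier to see out of context. Below is a standalone sketch with simplified stand-in types and hard-coded thresholds; the real code operates on struct ath_txq and also considers BAW state and per-TID scheduling before reaching this point:

#include <stdbool.h>
#include <stdio.h>

/* Simplified stand-ins for the struct ath_txq depth counters. */
struct txq_counters {
	int axq_depth;		/* frames queued to the hardware */
	int fifo_depth;		/* frames pushed into the TX FIFO */
	int axq_aggr_depth;	/* aggregate frames among them */
};

#define HWQ_LIMIT_AGGR		2	/* ATH_AGGR_MIN_QDEPTH */
#define HWQ_LIMIT_NONAGGR	32	/* ATH_NONAGGR_MIN_QDEPTH */

static bool
can_direct_dispatch(const struct txq_counters *q, bool is_ampdu)
{
	int depth = q->axq_depth + q->fifo_depth;

	if (is_ampdu) {
		/* Hold back aggressively so frames pool and aggregate. */
		return (depth < HWQ_LIMIT_AGGR);
	}
	/*
	 * Non-aggregate: keep the hardware busy, but also require the
	 * aggregate depth to be under its limit so A-MPDU stations
	 * can't crowd out direct-dispatched frames.
	 */
	return (depth < HWQ_LIMIT_NONAGGR &&
	    q->axq_aggr_depth < HWQ_LIMIT_AGGR);
}

int
main(void)
{
	struct txq_counters q = { 4, 0, 1 };

	printf("A-MPDU dispatch: %d, normal dispatch: %d\n",
	    can_direct_dispatch(&q, true), can_direct_dispatch(&q, false));
	return (0);
}

With four frames already queued, an A-MPDU TID is held back (4 >= 2) while a non-aggregate frame still dispatches directly, which is exactly the asymmetry the commit introduces.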
@@ -5339,7 +5360,8 @@ ath_tx_tid_hw_queue_aggr(struct ath_softc *sc, struct ath_node *an,
 		 *
 		 * XXX locking on txq here?
 		 */
-		if (txq->axq_aggr_depth >= sc->sc_hwq_limit ||
+		/* XXX TXQ locking */
+		if (txq->axq_aggr_depth >= sc->sc_hwq_limit_aggr ||
 		    (status == ATH_AGGR_BAW_CLOSED ||
 		     status == ATH_AGGR_LEAK_CLOSED))
 			break;
@@ -5348,6 +5370,15 @@
 /*
  * Schedule some packets from the given node/TID to the hardware.
+ *
+ * XXX TODO: this routine doesn't enforce the maximum TXQ depth.
+ * It just dumps frames into the TXQ.  We should limit how deep
+ * the transmit queue can grow for frames dispatched to the given
+ * TXQ.
+ *
+ * To avoid locking issues, either we need to own the TXQ lock
+ * at this point, or we need to pass in the maximum frame count
+ * from the caller.
 */
 void
 ath_tx_tid_hw_queue_norm(struct ath_softc *sc, struct ath_node *an,
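The new TODO sketches two options for bounding the queue depth. A hedged sketch of the second one, where the caller passes in a frame budget computed under the TXQ lock; all names and types below are simplified stand-ins, not the driver's actual API:

#include <stddef.h>
#include <stdio.h>

/* Stand-ins for struct ath_buf and the TID software queue. */
struct frame { struct frame *next; int id; };
struct swq { struct frame *head; };

static struct frame *
swq_pop(struct swq *q)
{
	struct frame *f = q->head;

	if (f != NULL)
		q->head = f->next;
	return (f);
}

static void
hwq_push(struct frame *f)
{
	printf("dispatch frame %d to hardware\n", f->id);
}

/*
 * Hypothetical budgeted variant of the normal dispatch loop: the
 * caller computes 'budget' while holding the TXQ lock, so the
 * hardware queue can't be overfilled even if the lock is dropped
 * between frames.
 */
static void
tid_hw_queue_norm_budgeted(struct swq *q, int budget)
{
	struct frame *f;

	while (budget-- > 0 && (f = swq_pop(q)) != NULL)
		hwq_push(f);
}

int
main(void)
{
	struct frame f3 = { NULL, 3 }, f2 = { &f3, 2 }, f1 = { &f2, 1 };
	struct swq q = { &f1 };

	tid_hw_queue_norm_budgeted(&q, 2);	/* dispatches frames 1 and 2 */
	return (0);
}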
@@ -5452,8 +5483,16 @@ ath_txq_sched(struct ath_softc *sc, struct ath_txq *txq)
 	 * Don't schedule if the hardware queue is busy.
 	 * This (hopefully) gives some more time to aggregate
 	 * some packets in the aggregation queue.
+	 *
+	 * XXX It doesn't stop a parallel sender from sneaking
+	 * in transmitting a frame!
 	 */
-	if (txq->axq_aggr_depth >= sc->sc_hwq_limit) {
+	/* XXX TXQ locking */
+	if (txq->axq_aggr_depth + txq->fifo.axq_depth >= sc->sc_hwq_limit_aggr) {
 		sc->sc_aggr_stats.aggr_sched_nopkt++;
 		return;
 	}
+	if (txq->axq_depth >= sc->sc_hwq_limit_nonaggr) {
+		sc->sc_aggr_stats.aggr_sched_nopkt++;
+		return;
+	}
@@ -5489,7 +5528,11 @@ ath_txq_sched(struct ath_softc *sc, struct ath_txq *txq)
 		 * packets.  If we aren't running aggregation then
 		 * we should still limit the hardware queue depth.
 		 */
-		if (txq->axq_depth >= sc->sc_hwq_limit) {
+		/* XXX TXQ locking */
+		if (txq->axq_aggr_depth + txq->fifo.axq_depth >= sc->sc_hwq_limit_aggr) {
+			break;
+		}
+		if (txq->axq_depth >= sc->sc_hwq_limit_nonaggr) {
 			break;
 		}
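Both scheduler gates in ath_txq_sched now test the same pair of limits. If the pattern spreads further, a small helper could centralise it; a sketch with a hypothetical name and hard-coded limits (the real checks read sc->sc_hwq_limit_aggr and sc->sc_hwq_limit_nonaggr, ideally under the TXQ lock, per the XXX notes above):

#include <stdbool.h>
#include <stdio.h>

/* Simplified stand-in counters, as in the earlier sketch. */
struct txq_counters {
	int axq_depth;
	int fifo_depth;
	int axq_aggr_depth;
};

#define HWQ_LIMIT_AGGR		2
#define HWQ_LIMIT_NONAGGR	32

/*
 * Hypothetical helper collapsing the two gates: stop scheduling once
 * either the aggregate depth (TX FIFO included) or the overall depth
 * reaches its limit.
 */
static bool
txq_over_limit(const struct txq_counters *q)
{
	if (q->axq_aggr_depth + q->fifo_depth >= HWQ_LIMIT_AGGR)
		return (true);
	return (q->axq_depth >= HWQ_LIMIT_NONAGGR);
}

int
main(void)
{
	struct txq_counters q = { 10, 0, 1 };

	printf("over limit: %d\n", txq_over_limit(&q));	/* 0: keep going */
	return (0);
}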

sys/dev/ath/if_ath_tx_ht.h

@@ -47,6 +47,7 @@
  * How 'busy' to try and keep the hardware txq
  */
 #define	ATH_AGGR_MIN_QDEPTH		2
+#define	ATH_NONAGGR_MIN_QDEPTH		32
 /*
  * Watermark for scheduling TIDs in order to maximise aggregation.
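The asymmetry between 2 and 32 makes sense in bytes: a full 802.11n A-MPDU can carry up to 64 KiB, while a single non-aggregate frame is roughly 1.5 KB, so far more non-aggregate entries are needed to cover the same interrupt latency. A quick computation under those assumed frame sizes (the sizes are illustrative, not from this commit):

#include <stdio.h>

#define ATH_AGGR_MIN_QDEPTH	2
#define ATH_NONAGGR_MIN_QDEPTH	32
#define AMPDU_MAX_BYTES		65535	/* 802.11n A-MPDU length cap */
#define FRAME_BYTES		1500	/* typical single MSDU */

int
main(void)
{
	printf("aggr: %d entries ~= %d bytes queued\n",
	    ATH_AGGR_MIN_QDEPTH, ATH_AGGR_MIN_QDEPTH * AMPDU_MAX_BYTES);
	printf("nonaggr: %d entries ~= %d bytes queued\n",
	    ATH_NONAGGR_MIN_QDEPTH, ATH_NONAGGR_MIN_QDEPTH * FRAME_BYTES);
	return (0);
}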

sys/dev/ath/if_athvar.h

@@ -814,16 +814,21 @@ struct ath_softc {
 	int			sc_txq_node_psq_maxdepth;
 	/*
-	 * Aggregation twiddles
+	 * Software queue twiddles
 	 *
-	 * hwq_limit: how busy to keep the hardware queue - don't schedule
-	 *		further packets to the hardware, regardless of the TID
+	 * hwq_limit_nonaggr:
+	 *		when to begin limiting non-aggregate frames to the
+	 *		hardware queue, regardless of the TID.
+	 * hwq_limit_aggr:
+	 *		when to begin limiting A-MPDU frames to the
+	 *		hardware queue, regardless of the TID.
 	 * tid_hwq_lo:	how low the per-TID hwq count has to be before the
 	 *		TID will be scheduled again
 	 * tid_hwq_hi:	how many frames to queue to the HWQ before the TID
 	 *		stops being scheduled.
 	 */
-	int			sc_hwq_limit;
+	int			sc_hwq_limit_nonaggr;
+	int			sc_hwq_limit_aggr;
 	int			sc_tid_hwq_lo;
 	int			sc_tid_hwq_hi;