Implement a separate hardware queue threshold for aggregate and non-aggregate traffic.

When transmitting non-aggregate traffic, we need to keep the hardware
busy whilst transmitting, or small bursts of txdone/TX latency will
kill us.

This restores non-aggregate iperf performance, especially when doing
TDMA.
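The burst problem is easy to quantify. Below is a back-of-envelope sketch in C; the PHY rate, frame size, and interrupt latency are assumed numbers for illustration, not measurements from this commit:

/*
 * Back-of-envelope sketch (all numbers are assumptions): with only two
 * small frames allowed on the hardware queue, the queue drains long
 * before the txdone interrupt path can refill it.
 */
#include <stdio.h>

int
main(void)
{
	double rate_bps = 130e6;	/* assumed PHY rate: 130 Mbit/s */
	double frame_bits = 1500 * 8;	/* typical non-aggregate frame */
	int qdepth = 2;			/* old limit, tuned for A-MPDU */
	double refill_s = 1e-3;		/* assumed txdone+requeue latency */
	double drain_s = qdepth * frame_bits / rate_bps;

	printf("queue drains in %.2f ms, refill takes %.2f ms\n",
	    drain_s * 1e3, refill_s * 1e3);
	printf("hardware duty cycle ~ %.0f%%\n",
	    100 * drain_s / (drain_s + refill_s));
	return (0);
}

Under those assumptions the radio idles roughly five sixths of the time, which is why a deeper threshold for non-aggregate traffic restores iperf performance.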

Tested:

* AR5416<->AR5416, TDMA
* AR5416 STA <-> AR9280 AP
commit e1aea355e7 (parent c283b0b0ae)
Author: adrian
Date:   2013-05-21 18:13:57 +00:00

5 changed files with 65 additions and 12 deletions

sys/dev/ath/if_ath.c

@@ -839,7 +839,8 @@ ath_attach(u_int16_t devid, struct ath_softc *sc)
 	/*
 	 * Initial aggregation settings.
 	 */
-	sc->sc_hwq_limit = ATH_AGGR_MIN_QDEPTH;
+	sc->sc_hwq_limit_aggr = ATH_AGGR_MIN_QDEPTH;
+	sc->sc_hwq_limit_nonaggr = ATH_NONAGGR_MIN_QDEPTH;
 	sc->sc_tid_hwq_lo = ATH_AGGR_SCHED_LOW;
 	sc->sc_tid_hwq_hi = ATH_AGGR_SCHED_HIGH;
 	sc->sc_aggr_limit = ATH_AGGR_MAXSIZE;

sys/dev/ath/if_ath_sysctl.c

@@ -722,8 +722,11 @@ ath_sysctlattach(struct ath_softc *sc)
 	    "mask of error frames to pass when monitoring");
 	SYSCTL_ADD_INT(ctx, SYSCTL_CHILDREN(tree), OID_AUTO,
-	    "hwq_limit", CTLFLAG_RW, &sc->sc_hwq_limit, 0,
-	    "Hardware queue depth before software-queuing TX frames");
+	    "hwq_limit_nonaggr", CTLFLAG_RW, &sc->sc_hwq_limit_nonaggr, 0,
+	    "Hardware non-AMPDU queue depth before software-queuing TX frames");
+	SYSCTL_ADD_INT(ctx, SYSCTL_CHILDREN(tree), OID_AUTO,
+	    "hwq_limit_aggr", CTLFLAG_RW, &sc->sc_hwq_limit_aggr, 0,
+	    "Hardware AMPDU queue depth before software-queuing TX frames");
 	SYSCTL_ADD_INT(ctx, SYSCTL_CHILDREN(tree), OID_AUTO,
 	    "tid_hwq_lo", CTLFLAG_RW, &sc->sc_tid_hwq_lo, 0,
 	    "");

sys/dev/ath/if_ath_tx.c

@@ -3108,9 +3108,15 @@ ath_tx_swq(struct ath_softc *sc, struct ieee80211_node *ni,
 		 * the head frame in the list.  Don't schedule the
 		 * TID - let it build some more frames first?
 		 *
+		 * When running A-MPDU, always just check the hardware
+		 * queue depth against the aggregate frame limit.
+		 * We don't want to burst a large number of single frames
+		 * out to the hardware; we want to aggressively hold back.
+		 *
 		 * Otherwise, schedule the TID.
 		 */
-		if (txq->axq_depth + txq->fifo.axq_depth < sc->sc_hwq_limit) {
+		/* XXX TXQ locking */
+		if (txq->axq_depth + txq->fifo.axq_depth < sc->sc_hwq_limit_aggr) {
 			bf = ATH_TID_FIRST(atid);
 			ATH_TID_REMOVE(atid, bf, bf_list);
@@ -3134,7 +3140,22 @@ ath_tx_swq(struct ath_softc *sc, struct ieee80211_node *ni,
 			ath_tx_tid_sched(sc, atid);
 		}
-	} else if (txq->axq_depth + txq->fifo.axq_depth < sc->sc_hwq_limit) {
+	/*
+	 * If we're not doing A-MPDU, be prepared to direct dispatch
+	 * up to both limits if possible.  This particular corner
+	 * case may end up with packet starvation between aggregate
+	 * traffic and non-aggregate traffic: we want to ensure
+	 * that non-aggregate stations get a few frames queued to the
+	 * hardware before the aggregate station(s) get their chance.
+	 *
+	 * So if you only ever see a couple of frames direct dispatched
+	 * to the hardware from a non-AMPDU client, check both here
+	 * and in the software queue dispatcher to ensure that those
+	 * non-AMPDU stations get a fair chance to transmit.
+	 */
+	/* XXX TXQ locking */
+	} else if ((txq->axq_depth + txq->fifo.axq_depth < sc->sc_hwq_limit_nonaggr) &&
+	    (txq->axq_aggr_depth < sc->sc_hwq_limit_aggr)) {
 		/* AMPDU not running, attempt direct dispatch */
 		DPRINTF(sc, ATH_DEBUG_SW_TX, "%s: xmit_normal\n", __func__);
 		/* See if clrdmask needs to be set */
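The two dispatch gates are easier to see out of context. Below is a standalone sketch with simplified stand-in types and hard-coded thresholds; the real code operates on struct ath_txq and also considers BAW state and per-TID scheduling before reaching this point:

#include <stdbool.h>
#include <stdio.h>

/* Simplified stand-ins for the struct ath_txq depth counters. */
struct txq_counters {
	int axq_depth;		/* frames queued to the hardware */
	int fifo_depth;		/* frames pushed into the TX FIFO */
	int axq_aggr_depth;	/* aggregate frames among them */
};

#define HWQ_LIMIT_AGGR		2	/* ATH_AGGR_MIN_QDEPTH */
#define HWQ_LIMIT_NONAGGR	32	/* ATH_NONAGGR_MIN_QDEPTH */

static bool
can_direct_dispatch(const struct txq_counters *q, bool is_ampdu)
{
	int depth = q->axq_depth + q->fifo_depth;

	if (is_ampdu) {
		/* Hold back aggressively so frames pool and aggregate. */
		return (depth < HWQ_LIMIT_AGGR);
	}
	/*
	 * Non-aggregate: keep the hardware busy, but also require the
	 * aggregate depth to be under its limit so A-MPDU stations
	 * can't crowd out direct-dispatched frames.
	 */
	return (depth < HWQ_LIMIT_NONAGGR &&
	    q->axq_aggr_depth < HWQ_LIMIT_AGGR);
}

int
main(void)
{
	struct txq_counters q = { 4, 0, 1 };

	printf("A-MPDU dispatch: %d, normal dispatch: %d\n",
	    can_direct_dispatch(&q, true), can_direct_dispatch(&q, false));
	return (0);
}

With four frames already queued, an A-MPDU TID is held back (4 >= 2) while a non-aggregate frame still dispatches directly, which is exactly the asymmetry the commit introduces.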
@@ -5339,7 +5360,8 @@ ath_tx_tid_hw_queue_aggr(struct ath_softc *sc, struct ath_node *an,
 		 *
 		 * XXX locking on txq here?
 		 */
-		if (txq->axq_aggr_depth >= sc->sc_hwq_limit ||
+		/* XXX TXQ locking */
+		if (txq->axq_aggr_depth >= sc->sc_hwq_limit_aggr ||
 		    (status == ATH_AGGR_BAW_CLOSED ||
 		     status == ATH_AGGR_LEAK_CLOSED))
 			break;
@@ -5348,6 +5370,15 @@
 /*
  * Schedule some packets from the given node/TID to the hardware.
+ *
+ * XXX TODO: this routine doesn't enforce the maximum TXQ depth.
+ * It just dumps frames into the TXQ.  We should limit how deep
+ * the transmit queue can grow for frames dispatched to the given
+ * TXQ.
+ *
+ * To avoid locking issues, either we need to own the TXQ lock
+ * at this point, or we need to pass in the maximum frame count
+ * from the caller.
 */
 void
 ath_tx_tid_hw_queue_norm(struct ath_softc *sc, struct ath_node *an,
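The new TODO sketches two options for bounding the queue depth. A hedged sketch of the second one, where the caller passes in a frame budget computed under the TXQ lock; all names and types below are simplified stand-ins, not the driver's actual API:

#include <stddef.h>
#include <stdio.h>

/* Stand-ins for struct ath_buf and the TID software queue. */
struct frame { struct frame *next; int id; };
struct swq { struct frame *head; };

static struct frame *
swq_pop(struct swq *q)
{
	struct frame *f = q->head;

	if (f != NULL)
		q->head = f->next;
	return (f);
}

static void
hwq_push(struct frame *f)
{
	printf("dispatch frame %d to hardware\n", f->id);
}

/*
 * Hypothetical budgeted variant of the normal dispatch loop: the
 * caller computes 'budget' while holding the TXQ lock, so the
 * hardware queue can't be overfilled even if the lock is dropped
 * between frames.
 */
static void
tid_hw_queue_norm_budgeted(struct swq *q, int budget)
{
	struct frame *f;

	while (budget-- > 0 && (f = swq_pop(q)) != NULL)
		hwq_push(f);
}

int
main(void)
{
	struct frame f3 = { NULL, 3 }, f2 = { &f3, 2 }, f1 = { &f2, 1 };
	struct swq q = { &f1 };

	tid_hw_queue_norm_budgeted(&q, 2);	/* dispatches frames 1 and 2 */
	return (0);
}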
@@ -5452,8 +5483,16 @@ ath_txq_sched(struct ath_softc *sc, struct ath_txq *txq)
 	 * Don't schedule if the hardware queue is busy.
 	 * This (hopefully) gives some more time to aggregate
 	 * some packets in the aggregation queue.
+	 *
+	 * XXX It doesn't stop a parallel sender from sneaking
+	 * in transmitting a frame!
 	 */
-	if (txq->axq_aggr_depth >= sc->sc_hwq_limit) {
+	/* XXX TXQ locking */
+	if (txq->axq_aggr_depth + txq->fifo.axq_depth >= sc->sc_hwq_limit_aggr) {
 		sc->sc_aggr_stats.aggr_sched_nopkt++;
 		return;
 	}
+	if (txq->axq_depth >= sc->sc_hwq_limit_nonaggr) {
+		sc->sc_aggr_stats.aggr_sched_nopkt++;
+		return;
+	}
@@ -5489,7 +5528,11 @@ ath_txq_sched(struct ath_softc *sc, struct ath_txq *txq)
 		 * packets.  If we aren't running aggregation then
 		 * we should still limit the hardware queue depth.
 		 */
-		if (txq->axq_depth >= sc->sc_hwq_limit) {
+		/* XXX TXQ locking */
+		if (txq->axq_aggr_depth + txq->fifo.axq_depth >= sc->sc_hwq_limit_aggr) {
+			break;
+		}
+		if (txq->axq_depth >= sc->sc_hwq_limit_nonaggr) {
 			break;
 		}
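Both scheduler gates in ath_txq_sched now test the same pair of limits. If the pattern spreads further, a small helper could centralise it; a sketch with a hypothetical name and hard-coded limits (the real checks read sc->sc_hwq_limit_aggr and sc->sc_hwq_limit_nonaggr, ideally under the TXQ lock, per the XXX notes above):

#include <stdbool.h>
#include <stdio.h>

/* Simplified stand-in counters, as in the earlier sketch. */
struct txq_counters {
	int axq_depth;
	int fifo_depth;
	int axq_aggr_depth;
};

#define HWQ_LIMIT_AGGR		2
#define HWQ_LIMIT_NONAGGR	32

/*
 * Hypothetical helper collapsing the two gates: stop scheduling once
 * either the aggregate depth (TX FIFO included) or the overall depth
 * reaches its limit.
 */
static bool
txq_over_limit(const struct txq_counters *q)
{
	if (q->axq_aggr_depth + q->fifo_depth >= HWQ_LIMIT_AGGR)
		return (true);
	return (q->axq_depth >= HWQ_LIMIT_NONAGGR);
}

int
main(void)
{
	struct txq_counters q = { 10, 0, 1 };

	printf("over limit: %d\n", txq_over_limit(&q));	/* 0: keep going */
	return (0);
}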

sys/dev/ath/if_ath_tx_ht.h

@@ -47,6 +47,7 @@
  * How 'busy' to try and keep the hardware txq
  */
 #define	ATH_AGGR_MIN_QDEPTH		2
+#define	ATH_NONAGGR_MIN_QDEPTH		32
 /*
  * Watermark for scheduling TIDs in order to maximise aggregation.
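The asymmetry between 2 and 32 makes sense in bytes: a full 802.11n A-MPDU can carry up to 64 KiB, while a single non-aggregate frame is roughly 1.5 KB, so far more non-aggregate entries are needed to cover the same interrupt latency. A quick computation under those assumed frame sizes (the sizes are illustrative, not from this commit):

#include <stdio.h>

#define ATH_AGGR_MIN_QDEPTH	2
#define ATH_NONAGGR_MIN_QDEPTH	32
#define AMPDU_MAX_BYTES		65535	/* 802.11n A-MPDU length cap */
#define FRAME_BYTES		1500	/* typical single MSDU */

int
main(void)
{
	printf("aggr: %d entries ~= %d bytes queued\n",
	    ATH_AGGR_MIN_QDEPTH, ATH_AGGR_MIN_QDEPTH * AMPDU_MAX_BYTES);
	printf("nonaggr: %d entries ~= %d bytes queued\n",
	    ATH_NONAGGR_MIN_QDEPTH, ATH_NONAGGR_MIN_QDEPTH * FRAME_BYTES);
	return (0);
}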

sys/dev/ath/if_athvar.h

@@ -814,16 +814,21 @@ struct ath_softc {
 	int			sc_txq_node_psq_maxdepth;
 	/*
-	 * Aggregation twiddles
+	 * Software queue twiddles
 	 *
-	 * hwq_limit: how busy to keep the hardware queue - don't schedule
-	 *		further packets to the hardware, regardless of the TID
+	 * hwq_limit_nonaggr:
+	 *		when to begin limiting non-aggregate frames to the
+	 *		hardware queue, regardless of the TID.
+	 * hwq_limit_aggr:
+	 *		when to begin limiting A-MPDU frames to the
+	 *		hardware queue, regardless of the TID.
 	 * tid_hwq_lo:	how low the per-TID hwq count has to be before the
 	 *		TID will be scheduled again
 	 * tid_hwq_hi:	how many frames to queue to the HWQ before the TID
 	 *		stops being scheduled.
 	 */
-	int			sc_hwq_limit;
+	int			sc_hwq_limit_nonaggr;
+	int			sc_hwq_limit_aggr;
 	int			sc_tid_hwq_lo;
 	int			sc_tid_hwq_hi;