Fix up some corner cases with aggregation handling.

I've come across a weird scenario in net80211 where two TX streams will happily attempt to setup an aggregation session together. If we're very lucky, it happens concurrently on separate CPUs and the total lack of locking in the net80211 aggregation code causes this stuff to race. Badly. So >1 call would occur to the ath(4) addba start, but only one call would complete to addba complete or timeout. The TID would thus stay paused. The real fix is to implement some proper per-node (or maybe per-TID) locking in net80211, which then could be leveraged by the ath(4) TX aggregation code. Whilst I'm at it, shuffle around the debugging messages a bit. I like to keep people on their toes.
2012-05-22 06:31:03 +00:00 · 2012-05-22 06:31:03 +00:00 · a217e03ba9
commit a217e03ba9
parent a1e4bfc55a
2 changed files with 19 additions and 5 deletions
--- a/sys/dev/ath/if_ath_tx.c
+++ b/sys/dev/ath/if_ath_tx.c
@ -1580,21 +1580,21 @@ ath_tx_start(struct ath_softc *sc, struct ieee80211_node *ni,
 	 * reached.)
 	 */
 	if (txq == &avp->av_mcastq) {
-		DPRINTF(sc, ATH_DEBUG_SW_TX_CTRL,
+		DPRINTF(sc, ATH_DEBUG_SW_TX,
 		    "%s: bf=%p: mcastq: TX'ing\n", __func__, bf);
 		ATH_TXQ_LOCK(txq);
 		ath_tx_xmit_normal(sc, txq, bf);
 		ATH_TXQ_UNLOCK(txq);
 	} else if (type == IEEE80211_FC0_TYPE_CTL &&
 		    subtype == IEEE80211_FC0_SUBTYPE_BAR) {
-		DPRINTF(sc, ATH_DEBUG_SW_TX_CTRL,
+		DPRINTF(sc, ATH_DEBUG_SW_TX,
 		    "%s: BAR: TX'ing direct\n", __func__);
 		ATH_TXQ_LOCK(txq);
 		ath_tx_xmit_normal(sc, txq, bf);
 		ATH_TXQ_UNLOCK(txq);
 	} else {
 		/* add to software queue */
-		DPRINTF(sc, ATH_DEBUG_SW_TX_CTRL,
+		DPRINTF(sc, ATH_DEBUG_SW_TX,
 		    "%s: bf=%p: swq: TX'ing\n", __func__, bf);
 		ath_tx_swq(sc, ni, txq, bf);
 	}
@ -3113,7 +3113,7 @@ ath_tx_tid_cleanup(struct ath_softc *sc, struct ath_node *an, int tid)
 	struct ath_buf *bf, *bf_next;
 	ath_bufhead bf_cq;

-	DPRINTF(sc, ATH_DEBUG_SW_TX_CTRL,
+	DPRINTF(sc, ATH_DEBUG_SW_TX_BAW,
 	    "%s: TID %d: called\n", __func__, tid);

 	TAILQ_INIT(&bf_cq);
@ -4336,7 +4336,15 @@ ath_addba_request(struct ieee80211_node *ni, struct ieee80211_tx_ampdu *tap,
 	 * fall within it.
 	 */
 	ATH_TXQ_LOCK(sc->sc_ac2q[atid->ac]);
-	ath_tx_tid_pause(sc, atid);
+	/*
+	 * This is a bit annoying.  Until net80211 HT code inherits some
+	 * (any) locking, we may have this called in parallel BUT only
+	 * one response/timeout will be called.  Grr.
+	 */
+	if (atid->addba_tx_pending == 0) {
+		ath_tx_tid_pause(sc, atid);
+		atid->addba_tx_pending = 1;
+	}
 	ATH_TXQ_UNLOCK(sc->sc_ac2q[atid->ac]);

 	DPRINTF(sc, ATH_DEBUG_SW_TX_CTRL,
@ -4397,6 +4405,7 @@ ath_addba_response(struct ieee80211_node *ni, struct ieee80211_tx_ampdu *tap,
 	r = sc->sc_addba_response(ni, tap, status, code, batimeout);

 	ATH_TXQ_LOCK(sc->sc_ac2q[atid->ac]);
+	atid->addba_tx_pending = 0;
 	/*
 	 * XXX dirty!
 	 * Slide the BAW left edge to wherever net80211 left it for us.
@ -4500,6 +4509,10 @@ ath_addba_response_timeout(struct ieee80211_node *ni,
 	DPRINTF(sc, ATH_DEBUG_SW_TX_CTRL,
 	    "%s: called; resuming\n", __func__);

+	ATH_TXQ_LOCK(sc->sc_ac2q[atid->ac]);
+	atid->addba_tx_pending = 0;
+	ATH_TXQ_UNLOCK(sc->sc_ac2q[atid->ac]);
+
 	/* Note: This updates the aggregate state to (again) pending */
 	sc->sc_addba_response_timeout(ni, tap);

--- a/sys/dev/ath/if_athvar.h
+++ b/sys/dev/ath/if_athvar.h
@ -106,6 +106,7 @@ struct ath_tid {
 	TAILQ_ENTRY(ath_tid)	axq_qelem;
 	int			sched;
 	int			paused;	/* >0 if the TID has been paused */
+	int			addba_tx_pending;	/* TX ADDBA pending */
 	int			bar_wait;	/* waiting for BAR */
 	int			bar_tx;		/* BAR TXed */