if_arge: fix up TX workaround; add TX/RX requirements for busdma; add stats

The early ethernet MACs (I think AR71xx and AR913x) require that both
TX and RX require 4-byte alignment for all packets.

The later MACs have started relaxing the requirements.

For now, the 1-byte TX and 1-byte RX alignment requirements are only for
the QCA955x SoCs.  I'll add in the relaxed requirements as I review the
datasheets and do testing.

* Add a hardware flags field and 1-byte / 4-byte TX/RX alignment.
* .. defaulting to 4-byte TX and 4-byte RX alignment.
* Only enforce the TX alignment fixup if the hardware requires a 4-byte
  TX alignment.  This avoids a call to m_defrag().
* Add counters for various situations for further debugging.
* Set the 1-byte and 4-byte busdma alignment requirement when
  the tag is created.

This improves the straight bridging performance from 130mbit/sec
to 180mbit/sec, purely by removing the need for TX path bounce buffers.

The main performance issue is the RX alignment requirement and any RX
bounce buffering that's occuring.  (In a local test, removing the RX
fixup path and just aligning buffers raises the performance to above
400mbit/sec.

In theory it's a no-op for SoCs before the QCA955x.

Tested:

* QCA9558 SoC in AP135 board, using software bridging between arge0/arge1.
This commit is contained in:
Adrian Chadd 2015-10-18 00:59:28 +00:00
parent 46839d1247
commit 9919dec83c
2 changed files with 119 additions and 22 deletions

View File

@ -298,6 +298,29 @@ arge_attach_sysctl(device_t dev)
"tx_pkts_unaligned", CTLFLAG_RW, &sc->stats.tx_pkts_unaligned,
0, "number of TX unaligned packets");
SYSCTL_ADD_UINT(ctx, SYSCTL_CHILDREN(tree), OID_AUTO,
"tx_pkts_unaligned_start", CTLFLAG_RW, &sc->stats.tx_pkts_unaligned_start,
0, "number of TX unaligned packets (start)");
SYSCTL_ADD_UINT(ctx, SYSCTL_CHILDREN(tree), OID_AUTO,
"tx_pkts_unaligned_len", CTLFLAG_RW, &sc->stats.tx_pkts_unaligned_len,
0, "number of TX unaligned packets (len)");
SYSCTL_ADD_UINT(ctx, SYSCTL_CHILDREN(tree), OID_AUTO,
"tx_pkts_nosegs", CTLFLAG_RW, &sc->stats.tx_pkts_nosegs,
0, "number of TX packets fail with no ring slots avail");
SYSCTL_ADD_UINT(ctx, SYSCTL_CHILDREN(tree), OID_AUTO,
"intr_stray_filter", CTLFLAG_RW, &sc->stats.intr_stray,
0, "number of stray interrupts (filter)");
SYSCTL_ADD_UINT(ctx, SYSCTL_CHILDREN(tree), OID_AUTO,
"intr_stray_intr", CTLFLAG_RW, &sc->stats.intr_stray2,
0, "number of stray interrupts (intr)");
SYSCTL_ADD_UINT(ctx, SYSCTL_CHILDREN(tree), OID_AUTO,
"intr_ok", CTLFLAG_RW, &sc->stats.intr_ok,
0, "number of OK interrupts");
#ifdef ARGE_DEBUG
SYSCTL_ADD_UINT(ctx, SYSCTL_CHILDREN(tree), OID_AUTO, "tx_prod",
CTLFLAG_RW, &sc->arge_cdata.arge_tx_prod, 0, "");
@ -626,6 +649,22 @@ arge_attach(device_t dev)
local_macstr = NULL;
}
/*
* Hardware workarounds.
*/
switch (ar71xx_soc) {
case AR71XX_SOC_QCA9556:
case AR71XX_SOC_QCA9558:
/* Arbitrary alignment */
sc->arge_hw_flags |= ARGE_HW_FLG_TX_DESC_ALIGN_1BYTE;
sc->arge_hw_flags |= ARGE_HW_FLG_RX_DESC_ALIGN_1BYTE;
break;
default:
sc->arge_hw_flags |= ARGE_HW_FLG_TX_DESC_ALIGN_4BYTE;
sc->arge_hw_flags |= ARGE_HW_FLG_RX_DESC_ALIGN_4BYTE;
break;
}
/*
* Some units (eg the TP-Link WR-1043ND) do not have a convenient
* EEPROM location to read the ethernet MAC address from.
@ -825,6 +864,9 @@ arge_attach(device_t dev)
ARGE_WRITE(sc, AR71XX_MAC_FIFO_CFG0,
FIFO_CFG0_ALL << FIFO_CFG0_ENABLE_SHIFT);
/*
* SoC specific bits.
*/
switch (ar71xx_soc) {
case AR71XX_SOC_AR7240:
case AR71XX_SOC_AR7241:
@ -1351,24 +1393,35 @@ arge_init_locked(struct arge_softc *sc)
* Return whether the mbuf chain is correctly aligned
* for the arge TX engine.
*
* The TX engine requires each fragment to be aligned to a
* 4 byte boundary and the size of each fragment except
* the last to be a multiple of 4 bytes.
* All the MACs have a length requirement: any non-final
* fragment (ie, descriptor with MORE bit set) needs to have
* a length divisible by 4.
*
* XXX TODO: I believe this is only a bug on the AR71xx and
* AR913x MACs. The later MACs (AR724x and later) does not
* need this workaround.
* The AR71xx, AR913x require the start address also be
* DWORD aligned. The later MACs don't.
*/
static int
arge_mbuf_chain_is_tx_aligned(struct mbuf *m0)
arge_mbuf_chain_is_tx_aligned(struct arge_softc *sc, struct mbuf *m0)
{
struct mbuf *m;
for (m = m0; m != NULL; m = m->m_next) {
if((mtod(m, intptr_t) & 3) != 0)
/*
* Only do this for chips that require it.
*/
if ((sc->arge_hw_flags & ARGE_HW_FLG_TX_DESC_ALIGN_4BYTE) &&
(mtod(m, intptr_t) & 3) != 0) {
sc->stats.tx_pkts_unaligned_start++;
return 0;
if ((m->m_next != NULL) && ((m->m_len & 0x03) != 0))
}
/*
* All chips have this requirement for length.
*/
if ((m->m_next != NULL) && ((m->m_len & 0x03) != 0)) {
sc->stats.tx_pkts_unaligned_len++;
return 0;
}
}
return 1;
}
@ -1389,15 +1442,10 @@ arge_encap(struct arge_softc *sc, struct mbuf **m_head)
ARGE_LOCK_ASSERT(sc);
/*
* Fix mbuf chain, all fragments should be 4 bytes aligned and
* even 4 bytes
*
* XXX TODO: I believe this is only a bug on the AR71xx and
* AR913x MACs. The later MACs (AR724x and later) does not
* need this workaround.
* Fix mbuf chain based on hardware alignment constraints.
*/
m = *m_head;
if (! arge_mbuf_chain_is_tx_aligned(m)) {
if (! arge_mbuf_chain_is_tx_aligned(sc, m)) {
sc->stats.tx_pkts_unaligned++;
m = m_defrag(*m_head, M_NOWAIT);
if (m == NULL) {
@ -1427,6 +1475,7 @@ arge_encap(struct arge_softc *sc, struct mbuf **m_head)
/* Check number of available descriptors. */
if (sc->arge_cdata.arge_tx_cnt + nsegs >= (ARGE_TX_RING_COUNT - 1)) {
bus_dmamap_unload(sc->arge_cdata.arge_tx_tag, txd->tx_dmamap);
sc->stats.tx_pkts_nosegs++;
return (ENOBUFS);
}
@ -1444,7 +1493,9 @@ arge_encap(struct arge_softc *sc, struct mbuf **m_head)
desc = &sc->arge_rdata.arge_tx_ring[prod];
desc->packet_ctrl = ARGE_DMASIZE(txsegs[i].ds_len);
if (txsegs[i].ds_addr & 3)
/* XXX Note: only relevant for older MACs; but check length! */
if ((sc->arge_hw_flags & ARGE_HW_FLG_TX_DESC_ALIGN_4BYTE) &&
(txsegs[i].ds_addr & 3))
panic("TX packet address unaligned\n");
desc->packet_addr = txsegs[i].ds_addr;
@ -1715,6 +1766,16 @@ arge_dma_alloc(struct arge_softc *sc)
struct arge_txdesc *txd;
struct arge_rxdesc *rxd;
int error, i;
int arge_tx_align, arge_rx_align;
/* Assume 4 byte alignment by default */
arge_tx_align = 4;
arge_rx_align = 4;
if (sc->arge_hw_flags & ARGE_HW_FLG_TX_DESC_ALIGN_1BYTE)
arge_tx_align = 1;
if (sc->arge_hw_flags & ARGE_HW_FLG_RX_DESC_ALIGN_1BYTE)
arge_rx_align = 1;
/* Create parent DMA tag. */
error = bus_dma_tag_create(
@ -1775,7 +1836,7 @@ arge_dma_alloc(struct arge_softc *sc)
/* Create tag for Tx buffers. */
error = bus_dma_tag_create(
sc->arge_cdata.arge_parent_tag, /* parent */
sizeof(uint32_t), 0, /* alignment, boundary */
arge_tx_align, 0, /* alignment, boundary */
BUS_SPACE_MAXADDR, /* lowaddr */
BUS_SPACE_MAXADDR, /* highaddr */
NULL, NULL, /* filter, filterarg */
@ -1793,7 +1854,7 @@ arge_dma_alloc(struct arge_softc *sc)
/* Create tag for Rx buffers. */
error = bus_dma_tag_create(
sc->arge_cdata.arge_parent_tag, /* parent */
ARGE_RX_ALIGN, 0, /* alignment, boundary */
arge_rx_align, 0, /* alignment, boundary */
BUS_SPACE_MAXADDR, /* lowaddr */
BUS_SPACE_MAXADDR, /* highaddr */
NULL, NULL, /* filter, filterarg */
@ -2108,6 +2169,11 @@ arge_newbuf(struct arge_softc *sc, int idx)
if (m == NULL)
return (ENOBUFS);
m->m_len = m->m_pkthdr.len = MCLBYTES;
/*
* Add extra space to "adjust" (copy) the packet back to be aligned
* for purposes of IPv4/IPv6 header contents.
*/
m_adj(m, sizeof(uint64_t));
if (bus_dmamap_load_mbuf_sg(sc->arge_cdata.arge_rx_tag,
@ -2126,7 +2192,8 @@ arge_newbuf(struct arge_softc *sc, int idx)
sc->arge_cdata.arge_rx_sparemap = map;
rxd->rx_m = m;
desc = rxd->desc;
if (segs[0].ds_addr & 3)
if ((sc->arge_hw_flags & ARGE_HW_FLG_RX_DESC_ALIGN_4BYTE) &&
segs[0].ds_addr & 3)
panic("RX packet address unaligned");
desc->packet_addr = segs[0].ds_addr;
desc->packet_ctrl = ARGE_DESC_EMPTY | ARGE_DMASIZE(segs[0].ds_len);
@ -2331,10 +2398,12 @@ arge_intr_filter(void *arg)
if (status & DMA_INTR_ALL) {
sc->arge_intr_status |= status;
ARGE_WRITE(sc, AR71XX_DMA_INTR, 0);
sc->stats.intr_ok++;
return (FILTER_SCHEDULE_THREAD);
}
sc->arge_intr_status = 0;
sc->stats.intr_stray++;
return (FILTER_STRAY);
}
@ -2355,8 +2424,10 @@ arge_intr(void *arg)
/*
* Is it our interrupt at all?
*/
if (status == 0)
if (status == 0) {
sc->stats.intr_stray2++;
return;
}
if (status & DMA_INTR_RX_BUS_ERROR) {
ARGE_WRITE(sc, AR71XX_DMA_RX_STATUS, DMA_RX_STATUS_BUS_ERROR);

View File

@ -37,7 +37,10 @@
#define ARGE_TX_DMA_SIZE ARGE_TX_RING_COUNT * sizeof(struct arge_desc)
#define ARGE_MAXFRAGS 8
#define ARGE_RING_ALIGN sizeof(struct arge_desc)
#define ARGE_RX_ALIGN sizeof(uint32_t)
#define ARGE_RX_ALIGN_4BYTE sizeof(uint32_t)
#define ARGE_RX_ALIGN_1BYTE sizeof(char)
#define ARGE_TX_ALIGN_4BYTE sizeof(uint32_t)
#define ARGE_TX_ALIGN_1BYTE sizeof(char)
#define ARGE_MAXFRAGS 8
#define ARGE_TX_RING_ADDR(sc, i) \
((sc)->arge_rdata.arge_tx_ring_paddr + sizeof(struct arge_desc) * (i))
@ -149,6 +152,22 @@ struct arge_pll_data {
uint32_t pll_1000;
};
/*
* Hardware specific behaviours.
*/
/*
* Older chips support 4 byte only transmit and receive
* addresses.
*
* Later chips support arbitrary TX and later later,
* arbitrary RX addresses.
*/
#define ARGE_HW_FLG_TX_DESC_ALIGN_4BYTE 0x00000001
#define ARGE_HW_FLG_RX_DESC_ALIGN_4BYTE 0x00000002
#define ARGE_HW_FLG_TX_DESC_ALIGN_1BYTE 0x00000004
#define ARGE_HW_FLG_RX_DESC_ALIGN_1BYTE 0x00000008
struct arge_softc {
struct ifnet *arge_ifp; /* interface info */
device_t arge_dev;
@ -180,13 +199,20 @@ struct arge_softc {
uint32_t arge_intr_status;
int arge_mac_unit;
int arge_if_flags;
uint32_t arge_hw_flags;
uint32_t arge_debug;
uint32_t arge_mdiofreq;
struct {
uint32_t tx_pkts_unaligned;
uint32_t tx_pkts_unaligned_start;
uint32_t tx_pkts_unaligned_len;
uint32_t tx_pkts_nosegs;
uint32_t tx_pkts_aligned;
uint32_t rx_overflow;
uint32_t tx_underflow;
uint32_t intr_stray;
uint32_t intr_stray2;
uint32_t intr_ok;
} stats;
};