if_arge: fix up TX workaround; add TX/RX requirements for busdma; add stats
The early ethernet MACs (I think AR71xx and AR913x) require that both TX and RX buffers be 4-byte aligned for all packets. The later MACs have started relaxing the requirements. For now, the 1-byte TX and 1-byte RX alignment requirements are only for the QCA955x SoCs. I'll add in the relaxed requirements as I review the datasheets and do testing. * Add a hardware flags field and 1-byte / 4-byte TX/RX alignment. * .. defaulting to 4-byte TX and 4-byte RX alignment. * Only enforce the TX alignment fixup if the hardware requires a 4-byte TX alignment. This avoids a call to m_defrag(). * Add counters for various situations for further debugging. * Set the 1-byte and 4-byte busdma alignment requirement when the tag is created. This improves the straight bridging performance from 130mbit/sec to 180mbit/sec, purely by removing the need for TX path bounce buffers. The main performance issue is the RX alignment requirement and any RX bounce buffering that's occurring. (In a local test, removing the RX fixup path and just aligning buffers raises the performance to above 400mbit/sec.) In theory this change is a no-op for SoCs before the QCA955x. Tested: * QCA9558 SoC in AP135 board, using software bridging between arge0/arge1.
This commit is contained in:
parent
46839d1247
commit
9919dec83c
@ -298,6 +298,29 @@ arge_attach_sysctl(device_t dev)
|
||||
"tx_pkts_unaligned", CTLFLAG_RW, &sc->stats.tx_pkts_unaligned,
|
||||
0, "number of TX unaligned packets");
|
||||
|
||||
SYSCTL_ADD_UINT(ctx, SYSCTL_CHILDREN(tree), OID_AUTO,
|
||||
"tx_pkts_unaligned_start", CTLFLAG_RW, &sc->stats.tx_pkts_unaligned_start,
|
||||
0, "number of TX unaligned packets (start)");
|
||||
|
||||
SYSCTL_ADD_UINT(ctx, SYSCTL_CHILDREN(tree), OID_AUTO,
|
||||
"tx_pkts_unaligned_len", CTLFLAG_RW, &sc->stats.tx_pkts_unaligned_len,
|
||||
0, "number of TX unaligned packets (len)");
|
||||
|
||||
SYSCTL_ADD_UINT(ctx, SYSCTL_CHILDREN(tree), OID_AUTO,
|
||||
"tx_pkts_nosegs", CTLFLAG_RW, &sc->stats.tx_pkts_nosegs,
|
||||
0, "number of TX packets fail with no ring slots avail");
|
||||
|
||||
SYSCTL_ADD_UINT(ctx, SYSCTL_CHILDREN(tree), OID_AUTO,
|
||||
"intr_stray_filter", CTLFLAG_RW, &sc->stats.intr_stray,
|
||||
0, "number of stray interrupts (filter)");
|
||||
|
||||
SYSCTL_ADD_UINT(ctx, SYSCTL_CHILDREN(tree), OID_AUTO,
|
||||
"intr_stray_intr", CTLFLAG_RW, &sc->stats.intr_stray2,
|
||||
0, "number of stray interrupts (intr)");
|
||||
|
||||
SYSCTL_ADD_UINT(ctx, SYSCTL_CHILDREN(tree), OID_AUTO,
|
||||
"intr_ok", CTLFLAG_RW, &sc->stats.intr_ok,
|
||||
0, "number of OK interrupts");
|
||||
#ifdef ARGE_DEBUG
|
||||
SYSCTL_ADD_UINT(ctx, SYSCTL_CHILDREN(tree), OID_AUTO, "tx_prod",
|
||||
CTLFLAG_RW, &sc->arge_cdata.arge_tx_prod, 0, "");
|
||||
@ -626,6 +649,22 @@ arge_attach(device_t dev)
|
||||
local_macstr = NULL;
|
||||
}
|
||||
|
||||
/*
|
||||
* Hardware workarounds.
|
||||
*/
|
||||
switch (ar71xx_soc) {
|
||||
case AR71XX_SOC_QCA9556:
|
||||
case AR71XX_SOC_QCA9558:
|
||||
/* Arbitrary alignment */
|
||||
sc->arge_hw_flags |= ARGE_HW_FLG_TX_DESC_ALIGN_1BYTE;
|
||||
sc->arge_hw_flags |= ARGE_HW_FLG_RX_DESC_ALIGN_1BYTE;
|
||||
break;
|
||||
default:
|
||||
sc->arge_hw_flags |= ARGE_HW_FLG_TX_DESC_ALIGN_4BYTE;
|
||||
sc->arge_hw_flags |= ARGE_HW_FLG_RX_DESC_ALIGN_4BYTE;
|
||||
break;
|
||||
}
|
||||
|
||||
/*
|
||||
* Some units (eg the TP-Link WR-1043ND) do not have a convenient
|
||||
* EEPROM location to read the ethernet MAC address from.
|
||||
@ -825,6 +864,9 @@ arge_attach(device_t dev)
|
||||
ARGE_WRITE(sc, AR71XX_MAC_FIFO_CFG0,
|
||||
FIFO_CFG0_ALL << FIFO_CFG0_ENABLE_SHIFT);
|
||||
|
||||
/*
|
||||
* SoC specific bits.
|
||||
*/
|
||||
switch (ar71xx_soc) {
|
||||
case AR71XX_SOC_AR7240:
|
||||
case AR71XX_SOC_AR7241:
|
||||
@ -1351,24 +1393,35 @@ arge_init_locked(struct arge_softc *sc)
|
||||
* Return whether the mbuf chain is correctly aligned
|
||||
* for the arge TX engine.
|
||||
*
|
||||
* The TX engine requires each fragment to be aligned to a
|
||||
* 4 byte boundary and the size of each fragment except
|
||||
* the last to be a multiple of 4 bytes.
|
||||
* All the MACs have a length requirement: any non-final
|
||||
* fragment (ie, descriptor with MORE bit set) needs to have
|
||||
* a length divisible by 4.
|
||||
*
|
||||
* XXX TODO: I believe this is only a bug on the AR71xx and
|
||||
* AR913x MACs. The later MACs (AR724x and later) does not
|
||||
* need this workaround.
|
||||
* The AR71xx, AR913x require the start address also be
|
||||
* DWORD aligned. The later MACs don't.
|
||||
*/
|
||||
static int
|
||||
arge_mbuf_chain_is_tx_aligned(struct mbuf *m0)
|
||||
arge_mbuf_chain_is_tx_aligned(struct arge_softc *sc, struct mbuf *m0)
|
||||
{
|
||||
struct mbuf *m;
|
||||
|
||||
for (m = m0; m != NULL; m = m->m_next) {
|
||||
if((mtod(m, intptr_t) & 3) != 0)
|
||||
/*
|
||||
* Only do this for chips that require it.
|
||||
*/
|
||||
if ((sc->arge_hw_flags & ARGE_HW_FLG_TX_DESC_ALIGN_4BYTE) &&
|
||||
(mtod(m, intptr_t) & 3) != 0) {
|
||||
sc->stats.tx_pkts_unaligned_start++;
|
||||
return 0;
|
||||
if ((m->m_next != NULL) && ((m->m_len & 0x03) != 0))
|
||||
}
|
||||
|
||||
/*
|
||||
* All chips have this requirement for length.
|
||||
*/
|
||||
if ((m->m_next != NULL) && ((m->m_len & 0x03) != 0)) {
|
||||
sc->stats.tx_pkts_unaligned_len++;
|
||||
return 0;
|
||||
}
|
||||
}
|
||||
return 1;
|
||||
}
|
||||
@ -1389,15 +1442,10 @@ arge_encap(struct arge_softc *sc, struct mbuf **m_head)
|
||||
ARGE_LOCK_ASSERT(sc);
|
||||
|
||||
/*
|
||||
* Fix mbuf chain, all fragments should be 4 bytes aligned and
|
||||
* even 4 bytes
|
||||
*
|
||||
* XXX TODO: I believe this is only a bug on the AR71xx and
|
||||
* AR913x MACs. The later MACs (AR724x and later) does not
|
||||
* need this workaround.
|
||||
* Fix mbuf chain based on hardware alignment constraints.
|
||||
*/
|
||||
m = *m_head;
|
||||
if (! arge_mbuf_chain_is_tx_aligned(m)) {
|
||||
if (! arge_mbuf_chain_is_tx_aligned(sc, m)) {
|
||||
sc->stats.tx_pkts_unaligned++;
|
||||
m = m_defrag(*m_head, M_NOWAIT);
|
||||
if (m == NULL) {
|
||||
@ -1427,6 +1475,7 @@ arge_encap(struct arge_softc *sc, struct mbuf **m_head)
|
||||
/* Check number of available descriptors. */
|
||||
if (sc->arge_cdata.arge_tx_cnt + nsegs >= (ARGE_TX_RING_COUNT - 1)) {
|
||||
bus_dmamap_unload(sc->arge_cdata.arge_tx_tag, txd->tx_dmamap);
|
||||
sc->stats.tx_pkts_nosegs++;
|
||||
return (ENOBUFS);
|
||||
}
|
||||
|
||||
@ -1444,7 +1493,9 @@ arge_encap(struct arge_softc *sc, struct mbuf **m_head)
|
||||
desc = &sc->arge_rdata.arge_tx_ring[prod];
|
||||
desc->packet_ctrl = ARGE_DMASIZE(txsegs[i].ds_len);
|
||||
|
||||
if (txsegs[i].ds_addr & 3)
|
||||
/* XXX Note: only relevant for older MACs; but check length! */
|
||||
if ((sc->arge_hw_flags & ARGE_HW_FLG_TX_DESC_ALIGN_4BYTE) &&
|
||||
(txsegs[i].ds_addr & 3))
|
||||
panic("TX packet address unaligned\n");
|
||||
|
||||
desc->packet_addr = txsegs[i].ds_addr;
|
||||
@ -1715,6 +1766,16 @@ arge_dma_alloc(struct arge_softc *sc)
|
||||
struct arge_txdesc *txd;
|
||||
struct arge_rxdesc *rxd;
|
||||
int error, i;
|
||||
int arge_tx_align, arge_rx_align;
|
||||
|
||||
/* Assume 4 byte alignment by default */
|
||||
arge_tx_align = 4;
|
||||
arge_rx_align = 4;
|
||||
|
||||
if (sc->arge_hw_flags & ARGE_HW_FLG_TX_DESC_ALIGN_1BYTE)
|
||||
arge_tx_align = 1;
|
||||
if (sc->arge_hw_flags & ARGE_HW_FLG_RX_DESC_ALIGN_1BYTE)
|
||||
arge_rx_align = 1;
|
||||
|
||||
/* Create parent DMA tag. */
|
||||
error = bus_dma_tag_create(
|
||||
@ -1775,7 +1836,7 @@ arge_dma_alloc(struct arge_softc *sc)
|
||||
/* Create tag for Tx buffers. */
|
||||
error = bus_dma_tag_create(
|
||||
sc->arge_cdata.arge_parent_tag, /* parent */
|
||||
sizeof(uint32_t), 0, /* alignment, boundary */
|
||||
arge_tx_align, 0, /* alignment, boundary */
|
||||
BUS_SPACE_MAXADDR, /* lowaddr */
|
||||
BUS_SPACE_MAXADDR, /* highaddr */
|
||||
NULL, NULL, /* filter, filterarg */
|
||||
@ -1793,7 +1854,7 @@ arge_dma_alloc(struct arge_softc *sc)
|
||||
/* Create tag for Rx buffers. */
|
||||
error = bus_dma_tag_create(
|
||||
sc->arge_cdata.arge_parent_tag, /* parent */
|
||||
ARGE_RX_ALIGN, 0, /* alignment, boundary */
|
||||
arge_rx_align, 0, /* alignment, boundary */
|
||||
BUS_SPACE_MAXADDR, /* lowaddr */
|
||||
BUS_SPACE_MAXADDR, /* highaddr */
|
||||
NULL, NULL, /* filter, filterarg */
|
||||
@ -2108,6 +2169,11 @@ arge_newbuf(struct arge_softc *sc, int idx)
|
||||
if (m == NULL)
|
||||
return (ENOBUFS);
|
||||
m->m_len = m->m_pkthdr.len = MCLBYTES;
|
||||
|
||||
/*
|
||||
* Add extra space to "adjust" (copy) the packet back to be aligned
|
||||
* for purposes of IPv4/IPv6 header contents.
|
||||
*/
|
||||
m_adj(m, sizeof(uint64_t));
|
||||
|
||||
if (bus_dmamap_load_mbuf_sg(sc->arge_cdata.arge_rx_tag,
|
||||
@ -2126,7 +2192,8 @@ arge_newbuf(struct arge_softc *sc, int idx)
|
||||
sc->arge_cdata.arge_rx_sparemap = map;
|
||||
rxd->rx_m = m;
|
||||
desc = rxd->desc;
|
||||
if (segs[0].ds_addr & 3)
|
||||
if ((sc->arge_hw_flags & ARGE_HW_FLG_RX_DESC_ALIGN_4BYTE) &&
|
||||
segs[0].ds_addr & 3)
|
||||
panic("RX packet address unaligned");
|
||||
desc->packet_addr = segs[0].ds_addr;
|
||||
desc->packet_ctrl = ARGE_DESC_EMPTY | ARGE_DMASIZE(segs[0].ds_len);
|
||||
@ -2331,10 +2398,12 @@ arge_intr_filter(void *arg)
|
||||
if (status & DMA_INTR_ALL) {
|
||||
sc->arge_intr_status |= status;
|
||||
ARGE_WRITE(sc, AR71XX_DMA_INTR, 0);
|
||||
sc->stats.intr_ok++;
|
||||
return (FILTER_SCHEDULE_THREAD);
|
||||
}
|
||||
|
||||
sc->arge_intr_status = 0;
|
||||
sc->stats.intr_stray++;
|
||||
return (FILTER_STRAY);
|
||||
}
|
||||
|
||||
@ -2355,8 +2424,10 @@ arge_intr(void *arg)
|
||||
/*
|
||||
* Is it our interrupt at all?
|
||||
*/
|
||||
if (status == 0)
|
||||
if (status == 0) {
|
||||
sc->stats.intr_stray2++;
|
||||
return;
|
||||
}
|
||||
|
||||
if (status & DMA_INTR_RX_BUS_ERROR) {
|
||||
ARGE_WRITE(sc, AR71XX_DMA_RX_STATUS, DMA_RX_STATUS_BUS_ERROR);
|
||||
|
@ -37,7 +37,10 @@
|
||||
#define ARGE_TX_DMA_SIZE ARGE_TX_RING_COUNT * sizeof(struct arge_desc)
|
||||
#define ARGE_MAXFRAGS 8
|
||||
#define ARGE_RING_ALIGN sizeof(struct arge_desc)
|
||||
#define ARGE_RX_ALIGN sizeof(uint32_t)
|
||||
#define ARGE_RX_ALIGN_4BYTE sizeof(uint32_t)
|
||||
#define ARGE_RX_ALIGN_1BYTE sizeof(char)
|
||||
#define ARGE_TX_ALIGN_4BYTE sizeof(uint32_t)
|
||||
#define ARGE_TX_ALIGN_1BYTE sizeof(char)
|
||||
#define ARGE_MAXFRAGS 8
|
||||
#define ARGE_TX_RING_ADDR(sc, i) \
|
||||
((sc)->arge_rdata.arge_tx_ring_paddr + sizeof(struct arge_desc) * (i))
|
||||
@ -149,6 +152,22 @@ struct arge_pll_data {
|
||||
uint32_t pll_1000;
|
||||
};
|
||||
|
||||
/*
|
||||
* Hardware specific behaviours.
|
||||
*/
|
||||
|
||||
/*
|
||||
* Older chips support 4 byte only transmit and receive
|
||||
* addresses.
|
||||
*
|
||||
* Later chips support arbitrary TX and, later still,
|
||||
* arbitrary RX addresses.
|
||||
*/
|
||||
#define ARGE_HW_FLG_TX_DESC_ALIGN_4BYTE 0x00000001
|
||||
#define ARGE_HW_FLG_RX_DESC_ALIGN_4BYTE 0x00000002
|
||||
#define ARGE_HW_FLG_TX_DESC_ALIGN_1BYTE 0x00000004
|
||||
#define ARGE_HW_FLG_RX_DESC_ALIGN_1BYTE 0x00000008
|
||||
|
||||
struct arge_softc {
|
||||
struct ifnet *arge_ifp; /* interface info */
|
||||
device_t arge_dev;
|
||||
@ -180,13 +199,20 @@ struct arge_softc {
|
||||
uint32_t arge_intr_status;
|
||||
int arge_mac_unit;
|
||||
int arge_if_flags;
|
||||
uint32_t arge_hw_flags;
|
||||
uint32_t arge_debug;
|
||||
uint32_t arge_mdiofreq;
|
||||
struct {
|
||||
uint32_t tx_pkts_unaligned;
|
||||
uint32_t tx_pkts_unaligned_start;
|
||||
uint32_t tx_pkts_unaligned_len;
|
||||
uint32_t tx_pkts_nosegs;
|
||||
uint32_t tx_pkts_aligned;
|
||||
uint32_t rx_overflow;
|
||||
uint32_t tx_underflow;
|
||||
uint32_t intr_stray;
|
||||
uint32_t intr_stray2;
|
||||
uint32_t intr_ok;
|
||||
} stats;
|
||||
};
|
||||
|
||||
|
Loading…
Reference in New Issue
Block a user