Do not use interrupt taskqueue on controllers with MSI/MSI-X

capability. One of the reasons for using the interrupt taskqueue in
re(4) was to reduce the number of TX/RX interrupts under load, because
re(4) controllers have no good TX/RX interrupt moderation mechanism.
Basic TX interrupt moderation is done by hardware for most
controllers, but RX interrupt moderation through an undocumented
register showed poor RX performance, so it was disabled in r215025.
Using a taskqueue to handle RX interrupts greatly reduced the number
of interrupts, but re(4) consumed all available CPU cycles to run the
taskqueue under high TX/RX network load.  This can happen even with
an RTL810x fast ethernet controller, and I believe this is not
acceptable for most systems.

To mitigate the issue, use the one-shot timer register to moderate RX
interrupts. The timer register provides a programmable one-shot timer
and can be used to suppress interrupt generation. The timer runs at
125MHz on PCIe controllers, so the minimum time allowed for the
timer is 8ns. The data sheet says the register is 32 bits wide, but
experimentation shows only the lower 13 bits are valid, so the maximum
time that can be programmed is 65.528us. This yields a theoretical
maximum of about 15260 RX interrupts per second. Combined with TX
completion interrupts, re(4) should generate fewer than 20k
interrupts per second. This number is still slightly high compared to
other intelligent ethernet controllers, but the system is very
responsive even under high network load.

Introduce a sysctl variable, dev.re.%d.int_rx_mod, that controls the
amount of time to delay RX interrupt processing, in units of us. A
value of 0 completely disables RX interrupt moderation. To provide the
old behavior for controllers that have MSI/MSI-X capability, introduce
a new tunable, hw.re.intr_filter. If the tunable is set to a non-zero
value, the driver will use the interrupt taskqueue. The default value
of the tunable is 0. This tunable has no effect on controllers that
have no MSI/MSI-X capability, or if MSI/MSI-X is explicitly disabled
by the administrator.

While I'm here, clean up interrupt setup/teardown since re(4) uses
a single MSI/MSI-X message at this moment.
This commit is contained in:
Pyun YongHyeon 2011-01-26 20:25:40 +00:00
parent a5c1afadeb
commit 502be0f749
2 changed files with 185 additions and 32 deletions

View File

@ -157,6 +157,8 @@ MODULE_DEPEND(re, miibus, 1, 1, 1);
#include "miibus_if.h"
/* Tunables. */
static int intr_filter = 0;
TUNABLE_INT("hw.re.intr_filter", &intr_filter);
static int msi_disable = 0;
TUNABLE_INT("hw.re.msi_disable", &msi_disable);
static int msix_disable = 0;
@ -253,6 +255,7 @@ static int re_poll (struct ifnet *, enum poll_cmd, int);
static int re_poll_locked (struct ifnet *, enum poll_cmd, int);
#endif
static int re_intr (void *);
static void re_intr_msi (void *);
static void re_tick (void *);
static void re_int_task (void *, int);
static void re_start (struct ifnet *);
@ -290,6 +293,8 @@ static int re_diag (struct rl_softc *);
static void re_add_sysctls (struct rl_softc *);
static int re_sysctl_stats (SYSCTL_HANDLER_ARGS);
static int sysctl_int_range (SYSCTL_HANDLER_ARGS, int, int);
static int sysctl_hw_re_int_mod (SYSCTL_HANDLER_ARGS);
static device_method_t re_methods[] = {
/* Device interface */
@ -1574,19 +1579,19 @@ re_attach(device_t dev)
}
#endif
#ifdef RE_TX_MODERATION
intr_filter = 1;
#endif
/* Hook interrupt last to avoid having to lock softc */
if ((sc->rl_flags & (RL_FLAG_MSI | RL_FLAG_MSIX)) == 0)
if ((sc->rl_flags & (RL_FLAG_MSI | RL_FLAG_MSIX)) != 0 &&
intr_filter == 0) {
error = bus_setup_intr(dev, sc->rl_irq[0],
INTR_TYPE_NET | INTR_MPSAFE, NULL, re_intr_msi, sc,
&sc->rl_intrhand[0]);
} else {
error = bus_setup_intr(dev, sc->rl_irq[0],
INTR_TYPE_NET | INTR_MPSAFE, re_intr, NULL, sc,
&sc->rl_intrhand[0]);
else {
for (i = 0; i < RL_MSI_MESSAGES; i++) {
error = bus_setup_intr(dev, sc->rl_irq[i],
INTR_TYPE_NET | INTR_MPSAFE, re_intr, NULL, sc,
&sc->rl_intrhand[i]);
if (error != 0)
break;
}
}
if (error) {
device_printf(dev, "couldn't set up irq\n");
@ -1657,31 +1662,22 @@ re_detach(device_t dev)
* stopped here.
*/
for (i = 0; i < RL_MSI_MESSAGES; i++) {
if (sc->rl_intrhand[i] != NULL) {
bus_teardown_intr(dev, sc->rl_irq[i],
sc->rl_intrhand[i]);
sc->rl_intrhand[i] = NULL;
}
if (sc->rl_intrhand[0] != NULL) {
bus_teardown_intr(dev, sc->rl_irq[0], sc->rl_intrhand[0]);
sc->rl_intrhand[0] = NULL;
}
if (ifp != NULL)
if_free(ifp);
if ((sc->rl_flags & (RL_FLAG_MSI | RL_FLAG_MSIX)) == 0) {
if (sc->rl_irq[0] != NULL) {
bus_release_resource(dev, SYS_RES_IRQ, 0,
sc->rl_irq[0]);
sc->rl_irq[0] = NULL;
}
} else {
for (i = 0, rid = 1; i < RL_MSI_MESSAGES; i++, rid++) {
if (sc->rl_irq[i] != NULL) {
bus_release_resource(dev, SYS_RES_IRQ, rid,
sc->rl_irq[i]);
sc->rl_irq[i] = NULL;
}
}
pci_release_msi(dev);
if ((sc->rl_flags & (RL_FLAG_MSI | RL_FLAG_MSIX)) == 0)
rid = 0;
else
rid = 1;
if (sc->rl_irq[0] != NULL) {
bus_release_resource(dev, SYS_RES_IRQ, rid, sc->rl_irq[0]);
sc->rl_irq[0] = NULL;
}
if ((sc->rl_flags & (RL_FLAG_MSI | RL_FLAG_MSIX)) != 0)
pci_release_msi(dev);
if (sc->rl_res_pba) {
rid = PCIR_BAR(4);
bus_release_resource(dev, SYS_RES_MEMORY, rid, sc->rl_res_pba);
@ -1970,6 +1966,7 @@ re_rx_list_init(struct rl_softc *sc)
sc->rl_ldata.rl_rx_prodidx = 0;
sc->rl_head = sc->rl_tail = NULL;
sc->rl_int_rx_act = 0;
return (0);
}
@ -1993,6 +1990,7 @@ re_jrx_list_init(struct rl_softc *sc)
sc->rl_ldata.rl_rx_prodidx = 0;
sc->rl_head = sc->rl_tail = NULL;
sc->rl_int_rx_act = 0;
return (0);
}
@ -2478,6 +2476,87 @@ re_int_task(void *arg, int npending)
CSR_WRITE_2(sc, RL_IMR, RL_INTRS_CPLUS);
}
/*
 * Interrupt handler installed by re_attach() when the controller uses
 * MSI/MSI-X and the hw.re.intr_filter tunable is 0.  All work is done
 * under the softc lock: RX completions, TX completions, system error
 * recovery and restart of pending transmits.
 *
 * RX interrupt moderation: after RX events are serviced with
 * rl_int_rx_mod != 0, the one-shot timer is re-armed and the RX
 * interrupt sources are left out of the mask written back to RL_IMR.
 * While rl_int_rx_act is set, RX status bits are ignored and RX is
 * only serviced when RL_ISR_TIMEOUT_EXPIRED is reported.
 *
 * xsc: the driver softc (struct rl_softc *) passed at registration.
 */
static void
re_intr_msi(void *xsc)
{
struct rl_softc *sc;
struct ifnet *ifp;
uint16_t intrs, status;
sc = xsc;
RL_LOCK(sc);
ifp = sc->rl_ifp;
#ifdef DEVICE_POLLING
/* Nothing to do here when polling is enabled on the interface. */
if (ifp->if_capenable & IFCAP_POLLING) {
RL_UNLOCK(sc);
return;
}
#endif
/* Disable interrupts. */
CSR_WRITE_2(sc, RL_IMR, 0);
/* Interface not running: return with the interrupt mask left at 0. */
if ((ifp->if_drv_flags & IFF_DRV_RUNNING) == 0) {
RL_UNLOCK(sc);
return;
}
/* intrs accumulates the mask that is written to RL_IMR on exit. */
intrs = RL_INTRS_CPLUS;
/*
 * Read the pending status and write the same bits back to RL_ISR
 * (presumably write-to-clear acknowledgement -- confirm with the
 * data sheet).
 */
status = CSR_READ_2(sc, RL_ISR);
CSR_WRITE_2(sc, RL_ISR, status);
/*
 * RX moderation is active: drop the RX sources from both the mask to
 * restore and the status being processed.
 */
if (sc->rl_int_rx_act > 0) {
intrs &= ~(RL_ISR_RX_OK | RL_ISR_RX_ERR | RL_ISR_FIFO_OFLOW |
RL_ISR_RX_OVERRUN);
status &= ~(RL_ISR_RX_OK | RL_ISR_RX_ERR | RL_ISR_FIFO_OFLOW |
RL_ISR_RX_OVERRUN);
}
/* Service RX on a timer expiry or on any (unsuppressed) RX event. */
if (status & (RL_ISR_TIMEOUT_EXPIRED | RL_ISR_RX_OK | RL_ISR_RX_ERR |
RL_ISR_FIFO_OFLOW | RL_ISR_RX_OVERRUN)) {
re_rxeof(sc, NULL);
/* The running flag is re-checked after re_rxeof(). */
if ((ifp->if_drv_flags & IFF_DRV_RUNNING) != 0) {
/*
 * Moderation enabled and this pass was caused by an RX event:
 * start the timer and keep RX sources masked.  Otherwise (the
 * timer expired, or moderation is disabled) re-enable the RX
 * sources and leave moderation mode.
 */
if (sc->rl_int_rx_mod != 0 &&
(status & (RL_ISR_RX_OK | RL_ISR_RX_ERR |
RL_ISR_FIFO_OFLOW | RL_ISR_RX_OVERRUN)) != 0) {
/* Rearm one-shot timer. */
CSR_WRITE_4(sc, RL_TIMERCNT, 1);
intrs &= ~(RL_ISR_RX_OK | RL_ISR_RX_ERR |
RL_ISR_FIFO_OFLOW | RL_ISR_RX_OVERRUN);
sc->rl_int_rx_act = 1;
} else {
intrs |= RL_ISR_RX_OK | RL_ISR_RX_ERR |
RL_ISR_FIFO_OFLOW | RL_ISR_RX_OVERRUN;
sc->rl_int_rx_act = 0;
}
}
}
/*
* Some chips will ignore a second TX request issued
* while an existing transmission is in progress. If
* the transmitter goes idle but there are still
* packets waiting to be sent, we need to restart the
* channel here to flush them out. This only seems to
* be required with the PCIe devices.
*/
if ((status & (RL_ISR_TX_OK | RL_ISR_TX_DESC_UNAVAIL)) &&
(sc->rl_flags & RL_FLAG_PCIE))
CSR_WRITE_1(sc, sc->rl_txstart, RL_TXSTART_START);
/* Process TX completions on any TX-related status bit. */
if (status & (RL_ISR_TX_OK | RL_ISR_TX_ERR | RL_ISR_TX_DESC_UNAVAIL))
re_txeof(sc);
/* System error: clear the running flag and reinitialize. */
if (status & RL_ISR_SYSTEM_ERR) {
ifp->if_drv_flags &= ~IFF_DRV_RUNNING;
re_init_locked(sc);
}
/*
 * If the interface is still running, push out queued frames and
 * restore the interrupt mask computed above.
 */
if ((ifp->if_drv_flags & IFF_DRV_RUNNING) != 0) {
if (!IFQ_DRV_IS_EMPTY(&ifp->if_snd))
re_start_locked(ifp);
CSR_WRITE_2(sc, RL_IMR, intrs);
}
RL_UNLOCK(sc);
}
static int
re_encap(struct rl_softc *sc, struct mbuf **m_head)
{
@ -3007,18 +3086,35 @@ re_init_locked(struct rl_softc *sc)
CSR_WRITE_1(sc, RL_COMMAND, RL_CMD_TX_ENB|RL_CMD_RX_ENB);
#endif
#ifdef RE_TX_MODERATION
/*
* Initialize the timer interrupt register so that
* a timer interrupt will be generated once the timer
* reaches a certain number of ticks. The timer is
* reloaded on each transmit. This gives us TX interrupt
* reloaded on each transmit.
*/
#ifdef RE_TX_MODERATION
/*
* Use timer interrupt register to moderate TX interrupts,
* which dramatically improves TX frame rate.
*/
if (sc->rl_type == RL_8169)
CSR_WRITE_4(sc, RL_TIMERINT_8169, 0x800);
else
CSR_WRITE_4(sc, RL_TIMERINT, 0x400);
#else
/*
* Use timer interrupt register to moderate RX
* interrupts.
*/
if ((sc->rl_flags & (RL_FLAG_MSI | RL_FLAG_MSIX)) != 0 &&
intr_filter == 0) {
if (sc->rl_type == RL_8169)
CSR_WRITE_4(sc, RL_TIMERINT_8169,
RL_USECS(sc->rl_int_rx_mod));
} else {
if (sc->rl_type == RL_8169)
CSR_WRITE_4(sc, RL_TIMERINT_8169, RL_USECS(0));
}
#endif
/*
@ -3535,6 +3631,7 @@ re_add_sysctls(struct rl_softc *sc)
{
struct sysctl_ctx_list *ctx;
struct sysctl_oid_list *children;
int error;
ctx = device_get_sysctl_ctx(sc->rl_dev);
children = SYSCTL_CHILDREN(device_get_sysctl_tree(sc->rl_dev));
@ -3542,6 +3639,26 @@ re_add_sysctls(struct rl_softc *sc)
SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "stats",
CTLTYPE_INT | CTLFLAG_RW, sc, 0, re_sysctl_stats, "I",
"Statistics Information");
if ((sc->rl_flags & (RL_FLAG_MSI | RL_FLAG_MSIX)) == 0)
return;
SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "int_rx_mod",
CTLTYPE_INT | CTLFLAG_RW, &sc->rl_int_rx_mod, 0,
sysctl_hw_re_int_mod, "I", "re RX interrupt moderation");
/* Pull in device tunables. */
sc->rl_int_rx_mod = RL_TIMER_DEFAULT;
error = resource_int_value(device_get_name(sc->rl_dev),
device_get_unit(sc->rl_dev), "int_rx_mod", &sc->rl_int_rx_mod);
if (error == 0) {
if (sc->rl_int_rx_mod < RL_TIMER_MIN ||
sc->rl_int_rx_mod > RL_TIMER_MAX) {
device_printf(sc->rl_dev, "int_rx_mod value out of "
"range; using default: %d\n",
RL_TIMER_DEFAULT);
sc->rl_int_rx_mod = RL_TIMER_DEFAULT;
}
}
}
static int
@ -3619,3 +3736,29 @@ re_sysctl_stats(SYSCTL_HANDLER_ARGS)
return (error);
}
/*
 * Sysctl helper for an int variable restricted to [low, high].
 *
 * arg1 must point at the int being exported.  The current value is
 * handed to sysctl_handle_int(); when the request carries a new value
 * it is stored only if it lies within the inclusive range.
 *
 * Returns 0 on success, EINVAL when arg1 is NULL or the new value is
 * out of range, or the error reported by sysctl_handle_int().
 */
static int
sysctl_int_range(SYSCTL_HANDLER_ARGS, int low, int high)
{
	int *target;
	int candidate, rc;

	target = (int *)arg1;
	if (target == NULL)
		return (EINVAL);

	/* Work on a copy so a rejected write never touches the variable. */
	candidate = *target;
	rc = sysctl_handle_int(oidp, &candidate, 0, req);
	if (rc != 0)
		return (rc);

	/* Read-only request: nothing to validate or store. */
	if (req->newptr == NULL)
		return (0);

	if (!(candidate >= low && candidate <= high))
		return (EINVAL);

	*target = candidate;
	return (0);
}
/*
 * Sysctl handler for the RX interrupt moderation value; accepts only
 * values within [RL_TIMER_MIN, RL_TIMER_MAX].
 */
static int
sysctl_hw_re_int_mod(SYSCTL_HANDLER_ARGS)
{
	int rc;

	rc = sysctl_int_range(oidp, arg1, arg2, req,
	    RL_TIMER_MIN, RL_TIMER_MAX);
	return (rc);
}

View File

@ -497,6 +497,14 @@
#define RL_EARLYTXTHRESH_CNT 0x003F /* byte count times 8 */
/* Timer interrupt register */
/*
 * 0x1FFF covers the low 13 bits; presumably the mask of valid timer
 * bits -- confirm against its users.
 */
#define RL_TIMERINT_8169_VAL 0x00001FFF
/*
 * Accepted range and default for the int_rx_mod setting (RX interrupt
 * moderation delay, in microseconds).  0 disables RX moderation.
 */
#define RL_TIMER_MIN 0
#define RL_TIMER_MAX 65 /* 65.528us */
#define RL_TIMER_DEFAULT RL_TIMER_MAX
/* Timer clock in MHz, i.e. timer ticks per microsecond. */
#define RL_TIMER_PCIE_CLK 125 /* 125MHZ */
/* Convert x microseconds into timer ticks. */
#define RL_USECS(x) ((x) * RL_TIMER_PCIE_CLK)
/*
* Gigabit PHY access register (8169 only)
*/
@ -896,6 +904,8 @@ struct rl_softc {
struct task rl_inttask;
int rl_txstart;
int rl_int_rx_act;
int rl_int_rx_mod;
uint32_t rl_flags;
#define RL_FLAG_MSI 0x0001
#define RL_FLAG_AUTOPAD 0x0002