iflib: add per-tx-queue netmap timer

The way netmap TX is handled in iflib when TX interrupts are not
used (IFC_NETMAP_TX_IRQ not set) has some issues:
  - The netmap_tx_irq() function gets called by iflib_timer(), which
    gets scheduled with tick granularity (hz). This is not frequent
    enough for 10Gbps NICs and beyond (e.g., ixgbe or ixl). The end
    result is that the transmitting netmap application is not woken
    up fast enough to saturate the link with small packets.
  - The iflib_timer() function also calls isc_txd_credits_update()
    to ask for more TX completion updates. However, this violates
    the netmap requirement that only txsync can access the TX queue
    for datapath operations. Only netmap_tx_irq() may be called
    outside of the txsync context.

This change introduces per-tx-queue netmap timers, using microsecond
granularity to ensure that netmap_tx_irq() can be called often enough
to allow for maximum packet rate. The timer routine simply calls
netmap_tx_irq() to wake up the netmap application. The latter will
wake up and call txsync to collect TX completion updates.

This change brings back line rate speed with small packets for ixgbe.
For the time being, timer expiration is hardcoded to 90 microseconds,
in order to avoid introducing a new sysctl.
We may eventually implement an adaptive expiration period or use another
deferred work mechanism in place of timers.

Also, fix the timer usage to make sure that each queue is serviced
by a different CPU.

PR:	248652
Reported by:	sg@efficientip.com
MFC after:	2 weeks
This commit is contained in:
Vincenzo Maffione 2020-10-27 21:53:33 +00:00
parent de7b5f1c52
commit 17cec474c0

View File

@ -346,6 +346,9 @@ struct iflib_txq {
qidx_t ift_size;
uint16_t ift_id;
struct callout ift_timer;
#ifdef DEV_NETMAP
struct callout ift_netmap_timer;
#endif /* DEV_NETMAP */
if_txsd_vec_t ift_sds;
uint8_t ift_qstatus;
@ -753,6 +756,7 @@ iflib_num_tx_descs(if_ctx_t ctx)
MODULE_DEPEND(iflib, netmap, 1, 1, 1);
static int netmap_fl_refill(iflib_rxq_t rxq, struct netmap_kring *kring, bool init);
static void iflib_netmap_timer(void *arg);
/*
* device-specific sysctl variables:
@ -918,6 +922,8 @@ netmap_fl_refill(iflib_rxq_t rxq, struct netmap_kring *kring, bool init)
return (0);
}
#define NETMAP_TX_TIMER_US 90
/*
* Reconcile kernel and user view of the transmit ring.
*
@ -1047,9 +1053,8 @@ iflib_netmap_txsync(struct netmap_kring *kring, int flags)
* Second part: reclaim buffers for completed transmissions.
*
* If there are unclaimed buffers, attempt to reclaim them.
* If none are reclaimed, and TX IRQs are not in use, do an initial
* minimal delay, then trigger the tx handler which will spin in the
* group task queue.
* If we don't manage to reclaim them all, and TX IRQs are not in use,
* trigger a per-tx-queue timer to try again later.
*/
if (kring->nr_hwtail != nm_prev(kring->nr_hwcur, lim)) {
if (iflib_tx_credits_update(ctx, txq)) {
@ -1058,11 +1063,13 @@ iflib_netmap_txsync(struct netmap_kring *kring, int flags)
kring->nr_hwtail = nm_prev(netmap_idx_n2k(kring, nic_i), lim);
}
}
if (!(ctx->ifc_flags & IFC_NETMAP_TX_IRQ))
if (kring->nr_hwtail != nm_prev(kring->nr_hwcur, lim)) {
callout_reset_on(&txq->ift_timer, hz < 2000 ? 1 : hz / 1000,
iflib_timer, txq, txq->ift_timer.c_cpu);
}
callout_reset_sbt(&txq->ift_netmap_timer,
NETMAP_TX_TIMER_US * SBT_1US, SBT_1US,
iflib_netmap_timer, txq, txq->ift_netmap_timer.c_cpu);
}
return (0);
}
@ -1263,28 +1270,16 @@ iflib_netmap_rxq_init(if_ctx_t ctx, iflib_rxq_t rxq)
}
static void
iflib_netmap_timer_adjust(if_ctx_t ctx, iflib_txq_t txq, uint32_t *reset_on)
iflib_netmap_timer(void *arg)
{
struct netmap_kring *kring;
uint16_t txqid;
iflib_txq_t txq = arg;
if_ctx_t ctx = txq->ift_ctx;
txqid = txq->ift_id;
kring = netmap_kring_on(NA(ctx->ifc_ifp), txqid, NR_TX);
if (kring == NULL)
return;
if (kring->nr_hwcur != nm_next(kring->nr_hwtail, kring->nkr_num_slots - 1)) {
bus_dmamap_sync(txq->ift_ifdi->idi_tag, txq->ift_ifdi->idi_map,
BUS_DMASYNC_POSTREAD);
if (ctx->isc_txd_credits_update(ctx->ifc_softc, txqid, false))
netmap_tx_irq(ctx->ifc_ifp, txqid);
if (!(ctx->ifc_flags & IFC_NETMAP_TX_IRQ)) {
if (hz < 2000)
*reset_on = 1;
else
*reset_on = hz / 1000;
}
}
/*
* Wake up the netmap application, to give it a chance to
* call txsync and reclaim more completed TX buffers.
*/
netmap_tx_irq(ctx->ifc_ifp, txq->ift_id);
}
#define iflib_netmap_detach(ifp) netmap_detach(ifp)
@ -1296,8 +1291,6 @@ iflib_netmap_timer_adjust(if_ctx_t ctx, iflib_txq_t txq, uint32_t *reset_on)
#define iflib_netmap_attach(ctx) (0)
#define netmap_rx_irq(ifp, qid, budget) (0)
#define netmap_tx_irq(ifp, qid) do {} while (0)
#define iflib_netmap_timer_adjust(ctx, txq, reset_on)
#endif
#if defined(__i386__) || defined(__amd64__)
@ -2287,7 +2280,6 @@ iflib_timer(void *arg)
if_ctx_t ctx = txq->ift_ctx;
if_softc_ctx_t sctx = &ctx->ifc_softc_ctx;
uint64_t this_tick = ticks;
uint32_t reset_on = hz / 2;
if (!(if_getdrvflags(ctx->ifc_ifp) & IFF_DRV_RUNNING))
return;
@ -2312,17 +2304,13 @@ iflib_timer(void *arg)
}
txq->ift_cleaned_prev = txq->ift_cleaned;
}
#ifdef DEV_NETMAP
if (if_getcapenable(ctx->ifc_ifp) & IFCAP_NETMAP)
iflib_netmap_timer_adjust(ctx, txq, &reset_on);
#endif
/* handle any laggards */
if (txq->ift_db_pending)
GROUPTASK_ENQUEUE(&txq->ift_task);
sctx->isc_pause_frames = 0;
if (if_getdrvflags(ctx->ifc_ifp) & IFF_DRV_RUNNING)
callout_reset_on(&txq->ift_timer, reset_on, iflib_timer, txq, txq->ift_timer.c_cpu);
callout_reset_on(&txq->ift_timer, hz / 2, iflib_timer, txq, txq->ift_timer.c_cpu);
return;
hung:
@ -2396,6 +2384,9 @@ iflib_init_locked(if_ctx_t ctx)
for (i = 0, txq = ctx->ifc_txqs; i < sctx->isc_ntxqsets; i++, txq++) {
CALLOUT_LOCK(txq);
callout_stop(&txq->ift_timer);
#ifdef DEV_NETMAP
callout_stop(&txq->ift_netmap_timer);
#endif /* DEV_NETMAP */
CALLOUT_UNLOCK(txq);
iflib_netmap_txq_init(ctx, txq);
}
@ -2485,6 +2476,9 @@ iflib_stop(if_ctx_t ctx)
CALLOUT_LOCK(txq);
callout_stop(&txq->ift_timer);
#ifdef DEV_NETMAP
callout_stop(&txq->ift_netmap_timer);
#endif /* DEV_NETMAP */
CALLOUT_UNLOCK(txq);
/* clean any enqueued buffers */
@ -3882,7 +3876,6 @@ _task_fn_admin(void *context)
iflib_txq_t txq;
int i;
bool oactive, running, do_reset, do_watchdog, in_detach;
uint32_t reset_on = hz / 2;
STATE_LOCK(ctx);
running = (if_getdrvflags(ctx->ifc_ifp) & IFF_DRV_RUNNING);
@ -3910,12 +3903,8 @@ _task_fn_admin(void *context)
}
IFDI_UPDATE_ADMIN_STATUS(ctx);
for (txq = ctx->ifc_txqs, i = 0; i < sctx->isc_ntxqsets; i++, txq++) {
#ifdef DEV_NETMAP
reset_on = hz / 2;
if (if_getcapenable(ctx->ifc_ifp) & IFCAP_NETMAP)
iflib_netmap_timer_adjust(ctx, txq, &reset_on);
#endif
callout_reset_on(&txq->ift_timer, reset_on, iflib_timer, txq, txq->ift_timer.c_cpu);
callout_reset_on(&txq->ift_timer, hz / 2, iflib_timer, txq,
txq->ift_timer.c_cpu);
}
IFDI_LINK_INTR_ENABLE(ctx);
if (do_reset)
@ -5088,6 +5077,9 @@ iflib_pseudo_deregister(if_ctx_t ctx)
tqg = qgroup_if_io_tqg;
for (txq = ctx->ifc_txqs, i = 0; i < NTXQSETS(ctx); i++, txq++) {
callout_drain(&txq->ift_timer);
#ifdef DEV_NETMAP
callout_drain(&txq->ift_netmap_timer);
#endif /* DEV_NETMAP */
if (txq->ift_task.gt_uniq != NULL)
taskqgroup_detach(tqg, &txq->ift_task);
}
@ -5174,6 +5166,9 @@ iflib_device_deregister(if_ctx_t ctx)
tqg = qgroup_if_io_tqg;
for (txq = ctx->ifc_txqs, i = 0; i < NTXQSETS(ctx); i++, txq++) {
callout_drain(&txq->ift_timer);
#ifdef DEV_NETMAP
callout_drain(&txq->ift_netmap_timer);
#endif /* DEV_NETMAP */
if (txq->ift_task.gt_uniq != NULL)
taskqgroup_detach(tqg, &txq->ift_task);
}
@ -5583,8 +5578,6 @@ iflib_queues_alloc(if_ctx_t ctx)
} else {
txq->ift_br_offset = 0;
}
/* XXX fix this */
txq->ift_timer.c_cpu = cpu;
if (iflib_txsd_alloc(txq)) {
device_printf(dev, "Critical Failure setting up TX buffers\n");
@ -5597,6 +5590,11 @@ iflib_queues_alloc(if_ctx_t ctx)
device_get_nameunit(dev), txq->ift_id);
mtx_init(&txq->ift_mtx, txq->ift_mtx_name, NULL, MTX_DEF);
callout_init_mtx(&txq->ift_timer, &txq->ift_mtx, 0);
txq->ift_timer.c_cpu = cpu;
#ifdef DEV_NETMAP
callout_init_mtx(&txq->ift_netmap_timer, &txq->ift_mtx, 0);
txq->ift_netmap_timer.c_cpu = cpu;
#endif /* DEV_NETMAP */
err = ifmp_ring_alloc(&txq->ift_br, 2048, txq, iflib_txq_drain,
iflib_txq_can_drain, M_IFLIB, M_WAITOK);