Implement TX completion event interleaving.

This patch implements a sysctl which allows setting a factor, N, for
how many work queue elements can be generated before a completion
event is required. When a completion event happens, the code processes
N completions instead of only one. When draining a transmit queue, at
most N-1 NOPs are transmitted to force generation of the final
completion event. Furthermore, a timer runs once every hz ticks to
flush any remaining data off the transmit queue whenever
tx_completion_fact > 1.

The goal of this feature is to reduce the PCI bandwidth needed when
transmitting data.

Sponsored by:	Mellanox Technologies
Tested by:	Netflix
MFC after:	1 week
Hans Petter Selasky 2016-05-20 06:54:58 +00:00
parent b1940deb18
commit 376bcf6331
4 changed files with 185 additions and 39 deletions
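
For context, here is a minimal, hypothetical stand-alone sketch of the
interleaving decision described above. The names (tx_cev,
tx_request_cqe) are illustration only; in the patch the counter and
factor live in struct mlx5e_sq and the decision is made by the new
mlx5e_do_send_cqe() helper in the transmit path.

/*
 * Illustration only: request a hardware completion event (CQE) for
 * every Nth work queue element instead of for every one.
 */
struct tx_cev {
	uint16_t counter;	/* WQEs posted since the last CQE request */
	uint16_t factor;	/* N; must be at least 1 */
};

static int
tx_request_cqe(struct tx_cev *cev)
{
	cev->counter++;
	if (cev->counter >= cev->factor) {
		cev->counter = 0;
		return (1);	/* set MLX5_WQE_CTRL_CQ_UPDATE in fm_ce_se */
	}
	return (0);		/* no completion event for this WQE */
}

When draining, at most factor - 1 NOPs are needed before this returns
nonzero again, which is why the new mlx5e_sq_send_nops_locked() below
loops until the counter reaches zero.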


@@ -391,6 +391,8 @@ struct mlx5e_params {
m(+1, u64 tx_coalesce_usecs, "tx_coalesce_usecs", "Limit in usec for joining tx packets") \
m(+1, u64 tx_coalesce_pkts, "tx_coalesce_pkts", "Maximum number of tx packets to join") \
m(+1, u64 tx_coalesce_mode, "tx_coalesce_mode", "0: EQE mode 1: CQE mode") \
m(+1, u64 tx_completion_fact, "tx_completion_fact", "1..MAX: Completion event ratio") \
m(+1, u64 tx_completion_fact_max, "tx_completion_fact_max", "Maximum completion event ratio") \
m(+1, u64 hw_lro, "hw_lro", "set to enable hw_lro") \
m(+1, u64 cqe_zipping, "cqe_zipping", "0 : CQE zipping disabled")
@@ -496,6 +498,13 @@ struct mlx5e_sq {
/* dirtied @xmit */
u16 pc __aligned(MLX5E_CACHELINE_SIZE);
u16 bf_offset;
u16 cev_counter; /* completion event counter */
u16 cev_factor; /* completion event factor */
u32 cev_next_state; /* next completion event state */
#define MLX5E_CEV_STATE_INITIAL 0 /* timer not started */
#define MLX5E_CEV_STATE_SEND_NOPS 1 /* send NOPs */
#define MLX5E_CEV_STATE_HOLD_NOPS 2 /* don't send NOPs yet */
struct callout cev_callout;
struct mlx5e_sq_stats stats;
struct mlx5e_cq cq;
@@ -787,6 +796,7 @@ void mlx5e_create_stats(struct sysctl_ctx_list *,
struct sysctl_oid_list *, const char *,
const char **, unsigned, u64 *);
void mlx5e_send_nop(struct mlx5e_sq *, u32, bool);
void mlx5e_sq_cev_timeout(void *);
int mlx5e_refresh_channel_params(struct mlx5e_priv *);
#endif /* _MLX5_EN_H_ */


@@ -48,6 +48,42 @@ mlx5e_create_stats(struct sysctl_ctx_list *ctx,
}
}
static void
mlx5e_ethtool_sync_tx_completion_fact(struct mlx5e_priv *priv)
{
/*
* Limit the maximum distance between completion events to
* half of the currently set TX queue size.
*
* The maximum number of queue entries a single IP packet can
* consume is given by MLX5_SEND_WQE_MAX_WQEBBS.
*
* The worst case max value is then given as below:
*/
uint64_t max = priv->params_ethtool.tx_queue_size /
(2 * MLX5_SEND_WQE_MAX_WQEBBS);
/*
* Update the maximum completion factor value in case the
* tx_queue_size field changed. Ensure we don't overflow
* 16-bits.
*/
if (max < 1)
max = 1;
else if (max > 65535)
max = 65535;
priv->params_ethtool.tx_completion_fact_max = max;
/*
* Verify that the current TX completion factor is within the
* given limits:
*/
if (priv->params_ethtool.tx_completion_fact < 1)
priv->params_ethtool.tx_completion_fact = 1;
else if (priv->params_ethtool.tx_completion_fact > max)
priv->params_ethtool.tx_completion_fact = max;
}
static int
mlx5e_ethtool_handler(SYSCTL_HANDLER_ARGS)
{
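
As a rough worked example of the clamp in
mlx5e_ethtool_sync_tx_completion_fact() above, here is a hypothetical
stand-alone version. The WQEBB limit of 16 is an assumption for
illustration; the real value comes from MLX5_SEND_WQE_MAX_WQEBBS in
the mlx5 headers, which this hunk does not show.

/*
 * Illustration only: with a 1024-entry TX queue and an assumed limit
 * of 16 WQEBBs per packet, the factor is capped at
 * 1024 / (2 * 16) = 32, so a completion event is requested at least
 * once every 32 packets and, in the worst case, at most half of the
 * queue is outstanding between events.
 */
static uint64_t
tx_completion_fact_max(uint64_t tx_queue_size)
{
	uint64_t max = tx_queue_size /
	    (2 * 16 /* assumed MLX5_SEND_WQE_MAX_WQEBBS */);

	if (max < 1)
		max = 1;
	else if (max > 65535)
		max = 65535;
	return (max);
}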
@@ -206,6 +242,14 @@ mlx5e_ethtool_handler(SYSCTL_HANDLER_ARGS)
priv->params_ethtool.cqe_zipping = 0;
}
}
if (&priv->params_ethtool.arg[arg2] ==
&priv->params_ethtool.tx_completion_fact ||
&priv->params_ethtool.arg[arg2] ==
&priv->params_ethtool.tx_queue_size) {
/* verify parameter */
mlx5e_ethtool_sync_tx_completion_fact(priv);
}
if (was_opened)
mlx5e_open_locked(priv->ifp);
done:
@@ -475,6 +519,7 @@ mlx5e_create_ethtool(struct mlx5e_priv *priv)
priv->params_ethtool.tx_coalesce_pkts = priv->params.tx_cq_moderation_pkts;
priv->params_ethtool.hw_lro = priv->params.hw_lro_en;
priv->params_ethtool.cqe_zipping = priv->params.cqe_zipping_en;
mlx5e_ethtool_sync_tx_completion_fact(priv);
/* create root node */
node = SYSCTL_ADD_NODE(&priv->sysctl_ctx,


@@ -1185,24 +1185,82 @@ err_destroy_sq:
}
static void
mlx5e_close_sq(struct mlx5e_sq *sq)
mlx5e_sq_send_nops_locked(struct mlx5e_sq *sq, int can_sleep)
{
/* ensure hw is notified of all pending wqes */
if (mlx5e_sq_has_room_for(sq, 1))
/* fill up remainder with NOPs */
while (sq->cev_counter != 0) {
while (!mlx5e_sq_has_room_for(sq, 1)) {
if (can_sleep != 0) {
mtx_unlock(&sq->lock);
msleep(4);
mtx_lock(&sq->lock);
} else {
goto done;
}
}
mlx5e_send_nop(sq, 1, true);
}
done:
return;
}
mlx5e_modify_sq(sq, MLX5_SQC_STATE_RDY, MLX5_SQC_STATE_ERR);
void
mlx5e_sq_cev_timeout(void *arg)
{
struct mlx5e_sq *sq = arg;
mtx_assert(&sq->lock, MA_OWNED);
/* check next state */
switch (sq->cev_next_state) {
case MLX5E_CEV_STATE_SEND_NOPS:
/* fill TX ring with NOPs, if any */
mlx5e_sq_send_nops_locked(sq, 0);
/* check if completed */
if (sq->cev_counter == 0) {
sq->cev_next_state = MLX5E_CEV_STATE_INITIAL;
return;
}
break;
default:
/* send NOPs on next timeout */
sq->cev_next_state = MLX5E_CEV_STATE_SEND_NOPS;
break;
}
/* restart timer */
callout_reset_curcpu(&sq->cev_callout, hz, mlx5e_sq_cev_timeout, sq);
}
static void
mlx5e_close_sq_wait(struct mlx5e_sq *sq)
{
mtx_lock(&sq->lock);
/* teardown event factor timer, if any */
sq->cev_next_state = MLX5E_CEV_STATE_HOLD_NOPS;
callout_stop(&sq->cev_callout);
/* send dummy NOPs in order to flush the transmit ring */
mlx5e_sq_send_nops_locked(sq, 1);
mtx_unlock(&sq->lock);
/* make sure it is safe to free the callout */
callout_drain(&sq->cev_callout);
/* error out remaining requests */
mlx5e_modify_sq(sq, MLX5_SQC_STATE_RDY, MLX5_SQC_STATE_ERR);
/* wait till SQ is empty */
mtx_lock(&sq->lock);
while (sq->cc != sq->pc) {
mtx_unlock(&sq->lock);
msleep(4);
sq->cq.mcq.comp(&sq->cq.mcq);
mtx_lock(&sq->lock);
}
mtx_unlock(&sq->lock);
mlx5e_disable_sq(sq);
mlx5e_destroy_sq(sq);
@@ -1412,23 +1470,12 @@ mlx5e_open_sqs(struct mlx5e_channel *c,
return (0);
err_close_sqs:
for (tc--; tc >= 0; tc--) {
mlx5e_close_sq(&c->sq[tc]);
for (tc--; tc >= 0; tc--)
mlx5e_close_sq_wait(&c->sq[tc]);
}
return (err);
}
static void
mlx5e_close_sqs(struct mlx5e_channel *c)
{
int tc;
for (tc = 0; tc < c->num_tc; tc++)
mlx5e_close_sq(&c->sq[tc]);
}
static void
mlx5e_close_sqs_wait(struct mlx5e_channel *c)
{
@@ -1446,9 +1493,19 @@ mlx5e_chan_mtx_init(struct mlx5e_channel *c)
mtx_init(&c->rq.mtx, "mlx5rx", MTX_NETWORK_LOCK, MTX_DEF);
for (tc = 0; tc < c->num_tc; tc++) {
mtx_init(&c->sq[tc].lock, "mlx5tx", MTX_NETWORK_LOCK, MTX_DEF);
mtx_init(&c->sq[tc].comp_lock, "mlx5comp", MTX_NETWORK_LOCK,
struct mlx5e_sq *sq = c->sq + tc;
mtx_init(&sq->lock, "mlx5tx", MTX_NETWORK_LOCK, MTX_DEF);
mtx_init(&sq->comp_lock, "mlx5comp", MTX_NETWORK_LOCK,
MTX_DEF);
callout_init_mtx(&sq->cev_callout, &sq->lock, 0);
sq->cev_factor = c->priv->params_ethtool.tx_completion_fact;
/* ensure the TX completion event factor is not zero */
if (sq->cev_factor == 0)
sq->cev_factor = 1;
}
}
@@ -1529,7 +1586,6 @@ mlx5e_open_channel(struct mlx5e_priv *priv, int ix,
return (0);
err_close_sqs:
mlx5e_close_sqs(c);
mlx5e_close_sqs_wait(c);
err_close_rx_cq:
@@ -1554,7 +1610,6 @@ mlx5e_close_channel(struct mlx5e_channel *volatile *pp)
if (c == NULL)
return;
mlx5e_close_rq(&c->rq);
mlx5e_close_sqs(c);
}
static void


@@ -28,6 +28,18 @@
#include "en.h"
#include <machine/atomic.h>
static inline bool
mlx5e_do_send_cqe(struct mlx5e_sq *sq)
{
sq->cev_counter++;
/* interleave the CQEs */
if (sq->cev_counter >= sq->cev_factor) {
sq->cev_counter = 0;
return (1);
}
return (0);
}
void
mlx5e_send_nop(struct mlx5e_sq *sq, u32 ds_cnt, bool notify_hw)
{
@@ -38,7 +50,10 @@ mlx5e_send_nop(struct mlx5e_sq *sq, u32 ds_cnt, bool notify_hw)
wqe->ctrl.opmod_idx_opcode = cpu_to_be32((sq->pc << 8) | MLX5_OPCODE_NOP);
wqe->ctrl.qpn_ds = cpu_to_be32((sq->sqn << 8) | ds_cnt);
wqe->ctrl.fm_ce_se = MLX5_WQE_CTRL_CQ_UPDATE;
if (mlx5e_do_send_cqe(sq))
wqe->ctrl.fm_ce_se = MLX5_WQE_CTRL_CQ_UPDATE;
else
wqe->ctrl.fm_ce_se = 0;
sq->mbuf[pi].mbuf = NULL;
sq->mbuf[pi].num_bytes = 0;
@@ -340,7 +355,10 @@ skip_dma:
wqe->ctrl.opmod_idx_opcode = cpu_to_be32((sq->pc << 8) | opcode);
wqe->ctrl.qpn_ds = cpu_to_be32((sq->sqn << 8) | ds_cnt);
wqe->ctrl.fm_ce_se = MLX5_WQE_CTRL_CQ_UPDATE;
if (mlx5e_do_send_cqe(sq))
wqe->ctrl.fm_ce_se = MLX5_WQE_CTRL_CQ_UPDATE;
else
wqe->ctrl.fm_ce_se = 0;
/* Store pointer to mbuf */
sq->mbuf[pi].mbuf = mb;
@@ -374,9 +392,10 @@ mlx5e_poll_tx_cq(struct mlx5e_sq *sq, int budget)
*/
sqcc = sq->cc;
while (budget--) {
while (budget > 0) {
struct mlx5_cqe64 *cqe;
struct mbuf *mb;
u16 x;
u16 ci;
cqe = mlx5e_get_cqe(&sq->cq);
@@ -385,24 +404,29 @@ mlx5e_poll_tx_cq(struct mlx5e_sq *sq, int budget)
mlx5_cqwq_pop(&sq->cq.wq);
ci = sqcc & sq->wq.sz_m1;
mb = sq->mbuf[ci].mbuf;
sq->mbuf[ci].mbuf = NULL; /* Safety clear */
/* update budget according to the event factor */
budget -= sq->cev_factor;
if (mb == NULL) {
if (sq->mbuf[ci].num_bytes == 0) {
/* NOP */
sq->stats.nop++;
for (x = 0; x != sq->cev_factor; x++) {
ci = sqcc & sq->wq.sz_m1;
mb = sq->mbuf[ci].mbuf;
sq->mbuf[ci].mbuf = NULL; /* Safety clear */
if (mb == NULL) {
if (sq->mbuf[ci].num_bytes == 0) {
/* NOP */
sq->stats.nop++;
}
} else {
bus_dmamap_sync(sq->dma_tag, sq->mbuf[ci].dma_map,
BUS_DMASYNC_POSTWRITE);
bus_dmamap_unload(sq->dma_tag, sq->mbuf[ci].dma_map);
/* Free transmitted mbuf */
m_freem(mb);
}
} else {
bus_dmamap_sync(sq->dma_tag, sq->mbuf[ci].dma_map,
BUS_DMASYNC_POSTWRITE);
bus_dmamap_unload(sq->dma_tag, sq->mbuf[ci].dma_map);
/* Free transmitted mbuf */
m_freem(mb);
sqcc += sq->mbuf[ci].num_wqebbs;
}
sqcc += sq->mbuf[ci].num_wqebbs;
}
mlx5_cqwq_update_db_record(&sq->cq.wq);
@@ -450,6 +474,18 @@ mlx5e_xmit_locked(struct ifnet *ifp, struct mlx5e_sq *sq, struct mbuf *mb)
if ((ifp->if_drv_flags & IFF_DRV_RUNNING) == 0)
break;
}
/*
* Check if we need to start the event timer which flushes the
* transmit ring on timeout:
*/
if (unlikely(sq->cev_next_state == MLX5E_CEV_STATE_INITIAL &&
sq->cev_factor != 1)) {
/* start the timer */
mlx5e_sq_cev_timeout(sq);
} else {
/* don't send NOPs yet */
sq->cev_next_state = MLX5E_CEV_STATE_HOLD_NOPS;
}
return (err);
}