Implement TX completion event interleaving.
This patch implements a sysctl which allows setting a factor, N, for how many work queue elements can be generated before a completion event is required. When a completion event happens, the code simulates N completion events instead of only one. When draining a transmit queue, at most N-1 NOPs are transmitted in order to force generation of the final completion event. Furthermore, a timer runs every hz ticks to flush any remaining data off the transmit queue whenever tx_completion_fact > 1. The goal of this feature is to reduce the PCI bandwidth needed when transmitting data.

Sponsored by: Mellanox Technologies
Tested by: Netflix
MFC after: 1 week
This commit is contained in:
parent
b1940deb18
commit
376bcf6331
@ -391,6 +391,8 @@ struct mlx5e_params {
|
||||
m(+1, u64 tx_coalesce_usecs, "tx_coalesce_usecs", "Limit in usec for joining tx packets") \
|
||||
m(+1, u64 tx_coalesce_pkts, "tx_coalesce_pkts", "Maximum number of tx packets to join") \
|
||||
m(+1, u64 tx_coalesce_mode, "tx_coalesce_mode", "0: EQE mode 1: CQE mode") \
|
||||
m(+1, u64 tx_completion_fact, "tx_completion_fact", "1..MAX: Completion event ratio") \
|
||||
m(+1, u64 tx_completion_fact_max, "tx_completion_fact_max", "Maximum completion event ratio") \
|
||||
m(+1, u64 hw_lro, "hw_lro", "set to enable hw_lro") \
|
||||
m(+1, u64 cqe_zipping, "cqe_zipping", "0 : CQE zipping disabled")
|
||||
|
||||
@ -496,6 +498,13 @@ struct mlx5e_sq {
|
||||
/* dirtied @xmit */
|
||||
u16 pc __aligned(MLX5E_CACHELINE_SIZE);
|
||||
u16 bf_offset;
|
||||
u16 cev_counter; /* completion event counter */
|
||||
u16 cev_factor; /* completion event factor */
|
||||
u32 cev_next_state; /* next completion event state */
|
||||
#define MLX5E_CEV_STATE_INITIAL 0 /* timer not started */
|
||||
#define MLX5E_CEV_STATE_SEND_NOPS 1 /* send NOPs */
|
||||
#define MLX5E_CEV_STATE_HOLD_NOPS 2 /* don't send NOPs yet */
|
||||
struct callout cev_callout;
|
||||
struct mlx5e_sq_stats stats;
|
||||
|
||||
struct mlx5e_cq cq;
|
||||
@ -787,6 +796,7 @@ void mlx5e_create_stats(struct sysctl_ctx_list *,
|
||||
struct sysctl_oid_list *, const char *,
|
||||
const char **, unsigned, u64 *);
|
||||
void mlx5e_send_nop(struct mlx5e_sq *, u32, bool);
|
||||
void mlx5e_sq_cev_timeout(void *);
|
||||
int mlx5e_refresh_channel_params(struct mlx5e_priv *);
|
||||
|
||||
#endif /* _MLX5_EN_H_ */
|
||||
|
@ -48,6 +48,42 @@ mlx5e_create_stats(struct sysctl_ctx_list *ctx,
|
||||
}
|
||||
}
|
||||
|
||||
static void
|
||||
mlx5e_ethtool_sync_tx_completion_fact(struct mlx5e_priv *priv)
|
||||
{
|
||||
/*
|
||||
* Limit the maximum distance between completion events to
|
||||
* half of the currently set TX queue size.
|
||||
*
|
||||
* The maximum number of queue entries a single IP packet can
|
||||
* consume is given by MLX5_SEND_WQE_MAX_WQEBBS.
|
||||
*
|
||||
* The worst case max value is then given as below:
|
||||
*/
|
||||
uint64_t max = priv->params_ethtool.tx_queue_size /
|
||||
(2 * MLX5_SEND_WQE_MAX_WQEBBS);
|
||||
|
||||
/*
|
||||
* Update the maximum completion factor value in case the
|
||||
* tx_queue_size field changed. Ensure we don't overflow
|
||||
* 16-bits.
|
||||
*/
|
||||
if (max < 1)
|
||||
max = 1;
|
||||
else if (max > 65535)
|
||||
max = 65535;
|
||||
priv->params_ethtool.tx_completion_fact_max = max;
|
||||
|
||||
/*
|
||||
* Verify that the current TX completion factor is within the
|
||||
* given limits:
|
||||
*/
|
||||
if (priv->params_ethtool.tx_completion_fact < 1)
|
||||
priv->params_ethtool.tx_completion_fact = 1;
|
||||
else if (priv->params_ethtool.tx_completion_fact > max)
|
||||
priv->params_ethtool.tx_completion_fact = max;
|
||||
}
|
||||
|
||||
static int
|
||||
mlx5e_ethtool_handler(SYSCTL_HANDLER_ARGS)
|
||||
{
|
||||
@ -206,6 +242,14 @@ mlx5e_ethtool_handler(SYSCTL_HANDLER_ARGS)
|
||||
priv->params_ethtool.cqe_zipping = 0;
|
||||
}
|
||||
}
|
||||
|
||||
if (&priv->params_ethtool.arg[arg2] ==
|
||||
&priv->params_ethtool.tx_completion_fact ||
|
||||
&priv->params_ethtool.arg[arg2] ==
|
||||
&priv->params_ethtool.tx_queue_size) {
|
||||
/* verify parameter */
|
||||
mlx5e_ethtool_sync_tx_completion_fact(priv);
|
||||
}
|
||||
if (was_opened)
|
||||
mlx5e_open_locked(priv->ifp);
|
||||
done:
|
||||
@ -475,6 +519,7 @@ mlx5e_create_ethtool(struct mlx5e_priv *priv)
|
||||
priv->params_ethtool.tx_coalesce_pkts = priv->params.tx_cq_moderation_pkts;
|
||||
priv->params_ethtool.hw_lro = priv->params.hw_lro_en;
|
||||
priv->params_ethtool.cqe_zipping = priv->params.cqe_zipping_en;
|
||||
mlx5e_ethtool_sync_tx_completion_fact(priv);
|
||||
|
||||
/* create root node */
|
||||
node = SYSCTL_ADD_NODE(&priv->sysctl_ctx,
|
||||
|
@ -1185,24 +1185,82 @@ err_destroy_sq:
|
||||
}
|
||||
|
||||
static void
|
||||
mlx5e_close_sq(struct mlx5e_sq *sq)
|
||||
mlx5e_sq_send_nops_locked(struct mlx5e_sq *sq, int can_sleep)
|
||||
{
|
||||
|
||||
/* ensure hw is notified of all pending wqes */
|
||||
if (mlx5e_sq_has_room_for(sq, 1))
|
||||
/* fill up remainder with NOPs */
|
||||
while (sq->cev_counter != 0) {
|
||||
while (!mlx5e_sq_has_room_for(sq, 1)) {
|
||||
if (can_sleep != 0) {
|
||||
mtx_unlock(&sq->lock);
|
||||
msleep(4);
|
||||
mtx_lock(&sq->lock);
|
||||
} else {
|
||||
goto done;
|
||||
}
|
||||
}
|
||||
mlx5e_send_nop(sq, 1, true);
|
||||
}
|
||||
done:
|
||||
return;
|
||||
}
|
||||
|
||||
mlx5e_modify_sq(sq, MLX5_SQC_STATE_RDY, MLX5_SQC_STATE_ERR);
|
||||
void
|
||||
mlx5e_sq_cev_timeout(void *arg)
|
||||
{
|
||||
struct mlx5e_sq *sq = arg;
|
||||
|
||||
mtx_assert(&sq->lock, MA_OWNED);
|
||||
|
||||
/* check next state */
|
||||
switch (sq->cev_next_state) {
|
||||
case MLX5E_CEV_STATE_SEND_NOPS:
|
||||
/* fill TX ring with NOPs, if any */
|
||||
mlx5e_sq_send_nops_locked(sq, 0);
|
||||
|
||||
/* check if completed */
|
||||
if (sq->cev_counter == 0) {
|
||||
sq->cev_next_state = MLX5E_CEV_STATE_INITIAL;
|
||||
return;
|
||||
}
|
||||
break;
|
||||
default:
|
||||
/* send NOPs on next timeout */
|
||||
sq->cev_next_state = MLX5E_CEV_STATE_SEND_NOPS;
|
||||
break;
|
||||
}
|
||||
|
||||
/* restart timer */
|
||||
callout_reset_curcpu(&sq->cev_callout, hz, mlx5e_sq_cev_timeout, sq);
|
||||
}
|
||||
|
||||
static void
|
||||
mlx5e_close_sq_wait(struct mlx5e_sq *sq)
|
||||
{
|
||||
|
||||
mtx_lock(&sq->lock);
|
||||
/* teardown event factor timer, if any */
|
||||
sq->cev_next_state = MLX5E_CEV_STATE_HOLD_NOPS;
|
||||
callout_stop(&sq->cev_callout);
|
||||
|
||||
/* send dummy NOPs in order to flush the transmit ring */
|
||||
mlx5e_sq_send_nops_locked(sq, 1);
|
||||
mtx_unlock(&sq->lock);
|
||||
|
||||
/* make sure it is safe to free the callout */
|
||||
callout_drain(&sq->cev_callout);
|
||||
|
||||
/* error out remaining requests */
|
||||
mlx5e_modify_sq(sq, MLX5_SQC_STATE_RDY, MLX5_SQC_STATE_ERR);
|
||||
|
||||
/* wait till SQ is empty */
|
||||
mtx_lock(&sq->lock);
|
||||
while (sq->cc != sq->pc) {
|
||||
mtx_unlock(&sq->lock);
|
||||
msleep(4);
|
||||
sq->cq.mcq.comp(&sq->cq.mcq);
|
||||
mtx_lock(&sq->lock);
|
||||
}
|
||||
mtx_unlock(&sq->lock);
|
||||
|
||||
mlx5e_disable_sq(sq);
|
||||
mlx5e_destroy_sq(sq);
|
||||
@ -1412,23 +1470,12 @@ mlx5e_open_sqs(struct mlx5e_channel *c,
|
||||
return (0);
|
||||
|
||||
err_close_sqs:
|
||||
for (tc--; tc >= 0; tc--) {
|
||||
mlx5e_close_sq(&c->sq[tc]);
|
||||
for (tc--; tc >= 0; tc--)
|
||||
mlx5e_close_sq_wait(&c->sq[tc]);
|
||||
}
|
||||
|
||||
return (err);
|
||||
}
|
||||
|
||||
static void
|
||||
mlx5e_close_sqs(struct mlx5e_channel *c)
|
||||
{
|
||||
int tc;
|
||||
|
||||
for (tc = 0; tc < c->num_tc; tc++)
|
||||
mlx5e_close_sq(&c->sq[tc]);
|
||||
}
|
||||
|
||||
static void
|
||||
mlx5e_close_sqs_wait(struct mlx5e_channel *c)
|
||||
{
|
||||
@ -1446,9 +1493,19 @@ mlx5e_chan_mtx_init(struct mlx5e_channel *c)
|
||||
mtx_init(&c->rq.mtx, "mlx5rx", MTX_NETWORK_LOCK, MTX_DEF);
|
||||
|
||||
for (tc = 0; tc < c->num_tc; tc++) {
|
||||
mtx_init(&c->sq[tc].lock, "mlx5tx", MTX_NETWORK_LOCK, MTX_DEF);
|
||||
mtx_init(&c->sq[tc].comp_lock, "mlx5comp", MTX_NETWORK_LOCK,
|
||||
struct mlx5e_sq *sq = c->sq + tc;
|
||||
|
||||
mtx_init(&sq->lock, "mlx5tx", MTX_NETWORK_LOCK, MTX_DEF);
|
||||
mtx_init(&sq->comp_lock, "mlx5comp", MTX_NETWORK_LOCK,
|
||||
MTX_DEF);
|
||||
|
||||
callout_init_mtx(&sq->cev_callout, &sq->lock, 0);
|
||||
|
||||
sq->cev_factor = c->priv->params_ethtool.tx_completion_fact;
|
||||
|
||||
/* ensure the TX completion event factor is not zero */
|
||||
if (sq->cev_factor == 0)
|
||||
sq->cev_factor = 1;
|
||||
}
|
||||
}
|
||||
|
||||
@ -1529,7 +1586,6 @@ mlx5e_open_channel(struct mlx5e_priv *priv, int ix,
|
||||
return (0);
|
||||
|
||||
err_close_sqs:
|
||||
mlx5e_close_sqs(c);
|
||||
mlx5e_close_sqs_wait(c);
|
||||
|
||||
err_close_rx_cq:
|
||||
@ -1554,7 +1610,6 @@ mlx5e_close_channel(struct mlx5e_channel *volatile *pp)
|
||||
if (c == NULL)
|
||||
return;
|
||||
mlx5e_close_rq(&c->rq);
|
||||
mlx5e_close_sqs(c);
|
||||
}
|
||||
|
||||
static void
|
||||
|
@ -28,6 +28,18 @@
|
||||
#include "en.h"
|
||||
#include <machine/atomic.h>
|
||||
|
||||
static inline bool
|
||||
mlx5e_do_send_cqe(struct mlx5e_sq *sq)
|
||||
{
|
||||
sq->cev_counter++;
|
||||
/* interleave the CQEs */
|
||||
if (sq->cev_counter >= sq->cev_factor) {
|
||||
sq->cev_counter = 0;
|
||||
return (1);
|
||||
}
|
||||
return (0);
|
||||
}
|
||||
|
||||
void
|
||||
mlx5e_send_nop(struct mlx5e_sq *sq, u32 ds_cnt, bool notify_hw)
|
||||
{
|
||||
@ -38,7 +50,10 @@ mlx5e_send_nop(struct mlx5e_sq *sq, u32 ds_cnt, bool notify_hw)
|
||||
|
||||
wqe->ctrl.opmod_idx_opcode = cpu_to_be32((sq->pc << 8) | MLX5_OPCODE_NOP);
|
||||
wqe->ctrl.qpn_ds = cpu_to_be32((sq->sqn << 8) | ds_cnt);
|
||||
wqe->ctrl.fm_ce_se = MLX5_WQE_CTRL_CQ_UPDATE;
|
||||
if (mlx5e_do_send_cqe(sq))
|
||||
wqe->ctrl.fm_ce_se = MLX5_WQE_CTRL_CQ_UPDATE;
|
||||
else
|
||||
wqe->ctrl.fm_ce_se = 0;
|
||||
|
||||
sq->mbuf[pi].mbuf = NULL;
|
||||
sq->mbuf[pi].num_bytes = 0;
|
||||
@ -340,7 +355,10 @@ skip_dma:
|
||||
|
||||
wqe->ctrl.opmod_idx_opcode = cpu_to_be32((sq->pc << 8) | opcode);
|
||||
wqe->ctrl.qpn_ds = cpu_to_be32((sq->sqn << 8) | ds_cnt);
|
||||
wqe->ctrl.fm_ce_se = MLX5_WQE_CTRL_CQ_UPDATE;
|
||||
if (mlx5e_do_send_cqe(sq))
|
||||
wqe->ctrl.fm_ce_se = MLX5_WQE_CTRL_CQ_UPDATE;
|
||||
else
|
||||
wqe->ctrl.fm_ce_se = 0;
|
||||
|
||||
/* Store pointer to mbuf */
|
||||
sq->mbuf[pi].mbuf = mb;
|
||||
@ -374,9 +392,10 @@ mlx5e_poll_tx_cq(struct mlx5e_sq *sq, int budget)
|
||||
*/
|
||||
sqcc = sq->cc;
|
||||
|
||||
while (budget--) {
|
||||
while (budget > 0) {
|
||||
struct mlx5_cqe64 *cqe;
|
||||
struct mbuf *mb;
|
||||
u16 x;
|
||||
u16 ci;
|
||||
|
||||
cqe = mlx5e_get_cqe(&sq->cq);
|
||||
@ -385,24 +404,29 @@ mlx5e_poll_tx_cq(struct mlx5e_sq *sq, int budget)
|
||||
|
||||
mlx5_cqwq_pop(&sq->cq.wq);
|
||||
|
||||
ci = sqcc & sq->wq.sz_m1;
|
||||
mb = sq->mbuf[ci].mbuf;
|
||||
sq->mbuf[ci].mbuf = NULL; /* Safety clear */
|
||||
/* update budget according to the event factor */
|
||||
budget -= sq->cev_factor;
|
||||
|
||||
if (mb == NULL) {
|
||||
if (sq->mbuf[ci].num_bytes == 0) {
|
||||
/* NOP */
|
||||
sq->stats.nop++;
|
||||
for (x = 0; x != sq->cev_factor; x++) {
|
||||
ci = sqcc & sq->wq.sz_m1;
|
||||
mb = sq->mbuf[ci].mbuf;
|
||||
sq->mbuf[ci].mbuf = NULL; /* Safety clear */
|
||||
|
||||
if (mb == NULL) {
|
||||
if (sq->mbuf[ci].num_bytes == 0) {
|
||||
/* NOP */
|
||||
sq->stats.nop++;
|
||||
}
|
||||
} else {
|
||||
bus_dmamap_sync(sq->dma_tag, sq->mbuf[ci].dma_map,
|
||||
BUS_DMASYNC_POSTWRITE);
|
||||
bus_dmamap_unload(sq->dma_tag, sq->mbuf[ci].dma_map);
|
||||
|
||||
/* Free transmitted mbuf */
|
||||
m_freem(mb);
|
||||
}
|
||||
} else {
|
||||
bus_dmamap_sync(sq->dma_tag, sq->mbuf[ci].dma_map,
|
||||
BUS_DMASYNC_POSTWRITE);
|
||||
bus_dmamap_unload(sq->dma_tag, sq->mbuf[ci].dma_map);
|
||||
|
||||
/* Free transmitted mbuf */
|
||||
m_freem(mb);
|
||||
sqcc += sq->mbuf[ci].num_wqebbs;
|
||||
}
|
||||
sqcc += sq->mbuf[ci].num_wqebbs;
|
||||
}
|
||||
|
||||
mlx5_cqwq_update_db_record(&sq->cq.wq);
|
||||
@ -450,6 +474,18 @@ mlx5e_xmit_locked(struct ifnet *ifp, struct mlx5e_sq *sq, struct mbuf *mb)
|
||||
if ((ifp->if_drv_flags & IFF_DRV_RUNNING) == 0)
|
||||
break;
|
||||
}
|
||||
/*
|
||||
* Check if we need to start the event timer which flushes the
|
||||
* transmit ring on timeout:
|
||||
*/
|
||||
if (unlikely(sq->cev_next_state == MLX5E_CEV_STATE_INITIAL &&
|
||||
sq->cev_factor != 1)) {
|
||||
/* start the timer */
|
||||
mlx5e_sq_cev_timeout(sq);
|
||||
} else {
|
||||
/* don't send NOPs yet */
|
||||
sq->cev_next_state = MLX5E_CEV_STATE_HOLD_NOPS;
|
||||
}
|
||||
return (err);
|
||||
}
|
||||
|
||||
|
Loading…
x
Reference in New Issue
Block a user