numam-dpdk/drivers/net/mlx5/mlx5_rx.c
Matan Azrad 60b254e392 net/mlx5: fix Rx queue recovery mechanism
The local variables become inconsistent in the data receiving routines
after queue error recovery.
The receive queue consumer index becomes wrong and needs to be reset to
the size of the queue (as the RQ was fully replenished in the recovery
procedure).

In the MPRQ case, the local consumed stride variable should also be reset.

CVE-2022-28199
Fixes: 88c0733535 ("net/mlx5: extend Rx completion with error handling")
Cc: stable@dpdk.org

Signed-off-by: Alexander Kozyrev <akozyrev@nvidia.com>
Signed-off-by: Matan Azrad <matan@nvidia.com>
2022-08-29 12:53:49 +02:00

/* SPDX-License-Identifier: BSD-3-Clause
* Copyright 2021 6WIND S.A.
* Copyright 2021 Mellanox Technologies, Ltd
*/
#include <stdint.h>
#include <string.h>
#include <stdlib.h>
#include <rte_mbuf.h>
#include <rte_mempool.h>
#include <rte_prefetch.h>
#include <rte_common.h>
#include <rte_branch_prediction.h>
#include <rte_ether.h>
#include <rte_cycles.h>
#include <rte_flow.h>
#include <mlx5_prm.h>
#include <mlx5_common.h>
#include <mlx5_common_mr.h>
#include <rte_pmd_mlx5.h>
#include "mlx5_autoconf.h"
#include "mlx5_defs.h"
#include "mlx5.h"
#include "mlx5_utils.h"
#include "mlx5_rxtx.h"
#include "mlx5_devx.h"
#include "mlx5_rx.h"
#ifdef HAVE_MLX5_MSTFLINT
#include <mstflint/mtcr.h>
#endif
static __rte_always_inline uint32_t
rxq_cq_to_pkt_type(struct mlx5_rxq_data *rxq, volatile struct mlx5_cqe *cqe,
volatile struct mlx5_mini_cqe8 *mcqe);
static __rte_always_inline int
mlx5_rx_poll_len(struct mlx5_rxq_data *rxq, volatile struct mlx5_cqe *cqe,
uint16_t cqe_cnt, volatile struct mlx5_mini_cqe8 **mcqe);
static __rte_always_inline uint32_t
rxq_cq_to_ol_flags(volatile struct mlx5_cqe *cqe);
static __rte_always_inline void
rxq_cq_to_mbuf(struct mlx5_rxq_data *rxq, struct rte_mbuf *pkt,
volatile struct mlx5_cqe *cqe,
volatile struct mlx5_mini_cqe8 *mcqe);
static inline void
mlx5_lro_update_tcp_hdr(struct rte_tcp_hdr *__rte_restrict tcp,
volatile struct mlx5_cqe *__rte_restrict cqe,
uint32_t phcsum, uint8_t l4_type);
static inline void
mlx5_lro_update_hdr(uint8_t *__rte_restrict padd,
volatile struct mlx5_cqe *__rte_restrict cqe,
volatile struct mlx5_mini_cqe8 *mcqe,
struct mlx5_rxq_data *rxq, uint32_t len);
/**
* Internal function to compute the number of used descriptors in an RX queue.
*
* @param rxq
* The Rx queue.
*
* @return
* The number of used Rx descriptors.
*/
static uint32_t
rx_queue_count(struct mlx5_rxq_data *rxq)
{
struct rxq_zip *zip = &rxq->zip;
volatile struct mlx5_cqe *cqe;
const unsigned int cqe_n = (1 << rxq->cqe_n);
const unsigned int sges_n = (1 << rxq->sges_n);
const unsigned int elts_n = (1 << rxq->elts_n);
const unsigned int strd_n = RTE_BIT32(rxq->log_strd_num);
const unsigned int cqe_cnt = cqe_n - 1;
unsigned int cq_ci, used;
/* if we are processing a compressed cqe */
if (zip->ai) {
used = zip->cqe_cnt - zip->ai;
cq_ci = zip->cq_ci;
} else {
used = 0;
cq_ci = rxq->cq_ci;
}
cqe = &(*rxq->cqes)[cq_ci & cqe_cnt];
while (check_cqe(cqe, cqe_n, cq_ci) != MLX5_CQE_STATUS_HW_OWN) {
int8_t op_own;
unsigned int n;
op_own = cqe->op_own;
if (MLX5_CQE_FORMAT(op_own) == MLX5_COMPRESSED)
n = rte_be_to_cpu_32(cqe->byte_cnt);
else
n = 1;
cq_ci += n;
used += n;
cqe = &(*rxq->cqes)[cq_ci & cqe_cnt];
}
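/* Each completion consumes a stride of sges_n descriptors; clamp to the ring capacity. */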
used = RTE_MIN(used * sges_n, elts_n * strd_n);
return used;
}
/**
* DPDK callback to check the status of a Rx descriptor.
*
* @param rx_queue
* The Rx queue.
* @param[in] offset
* The index of the descriptor in the ring.
*
* @return
* The status of the Rx descriptor.
*/
int
mlx5_rx_descriptor_status(void *rx_queue, uint16_t offset)
{
struct mlx5_rxq_data *rxq = rx_queue;
if (offset >= (1 << rxq->cqe_n)) {
rte_errno = EINVAL;
return -rte_errno;
}
if (offset < rx_queue_count(rxq))
return RTE_ETH_RX_DESC_DONE;
return RTE_ETH_RX_DESC_AVAIL;
}
/* Convert the Rx queue LWM from a WQE count to a percentage of the queue size. */
static uint8_t
mlx5_rxq_lwm_to_percentage(struct mlx5_rxq_priv *rxq)
{
struct mlx5_rxq_data *rxq_data = &rxq->ctrl->rxq;
uint32_t wqe_cnt = 1 << (rxq_data->elts_n - rxq_data->sges_n);
return rxq->lwm * 100 / wqe_cnt;
}
/**
* DPDK callback to get the RX queue information.
*
* @param dev
* Pointer to the device structure.
*
* @param rx_queue_id
* Rx queue identifier.
*
* @param qinfo
* Pointer to the RX queue information structure.
*
* @return
* None.
*/
void
mlx5_rxq_info_get(struct rte_eth_dev *dev, uint16_t rx_queue_id,
struct rte_eth_rxq_info *qinfo)
{
struct mlx5_rxq_ctrl *rxq_ctrl = mlx5_rxq_ctrl_get(dev, rx_queue_id);
struct mlx5_rxq_data *rxq = mlx5_rxq_data_get(dev, rx_queue_id);
struct mlx5_rxq_priv *rxq_priv = mlx5_rxq_get(dev, rx_queue_id);
if (!rxq)
return;
qinfo->mp = mlx5_rxq_mprq_enabled(rxq) ?
rxq->mprq_mp : rxq->mp;
qinfo->conf.rx_thresh.pthresh = 0;
qinfo->conf.rx_thresh.hthresh = 0;
qinfo->conf.rx_thresh.wthresh = 0;
qinfo->conf.rx_free_thresh = rxq->rq_repl_thresh;
qinfo->conf.rx_drop_en = 1;
if (rxq_ctrl == NULL || rxq_ctrl->obj == NULL)
qinfo->conf.rx_deferred_start = 0;
else
qinfo->conf.rx_deferred_start = 1;
qinfo->conf.offloads = dev->data->dev_conf.rxmode.offloads;
qinfo->scattered_rx = dev->data->scattered_rx;
qinfo->nb_desc = mlx5_rxq_mprq_enabled(rxq) ?
RTE_BIT32(rxq->elts_n) * RTE_BIT32(rxq->log_strd_num) :
RTE_BIT32(rxq->elts_n);
qinfo->avail_thresh = rxq_priv ?
mlx5_rxq_lwm_to_percentage(rxq_priv) : 0;
}
/**
* DPDK callback to get the RX packet burst mode information.
*
* @param dev
* Pointer to the device structure.
*
* @param rx_queue_id
* Rx queue identifier.
*
* @param mode
* Pointer to the burst mode information.
*
* @return
* 0 as success, -EINVAL as failure.
*/
int
mlx5_rx_burst_mode_get(struct rte_eth_dev *dev,
uint16_t rx_queue_id __rte_unused,
struct rte_eth_burst_mode *mode)
{
eth_rx_burst_t pkt_burst = dev->rx_pkt_burst;
struct mlx5_rxq_priv *rxq = mlx5_rxq_get(dev, rx_queue_id);
if (!rxq) {
rte_errno = EINVAL;
return -rte_errno;
}
if (pkt_burst == mlx5_rx_burst) {
snprintf(mode->info, sizeof(mode->info), "%s", "Scalar");
} else if (pkt_burst == mlx5_rx_burst_mprq) {
snprintf(mode->info, sizeof(mode->info), "%s", "Multi-Packet RQ");
} else if (pkt_burst == mlx5_rx_burst_vec) {
#if defined RTE_ARCH_X86_64
snprintf(mode->info, sizeof(mode->info), "%s", "Vector SSE");
#elif defined RTE_ARCH_ARM64
snprintf(mode->info, sizeof(mode->info), "%s", "Vector Neon");
#elif defined RTE_ARCH_PPC_64
snprintf(mode->info, sizeof(mode->info), "%s", "Vector AltiVec");
#else
return -EINVAL;
#endif
} else if (pkt_burst == mlx5_rx_burst_mprq_vec) {
#if defined RTE_ARCH_X86_64
snprintf(mode->info, sizeof(mode->info), "%s", "MPRQ Vector SSE");
#elif defined RTE_ARCH_ARM64
snprintf(mode->info, sizeof(mode->info), "%s", "MPRQ Vector Neon");
#elif defined RTE_ARCH_PPC_64
snprintf(mode->info, sizeof(mode->info), "%s", "MPRQ Vector AltiVec");
#else
return -EINVAL;
#endif
} else {
return -EINVAL;
}
return 0;
}
/**
* DPDK callback to get the number of used descriptors in a RX queue.
*
* @param rx_queue
* The Rx queue pointer.
*
* @return
* The number of used Rx descriptors,
* or -EINVAL if the queue is invalid.
*/
uint32_t
mlx5_rx_queue_count(void *rx_queue)
{
struct mlx5_rxq_data *rxq = rx_queue;
struct rte_eth_dev *dev;
if (!rxq) {
rte_errno = EINVAL;
return -rte_errno;
}
dev = &rte_eth_devices[rxq->port_id];
if (dev->rx_pkt_burst == NULL ||
dev->rx_pkt_burst == rte_eth_pkt_burst_dummy) {
rte_errno = ENOTSUP;
return -rte_errno;
}
return rx_queue_count(rxq);
}
#define CLB_VAL_IDX 0
#define CLB_MSK_IDX 1
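/*
* Power-monitor callback: compares the masked ownership field of the
* monitored CQE against the expected phase value stored in the opaque
* array, so the power-management layer can tell whether a new completion
* has been delivered.
*/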
static int
mlx5_monitor_callback(const uint64_t value,
const uint64_t opaque[RTE_POWER_MONITOR_OPAQUE_SZ])
{
const uint64_t m = opaque[CLB_MSK_IDX];
const uint64_t v = opaque[CLB_VAL_IDX];
return (value & m) == v ? -1 : 0;
}
int mlx5_get_monitor_addr(void *rx_queue, struct rte_power_monitor_cond *pmc)
{
struct mlx5_rxq_data *rxq = rx_queue;
const unsigned int cqe_num = 1 << rxq->cqe_n;
const unsigned int cqe_mask = cqe_num - 1;
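/*
* Masking cq_ci with the CQ size (a power of two) extracts the phase bit
* that flips on every wrap of the ring; it is matched against the CQE
* owner bit to detect a newly delivered completion.
*/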
const uint16_t idx = rxq->cq_ci & cqe_num;
volatile struct mlx5_cqe *cqe = &(*rxq->cqes)[rxq->cq_ci & cqe_mask];
if (unlikely(rxq->cqes == NULL)) {
rte_errno = EINVAL;
return -rte_errno;
}
pmc->addr = &cqe->op_own;
pmc->opaque[CLB_VAL_IDX] = !!idx;
pmc->opaque[CLB_MSK_IDX] = MLX5_CQE_OWNER_MASK;
pmc->fn = mlx5_monitor_callback;
pmc->size = sizeof(uint8_t);
return 0;
}
/**
* Translate RX completion flags to packet type.
*
* @param[in] rxq
* Pointer to RX queue structure.
* @param[in] cqe
* Pointer to CQE.
*
* @note: fix mlx5_dev_supported_ptypes_get() if anything changes here.
*
* @return
* Packet type for struct rte_mbuf.
*/
static inline uint32_t
rxq_cq_to_pkt_type(struct mlx5_rxq_data *rxq, volatile struct mlx5_cqe *cqe,
volatile struct mlx5_mini_cqe8 *mcqe)
{
uint8_t idx;
uint8_t ptype;
uint8_t pinfo = (cqe->pkt_info & 0x3) << 6;
/* Get the L3/L4 header type from the mini-CQE when the L3/L4 format is used. */
if (mcqe == NULL ||
rxq->mcqe_format != MLX5_CQE_RESP_FORMAT_L34H_STRIDX)
ptype = (cqe->hdr_type_etc & 0xfc00) >> 10;
else
ptype = mcqe->hdr_type >> 2;
/*
* The index to the array should have:
* bit[1:0] = l3_hdr_type
* bit[4:2] = l4_hdr_type
* bit[5] = ip_frag
* bit[6] = tunneled
* bit[7] = outer_l3_type
*/
idx = pinfo | ptype;
return mlx5_ptype_table[idx] | rxq->tunnel * !!(idx & (1 << 6));
}
/**
* Initialize Rx WQ and indexes.
*
* @param[in] rxq
* Pointer to RX queue structure.
*/
void
mlx5_rxq_initialize(struct mlx5_rxq_data *rxq)
{
const unsigned int wqe_n = 1 << rxq->elts_n;
unsigned int i;
for (i = 0; (i != wqe_n); ++i) {
volatile struct mlx5_wqe_data_seg *scat;
uintptr_t addr;
uint32_t byte_count;
uint32_t lkey;
if (mlx5_rxq_mprq_enabled(rxq)) {
struct mlx5_mprq_buf *buf = (*rxq->mprq_bufs)[i];
scat = &((volatile struct mlx5_wqe_mprq *)
rxq->wqes)[i].dseg;
addr = (uintptr_t)mlx5_mprq_buf_addr
(buf, RTE_BIT32(rxq->log_strd_num));
byte_count = RTE_BIT32(rxq->log_strd_sz) *
RTE_BIT32(rxq->log_strd_num);
lkey = mlx5_rx_addr2mr(rxq, addr);
} else {
struct rte_mbuf *buf = (*rxq->elts)[i];
scat = &((volatile struct mlx5_wqe_data_seg *)
rxq->wqes)[i];
addr = rte_pktmbuf_mtod(buf, uintptr_t);
byte_count = DATA_LEN(buf);
lkey = mlx5_rx_mb2mr(rxq, buf);
}
/* scat->addr must be able to store a pointer. */
MLX5_ASSERT(sizeof(scat->addr) >= sizeof(uintptr_t));
*scat = (struct mlx5_wqe_data_seg){
.addr = rte_cpu_to_be_64(addr),
.byte_count = rte_cpu_to_be_32(byte_count),
.lkey = lkey,
};
}
rxq->consumed_strd = 0;
rxq->decompressed = 0;
rxq->rq_pi = 0;
rxq->zip = (struct rxq_zip){
.ai = 0,
};
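/*
* In MPRQ mode one mbuf element is tracked per stride, so a fully posted
* RQ holds (wqe_n >> sges_n) * strd_n elements; otherwise the element
* index is simply reset to zero.
*/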
rxq->elts_ci = mlx5_rxq_mprq_enabled(rxq) ?
(wqe_n >> rxq->sges_n) * RTE_BIT32(rxq->log_strd_num) : 0;
/* Update doorbell counter. */
rxq->rq_ci = wqe_n >> rxq->sges_n;
rte_io_wmb();
*rxq->rq_db = rte_cpu_to_be_32(rxq->rq_ci);
}
/* Must be negative. */
#define MLX5_ERROR_CQE_RET (-1)
/* Must not be negative. */
#define MLX5_RECOVERY_ERROR_RET 0
/**
* Handle a Rx error.
* The function moves the RQ state to RESET when the first error CQE is
* seen, then the CQ is drained by the caller's loop. When the CQ is empty,
* it moves the RQ state to READY and reinitializes the RQ.
* Identifying the next CQE and counting errors are the caller's responsibility.
*
* @param[in] rxq
* Pointer to RX queue structure.
* @param[in] vec
* 1 when called from a vectorized Rx burst; the mbufs for the RQ need to be prepared.
* 0 when called from non-vectorized Rx burst.
*
* @return
* MLX5_RECOVERY_ERROR_RET in case of recovery error, otherwise the CQE status.
*/
int
mlx5_rx_err_handle(struct mlx5_rxq_data *rxq, uint8_t vec)
{
const uint16_t cqe_n = 1 << rxq->cqe_n;
const uint16_t cqe_mask = cqe_n - 1;
const uint16_t wqe_n = 1 << rxq->elts_n;
const uint16_t strd_n = RTE_BIT32(rxq->log_strd_num);
struct mlx5_rxq_ctrl *rxq_ctrl =
container_of(rxq, struct mlx5_rxq_ctrl, rxq);
union {
volatile struct mlx5_cqe *cqe;
volatile struct mlx5_err_cqe *err_cqe;
} u = {
.cqe = &(*rxq->cqes)[rxq->cq_ci & cqe_mask],
};
struct mlx5_mp_arg_queue_state_modify sm;
int ret;
switch (rxq->err_state) {
case MLX5_RXQ_ERR_STATE_NO_ERROR:
rxq->err_state = MLX5_RXQ_ERR_STATE_NEED_RESET;
/* Fall-through */
case MLX5_RXQ_ERR_STATE_NEED_RESET:
sm.is_wq = 1;
sm.queue_id = rxq->idx;
sm.state = IBV_WQS_RESET;
if (mlx5_queue_state_modify(RXQ_DEV(rxq_ctrl), &sm))
return MLX5_RECOVERY_ERROR_RET;
if (rxq_ctrl->dump_file_n <
RXQ_PORT(rxq_ctrl)->config.max_dump_files_num) {
MKSTR(err_str, "Unexpected CQE error syndrome "
"0x%02x CQN = %u RQN = %u wqe_counter = %u"
" rq_ci = %u cq_ci = %u", u.err_cqe->syndrome,
rxq->cqn, rxq_ctrl->wqn,
rte_be_to_cpu_16(u.err_cqe->wqe_counter),
rxq->rq_ci << rxq->sges_n, rxq->cq_ci);
MKSTR(name, "dpdk_mlx5_port_%u_rxq_%u_%u",
rxq->port_id, rxq->idx, (uint32_t)rte_rdtsc());
mlx5_dump_debug_information(name, NULL, err_str, 0);
mlx5_dump_debug_information(name, "MLX5 Error CQ:",
(const void *)((uintptr_t)
rxq->cqes),
sizeof(*u.cqe) * cqe_n);
mlx5_dump_debug_information(name, "MLX5 Error RQ:",
(const void *)((uintptr_t)
rxq->wqes),
16 * wqe_n);
rxq_ctrl->dump_file_n++;
}
rxq->err_state = MLX5_RXQ_ERR_STATE_NEED_READY;
/* Fall-through */
case MLX5_RXQ_ERR_STATE_NEED_READY:
ret = check_cqe(u.cqe, cqe_n, rxq->cq_ci);
if (ret == MLX5_CQE_STATUS_HW_OWN) {
rte_io_wmb();
*rxq->cq_db = rte_cpu_to_be_32(rxq->cq_ci);
rte_io_wmb();
/*
* The RQ consumer index must be zeroed while moving
* from RESET state to RDY state.
*/
*rxq->rq_db = rte_cpu_to_be_32(0);
rte_io_wmb();
sm.is_wq = 1;
sm.queue_id = rxq->idx;
sm.state = IBV_WQS_RDY;
if (mlx5_queue_state_modify(RXQ_DEV(rxq_ctrl), &sm))
return MLX5_RECOVERY_ERROR_RET;
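/*
* Vectorized bursts keep a software ring of mbufs mirroring the RQ
* entries; refill whatever was consumed since the last replenish
* before the RQ is reinitialized below.
*/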
if (vec) {
const uint32_t elts_n =
mlx5_rxq_mprq_enabled(rxq) ?
wqe_n * strd_n : wqe_n;
const uint32_t e_mask = elts_n - 1;
uint32_t elts_ci =
mlx5_rxq_mprq_enabled(rxq) ?
rxq->elts_ci : rxq->rq_ci;
uint32_t elt_idx;
struct rte_mbuf **elt;
int i;
unsigned int n = elts_n - (elts_ci -
rxq->rq_pi);
for (i = 0; i < (int)n; ++i) {
elt_idx = (elts_ci + i) & e_mask;
elt = &(*rxq->elts)[elt_idx];
*elt = rte_mbuf_raw_alloc(rxq->mp);
if (!*elt) {
for (i--; i >= 0; --i) {
elt_idx = (elts_ci +
i) & elts_n;
elt = &(*rxq->elts)
[elt_idx];
rte_pktmbuf_free_seg
(*elt);
}
return MLX5_RECOVERY_ERROR_RET;
}
}
for (i = 0; i < (int)elts_n; ++i) {
elt = &(*rxq->elts)[i];
DATA_LEN(*elt) =
(uint16_t)((*elt)->buf_len -
rte_pktmbuf_headroom(*elt));
}
/* Padding with a fake mbuf for vec Rx. */
for (i = 0; i < MLX5_VPMD_DESCS_PER_LOOP; ++i)
(*rxq->elts)[elts_n + i] =
&rxq->fake_mbuf;
}
mlx5_rxq_initialize(rxq);
rxq->err_state = MLX5_RXQ_ERR_STATE_NO_ERROR;
}
return ret;
default:
return MLX5_RECOVERY_ERROR_RET;
}
}
/**
* Get size of the next packet for a given CQE. For compressed CQEs, the
* consumer index is updated only once all packets of the current one have
* been processed.
*
* @param rxq
* Pointer to RX queue.
* @param cqe
* CQE to process.
* @param[out] mcqe
* Store pointer to mini-CQE if compressed. Otherwise, the pointer is not
* written.
*
* @return
* 0 in case of an empty CQE, MLX5_ERROR_CQE_RET in case of an error CQE,
* otherwise the packet size in a regular Rx queue, or the stride byte
* count format in the MPRQ case.
*/
static inline int
mlx5_rx_poll_len(struct mlx5_rxq_data *rxq, volatile struct mlx5_cqe *cqe,
uint16_t cqe_cnt, volatile struct mlx5_mini_cqe8 **mcqe)
{
struct rxq_zip *zip = &rxq->zip;
uint16_t cqe_n = cqe_cnt + 1;
int len;
uint16_t idx, end;
do {
len = 0;
/* Process compressed data in the CQE and mini arrays. */
if (zip->ai) {
volatile struct mlx5_mini_cqe8 (*mc)[8] =
(volatile struct mlx5_mini_cqe8 (*)[8])
(uintptr_t)(&(*rxq->cqes)[zip->ca &
cqe_cnt].pkt_info);
len = rte_be_to_cpu_32((*mc)[zip->ai & 7].byte_cnt &
rxq->byte_mask);
*mcqe = &(*mc)[zip->ai & 7];
if ((++zip->ai & 7) == 0) {
/* Invalidate consumed CQEs */
idx = zip->ca;
end = zip->na;
while (idx != end) {
(*rxq->cqes)[idx & cqe_cnt].op_own =
MLX5_CQE_INVALIDATE;
++idx;
}
/*
* Increment consumer index to skip the number
* of CQEs consumed. Hardware leaves holes in
* the CQ ring for software use.
*/
zip->ca = zip->na;
zip->na += 8;
}
if (unlikely(rxq->zip.ai == rxq->zip.cqe_cnt)) {
/* Invalidate the rest */
idx = zip->ca;
end = zip->cq_ci;
while (idx != end) {
(*rxq->cqes)[idx & cqe_cnt].op_own =
MLX5_CQE_INVALIDATE;
++idx;
}
rxq->cq_ci = zip->cq_ci;
zip->ai = 0;
}
/*
* No compressed data, get next CQE and verify if it is
* compressed.
*/
} else {
int ret;
int8_t op_own;
uint32_t cq_ci;
ret = check_cqe(cqe, cqe_n, rxq->cq_ci);
if (unlikely(ret != MLX5_CQE_STATUS_SW_OWN)) {
if (unlikely(ret == MLX5_CQE_STATUS_ERR ||
rxq->err_state)) {
ret = mlx5_rx_err_handle(rxq, 0);
if (ret == MLX5_CQE_STATUS_HW_OWN ||
ret == MLX5_RECOVERY_ERROR_RET)
return MLX5_ERROR_CQE_RET;
} else {
return 0;
}
}
/*
* Introduce the local variable to have queue cq_ci
* index in queue structure always consistent with
* actual CQE boundary (not pointing to the middle
* of compressed CQE session).
*/
cq_ci = rxq->cq_ci + 1;
op_own = cqe->op_own;
if (MLX5_CQE_FORMAT(op_own) == MLX5_COMPRESSED) {
volatile struct mlx5_mini_cqe8 (*mc)[8] =
(volatile struct mlx5_mini_cqe8 (*)[8])
(uintptr_t)(&(*rxq->cqes)
[cq_ci & cqe_cnt].pkt_info);
/* Fix endianness. */
zip->cqe_cnt = rte_be_to_cpu_32(cqe->byte_cnt);
/*
* Current mini array position is the one
* returned by check_cqe64().
*
* If completion comprises several mini arrays,
* as a special case the second one is located
* 7 CQEs after the initial CQE instead of 8
* for subsequent ones.
*/
zip->ca = cq_ci;
zip->na = zip->ca + 7;
/* Compute the next non compressed CQE. */
zip->cq_ci = rxq->cq_ci + zip->cqe_cnt;
/* Get packet size to return. */
len = rte_be_to_cpu_32((*mc)[0].byte_cnt &
rxq->byte_mask);
*mcqe = &(*mc)[0];
zip->ai = 1;
/* Prefetch all to be invalidated */
idx = zip->ca;
end = zip->cq_ci;
while (idx != end) {
rte_prefetch0(&(*rxq->cqes)[(idx) &
cqe_cnt]);
++idx;
}
} else {
rxq->cq_ci = cq_ci;
len = rte_be_to_cpu_32(cqe->byte_cnt);
}
}
if (unlikely(rxq->err_state)) {
cqe = &(*rxq->cqes)[rxq->cq_ci & cqe_cnt];
++rxq->stats.idropped;
} else {
return len;
}
} while (1);
}
/**
* Translate RX completion flags to offload flags.
*
* @param[in] cqe
* Pointer to CQE.
*
* @return
* Offload flags (ol_flags) for struct rte_mbuf.
*/
static inline uint32_t
rxq_cq_to_ol_flags(volatile struct mlx5_cqe *cqe)
{
uint32_t ol_flags = 0;
uint16_t flags = rte_be_to_cpu_16(cqe->hdr_type_etc);
ol_flags =
TRANSPOSE(flags,
MLX5_CQE_RX_L3_HDR_VALID,
RTE_MBUF_F_RX_IP_CKSUM_GOOD) |
TRANSPOSE(flags,
MLX5_CQE_RX_L4_HDR_VALID,
RTE_MBUF_F_RX_L4_CKSUM_GOOD);
return ol_flags;
}
/**
* Fill in mbuf fields from RX completion flags.
* Note that pkt->ol_flags should be initialized outside of this function.
*
* @param rxq
* Pointer to RX queue.
* @param pkt
* mbuf to fill.
* @param cqe
* CQE to process.
* @param mcqe
* Pointer to the mini-CQE, or NULL if the completion is not compressed.
*/
static inline void
rxq_cq_to_mbuf(struct mlx5_rxq_data *rxq, struct rte_mbuf *pkt,
volatile struct mlx5_cqe *cqe,
volatile struct mlx5_mini_cqe8 *mcqe)
{
/* Update packet information. */
pkt->packet_type = rxq_cq_to_pkt_type(rxq, cqe, mcqe);
pkt->port = unlikely(rxq->shared) ? cqe->user_index_low : rxq->port_id;
if (rxq->rss_hash) {
uint32_t rss_hash_res = 0;
/* If compressed, take hash result from mini-CQE. */
if (mcqe == NULL ||
rxq->mcqe_format != MLX5_CQE_RESP_FORMAT_HASH)
rss_hash_res = rte_be_to_cpu_32(cqe->rx_hash_res);
else
rss_hash_res = rte_be_to_cpu_32(mcqe->rx_hash_result);
if (rss_hash_res) {
pkt->hash.rss = rss_hash_res;
pkt->ol_flags |= RTE_MBUF_F_RX_RSS_HASH;
}
}
if (rxq->mark) {
uint32_t mark = 0;
/* If compressed, take flow tag from mini-CQE. */
if (mcqe == NULL ||
rxq->mcqe_format != MLX5_CQE_RESP_FORMAT_FTAG_STRIDX)
mark = cqe->sop_drop_qpn;
else
mark = ((mcqe->byte_cnt_flow & 0xff) << 8) |
(mcqe->flow_tag_high << 16);
if (MLX5_FLOW_MARK_IS_VALID(mark)) {
pkt->ol_flags |= RTE_MBUF_F_RX_FDIR;
if (mark != RTE_BE32(MLX5_FLOW_MARK_DEFAULT)) {
pkt->ol_flags |= RTE_MBUF_F_RX_FDIR_ID;
pkt->hash.fdir.hi = mlx5_flow_mark_get(mark);
}
}
}
if (rxq->dynf_meta) {
uint32_t meta = rte_be_to_cpu_32(cqe->flow_table_metadata) &
rxq->flow_meta_port_mask;
if (meta) {
pkt->ol_flags |= rxq->flow_meta_mask;
*RTE_MBUF_DYNFIELD(pkt, rxq->flow_meta_offset,
uint32_t *) = meta;
}
}
if (rxq->csum)
pkt->ol_flags |= rxq_cq_to_ol_flags(cqe);
if (rxq->vlan_strip) {
bool vlan_strip;
if (mcqe == NULL ||
rxq->mcqe_format != MLX5_CQE_RESP_FORMAT_L34H_STRIDX)
vlan_strip = cqe->hdr_type_etc &
RTE_BE16(MLX5_CQE_VLAN_STRIPPED);
else
vlan_strip = mcqe->hdr_type &
RTE_BE16(MLX5_CQE_VLAN_STRIPPED);
if (vlan_strip) {
pkt->ol_flags |= RTE_MBUF_F_RX_VLAN | RTE_MBUF_F_RX_VLAN_STRIPPED;
pkt->vlan_tci = rte_be_to_cpu_16(cqe->vlan_info);
}
}
if (rxq->hw_timestamp) {
uint64_t ts = rte_be_to_cpu_64(cqe->timestamp);
if (rxq->rt_timestamp)
ts = mlx5_txpp_convert_rx_ts(rxq->sh, ts);
mlx5_timestamp_set(pkt, rxq->timestamp_offset, ts);
pkt->ol_flags |= rxq->timestamp_rx_flag;
}
}
/**
* DPDK callback for RX.
*
* @param dpdk_rxq
* Generic pointer to RX queue structure.
* @param[out] pkts
* Array to store received packets.
* @param pkts_n
* Maximum number of packets in array.
*
* @return
* Number of packets successfully received (<= pkts_n).
*/
uint16_t
mlx5_rx_burst(void *dpdk_rxq, struct rte_mbuf **pkts, uint16_t pkts_n)
{
struct mlx5_rxq_data *rxq = dpdk_rxq;
const unsigned int wqe_cnt = (1 << rxq->elts_n) - 1;
const unsigned int cqe_cnt = (1 << rxq->cqe_n) - 1;
const unsigned int sges_n = rxq->sges_n;
struct rte_mbuf *pkt = NULL;
struct rte_mbuf *seg = NULL;
volatile struct mlx5_cqe *cqe =
&(*rxq->cqes)[rxq->cq_ci & cqe_cnt];
unsigned int i = 0;
unsigned int rq_ci = rxq->rq_ci << sges_n;
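/* The local rq_ci works in units of single SGEs, hence the shift. */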
int len = 0; /* keep its value across iterations. */
while (pkts_n) {
unsigned int idx = rq_ci & wqe_cnt;
volatile struct mlx5_wqe_data_seg *wqe =
&((volatile struct mlx5_wqe_data_seg *)rxq->wqes)[idx];
struct rte_mbuf *rep = (*rxq->elts)[idx];
volatile struct mlx5_mini_cqe8 *mcqe = NULL;
if (pkt)
NEXT(seg) = rep;
seg = rep;
rte_prefetch0(seg);
rte_prefetch0(cqe);
rte_prefetch0(wqe);
/* Allocate the buf from the same pool. */
rep = rte_mbuf_raw_alloc(seg->pool);
if (unlikely(rep == NULL)) {
++rxq->stats.rx_nombuf;
if (!pkt) {
/*
* no buffers before we even started,
* bail out silently.
*/
break;
}
while (pkt != seg) {
MLX5_ASSERT(pkt != (*rxq->elts)[idx]);
rep = NEXT(pkt);
NEXT(pkt) = NULL;
NB_SEGS(pkt) = 1;
rte_mbuf_raw_free(pkt);
pkt = rep;
}
rq_ci >>= sges_n;
++rq_ci;
rq_ci <<= sges_n;
break;
}
if (!pkt) {
cqe = &(*rxq->cqes)[rxq->cq_ci & cqe_cnt];
len = mlx5_rx_poll_len(rxq, cqe, cqe_cnt, &mcqe);
if (len <= 0) {
rte_mbuf_raw_free(rep);
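/*
* The error CQE handler may have reinitialized the RQ and fully
* replenished it; reload the local consumer index from the queue
* state before leaving the loop.
*/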
if (unlikely(len == MLX5_ERROR_CQE_RET))
rq_ci = rxq->rq_ci << sges_n;
break;
}
pkt = seg;
MLX5_ASSERT(len >= (rxq->crc_present << 2));
pkt->ol_flags &= RTE_MBUF_F_EXTERNAL;
rxq_cq_to_mbuf(rxq, pkt, cqe, mcqe);
if (rxq->crc_present)
len -= RTE_ETHER_CRC_LEN;
PKT_LEN(pkt) = len;
if (cqe->lro_num_seg > 1) {
mlx5_lro_update_hdr
(rte_pktmbuf_mtod(pkt, uint8_t *), cqe,
mcqe, rxq, len);
pkt->ol_flags |= RTE_MBUF_F_RX_LRO;
pkt->tso_segsz = len / cqe->lro_num_seg;
}
}
DATA_LEN(rep) = DATA_LEN(seg);
PKT_LEN(rep) = PKT_LEN(seg);
SET_DATA_OFF(rep, DATA_OFF(seg));
PORT(rep) = PORT(seg);
(*rxq->elts)[idx] = rep;
/*
* Fill NIC descriptor with the new buffer. The lkey and size
* of the buffers are already known, only the buffer address
* changes.
*/
wqe->addr = rte_cpu_to_be_64(rte_pktmbuf_mtod(rep, uintptr_t));
/* If there's only one MR, no need to replace LKey in WQE. */
if (unlikely(mlx5_mr_btree_len(&rxq->mr_ctrl.cache_bh) > 1))
wqe->lkey = mlx5_rx_mb2mr(rxq, rep);
if (len > DATA_LEN(seg)) {
len -= DATA_LEN(seg);
++NB_SEGS(pkt);
++rq_ci;
continue;
}
DATA_LEN(seg) = len;
#ifdef MLX5_PMD_SOFT_COUNTERS
/* Increment bytes counter. */
rxq->stats.ibytes += PKT_LEN(pkt);
#endif
/* Return packet. */
*(pkts++) = pkt;
pkt = NULL;
--pkts_n;
++i;
/* Align consumer index to the next stride. */
rq_ci >>= sges_n;
++rq_ci;
rq_ci <<= sges_n;
}
if (unlikely(i == 0 && ((rq_ci >> sges_n) == rxq->rq_ci)))
return 0;
/* Update the consumer index. */
rxq->rq_ci = rq_ci >> sges_n;
rte_io_wmb();
*rxq->cq_db = rte_cpu_to_be_32(rxq->cq_ci);
rte_io_wmb();
*rxq->rq_db = rte_cpu_to_be_32(rxq->rq_ci);
#ifdef MLX5_PMD_SOFT_COUNTERS
/* Increment packets counter. */
rxq->stats.ipackets += i;
#endif
return i;
}
/**
* Update LRO packet TCP header.
* The HW LRO feature doesn't update the TCP header after coalescing the
* TCP segments but supplies information in the CQE for SW to fill it in.
*
* @param tcp
* Pointer to the TCP header.
* @param cqe
* Pointer to the completion entry.
* @param phcsum
* The L3 pseudo-header checksum.
* @param l4_type
* The L4 header type from the CQE.
*/
static inline void
mlx5_lro_update_tcp_hdr(struct rte_tcp_hdr *__rte_restrict tcp,
volatile struct mlx5_cqe *__rte_restrict cqe,
uint32_t phcsum, uint8_t l4_type)
{
/*
* The HW calculates only the TCP payload checksum, need to complete
* the TCP header checksum and the L3 pseudo-header checksum.
*/
uint32_t csum = phcsum + cqe->csum;
if (l4_type == MLX5_L4_HDR_TYPE_TCP_EMPTY_ACK ||
l4_type == MLX5_L4_HDR_TYPE_TCP_WITH_ACL) {
tcp->tcp_flags |= RTE_TCP_ACK_FLAG;
tcp->recv_ack = cqe->lro_ack_seq_num;
tcp->rx_win = cqe->lro_tcp_win;
}
if (cqe->lro_tcppsh_abort_dupack & MLX5_CQE_LRO_PUSH_MASK)
tcp->tcp_flags |= RTE_TCP_PSH_FLAG;
tcp->cksum = 0;
csum += rte_raw_cksum(tcp, (tcp->data_off >> 4) * 4);
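/* Fold the 32-bit sum into 16 bits, complement it, and map zero to 0xffff. */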
csum = ((csum & 0xffff0000) >> 16) + (csum & 0xffff);
csum = (~csum) & 0xffff;
if (csum == 0)
csum = 0xffff;
tcp->cksum = csum;
}
/**
* Update LRO packet headers.
* The HW LRO feature doesn't update the L3/TCP headers after coalescing the
* TCP segments but supplies information in the CQE for SW to fill them in.
*
* @param padd
* The packet address.
* @param cqe
* Pointer to the completion entry.
* @param len
* The packet length.
*/
static inline void
mlx5_lro_update_hdr(uint8_t *__rte_restrict padd,
volatile struct mlx5_cqe *__rte_restrict cqe,
volatile struct mlx5_mini_cqe8 *mcqe,
struct mlx5_rxq_data *rxq, uint32_t len)
{
union {
struct rte_ether_hdr *eth;
struct rte_vlan_hdr *vlan;
struct rte_ipv4_hdr *ipv4;
struct rte_ipv6_hdr *ipv6;
struct rte_tcp_hdr *tcp;
uint8_t *hdr;
} h = {
.hdr = padd,
};
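/*
* Walk the packet headers in place: each h.<type>++ below advances the
* shared pointer past the header that has just been updated.
*/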
uint16_t proto = h.eth->ether_type;
uint32_t phcsum;
uint8_t l4_type;
h.eth++;
while (proto == RTE_BE16(RTE_ETHER_TYPE_VLAN) ||
proto == RTE_BE16(RTE_ETHER_TYPE_QINQ)) {
proto = h.vlan->eth_proto;
h.vlan++;
}
if (proto == RTE_BE16(RTE_ETHER_TYPE_IPV4)) {
h.ipv4->time_to_live = cqe->lro_min_ttl;
h.ipv4->total_length = rte_cpu_to_be_16(len - (h.hdr - padd));
h.ipv4->hdr_checksum = 0;
h.ipv4->hdr_checksum = rte_ipv4_cksum(h.ipv4);
phcsum = rte_ipv4_phdr_cksum(h.ipv4, 0);
h.ipv4++;
} else {
h.ipv6->hop_limits = cqe->lro_min_ttl;
h.ipv6->payload_len = rte_cpu_to_be_16(len - (h.hdr - padd) -
sizeof(*h.ipv6));
phcsum = rte_ipv6_phdr_cksum(h.ipv6, 0);
h.ipv6++;
}
if (mcqe == NULL ||
rxq->mcqe_format != MLX5_CQE_RESP_FORMAT_L34H_STRIDX)
l4_type = (rte_be_to_cpu_16(cqe->hdr_type_etc) &
MLX5_CQE_L4_TYPE_MASK) >> MLX5_CQE_L4_TYPE_SHIFT;
else
l4_type = (rte_be_to_cpu_16(mcqe->hdr_type) &
MLX5_CQE_L4_TYPE_MASK) >> MLX5_CQE_L4_TYPE_SHIFT;
mlx5_lro_update_tcp_hdr(h.tcp, cqe, phcsum, l4_type);
}
void
mlx5_mprq_buf_free(struct mlx5_mprq_buf *buf)
{
mlx5_mprq_buf_free_cb(NULL, buf);
}
/**
* DPDK callback for RX with Multi-Packet RQ support.
*
* @param dpdk_rxq
* Generic pointer to RX queue structure.
* @param[out] pkts
* Array to store received packets.
* @param pkts_n
* Maximum number of packets in array.
*
* @return
* Number of packets successfully received (<= pkts_n).
*/
uint16_t
mlx5_rx_burst_mprq(void *dpdk_rxq, struct rte_mbuf **pkts, uint16_t pkts_n)
{
struct mlx5_rxq_data *rxq = dpdk_rxq;
const uint32_t strd_n = RTE_BIT32(rxq->log_strd_num);
const uint32_t strd_sz = RTE_BIT32(rxq->log_strd_sz);
const uint32_t cq_mask = (1 << rxq->cqe_n) - 1;
const uint32_t wq_mask = (1 << rxq->elts_n) - 1;
volatile struct mlx5_cqe *cqe = &(*rxq->cqes)[rxq->cq_ci & cq_mask];
unsigned int i = 0;
uint32_t rq_ci = rxq->rq_ci;
uint16_t consumed_strd = rxq->consumed_strd;
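/*
* Work on local copies of the consumer state; they are written back
* to the queue structure only once the burst loop finishes.
*/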
struct mlx5_mprq_buf *buf = (*rxq->mprq_bufs)[rq_ci & wq_mask];
while (i < pkts_n) {
struct rte_mbuf *pkt;
int ret;
uint32_t len;
uint16_t strd_cnt;
uint16_t strd_idx;
uint32_t byte_cnt;
volatile struct mlx5_mini_cqe8 *mcqe = NULL;
enum mlx5_rqx_code rxq_code;
if (consumed_strd == strd_n) {
/* Replace WQE if the buffer is still in use. */
mprq_buf_replace(rxq, rq_ci & wq_mask);
/* Advance to the next WQE. */
consumed_strd = 0;
++rq_ci;
buf = (*rxq->mprq_bufs)[rq_ci & wq_mask];
}
cqe = &(*rxq->cqes)[rxq->cq_ci & cq_mask];
ret = mlx5_rx_poll_len(rxq, cqe, cq_mask, &mcqe);
if (ret == 0)
break;
if (unlikely(ret == MLX5_ERROR_CQE_RET)) {
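/*
* The error CQE handler may have reinitialized and fully replenished
* the RQ; reload both the WQE consumer index and the consumed stride
* count from the queue state.
*/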
rq_ci = rxq->rq_ci;
consumed_strd = rxq->consumed_strd;
break;
}
byte_cnt = ret;
len = (byte_cnt & MLX5_MPRQ_LEN_MASK) >> MLX5_MPRQ_LEN_SHIFT;
MLX5_ASSERT((int)len >= (rxq->crc_present << 2));
if (rxq->crc_present)
len -= RTE_ETHER_CRC_LEN;
if (mcqe &&
rxq->mcqe_format == MLX5_CQE_RESP_FORMAT_FTAG_STRIDX)
strd_cnt = (len / strd_sz) + !!(len % strd_sz);
else
strd_cnt = (byte_cnt & MLX5_MPRQ_STRIDE_NUM_MASK) >>
MLX5_MPRQ_STRIDE_NUM_SHIFT;
MLX5_ASSERT(strd_cnt);
consumed_strd += strd_cnt;
if (byte_cnt & MLX5_MPRQ_FILLER_MASK)
continue;
strd_idx = rte_be_to_cpu_16(mcqe == NULL ?
cqe->wqe_counter :
mcqe->stride_idx);
MLX5_ASSERT(strd_idx < strd_n);
MLX5_ASSERT(!((rte_be_to_cpu_16(cqe->wqe_id) ^ rq_ci) &
wq_mask));
pkt = rte_pktmbuf_alloc(rxq->mp);
if (unlikely(pkt == NULL)) {
++rxq->stats.rx_nombuf;
break;
}
len = (byte_cnt & MLX5_MPRQ_LEN_MASK) >> MLX5_MPRQ_LEN_SHIFT;
MLX5_ASSERT((int)len >= (rxq->crc_present << 2));
if (rxq->crc_present)
len -= RTE_ETHER_CRC_LEN;
rxq_code = mprq_buf_to_pkt(rxq, pkt, len, buf,
strd_idx, strd_cnt);
if (unlikely(rxq_code != MLX5_RXQ_CODE_EXIT)) {
rte_pktmbuf_free_seg(pkt);
if (rxq_code == MLX5_RXQ_CODE_DROPPED) {
++rxq->stats.idropped;
continue;
}
if (rxq_code == MLX5_RXQ_CODE_NOMBUF) {
++rxq->stats.rx_nombuf;
break;
}
}
rxq_cq_to_mbuf(rxq, pkt, cqe, mcqe);
if (cqe->lro_num_seg > 1) {
mlx5_lro_update_hdr(rte_pktmbuf_mtod(pkt, uint8_t *),
cqe, mcqe, rxq, len);
pkt->ol_flags |= RTE_MBUF_F_RX_LRO;
pkt->tso_segsz = len / cqe->lro_num_seg;
}
PKT_LEN(pkt) = len;
PORT(pkt) = rxq->port_id;
#ifdef MLX5_PMD_SOFT_COUNTERS
/* Increment bytes counter. */
rxq->stats.ibytes += PKT_LEN(pkt);
#endif
/* Return packet. */
*(pkts++) = pkt;
++i;
}
/* Update the consumer indexes. */
rxq->consumed_strd = consumed_strd;
rte_io_wmb();
*rxq->cq_db = rte_cpu_to_be_32(rxq->cq_ci);
if (rq_ci != rxq->rq_ci) {
rxq->rq_ci = rq_ci;
rte_io_wmb();
*rxq->rq_db = rte_cpu_to_be_32(rxq->rq_ci);
}
#ifdef MLX5_PMD_SOFT_COUNTERS
/* Increment packets counter. */
rxq->stats.ipackets += i;
#endif
return i;
}
/*
* Vectorized Rx routines are not compiled in when required vector instructions
* are not supported on a target architecture.
* The following null stubs are needed for linkage when those are not included
* outside of this file (e.g. mlx5_rxtx_vec_sse.c for x86).
*/
__rte_weak uint16_t
mlx5_rx_burst_vec(void *dpdk_rxq __rte_unused,
struct rte_mbuf **pkts __rte_unused,
uint16_t pkts_n __rte_unused)
{
return 0;
}
__rte_weak uint16_t
mlx5_rx_burst_mprq_vec(void *dpdk_rxq __rte_unused,
struct rte_mbuf **pkts __rte_unused,
uint16_t pkts_n __rte_unused)
{
return 0;
}
__rte_weak int
mlx5_rxq_check_vec_support(struct mlx5_rxq_data *rxq __rte_unused)
{
return -ENOTSUP;
}
__rte_weak int
mlx5_check_vec_rx_support(struct rte_eth_dev *dev __rte_unused)
{
return -ENOTSUP;
}
int
mlx5_rx_queue_lwm_query(struct rte_eth_dev *dev,
uint16_t *queue_id, uint8_t *lwm)
{
struct mlx5_priv *priv = dev->data->dev_private;
unsigned int rxq_id, found = 0, n;
struct mlx5_rxq_priv *rxq;
if (!queue_id)
return -EINVAL;
/* Query all the Rx queues of the port in a circular way. */
for (rxq_id = *queue_id, n = 0; n < priv->rxqs_n; n++) {
rxq = mlx5_rxq_get(dev, rxq_id);
if (rxq && rxq->lwm_event_pending) {
pthread_mutex_lock(&priv->sh->lwm_config_lock);
rxq->lwm_event_pending = 0;
pthread_mutex_unlock(&priv->sh->lwm_config_lock);
*queue_id = rxq_id;
found = 1;
if (lwm)
*lwm = mlx5_rxq_lwm_to_percentage(rxq);
break;
}
rxq_id = (rxq_id + 1) % priv->rxqs_n;
}
return found;
}
/**
* Rte interrupt handler for LWM event.
* It first checks whether the event has arrived and, if so, processes the
* callback for RTE_ETH_EVENT_RX_AVAIL_THRESH.
*
* @param args
* Generic pointer to mlx5_priv.
*/
void
mlx5_dev_interrupt_handler_lwm(void *args)
{
struct mlx5_priv *priv = args;
struct mlx5_rxq_priv *rxq;
struct rte_eth_dev *dev;
int ret, rxq_idx = 0, port_id = 0;
ret = priv->obj_ops.rxq_event_get_lwm(priv, &rxq_idx, &port_id);
if (unlikely(ret < 0)) {
DRV_LOG(WARNING, "Cannot get LWM event context.");
return;
}
DRV_LOG(INFO, "%s get LWM event, port_id:%d rxq_id:%d.", __func__,
port_id, rxq_idx);
dev = &rte_eth_devices[port_id];
rxq = mlx5_rxq_get(dev, rxq_idx);
if (rxq) {
pthread_mutex_lock(&priv->sh->lwm_config_lock);
rxq->lwm_event_pending = 1;
pthread_mutex_unlock(&priv->sh->lwm_config_lock);
}
rte_eth_dev_callback_process(dev, RTE_ETH_EVENT_RX_AVAIL_THRESH, NULL);
}
/**
* DPDK callback to arm an Rx queue LWM (limit watermark) event.
* When the Rx queue fullness reaches the LWM limit, the driver catches
* an HW event and invokes the user event callback.
* After the last event handling, the user needs to call this API again
* to arm an additional event.
*
* @param dev
* Pointer to the device structure.
* @param[in] rx_queue_id
* Rx queue identifier.
* @param[in] lwm
* The LWM value, defined as a percentage of the Rx queue size:
* [1-99] to set a new LWM (updating the old value),
* 0 to unarm the event.
*
* @return
* 0 : operation success.
* Otherwise:
* - ENOMEM - not enough memory to create LWM event channel.
* - EINVAL - the input Rxq is not created by devx.
* - E2BIG - lwm is bigger than 99.
*/
int
mlx5_rx_queue_lwm_set(struct rte_eth_dev *dev, uint16_t rx_queue_id,
uint8_t lwm)
{
struct mlx5_priv *priv = dev->data->dev_private;
uint16_t port_id = PORT_ID(priv);
struct mlx5_rxq_priv *rxq = mlx5_rxq_get(dev, rx_queue_id);
uint16_t event_nums[1] = {MLX5_EVENT_TYPE_SRQ_LIMIT_REACHED};
struct mlx5_rxq_data *rxq_data;
uint32_t wqe_cnt;
uint64_t cookie;
int ret = 0;
if (!rxq) {
rte_errno = EINVAL;
return -rte_errno;
}
rxq_data = &rxq->ctrl->rxq;
/* Ensure the Rq is created by devx. */
if (priv->obj_ops.rxq_obj_new != devx_obj_ops.rxq_obj_new) {
rte_errno = EINVAL;
return -rte_errno;
}
if (lwm > 99) {
DRV_LOG(WARNING, "Too big LWM configuration.");
rte_errno = E2BIG;
return -rte_errno;
}
/* Start config LWM. */
pthread_mutex_lock(&priv->sh->lwm_config_lock);
if (rxq->lwm == 0 && lwm == 0) {
/* Both old/new values are 0, do nothing. */
ret = 0;
goto end;
}
wqe_cnt = 1 << (rxq_data->elts_n - rxq_data->sges_n);
if (lwm) {
if (!priv->sh->devx_channel_lwm) {
ret = mlx5_lwm_setup(priv);
if (ret) {
DRV_LOG(WARNING,
"Failed to create shared_lwm.");
rte_errno = ENOMEM;
ret = -rte_errno;
goto end;
}
}
if (!rxq->lwm_devx_subscribed) {
cookie = ((uint32_t)
(port_id << LWM_COOKIE_PORTID_OFFSET)) |
(rx_queue_id << LWM_COOKIE_RXQID_OFFSET);
ret = mlx5_os_devx_subscribe_devx_event
(priv->sh->devx_channel_lwm,
rxq->devx_rq.rq->obj,
sizeof(event_nums),
event_nums,
cookie);
if (ret) {
rte_errno = rte_errno ? rte_errno : EINVAL;
ret = -rte_errno;
goto end;
}
rxq->lwm_devx_subscribed = 1;
}
}
/* Save LWM to rxq and send modify_rq devx command. */
rxq->lwm = lwm * wqe_cnt / 100;
/* Round up to compensate for integer truncation when converting the percentage to a WQE count. */
if (lwm && (lwm * wqe_cnt % 100)) {
rxq->lwm = ((uint32_t)(rxq->lwm + 1) >= wqe_cnt) ?
rxq->lwm : (rxq->lwm + 1);
}
if (lwm && !rxq->lwm) {
/* With mprq, wqe_cnt may be < 100. */
DRV_LOG(WARNING, "Too small LWM configuration.");
rte_errno = EINVAL;
ret = -rte_errno;
goto end;
}
ret = mlx5_devx_modify_rq(rxq, MLX5_RXQ_MOD_RDY2RDY);
end:
pthread_mutex_unlock(&priv->sh->lwm_config_lock);
return ret;
}
/**
* Mlx5 access register function to configure host shaper.
* It calls an API in libmtcr_ul to access the QSHR (QoS Shaper Host Register)
* in firmware.
*
* @param dev
* Pointer to rte_eth_dev.
* @param lwm_triggered
* Flag to enable/disable lwm_triggered bit in QSHR.
* @param rate
* Host shaper rate in units of 100 Mbps; 0 disables the shaper.
* @return
* 0 : operation success.
* Otherwise:
* - ENOENT - no ibdev interface.
* - EBUSY - the register access unit is busy.
* - EIO - the register access command encounters an IO error.
*/
static int
mlxreg_host_shaper_config(struct rte_eth_dev *dev,
bool lwm_triggered, uint8_t rate)
{
#ifdef HAVE_MLX5_MSTFLINT
struct mlx5_priv *priv = dev->data->dev_private;
uint32_t data[MLX5_ST_SZ_DW(register_qshr)] = {0};
int rc, retry_count = 3;
mfile *mf = NULL;
int status;
void *ptr;
mf = mopen(priv->sh->ibdev_name);
if (!mf) {
DRV_LOG(WARNING, "mopen failed\n");
rte_errno = ENOENT;
return -rte_errno;
}
MLX5_SET(register_qshr, data, connected_host, 1);
MLX5_SET(register_qshr, data, fast_response, lwm_triggered ? 1 : 0);
MLX5_SET(register_qshr, data, local_port, 1);
ptr = MLX5_ADDR_OF(register_qshr, data, global_config);
MLX5_SET(ets_global_config_register, ptr, rate_limit_update, 1);
MLX5_SET(ets_global_config_register, ptr, max_bw_units,
rate ? ETS_GLOBAL_CONFIG_BW_UNIT_HUNDREDS_MBPS :
ETS_GLOBAL_CONFIG_BW_UNIT_DISABLED);
MLX5_SET(ets_global_config_register, ptr, max_bw_value, rate);
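/*
* Issue the QSHR access-register write, retrying a few times on
* transient busy/bad-parameter errors.
*/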
do {
rc = maccess_reg(mf,
MLX5_QSHR_REGISTER_ID,
MACCESS_REG_METHOD_SET,
(u_int32_t *)&data[0],
sizeof(data),
sizeof(data),
sizeof(data),
&status);
if ((rc != ME_ICMD_STATUS_IFC_BUSY &&
status != ME_REG_ACCESS_BAD_PARAM) ||
!(mf->flags & MDEVS_REM)) {
break;
}
DRV_LOG(WARNING, "%s retry.", __func__);
usleep(10000);
} while (retry_count-- > 0);
mclose(mf);
rte_errno = (rc == ME_REG_ACCESS_DEV_BUSY) ? EBUSY : EIO;
return rc ? -rte_errno : 0;
#else
(void)dev;
(void)lwm_triggered;
(void)rate;
return -1;
#endif
}
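/**
* Public helper exposed through rte_pmd_mlx5.h: records the requested
* shaper rate and the lwm-triggered mode in the shared context, then
* pushes the configuration to firmware via the QSHR access register.
*/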
int rte_pmd_mlx5_host_shaper_config(int port_id, uint8_t rate,
uint32_t flags)
{
struct rte_eth_dev *dev = &rte_eth_devices[port_id];
struct mlx5_priv *priv = dev->data->dev_private;
bool lwm_triggered =
!!(flags & RTE_BIT32(MLX5_HOST_SHAPER_FLAG_AVAIL_THRESH_TRIGGERED));
if (!lwm_triggered) {
priv->sh->host_shaper_rate = rate;
} else {
switch (rate) {
case 0:
/* Rate 0 means disable lwm_triggered. */
priv->sh->lwm_triggered = 0;
break;
case 1:
/* Rate 1 means enable lwm_triggered. */
priv->sh->lwm_triggered = 1;
break;
default:
return -ENOTSUP;
}
}
return mlxreg_host_shaper_config(dev, priv->sh->lwm_triggered,
priv->sh->host_shaper_rate);
}