net/mlx5: extend Rx completion with error handling

When WQEs are posted to the HW to receive packets, the PMD may receive
a completion report with an error from the HW, a.k.a. an error CQE, which
is associated with a bad WQE.

The error reason may be a bad address, a wrong lkey, a too-small buffer
size, etc., any of which can be misconfigured by the PMD or by the user.

Checking for every possible mistake in order to prevent error CQEs does
not make sense because of the performance impact; moreover, some error
CQEs can be triggered by packets coming from the wire, over which the
DPDK application has no control.

Most error CQE types move the RQ to the error state, which causes all
subsequently received packets to be dropped by the HW and completed
with a flush-error CQE forever.

The current solution detects these error CQEs and even reports the
errors to the user through the statistics error counters, but it does
not recover: once the RQ has entered the error state it never moves back
to the ready state, and all subsequent packets are dropped.

Extend the error CQE handling with recovery: move the RQ state back to
ready, and re-arrange all the RQ WQEs and the management variables
appropriately.

Sometimes the root cause of an error CQE is very hard to debug and may
even be related to corner cases which are not easily reproducible.
Hence, a dump file with debug information is created for each of the
first few error CQEs; the maximum number of dump files can be configured
through the PMD probe parameters.

Cc: stable@dpdk.org

Signed-off-by: Matan Azrad <matan@mellanox.com>
Acked-by: Shahaf Shuler <shahafs@mellanox.com>
This commit is contained in:
Matan Azrad 2019-05-30 10:20:36 +00:00 committed by Ferruh Yigit
parent 6bb506cc2c
commit 88c0733535
3 changed files with 266 additions and 168 deletions

View File

@ -25,6 +25,7 @@
#include <rte_common.h>
#include <rte_branch_prediction.h>
#include <rte_ether.h>
#include <rte_cycles.h>
#include "mlx5.h"
#include "mlx5_utils.h"
@ -444,7 +445,7 @@ rx_queue_count(struct mlx5_rxq_data *rxq)
cq_ci = rxq->cq_ci;
}
cqe = &(*rxq->cqes)[cq_ci & cqe_cnt];
while (check_cqe(cqe, cqe_n, cq_ci) == 0) {
while (check_cqe(cqe, cqe_n, cq_ci) != MLX5_CQE_STATUS_HW_OWN) {
int8_t op_own;
unsigned int n;
@ -1883,6 +1884,130 @@ mlx5_rxq_initialize(struct mlx5_rxq_data *rxq)
*rxq->rq_db = rte_cpu_to_be_32(rxq->rq_ci);
}
/**
 * Handle a Rx error.
 * The function moves the RQ to the reset state when the first error CQE is
 * seen, then relies on the caller's loop to drain the CQ. Once the CQ is
 * empty, it moves the RQ back to the ready state and re-initializes the RQ.
 * Identifying the next CQE and counting the errors are the caller's
 * responsibility.
 *
 * The recovery progress is tracked in rxq->err_state, so the function is
 * expected to be called repeatedly until the state returns to
 * MLX5_RXQ_ERR_STATE_NO_ERROR.
 *
 * @param[in] rxq
 *   Pointer to RX queue structure.
 * @param[in] mbuf_prepare
 *   Whether to prepare mbufs for the RQ.
 *
 * @return
 *   -1 in case of recovery error, otherwise the CQE status of the CQE at
 *   the current consumer index (MLX5_CQE_STATUS_HW_OWN once the CQ has been
 *   drained and the RQ has been made ready again).
 */
int
mlx5_rx_err_handle(struct mlx5_rxq_data *rxq, uint8_t mbuf_prepare)
{
	const uint16_t cqe_n = 1 << rxq->cqe_n;
	const uint16_t cqe_mask = cqe_n - 1;
	const unsigned int wqe_n = 1 << rxq->elts_n;
	struct mlx5_rxq_ctrl *rxq_ctrl =
			container_of(rxq, struct mlx5_rxq_ctrl, rxq);
	struct ibv_wq_attr mod = {
		.attr_mask = IBV_WQ_ATTR_STATE,
	};
	/* Two views of the CQE located at the current consumer index. */
	union {
		volatile struct mlx5_cqe *cqe;
		volatile struct mlx5_err_cqe *err_cqe;
	} u = {
		.cqe = &(*rxq->cqes)[rxq->cq_ci & cqe_mask],
	};
	int ret;

	switch (rxq->err_state) {
	case MLX5_RXQ_ERR_STATE_NO_ERROR:
		/* First error CQE: start the recovery sequence. */
		rxq->err_state = MLX5_RXQ_ERR_STATE_NEED_RESET;
		/* Fall-through */
	case MLX5_RXQ_ERR_STATE_NEED_RESET:
		/* Only the primary process modifies the WQ state here. */
		if (rte_eal_process_type() != RTE_PROC_PRIMARY)
			return -1;
		mod.wq_state = IBV_WQS_RESET;
		ret = mlx5_glue->modify_wq(rxq_ctrl->ibv->wq, &mod);
		if (ret) {
			DRV_LOG(ERR, "Cannot change Rx WQ state to RESET %s\n",
				strerror(errno));
			return -1;
		}
		/* Dump debug information for a bounded number of errors. */
		if (rxq_ctrl->dump_file_n <
		    rxq_ctrl->priv->config.max_dump_files_num) {
			MKSTR(err_str, "Unexpected CQE error syndrome "
			      "0x%02x CQN = %u RQN = %u wqe_counter = %u"
			      " rq_ci = %u cq_ci = %u", u.err_cqe->syndrome,
			      rxq->cqn, rxq_ctrl->ibv->wq->wq_num,
			      rte_be_to_cpu_16(u.err_cqe->wqe_counter),
			      rxq->rq_ci << rxq->sges_n, rxq->cq_ci);
			MKSTR(name, "dpdk_mlx5_port_%u_rxq_%u_%u",
			      rxq->port_id, rxq->idx, (uint32_t)rte_rdtsc());
			mlx5_dump_debug_information(name, NULL, err_str, 0);
			mlx5_dump_debug_information(name, "MLX5 Error CQ:",
						    (const void *)((uintptr_t)
								    rxq->cqes),
						    sizeof(*u.cqe) * cqe_n);
			/*
			 * NOTE(review): 16 is presumably the size in bytes of
			 * one RQ WQE entry - TODO confirm and name it.
			 */
			mlx5_dump_debug_information(name, "MLX5 Error RQ:",
						    (const void *)((uintptr_t)
								    rxq->wqes),
						    16 * wqe_n);
			rxq_ctrl->dump_file_n++;
		}
		rxq->err_state = MLX5_RXQ_ERR_STATE_NEED_READY;
		/* Fall-through */
	case MLX5_RXQ_ERR_STATE_NEED_READY:
		ret = check_cqe(u.cqe, cqe_n, rxq->cq_ci);
		/* The RQ is made ready only once the CQ has been drained. */
		if (ret == MLX5_CQE_STATUS_HW_OWN) {
			rte_cio_wmb();
			*rxq->cq_db = rte_cpu_to_be_32(rxq->cq_ci);
			rte_cio_wmb();
			/*
			 * The RQ consumer index must be zeroed while moving
			 * from RESET state to RDY state.
			 */
			*rxq->rq_db = rte_cpu_to_be_32(0);
			rte_cio_wmb();
			mod.wq_state = IBV_WQS_RDY;
			/*
			 * Do not store the result in ret: ret must keep the
			 * CQE status (HW_OWN) for the final return. A
			 * successful modify_wq() yields 0, which has the same
			 * value as MLX5_CQE_STATUS_SW_OWN and would make the
			 * caller consume a CQE still owned by the HW.
			 */
			if (mlx5_glue->modify_wq(rxq_ctrl->ibv->wq, &mod)) {
				DRV_LOG(ERR, "Cannot change Rx WQ state to RDY"
					" %s\n", strerror(errno));
				return -1;
			}
			if (mbuf_prepare) {
				const uint16_t q_mask = wqe_n - 1;
				uint16_t elt_idx;
				struct rte_mbuf **elt;
				int i;
				/* Number of ring entries to refill. */
				unsigned int n = wqe_n - (rxq->rq_ci -
							  rxq->rq_pi);

				for (i = 0; i < (int)n; ++i) {
					elt_idx = (rxq->rq_ci + i) & q_mask;
					elt = &(*rxq->elts)[elt_idx];
					*elt = rte_mbuf_raw_alloc(rxq->mp);
					if (!*elt) {
						/*
						 * Roll back the mbufs allocated
						 * by this loop so far.
						 * NOTE(review): the WQ is
						 * already RDY here, err_state
						 * stays NEED_READY and the
						 * freed elts entries are not
						 * cleared - confirm callers
						 * tolerate this.
						 */
						for (i--; i >= 0; --i) {
							elt_idx = (rxq->rq_ci +
								   i) & q_mask;
							elt = &(*rxq->elts)
								[elt_idx];
							rte_pktmbuf_free_seg
								(*elt);
						}
						return -1;
					}
				}
			}
			mlx5_rxq_initialize(rxq);
			rxq->err_state = MLX5_RXQ_ERR_STATE_NO_ERROR;
		}
		return ret;
	default:
		return -1;
	}
}
/**
* Get size of the next packet for a given CQE. For compressed CQEs, the
* consumer index is updated only once all packets of the current one have
@ -1897,8 +2022,7 @@ mlx5_rxq_initialize(struct mlx5_rxq_data *rxq)
* written.
*
* @return
* Packet size in bytes (0 if there is none), -1 in case of completion
* with error.
* 0 in case of empty CQE, otherwise the packet size in bytes.
*/
static inline int
mlx5_rx_poll_len(struct mlx5_rxq_data *rxq, volatile struct mlx5_cqe *cqe,
@ -1906,98 +2030,118 @@ mlx5_rx_poll_len(struct mlx5_rxq_data *rxq, volatile struct mlx5_cqe *cqe,
{
struct rxq_zip *zip = &rxq->zip;
uint16_t cqe_n = cqe_cnt + 1;
int len = 0;
int len;
uint16_t idx, end;
/* Process compressed data in the CQE and mini arrays. */
if (zip->ai) {
volatile struct mlx5_mini_cqe8 (*mc)[8] =
(volatile struct mlx5_mini_cqe8 (*)[8])
(uintptr_t)(&(*rxq->cqes)[zip->ca & cqe_cnt].pkt_info);
len = rte_be_to_cpu_32((*mc)[zip->ai & 7].byte_cnt);
*mcqe = &(*mc)[zip->ai & 7];
if ((++zip->ai & 7) == 0) {
/* Invalidate consumed CQEs */
idx = zip->ca;
end = zip->na;
while (idx != end) {
(*rxq->cqes)[idx & cqe_cnt].op_own =
MLX5_CQE_INVALIDATE;
++idx;
}
/*
* Increment consumer index to skip the number of
* CQEs consumed. Hardware leaves holes in the CQ
* ring for software use.
*/
zip->ca = zip->na;
zip->na += 8;
}
if (unlikely(rxq->zip.ai == rxq->zip.cqe_cnt)) {
/* Invalidate the rest */
idx = zip->ca;
end = zip->cq_ci;
while (idx != end) {
(*rxq->cqes)[idx & cqe_cnt].op_own =
MLX5_CQE_INVALIDATE;
++idx;
}
rxq->cq_ci = zip->cq_ci;
zip->ai = 0;
}
/* No compressed data, get next CQE and verify if it is compressed. */
} else {
int ret;
int8_t op_own;
ret = check_cqe(cqe, cqe_n, rxq->cq_ci);
if (unlikely(ret == 1))
return 0;
++rxq->cq_ci;
op_own = cqe->op_own;
rte_cio_rmb();
if (MLX5_CQE_FORMAT(op_own) == MLX5_COMPRESSED) {
do {
len = 0;
/* Process compressed data in the CQE and mini arrays. */
if (zip->ai) {
volatile struct mlx5_mini_cqe8 (*mc)[8] =
(volatile struct mlx5_mini_cqe8 (*)[8])
(uintptr_t)(&(*rxq->cqes)[rxq->cq_ci &
(uintptr_t)(&(*rxq->cqes)[zip->ca &
cqe_cnt].pkt_info);
/* Fix endianness. */
zip->cqe_cnt = rte_be_to_cpu_32(cqe->byte_cnt);
/*
* Current mini array position is the one returned by
* check_cqe64().
*
* If completion comprises several mini arrays, as a
* special case the second one is located 7 CQEs after
* the initial CQE instead of 8 for subsequent ones.
*/
zip->ca = rxq->cq_ci;
zip->na = zip->ca + 7;
/* Compute the next non compressed CQE. */
--rxq->cq_ci;
zip->cq_ci = rxq->cq_ci + zip->cqe_cnt;
/* Get packet size to return. */
len = rte_be_to_cpu_32((*mc)[0].byte_cnt);
*mcqe = &(*mc)[0];
zip->ai = 1;
/* Prefetch all the entries to be invalidated */
idx = zip->ca;
end = zip->cq_ci;
while (idx != end) {
rte_prefetch0(&(*rxq->cqes)[(idx) & cqe_cnt]);
++idx;
len = rte_be_to_cpu_32((*mc)[zip->ai & 7].byte_cnt);
*mcqe = &(*mc)[zip->ai & 7];
if ((++zip->ai & 7) == 0) {
/* Invalidate consumed CQEs */
idx = zip->ca;
end = zip->na;
while (idx != end) {
(*rxq->cqes)[idx & cqe_cnt].op_own =
MLX5_CQE_INVALIDATE;
++idx;
}
/*
* Increment consumer index to skip the number
* of CQEs consumed. Hardware leaves holes in
* the CQ ring for software use.
*/
zip->ca = zip->na;
zip->na += 8;
}
if (unlikely(rxq->zip.ai == rxq->zip.cqe_cnt)) {
/* Invalidate the rest */
idx = zip->ca;
end = zip->cq_ci;
while (idx != end) {
(*rxq->cqes)[idx & cqe_cnt].op_own =
MLX5_CQE_INVALIDATE;
++idx;
}
rxq->cq_ci = zip->cq_ci;
zip->ai = 0;
}
/*
* No compressed data, get next CQE and verify if it is
* compressed.
*/
} else {
len = rte_be_to_cpu_32(cqe->byte_cnt);
int ret;
int8_t op_own;
ret = check_cqe(cqe, cqe_n, rxq->cq_ci);
if (unlikely(ret != MLX5_CQE_STATUS_SW_OWN)) {
if (unlikely(ret == MLX5_CQE_STATUS_ERR ||
rxq->err_state)) {
ret = mlx5_rx_err_handle(rxq, 0);
if (ret == MLX5_CQE_STATUS_HW_OWN ||
ret == -1)
return 0;
} else {
return 0;
}
}
++rxq->cq_ci;
op_own = cqe->op_own;
if (MLX5_CQE_FORMAT(op_own) == MLX5_COMPRESSED) {
volatile struct mlx5_mini_cqe8 (*mc)[8] =
(volatile struct mlx5_mini_cqe8 (*)[8])
(uintptr_t)(&(*rxq->cqes)
[rxq->cq_ci &
cqe_cnt].pkt_info);
/* Fix endianness. */
zip->cqe_cnt = rte_be_to_cpu_32(cqe->byte_cnt);
/*
* Current mini array position is the one
* returned by check_cqe64().
*
* If completion comprises several mini arrays,
* as a special case the second one is located
* 7 CQEs after the initial CQE instead of 8
* for subsequent ones.
*/
zip->ca = rxq->cq_ci;
zip->na = zip->ca + 7;
/* Compute the next non compressed CQE. */
--rxq->cq_ci;
zip->cq_ci = rxq->cq_ci + zip->cqe_cnt;
/* Get packet size to return. */
len = rte_be_to_cpu_32((*mc)[0].byte_cnt);
*mcqe = &(*mc)[0];
zip->ai = 1;
/* Prefetch all to be invalidated */
idx = zip->ca;
end = zip->cq_ci;
while (idx != end) {
rte_prefetch0(&(*rxq->cqes)[(idx) &
cqe_cnt]);
++idx;
}
} else {
len = rte_be_to_cpu_32(cqe->byte_cnt);
}
}
/* Error while receiving packet. */
if (unlikely(MLX5_CQE_OPCODE(op_own) == MLX5_CQE_RESP_ERR))
return -1;
}
return len;
if (unlikely(rxq->err_state)) {
cqe = &(*rxq->cqes)[rxq->cq_ci & cqe_cnt];
++rxq->stats.idropped;
} else {
return len;
}
} while (1);
}
/**
@ -2140,12 +2284,6 @@ mlx5_rx_burst(void *dpdk_rxq, struct rte_mbuf **pkts, uint16_t pkts_n)
rte_mbuf_raw_free(rep);
break;
}
if (unlikely(len == -1)) {
/* RX error, packet is likely too large. */
rte_mbuf_raw_free(rep);
++rxq->stats.idropped;
goto skip;
}
pkt = seg;
assert(len >= (rxq->crc_present << 2));
pkt->ol_flags = 0;
@ -2188,7 +2326,6 @@ mlx5_rx_burst(void *dpdk_rxq, struct rte_mbuf **pkts, uint16_t pkts_n)
pkt = NULL;
--pkts_n;
++i;
skip:
/* Align consumer index to the next stride. */
rq_ci >>= sges_n;
++rq_ci;
@ -2321,11 +2458,6 @@ mlx5_rx_burst_mprq(void *dpdk_rxq, struct rte_mbuf **pkts, uint16_t pkts_n)
ret = mlx5_rx_poll_len(rxq, cqe, cq_mask, &mcqe);
if (!ret)
break;
if (unlikely(ret == -1)) {
/* RX error, packet is likely too large. */
++rxq->stats.idropped;
continue;
}
byte_cnt = ret;
strd_cnt = (byte_cnt & MLX5_MPRQ_STRIDE_NUM_MASK) >>
MLX5_MPRQ_STRIDE_NUM_SHIFT;

View File

@ -36,6 +36,7 @@
#include "mlx5_autoconf.h"
#include "mlx5_defs.h"
#include "mlx5_prm.h"
#include "mlx5_glue.h"
/* Support tunnel matching. */
#define MLX5_FLOW_TUNNEL 5
@ -78,6 +79,12 @@ struct mlx5_mprq_buf {
/* Get pointer to the first stride. */
#define mlx5_mprq_buf_addr(ptr) ((ptr) + 1)
/*
 * Rx queue error recovery state, kept in the err_state bit-field of
 * struct mlx5_rxq_data and advanced by mlx5_rx_err_handle().
 */
enum mlx5_rxq_err_state {
/* No recovery in progress; restored after the RQ is re-initialized. */
MLX5_RXQ_ERR_STATE_NO_ERROR = 0,
/* An error CQE was seen; the WQ still has to be moved to RESET. */
MLX5_RXQ_ERR_STATE_NEED_RESET,
/* The WQ was reset; once the CQ is drained it has to be moved to RDY. */
MLX5_RXQ_ERR_STATE_NEED_READY,
};
/* RX queue descriptor. */
struct mlx5_rxq_data {
unsigned int csum:1; /* Enable checksum offloading. */
@ -92,7 +99,8 @@ struct mlx5_rxq_data {
unsigned int strd_num_n:5; /* Log 2 of the number of stride. */
unsigned int strd_sz_n:4; /* Log 2 of stride size. */
unsigned int strd_shift_en:1; /* Enable 2bytes shift on a stride. */
unsigned int :6; /* Remaining bits. */
unsigned int err_state:2; /* enum mlx5_rxq_err_state. */
unsigned int :4; /* Remaining bits. */
volatile uint32_t *rq_db;
volatile uint32_t *cq_db;
uint16_t port_id;
@ -153,6 +161,7 @@ struct mlx5_rxq_ctrl {
unsigned int irq:1; /* Whether IRQ is enabled. */
uint32_t flow_mark_n; /* Number of Mark/Flag flows using this Queue. */
uint32_t flow_tunnels_n[MLX5_FLOW_TUNNEL]; /* Tunnels counters. */
uint16_t dump_file_n; /* Number of dump files. */
};
/* Indirection table. */
@ -326,6 +335,9 @@ uint16_t mlx5_tx_burst_mpw_inline(void *dpdk_txq, struct rte_mbuf **pkts,
uint16_t mlx5_tx_burst_empw(void *dpdk_txq, struct rte_mbuf **pkts,
uint16_t pkts_n);
uint16_t mlx5_rx_burst(void *dpdk_rxq, struct rte_mbuf **pkts, uint16_t pkts_n);
void mlx5_rxq_initialize(struct mlx5_rxq_data *rxq);
__rte_noinline int mlx5_rx_err_handle(struct mlx5_rxq_data *rxq,
uint8_t mbuf_prepare);
void mlx5_mprq_buf_free_cb(void *addr, void *opaque);
void mlx5_mprq_buf_free(struct mlx5_mprq_buf *buf);
uint16_t mlx5_rx_burst_mprq(void *dpdk_rxq, struct rte_mbuf **pkts,
@ -420,32 +432,12 @@ __mlx5_uar_write64(uint64_t val, void *addr, rte_spinlock_t *lock)
#define mlx5_uar_write64(val, dst, lock) __mlx5_uar_write64(val, dst, lock)
#endif
#ifndef NDEBUG
/**
* Verify or set magic value in CQE.
*
* @param cqe
* Pointer to CQE.
*
* @return
* 0 the first time.
*/
static inline int
check_cqe_seen(volatile struct mlx5_cqe *cqe)
{
static const uint8_t magic[] = "seen";
volatile uint8_t (*buf)[sizeof(cqe->rsvd1)] = &cqe->rsvd1;
int ret = 1;
unsigned int i;
for (i = 0; i < sizeof(magic) && i < sizeof(*buf); ++i)
if (!ret || (*buf)[i] != magic[i]) {
ret = 0;
(*buf)[i] = magic[i];
}
return ret;
}
#endif /* NDEBUG */
/* CQE status, as reported by check_cqe(). */
enum mlx5_cqe_status {
/* Valid CQE without an error opcode; the software may consume it. */
MLX5_CQE_STATUS_SW_OWN,
/* Owner bit mismatch or invalid opcode: no CQE is available yet. */
MLX5_CQE_STATUS_HW_OWN,
/* Valid CQE whose opcode is MLX5_CQE_RESP_ERR or MLX5_CQE_REQ_ERR. */
MLX5_CQE_STATUS_ERR,
};
/**
* Check whether CQE is valid.
@ -458,51 +450,24 @@ check_cqe_seen(volatile struct mlx5_cqe *cqe)
* Consumer index.
*
* @return
* 0 on success, 1 on failure.
* The CQE status.
*/
static __rte_always_inline int
check_cqe(volatile struct mlx5_cqe *cqe,
unsigned int cqes_n, const uint16_t ci)
static __rte_always_inline enum mlx5_cqe_status
check_cqe(volatile struct mlx5_cqe *cqe, const uint16_t cqes_n,
const uint16_t ci)
{
uint16_t idx = ci & cqes_n;
uint8_t op_own = cqe->op_own;
uint8_t op_owner = MLX5_CQE_OWNER(op_own);
uint8_t op_code = MLX5_CQE_OPCODE(op_own);
const uint16_t idx = ci & cqes_n;
const uint8_t op_own = cqe->op_own;
const uint8_t op_owner = MLX5_CQE_OWNER(op_own);
const uint8_t op_code = MLX5_CQE_OPCODE(op_own);
if (unlikely((op_owner != (!!(idx))) || (op_code == MLX5_CQE_INVALID)))
return 1; /* No CQE. */
#ifndef NDEBUG
if ((op_code == MLX5_CQE_RESP_ERR) ||
(op_code == MLX5_CQE_REQ_ERR)) {
volatile struct mlx5_err_cqe *err_cqe = (volatile void *)cqe;
uint8_t syndrome = err_cqe->syndrome;
if ((syndrome == MLX5_CQE_SYNDROME_LOCAL_LENGTH_ERR) ||
(syndrome == MLX5_CQE_SYNDROME_REMOTE_ABORTED_ERR))
return 0;
if (!check_cqe_seen(cqe)) {
DRV_LOG(ERR,
"unexpected CQE error %u (0x%02x) syndrome"
" 0x%02x",
op_code, op_code, syndrome);
rte_hexdump(stderr, "MLX5 Error CQE:",
(const void *)((uintptr_t)err_cqe),
sizeof(*cqe));
}
return 1;
} else if ((op_code != MLX5_CQE_RESP_SEND) &&
(op_code != MLX5_CQE_REQ)) {
if (!check_cqe_seen(cqe)) {
DRV_LOG(ERR, "unexpected CQE opcode %u (0x%02x)",
op_code, op_code);
rte_hexdump(stderr, "MLX5 CQE:",
(const void *)((uintptr_t)cqe),
sizeof(*cqe));
}
return 1;
}
#endif /* NDEBUG */
return 0;
return MLX5_CQE_STATUS_HW_OWN;
rte_cio_rmb();
if (unlikely(op_code == MLX5_CQE_RESP_ERR ||
op_code == MLX5_CQE_REQ_ERR))
return MLX5_CQE_STATUS_ERR;
return MLX5_CQE_STATUS_SW_OWN;
}
/**

View File

@ -197,7 +197,7 @@ rxq_handle_pending_error(struct mlx5_rxq_data *rxq, struct rte_mbuf **pkts,
for (i = 0; i < pkts_n; ++i) {
struct rte_mbuf *pkt = pkts[i];
if (pkt->packet_type == RTE_PTYPE_ALL_MASK) {
if (pkt->packet_type == RTE_PTYPE_ALL_MASK || rxq->err_state) {
#ifdef MLX5_PMD_SOFT_COUNTERS
err_bytes += PKT_LEN(pkt);
#endif
@ -212,6 +212,7 @@ rxq_handle_pending_error(struct mlx5_rxq_data *rxq, struct rte_mbuf **pkts,
rxq->stats.ipackets -= (pkts_n - n);
rxq->stats.ibytes -= err_bytes;
#endif
mlx5_rx_err_handle(rxq, 1);
return n;
}
@ -236,7 +237,7 @@ mlx5_rx_burst_vec(void *dpdk_rxq, struct rte_mbuf **pkts, uint16_t pkts_n)
uint64_t err = 0;
nb_rx = rxq_burst_v(rxq, pkts, pkts_n, &err);
if (unlikely(err))
if (unlikely(err | rxq->err_state))
nb_rx = rxq_handle_pending_error(rxq, pkts, nb_rx);
return nb_rx;
}