numam-dpdk/drivers/net/mlx5/mlx5_flow_aso.c

/* SPDX-License-Identifier: BSD-3-Clause
* Copyright 2020 Mellanox Technologies, Ltd
*/
#include <mlx5_prm.h>
#include <rte_malloc.h>
#include <rte_cycles.h>
#include <rte_eal_paging.h>
#include <mlx5_malloc.h>
#include <mlx5_common_os.h>
#include <mlx5_common_devx.h>
#include "mlx5.h"
#include "mlx5_flow.h"
#include "mlx5_hws_cnt.h"
#define MLX5_ASO_CNT_QUEUE_LOG_DESC 14
/**
* Free MR resources.
*
* @param[in] cdev
* Pointer to the mlx5 common device.
* @param[in] mr
* MR to free.
*/
static void
mlx5_aso_dereg_mr(struct mlx5_common_device *cdev, struct mlx5_pmd_mr *mr)
{
void *addr = mr->addr;
cdev->mr_scache.dereg_mr_cb(mr);
mlx5_free(addr);
memset(mr, 0, sizeof(*mr));
}
/**
* Register Memory Region.
*
* @param[in] cdev
* Pointer to the mlx5 common device.
* @param[in] length
* Size of MR buffer.
* @param[in/out] mr
* Pointer to MR to create.
*
* @return
* 0 on success, a negative errno value otherwise and rte_errno is set.
*/
static int
mlx5_aso_reg_mr(struct mlx5_common_device *cdev, size_t length,
struct mlx5_pmd_mr *mr)
{
int ret;
mr->addr = mlx5_malloc(MLX5_MEM_RTE | MLX5_MEM_ZERO, length, 4096,
SOCKET_ID_ANY);
if (!mr->addr) {
DRV_LOG(ERR, "Failed to create ASO bits mem for MR.");
return -1;
}
ret = cdev->mr_scache.reg_mr_cb(cdev->pd, mr->addr, length, mr);
if (ret) {
DRV_LOG(ERR, "Failed to create direct Mkey.");
mlx5_free(mr->addr);
return -1;
}
return 0;
}
/**
* Destroy Send Queue used for ASO access.
*
* @param[in] sq
* ASO SQ to destroy.
*/
static void
mlx5_aso_destroy_sq(struct mlx5_aso_sq *sq)
{
mlx5_devx_sq_destroy(&sq->sq_obj);
mlx5_devx_cq_destroy(&sq->cq.cq_obj);
memset(sq, 0, sizeof(*sq));
}
/**
* Initialize Send Queue used for ASO counter access.
*
* @param[in] sq
* ASO SQ to initialize.
*/
static void
mlx5_aso_cnt_init_sq(struct mlx5_aso_sq *sq)
{
volatile struct mlx5_aso_wqe *restrict wqe;
int i;
int size = 1 << sq->log_desc_n;
/* The WQE fields set below are initialized once and stay constant. */
for (i = 0, wqe = &sq->sq_obj.aso_wqes[0]; i < size; ++i, ++wqe) {
wqe->general_cseg.sq_ds = rte_cpu_to_be_32((sq->sqn << 8) |
(sizeof(*wqe) >> 4));
wqe->aso_cseg.operand_masks = rte_cpu_to_be_32
(0u |
(ASO_OPER_LOGICAL_OR << ASO_CSEG_COND_OPER_OFFSET) |
(ASO_OP_ALWAYS_FALSE << ASO_CSEG_COND_1_OPER_OFFSET) |
(ASO_OP_ALWAYS_FALSE << ASO_CSEG_COND_0_OPER_OFFSET) |
(BYTEWISE_64BYTE << ASO_CSEG_DATA_MASK_MODE_OFFSET));
wqe->aso_cseg.data_mask = RTE_BE64(UINT64_MAX);
}
}
/**
* Initialize Send Queue used for ASO access.
*
* @param[in] sq
* ASO SQ to initialize.
*/
static void
mlx5_aso_age_init_sq(struct mlx5_aso_sq *sq)
{
volatile struct mlx5_aso_wqe *restrict wqe;
int i;
int size = 1 << sq->log_desc_n;
uint64_t addr;
/* The WQE fields set below are initialized once and stay constant. */
for (i = 0, wqe = &sq->sq_obj.aso_wqes[0]; i < size; ++i, ++wqe) {
wqe->general_cseg.sq_ds = rte_cpu_to_be_32((sq->sqn << 8) |
(sizeof(*wqe) >> 4));
wqe->aso_cseg.lkey = rte_cpu_to_be_32(sq->mr.lkey);
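/*
* Each WQE returns one pool's flow hit bitmap: one bit per age
* action, i.e. MLX5_ASO_AGE_ACTIONS_PER_POOL / 8 bytes.
*/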
addr = (uint64_t)((uint64_t *)sq->mr.addr + i *
MLX5_ASO_AGE_ACTIONS_PER_POOL / 64);
wqe->aso_cseg.va_h = rte_cpu_to_be_32((uint32_t)(addr >> 32));
wqe->aso_cseg.va_l_r = rte_cpu_to_be_32((uint32_t)addr | 1u);
wqe->aso_cseg.operand_masks = rte_cpu_to_be_32
(0u |
(ASO_OPER_LOGICAL_OR << ASO_CSEG_COND_OPER_OFFSET) |
(ASO_OP_ALWAYS_TRUE << ASO_CSEG_COND_1_OPER_OFFSET) |
(ASO_OP_ALWAYS_TRUE << ASO_CSEG_COND_0_OPER_OFFSET) |
(BYTEWISE_64BYTE << ASO_CSEG_DATA_MASK_MODE_OFFSET));
wqe->aso_cseg.data_mask = RTE_BE64(UINT64_MAX);
}
}
/**
* Initialize Send Queue used for ASO flow meter access.
*
* @param[in] sq
* ASO SQ to initialize.
*/
static void
mlx5_aso_mtr_init_sq(struct mlx5_aso_sq *sq)
{
volatile struct mlx5_aso_wqe *restrict wqe;
int i;
int size = 1 << sq->log_desc_n;
/* The WQE fields set below are initialized once and stay constant. */
for (i = 0, wqe = &sq->sq_obj.aso_wqes[0]; i < size; ++i, ++wqe) {
wqe->general_cseg.sq_ds = rte_cpu_to_be_32((sq->sqn << 8) |
(sizeof(*wqe) >> 4));
wqe->aso_cseg.operand_masks = RTE_BE32(0u |
(ASO_OPER_LOGICAL_OR << ASO_CSEG_COND_OPER_OFFSET) |
(ASO_OP_ALWAYS_TRUE << ASO_CSEG_COND_1_OPER_OFFSET) |
(ASO_OP_ALWAYS_TRUE << ASO_CSEG_COND_0_OPER_OFFSET) |
(BYTEWISE_64BYTE << ASO_CSEG_DATA_MASK_MODE_OFFSET));
wqe->general_cseg.flags = RTE_BE32(MLX5_COMP_ALWAYS <<
MLX5_COMP_MODE_OFFSET);
}
}
/*
* Initialize Send Queue used for ASO connection tracking.
*
* @param[in] sq
* ASO SQ to initialize.
*/
static void
mlx5_aso_ct_init_sq(struct mlx5_aso_sq *sq)
{
volatile struct mlx5_aso_wqe *restrict wqe;
int i;
int size = 1 << sq->log_desc_n;
uint64_t addr;
/* The WQE fields set below are initialized once and stay constant. */
for (i = 0, wqe = &sq->sq_obj.aso_wqes[0]; i < size; ++i, ++wqe) {
wqe->general_cseg.sq_ds = rte_cpu_to_be_32((sq->sqn << 8) |
(sizeof(*wqe) >> 4));
/* One unique MR for the query data. */
wqe->aso_cseg.lkey = rte_cpu_to_be_32(sq->mr.lkey);
/* Magic number 64 is the length of an ASO CT object. */
addr = (uint64_t)((uintptr_t)sq->mr.addr + i * 64);
wqe->aso_cseg.va_h = rte_cpu_to_be_32((uint32_t)(addr >> 32));
wqe->aso_cseg.va_l_r = rte_cpu_to_be_32((uint32_t)addr | 1u);
/*
* The operand_masks values differ between modify and query.
* The data_mask may also differ per modification; for a query it
* can be zero and is ignored.
* A CQE is always requested, in order to know when the flow can
* be created or the data read.
*/
wqe->general_cseg.flags = RTE_BE32(MLX5_COMP_ALWAYS <<
MLX5_COMP_MODE_OFFSET);
}
}
/**
* Create Send Queue used for ASO access.
*
* @param[in] cdev
* Pointer to the mlx5 common device.
* @param[in/out] sq
* Pointer to SQ to create.
* @param[in] uar
* User Access Region object.
*
* @return
* 0 on success, a negative errno value otherwise and rte_errno is set.
*/
static int
mlx5_aso_sq_create(struct mlx5_common_device *cdev, struct mlx5_aso_sq *sq,
void *uar, uint16_t log_desc_n)
{
struct mlx5_devx_cq_attr cq_attr = {
.uar_page_id = mlx5_os_get_devx_uar_page_id(uar),
};
struct mlx5_devx_create_sq_attr sq_attr = {
.user_index = 0xFFFF,
.wq_attr = (struct mlx5_devx_wq_attr){
.pd = cdev->pdn,
.uar_page = mlx5_os_get_devx_uar_page_id(uar),
},
.ts_format =
mlx5_ts_format_conv(cdev->config.hca_attr.sq_ts_format),
};
struct mlx5_devx_modify_sq_attr modify_attr = {
.state = MLX5_SQC_STATE_RDY,
};
uint16_t log_wqbb_n;
int ret;
if (mlx5_devx_cq_create(cdev->ctx, &sq->cq.cq_obj,
log_desc_n, &cq_attr,
SOCKET_ID_ANY))
goto error;
sq->cq.cq_ci = 0;
sq->cq.log_desc_n = log_desc_n;
sq->log_desc_n = log_desc_n;
sq_attr.cqn = sq->cq.cq_obj.cq->id;
/* An mlx5_aso_wqe is twice the size of an mlx5_wqe. */
log_wqbb_n = sq->log_desc_n + 1;
ret = mlx5_devx_sq_create(cdev->ctx, &sq->sq_obj, log_wqbb_n, &sq_attr,
SOCKET_ID_ANY);
if (ret) {
DRV_LOG(ERR, "Can't create SQ object.");
rte_errno = ENOMEM;
goto error;
}
ret = mlx5_devx_cmd_modify_sq(sq->sq_obj.sq, &modify_attr);
if (ret) {
DRV_LOG(ERR, "Can't change SQ state to ready.");
rte_errno = ENOMEM;
goto error;
}
sq->pi = 0;
sq->head = 0;
sq->tail = 0;
sq->sqn = sq->sq_obj.sq->id;
rte_spinlock_init(&sq->sqsl);
return 0;
error:
mlx5_aso_destroy_sq(sq);
return -1;
}
void
mlx5_aso_mtr_queue_uninit(struct mlx5_dev_ctx_shared *sh __rte_unused,
struct mlx5_aso_mtr_pool *hws_pool,
struct mlx5_aso_mtr_pools_mng *pool_mng)
{
uint32_t i;
if (hws_pool) {
for (i = 0; i < hws_pool->nb_sq; i++)
mlx5_aso_destroy_sq(hws_pool->sq + i);
mlx5_free(hws_pool->sq);
return;
}
if (pool_mng)
mlx5_aso_destroy_sq(&pool_mng->sq);
}
int
mlx5_aso_mtr_queue_init(struct mlx5_dev_ctx_shared *sh,
struct mlx5_aso_mtr_pool *hws_pool,
struct mlx5_aso_mtr_pools_mng *pool_mng,
uint32_t nb_queues)
{
struct mlx5_common_device *cdev = sh->cdev;
struct mlx5_aso_sq *sq;
uint32_t i;
if (hws_pool) {
sq = mlx5_malloc(MLX5_MEM_ZERO,
sizeof(struct mlx5_aso_sq) * nb_queues,
RTE_CACHE_LINE_SIZE, SOCKET_ID_ANY);
if (!sq)
return -1;
hws_pool->sq = sq;
for (i = 0; i < nb_queues; i++) {
if (mlx5_aso_sq_create(cdev, hws_pool->sq + i,
sh->tx_uar.obj,
MLX5_ASO_QUEUE_LOG_DESC))
goto error;
mlx5_aso_mtr_init_sq(hws_pool->sq + i);
}
hws_pool->nb_sq = nb_queues;
}
if (pool_mng) {
if (mlx5_aso_sq_create(cdev, &pool_mng->sq,
sh->tx_uar.obj,
MLX5_ASO_QUEUE_LOG_DESC))
return -1;
mlx5_aso_mtr_init_sq(&pool_mng->sq);
}
return 0;
error:
do {
mlx5_aso_destroy_sq(hws_pool->sq + i);
} while (i--);
return -1;
}
/**
* API to create and initialize Send Queue used for ASO access.
*
* @param[in] sh
* Pointer to shared device context.
* @param[in] aso_opc_mod
* Mode of ASO feature.
* @param[in] nb_queues
* Number of Send Queues to create.
*
* @return
* 0 on success, a negative errno value otherwise and rte_errno is set.
*/
int
mlx5_aso_queue_init(struct mlx5_dev_ctx_shared *sh,
enum mlx5_access_aso_opc_mod aso_opc_mod,
uint32_t nb_queues)
{
uint32_t sq_desc_n = 1 << MLX5_ASO_QUEUE_LOG_DESC;
struct mlx5_common_device *cdev = sh->cdev;
switch (aso_opc_mod) {
case ASO_OPC_MOD_FLOW_HIT:
if (mlx5_aso_reg_mr(cdev, (MLX5_ASO_AGE_ACTIONS_PER_POOL / 8) *
sq_desc_n, &sh->aso_age_mng->aso_sq.mr))
return -1;
if (mlx5_aso_sq_create(cdev, &sh->aso_age_mng->aso_sq,
sh->tx_uar.obj,
MLX5_ASO_QUEUE_LOG_DESC)) {
mlx5_aso_dereg_mr(cdev, &sh->aso_age_mng->aso_sq.mr);
return -1;
}
mlx5_aso_age_init_sq(&sh->aso_age_mng->aso_sq);
break;
case ASO_OPC_MOD_POLICER:
if (mlx5_aso_mtr_queue_init(sh, NULL,
&sh->mtrmng->pools_mng, nb_queues))
return -1;
break;
case ASO_OPC_MOD_CONNECTION_TRACKING:
if (mlx5_aso_ct_queue_init(sh, sh->ct_mng, MLX5_ASO_CT_SQ_NUM))
return -1;
break;
default:
DRV_LOG(ERR, "Unknown ASO operation mode");
return -1;
}
return 0;
}
/**
* API to destroy Send Queue used for ASO access.
*
* @param[in] sh
* Pointer to shared device context.
* @param[in] aso_opc_mod
* Mode of ASO feature.
*/
void
mlx5_aso_queue_uninit(struct mlx5_dev_ctx_shared *sh,
enum mlx5_access_aso_opc_mod aso_opc_mod)
{
struct mlx5_aso_sq *sq = NULL;
switch (aso_opc_mod) {
case ASO_OPC_MOD_FLOW_HIT:
mlx5_aso_dereg_mr(sh->cdev, &sh->aso_age_mng->aso_sq.mr);
sq = &sh->aso_age_mng->aso_sq;
break;
case ASO_OPC_MOD_POLICER:
mlx5_aso_mtr_queue_uninit(sh, NULL, &sh->mtrmng->pools_mng);
break;
case ASO_OPC_MOD_CONNECTION_TRACKING:
mlx5_aso_ct_queue_uninit(sh, sh->ct_mng);
break;
default:
DRV_LOG(ERR, "Unknown ASO operation mode");
return;
}
if (sq)
mlx5_aso_destroy_sq(sq);
}
/**
* Write a burst of WQEs to ASO SQ.
*
* @param[in] sh
* Pointer to shared device context.
* @param[in] n
* Index of the last valid pool.
*
* @return
* Number of WQEs in burst.
*/
static uint16_t
mlx5_aso_sq_enqueue_burst(struct mlx5_dev_ctx_shared *sh, uint16_t n)
{
struct mlx5_aso_age_mng *mng = sh->aso_age_mng;
volatile struct mlx5_aso_wqe *wqe;
struct mlx5_aso_sq *sq = &mng->aso_sq;
struct mlx5_aso_age_pool *pool;
uint16_t size = 1 << sq->log_desc_n;
uint16_t mask = size - 1;
uint16_t max;
uint16_t start_head = sq->head;
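/* Limit the burst by free SQ slots and by pools not yet queried. */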
max = RTE_MIN(size - (uint16_t)(sq->head - sq->tail), n - sq->next);
if (unlikely(!max))
return 0;
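/* Store the burst size in the first element for the completion handler. */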
sq->elts[start_head & mask].burst_size = max;
do {
wqe = &sq->sq_obj.aso_wqes[sq->head & mask];
rte_prefetch0(&sq->sq_obj.aso_wqes[(sq->head + 1) & mask]);
/* Fill next WQE. */
rte_rwlock_read_lock(&mng->resize_rwl);
pool = mng->pools[sq->next];
rte_rwlock_read_unlock(&mng->resize_rwl);
sq->elts[sq->head & mask].pool = pool;
wqe->general_cseg.misc =
rte_cpu_to_be_32(((struct mlx5_devx_obj *)
(pool->flow_hit_aso_obj))->id);
wqe->general_cseg.flags = RTE_BE32(MLX5_COMP_ONLY_FIRST_ERR <<
MLX5_COMP_MODE_OFFSET);
wqe->general_cseg.opcode = rte_cpu_to_be_32
(MLX5_OPCODE_ACCESS_ASO |
(ASO_OPC_MOD_FLOW_HIT <<
WQE_CSEG_OPC_MOD_OFFSET) |
(sq->pi <<
WQE_CSEG_WQE_INDEX_OFFSET));
sq->pi += 2; /* Each WQE contains 2 WQEBBs. */
sq->head++;
sq->next++;
max--;
} while (max);
wqe->general_cseg.flags = RTE_BE32(MLX5_COMP_ALWAYS <<
MLX5_COMP_MODE_OFFSET);
mlx5_doorbell_ring(&sh->tx_uar.bf_db, *(volatile uint64_t *)wqe,
sq->pi, &sq->sq_obj.db_rec[MLX5_SND_DBR],
!sh->tx_uar.dbnc);
return sq->elts[start_head & mask].burst_size;
}
/**
* Debug utility function. Dump contents of error CQE and WQE.
*
* @param[in] cqe
* Error CQE to dump.
* @param[in] wqe
* Error WQE to dump.
*/
static void
mlx5_aso_dump_err_objs(volatile uint32_t *cqe, volatile uint32_t *wqe)
{
int i;
DRV_LOG(ERR, "Error cqe:");
for (i = 0; i < 16; i += 4)
DRV_LOG(ERR, "%08X %08X %08X %08X", cqe[i], cqe[i + 1],
cqe[i + 2], cqe[i + 3]);
DRV_LOG(ERR, "\nError wqe:");
for (i = 0; i < (int)sizeof(struct mlx5_aso_wqe) / 4; i += 4)
DRV_LOG(ERR, "%08X %08X %08X %08X", wqe[i], wqe[i + 1],
wqe[i + 2], wqe[i + 3]);
}
/**
* Handle case of error CQE.
*
* @param[in] sq
* ASO SQ to use.
*/
static void
mlx5_aso_cqe_err_handle(struct mlx5_aso_sq *sq)
{
struct mlx5_aso_cq *cq = &sq->cq;
uint32_t idx = cq->cq_ci & ((1 << cq->log_desc_n) - 1);
volatile struct mlx5_err_cqe *cqe =
(volatile struct mlx5_err_cqe *)&cq->cq_obj.cqes[idx];
cq->errors++;
idx = rte_be_to_cpu_16(cqe->wqe_counter) & (1u << sq->log_desc_n);
mlx5_aso_dump_err_objs((volatile uint32_t *)cqe,
(volatile uint32_t *)&sq->sq_obj.aso_wqes[idx]);
}
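/**
* Poll the CQ of an ASO SQ and translate completions into flow op results.
*
* @param[in] sq
*   ASO SQ to poll.
* @param[out] res
*   Array to hold the operation results.
* @param[in] n_res
*   Size of the result array.
*
* @return
*   Number of results filled in.
*/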
int
mlx5_aso_pull_completion(struct mlx5_aso_sq *sq,
struct rte_flow_op_result res[],
uint16_t n_res)
{
struct mlx5_aso_cq *cq = &sq->cq;
volatile struct mlx5_cqe *restrict cqe;
const uint32_t cq_size = 1 << cq->log_desc_n;
const uint32_t mask = cq_size - 1;
uint32_t idx;
uint32_t next_idx;
uint16_t max;
uint16_t n = 0;
int ret;
max = (uint16_t)(sq->head - sq->tail);
if (unlikely(!max || !n_res))
return 0;
next_idx = cq->cq_ci & mask;
do {
idx = next_idx;
next_idx = (cq->cq_ci + 1) & mask;
/* Need to confirm the position of the prefetch. */
rte_prefetch0(&cq->cq_obj.cqes[next_idx]);
cqe = &cq->cq_obj.cqes[idx];
ret = check_cqe(cqe, cq_size, cq->cq_ci);
/*
* Be sure owner read is done before any other cookie field or
* opaque field.
*/
rte_io_rmb();
if (ret == MLX5_CQE_STATUS_HW_OWN)
break;
res[n].user_data = sq->elts[(uint16_t)((sq->tail + n) & mask)].user_data;
if (unlikely(ret != MLX5_CQE_STATUS_SW_OWN)) {
mlx5_aso_cqe_err_handle(sq);
res[n].status = RTE_FLOW_OP_ERROR;
} else {
res[n].status = RTE_FLOW_OP_SUCCESS;
}
cq->cq_ci++;
if (++n == n_res)
break;
} while (1);
if (likely(n)) {
sq->tail += n;
rte_io_wmb();
cq->cq_obj.db_rec[0] = rte_cpu_to_be_32(cq->cq_ci);
}
return n;
}
void
mlx5_aso_push_wqe(struct mlx5_dev_ctx_shared *sh,
struct mlx5_aso_sq *sq)
{
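/* Skip the doorbell if nothing was posted since the last ring. */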
if (sq->db_pi == sq->pi)
return;
mlx5_doorbell_ring(&sh->tx_uar.bf_db, *(volatile uint64_t *)sq->db,
sq->pi, &sq->sq_obj.db_rec[MLX5_SND_DBR],
!sh->tx_uar.dbnc);
sq->db_pi = sq->pi;
}
/**
* Update ASO objects upon completion.
*
* @param[in] sh
* Shared device context.
* @param[in] n
* Number of completed ASO objects.
*/
static void
mlx5_aso_age_action_update(struct mlx5_dev_ctx_shared *sh, uint16_t n)
{
struct mlx5_aso_age_mng *mng = sh->aso_age_mng;
struct mlx5_aso_sq *sq = &mng->aso_sq;
struct mlx5_age_info *age_info;
const uint16_t size = 1 << sq->log_desc_n;
const uint16_t mask = size - 1;
const uint64_t curr = MLX5_CURR_TIME_SEC;
uint16_t expected = AGE_CANDIDATE;
uint16_t i;
for (i = 0; i < n; ++i) {
uint16_t idx = (sq->tail + i) & mask;
struct mlx5_aso_age_pool *pool = sq->elts[idx].pool;
uint64_t diff = curr - pool->time_of_last_age_check;
uint64_t *addr = sq->mr.addr;
int j;
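/* Locate the hit bitmap returned for this SQ slot. */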
addr += idx * MLX5_ASO_AGE_ACTIONS_PER_POOL / 64;
pool->time_of_last_age_check = curr;
for (j = 0; j < MLX5_ASO_AGE_ACTIONS_PER_POOL; j++) {
struct mlx5_aso_age_action *act = &pool->actions[j];
struct mlx5_age_param *ap = &act->age_params;
uint8_t byte;
uint8_t offset;
uint8_t *u8addr;
uint8_t hit;
if (__atomic_load_n(&ap->state, __ATOMIC_RELAXED) !=
AGE_CANDIDATE)
continue;
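/* Hit bit of action j is bit (j % 8) of byte (63 - j / 8). */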
byte = 63 - (j / 8);
offset = j % 8;
u8addr = (uint8_t *)addr;
hit = (u8addr[byte] >> offset) & 0x1;
if (hit) {
__atomic_store_n(&ap->sec_since_last_hit, 0,
__ATOMIC_RELAXED);
} else {
struct mlx5_priv *priv;
__atomic_fetch_add(&ap->sec_since_last_hit,
diff, __ATOMIC_RELAXED);
/* If timeout passed add to aged-out list. */
if (ap->sec_since_last_hit <= ap->timeout)
continue;
priv =
rte_eth_devices[ap->port_id].data->dev_private;
age_info = GET_PORT_AGE_INFO(priv);
rte_spinlock_lock(&age_info->aged_sl);
if (__atomic_compare_exchange_n(&ap->state,
&expected,
AGE_TMOUT,
false,
__ATOMIC_RELAXED,
__ATOMIC_RELAXED)) {
LIST_INSERT_HEAD(&age_info->aged_aso,
act, next);
MLX5_AGE_SET(age_info,
MLX5_AGE_EVENT_NEW);
}
rte_spinlock_unlock(&age_info->aged_sl);
}
}
}
mlx5_age_event_prepare(sh);
}
/**
* Handle completions from WQEs sent to ASO SQ.
*
* @param[in] sh
* Shared device context.
*
* @return
* Number of CQEs handled.
*/
static uint16_t
mlx5_aso_completion_handle(struct mlx5_dev_ctx_shared *sh)
{
struct mlx5_aso_age_mng *mng = sh->aso_age_mng;
struct mlx5_aso_sq *sq = &mng->aso_sq;
struct mlx5_aso_cq *cq = &sq->cq;
volatile struct mlx5_cqe *restrict cqe;
const unsigned int cq_size = 1 << cq->log_desc_n;
const unsigned int mask = cq_size - 1;
uint32_t idx;
uint32_t next_idx = cq->cq_ci & mask;
const uint16_t max = (uint16_t)(sq->head - sq->tail);
uint16_t i = 0;
int ret;
if (unlikely(!max))
return 0;
do {
idx = next_idx;
next_idx = (cq->cq_ci + 1) & mask;
rte_prefetch0(&cq->cq_obj.cqes[next_idx]);
cqe = &cq->cq_obj.cqes[idx];
ret = check_cqe(cqe, cq_size, cq->cq_ci);
/*
* Be sure owner read is done before any other cookie field or
* opaque field.
*/
rte_io_rmb();
if (unlikely(ret != MLX5_CQE_STATUS_SW_OWN)) {
if (likely(ret == MLX5_CQE_STATUS_HW_OWN))
break;
mlx5_aso_cqe_err_handle(sq);
} else {
i += sq->elts[(sq->tail + i) & mask].burst_size;
}
cq->cq_ci++;
} while (1);
if (likely(i)) {
mlx5_aso_age_action_update(sh, i);
sq->tail += i;
rte_io_wmb();
cq->cq_obj.db_rec[0] = rte_cpu_to_be_32(cq->cq_ci);
}
return i;
}
/**
* Periodically read CQEs and send WQEs to ASO SQ.
*
* @param[in] arg
* Shared device context containing the ASO SQ.
*/
static void
mlx5_flow_aso_alarm(void *arg)
{
struct mlx5_dev_ctx_shared *sh = arg;
struct mlx5_aso_sq *sq = &sh->aso_age_mng->aso_sq;
uint32_t us = 100u;
uint16_t n;
rte_rwlock_read_lock(&sh->aso_age_mng->resize_rwl);
n = sh->aso_age_mng->next;
rte_rwlock_read_unlock(&sh->aso_age_mng->resize_rwl);
mlx5_aso_completion_handle(sh);
if (sq->next == n) {
/* End of loop: wait 1 second. */
us = US_PER_S;
sq->next = 0;
}
mlx5_aso_sq_enqueue_burst(sh, n);
if (rte_eal_alarm_set(us, mlx5_flow_aso_alarm, sh))
DRV_LOG(ERR, "Cannot reinitialize aso alarm.");
}
/**
* API to start ASO access using ASO SQ.
*
* @param[in] sh
* Pointer to shared device context.
*
* @return
* 0 on success, a negative errno value otherwise and rte_errno is set.
*/
int
mlx5_aso_flow_hit_queue_poll_start(struct mlx5_dev_ctx_shared *sh)
{
if (rte_eal_alarm_set(US_PER_S, mlx5_flow_aso_alarm, sh)) {
DRV_LOG(ERR, "Cannot reinitialize ASO age alarm.");
return -rte_errno;
}
return 0;
}
/**
* API to stop ASO access using ASO SQ.
*
* @param[in] sh
* Pointer to shared device context.
*
* @return
* 0 on success, a negative errno value otherwise and rte_errno is set.
*/
int
mlx5_aso_flow_hit_queue_poll_stop(struct mlx5_dev_ctx_shared *sh)
{
int retries = 1024;
if (!sh->aso_age_mng->aso_sq.sq_obj.sq)
return -EINVAL;
rte_errno = 0;
while (--retries) {
rte_eal_alarm_cancel(mlx5_flow_aso_alarm, sh);
if (rte_errno != EINPROGRESS)
break;
rte_pause();
}
return -rte_errno;
}
static uint16_t
mlx5_aso_mtr_sq_enqueue_single(struct mlx5_dev_ctx_shared *sh,
struct mlx5_aso_sq *sq,
struct mlx5_aso_mtr *aso_mtr,
struct mlx5_mtr_bulk *bulk,
bool need_lock,
void *user_data,
bool push)
{
volatile struct mlx5_aso_wqe *wqe = NULL;
struct mlx5_flow_meter_info *fm = NULL;
struct mlx5_flow_meter_profile *fmp;
uint16_t size = 1 << sq->log_desc_n;
uint16_t mask = size - 1;
uint16_t res;
uint32_t dseg_idx = 0;
struct mlx5_aso_mtr_pool *pool = NULL;
uint32_t param_le;
int id;
if (need_lock)
rte_spinlock_lock(&sq->sqsl);
res = size - (uint16_t)(sq->head - sq->tail);
if (unlikely(!res)) {
DRV_LOG(ERR, "Fail: SQ is full and no free WQE to send");
if (need_lock)
rte_spinlock_unlock(&sq->sqsl);
return 0;
}
wqe = &sq->sq_obj.aso_wqes[sq->head & mask];
rte_prefetch0(&sq->sq_obj.aso_wqes[(sq->head + 1) & mask]);
/* Fill next WQE. */
fm = &aso_mtr->fm;
sq->elts[sq->head & mask].mtr = user_data ? user_data : aso_mtr;
if (aso_mtr->type == ASO_METER_INDIRECT) {
if (likely(sh->config.dv_flow_en == 2))
pool = aso_mtr->pool;
else
pool = container_of(aso_mtr, struct mlx5_aso_mtr_pool,
mtrs[aso_mtr->offset]);
id = pool->devx_obj->id;
} else {
id = bulk->devx_obj->id;
}
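/* Two meters share one ASO object; address it by base id + offset / 2. */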
wqe->general_cseg.misc = rte_cpu_to_be_32(id +
(aso_mtr->offset >> 1));
wqe->general_cseg.opcode =
rte_cpu_to_be_32(MLX5_OPCODE_ACCESS_ASO |
(ASO_OPC_MOD_POLICER << WQE_CSEG_OPC_MOD_OFFSET) |
sq->pi << WQE_CSEG_WQE_INDEX_OFFSET);
/* There are 2 meters in one ASO cache line. */
dseg_idx = aso_mtr->offset & 0x1;
wqe->aso_cseg.data_mask =
RTE_BE64(MLX5_IFC_FLOW_METER_PARAM_MASK << (32 * !dseg_idx));
if (fm->is_enable) {
wqe->aso_dseg.mtrs[dseg_idx].cbs_cir =
fm->profile->srtcm_prm.cbs_cir;
wqe->aso_dseg.mtrs[dseg_idx].ebs_eir =
fm->profile->srtcm_prm.ebs_eir;
} else {
wqe->aso_dseg.mtrs[dseg_idx].cbs_cir =
RTE_BE32(MLX5_IFC_FLOW_METER_DISABLE_CBS_CIR_VAL);
wqe->aso_dseg.mtrs[dseg_idx].ebs_eir = 0;
}
fmp = fm->profile;
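/*
* Build the parameter word: valid bit, start color (undefined for
* color-aware meters, green for color-blind) and packet mode flag.
*/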
param_le = (1 << ASO_DSEG_VALID_OFFSET);
if (fm->color_aware)
param_le |= (MLX5_FLOW_COLOR_UNDEFINED << ASO_DSEG_SC_OFFSET);
else
param_le |= (MLX5_FLOW_COLOR_GREEN << ASO_DSEG_SC_OFFSET);
if (fmp->profile.packet_mode)
param_le |= (MLX5_METER_MODE_PKT << ASO_DSEG_MTR_MODE);
wqe->aso_dseg.mtrs[dseg_idx].v_bo_sc_bbog_mm = RTE_BE32(param_le);
switch (fmp->profile.alg) {
case RTE_MTR_SRTCM_RFC2697:
/* Only needed for RFC2697. */
if (fm->profile->srtcm_prm.ebs_eir)
wqe->aso_dseg.mtrs[dseg_idx].v_bo_sc_bbog_mm |=
RTE_BE32(1 << ASO_DSEG_BO_OFFSET);
break;
case RTE_MTR_TRTCM_RFC2698:
wqe->aso_dseg.mtrs[dseg_idx].v_bo_sc_bbog_mm |=
RTE_BE32(1 << ASO_DSEG_BBOG_OFFSET);
break;
case RTE_MTR_TRTCM_RFC4115:
wqe->aso_dseg.mtrs[dseg_idx].v_bo_sc_bbog_mm |=
RTE_BE32(1 << ASO_DSEG_BO_OFFSET);
break;
default:
break;
}
/*
* Note:
* For performance reasons, the token fields are not set when
* posting the WQE to the ASO SQ; the HW fills them in
* automatically.
*/
sq->head++;
sq->pi += 2; /* Each WQE contains 2 WQEBBs. */
if (push) {
mlx5_doorbell_ring(&sh->tx_uar.bf_db, *(volatile uint64_t *)wqe,
sq->pi, &sq->sq_obj.db_rec[MLX5_SND_DBR],
!sh->tx_uar.dbnc);
sq->db_pi = sq->pi;
}
sq->db = wqe;
if (need_lock)
rte_spinlock_unlock(&sq->sqsl);
return 1;
}
static void
mlx5_aso_mtrs_status_update(struct mlx5_aso_sq *sq, uint16_t aso_mtrs_nums)
{
uint16_t size = 1 << sq->log_desc_n;
uint16_t mask = size - 1;
uint16_t i;
struct mlx5_aso_mtr *aso_mtr = NULL;
uint8_t exp_state = ASO_METER_WAIT;
for (i = 0; i < aso_mtrs_nums; ++i) {
aso_mtr = sq->elts[(sq->tail + i) & mask].mtr;
MLX5_ASSERT(aso_mtr);
(void)__atomic_compare_exchange_n(&aso_mtr->state,
&exp_state, ASO_METER_READY,
false, __ATOMIC_RELAXED, __ATOMIC_RELAXED);
}
}
static void
mlx5_aso_mtr_completion_handle(struct mlx5_aso_sq *sq, bool need_lock)
{
struct mlx5_aso_cq *cq = &sq->cq;
volatile struct mlx5_cqe *restrict cqe;
const unsigned int cq_size = 1 << cq->log_desc_n;
const unsigned int mask = cq_size - 1;
uint32_t idx;
uint32_t next_idx = cq->cq_ci & mask;
uint16_t max;
uint16_t n = 0;
int ret;
if (need_lock)
rte_spinlock_lock(&sq->sqsl);
max = (uint16_t)(sq->head - sq->tail);
if (unlikely(!max)) {
if (need_lock)
rte_spinlock_unlock(&sq->sqsl);
return;
}
do {
idx = next_idx;
next_idx = (cq->cq_ci + 1) & mask;
rte_prefetch0(&cq->cq_obj.cqes[next_idx]);
cqe = &cq->cq_obj.cqes[idx];
ret = check_cqe(cqe, cq_size, cq->cq_ci);
/*
* Be sure owner read is done before any other cookie field or
* opaque field.
*/
rte_io_rmb();
if (ret != MLX5_CQE_STATUS_SW_OWN) {
if (likely(ret == MLX5_CQE_STATUS_HW_OWN))
break;
mlx5_aso_cqe_err_handle(sq);
} else {
n++;
}
cq->cq_ci++;
} while (1);
if (likely(n)) {
mlx5_aso_mtrs_status_update(sq, n);
sq->tail += n;
rte_io_wmb();
cq->cq_obj.db_rec[0] = rte_cpu_to_be_32(cq->cq_ci);
}
if (need_lock)
rte_spinlock_unlock(&sq->sqsl);
}
/**
* Update meter parameters by sending an ASO WQE.
*
* @param[in] sh
*   Pointer to shared device context.
* @param[in] queue
*   HW steering queue index, or MLX5_HW_INV_QUEUE for synchronous operation.
* @param[in] mtr
*   Pointer to the ASO meter to be modified.
* @param[in] bulk
*   Pointer to the meter bulk allocation, used for non-indirect meters.
* @param[in] user_data
*   User data returned on completion, or NULL.
* @param[in] push
*   True to ring the doorbell immediately after posting the WQE.
*
* @return
*   0 on success, -1 otherwise.
*/
int
mlx5_aso_meter_update_by_wqe(struct mlx5_dev_ctx_shared *sh, uint32_t queue,
struct mlx5_aso_mtr *mtr,
struct mlx5_mtr_bulk *bulk,
void *user_data,
bool push)
{
struct mlx5_aso_sq *sq;
uint32_t poll_wqe_times = MLX5_MTR_POLL_WQE_CQE_TIMES;
bool need_lock;
int ret;
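/*
* HWS indirect meters use the lockless per-queue SQ. The synchronous
* path (MLX5_HW_INV_QUEUE) takes the last SQ with locking; all other
* cases use the shared pools_mng SQ under its spinlock.
*/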
if (likely(sh->config.dv_flow_en == 2) &&
mtr->type == ASO_METER_INDIRECT) {
if (queue == MLX5_HW_INV_QUEUE) {
sq = &mtr->pool->sq[mtr->pool->nb_sq - 1];
need_lock = true;
} else {
sq = &mtr->pool->sq[queue];
need_lock = false;
}
} else {
sq = &sh->mtrmng->pools_mng.sq;
need_lock = true;
}
if (queue != MLX5_HW_INV_QUEUE) {
ret = mlx5_aso_mtr_sq_enqueue_single(sh, sq, mtr, bulk,
need_lock, user_data, push);
return ret > 0 ? 0 : -1;
}
do {
mlx5_aso_mtr_completion_handle(sq, need_lock);
if (mlx5_aso_mtr_sq_enqueue_single(sh, sq, mtr, bulk,
need_lock, NULL, true))
return 0;
/* Waiting for wqe resource. */
rte_delay_us_sleep(MLX5_ASO_WQE_CQE_RESPONSE_DELAY);
} while (--poll_wqe_times);
DRV_LOG(ERR, "Fail to send WQE for ASO meter offset %d",
mtr->offset);
return -1;
}
/**
* Wait for an ASO meter to become ready.
*
* @param[in] sh
*   Pointer to shared device context.
* @param[in] queue
*   HW steering queue index, or MLX5_HW_INV_QUEUE for synchronous operation.
* @param[in] mtr
*   Pointer to the ASO meter to wait on.
*
* @return
*   0 on success, -1 otherwise.
*/
int
mlx5_aso_mtr_wait(struct mlx5_dev_ctx_shared *sh, uint32_t queue,
struct mlx5_aso_mtr *mtr)
{
struct mlx5_aso_sq *sq;
uint32_t poll_cqe_times = MLX5_MTR_POLL_WQE_CQE_TIMES;
uint8_t state;
bool need_lock;
if (likely(sh->config.dv_flow_en == 2) &&
mtr->type == ASO_METER_INDIRECT) {
if (queue == MLX5_HW_INV_QUEUE) {
sq = &mtr->pool->sq[mtr->pool->nb_sq - 1];
need_lock = true;
} else {
sq = &mtr->pool->sq[queue];
need_lock = false;
}
} else {
sq = &sh->mtrmng->pools_mng.sq;
need_lock = true;
}
state = __atomic_load_n(&mtr->state, __ATOMIC_RELAXED);
if (state == ASO_METER_READY || state == ASO_METER_WAIT_ASYNC)
return 0;
do {
mlx5_aso_mtr_completion_handle(sq, need_lock);
if (__atomic_load_n(&mtr->state, __ATOMIC_RELAXED) ==
ASO_METER_READY)
return 0;
/* Waiting for CQE ready. */
rte_delay_us_sleep(MLX5_ASO_WQE_CQE_RESPONSE_DELAY);
} while (--poll_cqe_times);
DRV_LOG(ERR, "Fail to poll CQE ready for ASO meter offset %d",
mtr->offset);
return -1;
}
static inline struct mlx5_aso_sq*
__mlx5_aso_ct_get_sq_in_hws(uint32_t queue,
struct mlx5_aso_ct_pool *pool)
{
return (queue == MLX5_HW_INV_QUEUE) ?
pool->shared_sq : &pool->sq[queue];
}
static inline struct mlx5_aso_sq*
__mlx5_aso_ct_get_sq_in_sws(struct mlx5_dev_ctx_shared *sh,
struct mlx5_aso_ct_action *ct)
{
return &sh->ct_mng->aso_sqs[ct->offset & (MLX5_ASO_CT_SQ_NUM - 1)];
}
static inline struct mlx5_aso_ct_pool*
__mlx5_aso_ct_get_pool(struct mlx5_dev_ctx_shared *sh,
struct mlx5_aso_ct_action *ct)
{
if (likely(sh->config.dv_flow_en == 2))
return ct->pool;
return container_of(ct, struct mlx5_aso_ct_pool, actions[ct->offset]);
}
int
mlx5_aso_ct_queue_uninit(struct mlx5_dev_ctx_shared *sh,
struct mlx5_aso_ct_pools_mng *ct_mng)
{
uint32_t i;
/* 64B per object for query. */
for (i = 0; i < ct_mng->nb_sq; i++) {
if (ct_mng->aso_sqs[i].mr.addr)
mlx5_aso_dereg_mr(sh->cdev, &ct_mng->aso_sqs[i].mr);
mlx5_aso_destroy_sq(&ct_mng->aso_sqs[i]);
}
return 0;
}
/**
* API to create and initialize CT Send Queue used for ASO access.
*
* @param[in] sh
* Pointer to shared device context.
* @param[in] ct_mng
* Pointer to the CT management struct.
* @param[in] nb_queues
* Number of queues to be allocated.
*
* @return
* 0 on success, a negative errno value otherwise and rte_errno is set.
*/
int
mlx5_aso_ct_queue_init(struct mlx5_dev_ctx_shared *sh,
struct mlx5_aso_ct_pools_mng *ct_mng,
uint32_t nb_queues)
{
uint32_t i;
/* 64B per object for query. */
for (i = 0; i < nb_queues; i++) {
if (mlx5_aso_reg_mr(sh->cdev, 64 * (1 << MLX5_ASO_QUEUE_LOG_DESC),
&ct_mng->aso_sqs[i].mr))
goto error;
if (mlx5_aso_sq_create(sh->cdev, &ct_mng->aso_sqs[i],
sh->tx_uar.obj,
MLX5_ASO_QUEUE_LOG_DESC))
goto error;
mlx5_aso_ct_init_sq(&ct_mng->aso_sqs[i]);
}
ct_mng->nb_sq = nb_queues;
return 0;
error:
do {
if (ct_mng->aso_sqs[i].mr.addr)
mlx5_aso_dereg_mr(sh->cdev, &ct_mng->aso_sqs[i].mr);
mlx5_aso_destroy_sq(&ct_mng->aso_sqs[i]);
} while (i--);
ct_mng->nb_sq = 0;
return -1;
}
/*
* Post a WQE to the ASO CT SQ to modify the context.
*
* @param[in] sh
* Pointer to shared device context.
* @param[in] sq
* Pointer to the ASO CT SQ to post the WQE to.
* @param[in] ct
* Pointer to the generic CT structure related to the context.
* @param[in] profile
* Pointer to configuration profile.
* @param[in] need_lock
* Whether the SQ access requires locking.
* @param[in] user_data
* Async job handle in asynchronous mode, NULL in synchronous mode.
* @param[in] push
* Whether to ring the doorbell after posting the WQE.
*
* @return
* 1 on success (WQE number), 0 on failure.
*/
static uint16_t
mlx5_aso_ct_sq_enqueue_single(struct mlx5_dev_ctx_shared *sh,
struct mlx5_aso_sq *sq,
struct mlx5_aso_ct_action *ct,
const struct rte_flow_action_conntrack *profile,
bool need_lock,
void *user_data,
bool push)
{
volatile struct mlx5_aso_wqe *wqe = NULL;
uint16_t size = 1 << sq->log_desc_n;
uint16_t mask = size - 1;
uint16_t res;
struct mlx5_aso_ct_pool *pool;
void *desg;
void *orig_dir;
void *reply_dir;
if (need_lock)
rte_spinlock_lock(&sq->sqsl);
/* Prevent other threads from updating the index. */
res = size - (uint16_t)(sq->head - sq->tail);
if (unlikely(!res)) {
if (need_lock)
rte_spinlock_unlock(&sq->sqsl);
DRV_LOG(ERR, "Fail: SQ is full and no free WQE to send");
return 0;
}
wqe = &sq->sq_obj.aso_wqes[sq->head & mask];
rte_prefetch0(&sq->sq_obj.aso_wqes[(sq->head + 1) & mask]);
/* Fill next WQE. */
MLX5_ASO_CT_UPDATE_STATE(ct,
user_data ? ASO_CONNTRACK_WAIT_ASYNC : ASO_CONNTRACK_WAIT);
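/*
* Keep the async job handle for the completion handler, or the CT object
* itself for the synchronous polling path.
*/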
if (user_data) {
sq->elts[sq->head & mask].user_data = user_data;
} else {
sq->elts[sq->head & mask].ct = ct;
sq->elts[sq->head & mask].query_data = NULL;
}
pool = __mlx5_aso_ct_get_pool(sh, ct);
/* Each WQE will have a single CT object. */
wqe->general_cseg.misc = rte_cpu_to_be_32(pool->devx_obj->id +
ct->offset);
wqe->general_cseg.opcode = rte_cpu_to_be_32(MLX5_OPCODE_ACCESS_ASO |
(ASO_OPC_MOD_CONNECTION_TRACKING <<
WQE_CSEG_OPC_MOD_OFFSET) |
sq->pi << WQE_CSEG_WQE_INDEX_OFFSET);
wqe->aso_cseg.operand_masks = rte_cpu_to_be_32
(0u |
(ASO_OPER_LOGICAL_OR << ASO_CSEG_COND_OPER_OFFSET) |
(ASO_OP_ALWAYS_TRUE << ASO_CSEG_COND_1_OPER_OFFSET) |
(ASO_OP_ALWAYS_TRUE << ASO_CSEG_COND_0_OPER_OFFSET) |
(BYTEWISE_64BYTE << ASO_CSEG_DATA_MASK_MODE_OFFSET));
wqe->aso_cseg.data_mask = UINT64_MAX;
/* Cast via uintptr_t to drop the volatile qualifier without a warning. */
desg = (void *)(uintptr_t)wqe->aso_dseg.data;
MLX5_SET(conn_track_aso, desg, valid, 1);
MLX5_SET(conn_track_aso, desg, state, profile->state);
MLX5_SET(conn_track_aso, desg, freeze_track, !profile->enable);
MLX5_SET(conn_track_aso, desg, connection_assured,
profile->live_connection);
MLX5_SET(conn_track_aso, desg, sack_permitted, profile->selective_ack);
MLX5_SET(conn_track_aso, desg, challenged_acked,
profile->challenge_ack_passed);
/* Heartbeat, retransmission_counter, retranmission_limit_exceeded: 0 */
MLX5_SET(conn_track_aso, desg, heartbeat, 0);
MLX5_SET(conn_track_aso, desg, max_ack_window,
profile->max_ack_window);
MLX5_SET(conn_track_aso, desg, retransmission_counter, 0);
MLX5_SET(conn_track_aso, desg, retranmission_limit_exceeded, 0);
MLX5_SET(conn_track_aso, desg, retranmission_limit,
profile->retransmission_limit);
MLX5_SET(conn_track_aso, desg, reply_direction_tcp_scale,
profile->reply_dir.scale);
MLX5_SET(conn_track_aso, desg, reply_direction_tcp_close_initiated,
profile->reply_dir.close_initiated);
/* Both directions will use the same liberal mode. */
MLX5_SET(conn_track_aso, desg, reply_direction_tcp_liberal_enabled,
profile->liberal_mode);
MLX5_SET(conn_track_aso, desg, reply_direction_tcp_data_unacked,
profile->reply_dir.data_unacked);
MLX5_SET(conn_track_aso, desg, reply_direction_tcp_max_ack,
profile->reply_dir.last_ack_seen);
MLX5_SET(conn_track_aso, desg, original_direction_tcp_scale,
profile->original_dir.scale);
MLX5_SET(conn_track_aso, desg, original_direction_tcp_close_initiated,
profile->original_dir.close_initiated);
MLX5_SET(conn_track_aso, desg, original_direction_tcp_liberal_enabled,
profile->liberal_mode);
MLX5_SET(conn_track_aso, desg, original_direction_tcp_data_unacked,
profile->original_dir.data_unacked);
MLX5_SET(conn_track_aso, desg, original_direction_tcp_max_ack,
profile->original_dir.last_ack_seen);
MLX5_SET(conn_track_aso, desg, last_win, profile->last_window);
MLX5_SET(conn_track_aso, desg, last_dir, profile->last_direction);
MLX5_SET(conn_track_aso, desg, last_index, profile->last_index);
MLX5_SET(conn_track_aso, desg, last_seq, profile->last_seq);
MLX5_SET(conn_track_aso, desg, last_ack, profile->last_ack);
MLX5_SET(conn_track_aso, desg, last_end, profile->last_end);
orig_dir = MLX5_ADDR_OF(conn_track_aso, desg, original_dir);
MLX5_SET(tcp_window_params, orig_dir, sent_end,
profile->original_dir.sent_end);
MLX5_SET(tcp_window_params, orig_dir, reply_end,
profile->original_dir.reply_end);
MLX5_SET(tcp_window_params, orig_dir, max_win,
profile->original_dir.max_win);
MLX5_SET(tcp_window_params, orig_dir, max_ack,
profile->original_dir.max_ack);
reply_dir = MLX5_ADDR_OF(conn_track_aso, desg, reply_dir);
MLX5_SET(tcp_window_params, reply_dir, sent_end,
profile->reply_dir.sent_end);
MLX5_SET(tcp_window_params, reply_dir, reply_end,
profile->reply_dir.reply_end);
MLX5_SET(tcp_window_params, reply_dir, max_win,
profile->reply_dir.max_win);
MLX5_SET(tcp_window_params, reply_dir, max_ack,
profile->reply_dir.max_ack);
sq->head++;
sq->pi += 2; /* Each WQE contains 2 WQEBB's. */
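/*
* Ring the doorbell only when requested; otherwise the WQE is left for a
* later doorbell push of the queue.
*/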
if (push) {
mlx5_doorbell_ring(&sh->tx_uar.bf_db, *(volatile uint64_t *)wqe,
sq->pi, &sq->sq_obj.db_rec[MLX5_SND_DBR],
!sh->tx_uar.dbnc);
sq->db_pi = sq->pi;
}
sq->db = wqe;
if (need_lock)
rte_spinlock_unlock(&sq->sqsl);
return 1;
}
/*
* Update the status field of CTs to indicate they are ready to be used by
* flows. A contiguous range of CTs, starting from the SQ tail, is updated.
*
* @param[in] sq
* Pointer to ASO CT SQ.
* @param[in] num
* Number of CT structures to be updated.
*/
static void
mlx5_aso_ct_status_update(struct mlx5_aso_sq *sq, uint16_t num)
{
uint16_t size = 1 << sq->log_desc_n;
uint16_t mask = size - 1;
uint16_t i;
struct mlx5_aso_ct_action *ct = NULL;
uint16_t idx;
for (i = 0; i < num; i++) {
idx = (uint16_t)((sq->tail + i) & mask);
ct = sq->elts[idx].ct;
MLX5_ASSERT(ct);
MLX5_ASO_CT_UPDATE_STATE(ct, ASO_CONNTRACK_READY);
if (sq->elts[idx].query_data)
rte_memcpy(sq->elts[idx].query_data,
(char *)((uintptr_t)sq->mr.addr + idx * 64),
64);
}
}
/*
* Post a WQE to the ASO CT SQ to query the current context.
*
* @param[in] sh
* Pointer to shared device context.
* @param[in] sq
* Pointer to the ASO CT SQ to post the WQE to.
* @param[in] ct
* Pointer to the generic CT structure related to the context.
* @param[in] data
* Pointer to data area to be filled.
* @param[in] need_lock
* Whether the SQ access requires locking.
* @param[in] user_data
* Async job handle in asynchronous mode, NULL in synchronous mode.
* @param[in] push
* Whether to ring the doorbell after posting the WQE.
*
* @return
* 1 on success (one WQE posted), 0 if the SQ is full or the context is not
* ready yet, -1 on failure.
*/
static int
mlx5_aso_ct_sq_query_single(struct mlx5_dev_ctx_shared *sh,
struct mlx5_aso_sq *sq,
struct mlx5_aso_ct_action *ct, char *data,
bool need_lock,
void *user_data,
bool push)
{
volatile struct mlx5_aso_wqe *wqe = NULL;
uint16_t size = 1 << sq->log_desc_n;
uint16_t mask = size - 1;
uint16_t res;
uint16_t wqe_idx;
struct mlx5_aso_ct_pool *pool;
enum mlx5_aso_ct_state state =
__atomic_load_n(&ct->state, __ATOMIC_RELAXED);
if (state == ASO_CONNTRACK_FREE) {
DRV_LOG(ERR, "Fail: No context to query");
return -1;
} else if (state == ASO_CONNTRACK_WAIT) {
return 0;
}
if (need_lock)
rte_spinlock_lock(&sq->sqsl);
res = size - (uint16_t)(sq->head - sq->tail);
if (unlikely(!res)) {
if (need_lock)
rte_spinlock_unlock(&sq->sqsl);
DRV_LOG(ERR, "Fail: SQ is full and no free WQE to send");
return 0;
}
MLX5_ASO_CT_UPDATE_STATE(ct,
user_data ? ASO_CONNTRACK_WAIT_ASYNC : ASO_CONNTRACK_QUERY);
wqe = &sq->sq_obj.aso_wqes[sq->head & mask];
/* Confirm the location and address of the prefetch instruction. */
rte_prefetch0(&sq->sq_obj.aso_wqes[(sq->head + 1) & mask]);
/* Fill next WQE. */
wqe_idx = sq->head & mask;
/* Check if this is async mode. */
if (user_data) {
struct mlx5_hw_q_job *job = (struct mlx5_hw_q_job *)user_data;
sq->elts[wqe_idx].ct = user_data;
job->out_data = (char *)((uintptr_t)sq->mr.addr + wqe_idx * 64);
} else {
sq->elts[wqe_idx].query_data = data;
sq->elts[wqe_idx].ct = ct;
}
pool = __mlx5_aso_ct_get_pool(sh, ct);
/* Each WQE will have a single CT object. */
wqe->general_cseg.misc = rte_cpu_to_be_32(pool->devx_obj->id +
ct->offset);
wqe->general_cseg.opcode = rte_cpu_to_be_32(MLX5_OPCODE_ACCESS_ASO |
(ASO_OPC_MOD_CONNECTION_TRACKING <<
WQE_CSEG_OPC_MOD_OFFSET) |
sq->pi << WQE_CSEG_WQE_INDEX_OFFSET);
/*
* No write request is required.
* ASO_OPER_LOGICAL_AND and ASO_OP_ALWAYS_FALSE are both 0.
* "BYTEWISE_64BYTE" is needed for a whole context.
* Set to 0 directly to reduce an endian swap. (Modify should rewrite.)
* "data_mask" is ignored.
* Buffer address was already filled during initialization.
*/
wqe->aso_cseg.operand_masks = rte_cpu_to_be_32(BYTEWISE_64BYTE <<
ASO_CSEG_DATA_MASK_MODE_OFFSET);
wqe->aso_cseg.data_mask = 0;
sq->head++;
/*
* Each WQE contains 2 WQEBB's, even though
* data segment is not used in this case.
*/
sq->pi += 2;
if (push) {
mlx5_doorbell_ring(&sh->tx_uar.bf_db, *(volatile uint64_t *)wqe,
sq->pi, &sq->sq_obj.db_rec[MLX5_SND_DBR],
!sh->tx_uar.dbnc);
sq->db_pi = sq->pi;
}
sq->db = wqe;
if (need_lock)
rte_spinlock_unlock(&sq->sqsl);
return 1;
}
/*
* Handle completions from WQEs sent to ASO CT.
*
* @param[in] sh
* Pointer to shared device context.
* @param[in] sq
* Pointer to the ASO CT SQ to poll.
* @param[in] need_lock
* Whether the SQ access requires locking.
*/
static void
mlx5_aso_ct_completion_handle(struct mlx5_dev_ctx_shared *sh __rte_unused,
struct mlx5_aso_sq *sq,
bool need_lock)
{
struct mlx5_aso_cq *cq = &sq->cq;
volatile struct mlx5_cqe *restrict cqe;
const uint32_t cq_size = 1 << cq->log_desc_n;
const uint32_t mask = cq_size - 1;
uint32_t idx;
uint32_t next_idx;
uint16_t max;
uint16_t n = 0;
int ret;
if (need_lock)
rte_spinlock_lock(&sq->sqsl);
max = (uint16_t)(sq->head - sq->tail);
if (unlikely(!max)) {
if (need_lock)
rte_spinlock_unlock(&sq->sqsl);
return;
}
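/*
* Scan CQEs until hardware ownership is reached; each software-owned CQE
* completes one pending WQE.
*/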
next_idx = cq->cq_ci & mask;
do {
idx = next_idx;
next_idx = (cq->cq_ci + 1) & mask;
/* Need to confirm the position of the prefetch. */
rte_prefetch0(&cq->cq_obj.cqes[next_idx]);
cqe = &cq->cq_obj.cqes[idx];
ret = check_cqe(cqe, cq_size, cq->cq_ci);
/*
* Be sure owner read is done before any other cookie field or
* opaque field.
*/
rte_io_rmb();
if (unlikely(ret != MLX5_CQE_STATUS_SW_OWN)) {
if (likely(ret == MLX5_CQE_STATUS_HW_OWN))
break;
mlx5_aso_cqe_err_handle(sq);
} else {
n++;
}
cq->cq_ci++;
} while (1);
if (likely(n)) {
mlx5_aso_ct_status_update(sq, n);
sq->tail += n;
rte_io_wmb();
cq->cq_obj.db_rec[0] = rte_cpu_to_be_32(cq->cq_ci);
}
if (need_lock)
rte_spinlock_unlock(&sq->sqsl);
}
/*
* Update connection tracking ASO context by sending WQE.
*
* @param[in] sh
* Pointer to mlx5_dev_ctx_shared object.
* @param[in] queue
* The queue index.
* @param[in] ct
* Pointer to connection tracking offload object.
* @param[in] profile
* Pointer to connection tracking TCP parameter.
* @param[in] user_data
* Async job handle in asynchronous mode, NULL in synchronous mode.
* @param[in] push
* Whether to ring the doorbell immediately.
*
* @return
* 0 on success, -1 on failure.
*/
int
mlx5_aso_ct_update_by_wqe(struct mlx5_dev_ctx_shared *sh,
uint32_t queue,
struct mlx5_aso_ct_action *ct,
const struct rte_flow_action_conntrack *profile,
void *user_data,
bool push)
{
uint32_t poll_wqe_times = MLX5_CT_POLL_WQE_CQE_TIMES;
struct mlx5_aso_ct_pool *pool = __mlx5_aso_ct_get_pool(sh, ct);
struct mlx5_aso_sq *sq;
bool need_lock = !!(queue == MLX5_HW_INV_QUEUE);
int ret;
if (sh->config.dv_flow_en == 2)
sq = __mlx5_aso_ct_get_sq_in_hws(queue, pool);
else
sq = __mlx5_aso_ct_get_sq_in_sws(sh, ct);
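/*
* For an explicit HWS queue the WQE is enqueued asynchronously and
* completed later through the flow queue; otherwise fall back to the
* synchronous busy-wait loop below.
*/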
if (queue != MLX5_HW_INV_QUEUE) {
ret = mlx5_aso_ct_sq_enqueue_single(sh, sq, ct, profile,
need_lock, user_data, push);
return ret > 0 ? 0 : -1;
}
do {
mlx5_aso_ct_completion_handle(sh, sq, need_lock);
if (mlx5_aso_ct_sq_enqueue_single(sh, sq, ct, profile,
need_lock, NULL, true))
return 0;
/* Waiting for wqe resource. */
rte_delay_us_sleep(10u);
} while (--poll_wqe_times);
DRV_LOG(ERR, "Fail to send WQE for ASO CT %d in pool %d",
ct->offset, pool->index);
return -1;
}
/*
* The routine is used to wait for WQE completion to continue with queried data.
*
* @param[in] sh
* Pointer to mlx5_dev_ctx_shared object.
* @param[in] queue
* The queue which the CT works on.
* @param[in] ct
* Pointer to connection tracking offload object.
*
* @return
* 0 on success, -1 on failure.
*/
int
mlx5_aso_ct_wait_ready(struct mlx5_dev_ctx_shared *sh, uint32_t queue,
struct mlx5_aso_ct_action *ct)
{
uint32_t poll_cqe_times = MLX5_CT_POLL_WQE_CQE_TIMES;
struct mlx5_aso_ct_pool *pool = __mlx5_aso_ct_get_pool(sh, ct);
struct mlx5_aso_sq *sq;
bool need_lock = !!(queue == MLX5_HW_INV_QUEUE);
if (sh->config.dv_flow_en == 2)
sq = __mlx5_aso_ct_get_sq_in_hws(queue, pool);
else
sq = __mlx5_aso_ct_get_sq_in_sws(sh, ct);
if (__atomic_load_n(&ct->state, __ATOMIC_RELAXED) ==
ASO_CONNTRACK_READY)
return 0;
do {
mlx5_aso_ct_completion_handle(sh, sq, need_lock);
if (__atomic_load_n(&ct->state, __ATOMIC_RELAXED) ==
ASO_CONNTRACK_READY)
return 0;
/* Wait for the CQE; whether to block or sleep here is a trade-off. */
rte_delay_us_sleep(MLX5_ASO_WQE_CQE_RESPONSE_DELAY);
} while (--poll_cqe_times);
DRV_LOG(ERR, "Fail to poll CQE for ASO CT %d in pool %d",
ct->offset, pool->index);
return -1;
}
/*
* Convert the hardware conntrack data format into the profile.
*
* @param[in] profile
* Pointer to conntrack profile to be filled after query.
* @param[in] wdata
* Pointer to data fetched from hardware.
*/
void
mlx5_aso_ct_obj_analyze(struct rte_flow_action_conntrack *profile,
char *wdata)
{
void *o_dir = MLX5_ADDR_OF(conn_track_aso, wdata, original_dir);
void *r_dir = MLX5_ADDR_OF(conn_track_aso, wdata, reply_dir);
/* Using MLX5_GET16 for the 16-bit fields could be considered here. */
profile->state = (enum rte_flow_conntrack_state)
MLX5_GET(conn_track_aso, wdata, state);
profile->enable = !MLX5_GET(conn_track_aso, wdata, freeze_track);
profile->selective_ack = MLX5_GET(conn_track_aso, wdata,
sack_permitted);
profile->live_connection = MLX5_GET(conn_track_aso, wdata,
connection_assured);
profile->challenge_ack_passed = MLX5_GET(conn_track_aso, wdata,
challenged_acked);
profile->max_ack_window = MLX5_GET(conn_track_aso, wdata,
max_ack_window);
profile->retransmission_limit = MLX5_GET(conn_track_aso, wdata,
retranmission_limit);
profile->last_window = MLX5_GET(conn_track_aso, wdata, last_win);
profile->last_direction = MLX5_GET(conn_track_aso, wdata, last_dir);
profile->last_index = (enum rte_flow_conntrack_tcp_last_index)
MLX5_GET(conn_track_aso, wdata, last_index);
profile->last_seq = MLX5_GET(conn_track_aso, wdata, last_seq);
profile->last_ack = MLX5_GET(conn_track_aso, wdata, last_ack);
profile->last_end = MLX5_GET(conn_track_aso, wdata, last_end);
profile->liberal_mode = MLX5_GET(conn_track_aso, wdata,
reply_direction_tcp_liberal_enabled) |
MLX5_GET(conn_track_aso, wdata,
original_direction_tcp_liberal_enabled);
/* The RTE profile has a single liberal flag shared by both directions. */
profile->reply_dir.scale = MLX5_GET(conn_track_aso, wdata,
reply_direction_tcp_scale);
profile->reply_dir.close_initiated = MLX5_GET(conn_track_aso, wdata,
reply_direction_tcp_close_initiated);
profile->reply_dir.data_unacked = MLX5_GET(conn_track_aso, wdata,
reply_direction_tcp_data_unacked);
profile->reply_dir.last_ack_seen = MLX5_GET(conn_track_aso, wdata,
reply_direction_tcp_max_ack);
profile->reply_dir.sent_end = MLX5_GET(tcp_window_params,
r_dir, sent_end);
profile->reply_dir.reply_end = MLX5_GET(tcp_window_params,
r_dir, reply_end);
profile->reply_dir.max_win = MLX5_GET(tcp_window_params,
r_dir, max_win);
profile->reply_dir.max_ack = MLX5_GET(tcp_window_params,
r_dir, max_ack);
profile->original_dir.scale = MLX5_GET(conn_track_aso, wdata,
original_direction_tcp_scale);
profile->original_dir.close_initiated = MLX5_GET(conn_track_aso, wdata,
original_direction_tcp_close_initiated);
profile->original_dir.data_unacked = MLX5_GET(conn_track_aso, wdata,
original_direction_tcp_data_unacked);
profile->original_dir.last_ack_seen = MLX5_GET(conn_track_aso, wdata,
original_direction_tcp_max_ack);
profile->original_dir.sent_end = MLX5_GET(tcp_window_params,
o_dir, sent_end);
profile->original_dir.reply_end = MLX5_GET(tcp_window_params,
o_dir, reply_end);
profile->original_dir.max_win = MLX5_GET(tcp_window_params,
o_dir, max_win);
profile->original_dir.max_ack = MLX5_GET(tcp_window_params,
o_dir, max_ack);
}
/*
* Query connection tracking information by sending a WQE.
*
* @param[in] sh
* Pointer to mlx5_dev_ctx_shared object.
* @param[in] queue
* The queue index.
* @param[in] ct
* Pointer to connection tracking offload object.
* @param[out] profile
* Pointer to connection tracking TCP information.
* @param[in] user_data
* Async job handle in asynchronous mode, NULL in synchronous mode.
* @param[in] push
* Whether to ring the doorbell immediately.
*
* @return
* 0 on success, -1 on failure.
*/
int
mlx5_aso_ct_query_by_wqe(struct mlx5_dev_ctx_shared *sh,
uint32_t queue,
struct mlx5_aso_ct_action *ct,
struct rte_flow_action_conntrack *profile,
void *user_data, bool push)
{
uint32_t poll_wqe_times = MLX5_CT_POLL_WQE_CQE_TIMES;
struct mlx5_aso_ct_pool *pool = __mlx5_aso_ct_get_pool(sh, ct);
struct mlx5_aso_sq *sq;
bool need_lock = !!(queue == MLX5_HW_INV_QUEUE);
char out_data[64 * 2];
int ret;
if (sh->config.dv_flow_en == 2)
sq = __mlx5_aso_ct_get_sq_in_hws(queue, pool);
else
sq = __mlx5_aso_ct_get_sq_in_sws(sh, ct);
if (queue != MLX5_HW_INV_QUEUE) {
ret = mlx5_aso_ct_sq_query_single(sh, sq, ct, out_data,
need_lock, user_data, push);
return ret > 0 ? 0 : -1;
}
do {
mlx5_aso_ct_completion_handle(sh, sq, need_lock);
ret = mlx5_aso_ct_sq_query_single(sh, sq, ct, out_data,
need_lock, NULL, true);
if (ret < 0)
return ret;
else if (ret > 0)
goto data_handle;
/* Waiting for wqe resource or state. */
else
rte_delay_us_sleep(10u);
} while (--poll_wqe_times);
DRV_LOG(ERR, "Fail to send WQE for ASO CT %d in pool %d",
ct->offset, pool->index);
return -1;
data_handle:
ret = mlx5_aso_ct_wait_ready(sh, queue, ct);
if (!ret)
mlx5_aso_ct_obj_analyze(profile, out_data);
return ret;
}
/*
* Make sure the conntrack context is synchronized with hardware before
* creating a flow rule that uses it.
*
* @param[in] sh
* Pointer to shared device context.
* @param[in] queue
* The queue index.
* @param[in] ct
* Pointer to connection tracking offload object.
*
* @return
* 0 on success, a negative errno value otherwise and rte_errno is set.
*/
int
mlx5_aso_ct_available(struct mlx5_dev_ctx_shared *sh,
uint32_t queue,
struct mlx5_aso_ct_action *ct)
{
struct mlx5_aso_ct_pool *pool = __mlx5_aso_ct_get_pool(sh, ct);
struct mlx5_aso_sq *sq;
bool need_lock = !!(queue == MLX5_HW_INV_QUEUE);
uint32_t poll_cqe_times = MLX5_CT_POLL_WQE_CQE_TIMES;
enum mlx5_aso_ct_state state =
__atomic_load_n(&ct->state, __ATOMIC_RELAXED);
if (sh->config.dv_flow_en == 2)
sq = __mlx5_aso_ct_get_sq_in_hws(queue, pool);
else
sq = __mlx5_aso_ct_get_sq_in_sws(sh, ct);
if (state == ASO_CONNTRACK_FREE) {
rte_errno = ENXIO;
return -rte_errno;
} else if (state == ASO_CONNTRACK_READY ||
state == ASO_CONNTRACK_QUERY ||
state == ASO_CONNTRACK_WAIT_ASYNC) {
return 0;
}
do {
mlx5_aso_ct_completion_handle(sh, sq, need_lock);
state = __atomic_load_n(&ct->state, __ATOMIC_RELAXED);
if (state == ASO_CONNTRACK_READY ||
state == ASO_CONNTRACK_QUERY)
return 0;
/* Wait for the CQE; whether to block or sleep here is a trade-off. */
rte_delay_us_block(MLX5_ASO_WQE_CQE_RESPONSE_DELAY);
} while (--poll_cqe_times);
rte_errno = EBUSY;
return -rte_errno;
}
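/**
* Create and initialize the ASO SQs used for HWS counter queries.
*
* @param[in] sh
* Pointer to shared device context.
*
* @return
* 0 on success, -1 otherwise.
*/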
int
mlx5_aso_cnt_queue_init(struct mlx5_dev_ctx_shared *sh)
{
struct mlx5_hws_aso_mng *aso_mng = NULL;
uint8_t idx;
struct mlx5_aso_sq *sq;
MLX5_ASSERT(sh);
MLX5_ASSERT(sh->cnt_svc);
aso_mng = &sh->cnt_svc->aso_mng;
aso_mng->sq_num = HWS_CNT_ASO_SQ_NUM;
for (idx = 0; idx < HWS_CNT_ASO_SQ_NUM; idx++) {
sq = &aso_mng->sqs[idx];
if (mlx5_aso_sq_create(sh->cdev, sq, sh->tx_uar.obj,
MLX5_ASO_CNT_QUEUE_LOG_DESC))
goto error;
mlx5_aso_cnt_init_sq(sq);
}
return 0;
error:
mlx5_aso_cnt_queue_uninit(sh);
return -1;
}
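/**
* Destroy the ASO SQs used for HWS counter queries.
*
* @param[in] sh
* Pointer to shared device context.
*/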
void
mlx5_aso_cnt_queue_uninit(struct mlx5_dev_ctx_shared *sh)
{
uint16_t idx;
for (idx = 0; idx < sh->cnt_svc->aso_mng.sq_num; idx++)
mlx5_aso_destroy_sq(&sh->cnt_svc->aso_mng.sqs[idx]);
sh->cnt_svc->aso_mng.sq_num = 0;
}
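/*
* Post a burst of ASO read WQEs to one SQ to fetch counter statistics into
* the pool raw memory; only the last WQE of the burst always requests a
* CQE, the others only on error.
*/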
static uint16_t
mlx5_aso_cnt_sq_enqueue_burst(struct mlx5_hws_cnt_pool *cpool,
struct mlx5_dev_ctx_shared *sh,
struct mlx5_aso_sq *sq, uint32_t n,
uint32_t offset, uint32_t dcs_id_base)
{
volatile struct mlx5_aso_wqe *wqe;
uint16_t size = 1 << sq->log_desc_n;
uint16_t mask = size - 1;
uint16_t max;
uint32_t upper_offset = offset;
uint64_t addr;
uint32_t ctrl_gen_id = 0;
uint8_t opcmod = sh->cdev->config.hca_attr.flow_access_aso_opc_mod;
rte_be32_t lkey = rte_cpu_to_be_32(cpool->raw_mng->mr.lkey);
uint16_t aso_n = (uint16_t)(RTE_ALIGN_CEIL(n, 4) / 4);
uint32_t ccntid;
max = RTE_MIN(size - (uint16_t)(sq->head - sq->tail), aso_n);
if (unlikely(!max))
return 0;
upper_offset += (max * 4);
/* Only one burst is in flight at a time, so the same elt can be reused. */
sq->elts[0].burst_size = max;
ctrl_gen_id = dcs_id_base;
ctrl_gen_id /= 4;
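/*
* Each ASO WQE reads a bulk of 4 counters, so both the WQE count and the
* counter object ID in the control segment work at a granularity of 4.
*/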
do {
ccntid = upper_offset - max * 4;
wqe = &sq->sq_obj.aso_wqes[sq->head & mask];
rte_prefetch0(&sq->sq_obj.aso_wqes[(sq->head + 1) & mask]);
wqe->general_cseg.misc = rte_cpu_to_be_32(ctrl_gen_id);
wqe->general_cseg.flags = RTE_BE32(MLX5_COMP_ONLY_FIRST_ERR <<
MLX5_COMP_MODE_OFFSET);
wqe->general_cseg.opcode = rte_cpu_to_be_32
(MLX5_OPCODE_ACCESS_ASO |
(opcmod <<
WQE_CSEG_OPC_MOD_OFFSET) |
(sq->pi <<
WQE_CSEG_WQE_INDEX_OFFSET));
addr = (uint64_t)RTE_PTR_ADD(cpool->raw_mng->raw,
ccntid * sizeof(struct flow_counter_stats));
wqe->aso_cseg.va_h = rte_cpu_to_be_32((uint32_t)(addr >> 32));
wqe->aso_cseg.va_l_r = rte_cpu_to_be_32((uint32_t)addr | 1u);
wqe->aso_cseg.lkey = lkey;
sq->pi += 2; /* Each WQE contains 2 WQEBB's. */
sq->head++;
sq->next++;
ctrl_gen_id++;
max--;
} while (max);
wqe->general_cseg.flags = RTE_BE32(MLX5_COMP_ALWAYS <<
MLX5_COMP_MODE_OFFSET);
mlx5_doorbell_ring(&sh->tx_uar.bf_db, *(volatile uint64_t *)wqe,
sq->pi, &sq->sq_obj.db_rec[MLX5_SND_DBR],
!sh->tx_uar.dbnc);
return sq->elts[0].burst_size;
}
static uint16_t
mlx5_aso_cnt_completion_handle(struct mlx5_aso_sq *sq)
{
struct mlx5_aso_cq *cq = &sq->cq;
volatile struct mlx5_cqe *restrict cqe;
const unsigned int cq_size = 1 << cq->log_desc_n;
const unsigned int mask = cq_size - 1;
uint32_t idx;
uint32_t next_idx = cq->cq_ci & mask;
const uint16_t max = (uint16_t)(sq->head - sq->tail);
uint16_t i = 0;
int ret;
if (unlikely(!max))
return 0;
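/*
* Only the last WQE of a burst requests a CQE, so a single CQE completes
* the whole burst recorded in elts[0].
*/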
idx = next_idx;
next_idx = (cq->cq_ci + 1) & mask;
rte_prefetch0(&cq->cq_obj.cqes[next_idx]);
cqe = &cq->cq_obj.cqes[idx];
ret = check_cqe(cqe, cq_size, cq->cq_ci);
/*
* Be sure owner read is done before any other cookie field or
* opaque field.
*/
rte_io_rmb();
if (unlikely(ret != MLX5_CQE_STATUS_SW_OWN)) {
if (likely(ret == MLX5_CQE_STATUS_HW_OWN))
return 0; /* return immediately. */
mlx5_aso_cqe_err_handle(sq);
}
i += sq->elts[0].burst_size;
sq->elts[0].burst_size = 0;
cq->cq_ci++;
if (likely(i)) {
sq->tail += i;
rte_io_wmb();
cq->cq_obj.db_rec[0] = rte_cpu_to_be_32(cq->cq_ci);
}
return i;
}
static uint16_t
mlx5_aso_cnt_query_one_dcs(struct mlx5_dev_ctx_shared *sh,
struct mlx5_hws_cnt_pool *cpool,
uint8_t dcs_idx, uint32_t num)
{
uint32_t dcs_id = cpool->dcs_mng.dcs[dcs_idx].obj->id;
uint64_t cnt_num = cpool->dcs_mng.dcs[dcs_idx].batch_sz;
uint64_t left;
uint32_t iidx = cpool->dcs_mng.dcs[dcs_idx].iidx;
uint32_t offset;
uint16_t mask;
uint16_t sq_idx;
uint64_t burst_sz = (uint64_t)(1 << MLX5_ASO_CNT_QUEUE_LOG_DESC) * 4 *
sh->cnt_svc->aso_mng.sq_num;
uint64_t qburst_sz = burst_sz / sh->cnt_svc->aso_mng.sq_num;
uint64_t n;
struct mlx5_aso_sq *sq;
cnt_num = RTE_MIN(num, cnt_num);
left = cnt_num;
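/*
* Spread the counter range over all ASO SQs, then busy-poll every SQ's CQ;
* the bitmask tracks which SQs have completed their burst.
*/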
while (left) {
mask = 0;
for (sq_idx = 0; sq_idx < sh->cnt_svc->aso_mng.sq_num;
sq_idx++) {
if (left == 0) {
mask |= (1 << sq_idx);
continue;
}
n = RTE_MIN(left, qburst_sz);
offset = cnt_num - left;
offset += iidx;
mlx5_aso_cnt_sq_enqueue_burst(cpool, sh,
&sh->cnt_svc->aso_mng.sqs[sq_idx], n,
offset, dcs_id);
left -= n;
}
do {
for (sq_idx = 0; sq_idx < sh->cnt_svc->aso_mng.sq_num;
sq_idx++) {
sq = &sh->cnt_svc->aso_mng.sqs[sq_idx];
if (mlx5_aso_cnt_completion_handle(sq))
mask |= (1 << sq_idx);
}
} while (mask < ((1 << sh->cnt_svc->aso_mng.sq_num) - 1));
}
return cnt_num;
}
/*
* Query FW counter via ASO WQE.
*
* The ASO counter query works in _sync_ mode, which means:
* 1. Each SQ issues one burst consisting of several WQEs.
* 2. Only the last WQE of a burst asks for a CQE.
* 3. The CQ of every SQ is busy-polled.
* 4. Once every SQ's CQE has been received, go back to step 1 and issue
*    the next burst.
*
* @param[in] sh
* Pointer to shared device.
* @param[in] cpool
* Pointer to counter pool.
*
* @return
* 0 on success, -1 on failure.
*/
int
mlx5_aso_cnt_query(struct mlx5_dev_ctx_shared *sh,
struct mlx5_hws_cnt_pool *cpool)
{
uint32_t idx;
uint32_t num;
uint32_t cnt_num = mlx5_hws_cnt_pool_get_size(cpool) -
rte_ring_count(cpool->free_list);
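/* Only the number of counters currently in use needs to be queried. */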
for (idx = 0; idx < cpool->dcs_mng.batch_total; idx++) {
num = RTE_MIN(cnt_num, cpool->dcs_mng.dcs[idx].batch_sz);
mlx5_aso_cnt_query_one_dcs(sh, cpool, idx, num);
cnt_num -= num;
if (cnt_num == 0)
break;
}
return 0;
}