vdpa/mlx5: add VM memory registration task

The driver creates a direct HW MR object for each
VM memory region, which maps the VM physical address
to the actual physical address.

Later, after all the MRs are ready,
the driver creates an indirect MR to group all the direct MRs
into one virtual space from the HW perspective.

Create direct MRs in parallel using the MT mechanism.
After completion, the primary thread creates the indirect MR
needed for the subsequent virtq configurations.

This optimization accelerates the LM process and
reduces its time by 5%.

Signed-off-by: Li Zhang <lizh@nvidia.com>
Acked-by: Matan Azrad <matan@nvidia.com>
Reviewed-by: Maxime Coquelin <maxime.coquelin@redhat.com>
Li Zhang 2022-06-18 12:02:53 +03:00 committed by Maxime Coquelin
parent 69e07f43a2
commit 06ebaaea20
5 changed files with 259 additions and 98 deletions
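
A minimal standalone sketch of the two-phase flow described above: direct MRs registered in parallel, then the indirect MR created by the primary thread. Plain pthread_join stands in for the driver's configuration-thread pool and completion counters; register_direct_mr() and create_indirect_mr() are illustrative placeholders, not driver functions.

#include <pthread.h>
#include <stdint.h>
#include <stdio.h>

#define NREGIONS 4

/* Placeholders for the per-region and grouping steps described above. */
static int register_direct_mr(uint32_t region_idx)
{
	printf("direct MR for region %u\n", region_idx);
	return 0;
}

static int create_indirect_mr(void)
{
	printf("indirect MR grouping all direct MRs\n");
	return 0;
}

static void *reg_worker(void *arg)
{
	register_direct_mr((uint32_t)(uintptr_t)arg);
	return NULL;
}

int main(void)
{
	pthread_t thr[NREGIONS];
	uint32_t i;

	/* Phase 1: direct MRs in parallel, one per VM memory region. */
	for (i = 0; i < NREGIONS; i++)
		pthread_create(&thr[i], NULL, reg_worker,
			       (void *)(uintptr_t)i);
	for (i = 0; i < NREGIONS; i++)
		pthread_join(thr[i], NULL);
	/* Phase 2: the primary thread groups them into one indirect MR. */
	return create_indirect_mr();
}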

@@ -772,7 +772,6 @@ mlx5_vdpa_dev_probe(struct mlx5_common_device *cdev,
rte_errno = rte_errno ? rte_errno : EINVAL;
goto error;
}
SLIST_INIT(&priv->mr_list);
pthread_mutex_lock(&priv_list_lock);
TAILQ_INSERT_TAIL(&priv_list, priv, next);
pthread_mutex_unlock(&priv_list_lock);

@@ -59,7 +59,6 @@ struct mlx5_vdpa_event_qp {
};
struct mlx5_vdpa_query_mr {
SLIST_ENTRY(mlx5_vdpa_query_mr) next;
union {
struct ibv_mr *mr;
struct mlx5_devx_obj *mkey;
@@ -76,10 +75,17 @@ enum {
#define MLX5_VDPA_MAX_C_THRD 256
#define MLX5_VDPA_MAX_TASKS_PER_THRD 4096
#define MLX5_VDPA_TASKS_PER_DEV 64
#define MLX5_VDPA_MAX_MRS 0xFFFF
/* Vdpa task types. */
enum mlx5_vdpa_task_type {
MLX5_VDPA_TASK_REG_MR = 1,
};
/* Generic task information and size must be multiple of 4B. */
struct mlx5_vdpa_task {
struct mlx5_vdpa_priv *priv;
enum mlx5_vdpa_task_type type;
uint32_t *remaining_cnt;
uint32_t *err_cnt;
uint32_t idx;
@@ -101,6 +107,14 @@ struct mlx5_vdpa_conf_thread_mng {
};
extern struct mlx5_vdpa_conf_thread_mng conf_thread_mng;
struct mlx5_vdpa_vmem_info {
struct rte_vhost_memory *vmem;
uint32_t entries_num;
uint64_t gcd;
uint64_t size;
uint8_t mode;
};
struct mlx5_vdpa_virtq {
SLIST_ENTRY(mlx5_vdpa_virtq) next;
uint8_t enable;
@@ -176,7 +190,7 @@ struct mlx5_vdpa_priv {
struct mlx5_hca_vdpa_attr caps;
uint32_t gpa_mkey_index;
struct ibv_mr *null_mr;
struct rte_vhost_memory *vmem;
struct mlx5_vdpa_vmem_info vmem_info;
struct mlx5dv_devx_event_channel *eventc;
struct mlx5dv_devx_event_channel *err_chnl;
struct mlx5_uar uar;
@@ -187,11 +201,13 @@ struct mlx5_vdpa_priv {
uint8_t num_lag_ports;
uint64_t features; /* Negotiated features. */
uint16_t log_max_rqt_size;
uint16_t last_c_thrd_idx;
uint16_t num_mrs; /* Number of memory regions. */
struct mlx5_vdpa_steer steer;
struct mlx5dv_var *var;
void *virtq_db_addr;
struct mlx5_pmd_wrapped_mr lm_mr;
SLIST_HEAD(mr_list, mlx5_vdpa_query_mr) mr_list;
struct mlx5_vdpa_query_mr **mrs;
struct mlx5_vdpa_virtq virtqs[];
};
@@ -548,5 +564,12 @@ mlx5_vdpa_mult_threads_destroy(bool need_unlock);
bool
mlx5_vdpa_task_add(struct mlx5_vdpa_priv *priv,
uint32_t thrd_idx,
uint32_t num);
enum mlx5_vdpa_task_type task_type,
uint32_t *bulk_refcnt, uint32_t *bulk_err_cnt,
void **task_data, uint32_t num);
int
mlx5_vdpa_register_mr(struct mlx5_vdpa_priv *priv, uint32_t idx);
bool
mlx5_vdpa_c_thread_wait_bulk_tasks_done(uint32_t *remaining_cnt,
uint32_t *err_cnt, uint32_t sleep_time);
#endif /* RTE_PMD_MLX5_VDPA_H_ */
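
A condensed sketch of how these new prototypes fit together, mirroring the mlx5_vdpa_mem_register() hunk later in this commit; it only compiles inside the driver, and the main-thread share of regions plus most error handling are trimmed. The function name is illustrative.

static int
sketch_register_all_mrs(struct mlx5_vdpa_priv *priv, uint32_t nregions)
{
	uint32_t remaining_cnt = 0, err_cnt = 0, data[1];
	uint32_t i, thrd_idx;

	for (i = 0; i < nregions; i++) {
		/* Round-robin the regions over the configuration threads. */
		thrd_idx = (priv->last_c_thrd_idx + 1) %
			   conf_thread_mng.max_thrds;
		priv->last_c_thrd_idx = thrd_idx;
		data[0] = i;
		if (mlx5_vdpa_task_add(priv, thrd_idx, MLX5_VDPA_TASK_REG_MR,
				       &remaining_cnt, &err_cnt,
				       (void **)&data, 1)) {
			/* Queueing failed: register this region inline. */
			if (mlx5_vdpa_register_mr(priv, i))
				return -1;
		}
	}
	/* Block until every queued task finished, then check for errors. */
	if (mlx5_vdpa_c_thread_wait_bulk_tasks_done(&remaining_cnt,
						    &err_cnt, 100))
		return -1;
	return 0;
}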

@@ -47,16 +47,23 @@ mlx5_vdpa_c_thrd_ring_enqueue_bulk(struct rte_ring *r,
bool
mlx5_vdpa_task_add(struct mlx5_vdpa_priv *priv,
uint32_t thrd_idx,
uint32_t num)
enum mlx5_vdpa_task_type task_type,
uint32_t *remaining_cnt, uint32_t *err_cnt,
void **task_data, uint32_t num)
{
struct rte_ring *rng = conf_thread_mng.cthrd[thrd_idx].rng;
struct mlx5_vdpa_task task[MLX5_VDPA_TASKS_PER_DEV];
uint32_t *data = (uint32_t *)task_data;
uint32_t i;
MLX5_ASSERT(num <= MLX5_VDPA_TASKS_PER_DEV);
for (i = 0 ; i < num; i++) {
task[i].priv = priv;
/* To be added later. */
task[i].type = task_type;
task[i].remaining_cnt = remaining_cnt;
task[i].err_cnt = err_cnt;
task[i].idx = data[i];
}
if (!mlx5_vdpa_c_thrd_ring_enqueue_bulk(rng, (void **)&task, num, NULL))
return -1;
@@ -71,6 +78,23 @@ mlx5_vdpa_task_add(struct mlx5_vdpa_priv *priv,
return 0;
}
bool
mlx5_vdpa_c_thread_wait_bulk_tasks_done(uint32_t *remaining_cnt,
uint32_t *err_cnt, uint32_t sleep_time)
{
/* Check and wait all tasks done. */
while (__atomic_load_n(remaining_cnt,
__ATOMIC_RELAXED) != 0) {
rte_delay_us_sleep(sleep_time);
}
if (__atomic_load_n(err_cnt,
__ATOMIC_RELAXED)) {
DRV_LOG(ERR, "Tasks done with error.");
return true;
}
return false;
}
static void *
mlx5_vdpa_c_thread_handle(void *arg)
{
@@ -81,6 +105,7 @@ mlx5_vdpa_c_thread_handle(void *arg)
struct rte_ring *rng;
uint32_t thrd_idx;
uint32_t task_num;
int ret;
for (thrd_idx = 0; thrd_idx < multhrd->max_thrds;
thrd_idx++)
@@ -99,13 +124,29 @@ mlx5_vdpa_c_thread_handle(void *arg)
&multhrd->cthrd[thrd_idx].c_cond,
&multhrd->cthrd_lock);
pthread_mutex_unlock(&multhrd->cthrd_lock);
continue;
}
priv = task.priv;
if (priv == NULL)
continue;
__atomic_fetch_sub(task.remaining_cnt,
switch (task.type) {
case MLX5_VDPA_TASK_REG_MR:
ret = mlx5_vdpa_register_mr(priv, task.idx);
if (ret) {
DRV_LOG(ERR,
"Failed to register mr %d.", task.idx);
__atomic_fetch_add(task.err_cnt, 1,
__ATOMIC_RELAXED);
}
break;
default:
DRV_LOG(ERR, "Invalid vdpa task type %d.",
task.type);
break;
}
if (task.remaining_cnt)
__atomic_fetch_sub(task.remaining_cnt,
1, __ATOMIC_RELAXED);
/* To be added later. */
}
return NULL;
}
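
The completion accounting above relies on two shared uint32_t counters manipulated with GCC __atomic builtins. Below is a self-contained illustration of that protocol, assuming the producer adds the number of queued tasks to *remaining_cnt when it enqueues them (that increment is not visible in this excerpt); a trivial loop stands in for the per-thread rte_ring.

#include <pthread.h>
#include <stdint.h>
#include <stdio.h>
#include <unistd.h>

#define N_TASKS 16

static uint32_t remaining_cnt;
static uint32_t err_cnt;

/* Worker side: run each task, bump err_cnt on failure, then decrement. */
static void *worker(void *arg)
{
	(void)arg;
	for (int i = 0; i < N_TASKS; i++) {
		int failed = (i == 7); /* pretend one task fails */

		if (failed)
			__atomic_fetch_add(&err_cnt, 1, __ATOMIC_RELAXED);
		__atomic_fetch_sub(&remaining_cnt, 1, __ATOMIC_RELAXED);
	}
	return NULL;
}

int main(void)
{
	pthread_t thr;

	/* Producer side: account for the tasks before handing them off. */
	__atomic_fetch_add(&remaining_cnt, N_TASKS, __ATOMIC_RELAXED);
	pthread_create(&thr, NULL, worker, NULL);
	/* Waiter side: same loop shape as mlx5_vdpa_c_thread_wait_bulk_tasks_done(). */
	while (__atomic_load_n(&remaining_cnt, __ATOMIC_RELAXED) != 0)
		usleep(100);
	if (__atomic_load_n(&err_cnt, __ATOMIC_RELAXED))
		printf("tasks done with error\n");
	pthread_join(thr, NULL);
	return 0;
}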

@@ -17,25 +17,33 @@
void
mlx5_vdpa_mem_dereg(struct mlx5_vdpa_priv *priv)
{
struct mlx5_vdpa_query_mr *mrs =
(struct mlx5_vdpa_query_mr *)priv->mrs;
struct mlx5_vdpa_query_mr *entry;
struct mlx5_vdpa_query_mr *next;
int i;
entry = SLIST_FIRST(&priv->mr_list);
while (entry) {
next = SLIST_NEXT(entry, next);
if (entry->is_indirect)
claim_zero(mlx5_devx_cmd_destroy(entry->mkey));
else
claim_zero(mlx5_glue->dereg_mr(entry->mr));
SLIST_REMOVE(&priv->mr_list, entry, mlx5_vdpa_query_mr, next);
rte_free(entry);
entry = next;
if (priv->mrs) {
for (i = priv->num_mrs - 1; i >= 0; i--) {
entry = &mrs[i];
if (entry->is_indirect) {
if (entry->mkey)
claim_zero(
mlx5_devx_cmd_destroy(entry->mkey));
} else {
if (entry->mr)
claim_zero(
mlx5_glue->dereg_mr(entry->mr));
}
}
rte_free(priv->mrs);
priv->mrs = NULL;
priv->num_mrs = 0;
}
SLIST_INIT(&priv->mr_list);
if (priv->vmem) {
free(priv->vmem);
priv->vmem = NULL;
if (priv->vmem_info.vmem) {
free(priv->vmem_info.vmem);
priv->vmem_info.vmem = NULL;
}
priv->gpa_mkey_index = 0;
}
static int
@@ -167,72 +175,29 @@ mlx5_vdpa_mem_cmp(struct rte_vhost_memory *mem1, struct rte_vhost_memory *mem2)
#define KLM_SIZE_MAX_ALIGN(sz) ((sz) > MLX5_MAX_KLM_BYTE_COUNT ? \
MLX5_MAX_KLM_BYTE_COUNT : (sz))
/*
* The target here is to group all the physical memory regions of the
* virtio device in one indirect mkey.
* For KLM Fixed Buffer Size mode (HW find the translation entry in one
* read according to the guest physical address):
* All the sub-direct mkeys of it must be in the same size, hence, each
* one of them should be in the GCD size of all the virtio memory
* regions and the holes between them.
* For KLM mode (each entry may be in different size so HW must iterate
* the entries):
* Each virtio memory region and each hole between them have one entry,
* just need to cover the maximum allowed size(2G) by splitting entries
* which their associated memory regions are bigger than 2G.
* It means that each virtio memory region may be mapped to more than
* one direct mkey in the 2 modes.
* All the holes of invalid memory between the virtio memory regions
* will be mapped to the null memory region for security.
*/
int
mlx5_vdpa_mem_register(struct mlx5_vdpa_priv *priv)
static int
mlx5_vdpa_create_indirect_mkey(struct mlx5_vdpa_priv *priv)
{
struct mlx5_devx_mkey_attr mkey_attr;
struct mlx5_vdpa_query_mr *entry = NULL;
struct rte_vhost_mem_region *reg = NULL;
uint8_t mode = 0;
uint32_t entries_num = 0;
uint32_t i;
uint64_t gcd = 0;
uint64_t klm_size;
uint64_t mem_size;
uint64_t k;
int klm_index = 0;
int ret;
struct rte_vhost_memory *mem = mlx5_vdpa_vhost_mem_regions_prepare
(priv->vid, &mode, &mem_size, &gcd, &entries_num);
struct mlx5_vdpa_query_mr *mrs =
(struct mlx5_vdpa_query_mr *)priv->mrs;
struct mlx5_vdpa_query_mr *entry;
struct rte_vhost_mem_region *reg;
uint8_t mode = priv->vmem_info.mode;
uint32_t entries_num = priv->vmem_info.entries_num;
struct rte_vhost_memory *mem = priv->vmem_info.vmem;
struct mlx5_klm klm_array[entries_num];
uint64_t gcd = priv->vmem_info.gcd;
int ret = -rte_errno;
uint64_t klm_size;
int klm_index = 0;
uint64_t k;
uint32_t i;
if (!mem)
return -rte_errno;
if (priv->vmem != NULL) {
if (mlx5_vdpa_mem_cmp(mem, priv->vmem) == 0) {
/* VM memory not changed, reuse resources. */
free(mem);
return 0;
}
mlx5_vdpa_mem_dereg(priv);
}
priv->vmem = mem;
/* If it is the last entry, create indirect mkey. */
for (i = 0; i < mem->nregions; i++) {
entry = &mrs[i];
reg = &mem->regions[i];
entry = rte_zmalloc(__func__, sizeof(*entry), 0);
if (!entry) {
ret = -ENOMEM;
DRV_LOG(ERR, "Failed to allocate mem entry memory.");
goto error;
}
entry->mr = mlx5_glue->reg_mr_iova(priv->cdev->pd,
(void *)(uintptr_t)(reg->host_user_addr),
reg->size, reg->guest_phys_addr,
IBV_ACCESS_LOCAL_WRITE);
if (!entry->mr) {
DRV_LOG(ERR, "Failed to create direct Mkey.");
ret = -rte_errno;
goto error;
}
entry->is_indirect = 0;
if (i > 0) {
uint64_t sadd;
uint64_t empty_region_sz = reg->guest_phys_addr -
@@ -265,11 +230,10 @@ mlx5_vdpa_mem_register(struct mlx5_vdpa_priv *priv)
klm_array[klm_index].address = reg->guest_phys_addr + k;
klm_index++;
}
SLIST_INSERT_HEAD(&priv->mr_list, entry, next);
}
memset(&mkey_attr, 0, sizeof(mkey_attr));
mkey_attr.addr = (uintptr_t)(mem->regions[0].guest_phys_addr);
mkey_attr.size = mem_size;
mkey_attr.size = priv->vmem_info.size;
mkey_attr.pd = priv->cdev->pdn;
mkey_attr.umem_id = 0;
/* Must be zero for KLM mode. */
@@ -278,25 +242,159 @@ mlx5_vdpa_mem_register(struct mlx5_vdpa_priv *priv)
mkey_attr.pg_access = 0;
mkey_attr.klm_array = klm_array;
mkey_attr.klm_num = klm_index;
entry = rte_zmalloc(__func__, sizeof(*entry), 0);
if (!entry) {
DRV_LOG(ERR, "Failed to allocate memory for indirect entry.");
ret = -ENOMEM;
goto error;
}
entry = &mrs[mem->nregions];
entry->mkey = mlx5_devx_cmd_mkey_create(priv->cdev->ctx, &mkey_attr);
if (!entry->mkey) {
DRV_LOG(ERR, "Failed to create indirect Mkey.");
ret = -rte_errno;
goto error;
rte_errno = -ret;
return ret;
}
entry->is_indirect = 1;
SLIST_INSERT_HEAD(&priv->mr_list, entry, next);
priv->gpa_mkey_index = entry->mkey->id;
return 0;
}
/*
* The target here is to group all the physical memory regions of the
* virtio device in one indirect mkey.
* For KLM Fixed Buffer Size mode (HW find the translation entry in one
read according to the guest physical address):
* All the sub-direct mkeys of it must be in the same size, hence, each
* one of them should be in the GCD size of all the virtio memory
* regions and the holes between them.
* For KLM mode (each entry may be in different size so HW must iterate
* the entries):
* Each virtio memory region and each hole between them have one entry,
* just need to cover the maximum allowed size(2G) by splitting entries
* which their associated memory regions are bigger than 2G.
* It means that each virtio memory region may be mapped to more than
* one direct mkey in the 2 modes.
* All the holes of invalid memory between the virtio memory regions
* will be mapped to the null memory region for security.
*/
int
mlx5_vdpa_mem_register(struct mlx5_vdpa_priv *priv)
{
void *mrs;
uint8_t mode = 0;
int ret = -rte_errno;
uint32_t i, thrd_idx, data[1];
uint32_t remaining_cnt = 0, err_cnt = 0, task_num = 0;
struct rte_vhost_memory *mem = mlx5_vdpa_vhost_mem_regions_prepare
(priv->vid, &mode, &priv->vmem_info.size,
&priv->vmem_info.gcd, &priv->vmem_info.entries_num);
if (!mem)
return -rte_errno;
if (priv->vmem_info.vmem != NULL) {
if (mlx5_vdpa_mem_cmp(mem, priv->vmem_info.vmem) == 0) {
/* VM memory not changed, reuse resources. */
free(mem);
return 0;
}
mlx5_vdpa_mem_dereg(priv);
}
priv->vmem_info.vmem = mem;
priv->vmem_info.mode = mode;
priv->num_mrs = mem->nregions;
if (!priv->num_mrs || priv->num_mrs >= MLX5_VDPA_MAX_MRS) {
DRV_LOG(ERR,
"Invalid number of memory regions.");
goto error;
}
/* The last one is indirect mkey entry. */
priv->num_mrs++;
mrs = rte_zmalloc("mlx5 vDPA memory regions",
sizeof(struct mlx5_vdpa_query_mr) * priv->num_mrs, 0);
priv->mrs = mrs;
if (!priv->mrs) {
DRV_LOG(ERR, "Failed to allocate private memory regions.");
goto error;
}
if (priv->use_c_thread) {
uint32_t main_task_idx[mem->nregions];
for (i = 0; i < mem->nregions; i++) {
thrd_idx = i % (conf_thread_mng.max_thrds + 1);
if (!thrd_idx) {
main_task_idx[task_num] = i;
task_num++;
continue;
}
thrd_idx = priv->last_c_thrd_idx + 1;
if (thrd_idx >= conf_thread_mng.max_thrds)
thrd_idx = 0;
priv->last_c_thrd_idx = thrd_idx;
data[0] = i;
if (mlx5_vdpa_task_add(priv, thrd_idx,
MLX5_VDPA_TASK_REG_MR,
&remaining_cnt, &err_cnt,
(void **)&data, 1)) {
DRV_LOG(ERR,
"Fail to add task mem region (%d)", i);
main_task_idx[task_num] = i;
task_num++;
}
}
for (i = 0; i < task_num; i++) {
ret = mlx5_vdpa_register_mr(priv,
main_task_idx[i]);
if (ret) {
DRV_LOG(ERR,
"Failed to register mem region %d.", i);
goto error;
}
}
if (mlx5_vdpa_c_thread_wait_bulk_tasks_done(&remaining_cnt,
&err_cnt, 100)) {
DRV_LOG(ERR,
"Failed to wait register mem region tasks ready.");
goto error;
}
} else {
for (i = 0; i < mem->nregions; i++) {
ret = mlx5_vdpa_register_mr(priv, i);
if (ret) {
DRV_LOG(ERR,
"Failed to register mem region %d.", i);
goto error;
}
}
}
ret = mlx5_vdpa_create_indirect_mkey(priv);
if (ret) {
DRV_LOG(ERR, "Failed to create indirect mkey .");
goto error;
}
return 0;
error:
rte_free(entry);
mlx5_vdpa_mem_dereg(priv);
rte_errno = -ret;
return ret;
}
int
mlx5_vdpa_register_mr(struct mlx5_vdpa_priv *priv, uint32_t idx)
{
struct rte_vhost_memory *mem = priv->vmem_info.vmem;
struct mlx5_vdpa_query_mr *mrs =
(struct mlx5_vdpa_query_mr *)priv->mrs;
struct mlx5_vdpa_query_mr *entry;
struct rte_vhost_mem_region *reg;
int ret;
reg = &mem->regions[idx];
entry = &mrs[idx];
entry->mr = mlx5_glue->reg_mr_iova
(priv->cdev->pd,
(void *)(uintptr_t)(reg->host_user_addr),
reg->size, reg->guest_phys_addr,
IBV_ACCESS_LOCAL_WRITE);
if (!entry->mr) {
DRV_LOG(ERR, "Failed to create direct Mkey.");
ret = -rte_errno;
return ret;
}
entry->is_indirect = 0;
return 0;
}
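
To make the two modes in the block comment above concrete, here is a standalone worked example with an invented guest layout: Fixed Buffer Size mode sizes every sub-direct mkey to the GCD of the regions and the holes between them, while plain KLM mode gives each region or hole its own entry, split at the 2G limit. The layout and the 2G constant are illustrative, not taken from the device capabilities.

#include <inttypes.h>
#include <stdio.h>

#define MAX_KLM_BYTES (2ULL << 30)	/* 2G per-entry limit from the comment */

static uint64_t gcd64(uint64_t a, uint64_t b)
{
	while (b) {
		uint64_t t = a % b;

		a = b;
		b = t;
	}
	return a;
}

int main(void)
{
	/* Invented layout: region [0, 2G), hole [2G, 3G), region [3G, 4G). */
	uint64_t chunks[] = { 2ULL << 30, 1ULL << 30, 1ULL << 30 };
	uint64_t total = 0, g = 0, klm_entries = 0;
	unsigned int i;

	for (i = 0; i < 3; i++) {
		g = gcd64(g, chunks[i]);
		total += chunks[i];
		/* KLM mode: one entry per chunk, split at the 2G limit. */
		klm_entries += (chunks[i] + MAX_KLM_BYTES - 1) / MAX_KLM_BYTES;
	}
	/* KLM FBS mode: every sub-direct mkey covers exactly GCD bytes. */
	printf("GCD = %" PRIu64 " bytes, FBS entries = %" PRIu64 "\n",
	       g, total / g);
	printf("KLM mode entries = %" PRIu64 "\n", klm_entries);
	return 0;
}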

@@ -353,21 +353,21 @@ mlx5_vdpa_virtq_sub_objs_prepare(struct mlx5_vdpa_priv *priv,
}
}
if (attr->q_type == MLX5_VIRTQ_TYPE_SPLIT) {
gpa = mlx5_vdpa_hva_to_gpa(priv->vmem,
gpa = mlx5_vdpa_hva_to_gpa(priv->vmem_info.vmem,
(uint64_t)(uintptr_t)vq->desc);
if (!gpa) {
DRV_LOG(ERR, "Failed to get descriptor ring GPA.");
return -1;
}
attr->desc_addr = gpa;
gpa = mlx5_vdpa_hva_to_gpa(priv->vmem,
gpa = mlx5_vdpa_hva_to_gpa(priv->vmem_info.vmem,
(uint64_t)(uintptr_t)vq->used);
if (!gpa) {
DRV_LOG(ERR, "Failed to get GPA for used ring.");
return -1;
}
attr->used_addr = gpa;
gpa = mlx5_vdpa_hva_to_gpa(priv->vmem,
gpa = mlx5_vdpa_hva_to_gpa(priv->vmem_info.vmem,
(uint64_t)(uintptr_t)vq->avail);
if (!gpa) {
DRV_LOG(ERR, "Failed to get GPA for available ring.");