numam-dpdk/drivers/vdpa/mlx5/mlx5_vdpa_mem.c
Li Zhang 06ebaaea20 vdpa/mlx5: add VM memory registration task
The driver creates a direct MR object in
the HW for each VM memory region,
which maps the VM physical address to
the actual physical address.

Later, after all the MRs are ready,
the driver creates an indirect MR to group all the direct MRs
into one virtual space from the HW perspective.

Create direct MRs in parallel using the MT mechanism.
After completion, the primary thread creates the indirect MR
needed for the subsequent virtq configurations.

This optimization accelerates the LM process and
reduces its time by 5%.

Signed-off-by: Li Zhang <lizh@nvidia.com>
Acked-by: Matan Azrad <matan@nvidia.com>
Reviewed-by: Maxime Coquelin <maxime.coquelin@redhat.com>
2022-06-21 11:18:15 +02:00


/* SPDX-License-Identifier: BSD-3-Clause
 * Copyright 2019 Mellanox Technologies, Ltd
 */
#include <stdlib.h>

#include <rte_malloc.h>
#include <rte_errno.h>
#include <rte_common.h>
#include <rte_sched_common.h>

#include <mlx5_prm.h>
#include <mlx5_common.h>

#include "mlx5_vdpa_utils.h"
#include "mlx5_vdpa.h"
void
mlx5_vdpa_mem_dereg(struct mlx5_vdpa_priv *priv)
{
	struct mlx5_vdpa_query_mr *mrs =
		(struct mlx5_vdpa_query_mr *)priv->mrs;
	struct mlx5_vdpa_query_mr *entry;
	int i;

	if (priv->mrs) {
		for (i = priv->num_mrs - 1; i >= 0; i--) {
			entry = &mrs[i];
			if (entry->is_indirect) {
				if (entry->mkey)
					claim_zero(
					mlx5_devx_cmd_destroy(entry->mkey));
			} else {
				if (entry->mr)
					claim_zero(
					mlx5_glue->dereg_mr(entry->mr));
			}
		}
		rte_free(priv->mrs);
		priv->mrs = NULL;
		priv->num_mrs = 0;
	}
	if (priv->vmem_info.vmem) {
		free(priv->vmem_info.vmem);
		priv->vmem_info.vmem = NULL;
	}
	priv->gpa_mkey_index = 0;
}
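
/* qsort() comparator: order vhost memory regions by guest physical address. */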
static int
mlx5_vdpa_regions_addr_cmp(const void *a, const void *b)
{
	const struct rte_vhost_mem_region *region_a = a;
	const struct rte_vhost_mem_region *region_b = b;

	if (region_a->guest_phys_addr < region_b->guest_phys_addr)
		return -1;
	if (region_a->guest_phys_addr > region_b->guest_phys_addr)
		return 1;
	return 0;
}

/* Number of 2G-max KLM chunks needed to cover sz bytes. */
#define KLM_NUM_MAX_ALIGN(sz) (RTE_ALIGN_CEIL(sz, MLX5_MAX_KLM_BYTE_COUNT) / \
			       MLX5_MAX_KLM_BYTE_COUNT)

/*
 * Allocate and sort the region list and choose the indirect mkey mode:
 * 1. Calculate the GCD, the guest memory size and the indirect mkey
 *    entries number per mode.
 * 2. Align the GCD to the maximum allowed size (2G) and to be a power of 2.
 * 3. Decide the indirect mkey mode according to the next rules:
 *    a. If both the KLM_FBS entries number and the KLM entries number are
 *       bigger than the maximum allowed (MLX5_DEVX_MAX_KLM_ENTRIES) - error.
 *    b. KLM mode if the KLM_FBS entries number is bigger than the maximum
 *       allowed (MLX5_DEVX_MAX_KLM_ENTRIES).
 *    c. KLM mode if the GCD is smaller than the minimum allowed (4K).
 *    d. KLM mode if the total size of the KLM entries fits in one cache
 *       line while the total size of the KLM_FBS entries does not.
 *    e. Otherwise, KLM_FBS mode.
 */
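/*
 * Illustrative example (region layout assumed, not taken from the code):
 * a 1G region at GPA 0x0 and a 2G region at GPA 0x80000000 leave a 1G
 * hole, so mem_size = 4G and GCD(1G, 1G, 2G) = 1G, already a power of 2
 * and below the 2G cap. KLM_FBS needs mem_size / GCD = 4 fixed-size
 * entries, while plain KLM needs one 2G-max chunk per region and hole,
 * i.e. 3 entries; assuming 16B KLM entries and 64B cache lines, both
 * lists fit in one cache line, so rule e selects KLM_FBS here.
 */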
static struct rte_vhost_memory *
mlx5_vdpa_vhost_mem_regions_prepare(int vid, uint8_t *mode, uint64_t *mem_size,
				    uint64_t *gcd, uint32_t *entries_num)
{
	struct rte_vhost_memory *mem;
	uint64_t size;
	uint64_t klm_entries_num = 0;
	uint64_t klm_fbs_entries_num;
	uint32_t i;
	int ret = rte_vhost_get_mem_table(vid, &mem);

	if (ret < 0) {
		DRV_LOG(ERR, "Failed to get VM memory layout vid =%d.", vid);
		rte_errno = EINVAL;
		return NULL;
	}
	qsort(mem->regions, mem->nregions, sizeof(mem->regions[0]),
	      mlx5_vdpa_regions_addr_cmp);
	*mem_size = (mem->regions[(mem->nregions - 1)].guest_phys_addr) +
		    (mem->regions[(mem->nregions - 1)].size) -
		    (mem->regions[0].guest_phys_addr);
	*gcd = 0;
	for (i = 0; i < mem->nregions; ++i) {
		DRV_LOG(INFO, "Region %u: HVA 0x%" PRIx64 ", GPA 0x%" PRIx64
			", size 0x%" PRIx64 ".", i,
			mem->regions[i].host_user_addr,
			mem->regions[i].guest_phys_addr, mem->regions[i].size);
		if (i > 0) {
			/* Hole handle. */
			size = mem->regions[i].guest_phys_addr -
			       (mem->regions[i - 1].guest_phys_addr +
				mem->regions[i - 1].size);
			*gcd = rte_get_gcd64(*gcd, size);
			klm_entries_num += KLM_NUM_MAX_ALIGN(size);
		}
		size = mem->regions[i].size;
		*gcd = rte_get_gcd64(*gcd, size);
		klm_entries_num += KLM_NUM_MAX_ALIGN(size);
	}
	if (*gcd > MLX5_MAX_KLM_BYTE_COUNT)
		*gcd = rte_get_gcd64(*gcd, MLX5_MAX_KLM_BYTE_COUNT);
	if (!RTE_IS_POWER_OF_2(*gcd)) {
		uint64_t candidate_gcd = rte_align64prevpow2(*gcd);

		while (candidate_gcd > 1 && (*gcd % candidate_gcd))
			candidate_gcd /= 2;
		DRV_LOG(DEBUG, "GCD 0x%" PRIx64 " is not power of 2. Adjusted "
			"GCD is 0x%" PRIx64 ".", *gcd, candidate_gcd);
		*gcd = candidate_gcd;
	}
	klm_fbs_entries_num = *mem_size / *gcd;
	if (*gcd < MLX5_MIN_KLM_FIXED_BUFFER_SIZE || klm_fbs_entries_num >
	    MLX5_DEVX_MAX_KLM_ENTRIES ||
	    ((klm_entries_num * sizeof(struct mlx5_klm)) <=
	    RTE_CACHE_LINE_SIZE && (klm_fbs_entries_num *
	    sizeof(struct mlx5_klm)) > RTE_CACHE_LINE_SIZE)) {
		*mode = MLX5_MKC_ACCESS_MODE_KLM;
		*entries_num = klm_entries_num;
		DRV_LOG(INFO, "Indirect mkey mode is KLM.");
	} else {
		*mode = MLX5_MKC_ACCESS_MODE_KLM_FBS;
		*entries_num = klm_fbs_entries_num;
		DRV_LOG(INFO, "Indirect mkey mode is KLM Fixed Buffer Size.");
	}
	DRV_LOG(DEBUG, "Memory registration information: nregions = %u, "
		"mem_size = 0x%" PRIx64 ", GCD = 0x%" PRIx64
		", klm_fbs_entries_num = 0x%" PRIx64 ", klm_entries_num = 0x%"
		PRIx64 ".", mem->nregions, *mem_size, *gcd, klm_fbs_entries_num,
		klm_entries_num);
	if (*entries_num > MLX5_DEVX_MAX_KLM_ENTRIES) {
		DRV_LOG(ERR, "Failed to prepare memory of vid %d - memory is "
			"too fragmented.", vid);
		free(mem);
		return NULL;
	}
	return mem;
}
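
/* Compare two vhost memory layouts; return 0 only if the regions are identical. */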
static int
mlx5_vdpa_mem_cmp(struct rte_vhost_memory *mem1, struct rte_vhost_memory *mem2)
{
	uint32_t i;

	if (mem1->nregions != mem2->nregions)
		return -1;
	for (i = 0; i < mem1->nregions; i++) {
		if (mem1->regions[i].guest_phys_addr !=
		    mem2->regions[i].guest_phys_addr)
			return -1;
		if (mem1->regions[i].size != mem2->regions[i].size)
			return -1;
	}
	return 0;
}

/* Clamp a chunk size to the maximum KLM byte count (2G). */
#define KLM_SIZE_MAX_ALIGN(sz) ((sz) > MLX5_MAX_KLM_BYTE_COUNT ? \
				MLX5_MAX_KLM_BYTE_COUNT : (sz))
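
/*
 * Build the KLM array from the already-registered direct MRs (holes point
 * to the null MR) and create the single indirect mkey over it.
 */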
static int
mlx5_vdpa_create_indirect_mkey(struct mlx5_vdpa_priv *priv)
{
	struct mlx5_devx_mkey_attr mkey_attr;
	struct mlx5_vdpa_query_mr *mrs =
		(struct mlx5_vdpa_query_mr *)priv->mrs;
	struct mlx5_vdpa_query_mr *entry;
	struct rte_vhost_mem_region *reg;
	uint8_t mode = priv->vmem_info.mode;
	uint32_t entries_num = priv->vmem_info.entries_num;
	struct rte_vhost_memory *mem = priv->vmem_info.vmem;
	struct mlx5_klm klm_array[entries_num];
	uint64_t gcd = priv->vmem_info.gcd;
	int ret = -rte_errno;
	uint64_t klm_size;
	int klm_index = 0;
	uint64_t k;
	uint32_t i;

	/* Fill the KLM array: the regions and the holes between them. */
	for (i = 0; i < mem->nregions; i++) {
		entry = &mrs[i];
		reg = &mem->regions[i];
		if (i > 0) {
			uint64_t sadd;
			uint64_t empty_region_sz = reg->guest_phys_addr -
				(mem->regions[i - 1].guest_phys_addr +
				 mem->regions[i - 1].size);

			if (empty_region_sz > 0) {
				sadd = mem->regions[i - 1].guest_phys_addr +
				       mem->regions[i - 1].size;
				klm_size = mode == MLX5_MKC_ACCESS_MODE_KLM ?
					KLM_SIZE_MAX_ALIGN(empty_region_sz) :
					gcd;
				/* Map the hole to the null MR for security. */
				for (k = 0; k < empty_region_sz;
				     k += klm_size) {
					klm_array[klm_index].byte_count =
						k + klm_size > empty_region_sz ?
						empty_region_sz - k : klm_size;
					klm_array[klm_index].mkey =
						priv->null_mr->lkey;
					klm_array[klm_index].address = sadd + k;
					klm_index++;
				}
			}
		}
		klm_size = mode == MLX5_MKC_ACCESS_MODE_KLM ?
			KLM_SIZE_MAX_ALIGN(reg->size) : gcd;
		for (k = 0; k < reg->size; k += klm_size) {
			klm_array[klm_index].byte_count = k + klm_size >
				reg->size ? reg->size - k : klm_size;
			klm_array[klm_index].mkey = entry->mr->lkey;
			klm_array[klm_index].address = reg->guest_phys_addr + k;
			klm_index++;
		}
	}
	memset(&mkey_attr, 0, sizeof(mkey_attr));
	mkey_attr.addr = (uintptr_t)(mem->regions[0].guest_phys_addr);
	mkey_attr.size = priv->vmem_info.size;
	mkey_attr.pd = priv->cdev->pdn;
	mkey_attr.umem_id = 0;
	/* Must be zero for KLM mode. */
	mkey_attr.log_entity_size = mode == MLX5_MKC_ACCESS_MODE_KLM_FBS ?
				    rte_log2_u64(gcd) : 0;
	mkey_attr.pg_access = 0;
	mkey_attr.klm_array = klm_array;
	mkey_attr.klm_num = klm_index;
	/* The indirect mkey is stored in the last MR entry. */
	entry = &mrs[mem->nregions];
	entry->mkey = mlx5_devx_cmd_mkey_create(priv->cdev->ctx, &mkey_attr);
	if (!entry->mkey) {
		DRV_LOG(ERR, "Failed to create indirect Mkey.");
		rte_errno = -ret;
		return ret;
	}
	entry->is_indirect = 1;
	priv->gpa_mkey_index = entry->mkey->id;
	return 0;
}

/*
 * The target here is to group all the physical memory regions of the
 * virtio device in one indirect mkey.
 * For KLM Fixed Buffer Size mode (the HW finds the translation entry in
 * one read according to the guest physical address):
 *  All its sub-direct mkeys must be of the same size, hence each of them
 *  should be in the GCD size of all the virtio memory regions and of the
 *  holes between them.
 * For KLM mode (entries may differ in size, so the HW must iterate over
 * the entries):
 *  Each virtio memory region and each hole between regions gets one
 *  entry; it is only needed to respect the maximum allowed size (2G) by
 *  splitting the entries whose associated memory regions are bigger
 *  than 2G.
 * In both modes, a virtio memory region may thus be mapped to more than
 * one direct mkey.
 * All the holes of invalid memory between the virtio memory regions
 * are mapped to the null memory region for security.
 */
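/*
 * Resulting mapping for an assumed two-region layout with one hole:
 *
 *              indirect mkey (priv->gpa_mkey_index)
 *             /                |                   \
 *   direct MR (region 0)   null MR (hole)   direct MR (region 1)
 */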
int
mlx5_vdpa_mem_register(struct mlx5_vdpa_priv *priv)
{
	void *mrs;
	uint8_t mode = 0;
	int ret = -rte_errno;
	uint32_t i, thrd_idx, data[1];
	uint32_t remaining_cnt = 0, err_cnt = 0, task_num = 0;
	struct rte_vhost_memory *mem = mlx5_vdpa_vhost_mem_regions_prepare
		(priv->vid, &mode, &priv->vmem_info.size,
		 &priv->vmem_info.gcd, &priv->vmem_info.entries_num);

	if (!mem)
		return -rte_errno;
	if (priv->vmem_info.vmem != NULL) {
		if (mlx5_vdpa_mem_cmp(mem, priv->vmem_info.vmem) == 0) {
			/* VM memory not changed, reuse resources. */
			free(mem);
			return 0;
		}
		mlx5_vdpa_mem_dereg(priv);
	}
	priv->vmem_info.vmem = mem;
	priv->vmem_info.mode = mode;
	priv->num_mrs = mem->nregions;
	if (!priv->num_mrs || priv->num_mrs >= MLX5_VDPA_MAX_MRS) {
		DRV_LOG(ERR,
			"Invalid number of memory regions.");
		goto error;
	}
	/* The last one is the indirect mkey entry. */
	priv->num_mrs++;
	mrs = rte_zmalloc("mlx5 vDPA memory regions",
			  sizeof(struct mlx5_vdpa_query_mr) * priv->num_mrs, 0);
	priv->mrs = mrs;
	if (!priv->mrs) {
		DRV_LOG(ERR, "Failed to allocate private memory regions.");
		goto error;
	}
	if (priv->use_c_thread) {
		uint32_t main_task_idx[mem->nregions];

		/*
		 * Spread the region registrations round-robin over the
		 * configuration threads; every (max_thrds + 1)-th region
		 * is kept for the primary thread.
		 */
		for (i = 0; i < mem->nregions; i++) {
			thrd_idx = i % (conf_thread_mng.max_thrds + 1);
			if (!thrd_idx) {
				main_task_idx[task_num] = i;
				task_num++;
				continue;
			}
			thrd_idx = priv->last_c_thrd_idx + 1;
			if (thrd_idx >= conf_thread_mng.max_thrds)
				thrd_idx = 0;
			priv->last_c_thrd_idx = thrd_idx;
			data[0] = i;
			if (mlx5_vdpa_task_add(priv, thrd_idx,
					       MLX5_VDPA_TASK_REG_MR,
					       &remaining_cnt, &err_cnt,
					       (void **)&data, 1)) {
				DRV_LOG(ERR,
					"Fail to add task mem region (%d)", i);
				main_task_idx[task_num] = i;
				task_num++;
			}
		}
		for (i = 0; i < task_num; i++) {
			ret = mlx5_vdpa_register_mr(priv,
						    main_task_idx[i]);
			if (ret) {
				DRV_LOG(ERR,
					"Failed to register mem region %d.", i);
				goto error;
			}
		}
		/* Wait for the worker threads to finish their tasks. */
		if (mlx5_vdpa_c_thread_wait_bulk_tasks_done(&remaining_cnt,
							    &err_cnt, 100)) {
			DRV_LOG(ERR,
			"Failed to wait register mem region tasks ready.");
			goto error;
		}
	} else {
		for (i = 0; i < mem->nregions; i++) {
			ret = mlx5_vdpa_register_mr(priv, i);
			if (ret) {
				DRV_LOG(ERR,
					"Failed to register mem region %d.", i);
				goto error;
			}
		}
	}
	ret = mlx5_vdpa_create_indirect_mkey(priv);
	if (ret) {
		DRV_LOG(ERR, "Failed to create indirect mkey.");
		goto error;
	}
	return 0;
error:
	mlx5_vdpa_mem_dereg(priv);
	rte_errno = -ret;
	return ret;
}
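
/*
 * Register one VM memory region as a direct MR; also invoked per region
 * by the MLX5_VDPA_TASK_REG_MR tasks dispatched above.
 */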
int
mlx5_vdpa_register_mr(struct mlx5_vdpa_priv *priv, uint32_t idx)
{
	struct rte_vhost_memory *mem = priv->vmem_info.vmem;
	struct mlx5_vdpa_query_mr *mrs =
		(struct mlx5_vdpa_query_mr *)priv->mrs;
	struct mlx5_vdpa_query_mr *entry;
	struct rte_vhost_mem_region *reg;
	int ret;

	reg = &mem->regions[idx];
	entry = &mrs[idx];
	entry->mr = mlx5_glue->reg_mr_iova
		(priv->cdev->pd,
		 (void *)(uintptr_t)(reg->host_user_addr),
		 reg->size, reg->guest_phys_addr,
		 IBV_ACCESS_LOCAL_WRITE);
	if (!entry->mr) {
		DRV_LOG(ERR, "Failed to create direct Mkey.");
		ret = -rte_errno;
		return ret;
	}
	entry->is_indirect = 0;
	return 0;
}