numam-dpdk/drivers/net/mana/mr.c
Long Li 0f5db3c68b net/mana: implement memory registration
MANA hardware has a built-in IOMMU that provides hardware-safe access to
user memory through memory registration. Since memory registration is an
expensive operation, this patch implements a two-level memory registration
cache mechanism for each queue and for each port.

Signed-off-by: Long Li <longli@microsoft.com>
2022-10-06 10:49:00 +02:00
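
As a rough illustration of how a datapath consumes this two-level cache, a
hypothetical TX-path caller might look like the sketch below. Only
mana_find_pmd_mr() is the real entry point from this file; struct mana_txq
and its mr_btree member are assumptions about the surrounding driver context.

/* Hypothetical usage sketch (not part of mr.c): resolve an mbuf's lkey.
 * Assumes txq holds the per-queue tree and priv the per-port tree.
 */
static inline int
tx_resolve_lkey(struct mana_txq *txq, struct mana_priv *priv,
		struct rte_mbuf *m, uint32_t *lkey)
{
	struct mana_mr_cache *mr;

	/* Level 1: per-queue tree (no lock); level 2: per-port tree
	 * under mr_btree_lock; fallback: register all chunks of m->pool.
	 */
	mr = mana_find_pmd_mr(&txq->mr_btree, priv, m);
	if (!mr)
		return -ENOMEM;

	*lkey = mr->lkey;
	return 0;
}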


/* SPDX-License-Identifier: BSD-3-Clause
 * Copyright 2022 Microsoft Corporation
 */

#include <rte_malloc.h>
#include <ethdev_driver.h>
#include <rte_eal_paging.h>

#include <infiniband/verbs.h>

#include "mana.h"
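
/* Page-aligned address range covered by one mempool memory chunk */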
struct mana_range {
	uintptr_t start;
	uintptr_t end;
	uint32_t len;
};
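
/*
 * Callback for rte_mempool_mem_iter(): record the page-aligned address
 * range covered by each memory chunk of the pool.
 */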
void
mana_mempool_chunk_cb(struct rte_mempool *mp __rte_unused, void *opaque,
		      struct rte_mempool_memhdr *memhdr, unsigned int idx)
{
	struct mana_range *ranges = opaque;
	struct mana_range *range = &ranges[idx];
	uint64_t page_size = rte_mem_page_size();

	range->start = RTE_ALIGN_FLOOR((uintptr_t)memhdr->addr, page_size);
	range->end = RTE_ALIGN_CEIL((uintptr_t)memhdr->addr + memhdr->len,
				    page_size);
	range->len = range->end - range->start;
}

/*
* Register all memory regions from pool.
*/
int
mana_new_pmd_mr(struct mana_mr_btree *local_tree, struct mana_priv *priv,
		struct rte_mempool *pool)
{
	struct ibv_mr *ibv_mr;
	struct mana_range ranges[pool->nb_mem_chunks];
	uint32_t i;
	struct mana_mr_cache *mr;
	int ret;

	rte_mempool_mem_iter(pool, mana_mempool_chunk_cb, ranges);

	for (i = 0; i < pool->nb_mem_chunks; i++) {
		if (ranges[i].len > priv->max_mr_size) {
			DRV_LOG(ERR, "memory chunk size %u exceeding max MR",
				ranges[i].len);
			return -ENOMEM;
		}

		DRV_LOG(DEBUG,
			"registering memory chunk start 0x%" PRIx64 " len %u",
			ranges[i].start, ranges[i].len);

		if (rte_eal_process_type() == RTE_PROC_SECONDARY) {
			/* Send a message to the primary to do MR */
			ret = mana_mp_req_mr_create(priv, ranges[i].start,
						    ranges[i].len);
			if (ret) {
				DRV_LOG(ERR,
					"MR failed start 0x%" PRIx64 " len %u",
					ranges[i].start, ranges[i].len);
				return ret;
			}
			continue;
		}

		ibv_mr = ibv_reg_mr(priv->ib_pd, (void *)ranges[i].start,
				    ranges[i].len, IBV_ACCESS_LOCAL_WRITE);
		if (ibv_mr) {
			DRV_LOG(DEBUG, "MR lkey %u addr %p len %" PRIu64,
				ibv_mr->lkey, ibv_mr->addr, ibv_mr->length);

			mr = rte_calloc("MANA MR", 1, sizeof(*mr), 0);
			if (!mr) {
				DRV_LOG(ERR, "Failed to allocate MR entry");
				ibv_dereg_mr(ibv_mr);
				return -ENOMEM;
			}
			mr->lkey = ibv_mr->lkey;
			mr->addr = (uintptr_t)ibv_mr->addr;
			mr->len = ibv_mr->length;
			mr->verb_obj = ibv_mr;

			rte_spinlock_lock(&priv->mr_btree_lock);
			ret = mana_mr_btree_insert(&priv->mr_btree, mr);
			rte_spinlock_unlock(&priv->mr_btree_lock);
			if (ret) {
				ibv_dereg_mr(ibv_mr);
				DRV_LOG(ERR, "Failed to add to global MR btree");
				return ret;
			}

			ret = mana_mr_btree_insert(local_tree, mr);
			if (ret) {
				/* Don't need to clean up MR as it's already
				 * in the global tree
				 */
				DRV_LOG(ERR, "Failed to add to local MR btree");
				return ret;
			}
		} else {
			DRV_LOG(ERR, "MR failed at 0x%" PRIx64 " len %u",
				ranges[i].start, ranges[i].len);
			return -errno;
		}
	}

	return 0;
}

/*
 * Deregister a MR.
 */
void
mana_del_pmd_mr(struct mana_mr_cache *mr)
{
	int ret;
	struct ibv_mr *ibv_mr = (struct ibv_mr *)mr->verb_obj;

	ret = ibv_dereg_mr(ibv_mr);
	if (ret)
		DRV_LOG(ERR, "dereg MR failed ret %d", ret);
}

/*
 * Find a MR from cache. If not found, register a new MR.
 */
struct mana_mr_cache *
mana_find_pmd_mr(struct mana_mr_btree *local_mr_btree, struct mana_priv *priv,
		 struct rte_mbuf *mbuf)
{
	struct rte_mempool *pool = mbuf->pool;
	int ret, second_try = 0;
	struct mana_mr_cache *mr;
	uint16_t idx;

	DRV_LOG(DEBUG, "finding mr for mbuf addr %p len %d",
		mbuf->buf_addr, mbuf->buf_len);

try_again:
	/* First try to find the MR in local queue tree */
	mr = mana_mr_btree_lookup(local_mr_btree, &idx,
				  (uintptr_t)mbuf->buf_addr, mbuf->buf_len);
	if (mr) {
		DRV_LOG(DEBUG,
			"Local mr lkey %u addr 0x%" PRIx64 " len %" PRIu64,
			mr->lkey, mr->addr, mr->len);
		return mr;
	}

	/* If not found, try to find the MR in global tree */
	rte_spinlock_lock(&priv->mr_btree_lock);
	mr = mana_mr_btree_lookup(&priv->mr_btree, &idx,
				  (uintptr_t)mbuf->buf_addr,
				  mbuf->buf_len);
	rte_spinlock_unlock(&priv->mr_btree_lock);

	/* If found in the global tree, add it to the local tree */
	if (mr) {
		ret = mana_mr_btree_insert(local_mr_btree, mr);
		if (ret) {
			DRV_LOG(DEBUG, "Failed to add MR to local tree.");
			return NULL;
		}

		DRV_LOG(DEBUG,
			"Added local MR key %u addr 0x%" PRIx64 " len %" PRIu64,
			mr->lkey, mr->addr, mr->len);
		return mr;
	}

	if (second_try) {
		DRV_LOG(ERR, "Internal error second try failed");
		return NULL;
	}

	ret = mana_new_pmd_mr(local_mr_btree, priv, pool);
	if (ret) {
		DRV_LOG(ERR, "Failed to allocate MR ret %d addr %p len %d",
			ret, mbuf->buf_addr, mbuf->buf_len);
		return NULL;
	}

	second_try = 1;
	goto try_again;
}
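
/*
 * Deregister all MRs tracked by the per-port (global) tree and reset the
 * tree to its initial single-entry state.
 */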
void
mana_remove_all_mr(struct mana_priv *priv)
{
	struct mana_mr_btree *bt = &priv->mr_btree;
	struct mana_mr_cache *mr;
	struct ibv_mr *ibv_mr;
	uint16_t i;

	rte_spinlock_lock(&priv->mr_btree_lock);

	/* Start with index 1 as the 1st entry is always NULL */
	for (i = 1; i < bt->len; i++) {
		mr = &bt->table[i];
		ibv_mr = mr->verb_obj;
		ibv_dereg_mr(ibv_mr);
	}
	bt->len = 1;

	rte_spinlock_unlock(&priv->mr_btree_lock);
}

/*
 * Expand the MR cache.
 * MR cache is maintained as a btree and expanded on demand.
 */
static int
mana_mr_btree_expand(struct mana_mr_btree *bt, int n)
{
	void *mem;

	mem = rte_realloc_socket(bt->table, n * sizeof(struct mana_mr_cache),
				 0, bt->socket);
	if (!mem) {
		DRV_LOG(ERR, "Failed to expand btree size %d", n);
		return -1;
	}

	DRV_LOG(DEBUG, "Expanded btree to size %d", n);
	bt->table = mem;
	bt->size = n;

	return 0;
}

/*
 * Look for a region of memory in MR cache.
 */
struct mana_mr_cache *
mana_mr_btree_lookup(struct mana_mr_btree *bt, uint16_t *idx,
		     uintptr_t addr, size_t len)
{
	struct mana_mr_cache *table;
	uint16_t n;
	uint16_t base = 0;
	int ret;

	n = bt->len;

	/* Try to double the cache if it's full */
	if (n == bt->size) {
		ret = mana_mr_btree_expand(bt, bt->size << 1);
		if (ret)
			return NULL;
	}

	table = bt->table;

	/* Do binary search on addr */
	do {
		uint16_t delta = n >> 1;

		if (addr < table[base + delta].addr) {
			n = delta;
		} else {
			base += delta;
			n -= delta;
		}
	} while (n > 1);

	*idx = base;

	if (addr + len <= table[base].addr + table[base].len)
		return &table[base];

	DRV_LOG(DEBUG,
		"addr 0x%" PRIx64 " len %zu idx %u sum 0x%" PRIx64 " not found",
		addr, len, *idx, addr + len);

	return NULL;
}
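
/*
 * Allocate the B-tree table on the requested socket and seed index 0
 * with a sentinel entry so the binary search in mana_mr_btree_lookup()
 * always has a lower bound.
 */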
int
mana_mr_btree_init(struct mana_mr_btree *bt, int n, int socket)
{
	memset(bt, 0, sizeof(*bt));
	bt->table = rte_calloc_socket("MANA B-tree table",
				      n,
				      sizeof(struct mana_mr_cache),
				      0, socket);
	if (!bt->table) {
		DRV_LOG(ERR, "Failed to allocate B-tree n %d socket %d",
			n, socket);
		return -ENOMEM;
	}

	bt->socket = socket;
	bt->size = n;

	/* First entry must be NULL for binary search to work */
	bt->table[0] = (struct mana_mr_cache) {
		.lkey = UINT32_MAX,
	};
	bt->len = 1;

	DRV_LOG(DEBUG, "B-tree initialized table %p size %d len %d",
		bt->table, n, bt->len);

	return 0;
}
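
/*
 * Release the B-tree table. MRs referenced by the entries must be
 * deregistered separately (see mana_remove_all_mr()).
 */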
void
mana_mr_btree_free(struct mana_mr_btree *bt)
{
	rte_free(bt->table);
	memset(bt, 0, sizeof(*bt));
}
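
/*
 * Insert an entry into the B-tree, keeping the table sorted by address.
 * Returns 0 if the entry is inserted or already present, and -1 (with
 * the overflow flag set) if the table is full.
 */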
int
mana_mr_btree_insert(struct mana_mr_btree *bt, struct mana_mr_cache *entry)
{
	struct mana_mr_cache *table;
	uint16_t idx = 0;
	uint16_t shift;

	if (mana_mr_btree_lookup(bt, &idx, entry->addr, entry->len)) {
		DRV_LOG(DEBUG, "Addr 0x%" PRIx64 " len %zu exists in btree",
			entry->addr, entry->len);
		return 0;
	}

	if (bt->len >= bt->size) {
		bt->overflow = 1;
		return -1;
	}

	table = bt->table;

	/* Insert after the largest entry that is still below the new one */
	idx++;

	/* Shift the tail of the table up by one slot to make room */
	shift = (bt->len - idx) * sizeof(struct mana_mr_cache);
	if (shift) {
		DRV_LOG(DEBUG, "Moving %u bytes from idx %u to %u",
			shift, idx, idx + 1);
		memmove(&table[idx + 1], &table[idx], shift);
	}

	table[idx] = *entry;
	bt->len++;

	DRV_LOG(DEBUG,
		"Inserted MR b-tree table %p idx %d addr 0x%" PRIx64 " len %zu",
		table, idx, entry->addr, entry->len);

	return 0;
}