net/mana: implement memory registration

MANA hardware has a built-in IOMMU that provides hardware-safe access to
user memory through memory registration. Since memory registration is an
expensive operation, this patch implements a two-level memory registration
cache mechanism for each queue and for each port.

Signed-off-by: Long Li <longli@microsoft.com>
This commit is contained in:
Long Li 2022-10-05 16:22:00 -07:00 committed by Ferruh Yigit
parent f7dc479a13
commit 0f5db3c68b
5 changed files with 503 additions and 0 deletions

View File

@ -111,6 +111,8 @@ mana_dev_close(struct rte_eth_dev *dev)
struct mana_priv *priv = dev->data->dev_private;
int ret;
mana_remove_all_mr(priv);
ret = mana_intr_uninstall(priv);
if (ret)
return ret;
@ -331,6 +333,13 @@ mana_dev_tx_queue_setup(struct rte_eth_dev *dev, uint16_t queue_idx,
goto fail;
}
ret = mana_mr_btree_init(&txq->mr_btree,
MANA_MR_BTREE_PER_QUEUE_N, socket_id);
if (ret) {
DRV_LOG(ERR, "Failed to init TXQ MR btree");
goto fail;
}
DRV_LOG(DEBUG, "idx %u nb_desc %u socket %u txq->desc_ring %p",
queue_idx, nb_desc, socket_id, txq->desc_ring);
@ -353,6 +362,8 @@ mana_dev_tx_queue_release(struct rte_eth_dev *dev, uint16_t qid)
{
struct mana_txq *txq = dev->data->tx_queues[qid];
mana_mr_btree_free(&txq->mr_btree);
rte_free(txq->desc_ring);
rte_free(txq);
}
@ -392,6 +403,13 @@ mana_dev_rx_queue_setup(struct rte_eth_dev *dev, uint16_t queue_idx,
rxq->desc_ring_head = 0;
rxq->desc_ring_tail = 0;
ret = mana_mr_btree_init(&rxq->mr_btree,
MANA_MR_BTREE_PER_QUEUE_N, socket_id);
if (ret) {
DRV_LOG(ERR, "Failed to init RXQ MR btree");
goto fail;
}
rxq->priv = priv;
rxq->num_desc = nb_desc;
rxq->mp = mp;
@ -410,6 +428,8 @@ mana_dev_rx_queue_release(struct rte_eth_dev *dev, uint16_t qid)
{
struct mana_rxq *rxq = dev->data->rx_queues[qid];
mana_mr_btree_free(&rxq->mr_btree);
rte_free(rxq->desc_ring);
rte_free(rxq);
}

View File

@ -44,6 +44,22 @@ struct mana_shared_data {
#define MAX_RECEIVE_BUFFERS_PER_QUEUE 256
#define MAX_SEND_BUFFERS_PER_QUEUE 256
/* One cached memory registration: the address range it covers and its key. */
struct mana_mr_cache {
/* Local key copied from the ibv_mr this entry describes */
uint32_t lkey;
/* Start address of the registered region */
uintptr_t addr;
/* Length of the registered region */
size_t len;
/* The struct ibv_mr the entry was built from; used to deregister it */
void *verb_obj;
};
#define MANA_MR_BTREE_CACHE_N 512
/* MR cache table: an array of mana_mr_cache entries searched by address. */
struct mana_mr_btree {
uint16_t len; /* Used entries */
uint16_t size; /* Total entries */
/* Set to 1 by mana_mr_btree_insert() when the table has no free slot */
int overflow;
/* Socket id handed to the rte_*_socket allocators for the table */
int socket;
/* Entry array; entry 0 is a placeholder written by mana_mr_btree_init() */
struct mana_mr_cache *table;
};
struct mana_process_priv {
void *db_page;
};
@ -73,6 +89,8 @@ struct mana_priv {
int max_recv_sge;
int max_mr;
uint64_t max_mr_size;
struct mana_mr_btree mr_btree;
rte_spinlock_t mr_btree_lock;
};
struct mana_txq_desc {
@ -85,6 +103,8 @@ struct mana_rxq_desc {
uint32_t wqe_size_in_bu;
};
#define MANA_MR_BTREE_PER_QUEUE_N 64
struct mana_txq {
struct mana_priv *priv;
uint32_t num_desc;
@ -97,6 +117,7 @@ struct mana_txq {
*/
uint32_t desc_ring_head, desc_ring_tail;
struct mana_mr_btree mr_btree;
unsigned int socket;
};
@ -113,6 +134,8 @@ struct mana_rxq {
*/
uint32_t desc_ring_head, desc_ring_tail;
struct mana_mr_btree mr_btree;
unsigned int socket;
};
@ -135,6 +158,24 @@ uint16_t mana_rx_burst_removed(void *dpdk_rxq, struct rte_mbuf **pkts,
uint16_t mana_tx_burst_removed(void *dpdk_rxq, struct rte_mbuf **pkts,
uint16_t pkts_n);
struct mana_mr_cache *mana_find_pmd_mr(struct mana_mr_btree *local_tree,
struct mana_priv *priv,
struct rte_mbuf *mbuf);
int mana_new_pmd_mr(struct mana_mr_btree *local_tree, struct mana_priv *priv,
struct rte_mempool *pool);
void mana_remove_all_mr(struct mana_priv *priv);
void mana_del_pmd_mr(struct mana_mr_cache *mr);
void mana_mempool_chunk_cb(struct rte_mempool *mp, void *opaque,
struct rte_mempool_memhdr *memhdr, unsigned int idx);
struct mana_mr_cache *mana_mr_btree_lookup(struct mana_mr_btree *bt,
uint16_t *idx,
uintptr_t addr, size_t len);
int mana_mr_btree_insert(struct mana_mr_btree *bt, struct mana_mr_cache *entry);
int mana_mr_btree_init(struct mana_mr_btree *bt, int n, int socket);
void mana_mr_btree_free(struct mana_mr_btree *bt);
/** Request timeout for IPC. */
#define MANA_MP_REQ_TIMEOUT_SEC 5
@ -163,6 +204,7 @@ int mana_mp_init_secondary(void);
void mana_mp_uninit_primary(void);
void mana_mp_uninit_secondary(void);
int mana_mp_req_verbs_cmd_fd(struct rte_eth_dev *dev);
int mana_mp_req_mr_create(struct mana_priv *priv, uintptr_t addr, uint32_t len);
void mana_mp_req_on_rxtx(struct rte_eth_dev *dev, enum mana_mp_req_type type);

View File

@ -12,6 +12,7 @@ deps += ['pci', 'bus_pci', 'net', 'eal', 'kvargs']
sources += files(
'mana.c',
'mp.c',
'mr.c',
)
libnames = ['ibverbs', 'mana' ]

View File

@ -12,6 +12,55 @@
extern struct mana_shared_data *mana_shared_data;
/*
* Process MR request from secondary process.
*/
static int
mana_mp_mr_create(struct mana_priv *priv, uintptr_t addr, uint32_t len)
{
struct ibv_mr *ibv_mr;
int ret;
struct mana_mr_cache *mr;
ibv_mr = ibv_reg_mr(priv->ib_pd, (void *)addr, len,
IBV_ACCESS_LOCAL_WRITE);
if (!ibv_mr)
return -errno;
DRV_LOG(DEBUG, "MR (2nd) lkey %u addr %p len %zu",
ibv_mr->lkey, ibv_mr->addr, ibv_mr->length);
mr = rte_calloc("MANA MR", 1, sizeof(*mr), 0);
if (!mr) {
DRV_LOG(ERR, "(2nd) Failed to allocate MR");
ret = -ENOMEM;
goto fail_alloc;
}
mr->lkey = ibv_mr->lkey;
mr->addr = (uintptr_t)ibv_mr->addr;
mr->len = ibv_mr->length;
mr->verb_obj = ibv_mr;
rte_spinlock_lock(&priv->mr_btree_lock);
ret = mana_mr_btree_insert(&priv->mr_btree, mr);
rte_spinlock_unlock(&priv->mr_btree_lock);
if (ret) {
DRV_LOG(ERR, "(2nd) Failed to add to global MR btree");
goto fail_btree;
}
return 0;
fail_btree:
rte_free(mr);
fail_alloc:
ibv_dereg_mr(ibv_mr);
return ret;
}
static void
mp_init_msg(struct rte_mp_msg *msg, enum mana_mp_req_type type, int port_id)
{
@ -47,6 +96,12 @@ mana_mp_primary_handle(const struct rte_mp_msg *mp_msg, const void *peer)
mp_init_msg(&mp_res, param->type, param->port_id);
switch (param->type) {
case MANA_MP_REQ_CREATE_MR:
ret = mana_mp_mr_create(priv, param->addr, param->len);
res->result = ret;
ret = rte_mp_reply(&mp_res, peer);
break;
case MANA_MP_REQ_VERBS_CMD_FD:
mp_res.num_fds = 1;
mp_res.fds[0] = priv->ib_ctx->cmd_fd;
@ -194,6 +249,43 @@ mana_mp_req_verbs_cmd_fd(struct rte_eth_dev *dev)
return ret;
}
/*
 * Request the primary process to register a MR.
 *
 * Sends a MANA_MP_REQ_CREATE_MR message carrying addr/len and waits up to
 * MANA_MP_REQ_TIMEOUT_SEC seconds for the reply.
 *
 * Returns the result reported by the primary, the rte_mp_request_sync() error
 * if the request could not be completed, or -EPROTO when the number of
 * replies is not exactly one.
 */
int
mana_mp_req_mr_create(struct mana_priv *priv, uintptr_t addr, uint32_t len)
{
	struct rte_mp_msg mp_req = {0};
	struct rte_mp_msg *mp_res;
	struct rte_mp_reply mp_rep;
	struct mana_mp_param *req = (struct mana_mp_param *)mp_req.param;
	struct mana_mp_param *res;
	struct timespec ts = {.tv_sec = MANA_MP_REQ_TIMEOUT_SEC, .tv_nsec = 0};
	int ret;

	mp_init_msg(&mp_req, MANA_MP_REQ_CREATE_MR, priv->port_id);
	req->addr = addr;
	req->len = len;

	ret = rte_mp_request_sync(&mp_req, &mp_rep, &ts);
	if (ret) {
		DRV_LOG(ERR, "Port %u request to primary failed",
			req->port_id);
		return ret;
	}

	if (mp_rep.nb_received != 1) {
		ret = -EPROTO;
		goto out;
	}

	mp_res = &mp_rep.msgs[0];
	res = (struct mana_mp_param *)mp_res->param;
	ret = res->result;

out:
	/*
	 * The reply buffer belongs to the caller after a successful request,
	 * so it must be released on the unexpected-reply-count path as well.
	 */
	free(mp_rep.msgs);
	return ret;
}
void
mana_mp_req_on_rxtx(struct rte_eth_dev *dev, enum mana_mp_req_type type)
{

348
drivers/net/mana/mr.c Normal file
View File

@ -0,0 +1,348 @@
/* SPDX-License-Identifier: BSD-3-Clause
* Copyright 2022 Microsoft Corporation
*/
#include <rte_malloc.h>
#include <ethdev_driver.h>
#include <rte_eal_paging.h>
#include <infiniband/verbs.h>
#include "mana.h"
/* Page-aligned address range of one mempool memory chunk. */
struct mana_range {
/* Chunk start address rounded down to a page boundary */
uintptr_t start;
/* Chunk end address rounded up to a page boundary */
uintptr_t end;
/*
 * end - start.
 * NOTE(review): 32-bit, so a range of 4 GiB or more would be truncated
 * when stored here - confirm chunks can never be that large.
 */
uint32_t len;
};
/*
 * rte_mempool_mem_iter() callback: store the page-aligned extent of memory
 * chunk number idx into slot idx of the mana_range array passed as opaque.
 */
void
mana_mempool_chunk_cb(struct rte_mempool *mp __rte_unused, void *opaque,
		      struct rte_mempool_memhdr *memhdr, unsigned int idx)
{
	struct mana_range *slot = (struct mana_range *)opaque + idx;
	uint64_t pg_sz = rte_mem_page_size();
	uintptr_t chunk_begin = (uintptr_t)memhdr->addr;
	uintptr_t chunk_end = chunk_begin + memhdr->len;

	/* Widen the chunk outwards to whole pages */
	slot->start = RTE_ALIGN_FLOOR(chunk_begin, pg_sz);
	slot->end = RTE_ALIGN_CEIL(chunk_end, pg_sz);
	slot->len = slot->end - slot->start;
}
/*
* Register all memory regions from pool.
*/
int
mana_new_pmd_mr(struct mana_mr_btree *local_tree, struct mana_priv *priv,
struct rte_mempool *pool)
{
struct ibv_mr *ibv_mr;
struct mana_range ranges[pool->nb_mem_chunks];
uint32_t i;
struct mana_mr_cache *mr;
int ret;
rte_mempool_mem_iter(pool, mana_mempool_chunk_cb, ranges);
for (i = 0; i < pool->nb_mem_chunks; i++) {
if (ranges[i].len > priv->max_mr_size) {
DRV_LOG(ERR, "memory chunk size %u exceeding max MR",
ranges[i].len);
return -ENOMEM;
}
DRV_LOG(DEBUG,
"registering memory chunk start 0x%" PRIx64 " len %u",
ranges[i].start, ranges[i].len);
if (rte_eal_process_type() == RTE_PROC_SECONDARY) {
/* Send a message to the primary to do MR */
ret = mana_mp_req_mr_create(priv, ranges[i].start,
ranges[i].len);
if (ret) {
DRV_LOG(ERR,
"MR failed start 0x%" PRIx64 " len %u",
ranges[i].start, ranges[i].len);
return ret;
}
continue;
}
ibv_mr = ibv_reg_mr(priv->ib_pd, (void *)ranges[i].start,
ranges[i].len, IBV_ACCESS_LOCAL_WRITE);
if (ibv_mr) {
DRV_LOG(DEBUG, "MR lkey %u addr %p len %" PRIu64,
ibv_mr->lkey, ibv_mr->addr, ibv_mr->length);
mr = rte_calloc("MANA MR", 1, sizeof(*mr), 0);
mr->lkey = ibv_mr->lkey;
mr->addr = (uintptr_t)ibv_mr->addr;
mr->len = ibv_mr->length;
mr->verb_obj = ibv_mr;
rte_spinlock_lock(&priv->mr_btree_lock);
ret = mana_mr_btree_insert(&priv->mr_btree, mr);
rte_spinlock_unlock(&priv->mr_btree_lock);
if (ret) {
ibv_dereg_mr(ibv_mr);
DRV_LOG(ERR, "Failed to add to global MR btree");
return ret;
}
ret = mana_mr_btree_insert(local_tree, mr);
if (ret) {
/* Don't need to clean up MR as it's already
* in the global tree
*/
DRV_LOG(ERR, "Failed to add to local MR btree");
return ret;
}
} else {
DRV_LOG(ERR, "MR failed at 0x%" PRIx64 " len %u",
ranges[i].start, ranges[i].len);
return -errno;
}
}
return 0;
}
/*
 * Deregister a MR.
 *
 * Hands the ibv_mr stored in mr->verb_obj to ibv_dereg_mr() and logs an
 * error when that call reports a failure.
 */
void
mana_del_pmd_mr(struct mana_mr_cache *mr)
{
	int rc = ibv_dereg_mr((struct ibv_mr *)mr->verb_obj);

	if (rc != 0)
		DRV_LOG(ERR, "dereg MR failed ret %d", rc);
}
/*
* Find a MR from cache. If not found, register a new MR.
*/
struct mana_mr_cache *
mana_find_pmd_mr(struct mana_mr_btree *local_mr_btree, struct mana_priv *priv,
struct rte_mbuf *mbuf)
{
struct rte_mempool *pool = mbuf->pool;
int ret, second_try = 0;
struct mana_mr_cache *mr;
uint16_t idx;
DRV_LOG(DEBUG, "finding mr for mbuf addr %p len %d",
mbuf->buf_addr, mbuf->buf_len);
try_again:
/* First try to find the MR in local queue tree */
mr = mana_mr_btree_lookup(local_mr_btree, &idx,
(uintptr_t)mbuf->buf_addr, mbuf->buf_len);
if (mr) {
DRV_LOG(DEBUG,
"Local mr lkey %u addr 0x%" PRIx64 " len %" PRIu64,
mr->lkey, mr->addr, mr->len);
return mr;
}
/* If not found, try to find the MR in global tree */
rte_spinlock_lock(&priv->mr_btree_lock);
mr = mana_mr_btree_lookup(&priv->mr_btree, &idx,
(uintptr_t)mbuf->buf_addr,
mbuf->buf_len);
rte_spinlock_unlock(&priv->mr_btree_lock);
/* If found in the global tree, add it to the local tree */
if (mr) {
ret = mana_mr_btree_insert(local_mr_btree, mr);
if (ret) {
DRV_LOG(DEBUG, "Failed to add MR to local tree.");
return NULL;
}
DRV_LOG(DEBUG,
"Added local MR key %u addr 0x%" PRIx64 " len %" PRIu64,
mr->lkey, mr->addr, mr->len);
return mr;
}
if (second_try) {
DRV_LOG(ERR, "Internal error second try failed");
return NULL;
}
ret = mana_new_pmd_mr(local_mr_btree, priv, pool);
if (ret) {
DRV_LOG(ERR, "Failed to allocate MR ret %d addr %p len %d",
ret, mbuf->buf_addr, mbuf->buf_len);
return NULL;
}
second_try = 1;
goto try_again;
}
/*
 * Deregister every MR held in the per-port tree and empty the tree.
 *
 * Runs under priv->mr_btree_lock. Deregistration goes through
 * mana_del_pmd_mr() so that a failing ibv_dereg_mr() is logged instead of
 * being silently dropped.
 */
void
mana_remove_all_mr(struct mana_priv *priv)
{
	struct mana_mr_btree *bt = &priv->mr_btree;
	uint16_t i;

	rte_spinlock_lock(&priv->mr_btree_lock);
	/* Start with index 1 as the 1st entry is always NULL */
	for (i = 1; i < bt->len; i++)
		mana_del_pmd_mr(&bt->table[i]);

	/*
	 * Keep only the placeholder entry. A tree without a table is left
	 * untouched so that len never claims an entry that does not exist.
	 */
	if (bt->table != NULL)
		bt->len = 1;
	rte_spinlock_unlock(&priv->mr_btree_lock);
}
/*
* Expand the MR cache.
* MR cache is maintained as a btree and expand on demand.
*/
static int
mana_mr_btree_expand(struct mana_mr_btree *bt, int n)
{
void *mem;
mem = rte_realloc_socket(bt->table, n * sizeof(struct mana_mr_cache),
0, bt->socket);
if (!mem) {
DRV_LOG(ERR, "Failed to expand btree size %d", n);
return -1;
}
DRV_LOG(ERR, "Expanded btree to size %d", n);
bt->table = mem;
bt->size = n;
return 0;
}
/*
 * Look for a region of memory in MR cache.
 *
 * A full table is doubled first; if that fails the lookup returns NULL. The
 * table is then bisected on the entry start address. *idx receives the index
 * the search stopped at, and that entry is returned only when
 * addr + len does not run past the end of the entry.
 */
struct mana_mr_cache *
mana_mr_btree_lookup(struct mana_mr_btree *bt, uint16_t *idx,
		     uintptr_t addr, size_t len)
{
	struct mana_mr_cache *entries;
	struct mana_mr_cache *candidate;
	uint16_t remaining = bt->len;
	uint16_t lo = 0;

	/* Try to double the cache if it's full */
	if (remaining == bt->size &&
	    mana_mr_btree_expand(bt, bt->size << 1) != 0)
		return NULL;

	entries = bt->table;

	/* Halve the search window until a single slot is left */
	do {
		uint16_t half = remaining >> 1;

		if (addr < entries[lo + half].addr) {
			remaining = half;
		} else {
			lo += half;
			remaining -= half;
		}
	} while (remaining > 1);

	*idx = lo;
	candidate = &entries[lo];
	if (addr + len <= candidate->addr + candidate->len)
		return candidate;

	DRV_LOG(DEBUG,
		"addr 0x%" PRIx64 " len %zu idx %u sum 0x%" PRIx64 " not found",
		addr, len, *idx, addr + len);

	return NULL;
}
int
mana_mr_btree_init(struct mana_mr_btree *bt, int n, int socket)
{
memset(bt, 0, sizeof(*bt));
bt->table = rte_calloc_socket("MANA B-tree table",
n,
sizeof(struct mana_mr_cache),
0, socket);
if (!bt->table) {
DRV_LOG(ERR, "Failed to allocate B-tree n %d socket %d",
n, socket);
return -ENOMEM;
}
bt->socket = socket;
bt->size = n;
/* First entry must be NULL for binary search to work */
bt->table[0] = (struct mana_mr_cache) {
.lkey = UINT32_MAX,
};
bt->len = 1;
DRV_LOG(ERR, "B-tree initialized table %p size %d len %d",
bt->table, n, bt->len);
return 0;
}
/*
 * Release the entry table of an MR cache and reset the descriptor to
 * all-zero.
 */
void
mana_mr_btree_free(struct mana_mr_btree *bt)
{
	struct mana_mr_cache *entries = bt->table;

	memset(bt, 0, sizeof(*bt));
	rte_free(entries);
}
/*
 * Insert a copy of *entry into the MR cache, keeping the table ordered by
 * start address.
 *
 * Returns 0 when the entry was stored, or when an existing entry already
 * covers [entry->addr, entry->addr + entry->len). Returns -1 and sets
 * bt->overflow when the table has no free slot.
 */
int
mana_mr_btree_insert(struct mana_mr_btree *bt, struct mana_mr_cache *entry)
{
	struct mana_mr_cache *table;
	uint16_t idx = 0;
	size_t shift;

	if (mana_mr_btree_lookup(bt, &idx, entry->addr, entry->len)) {
		DRV_LOG(DEBUG, "Addr 0x%" PRIx64 " len %zu exists in btree",
			entry->addr, entry->len);
		return 0;
	}

	if (bt->len >= bt->size) {
		bt->overflow = 1;
		return -1;
	}

	table = bt->table;

	/* The new entry goes right after the slot the lookup stopped at */
	idx++;

	/*
	 * Byte count of the tail that has to move up by one slot. It must be
	 * a size_t: a 16-bit count wraps once the tail exceeds 65535 bytes,
	 * which would make memmove() shift too little and corrupt the table.
	 */
	shift = (size_t)(bt->len - idx) * sizeof(struct mana_mr_cache);
	if (shift) {
		DRV_LOG(DEBUG, "Moving %zu bytes from idx %u to %u",
			shift, idx, idx + 1);
		memmove(&table[idx + 1], &table[idx], shift);
	}

	table[idx] = *entry;
	bt->len++;

	DRV_LOG(DEBUG,
		"Inserted MR b-tree table %p idx %d addr 0x%" PRIx64 " len %zu",
		table, idx, entry->addr, entry->len);

	return 0;
}