net/mlx4: support secondary process

In order to support secondary process, a few features are required.

a) rdma-core library should allocate device resources using DPDK's
   memory allocator.

b) UAR should be remapped for secondary processes. Currently, so that
   secondary processes do not need a different data structure, the PMD
   tries to reserve an identical virtual address range in both the
   primary and the secondary processes.

c) IPC channel is necessary, which can be easily set with rte_mp APIs.
   Through the channel, Verbs command FD is delivered to the secondary
   process and the device stop/start event is also broadcast from
   primary process.

Signed-off-by: Yongseok Koh <yskoh@mellanox.com>
Acked-by: Shahaf Shuler <shahafs@mellanox.com>
This commit is contained in:
Yongseok Koh 2019-04-01 14:15:53 -07:00 committed by Ferruh Yigit
parent 8e49376400
commit 0203d33a10
12 changed files with 898 additions and 23 deletions

View File

@ -29,6 +29,7 @@ Packet type parsing = Y
Basic stats = Y
Stats per queue = Y
FW version = Y
Multiprocess aware = Y
Other kdrv = Y
Power8 = Y
x86-32 = Y

View File

@ -145,6 +145,16 @@ below.
Limitations
-----------
- For secondary process:
- Forked secondary process not supported.
- All mempools must be initialized before rte_eth_dev_start().
- External memory that is not registered in the EAL memseg list cannot be used
for DMA unless it has been registered by ``mlx4_mr_update_ext_mp()`` in the
primary process and remapped to the same virtual address in the secondary
process. If the external memory is registered by the primary process but has
a different virtual address in the secondary process, unexpected errors may occur.
- CRC stripping is supported by default and always reported as "true".
The ability to enable/disable CRC stripping requires OFED version
4.3-1.5.0.0 and above or rdma-core version v18 and above.

View File

@ -18,6 +18,7 @@ ifneq ($(CONFIG_RTE_IBVERBS_LINK_DLOPEN),y)
SRCS-$(CONFIG_RTE_LIBRTE_MLX4_PMD) += mlx4_glue.c
endif
SRCS-$(CONFIG_RTE_LIBRTE_MLX4_PMD) += mlx4_intr.c
SRCS-$(CONFIG_RTE_LIBRTE_MLX4_PMD) += mlx4_mp.c
SRCS-$(CONFIG_RTE_LIBRTE_MLX4_PMD) += mlx4_mr.c
SRCS-$(CONFIG_RTE_LIBRTE_MLX4_PMD) += mlx4_rxq.c
SRCS-$(CONFIG_RTE_LIBRTE_MLX4_PMD) += mlx4_rxtx.c
@ -92,6 +93,11 @@ mlx4_autoconf.h.new: $(RTE_SDK)/buildtools/auto-config-h.sh
infiniband/mlx4dv.h \
enum MLX4DV_SET_CTX_ATTR_BUF_ALLOCATORS \
$(AUTOCONF_OUTPUT)
$Q sh -- '$<' '$@' \
HAVE_IBV_MLX4_UAR_MMAP_OFFSET \
infiniband/mlx4dv.h \
enum MLX4DV_QP_MASK_UAR_MMAP_OFFSET \
$(AUTOCONF_OUTPUT)
$Q sh -- '$<' '$@' \
HAVE_IBV_MLX4_WQE_LSO_SEG \
infiniband/mlx4dv.h \

View File

@ -33,6 +33,7 @@ if build
'mlx4_ethdev.c',
'mlx4_flow.c',
'mlx4_intr.c',
'mlx4_mp.c',
'mlx4_mr.c',
'mlx4_rxq.c',
'mlx4_rxtx.c',
@ -76,6 +77,8 @@ if build
has_sym_args = [
[ 'HAVE_IBV_MLX4_BUF_ALLOCATORS', 'infiniband/mlx4dv.h',
'MLX4DV_SET_CTX_ATTR_BUF_ALLOCATORS' ],
[ 'HAVE_IBV_MLX4_UAR_MMAP_OFFSET', 'infiniband/mlx4dv.h',
'MLX4DV_QP_MASK_UAR_MMAP_OFFSET' ],
]
config = configuration_data()
foreach arg:has_sym_args

View File

@ -17,6 +17,7 @@
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/mman.h>
#include <unistd.h>
/* Verbs headers do not support -pedantic. */
@ -48,10 +49,16 @@
#include "mlx4_rxtx.h"
#include "mlx4_utils.h"
struct mlx4_dev_list mlx4_mem_event_cb_list =
LIST_HEAD_INITIALIZER(mlx4_mem_event_cb_list);
static const char *MZ_MLX4_PMD_SHARED_DATA = "mlx4_pmd_shared_data";
rte_rwlock_t mlx4_mem_event_rwlock = RTE_RWLOCK_INITIALIZER;
/* Shared memory between primary and secondary processes. */
struct mlx4_shared_data *mlx4_shared_data;
/* Spinlock for mlx4_shared_data allocation. */
static rte_spinlock_t mlx4_shared_data_lock = RTE_SPINLOCK_INITIALIZER;
/* Process local data for secondary processes. */
static struct mlx4_local_data mlx4_local_data;
/** Configuration structure for device arguments. */
struct mlx4_conf {
@ -69,6 +76,77 @@ const char *pmd_mlx4_init_params[] = {
static void mlx4_dev_stop(struct rte_eth_dev *dev);
/**
* Initialize shared data between primary and secondary process.
*
* A memzone is reserved by primary process and secondary processes attach to
* the memzone.
*
* @return
* 0 on success, a negative errno value otherwise and rte_errno is set.
*/
static int
mlx4_init_shared_data(void)
{
const struct rte_memzone *mz;
int ret = 0;
rte_spinlock_lock(&mlx4_shared_data_lock);
if (mlx4_shared_data == NULL) {
if (rte_eal_process_type() == RTE_PROC_PRIMARY) {
/* Allocate shared memory. */
mz = rte_memzone_reserve(MZ_MLX4_PMD_SHARED_DATA,
sizeof(*mlx4_shared_data),
SOCKET_ID_ANY, 0);
if (mz == NULL) {
ERROR("Cannot allocate mlx4 shared data\n");
ret = -rte_errno;
goto error;
}
mlx4_shared_data = mz->addr;
memset(mlx4_shared_data, 0, sizeof(*mlx4_shared_data));
rte_spinlock_init(&mlx4_shared_data->lock);
} else {
/* Lookup allocated shared memory. */
mz = rte_memzone_lookup(MZ_MLX4_PMD_SHARED_DATA);
if (mz == NULL) {
ERROR("Cannot attach mlx4 shared data\n");
ret = -rte_errno;
goto error;
}
mlx4_shared_data = mz->addr;
memset(&mlx4_local_data, 0, sizeof(mlx4_local_data));
}
}
error:
rte_spinlock_unlock(&mlx4_shared_data_lock);
return ret;
}
/**
 * Tear down the data shared by the primary and secondary processes.
 *
 * When the shared data pointer is set, the primary process looks the memzone
 * up by name and frees it, while any other process only clears its
 * process-local data. In both cases the pointer is reset to NULL afterwards.
 * The whole operation runs under mlx4_shared_data_lock.
 */
static void
mlx4_uninit_shared_data(void)
{
	rte_spinlock_lock(&mlx4_shared_data_lock);
	if (mlx4_shared_data != NULL) {
		if (rte_eal_process_type() != RTE_PROC_PRIMARY) {
			memset(&mlx4_local_data, 0, sizeof(mlx4_local_data));
		} else {
			const struct rte_memzone *zone =
				rte_memzone_lookup(MZ_MLX4_PMD_SHARED_DATA);

			rte_memzone_free(zone);
		}
		mlx4_shared_data = NULL;
	}
	rte_spinlock_unlock(&mlx4_shared_data_lock);
}
#ifdef HAVE_IBV_MLX4_BUF_ALLOCATORS
/**
* Verbs callback to allocate a memory. This function should allocate the space
@ -181,6 +259,11 @@ mlx4_dev_start(struct rte_eth_dev *dev)
return 0;
DEBUG("%p: attaching configured flows to all RX queues", (void *)dev);
priv->started = 1;
ret = mlx4_tx_uar_remap(dev, priv->ctx->cmd_fd);
if (ret) {
ERROR("%p: cannot remap UAR", (void *)dev);
goto err;
}
ret = mlx4_rss_init(priv);
if (ret) {
ERROR("%p: cannot initialize RSS resources: %s",
@ -208,6 +291,8 @@ mlx4_dev_start(struct rte_eth_dev *dev)
rte_wmb();
dev->tx_pkt_burst = mlx4_tx_burst;
dev->rx_pkt_burst = mlx4_rx_burst;
/* Enable datapath on secondary process. */
mlx4_mp_req_start_rxtx(dev);
return 0;
err:
mlx4_dev_stop(dev);
@ -226,6 +311,8 @@ static void
mlx4_dev_stop(struct rte_eth_dev *dev)
{
struct mlx4_priv *priv = dev->data->dev_private;
const size_t page_size = sysconf(_SC_PAGESIZE);
int i;
if (!priv->started)
return;
@ -234,9 +321,20 @@ mlx4_dev_stop(struct rte_eth_dev *dev)
dev->tx_pkt_burst = mlx4_tx_burst_removed;
dev->rx_pkt_burst = mlx4_rx_burst_removed;
rte_wmb();
/* Disable datapath on secondary process. */
mlx4_mp_req_stop_rxtx(dev);
mlx4_flow_sync(priv, NULL);
mlx4_rxq_intr_disable(priv);
mlx4_rss_deinit(priv);
for (i = 0; i != dev->data->nb_tx_queues; ++i) {
struct txq *txq;
txq = dev->data->tx_queues[i];
if (!txq)
continue;
munmap((void *)RTE_ALIGN_FLOOR((uintptr_t)txq->msq.db,
page_size), page_size);
}
}
/**
@ -259,6 +357,8 @@ mlx4_dev_close(struct rte_eth_dev *dev)
dev->rx_pkt_burst = mlx4_rx_burst_removed;
dev->tx_pkt_burst = mlx4_tx_burst_removed;
rte_wmb();
/* Disable datapath on secondary process. */
mlx4_mp_req_stop_rxtx(dev);
mlx4_flow_clean(priv);
mlx4_rss_deinit(priv);
for (i = 0; i != dev->data->nb_rx_queues; ++i)
@ -310,6 +410,14 @@ static const struct eth_dev_ops mlx4_dev_ops = {
.is_removed = mlx4_is_removed,
};
/*
 * Device operations installed on ports attached from a secondary process:
 * only device information, firmware version and statistics callbacks are
 * provided.
 */
static const struct eth_dev_ops mlx4_dev_sec_ops = {
	.dev_infos_get = mlx4_dev_infos_get,
	.fw_version_get = mlx4_fw_version_get,
	.stats_get = mlx4_stats_get,
	.stats_reset = mlx4_stats_reset,
};
/**
* Get PCI information from struct ibv_device.
*
@ -549,6 +657,200 @@ mlx4_hw_rss_sup(struct ibv_context *ctx, struct ibv_pd *pd,
static struct rte_pci_driver mlx4_driver;
/*
 * Memory segment walk callback: keep in *arg the lowest address seen among
 * segments that do not belong to an external segment list. Always returns 0.
 */
static int
find_lower_va_bound(const struct rte_memseg_list *msl,
		    const struct rte_memseg *ms, void *arg)
{
	void **lowest = arg;

	if (!msl->external &&
	    (*lowest == NULL || ms->addr < *lowest))
		*lowest = ms->addr;
	return 0;
}
/**
* Reserve UAR address space for primary process.
*
* Process local resource is used by both primary and secondary to avoid
* duplicate reservation. The space has to be available on both primary and
* secondary process, TXQ UAR maps to this area using fixed mmap w/o double
* check.
*
* @return
* 0 on success, a negative errno value otherwise and rte_errno is set.
*/
static int
mlx4_uar_init_primary(void)
{
struct mlx4_shared_data *sd = mlx4_shared_data;
void *addr = (void *)0;
if (sd->uar_base)
return 0;
/* find out lower bound of hugepage segments */
rte_memseg_walk(find_lower_va_bound, &addr);
/* keep distance to hugepages to minimize potential conflicts. */
addr = RTE_PTR_SUB(addr, (uintptr_t)(MLX4_UAR_OFFSET + MLX4_UAR_SIZE));
/* anonymous mmap, no real memory consumption. */
addr = mmap(addr, MLX4_UAR_SIZE,
PROT_NONE, MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
if (addr == MAP_FAILED) {
ERROR("failed to reserve UAR address space, please"
" adjust MLX4_UAR_SIZE or try --base-virtaddr");
rte_errno = ENOMEM;
return -rte_errno;
}
/* Accept either same addr or a new addr returned from mmap if target
* range occupied.
*/
INFO("reserved UAR address space: %p", addr);
sd->uar_base = addr; /* for primary and secondary UAR re-mmap. */
return 0;
}
/**
 * Release the UAR address range recorded in the shared data, if any, and
 * clear the recorded base address.
 */
static void
mlx4_uar_uninit_primary(void)
{
	struct mlx4_shared_data *shared = mlx4_shared_data;

	if (shared->uar_base != NULL) {
		munmap(shared->uar_base, MLX4_UAR_SIZE);
		shared->uar_base = NULL;
	}
}
/**
 * Reserve UAR address space for secondary process, align with primary process.
 *
 * The reservation is requested at the base address the primary process stored
 * in the shared data. The address is passed to mmap() as a hint only, so the
 * result is compared with the requested address; a mapping placed anywhere
 * else is released again and the call fails. On success the base address is
 * recorded in the process-local data, and later calls return immediately.
 *
 * @return
 *   0 on success, a negative errno value otherwise and rte_errno is set.
 */
static int
mlx4_uar_init_secondary(void)
{
	struct mlx4_shared_data *sd = mlx4_shared_data;
	struct mlx4_local_data *ld = &mlx4_local_data;
	void *addr;

	if (ld->uar_base) { /* Already reserved. */
		assert(sd->uar_base == ld->uar_base);
		return 0;
	}
	assert(sd->uar_base);
	/* anonymous mmap, no real memory consumption. */
	addr = mmap(sd->uar_base, MLX4_UAR_SIZE,
		    PROT_NONE, MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
	if (addr == MAP_FAILED) {
		ERROR("UAR mmap failed: %p size: %llu",
		      sd->uar_base, MLX4_UAR_SIZE);
		rte_errno = ENXIO;
		return -rte_errno;
	}
	if (sd->uar_base != addr) {
		ERROR("UAR address %p size %llu occupied, please"
		      " adjust MLX4_UAR_OFFSET or try EAL parameter"
		      " --base-virtaddr",
		      sd->uar_base, MLX4_UAR_SIZE);
		/*
		 * The mapping was created at an unusable address; drop it so
		 * that the reservation is not leaked on this failure path.
		 */
		munmap(addr, MLX4_UAR_SIZE);
		rte_errno = ENXIO;
		return -rte_errno;
	}
	ld->uar_base = addr;
	INFO("reserved UAR address space: %p", addr);
	return 0;
}
/**
 * Release the UAR address range recorded in the process-local data, if any,
 * and clear the recorded base address.
 */
static void
mlx4_uar_uninit_secondary(void)
{
	struct mlx4_local_data *local = &mlx4_local_data;

	if (local->uar_base != NULL) {
		munmap(local->uar_base, MLX4_UAR_SIZE);
		local->uar_base = NULL;
	}
}
/**
* PMD global initialization.
*
* Independent from individual device, this function initializes global
* per-PMD data structures distinguishing primary and secondary processes.
* Hence, each initialization is called once per a process.
*
* The shared data is set up first, then the remaining steps run with the
* spinlock embedded in the shared data held. Completion is recorded in
* sd->init_done for the primary process and in ld->init_done for a secondary
* process, which makes later calls return early. On failure the steps
* already performed for the current process type are undone and the shared
* data is uninitialized.
*
* @return
* 0 on success, a negative errno value otherwise and rte_errno is set.
*/
static int
mlx4_init_once(void)
{
struct mlx4_shared_data *sd;
struct mlx4_local_data *ld = &mlx4_local_data;
int ret;
if (mlx4_init_shared_data())
return -rte_errno;
sd = mlx4_shared_data;
assert(sd);
rte_spinlock_lock(&sd->lock);
switch (rte_eal_process_type()) {
case RTE_PROC_PRIMARY:
/* Already initialized by an earlier call. */
if (sd->init_done)
break;
/* Device list walked by the memory event callback, and its lock. */
LIST_INIT(&sd->mem_event_cb_list);
rte_rwlock_init(&sd->mem_event_rwlock);
rte_mem_event_callback_register("MLX4_MEM_EVENT_CB",
mlx4_mr_mem_event_cb, NULL);
/* Register the IPC handler before reserving the UAR range. */
mlx4_mp_init_primary();
ret = mlx4_uar_init_primary();
if (ret)
goto error;
sd->init_done = true;
break;
case RTE_PROC_SECONDARY:
/* Already initialized by an earlier call in this process. */
if (ld->init_done)
break;
mlx4_mp_init_secondary();
ret = mlx4_uar_init_secondary();
if (ret)
goto error;
/* Counter checked by the primary before broadcasting Rx/Tx requests. */
++sd->secondary_cnt;
ld->init_done = true;
break;
default:
break;
}
rte_spinlock_unlock(&sd->lock);
return 0;
error:
/* Undo the steps above for the current process type, lock still held. */
switch (rte_eal_process_type()) {
case RTE_PROC_PRIMARY:
mlx4_uar_uninit_primary();
mlx4_mp_uninit_primary();
rte_mem_event_callback_unregister("MLX4_MEM_EVENT_CB", NULL);
break;
case RTE_PROC_SECONDARY:
mlx4_uar_uninit_secondary();
mlx4_mp_uninit_secondary();
break;
default:
break;
}
/* The lock lives in the shared data, so release it before the teardown. */
rte_spinlock_unlock(&sd->lock);
mlx4_uninit_shared_data();
return -rte_errno;
}
/**
* DPDK callback to register a PCI device.
*
@ -579,6 +881,12 @@ mlx4_pci_probe(struct rte_pci_driver *pci_drv, struct rte_pci_device *pci_dev)
int i;
(void)pci_drv;
err = mlx4_init_once();
if (err) {
ERROR("unable to init PMD global data: %s",
strerror(rte_errno));
return -rte_errno;
}
assert(pci_drv == &mlx4_driver);
list = mlx4_glue->get_device_list(&i);
if (list == NULL) {
@ -659,6 +967,7 @@ mlx4_pci_probe(struct rte_pci_driver *pci_drv, struct rte_pci_device *pci_dev)
struct mlx4_priv *priv = NULL;
struct rte_eth_dev *eth_dev = NULL;
struct ether_addr mac;
char name[RTE_ETH_NAME_MAX_LEN];
/* If port is not enabled, skip. */
if (!(conf.ports.enabled & (1 << i)))
@ -669,6 +978,51 @@ mlx4_pci_probe(struct rte_pci_driver *pci_drv, struct rte_pci_device *pci_dev)
err = ENODEV;
goto port_error;
}
snprintf(name, sizeof(name), "%s port %u",
mlx4_glue->get_device_name(ibv_dev), port);
if (rte_eal_process_type() == RTE_PROC_SECONDARY) {
eth_dev = rte_eth_dev_attach_secondary(name);
if (eth_dev == NULL) {
ERROR("can not attach rte ethdev");
rte_errno = ENOMEM;
err = rte_errno;
goto error;
}
priv = eth_dev->data->dev_private;
if (!priv->verbs_alloc_ctx.enabled) {
ERROR("secondary process is not supported"
" due to lack of external allocator"
" from Verbs");
rte_errno = ENOTSUP;
err = rte_errno;
goto error;
}
eth_dev->device = &pci_dev->device;
eth_dev->dev_ops = &mlx4_dev_sec_ops;
/* Receive command fd from primary process. */
err = mlx4_mp_req_verbs_cmd_fd(eth_dev);
if (err < 0) {
err = rte_errno;
goto error;
}
/* Remap UAR for Tx queues. */
err = mlx4_tx_uar_remap(eth_dev, err);
if (err) {
err = rte_errno;
goto error;
}
/*
* Ethdev pointer is still required as input since
* the primary device is not accessible from the
* secondary process.
*/
eth_dev->tx_pkt_burst = mlx4_tx_burst;
eth_dev->rx_pkt_burst = mlx4_rx_burst;
claim_zero(mlx4_glue->close_device(ctx));
rte_eth_copy_pci_info(eth_dev, pci_dev);
rte_eth_dev_probing_finish(eth_dev);
continue;
}
/* Check port status. */
err = mlx4_glue->query_port(ctx, port, &port_attr);
if (err) {
@ -774,14 +1128,7 @@ mlx4_pci_probe(struct rte_pci_driver *pci_drv, struct rte_pci_device *pci_dev)
/* Get actual MTU if possible. */
mlx4_mtu_get(priv, &priv->mtu);
DEBUG("port %u MTU is %u", priv->port, priv->mtu);
/* from rte_ethdev.c */
{
char name[RTE_ETH_NAME_MAX_LEN];
snprintf(name, sizeof(name), "%s port %u",
mlx4_glue->get_device_name(ibv_dev), port);
eth_dev = rte_eth_dev_allocate(name);
}
eth_dev = rte_eth_dev_allocate(name);
if (eth_dev == NULL) {
err = ENOMEM;
ERROR("can not allocate rte ethdev");
@ -818,9 +1165,13 @@ mlx4_pci_probe(struct rte_pci_driver *pci_drv, struct rte_pci_device *pci_dev)
.free = &mlx4_free_verbs_buf,
.data = priv,
};
mlx4_glue->dv_set_context_attr
err = mlx4_glue->dv_set_context_attr
(ctx, MLX4DV_SET_CTX_ATTR_BUF_ALLOCATORS,
(void *)((uintptr_t)&alctr));
if (err)
WARN("Verbs external allocator is not supported");
else
priv->verbs_alloc_ctx.enabled = 1;
#endif
/* Bring Ethernet device up. */
DEBUG("forcing Ethernet interface up");
@ -842,9 +1193,10 @@ mlx4_pci_probe(struct rte_pci_driver *pci_drv, struct rte_pci_device *pci_dev)
goto port_error;
}
/* Add device to memory callback list. */
rte_rwlock_write_lock(&mlx4_mem_event_rwlock);
LIST_INSERT_HEAD(&mlx4_mem_event_cb_list, priv, mem_event_cb);
rte_rwlock_write_unlock(&mlx4_mem_event_rwlock);
rte_rwlock_write_lock(&mlx4_shared_data->mem_event_rwlock);
LIST_INSERT_HEAD(&mlx4_shared_data->mem_event_cb_list,
priv, mem_event_cb);
rte_rwlock_write_unlock(&mlx4_shared_data->mem_event_rwlock);
rte_eth_dev_probing_finish(eth_dev);
continue;
port_error:
@ -1075,8 +1427,6 @@ RTE_INIT(rte_mlx4_pmd_init)
}
mlx4_glue->fork_init();
rte_pci_register(&mlx4_driver);
rte_mem_event_callback_register("MLX4_MEM_EVENT_CB",
mlx4_mr_mem_event_cb, NULL);
}
RTE_PMD_EXPORT_NAME(net_mlx4, __COUNTER__);

View File

@ -53,6 +53,16 @@
/** Port parameter. */
#define MLX4_PMD_PORT_KVARG "port"
/* Reserved address space for UAR mapping: 2^(4 * sizeof(uintptr_t)) bytes,
* i.e. 4 GiB where uintptr_t is 8 bytes wide.
*/
#define MLX4_UAR_SIZE (1ULL << (sizeof(uintptr_t) * 4))
/* Offset of reserved UAR address space to hugepage memory. Offset is used here
* to minimize possibility of address next to hugepage being used by other code
* in either primary or secondary process, failing to map TX UAR would make TX
* packets invisible to HW. The value is twice MLX4_UAR_SIZE.
*/
#define MLX4_UAR_OFFSET (2ULL << (sizeof(uintptr_t) * 4))
enum {
PCI_VENDOR_ID_MELLANOX = 0x15b3,
};
@ -63,6 +73,26 @@ enum {
PCI_DEVICE_ID_MELLANOX_CONNECTX3PRO = 0x1007,
};
/* Request types for IPC. */
enum mlx4_mp_req_type {
/* Sent by a secondary process to obtain the Verbs command FD. */
MLX4_MP_REQ_VERBS_CMD_FD = 1,
/* Broadcast by the primary process to enable the datapath. */
MLX4_MP_REQ_START_RXTX,
/* Broadcast by the primary process to disable the datapath. */
MLX4_MP_REQ_STOP_RXTX,
};
/* Parameters for IPC, carried in the param area of an rte_mp_msg. */
struct mlx4_mp_param {
enum mlx4_mp_req_type type; /* Request type. */
int port_id; /* Ethernet port ID the request applies to. */
int result; /* Filled in replies: 0 on success, non-zero on failure. */
};
/** Request timeout for IPC. */
#define MLX4_MP_REQ_TIMEOUT_SEC 5
/** Key string for IPC. */
#define MLX4_MP_NAME "net_mlx4_mp"
/** Driver name reported to lower layers and used in log output. */
#define MLX4_DRIVER_NAME "net_mlx4"
@ -86,6 +116,7 @@ enum mlx4_verbs_alloc_type {
* resources it is allocating.
*/
struct mlx4_verbs_alloc_ctx {
int enabled;
enum mlx4_verbs_alloc_type type; /* Kind of object being allocated. */
const void *obj; /* Pointer to the DPDK object. */
};
@ -93,6 +124,27 @@ struct mlx4_verbs_alloc_ctx {
LIST_HEAD(mlx4_dev_list, mlx4_priv);
LIST_HEAD(mlx4_mr_list, mlx4_mr);
/* Shared data between primary and secondary processes. */
struct mlx4_shared_data {
rte_spinlock_t lock;
/* Global spinlock for primary and secondary processes. */
int init_done; /* Whether primary has done initialization. */
unsigned int secondary_cnt; /* Number of secondary processes init'd. */
void *uar_base;
/* Reserved UAR address space for TXQ UAR(hw doorbell) mapping. */
struct mlx4_dev_list mem_event_cb_list;
/* Devices iterated by the memory event callback. */
rte_rwlock_t mem_event_rwlock;
/* Read-write lock protecting mem_event_cb_list. */
};
/* Per-process data structure, not visible to other processes. */
struct mlx4_local_data {
int init_done; /* Whether a secondary has done initialization. */
void *uar_base;
/* Reserved UAR address space for TXQ UAR(hw doorbell) mapping. In a
* secondary process it is set to the same address as the shared uar_base.
*/
};
extern struct mlx4_shared_data *mlx4_shared_data;
/** Private data structure. */
struct mlx4_priv {
LIST_ENTRY(mlx4_priv) mem_event_cb;
@ -175,4 +227,13 @@ void mlx4_rxq_intr_disable(struct mlx4_priv *priv);
int mlx4_rx_intr_disable(struct rte_eth_dev *dev, uint16_t idx);
int mlx4_rx_intr_enable(struct rte_eth_dev *dev, uint16_t idx);
/* mlx4_mp.c */
void mlx4_mp_req_start_rxtx(struct rte_eth_dev *dev);
void mlx4_mp_req_stop_rxtx(struct rte_eth_dev *dev);
int mlx4_mp_req_verbs_cmd_fd(struct rte_eth_dev *dev);
void mlx4_mp_init_primary(void);
void mlx4_mp_uninit_primary(void);
void mlx4_mp_init_secondary(void);
void mlx4_mp_uninit_secondary(void);
#endif /* RTE_PMD_MLX4_H_ */

304
drivers/net/mlx4/mlx4_mp.c Normal file
View File

@ -0,0 +1,304 @@
/* SPDX-License-Identifier: BSD-3-Clause
* Copyright 2019 6WIND S.A.
* Copyright 2019 Mellanox Technologies, Ltd
*/
#include <assert.h>
#include <stdio.h>
#include <time.h>
#include <rte_eal.h>
#include <rte_ethdev_driver.h>
#include <rte_string_fns.h>
#include "mlx4.h"
#include "mlx4_rxtx.h"
#include "mlx4_utils.h"
/**
 * Prepare an IPC message.
 *
 * The message is zeroed, named MLX4_MP_NAME and its parameter area is filled
 * with the request type and the port ID of the given device.
 *
 * @param[in] dev
 *   Pointer to Ethernet structure.
 * @param[out] msg
 *   Pointer to message to fill in.
 * @param[in] type
 *   Message type.
 */
static inline void
mp_init_msg(struct rte_eth_dev *dev, struct rte_mp_msg *msg,
	    enum mlx4_mp_req_type type)
{
	struct mlx4_mp_param *payload;

	memset(msg, 0, sizeof(*msg));
	strlcpy(msg->name, MLX4_MP_NAME, sizeof(msg->name));
	payload = (struct mlx4_mp_param *)msg->param;
	payload->port_id = dev->data->port_id;
	payload->type = type;
	msg->len_param = sizeof(*payload);
}
/**
* IPC message handler of primary process.
*
* @param[in] dev
* Pointer to Ethernet structure.
* @param[in] peer
* Pointer to the peer socket path.
*
* @return
* 0 on success, negative errno value otherwise and rte_errno is set.
*/
static int
mp_primary_handle(const struct rte_mp_msg *mp_msg, const void *peer)
{
struct rte_mp_msg mp_res;
struct mlx4_mp_param *res = (struct mlx4_mp_param *)mp_res.param;
const struct mlx4_mp_param *param =
(const struct mlx4_mp_param *)mp_msg->param;
struct rte_eth_dev *dev;
struct mlx4_priv *priv;
int ret;
assert(rte_eal_process_type() == RTE_PROC_PRIMARY);
if (!rte_eth_dev_is_valid_port(param->port_id)) {
rte_errno = ENODEV;
ERROR("port %u invalid port ID", param->port_id);
return -rte_errno;
}
dev = &rte_eth_devices[param->port_id];
priv = dev->data->dev_private;
switch (param->type) {
case MLX4_MP_REQ_VERBS_CMD_FD:
mp_init_msg(dev, &mp_res, param->type);
mp_res.num_fds = 1;
mp_res.fds[0] = priv->ctx->cmd_fd;
res->result = 0;
ret = rte_mp_reply(&mp_res, peer);
break;
default:
rte_errno = EINVAL;
ERROR("port %u invalid mp request type", dev->data->port_id);
return -rte_errno;
}
return ret;
}
/**
* IPC message handler of a secondary process.
*
* Serves the start/stop datapath requests by switching the Rx/Tx burst
* callbacks of the requested port and replying with a zero result.
*
* @param[in] mp_msg
* Pointer to the received request message.
* @param[in] peer
* Pointer to the peer socket path.
*
* @return
* 0 on success, a negative errno value otherwise and rte_errno is set.
*/
static int
mp_secondary_handle(const struct rte_mp_msg *mp_msg, const void *peer)
{
struct rte_mp_msg mp_res;
struct mlx4_mp_param *res = (struct mlx4_mp_param *)mp_res.param;
const struct mlx4_mp_param *param =
(const struct mlx4_mp_param *)mp_msg->param;
struct rte_eth_dev *dev;
int ret;
assert(rte_eal_process_type() == RTE_PROC_SECONDARY);
if (!rte_eth_dev_is_valid_port(param->port_id)) {
rte_errno = ENODEV;
ERROR("port %u invalid port ID", param->port_id);
return -rte_errno;
}
dev = &rte_eth_devices[param->port_id];
switch (param->type) {
case MLX4_MP_REQ_START_RXTX:
INFO("port %u starting datapath", dev->data->port_id);
/* Full barrier before the real burst functions are installed. */
rte_mb();
dev->tx_pkt_burst = mlx4_tx_burst;
dev->rx_pkt_burst = mlx4_rx_burst;
mp_init_msg(dev, &mp_res, param->type);
res->result = 0;
ret = rte_mp_reply(&mp_res, peer);
break;
case MLX4_MP_REQ_STOP_RXTX:
INFO("port %u stopping datapath", dev->data->port_id);
dev->tx_pkt_burst = mlx4_tx_burst_removed;
dev->rx_pkt_burst = mlx4_rx_burst_removed;
/* Full barrier after the stub burst functions are installed. */
rte_mb();
mp_init_msg(dev, &mp_res, param->type);
res->result = 0;
ret = rte_mp_reply(&mp_res, peer);
break;
default:
rte_errno = EINVAL;
ERROR("port %u invalid mp request type", dev->data->port_id);
return -rte_errno;
}
return ret;
}
/**
 * Broadcast request of stopping/starting data-path to secondary processes.
 *
 * Nothing is sent when no secondary process has registered itself in the
 * shared data or when the request type is neither start nor stop. Failures
 * are only logged: the request itself failing, a secondary not answering, or
 * a secondary reporting a non-zero result.
 *
 * @param[in] dev
 *   Pointer to Ethernet structure.
 * @param[in] type
 *   Request type.
 */
static void
mp_req_on_rxtx(struct rte_eth_dev *dev, enum mlx4_mp_req_type type)
{
	struct rte_mp_msg mp_req;
	struct rte_mp_msg *mp_res;
	struct rte_mp_reply mp_rep;
	struct mlx4_mp_param *res __rte_unused;
	struct timespec ts = {.tv_sec = MLX4_MP_REQ_TIMEOUT_SEC, .tv_nsec = 0};
	int ret;
	int i;

	assert(rte_eal_process_type() == RTE_PROC_PRIMARY);
	if (!mlx4_shared_data->secondary_cnt)
		return;
	if (type != MLX4_MP_REQ_START_RXTX && type != MLX4_MP_REQ_STOP_RXTX) {
		ERROR("port %u unknown request (req_type %d)",
		      dev->data->port_id, type);
		return;
	}
	/*
	 * The reply is released unconditionally at the exit label, so it must
	 * hold a defined (NULL) message pointer even if the request returns
	 * before filling it in.
	 */
	memset(&mp_rep, 0, sizeof(mp_rep));
	mp_init_msg(dev, &mp_req, type);
	ret = rte_mp_request_sync(&mp_req, &mp_rep, &ts);
	if (ret) {
		ERROR("port %u failed to request stop/start Rx/Tx (%d)",
		      dev->data->port_id, type);
		goto exit;
	}
	if (mp_rep.nb_sent != mp_rep.nb_received) {
		ERROR("port %u not all secondaries responded (req_type %d)",
		      dev->data->port_id, type);
		goto exit;
	}
	/* Report the first secondary that answered with a failure. */
	for (i = 0; i < mp_rep.nb_received; i++) {
		mp_res = &mp_rep.msgs[i];
		res = (struct mlx4_mp_param *)mp_res->param;
		if (res->result) {
			ERROR("port %u request failed on secondary #%d",
			      dev->data->port_id, i);
			goto exit;
		}
	}
exit:
	free(mp_rep.msgs);
}
/**
* Broadcast request of starting data-path to secondary processes. The request
* is synchronous. This is a wrapper passing MLX4_MP_REQ_START_RXTX to
* mp_req_on_rxtx(); nothing is returned to the caller.
*
* @param[in] dev
* Pointer to Ethernet structure.
*/
void
mlx4_mp_req_start_rxtx(struct rte_eth_dev *dev)
{
mp_req_on_rxtx(dev, MLX4_MP_REQ_START_RXTX);
}
/**
* Broadcast request of stopping data-path to secondary processes. The request
* is synchronous. This is a wrapper passing MLX4_MP_REQ_STOP_RXTX to
* mp_req_on_rxtx(); nothing is returned to the caller.
*
* @param[in] dev
* Pointer to Ethernet structure.
*/
void
mlx4_mp_req_stop_rxtx(struct rte_eth_dev *dev)
{
mp_req_on_rxtx(dev, MLX4_MP_REQ_STOP_RXTX);
}
/**
 * Request the Verbs command file descriptor of a port from the primary
 * process. Must be called from a secondary process; the request is
 * synchronous.
 *
 * @param[in] dev
 *   Pointer to Ethernet structure.
 *
 * @return
 *   fd on success, a negative errno value otherwise and rte_errno is set.
 */
int
mlx4_mp_req_verbs_cmd_fd(struct rte_eth_dev *dev)
{
	struct rte_mp_msg mp_req;
	struct rte_mp_msg *mp_res;
	struct rte_mp_reply mp_rep;
	struct mlx4_mp_param *res;
	struct timespec ts = {.tv_sec = MLX4_MP_REQ_TIMEOUT_SEC, .tv_nsec = 0};
	int ret;

	assert(rte_eal_process_type() == RTE_PROC_SECONDARY);
	/*
	 * The reply is released at the exit label on every path, so it must
	 * hold a defined (NULL) message pointer even if the request returns
	 * before filling it in.
	 */
	memset(&mp_rep, 0, sizeof(mp_rep));
	mp_init_msg(dev, &mp_req, MLX4_MP_REQ_VERBS_CMD_FD);
	ret = rte_mp_request_sync(&mp_req, &mp_rep, &ts);
	if (ret) {
		ERROR("port %u request to primary process failed",
		      dev->data->port_id);
		ret = -rte_errno;
		goto exit;
	}
	assert(mp_rep.nb_received == 1);
	mp_res = &mp_rep.msgs[0];
	res = (struct mlx4_mp_param *)mp_res->param;
	if (res->result) {
		rte_errno = -res->result;
		ERROR("port %u failed to get command FD from primary process",
		      dev->data->port_id);
		ret = -rte_errno;
		goto exit;
	}
	assert(mp_res->num_fds == 1);
	/* The descriptor is delivered in the reply's FD array. */
	ret = mp_res->fds[0];
	DEBUG("port %u command FD from primary is %d",
	      dev->data->port_id, ret);
exit:
	free(mp_rep.msgs);
	return ret;
}
/**
* Initialize by primary process: register mp_primary_handle() as the IPC
* action for MLX4_MP_NAME. The registration result is not checked.
*/
void
mlx4_mp_init_primary(void)
{
assert(rte_eal_process_type() == RTE_PROC_PRIMARY);
rte_mp_action_register(MLX4_MP_NAME, mp_primary_handle);
}
/**
* Un-initialize by primary process: unregister the IPC action registered
* under MLX4_MP_NAME.
*/
void
mlx4_mp_uninit_primary(void)
{
assert(rte_eal_process_type() == RTE_PROC_PRIMARY);
rte_mp_action_unregister(MLX4_MP_NAME);
}
/**
* Initialize by secondary process: register mp_secondary_handle() as the IPC
* action for MLX4_MP_NAME. The registration result is not checked.
*/
void
mlx4_mp_init_secondary(void)
{
assert(rte_eal_process_type() == RTE_PROC_SECONDARY);
rte_mp_action_register(MLX4_MP_NAME, mp_secondary_handle);
}
/**
* Un-initialize by secondary process: unregister the IPC action registered
* under MLX4_MP_NAME.
*/
void
mlx4_mp_uninit_secondary(void)
{
assert(rte_eal_process_type() == RTE_PROC_SECONDARY);
rte_mp_action_unregister(MLX4_MP_NAME);
}

View File

@ -489,6 +489,8 @@ mlx4_mr_garbage_collect(struct rte_eth_dev *dev)
struct mlx4_mr *mr_next;
struct mlx4_mr_list free_list = LIST_HEAD_INITIALIZER(free_list);
/* Must be called from the primary process. */
assert(rte_eal_process_type() == RTE_PROC_PRIMARY);
/*
* MR can't be freed with holding the lock because rte_free() could call
* memory free callback function. This will be a deadlock situation.
@ -561,6 +563,14 @@ mlx4_mr_create(struct rte_eth_dev *dev, struct mlx4_mr_cache *entry,
DEBUG("port %u creating a MR using address (%p)",
dev->data->port_id, (void *)addr);
if (rte_eal_process_type() != RTE_PROC_PRIMARY) {
WARN("port %u using address (%p) of unregistered mempool"
" in secondary process, please create mempool"
" before rte_eth_dev_start()",
dev->data->port_id, (void *)addr);
rte_errno = EPERM;
goto err_nolock;
}
/*
* Release detached MRs if any. This can't be called with holding either
* memory_hotplug_lock or priv->mr.rwlock. MRs on the free list have
@ -890,14 +900,17 @@ mlx4_mr_mem_event_cb(enum rte_mem_event event_type, const void *addr,
size_t len, void *arg __rte_unused)
{
struct mlx4_priv *priv;
struct mlx4_dev_list *dev_list = &mlx4_shared_data->mem_event_cb_list;
/* Must be called from the primary process. */
assert(rte_eal_process_type() == RTE_PROC_PRIMARY);
switch (event_type) {
case RTE_MEM_EVENT_FREE:
rte_rwlock_read_lock(&mlx4_mem_event_rwlock);
rte_rwlock_read_lock(&mlx4_shared_data->mem_event_rwlock);
/* Iterate all the existing mlx4 devices. */
LIST_FOREACH(priv, &mlx4_mem_event_cb_list, mem_event_cb)
LIST_FOREACH(priv, dev_list, mem_event_cb)
mlx4_mr_mem_event_free_cb(ETH_DEV(priv), addr, len);
rte_rwlock_read_unlock(&mlx4_mem_event_rwlock);
rte_rwlock_read_unlock(&mlx4_shared_data->mem_event_rwlock);
break;
case RTE_MEM_EVENT_ALLOC:
default:
@ -1130,6 +1143,7 @@ mlx4_mr_update_ext_mp_cb(struct rte_mempool *mp, void *opaque,
struct mlx4_mr_cache entry;
uint32_t lkey;
assert(rte_eal_process_type() == RTE_PROC_PRIMARY);
/* If already registered, it should return. */
rte_rwlock_read_lock(&priv->mr.rwlock);
lkey = mr_lookup_dev(dev, &entry, addr);
@ -1225,6 +1239,14 @@ mlx4_tx_update_ext_mp(struct txq *txq, uintptr_t addr, struct rte_mempool *mp)
struct mlx4_mr_ctrl *mr_ctrl = &txq->mr_ctrl;
struct mlx4_priv *priv = txq->priv;
if (rte_eal_process_type() != RTE_PROC_PRIMARY) {
WARN("port %u using address (%p) from unregistered mempool"
" having externally allocated memory"
" in secondary process, please create mempool"
" prior to rte_eth_dev_start()",
PORT_ID(priv), (void *)addr);
return UINT32_MAX;
}
mlx4_mr_update_ext_mp(ETH_DEV(priv), mr_ctrl, mp);
return mlx4_tx_addr2mr_bh(txq, addr);
}
@ -1336,9 +1358,9 @@ mlx4_mr_release(struct rte_eth_dev *dev)
struct mlx4_mr *mr_next = LIST_FIRST(&priv->mr.mr_list);
/* Remove from memory callback device list. */
rte_rwlock_write_lock(&mlx4_mem_event_rwlock);
rte_rwlock_write_lock(&mlx4_shared_data->mem_event_rwlock);
LIST_REMOVE(priv, mem_event_cb);
rte_rwlock_write_unlock(&mlx4_mem_event_rwlock);
rte_rwlock_write_unlock(&mlx4_shared_data->mem_event_rwlock);
#ifndef NDEBUG
mlx4_mr_dump_dev(dev);
#endif

View File

@ -77,7 +77,9 @@ struct mlx4_sq {
uint32_t owner_opcode;
/**< Default owner opcode with HW valid owner bit. */
uint32_t stamp; /**< Stamp value with an invalid HW owner bit. */
volatile uint32_t *db; /**< Pointer to the doorbell. */
volatile uint32_t *qp_sdb; /**< Pointer to the doorbell. */
volatile uint32_t *db; /**< Pointer to the doorbell remapped. */
off_t uar_mmap_offset; /* UAR mmap offset for non-primary process. */
uint32_t doorbell_qpn; /**< qp number to write to the doorbell. */
};

View File

@ -1365,6 +1365,7 @@ mlx4_tx_burst_removed(void *dpdk_txq, struct rte_mbuf **pkts, uint16_t pkts_n)
(void)dpdk_txq;
(void)pkts;
(void)pkts_n;
rte_mb();
return 0;
}
@ -1390,5 +1391,6 @@ mlx4_rx_burst_removed(void *dpdk_rxq, struct rte_mbuf **pkts, uint16_t pkts_n)
(void)dpdk_rxq;
(void)pkts;
(void)pkts_n;
rte_mb();
return 0;
}

View File

@ -152,6 +152,7 @@ uint16_t mlx4_rx_burst_removed(void *dpdk_rxq, struct rte_mbuf **pkts,
/* mlx4_txq.c */
int mlx4_tx_uar_remap(struct rte_eth_dev *dev, int fd);
uint64_t mlx4_get_tx_port_offloads(struct mlx4_priv *priv);
int mlx4_tx_queue_setup(struct rte_eth_dev *dev, uint16_t idx,
uint16_t desc, unsigned int socket,

View File

@ -13,7 +13,9 @@
#include <stddef.h>
#include <stdint.h>
#include <string.h>
#include <sys/mman.h>
#include <inttypes.h>
#include <unistd.h>
/* Verbs headers do not support -pedantic. */
#ifdef PEDANTIC
@ -37,6 +39,100 @@
#include "mlx4_rxtx.h"
#include "mlx4_utils.h"
/**
 * Mmap TX UAR(HW doorbell) pages into reserved UAR address space.
 * Both primary and secondary process do mmap to make UAR address
 * aligned.
 *
 * For every configured Tx queue the page holding the doorbell address
 * obtained from Verbs is mapped once, at a fixed address inside the reserved
 * range, using the queue's UAR mmap offset. The primary process then stores
 * the remapped doorbell pointer in the queue; other processes only assert
 * that the stored pointer matches.
 *
 * When UAR mmap offset support is not available at build time, the function
 * succeeds without doing anything in the primary process and fails with
 * ENOTSUP in any other process.
 *
 * @param[in] dev
 *   Pointer to Ethernet device.
 * @param fd
 *   Verbs file descriptor to map UAR pages.
 *
 * @return
 *   0 on success, a negative errno value otherwise and rte_errno is set.
 */
#ifdef HAVE_IBV_MLX4_UAR_MMAP_OFFSET
int
mlx4_tx_uar_remap(struct rte_eth_dev *dev, int fd)
{
	unsigned int i, j;
	const unsigned int txqs_n = dev->data->nb_tx_queues;
	/*
	 * A variable-length array must have at least one element, so keep a
	 * single slot when no Tx queue is configured.
	 */
	uintptr_t pages[txqs_n ? txqs_n : 1];
	unsigned int pages_n = 0;
	uintptr_t uar_va;
	uintptr_t off;
	void *addr;
	void *ret;
	struct txq *txq;
	int already_mapped;
	size_t page_size = sysconf(_SC_PAGESIZE);

	memset(pages, 0, sizeof(pages));
	/*
	 * As rdma-core, UARs are mapped in size of OS page size.
	 * Use aligned address to avoid duplicate mmap.
	 * Ref to libmlx4 function: mlx4_init_context()
	 */
	for (i = 0; i != txqs_n; ++i) {
		txq = dev->data->tx_queues[i];
		if (!txq)
			continue;
		/* UAR addr from verbs used to find dup and offset in page. */
		uar_va = (uintptr_t)txq->msq.qp_sdb;
		off = uar_va & (page_size - 1); /* offset in page. */
		uar_va = RTE_ALIGN_FLOOR(uar_va, page_size); /* page addr. */
		already_mapped = 0;
		for (j = 0; j != pages_n; ++j) {
			if (pages[j] == uar_va) {
				already_mapped = 1;
				break;
			}
		}
		/* new address in reserved UAR address space. */
		addr = RTE_PTR_ADD(mlx4_shared_data->uar_base,
				   uar_va & (uintptr_t)(MLX4_UAR_SIZE - 1));
		if (!already_mapped) {
			pages[pages_n++] = uar_va;
			/* fixed mmap to specified address in reserved
			 * address space.
			 */
			ret = mmap(addr, page_size,
				   PROT_WRITE, MAP_FIXED | MAP_SHARED, fd,
				   txq->msq.uar_mmap_offset);
			if (ret != addr) {
				/* fixed mmap has to return same address. */
				ERROR("port %u call to mmap failed on UAR"
				      " for txq %u",
				      dev->data->port_id, i);
				rte_errno = ENXIO;
				return -rte_errno;
			}
		}
		if (rte_eal_process_type() == RTE_PROC_PRIMARY) /* save once. */
			txq->msq.db = RTE_PTR_ADD((void *)addr, off);
		else
			assert(txq->msq.db ==
			       RTE_PTR_ADD((void *)addr, off));
	}
	return 0;
}
#else
int
mlx4_tx_uar_remap(struct rte_eth_dev *dev __rte_unused, int fd __rte_unused)
{
	/*
	 * Even if rdma-core doesn't support UAR remap, primary process
	 * shouldn't be interrupted.
	 */
	if (rte_eal_process_type() == RTE_PROC_PRIMARY)
		return 0;
	ERROR("UAR remap is not supported");
	rte_errno = ENOTSUP;
	return -rte_errno;
}
#endif
/**
* Free Tx queue elements.
*
@ -89,7 +185,13 @@ mlx4_txq_fill_dv_obj_info(struct txq *txq, struct mlx4dv_obj *mlxdv)
sq->owner_opcode = MLX4_OPCODE_SEND | (0u << MLX4_SQ_OWNER_BIT);
sq->stamp = rte_cpu_to_be_32(MLX4_SQ_STAMP_VAL |
(0u << MLX4_SQ_OWNER_BIT));
#ifdef HAVE_IBV_MLX4_UAR_MMAP_OFFSET
sq->uar_mmap_offset = dqp->uar_mmap_offset;
sq->qp_sdb = dqp->sdb;
#else
sq->uar_mmap_offset = -1; /* Make mmap() fail. */
sq->db = dqp->sdb;
#endif
sq->doorbell_qpn = dqp->doorbell_qpn;
cq->buf = dcq->buf.buf;
cq->cqe_cnt = dcq->cqe_cnt;
@ -307,6 +409,11 @@ mlx4_tx_queue_setup(struct rte_eth_dev *dev, uint16_t idx, uint16_t desc,
goto error;
}
/* Retrieve device queue information. */
#ifdef HAVE_IBV_MLX4_UAR_MMAP_OFFSET
dv_qp = (struct mlx4dv_qp){
.comp_mask = MLX4DV_QP_MASK_UAR_MMAP_OFFSET,
};
#endif
mlxdv.cq.in = txq->cq;
mlxdv.cq.out = &dv_cq;
mlxdv.qp.in = txq->qp;
@ -318,6 +425,12 @@ mlx4_tx_queue_setup(struct rte_eth_dev *dev, uint16_t idx, uint16_t desc,
" accessing the device queues", (void *)dev);
goto error;
}
#ifdef HAVE_IBV_MLX4_UAR_MMAP_OFFSET
if (!(dv_qp.comp_mask & MLX4DV_QP_MASK_UAR_MMAP_OFFSET)) {
WARN("%p: failed to obtain UAR mmap offset", (void *)dev);
dv_qp.uar_mmap_offset = -1; /* Make mmap() fail. */
}
#endif
mlx4_txq_fill_dv_obj_info(txq, &mlxdv);
/* Save first wqe pointer in the first element. */
(&(*txq->elts)[0])->wqe =