numam-dpdk/drivers/net/mana/mana.c
Commit afd5d17072 ("net/mana: support Rx interrupts") by Long Li <longli@microsoft.com>, 2022-10-06 10:49:00 +02:00:
MANA can receive Rx interrupts from the kernel through the RDMA verbs interface.
Implement Rx interrupts in the driver.

/* SPDX-License-Identifier: BSD-3-Clause
* Copyright 2022 Microsoft Corporation
*/
#include <unistd.h>
#include <dirent.h>
#include <fcntl.h>
#include <sys/mman.h>
#include <ethdev_driver.h>
#include <ethdev_pci.h>
#include <rte_kvargs.h>
#include <rte_eal_paging.h>
#include <infiniband/verbs.h>
#include <infiniband/manadv.h>
#include <assert.h>
#include "mana.h"
/* Shared memory between primary/secondary processes, per driver */
/* Data to track primary/secondary usage */
struct mana_shared_data *mana_shared_data;
static struct mana_shared_data mana_local_data;
/* The memory region for the above data */
static const struct rte_memzone *mana_shared_mz;
static const char *MZ_MANA_SHARED_DATA = "mana_shared_data";
/* Spinlock for mana_shared_data */
static rte_spinlock_t mana_shared_data_lock = RTE_SPINLOCK_INITIALIZER;
/* Allocate a buffer on the stack and fill it with a printf format string. */
#define MANA_MKSTR(name, ...) \
int mkstr_size_##name = snprintf(NULL, 0, "" __VA_ARGS__); \
char name[mkstr_size_##name + 1]; \
\
memset(name, 0, mkstr_size_##name + 1); \
snprintf(name, sizeof(name), "" __VA_ARGS__)
int mana_logtype_driver;
int mana_logtype_init;
/*
* Callback from rdma-core to allocate a buffer for a queue.
*/
void *
mana_alloc_verbs_buf(size_t size, void *data)
{
void *ret;
size_t alignment = rte_mem_page_size();
int socket = (int)(uintptr_t)data;
DRV_LOG(DEBUG, "size=%zu socket=%d", size, socket);
if (alignment == (size_t)-1) {
DRV_LOG(ERR, "Failed to get mem page size");
rte_errno = ENOMEM;
return NULL;
}
ret = rte_zmalloc_socket("mana_verb_buf", size, alignment, socket);
if (!ret && size)
rte_errno = ENOMEM;
return ret;
}
void
mana_free_verbs_buf(void *ptr, void *data __rte_unused)
{
rte_free(ptr);
}
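/*
 * Validate the device configuration: RX and TX queue counts must be equal
 * and a power of 2; register the verbs buffer allocators with rdma-core.
 */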
static int
mana_dev_configure(struct rte_eth_dev *dev)
{
struct mana_priv *priv = dev->data->dev_private;
struct rte_eth_conf *dev_conf = &dev->data->dev_conf;
if (dev_conf->rxmode.mq_mode & RTE_ETH_MQ_RX_RSS_FLAG)
dev_conf->rxmode.offloads |= RTE_ETH_RX_OFFLOAD_RSS_HASH;
if (dev->data->nb_rx_queues != dev->data->nb_tx_queues) {
DRV_LOG(ERR, "Only support equal number of rx/tx queues");
return -EINVAL;
}
if (!rte_is_power_of_2(dev->data->nb_rx_queues)) {
DRV_LOG(ERR, "number of TX/RX queues must be power of 2");
return -EINVAL;
}
priv->num_queues = dev->data->nb_rx_queues;
manadv_set_context_attr(priv->ib_ctx, MANADV_CTX_ATTR_BUF_ALLOCATORS,
(void *)((uintptr_t)&(struct manadv_ctx_allocators){
.alloc = &mana_alloc_verbs_buf,
.free = &mana_free_verbs_buf,
.data = 0,
}));
return 0;
}
static void
rx_intr_vec_disable(struct mana_priv *priv)
{
struct rte_intr_handle *intr_handle = priv->intr_handle;
rte_intr_free_epoll_fd(intr_handle);
rte_intr_vec_list_free(intr_handle);
rte_intr_nb_efd_set(intr_handle, 0);
}
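/*
 * Build the RX interrupt vector list: map each RX queue to an interrupt
 * vector and register the queue's completion channel FD with it.
 */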
static int
rx_intr_vec_enable(struct mana_priv *priv)
{
unsigned int i;
unsigned int rxqs_n = priv->dev_data->nb_rx_queues;
unsigned int n = RTE_MIN(rxqs_n, (uint32_t)RTE_MAX_RXTX_INTR_VEC_ID);
struct rte_intr_handle *intr_handle = priv->intr_handle;
int ret;
rx_intr_vec_disable(priv);
if (rte_intr_vec_list_alloc(intr_handle, NULL, n)) {
DRV_LOG(ERR, "Failed to allocate memory for interrupt vector");
return -ENOMEM;
}
for (i = 0; i < n; i++) {
struct mana_rxq *rxq = priv->dev_data->rx_queues[i];
ret = rte_intr_vec_list_index_set(intr_handle, i,
RTE_INTR_VEC_RXTX_OFFSET + i);
if (ret) {
DRV_LOG(ERR, "Failed to set intr vec %u", i);
return ret;
}
ret = rte_intr_efds_index_set(intr_handle, i, rxq->channel->fd);
if (ret) {
DRV_LOG(ERR, "Failed to set FD at intr %u", i);
return ret;
}
}
return rte_intr_nb_efd_set(intr_handle, n);
}
static void
rxq_intr_disable(struct mana_priv *priv)
{
int err = rte_errno;
rx_intr_vec_disable(priv);
rte_errno = err;
}
static int
rxq_intr_enable(struct mana_priv *priv)
{
const struct rte_eth_intr_conf *const intr_conf =
&priv->dev_data->dev_conf.intr_conf;
if (!intr_conf->rxq)
return 0;
return rx_intr_vec_enable(priv);
}
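/*
 * Start the device: initialize the device MR btree, start the TX/RX queues,
 * install the real burst functions and enable RX interrupts if requested.
 */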
static int
mana_dev_start(struct rte_eth_dev *dev)
{
int ret;
struct mana_priv *priv = dev->data->dev_private;
rte_spinlock_init(&priv->mr_btree_lock);
ret = mana_mr_btree_init(&priv->mr_btree, MANA_MR_BTREE_CACHE_N,
dev->device->numa_node);
if (ret) {
DRV_LOG(ERR, "Failed to init device MR btree %d", ret);
return ret;
}
ret = mana_start_tx_queues(dev);
if (ret) {
DRV_LOG(ERR, "failed to start tx queues %d", ret);
goto failed_tx;
}
ret = mana_start_rx_queues(dev);
if (ret) {
DRV_LOG(ERR, "failed to start rx queues %d", ret);
goto failed_rx;
}
rte_wmb();
dev->tx_pkt_burst = mana_tx_burst;
dev->rx_pkt_burst = mana_rx_burst;
DRV_LOG(INFO, "TX/RX queues have started");
/* Enable datapath for secondary processes */
mana_mp_req_on_rxtx(dev, MANA_MP_REQ_START_RXTX);
ret = rxq_intr_enable(priv);
if (ret) {
DRV_LOG(ERR, "Failed to enable RX interrupts");
goto failed_intr;
}
return 0;
failed_intr:
mana_stop_rx_queues(dev);
failed_rx:
mana_stop_tx_queues(dev);
failed_tx:
mana_mr_btree_free(&priv->mr_btree);
return ret;
}
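/*
 * Stop the device: disable RX interrupts, switch back to the "removed"
 * burst functions and stop the TX/RX queues.
 */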
static int
mana_dev_stop(struct rte_eth_dev *dev)
{
int ret;
struct mana_priv *priv = dev->data->dev_private;
rxq_intr_disable(priv);
dev->tx_pkt_burst = mana_tx_burst_removed;
dev->rx_pkt_burst = mana_rx_burst_removed;
/* Stop datapath on secondary processes */
mana_mp_req_on_rxtx(dev, MANA_MP_REQ_STOP_RXTX);
rte_wmb();
ret = mana_stop_tx_queues(dev);
if (ret) {
DRV_LOG(ERR, "failed to stop tx queues");
return ret;
}
ret = mana_stop_rx_queues(dev);
if (ret) {
DRV_LOG(ERR, "failed to stop tx queues");
return ret;
}
return 0;
}
static int mana_intr_uninstall(struct mana_priv *priv);
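/*
 * Close the device: release all MRs, uninstall the async interrupt handler
 * and close the IB device.
 */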
static int
mana_dev_close(struct rte_eth_dev *dev)
{
struct mana_priv *priv = dev->data->dev_private;
int ret;
mana_remove_all_mr(priv);
ret = mana_intr_uninstall(priv);
if (ret)
return ret;
ret = ibv_close_device(priv->ib_ctx);
if (ret) {
ret = errno;
return ret;
}
return 0;
}
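/*
 * Report device capabilities, descriptor limits and default RX/TX
 * configuration to the application.
 */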
static int
mana_dev_info_get(struct rte_eth_dev *dev,
struct rte_eth_dev_info *dev_info)
{
struct mana_priv *priv = dev->data->dev_private;
dev_info->max_mtu = RTE_ETHER_MTU;
/* RX params */
dev_info->min_rx_bufsize = MIN_RX_BUF_SIZE;
dev_info->max_rx_pktlen = MAX_FRAME_SIZE;
dev_info->max_rx_queues = priv->max_rx_queues;
dev_info->max_tx_queues = priv->max_tx_queues;
dev_info->max_mac_addrs = MANA_MAX_MAC_ADDR;
dev_info->max_hash_mac_addrs = 0;
dev_info->max_vfs = 1;
/* Offload params */
dev_info->rx_offload_capa = MANA_DEV_RX_OFFLOAD_SUPPORT;
dev_info->tx_offload_capa = MANA_DEV_TX_OFFLOAD_SUPPORT;
/* RSS */
dev_info->reta_size = INDIRECTION_TABLE_NUM_ELEMENTS;
dev_info->hash_key_size = TOEPLITZ_HASH_KEY_SIZE_IN_BYTES;
dev_info->flow_type_rss_offloads = MANA_ETH_RSS_SUPPORT;
/* Thresholds */
dev_info->default_rxconf = (struct rte_eth_rxconf){
.rx_thresh = {
.pthresh = 8,
.hthresh = 8,
.wthresh = 0,
},
.rx_free_thresh = 32,
/* If no descriptors available, pkts are dropped by default */
.rx_drop_en = 1,
};
dev_info->default_txconf = (struct rte_eth_txconf){
.tx_thresh = {
.pthresh = 32,
.hthresh = 0,
.wthresh = 0,
},
.tx_rs_thresh = 32,
.tx_free_thresh = 32,
};
/* Buffer limits */
dev_info->rx_desc_lim.nb_min = MIN_BUFFERS_PER_QUEUE;
dev_info->rx_desc_lim.nb_max = priv->max_rx_desc;
dev_info->rx_desc_lim.nb_align = MIN_BUFFERS_PER_QUEUE;
dev_info->rx_desc_lim.nb_seg_max = priv->max_recv_sge;
dev_info->rx_desc_lim.nb_mtu_seg_max = priv->max_recv_sge;
dev_info->tx_desc_lim.nb_min = MIN_BUFFERS_PER_QUEUE;
dev_info->tx_desc_lim.nb_max = priv->max_tx_desc;
dev_info->tx_desc_lim.nb_align = MIN_BUFFERS_PER_QUEUE;
dev_info->tx_desc_lim.nb_seg_max = priv->max_send_sge;
dev_info->tx_desc_lim.nb_mtu_seg_max = priv->max_send_sge;
/* Speed */
dev_info->speed_capa = RTE_ETH_LINK_SPEED_100G;
/* RX params */
dev_info->default_rxportconf.burst_size = 1;
dev_info->default_rxportconf.ring_size = MAX_RECEIVE_BUFFERS_PER_QUEUE;
dev_info->default_rxportconf.nb_queues = 1;
/* TX params */
dev_info->default_txportconf.burst_size = 1;
dev_info->default_txportconf.ring_size = MAX_SEND_BUFFERS_PER_QUEUE;
dev_info->default_txportconf.nb_queues = 1;
return 0;
}
static void
mana_dev_tx_queue_info(struct rte_eth_dev *dev, uint16_t queue_id,
struct rte_eth_txq_info *qinfo)
{
struct mana_txq *txq = dev->data->tx_queues[queue_id];
qinfo->conf.offloads = dev->data->dev_conf.txmode.offloads;
qinfo->nb_desc = txq->num_desc;
}
static void
mana_dev_rx_queue_info(struct rte_eth_dev *dev, uint16_t queue_id,
struct rte_eth_rxq_info *qinfo)
{
struct mana_rxq *rxq = dev->data->rx_queues[queue_id];
qinfo->mp = rxq->mp;
qinfo->nb_desc = rxq->num_desc;
qinfo->conf.offloads = dev->data->dev_conf.rxmode.offloads;
}
static const uint32_t *
mana_supported_ptypes(struct rte_eth_dev *dev __rte_unused)
{
static const uint32_t ptypes[] = {
RTE_PTYPE_L2_ETHER,
RTE_PTYPE_L3_IPV4_EXT_UNKNOWN,
RTE_PTYPE_L3_IPV6_EXT_UNKNOWN,
RTE_PTYPE_L4_FRAG,
RTE_PTYPE_L4_TCP,
RTE_PTYPE_L4_UDP,
RTE_PTYPE_UNKNOWN
};
return ptypes;
}
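/*
 * Update the RSS hash types and, optionally, the Toeplitz hash key.
 * Only allowed while the device is stopped.
 */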
static int
mana_rss_hash_update(struct rte_eth_dev *dev,
struct rte_eth_rss_conf *rss_conf)
{
struct mana_priv *priv = dev->data->dev_private;
/* Currently can only update RSS hash when device is stopped */
if (dev->data->dev_started) {
DRV_LOG(ERR, "Can't update RSS after device has started");
return -ENODEV;
}
if (rss_conf->rss_hf & ~MANA_ETH_RSS_SUPPORT) {
DRV_LOG(ERR, "Port %u invalid RSS HF 0x%" PRIx64,
dev->data->port_id, rss_conf->rss_hf);
return -EINVAL;
}
if (rss_conf->rss_key && rss_conf->rss_key_len) {
if (rss_conf->rss_key_len != TOEPLITZ_HASH_KEY_SIZE_IN_BYTES) {
DRV_LOG(ERR, "Port %u key len must be %u long",
dev->data->port_id,
TOEPLITZ_HASH_KEY_SIZE_IN_BYTES);
return -EINVAL;
}
priv->rss_conf.rss_key_len = rss_conf->rss_key_len;
priv->rss_conf.rss_key =
rte_zmalloc("mana_rss", rss_conf->rss_key_len,
RTE_CACHE_LINE_SIZE);
if (!priv->rss_conf.rss_key)
return -ENOMEM;
memcpy(priv->rss_conf.rss_key, rss_conf->rss_key,
rss_conf->rss_key_len);
}
priv->rss_conf.rss_hf = rss_conf->rss_hf;
return 0;
}
static int
mana_rss_hash_conf_get(struct rte_eth_dev *dev,
struct rte_eth_rss_conf *rss_conf)
{
struct mana_priv *priv = dev->data->dev_private;
if (!rss_conf)
return -EINVAL;
if (rss_conf->rss_key &&
rss_conf->rss_key_len >= priv->rss_conf.rss_key_len) {
memcpy(rss_conf->rss_key, priv->rss_conf.rss_key,
priv->rss_conf.rss_key_len);
}
rss_conf->rss_key_len = priv->rss_conf.rss_key_len;
rss_conf->rss_hf = priv->rss_conf.rss_hf;
return 0;
}
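/*
 * Allocate a TX queue, its software descriptor ring and its per-queue
 * MR btree on the requested socket.
 */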
static int
mana_dev_tx_queue_setup(struct rte_eth_dev *dev, uint16_t queue_idx,
uint16_t nb_desc, unsigned int socket_id,
const struct rte_eth_txconf *tx_conf __rte_unused)
{
struct mana_priv *priv = dev->data->dev_private;
struct mana_txq *txq;
int ret;
txq = rte_zmalloc_socket("mana_txq", sizeof(*txq), 0, socket_id);
if (!txq) {
DRV_LOG(ERR, "failed to allocate txq");
return -ENOMEM;
}
txq->socket = socket_id;
txq->desc_ring = rte_malloc_socket("mana_tx_desc_ring",
sizeof(struct mana_txq_desc) *
nb_desc,
RTE_CACHE_LINE_SIZE, socket_id);
if (!txq->desc_ring) {
DRV_LOG(ERR, "failed to allocate txq desc_ring");
ret = -ENOMEM;
goto fail;
}
ret = mana_mr_btree_init(&txq->mr_btree,
MANA_MR_BTREE_PER_QUEUE_N, socket_id);
if (ret) {
DRV_LOG(ERR, "Failed to init TXQ MR btree");
goto fail;
}
DRV_LOG(DEBUG, "idx %u nb_desc %u socket %u txq->desc_ring %p",
queue_idx, nb_desc, socket_id, txq->desc_ring);
txq->desc_ring_head = 0;
txq->desc_ring_tail = 0;
txq->priv = priv;
txq->num_desc = nb_desc;
dev->data->tx_queues[queue_idx] = txq;
return 0;
fail:
rte_free(txq->desc_ring);
rte_free(txq);
return ret;
}
static void
mana_dev_tx_queue_release(struct rte_eth_dev *dev, uint16_t qid)
{
struct mana_txq *txq = dev->data->tx_queues[qid];
mana_mr_btree_free(&txq->mr_btree);
rte_free(txq->desc_ring);
rte_free(txq);
}
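/*
 * Allocate an RX queue, its software descriptor ring and its per-queue
 * MR btree, and record the mempool used for receive buffers.
 */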
static int
mana_dev_rx_queue_setup(struct rte_eth_dev *dev, uint16_t queue_idx,
uint16_t nb_desc, unsigned int socket_id,
const struct rte_eth_rxconf *rx_conf __rte_unused,
struct rte_mempool *mp)
{
struct mana_priv *priv = dev->data->dev_private;
struct mana_rxq *rxq;
int ret;
rxq = rte_zmalloc_socket("mana_rxq", sizeof(*rxq), 0, socket_id);
if (!rxq) {
DRV_LOG(ERR, "failed to allocate rxq");
return -ENOMEM;
}
DRV_LOG(DEBUG, "idx %u nb_desc %u socket %u",
queue_idx, nb_desc, socket_id);
rxq->socket = socket_id;
rxq->desc_ring = rte_zmalloc_socket("mana_rx_mbuf_ring",
sizeof(struct mana_rxq_desc) *
nb_desc,
RTE_CACHE_LINE_SIZE, socket_id);
if (!rxq->desc_ring) {
DRV_LOG(ERR, "failed to allocate rxq desc_ring");
ret = -ENOMEM;
goto fail;
}
rxq->desc_ring_head = 0;
rxq->desc_ring_tail = 0;
ret = mana_mr_btree_init(&rxq->mr_btree,
MANA_MR_BTREE_PER_QUEUE_N, socket_id);
if (ret) {
DRV_LOG(ERR, "Failed to init RXQ MR btree");
goto fail;
}
rxq->priv = priv;
rxq->num_desc = nb_desc;
rxq->mp = mp;
dev->data->rx_queues[queue_idx] = rxq;
return 0;
fail:
rte_free(rxq->desc_ring);
rte_free(rxq);
return ret;
}
static void
mana_dev_rx_queue_release(struct rte_eth_dev *dev, uint16_t qid)
{
struct mana_rxq *rxq = dev->data->rx_queues[qid];
mana_mr_btree_free(&rxq->mr_btree);
rte_free(rxq->desc_ring);
rte_free(rxq);
}
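/*
 * Report link status; the link is always reported as 100G, full duplex, up.
 */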
static int
mana_dev_link_update(struct rte_eth_dev *dev,
int wait_to_complete __rte_unused)
{
struct rte_eth_link link;
/* MANA has no concept of carrier state, always reporting UP */
link = (struct rte_eth_link) {
.link_duplex = RTE_ETH_LINK_FULL_DUPLEX,
.link_autoneg = RTE_ETH_LINK_FIXED,
.link_speed = RTE_ETH_SPEED_NUM_100G,
.link_status = RTE_ETH_LINK_UP,
};
return rte_eth_linkstatus_set(dev, &link);
}
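/*
 * Accumulate basic statistics over all TX and RX queues, and fill the
 * per-queue counters for the first RTE_ETHDEV_QUEUE_STAT_CNTRS queues.
 */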
static int
mana_dev_stats_get(struct rte_eth_dev *dev, struct rte_eth_stats *stats)
{
unsigned int i;
for (i = 0; i < dev->data->nb_tx_queues; i++) {
struct mana_txq *txq = dev->data->tx_queues[i];
if (!txq)
continue;
stats->opackets += txq->stats.packets;
stats->obytes += txq->stats.bytes;
stats->oerrors += txq->stats.errors;
if (i < RTE_ETHDEV_QUEUE_STAT_CNTRS) {
stats->q_opackets[i] = txq->stats.packets;
stats->q_obytes[i] = txq->stats.bytes;
}
}
stats->rx_nombuf = 0;
for (i = 0; i < dev->data->nb_rx_queues; i++) {
struct mana_rxq *rxq = dev->data->rx_queues[i];
if (!rxq)
continue;
stats->ipackets += rxq->stats.packets;
stats->ibytes += rxq->stats.bytes;
stats->ierrors += rxq->stats.errors;
/* There is no good way to get stats->imissed, not setting it */
if (i < RTE_ETHDEV_QUEUE_STAT_CNTRS) {
stats->q_ipackets[i] = rxq->stats.packets;
stats->q_ibytes[i] = rxq->stats.bytes;
}
stats->rx_nombuf += rxq->stats.nombuf;
}
return 0;
}
static int
mana_dev_stats_reset(struct rte_eth_dev *dev)
{
unsigned int i;
PMD_INIT_FUNC_TRACE();
for (i = 0; i < dev->data->nb_tx_queues; i++) {
struct mana_txq *txq = dev->data->tx_queues[i];
if (!txq)
continue;
memset(&txq->stats, 0, sizeof(txq->stats));
}
for (i = 0; i < dev->data->nb_rx_queues; i++) {
struct mana_rxq *rxq = dev->data->rx_queues[i];
if (!rxq)
continue;
memset(&rxq->stats, 0, sizeof(rxq->stats));
}
return 0;
}
static const struct eth_dev_ops mana_dev_ops = {
.dev_configure = mana_dev_configure,
.dev_start = mana_dev_start,
.dev_stop = mana_dev_stop,
.dev_close = mana_dev_close,
.dev_infos_get = mana_dev_info_get,
.txq_info_get = mana_dev_tx_queue_info,
.rxq_info_get = mana_dev_rx_queue_info,
.dev_supported_ptypes_get = mana_supported_ptypes,
.rss_hash_update = mana_rss_hash_update,
.rss_hash_conf_get = mana_rss_hash_conf_get,
.tx_queue_setup = mana_dev_tx_queue_setup,
.tx_queue_release = mana_dev_tx_queue_release,
.rx_queue_setup = mana_dev_rx_queue_setup,
.rx_queue_release = mana_dev_rx_queue_release,
.rx_queue_intr_enable = mana_rx_intr_enable,
.rx_queue_intr_disable = mana_rx_intr_disable,
.link_update = mana_dev_link_update,
.stats_get = mana_dev_stats_get,
.stats_reset = mana_dev_stats_reset,
};
static const struct eth_dev_ops mana_dev_secondary_ops = {
.stats_get = mana_dev_stats_get,
.stats_reset = mana_dev_stats_reset,
.dev_infos_get = mana_dev_info_get,
};
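/*
 * Dummy burst functions installed while the datapath is not started or is
 * being torn down; they send and receive no packets.
 */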
uint16_t
mana_rx_burst_removed(void *dpdk_rxq __rte_unused,
struct rte_mbuf **pkts __rte_unused,
uint16_t pkts_n __rte_unused)
{
rte_mb();
return 0;
}
uint16_t
mana_tx_burst_removed(void *dpdk_txq __rte_unused,
struct rte_mbuf **pkts __rte_unused,
uint16_t pkts_n __rte_unused)
{
rte_mb();
return 0;
}
#define ETH_MANA_MAC_ARG "mac"
static const char * const mana_init_args[] = {
ETH_MANA_MAC_ARG,
NULL,
};
/* Support parsing up to 8 MAC addresses from the EAL command line */
#define MAX_NUM_ADDRESS 8
struct mana_conf {
struct rte_ether_addr mac_array[MAX_NUM_ADDRESS];
unsigned int index;
};
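/*
 * kvargs callback for the "mac" devarg: parse one MAC address and store it
 * in the configuration array.
 */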
static int
mana_arg_parse_callback(const char *key, const char *val, void *private)
{
struct mana_conf *conf = (struct mana_conf *)private;
int ret;
DRV_LOG(INFO, "key=%s value=%s index=%d", key, val, conf->index);
if (conf->index >= MAX_NUM_ADDRESS) {
DRV_LOG(ERR, "Exceeding max MAC address");
return 1;
}
ret = rte_ether_unformat_addr(val, &conf->mac_array[conf->index]);
if (ret) {
DRV_LOG(ERR, "Invalid MAC address %s", val);
return ret;
}
conf->index++;
return 0;
}
static int
mana_parse_args(struct rte_devargs *devargs, struct mana_conf *conf)
{
struct rte_kvargs *kvlist;
unsigned int arg_count;
int ret = 0;
kvlist = rte_kvargs_parse(devargs->drv_str, mana_init_args);
if (!kvlist) {
DRV_LOG(ERR, "failed to parse kvargs args=%s", devargs->drv_str);
return -EINVAL;
}
arg_count = rte_kvargs_count(kvlist, mana_init_args[0]);
if (arg_count > MAX_NUM_ADDRESS) {
ret = -EINVAL;
goto free_kvlist;
}
ret = rte_kvargs_process(kvlist, mana_init_args[0],
mana_arg_parse_callback, conf);
if (ret) {
DRV_LOG(ERR, "error parsing args");
goto free_kvlist;
}
free_kvlist:
rte_kvargs_free(kvlist);
return ret;
}
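/*
 * Locate the netdev for the given IB port under sysfs and read its MAC
 * address.
 */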
static int
get_port_mac(struct ibv_device *device, unsigned int port,
struct rte_ether_addr *addr)
{
FILE *file;
int ret = 0;
DIR *dir;
struct dirent *dent;
unsigned int dev_port;
char mac[20];
MANA_MKSTR(path, "%s/device/net", device->ibdev_path);
dir = opendir(path);
if (!dir)
return -ENOENT;
while ((dent = readdir(dir))) {
char *name = dent->d_name;
MANA_MKSTR(port_path, "%s/%s/dev_port", path, name);
/* Ignore . and .. */
if ((name[0] == '.') &&
((name[1] == '\0') ||
((name[1] == '.') && (name[2] == '\0'))))
continue;
file = fopen(port_path, "r");
if (!file)
continue;
ret = fscanf(file, "%u", &dev_port);
fclose(file);
if (ret != 1)
continue;
/* Ethernet ports start at 0, IB ports start at 1 */
if (dev_port == port - 1) {
MANA_MKSTR(address_path, "%s/%s/address", path, name);
file = fopen(address_path, "r");
if (!file)
continue;
ret = fscanf(file, "%s", mac);
fclose(file);
if (ret < 0)
break;
ret = rte_ether_unformat_addr(mac, addr);
if (ret)
DRV_LOG(ERR, "unrecognized mac addr %s", mac);
break;
}
}
closedir(dir);
return ret;
}
static int
mana_ibv_device_to_pci_addr(const struct ibv_device *device,
struct rte_pci_addr *pci_addr)
{
FILE *file;
char *line = NULL;
size_t len = 0;
MANA_MKSTR(path, "%s/device/uevent", device->ibdev_path);
file = fopen(path, "r");
if (!file)
return -errno;
while (getline(&line, &len, file) != -1) {
/* Extract information. */
if (sscanf(line,
"PCI_SLOT_NAME="
"%" SCNx32 ":%" SCNx8 ":%" SCNx8 ".%" SCNx8 "\n",
&pci_addr->domain,
&pci_addr->bus,
&pci_addr->devid,
&pci_addr->function) == 4) {
break;
}
}
free(line);
fclose(file);
return 0;
}
/*
* Interrupt handler from IB layer to notify this device is being removed.
*/
static void
mana_intr_handler(void *arg)
{
struct mana_priv *priv = arg;
struct ibv_context *ctx = priv->ib_ctx;
struct ibv_async_event event;
/* Read and ack all messages from IB device */
while (true) {
if (ibv_get_async_event(ctx, &event))
break;
if (event.event_type == IBV_EVENT_DEVICE_FATAL) {
struct rte_eth_dev *dev;
dev = &rte_eth_devices[priv->port_id];
if (dev->data->dev_conf.intr_conf.rmv)
rte_eth_dev_callback_process(dev,
RTE_ETH_EVENT_INTR_RMV, NULL);
}
ibv_ack_async_event(&event);
}
}
static int
mana_intr_uninstall(struct mana_priv *priv)
{
int ret;
ret = rte_intr_callback_unregister(priv->intr_handle,
mana_intr_handler, priv);
if (ret <= 0) {
DRV_LOG(ERR, "Failed to unregister intr callback ret %d", ret);
return ret;
}
rte_intr_instance_free(priv->intr_handle);
return 0;
}
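/*
 * Set O_NONBLOCK on a file descriptor.
 */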
int
mana_fd_set_non_blocking(int fd)
{
int ret = fcntl(fd, F_GETFL);
if (ret != -1 && !fcntl(fd, F_SETFL, ret | O_NONBLOCK))
return 0;
rte_errno = errno;
return -rte_errno;
}
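/*
 * Register an interrupt handler on the IB device's async FD so that device
 * removal events can be reported to the application.
 */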
static int
mana_intr_install(struct rte_eth_dev *eth_dev, struct mana_priv *priv)
{
int ret;
struct ibv_context *ctx = priv->ib_ctx;
priv->intr_handle = rte_intr_instance_alloc(RTE_INTR_INSTANCE_F_SHARED);
if (!priv->intr_handle) {
DRV_LOG(ERR, "Failed to allocate intr_handle");
rte_errno = ENOMEM;
return -ENOMEM;
}
ret = rte_intr_fd_set(priv->intr_handle, -1);
if (ret)
goto free_intr;
ret = mana_fd_set_non_blocking(ctx->async_fd);
if (ret) {
DRV_LOG(ERR, "Failed to change async_fd to NONBLOCK");
goto free_intr;
}
ret = rte_intr_fd_set(priv->intr_handle, ctx->async_fd);
if (ret)
goto free_intr;
ret = rte_intr_type_set(priv->intr_handle, RTE_INTR_HANDLE_EXT);
if (ret)
goto free_intr;
ret = rte_intr_callback_register(priv->intr_handle,
mana_intr_handler, priv);
if (ret) {
DRV_LOG(ERR, "Failed to register intr callback");
rte_intr_fd_set(priv->intr_handle, -1);
goto free_intr;
}
eth_dev->intr_handle = priv->intr_handle;
return 0;
free_intr:
rte_intr_instance_free(priv->intr_handle);
priv->intr_handle = NULL;
return ret;
}
static int
mana_proc_priv_init(struct rte_eth_dev *dev)
{
struct mana_process_priv *priv;
priv = rte_zmalloc_socket("mana_proc_priv",
sizeof(struct mana_process_priv),
RTE_CACHE_LINE_SIZE,
dev->device->numa_node);
if (!priv)
return -ENOMEM;
dev->process_private = priv;
return 0;
}
/*
* Map the doorbell page for the secondary process through IB device handle.
*/
static int
mana_map_doorbell_secondary(struct rte_eth_dev *eth_dev, int fd)
{
struct mana_process_priv *priv = eth_dev->process_private;
void *addr;
addr = mmap(NULL, rte_mem_page_size(), PROT_WRITE, MAP_SHARED, fd, 0);
if (addr == MAP_FAILED) {
DRV_LOG(ERR, "Failed to map secondary doorbell port %u",
eth_dev->data->port_id);
return -ENOMEM;
}
DRV_LOG(INFO, "Secondary doorbell mapped to %p", addr);
priv->db_page = addr;
return 0;
}
/* Initialize shared data for the driver (all devices) */
static int
mana_init_shared_data(void)
{
int ret = 0;
const struct rte_memzone *secondary_mz;
rte_spinlock_lock(&mana_shared_data_lock);
/* Skip if shared data is already initialized */
if (mana_shared_data)
goto exit;
if (rte_eal_process_type() == RTE_PROC_PRIMARY) {
mana_shared_mz = rte_memzone_reserve(MZ_MANA_SHARED_DATA,
sizeof(*mana_shared_data),
SOCKET_ID_ANY, 0);
if (!mana_shared_mz) {
DRV_LOG(ERR, "Cannot allocate mana shared data");
ret = -rte_errno;
goto exit;
}
mana_shared_data = mana_shared_mz->addr;
memset(mana_shared_data, 0, sizeof(*mana_shared_data));
rte_spinlock_init(&mana_shared_data->lock);
} else {
secondary_mz = rte_memzone_lookup(MZ_MANA_SHARED_DATA);
if (!secondary_mz) {
DRV_LOG(ERR, "Cannot attach mana shared data");
ret = -rte_errno;
goto exit;
}
mana_shared_data = secondary_mz->addr;
memset(&mana_local_data, 0, sizeof(mana_local_data));
}
exit:
rte_spinlock_unlock(&mana_shared_data_lock);
return ret;
}
/*
* Init the data structures for use in primary and secondary processes.
*/
static int
mana_init_once(void)
{
int ret;
ret = mana_init_shared_data();
if (ret)
return ret;
rte_spinlock_lock(&mana_shared_data->lock);
switch (rte_eal_process_type()) {
case RTE_PROC_PRIMARY:
if (mana_shared_data->init_done)
break;
ret = mana_mp_init_primary();
if (ret)
break;
DRV_LOG(ERR, "MP INIT PRIMARY");
mana_shared_data->init_done = 1;
break;
case RTE_PROC_SECONDARY:
if (mana_local_data.init_done)
break;
ret = mana_mp_init_secondary();
if (ret)
break;
DRV_LOG(ERR, "MP INIT SECONDARY");
mana_local_data.init_done = 1;
break;
default:
/* Impossible, internal error */
ret = -EPROTO;
break;
}
rte_spinlock_unlock(&mana_shared_data->lock);
return ret;
}
/*
* Probe an IB port
* Return value:
* positive value: successfully probed port
* 0: port not matching specified MAC address
* negative value: error code
*/
static int
mana_probe_port(struct ibv_device *ibdev, struct ibv_device_attr_ex *dev_attr,
uint8_t port, struct rte_pci_device *pci_dev, struct rte_ether_addr *addr)
{
struct mana_priv *priv = NULL;
struct rte_eth_dev *eth_dev = NULL;
struct ibv_parent_domain_init_attr attr = {0};
char address[64];
char name[RTE_ETH_NAME_MAX_LEN];
int ret;
struct ibv_context *ctx = NULL;
rte_ether_format_addr(address, sizeof(address), addr);
DRV_LOG(INFO, "device located port %u address %s", port, address);
priv = rte_zmalloc_socket(NULL, sizeof(*priv), RTE_CACHE_LINE_SIZE,
SOCKET_ID_ANY);
if (!priv)
return -ENOMEM;
snprintf(name, sizeof(name), "%s_port%d", pci_dev->device.name, port);
if (rte_eal_process_type() == RTE_PROC_SECONDARY) {
int fd;
eth_dev = rte_eth_dev_attach_secondary(name);
if (!eth_dev) {
DRV_LOG(ERR, "Can't attach to dev %s", name);
ret = -ENOMEM;
goto failed;
}
eth_dev->device = &pci_dev->device;
eth_dev->dev_ops = &mana_dev_secondary_ops;
ret = mana_proc_priv_init(eth_dev);
if (ret)
goto failed;
priv->process_priv = eth_dev->process_private;
/* Get the IB FD from the primary process */
fd = mana_mp_req_verbs_cmd_fd(eth_dev);
if (fd < 0) {
DRV_LOG(ERR, "Failed to get FD %d", fd);
ret = -ENODEV;
goto failed;
}
ret = mana_map_doorbell_secondary(eth_dev, fd);
if (ret) {
DRV_LOG(ERR, "Failed secondary map %d", fd);
goto failed;
}
/* The FD is not used after mapping the doorbell */
close(fd);
eth_dev->tx_pkt_burst = mana_tx_burst_removed;
eth_dev->rx_pkt_burst = mana_rx_burst_removed;
rte_spinlock_lock(&mana_shared_data->lock);
mana_shared_data->secondary_cnt++;
mana_local_data.secondary_cnt++;
rte_spinlock_unlock(&mana_shared_data->lock);
rte_eth_copy_pci_info(eth_dev, pci_dev);
rte_eth_dev_probing_finish(eth_dev);
return 0;
}
ctx = ibv_open_device(ibdev);
if (!ctx) {
DRV_LOG(ERR, "Failed to open IB device %s", ibdev->name);
ret = -ENODEV;
goto failed;
}
eth_dev = rte_eth_dev_allocate(name);
if (!eth_dev) {
ret = -ENOMEM;
goto failed;
}
eth_dev->data->mac_addrs =
rte_calloc("mana_mac", 1,
sizeof(struct rte_ether_addr), 0);
if (!eth_dev->data->mac_addrs) {
ret = -ENOMEM;
goto failed;
}
rte_ether_addr_copy(addr, eth_dev->data->mac_addrs);
priv->ib_pd = ibv_alloc_pd(ctx);
if (!priv->ib_pd) {
DRV_LOG(ERR, "ibv_alloc_pd failed port %d", port);
ret = -ENOMEM;
goto failed;
}
/* Create a parent domain with the port number */
attr.pd = priv->ib_pd;
attr.comp_mask = IBV_PARENT_DOMAIN_INIT_ATTR_PD_CONTEXT;
attr.pd_context = (void *)(uint64_t)port;
priv->ib_parent_pd = ibv_alloc_parent_domain(ctx, &attr);
if (!priv->ib_parent_pd) {
DRV_LOG(ERR, "ibv_alloc_parent_domain failed port %d", port);
ret = -ENOMEM;
goto failed;
}
priv->ib_ctx = ctx;
priv->port_id = eth_dev->data->port_id;
priv->dev_port = port;
eth_dev->data->dev_private = priv;
priv->dev_data = eth_dev->data;
priv->max_rx_queues = dev_attr->orig_attr.max_qp;
priv->max_tx_queues = dev_attr->orig_attr.max_qp;
priv->max_rx_desc =
RTE_MIN(dev_attr->orig_attr.max_qp_wr,
dev_attr->orig_attr.max_cqe);
priv->max_tx_desc =
RTE_MIN(dev_attr->orig_attr.max_qp_wr,
dev_attr->orig_attr.max_cqe);
priv->max_send_sge = dev_attr->orig_attr.max_sge;
priv->max_recv_sge = dev_attr->orig_attr.max_sge;
priv->max_mr = dev_attr->orig_attr.max_mr;
priv->max_mr_size = dev_attr->orig_attr.max_mr_size;
DRV_LOG(INFO, "dev %s max queues %d desc %d sge %d",
name, priv->max_rx_queues, priv->max_rx_desc,
priv->max_send_sge);
rte_eth_copy_pci_info(eth_dev, pci_dev);
/* Create async interrupt handler */
ret = mana_intr_install(eth_dev, priv);
if (ret) {
DRV_LOG(ERR, "Failed to install intr handler");
goto failed;
}
rte_spinlock_lock(&mana_shared_data->lock);
mana_shared_data->primary_cnt++;
rte_spinlock_unlock(&mana_shared_data->lock);
eth_dev->device = &pci_dev->device;
DRV_LOG(INFO, "device %s at port %u", name, eth_dev->data->port_id);
eth_dev->rx_pkt_burst = mana_rx_burst_removed;
eth_dev->tx_pkt_burst = mana_tx_burst_removed;
eth_dev->dev_ops = &mana_dev_ops;
rte_eth_dev_probing_finish(eth_dev);
return 0;
failed:
/* Free the resources for the failed port */
if (priv) {
if (priv->ib_parent_pd)
ibv_dealloc_pd(priv->ib_parent_pd);
if (priv->ib_pd)
ibv_dealloc_pd(priv->ib_pd);
}
if (eth_dev)
rte_eth_dev_release_port(eth_dev);
rte_free(priv);
if (ctx)
ibv_close_device(ctx);
return ret;
}
/*
* Goes through the IB device list to look for the IB port matching the
* mac_addr. If found, create a rte_eth_dev for it.
*/
static int
mana_pci_probe_mac(struct rte_pci_device *pci_dev,
struct rte_ether_addr *mac_addr)
{
struct ibv_device **ibv_list;
int ibv_idx;
struct ibv_context *ctx;
int num_devices;
int ret = 0;
uint8_t port;
ibv_list = ibv_get_device_list(&num_devices);
for (ibv_idx = 0; ibv_idx < num_devices; ibv_idx++) {
struct ibv_device *ibdev = ibv_list[ibv_idx];
struct rte_pci_addr pci_addr;
struct ibv_device_attr_ex dev_attr;
DRV_LOG(INFO, "Probe device name %s dev_name %s ibdev_path %s",
ibdev->name, ibdev->dev_name, ibdev->ibdev_path);
if (mana_ibv_device_to_pci_addr(ibdev, &pci_addr))
continue;
/* Ignore if this IB device is not this PCI device */
if (pci_dev->addr.domain != pci_addr.domain ||
pci_dev->addr.bus != pci_addr.bus ||
pci_dev->addr.devid != pci_addr.devid ||
pci_dev->addr.function != pci_addr.function)
continue;
ctx = ibv_open_device(ibdev);
if (!ctx) {
DRV_LOG(ERR, "Failed to open IB device %s",
ibdev->name);
continue;
}
ret = ibv_query_device_ex(ctx, NULL, &dev_attr);
ibv_close_device(ctx);
for (port = 1; port <= dev_attr.orig_attr.phys_port_cnt;
port++) {
struct rte_ether_addr addr;
ret = get_port_mac(ibdev, port, &addr);
if (ret)
continue;
if (mac_addr && !rte_is_same_ether_addr(&addr, mac_addr))
continue;
ret = mana_probe_port(ibdev, &dev_attr, port, pci_dev, &addr);
if (ret)
DRV_LOG(ERR, "Probe on IB port %u failed %d", port, ret);
else
DRV_LOG(INFO, "Successfully probed on IB port %u", port);
}
}
ibv_free_device_list(ibv_list);
return ret;
}
/*
* Main callback function from PCI bus to probe a device.
*/
static int
mana_pci_probe(struct rte_pci_driver *pci_drv __rte_unused,
struct rte_pci_device *pci_dev)
{
struct rte_devargs *args = pci_dev->device.devargs;
struct mana_conf conf = {0};
unsigned int i;
int ret;
if (args && args->drv_str) {
ret = mana_parse_args(args, &conf);
if (ret) {
DRV_LOG(ERR, "Failed to parse parameters args = %s",
args->drv_str);
return ret;
}
}
ret = mana_init_once();
if (ret) {
DRV_LOG(ERR, "Failed to init PMD global data %d", ret);
return ret;
}
/* If there are no driver parameters, probe on all ports */
if (!conf.index)
return mana_pci_probe_mac(pci_dev, NULL);
for (i = 0; i < conf.index; i++) {
ret = mana_pci_probe_mac(pci_dev, &conf.mac_array[i]);
if (ret)
return ret;
}
return 0;
}
static int
mana_dev_uninit(struct rte_eth_dev *dev)
{
return mana_dev_close(dev);
}
/*
* Callback from PCI to remove this device.
*/
static int
mana_pci_remove(struct rte_pci_device *pci_dev)
{
if (rte_eal_process_type() == RTE_PROC_PRIMARY) {
rte_spinlock_lock(&mana_shared_data_lock);
rte_spinlock_lock(&mana_shared_data->lock);
RTE_VERIFY(mana_shared_data->primary_cnt > 0);
mana_shared_data->primary_cnt--;
if (!mana_shared_data->primary_cnt) {
DRV_LOG(DEBUG, "mp uninit primary");
mana_mp_uninit_primary();
}
rte_spinlock_unlock(&mana_shared_data->lock);
/* Also free the shared memory if this is the last */
if (!mana_shared_data->primary_cnt) {
DRV_LOG(DEBUG, "free shared memezone data");
rte_memzone_free(mana_shared_mz);
}
rte_spinlock_unlock(&mana_shared_data_lock);
} else {
rte_spinlock_lock(&mana_shared_data_lock);
rte_spinlock_lock(&mana_shared_data->lock);
RTE_VERIFY(mana_shared_data->secondary_cnt > 0);
mana_shared_data->secondary_cnt--;
rte_spinlock_unlock(&mana_shared_data->lock);
RTE_VERIFY(mana_local_data.secondary_cnt > 0);
mana_local_data.secondary_cnt--;
if (!mana_local_data.secondary_cnt) {
DRV_LOG(DEBUG, "mp uninit secondary");
mana_mp_uninit_secondary();
}
rte_spinlock_unlock(&mana_shared_data_lock);
}
return rte_eth_dev_pci_generic_remove(pci_dev, mana_dev_uninit);
}
static const struct rte_pci_id mana_pci_id_map[] = {
{
RTE_PCI_DEVICE(PCI_VENDOR_ID_MICROSOFT,
PCI_DEVICE_ID_MICROSOFT_MANA)
},
{
.vendor_id = 0
},
};
static struct rte_pci_driver mana_pci_driver = {
.id_table = mana_pci_id_map,
.probe = mana_pci_probe,
.remove = mana_pci_remove,
.drv_flags = RTE_PCI_DRV_INTR_RMV,
};
RTE_PMD_REGISTER_PCI(net_mana, mana_pci_driver);
RTE_PMD_REGISTER_PCI_TABLE(net_mana, mana_pci_id_map);
RTE_PMD_REGISTER_KMOD_DEP(net_mana, "* ib_uverbs & mana_ib");
RTE_LOG_REGISTER_SUFFIX(mana_logtype_init, init, NOTICE);
RTE_LOG_REGISTER_SUFFIX(mana_logtype_driver, driver, NOTICE);
RTE_PMD_REGISTER_PARAM_STRING(net_mana, ETH_MANA_MAC_ARG "=<mac_addr>");
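/*
 * Example usage (hypothetical PCI address and MACs): restrict probing to the
 * ports matching the listed MAC addresses with an EAL option such as
 *   -a 0000:00:08.0,mac=00:0d:3a:11:22:33,mac=00:0d:3a:44:55:66
 */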