numam-dpdk/drivers/net/mlx4/mlx4.h

252 lines
8.4 KiB
C
Raw Normal View History

/* SPDX-License-Identifier: BSD-3-Clause
* Copyright 2012 6WIND S.A.
* Copyright 2012 Mellanox Technologies, Ltd
*/
#ifndef RTE_PMD_MLX4_H_
#define RTE_PMD_MLX4_H_
#include <net/if.h>
#include <stdint.h>
#include <sys/queue.h>
/* Verbs headers do not support -pedantic. */
#ifdef PEDANTIC
#pragma GCC diagnostic ignored "-Wpedantic"
#endif
#include <infiniband/verbs.h>
#ifdef PEDANTIC
#pragma GCC diagnostic error "-Wpedantic"
#endif
#include <rte_ethdev_driver.h>
#include <rte_ether.h>
#include <rte_interrupts.h>
#include <rte_mempool.h>
net/mlx4: add new memory region support This is the new design of Memory Region (MR) for mlx PMD, in order to: - Accommodate the new memory hotplug model. - Support non-contiguous Mempool. There are multiple layers for MR search. L0 is to look up the last-hit entry which is pointed by mr_ctrl->mru (Most Recently Used). If L0 misses, L1 is to look up the address in a fixed-sized array by linear search. L0/L1 is in an inline function - mlx4_mr_lookup_cache(). If L1 misses, the bottom-half function is called to look up the address from the bigger local cache of the queue. This is L2 - mlx4_mr_addr2mr_bh() and it is not an inline function. Data structure for L2 is the Binary Tree. If L2 misses, the search falls into the slowest path which takes locks in order to access global device cache (priv->mr.cache) which is also a B-tree and caches the original MR list (priv->mr.mr_list) of the device. Unless the global cache is overflowed, it is all-inclusive of the MR list. This is L3 - mlx4_mr_lookup_dev(). The size of the L3 cache table is limited and can't be expanded on the fly due to deadlock. Refer to the comments in the code for the details - mr_lookup_dev(). If L3 is overflowed, the list will have to be searched directly bypassing the cache although it is slower. If L3 misses, a new MR for the address should be created - mlx4_mr_create(). When it creates a new MR, it tries to register adjacent memsegs as much as possible which are virtually contiguous around the address. This must take two locks - memory_hotplug_lock and priv->mr.rwlock. Due to memory_hotplug_lock, there can't be any allocation/free of memory inside. In the free callback of the memory hotplug event, freed space is searched from the MR list and corresponding bits are cleared from the bitmap of MRs. This can fragment a MR and the MR will have multiple search entries in the caches. Once there's a change by the event, the global cache must be rebuilt and all the per-queue caches will be flushed as well. If memory is frequently freed in run-time, that may cause jitter on dataplane processing in the worst case by incurring MR cache flush and rebuild. But, it would be the least probable scenario. To guarantee the most optimal performance, it is highly recommended to use an EAL option - '--socket-mem'. Then, the reserved memory will be pinned and won't be freed dynamically. And it is also recommended to configure per-lcore cache of Mempool. Even though there're many MRs for a device or MRs are highly fragmented, the cache of Mempool will be much helpful to reduce misses on per-queue caches anyway. '--legacy-mem' is also supported. Signed-off-by: Yongseok Koh <yskoh@mellanox.com>
2018-05-09 11:09:06 +00:00
#include <rte_rwlock.h>
#include "mlx4_mr.h"
#ifndef IBV_RX_HASH_INNER
/** This is not necessarily defined by supported RDMA core versions. */
#define IBV_RX_HASH_INNER (1ull << 31)
#endif /* IBV_RX_HASH_INNER */
/** Maximum number of simultaneous MAC addresses. This value is arbitrary. */
#define MLX4_MAX_MAC_ADDRESSES 128
/** Request send completion once in every 64 sends, might be less. */
#define MLX4_PMD_TX_PER_COMP_REQ 64
/** Maximum size for inline data. */
#define MLX4_PMD_MAX_INLINE 0
/** Fixed RSS hash key size in bytes. Cannot be modified. */
#define MLX4_RSS_HASH_KEY_SIZE 40
/** Interrupt alarm timeout value in microseconds. */
#define MLX4_INTR_ALARM_TIMEOUT 100000
/* Maximum packet headers size (L2+L3+L4) for TSO. */
#define MLX4_MAX_TSO_HEADER 192
/** Port parameter. */
#define MLX4_PMD_PORT_KVARG "port"
/** Enable extending memsegs when creating a MR. */
#define MLX4_MR_EXT_MEMSEG_EN_KVARG "mr_ext_memseg_en"
enum {
PCI_VENDOR_ID_MELLANOX = 0x15b3,
};
enum {
PCI_DEVICE_ID_MELLANOX_CONNECTX3 = 0x1003,
PCI_DEVICE_ID_MELLANOX_CONNECTX3VF = 0x1004,
PCI_DEVICE_ID_MELLANOX_CONNECTX3PRO = 0x1007,
};
/* Request types for IPC. */
enum mlx4_mp_req_type {
MLX4_MP_REQ_VERBS_CMD_FD = 1,
MLX4_MP_REQ_CREATE_MR,
MLX4_MP_REQ_START_RXTX,
MLX4_MP_REQ_STOP_RXTX,
};
/* Pameters for IPC. */
struct mlx4_mp_param {
enum mlx4_mp_req_type type;
int port_id;
int result;
RTE_STD_C11
union {
uintptr_t addr; /* MLX4_MP_REQ_CREATE_MR */
} args;
};
/** Request timeout for IPC. */
#define MLX4_MP_REQ_TIMEOUT_SEC 5
/** Key string for IPC. */
#define MLX4_MP_NAME "net_mlx4_mp"
/** Driver name reported to lower layers and used in log output. */
#define MLX4_DRIVER_NAME "net_mlx4"
struct mlx4_drop;
struct mlx4_rss;
struct rxq;
struct txq;
struct rte_flow;
/**
* Type of object being allocated.
*/
enum mlx4_verbs_alloc_type {
MLX4_VERBS_ALLOC_TYPE_NONE,
MLX4_VERBS_ALLOC_TYPE_TX_QUEUE,
MLX4_VERBS_ALLOC_TYPE_RX_QUEUE,
};
/**
* Verbs allocator needs a context to know in the callback which kind of
* resources it is allocating.
*/
struct mlx4_verbs_alloc_ctx {
int enabled;
enum mlx4_verbs_alloc_type type; /* Kind of object being allocated. */
const void *obj; /* Pointer to the DPDK object. */
};
LIST_HEAD(mlx4_dev_list, mlx4_priv);
net/mlx4: add new memory region support This is the new design of Memory Region (MR) for mlx PMD, in order to: - Accommodate the new memory hotplug model. - Support non-contiguous Mempool. There are multiple layers for MR search. L0 is to look up the last-hit entry which is pointed by mr_ctrl->mru (Most Recently Used). If L0 misses, L1 is to look up the address in a fixed-sized array by linear search. L0/L1 is in an inline function - mlx4_mr_lookup_cache(). If L1 misses, the bottom-half function is called to look up the address from the bigger local cache of the queue. This is L2 - mlx4_mr_addr2mr_bh() and it is not an inline function. Data structure for L2 is the Binary Tree. If L2 misses, the search falls into the slowest path which takes locks in order to access global device cache (priv->mr.cache) which is also a B-tree and caches the original MR list (priv->mr.mr_list) of the device. Unless the global cache is overflowed, it is all-inclusive of the MR list. This is L3 - mlx4_mr_lookup_dev(). The size of the L3 cache table is limited and can't be expanded on the fly due to deadlock. Refer to the comments in the code for the details - mr_lookup_dev(). If L3 is overflowed, the list will have to be searched directly bypassing the cache although it is slower. If L3 misses, a new MR for the address should be created - mlx4_mr_create(). When it creates a new MR, it tries to register adjacent memsegs as much as possible which are virtually contiguous around the address. This must take two locks - memory_hotplug_lock and priv->mr.rwlock. Due to memory_hotplug_lock, there can't be any allocation/free of memory inside. In the free callback of the memory hotplug event, freed space is searched from the MR list and corresponding bits are cleared from the bitmap of MRs. This can fragment a MR and the MR will have multiple search entries in the caches. Once there's a change by the event, the global cache must be rebuilt and all the per-queue caches will be flushed as well. If memory is frequently freed in run-time, that may cause jitter on dataplane processing in the worst case by incurring MR cache flush and rebuild. But, it would be the least probable scenario. To guarantee the most optimal performance, it is highly recommended to use an EAL option - '--socket-mem'. Then, the reserved memory will be pinned and won't be freed dynamically. And it is also recommended to configure per-lcore cache of Mempool. Even though there're many MRs for a device or MRs are highly fragmented, the cache of Mempool will be much helpful to reduce misses on per-queue caches anyway. '--legacy-mem' is also supported. Signed-off-by: Yongseok Koh <yskoh@mellanox.com>
2018-05-09 11:09:06 +00:00
LIST_HEAD(mlx4_mr_list, mlx4_mr);
/* Shared data between primary and secondary processes. */
struct mlx4_shared_data {
rte_spinlock_t lock;
/* Global spinlock for primary and secondary processes. */
int init_done; /* Whether primary has done initialization. */
unsigned int secondary_cnt; /* Number of secondary processes init'd. */
struct mlx4_dev_list mem_event_cb_list;
rte_rwlock_t mem_event_rwlock;
};
/* Per-process data structure, not visible to other processes. */
struct mlx4_local_data {
int init_done; /* Whether a secondary has done initialization. */
};
extern struct mlx4_shared_data *mlx4_shared_data;
/* Per-process private structure. */
struct mlx4_proc_priv {
size_t uar_table_sz;
/* Size of UAR register table. */
void *uar_table[];
/* Table of UAR registers for each process. */
};
#define MLX4_PROC_PRIV(port_id) \
((struct mlx4_proc_priv *)rte_eth_devices[port_id].process_private)
/** Private data structure. */
struct mlx4_priv {
LIST_ENTRY(mlx4_priv) mem_event_cb;
/**< Called by memory event callback. */
struct rte_eth_dev_data *dev_data; /* Pointer to device data. */
struct ibv_context *ctx; /**< Verbs context. */
struct ibv_device_attr device_attr; /**< Device properties. */
struct ibv_pd *pd; /**< Protection Domain. */
/* Device properties. */
unsigned int if_index; /**< Associated network device index */
uint16_t mtu; /**< Configured MTU. */
uint8_t port; /**< Physical port number. */
uint32_t started:1; /**< Device started, flows enabled. */
uint32_t vf:1; /**< This is a VF device. */
uint32_t intr_alarm:1; /**< An interrupt alarm is scheduled. */
uint32_t isolated:1; /**< Toggle isolated mode. */
uint32_t rss_init:1; /**< Common RSS context is initialized. */
uint32_t hw_csum:1; /**< Checksum offload is supported. */
uint32_t hw_csum_l2tun:1; /**< Checksum support for L2 tunnels. */
uint32_t hw_fcs_strip:1; /**< FCS stripping toggling is supported. */
uint32_t tso:1; /**< Transmit segmentation offload is supported. */
uint32_t mr_ext_memseg_en:1;
/** Whether memseg should be extended for MR creation. */
uint32_t tso_max_payload_sz; /**< Max supported TSO payload size. */
uint32_t hw_rss_max_qps; /**< Max Rx Queues supported by RSS. */
uint64_t hw_rss_sup; /**< Supported RSS hash fields (Verbs format). */
struct rte_intr_handle intr_handle; /**< Port interrupt handle. */
struct mlx4_drop *drop; /**< Shared resources for drop flow rules. */
net/mlx4: add new memory region support This is the new design of Memory Region (MR) for mlx PMD, in order to: - Accommodate the new memory hotplug model. - Support non-contiguous Mempool. There are multiple layers for MR search. L0 is to look up the last-hit entry which is pointed by mr_ctrl->mru (Most Recently Used). If L0 misses, L1 is to look up the address in a fixed-sized array by linear search. L0/L1 is in an inline function - mlx4_mr_lookup_cache(). If L1 misses, the bottom-half function is called to look up the address from the bigger local cache of the queue. This is L2 - mlx4_mr_addr2mr_bh() and it is not an inline function. Data structure for L2 is the Binary Tree. If L2 misses, the search falls into the slowest path which takes locks in order to access global device cache (priv->mr.cache) which is also a B-tree and caches the original MR list (priv->mr.mr_list) of the device. Unless the global cache is overflowed, it is all-inclusive of the MR list. This is L3 - mlx4_mr_lookup_dev(). The size of the L3 cache table is limited and can't be expanded on the fly due to deadlock. Refer to the comments in the code for the details - mr_lookup_dev(). If L3 is overflowed, the list will have to be searched directly bypassing the cache although it is slower. If L3 misses, a new MR for the address should be created - mlx4_mr_create(). When it creates a new MR, it tries to register adjacent memsegs as much as possible which are virtually contiguous around the address. This must take two locks - memory_hotplug_lock and priv->mr.rwlock. Due to memory_hotplug_lock, there can't be any allocation/free of memory inside. In the free callback of the memory hotplug event, freed space is searched from the MR list and corresponding bits are cleared from the bitmap of MRs. This can fragment a MR and the MR will have multiple search entries in the caches. Once there's a change by the event, the global cache must be rebuilt and all the per-queue caches will be flushed as well. If memory is frequently freed in run-time, that may cause jitter on dataplane processing in the worst case by incurring MR cache flush and rebuild. But, it would be the least probable scenario. To guarantee the most optimal performance, it is highly recommended to use an EAL option - '--socket-mem'. Then, the reserved memory will be pinned and won't be freed dynamically. And it is also recommended to configure per-lcore cache of Mempool. Even though there're many MRs for a device or MRs are highly fragmented, the cache of Mempool will be much helpful to reduce misses on per-queue caches anyway. '--legacy-mem' is also supported. Signed-off-by: Yongseok Koh <yskoh@mellanox.com>
2018-05-09 11:09:06 +00:00
struct {
uint32_t dev_gen; /* Generation number to flush local caches. */
rte_rwlock_t rwlock; /* MR Lock. */
struct mlx4_mr_btree cache; /* Global MR cache table. */
struct mlx4_mr_list mr_list; /* Registered MR list. */
struct mlx4_mr_list mr_free_list; /* Freed MR list. */
} mr;
LIST_HEAD(, mlx4_rss) rss; /**< Shared targets for Rx flow rules. */
LIST_HEAD(, rte_flow) flows; /**< Configured flow rule handles. */
struct rte_ether_addr mac[MLX4_MAX_MAC_ADDRESSES];
/**< Configured MAC addresses. Unused entries are zeroed. */
uint32_t mac_mc; /**< Number of trailing multicast entries in mac[]. */
struct mlx4_verbs_alloc_ctx verbs_alloc_ctx;
/**< Context for Verbs allocator. */
};
#define PORT_ID(priv) ((priv)->dev_data->port_id)
#define ETH_DEV(priv) (&rte_eth_devices[PORT_ID(priv)])
/* mlx4_ethdev.c */
int mlx4_get_ifname(const struct mlx4_priv *priv, char (*ifname)[IF_NAMESIZE]);
net: add rte prefix to ether defines Add 'RTE_' prefix to defines: - rename ETHER_ADDR_LEN as RTE_ETHER_ADDR_LEN. - rename ETHER_TYPE_LEN as RTE_ETHER_TYPE_LEN. - rename ETHER_CRC_LEN as RTE_ETHER_CRC_LEN. - rename ETHER_HDR_LEN as RTE_ETHER_HDR_LEN. - rename ETHER_MIN_LEN as RTE_ETHER_MIN_LEN. - rename ETHER_MAX_LEN as RTE_ETHER_MAX_LEN. - rename ETHER_MTU as RTE_ETHER_MTU. - rename ETHER_MAX_VLAN_FRAME_LEN as RTE_ETHER_MAX_VLAN_FRAME_LEN. - rename ETHER_MAX_VLAN_ID as RTE_ETHER_MAX_VLAN_ID. - rename ETHER_MAX_JUMBO_FRAME_LEN as RTE_ETHER_MAX_JUMBO_FRAME_LEN. - rename ETHER_MIN_MTU as RTE_ETHER_MIN_MTU. - rename ETHER_LOCAL_ADMIN_ADDR as RTE_ETHER_LOCAL_ADMIN_ADDR. - rename ETHER_GROUP_ADDR as RTE_ETHER_GROUP_ADDR. - rename ETHER_TYPE_IPv4 as RTE_ETHER_TYPE_IPv4. - rename ETHER_TYPE_IPv6 as RTE_ETHER_TYPE_IPv6. - rename ETHER_TYPE_ARP as RTE_ETHER_TYPE_ARP. - rename ETHER_TYPE_VLAN as RTE_ETHER_TYPE_VLAN. - rename ETHER_TYPE_RARP as RTE_ETHER_TYPE_RARP. - rename ETHER_TYPE_QINQ as RTE_ETHER_TYPE_QINQ. - rename ETHER_TYPE_ETAG as RTE_ETHER_TYPE_ETAG. - rename ETHER_TYPE_1588 as RTE_ETHER_TYPE_1588. - rename ETHER_TYPE_SLOW as RTE_ETHER_TYPE_SLOW. - rename ETHER_TYPE_TEB as RTE_ETHER_TYPE_TEB. - rename ETHER_TYPE_LLDP as RTE_ETHER_TYPE_LLDP. - rename ETHER_TYPE_MPLS as RTE_ETHER_TYPE_MPLS. - rename ETHER_TYPE_MPLSM as RTE_ETHER_TYPE_MPLSM. - rename ETHER_VXLAN_HLEN as RTE_ETHER_VXLAN_HLEN. - rename ETHER_ADDR_FMT_SIZE as RTE_ETHER_ADDR_FMT_SIZE. - rename VXLAN_GPE_TYPE_IPV4 as RTE_VXLAN_GPE_TYPE_IPV4. - rename VXLAN_GPE_TYPE_IPV6 as RTE_VXLAN_GPE_TYPE_IPV6. - rename VXLAN_GPE_TYPE_ETH as RTE_VXLAN_GPE_TYPE_ETH. - rename VXLAN_GPE_TYPE_NSH as RTE_VXLAN_GPE_TYPE_NSH. - rename VXLAN_GPE_TYPE_MPLS as RTE_VXLAN_GPE_TYPE_MPLS. - rename VXLAN_GPE_TYPE_GBP as RTE_VXLAN_GPE_TYPE_GBP. - rename VXLAN_GPE_TYPE_VBNG as RTE_VXLAN_GPE_TYPE_VBNG. - rename ETHER_VXLAN_GPE_HLEN as RTE_ETHER_VXLAN_GPE_HLEN. Do not update the command line library to avoid adding a dependency to librte_net. Signed-off-by: Olivier Matz <olivier.matz@6wind.com> Reviewed-by: Stephen Hemminger <stephen@networkplumber.org> Reviewed-by: Maxime Coquelin <maxime.coquelin@redhat.com> Reviewed-by: Ferruh Yigit <ferruh.yigit@intel.com>
2019-05-21 16:13:05 +00:00
int mlx4_get_mac(struct mlx4_priv *priv, uint8_t (*mac)[RTE_ETHER_ADDR_LEN]);
int mlx4_mtu_get(struct mlx4_priv *priv, uint16_t *mtu);
int mlx4_mtu_set(struct rte_eth_dev *dev, uint16_t mtu);
int mlx4_dev_set_link_down(struct rte_eth_dev *dev);
int mlx4_dev_set_link_up(struct rte_eth_dev *dev);
int mlx4_promiscuous_enable(struct rte_eth_dev *dev);
int mlx4_promiscuous_disable(struct rte_eth_dev *dev);
int mlx4_allmulticast_enable(struct rte_eth_dev *dev);
int mlx4_allmulticast_disable(struct rte_eth_dev *dev);
void mlx4_mac_addr_remove(struct rte_eth_dev *dev, uint32_t index);
int mlx4_mac_addr_add(struct rte_eth_dev *dev, struct rte_ether_addr *mac_addr,
uint32_t index, uint32_t vmdq);
int mlx4_mac_addr_set(struct rte_eth_dev *dev, struct rte_ether_addr *mac_addr);
int mlx4_set_mc_addr_list(struct rte_eth_dev *dev, struct rte_ether_addr *list,
uint32_t num);
int mlx4_vlan_filter_set(struct rte_eth_dev *dev, uint16_t vlan_id, int on);
int mlx4_stats_get(struct rte_eth_dev *dev, struct rte_eth_stats *stats);
int mlx4_stats_reset(struct rte_eth_dev *dev);
int mlx4_fw_version_get(struct rte_eth_dev *dev, char *fw_ver, size_t fw_size);
int mlx4_dev_infos_get(struct rte_eth_dev *dev,
struct rte_eth_dev_info *info);
int mlx4_link_update(struct rte_eth_dev *dev, int wait_to_complete);
int mlx4_flow_ctrl_get(struct rte_eth_dev *dev,
struct rte_eth_fc_conf *fc_conf);
int mlx4_flow_ctrl_set(struct rte_eth_dev *dev,
struct rte_eth_fc_conf *fc_conf);
const uint32_t *mlx4_dev_supported_ptypes_get(struct rte_eth_dev *dev);
int mlx4_is_removed(struct rte_eth_dev *dev);
/* mlx4_intr.c */
int mlx4_intr_uninstall(struct mlx4_priv *priv);
int mlx4_intr_install(struct mlx4_priv *priv);
int mlx4_rxq_intr_enable(struct mlx4_priv *priv);
void mlx4_rxq_intr_disable(struct mlx4_priv *priv);
int mlx4_rx_intr_disable(struct rte_eth_dev *dev, uint16_t idx);
int mlx4_rx_intr_enable(struct rte_eth_dev *dev, uint16_t idx);
/* mlx4_mp.c */
void mlx4_mp_req_start_rxtx(struct rte_eth_dev *dev);
void mlx4_mp_req_stop_rxtx(struct rte_eth_dev *dev);
int mlx4_mp_req_mr_create(struct rte_eth_dev *dev, uintptr_t addr);
int mlx4_mp_req_verbs_cmd_fd(struct rte_eth_dev *dev);
int mlx4_mp_init_primary(void);
void mlx4_mp_uninit_primary(void);
int mlx4_mp_init_secondary(void);
void mlx4_mp_uninit_secondary(void);
#endif /* RTE_PMD_MLX4_H_ */