Yongseok Koh 9797bfcce1 net/mlx4: add new memory region support
This is the new design of Memory Region (MR) for mlx PMD, in order to:
- Accommodate the new memory hotplug model.
- Support non-contiguous Mempool.

There are multiple layers for MR search.

L0 is to look up the last-hit entry which is pointed by mr_ctrl->mru (Most
Recently Used). If L0 misses, L1 is to look up the address in a fixed-sized
array by linear search. L0/L1 is in an inline function -
mlx4_mr_lookup_cache().

If L1 misses, the bottom-half function is called to look up the address
from the bigger local cache of the queue. This is L2 - mlx4_mr_addr2mr_bh()
and it is not an inline function. Data structure for L2 is the Binary Tree.

If L2 misses, the search falls into the slowest path which takes locks in
order to access global device cache (priv->mr.cache) which is also a B-tree
and caches the original MR list (priv->mr.mr_list) of the device. Unless
the global cache is overflowed, it is all-inclusive of the MR list. This is
L3 - mlx4_mr_lookup_dev(). The size of the L3 cache table is limited and
can't be expanded on the fly due to deadlock. Refer to the comments in the
code for the details - mr_lookup_dev(). If L3 is overflowed, the list will
have to be searched directly bypassing the cache although it is slower.

If L3 misses, a new MR for the address should be created -
mlx4_mr_create(). When it creates a new MR, it tries to register adjacent
memsegs as much as possible which are virtually contiguous around the
address. This must take two locks - memory_hotplug_lock and
priv->mr.rwlock. Due to memory_hotplug_lock, there can't be any
allocation/free of memory inside.

In the free callback of the memory hotplug event, freed space is searched
from the MR list and corresponding bits are cleared from the bitmap of MRs.
This can fragment a MR and the MR will have multiple search entries in the
caches. Once there's a change by the event, the global cache must be
rebuilt and all the per-queue caches will be flushed as well. If memory is
frequently freed in run-time, that may cause jitter on dataplane processing
in the worst case by incurring MR cache flush and rebuild. But, it would be
the least probable scenario.

To guarantee the most optimal performance, it is highly recommended to use
an EAL option - '--socket-mem'. Then, the reserved memory will be pinned
and won't be freed dynamically. And it is also recommended to configure
per-lcore cache of Mempool. Even though there're many MRs for a device or
MRs are highly fragmented, the cache of Mempool will be much helpful to
reduce misses on per-queue caches anyway.

'--legacy-mem' is also supported.

Signed-off-by: Yongseok Koh <yskoh@mellanox.com>
2018-05-14 22:31:52 +01:00

148 lines
5.2 KiB
C

/* SPDX-License-Identifier: BSD-3-Clause
* Copyright 2012 6WIND S.A.
* Copyright 2012 Mellanox Technologies, Ltd
*/
#ifndef RTE_PMD_MLX4_H_
#define RTE_PMD_MLX4_H_
#include <net/if.h>
#include <stdint.h>
#include <sys/queue.h>
/* Verbs headers do not support -pedantic. */
#ifdef PEDANTIC
#pragma GCC diagnostic ignored "-Wpedantic"
#endif
#include <infiniband/verbs.h>
#ifdef PEDANTIC
#pragma GCC diagnostic error "-Wpedantic"
#endif
#include <rte_ethdev_driver.h>
#include <rte_ether.h>
#include <rte_interrupts.h>
#include <rte_mempool.h>
#include <rte_rwlock.h>
#include "mlx4_mr.h"
#ifndef IBV_RX_HASH_INNER
/** This is not necessarily defined by supported RDMA core versions. */
#define IBV_RX_HASH_INNER (1ull << 31)
#endif /* IBV_RX_HASH_INNER */
/** Maximum number of simultaneous MAC addresses. This value is arbitrary. */
#define MLX4_MAX_MAC_ADDRESSES 128
/** Request send completion once in every 64 sends, might be less. */
#define MLX4_PMD_TX_PER_COMP_REQ 64
/** Maximum size for inline data. */
#define MLX4_PMD_MAX_INLINE 0
/** Fixed RSS hash key size in bytes. Cannot be modified. */
#define MLX4_RSS_HASH_KEY_SIZE 40
/** Interrupt alarm timeout value in microseconds. */
#define MLX4_INTR_ALARM_TIMEOUT 100000
/** Port parameter. */
#define MLX4_PMD_PORT_KVARG "port"
enum {
PCI_VENDOR_ID_MELLANOX = 0x15b3,
};
enum {
PCI_DEVICE_ID_MELLANOX_CONNECTX3 = 0x1003,
PCI_DEVICE_ID_MELLANOX_CONNECTX3VF = 0x1004,
PCI_DEVICE_ID_MELLANOX_CONNECTX3PRO = 0x1007,
};
/** Driver name reported to lower layers and used in log output. */
#define MLX4_DRIVER_NAME "net_mlx4"
struct mlx4_drop;
struct mlx4_rss;
struct rxq;
struct txq;
struct rte_flow;
LIST_HEAD(mlx4_dev_list, priv);
LIST_HEAD(mlx4_mr_list, mlx4_mr);
/** Private data structure. */
struct priv {
LIST_ENTRY(priv) mem_event_cb; /* Called by memory event callback. */
struct rte_eth_dev *dev; /**< Ethernet device. */
struct ibv_context *ctx; /**< Verbs context. */
struct ibv_device_attr device_attr; /**< Device properties. */
struct ibv_pd *pd; /**< Protection Domain. */
/* Device properties. */
uint16_t mtu; /**< Configured MTU. */
uint8_t port; /**< Physical port number. */
uint32_t started:1; /**< Device started, flows enabled. */
uint32_t vf:1; /**< This is a VF device. */
uint32_t intr_alarm:1; /**< An interrupt alarm is scheduled. */
uint32_t isolated:1; /**< Toggle isolated mode. */
uint32_t rss_init:1; /**< Common RSS context is initialized. */
uint32_t hw_csum:1; /**< Checksum offload is supported. */
uint32_t hw_csum_l2tun:1; /**< Checksum support for L2 tunnels. */
uint32_t hw_fcs_strip:1; /**< FCS stripping toggling is supported. */
uint64_t hw_rss_sup; /**< Supported RSS hash fields (Verbs format). */
struct rte_intr_handle intr_handle; /**< Port interrupt handle. */
struct mlx4_drop *drop; /**< Shared resources for drop flow rules. */
struct {
uint32_t dev_gen; /* Generation number to flush local caches. */
rte_rwlock_t rwlock; /* MR Lock. */
struct mlx4_mr_btree cache; /* Global MR cache table. */
struct mlx4_mr_list mr_list; /* Registered MR list. */
struct mlx4_mr_list mr_free_list; /* Freed MR list. */
} mr;
LIST_HEAD(, mlx4_rss) rss; /**< Shared targets for Rx flow rules. */
LIST_HEAD(, rte_flow) flows; /**< Configured flow rule handles. */
struct ether_addr mac[MLX4_MAX_MAC_ADDRESSES];
/**< Configured MAC addresses. Unused entries are zeroed. */
};
/* mlx4_ethdev.c */
int mlx4_get_ifname(const struct priv *priv, char (*ifname)[IF_NAMESIZE]);
int mlx4_get_mac(struct priv *priv, uint8_t (*mac)[ETHER_ADDR_LEN]);
int mlx4_mtu_get(struct priv *priv, uint16_t *mtu);
int mlx4_mtu_set(struct rte_eth_dev *dev, uint16_t mtu);
int mlx4_dev_set_link_down(struct rte_eth_dev *dev);
int mlx4_dev_set_link_up(struct rte_eth_dev *dev);
void mlx4_promiscuous_enable(struct rte_eth_dev *dev);
void mlx4_promiscuous_disable(struct rte_eth_dev *dev);
void mlx4_allmulticast_enable(struct rte_eth_dev *dev);
void mlx4_allmulticast_disable(struct rte_eth_dev *dev);
void mlx4_mac_addr_remove(struct rte_eth_dev *dev, uint32_t index);
int mlx4_mac_addr_add(struct rte_eth_dev *dev, struct ether_addr *mac_addr,
uint32_t index, uint32_t vmdq);
int mlx4_mac_addr_set(struct rte_eth_dev *dev, struct ether_addr *mac_addr);
int mlx4_vlan_filter_set(struct rte_eth_dev *dev, uint16_t vlan_id, int on);
int mlx4_stats_get(struct rte_eth_dev *dev, struct rte_eth_stats *stats);
void mlx4_stats_reset(struct rte_eth_dev *dev);
void mlx4_dev_infos_get(struct rte_eth_dev *dev,
struct rte_eth_dev_info *info);
int mlx4_link_update(struct rte_eth_dev *dev, int wait_to_complete);
int mlx4_flow_ctrl_get(struct rte_eth_dev *dev,
struct rte_eth_fc_conf *fc_conf);
int mlx4_flow_ctrl_set(struct rte_eth_dev *dev,
struct rte_eth_fc_conf *fc_conf);
const uint32_t *mlx4_dev_supported_ptypes_get(struct rte_eth_dev *dev);
int mlx4_is_removed(struct rte_eth_dev *dev);
/* mlx4_intr.c */
int mlx4_intr_uninstall(struct priv *priv);
int mlx4_intr_install(struct priv *priv);
int mlx4_rxq_intr_enable(struct priv *priv);
void mlx4_rxq_intr_disable(struct priv *priv);
int mlx4_rx_intr_disable(struct rte_eth_dev *dev, uint16_t idx);
int mlx4_rx_intr_enable(struct rte_eth_dev *dev, uint16_t idx);
#endif /* RTE_PMD_MLX4_H_ */