numam-dpdk/drivers/net/mlx5/mlx5.h

948 lines
37 KiB
C
Raw Normal View History

/* SPDX-License-Identifier: BSD-3-Clause
* Copyright 2015 6WIND S.A.
* Copyright 2015 Mellanox Technologies, Ltd
*/
#ifndef RTE_PMD_MLX5_H_
#define RTE_PMD_MLX5_H_
#include <stddef.h>
#include <stdbool.h>
#include <stdint.h>
#include <limits.h>
#include <net/if.h>
#include <netinet/in.h>
#include <sys/queue.h>
/* Verbs header. */
/* ISO C doesn't support unnamed structs/unions, disabling -pedantic. */
#ifdef PEDANTIC
#pragma GCC diagnostic ignored "-Wpedantic"
#endif
#include <infiniband/verbs.h>
#ifdef PEDANTIC
#pragma GCC diagnostic error "-Wpedantic"
#endif
#include <rte_pci.h>
#include <rte_ether.h>
#include <rte_ethdev_driver.h>
net/mlx5: add new memory region support This is the new design of Memory Region (MR) for mlx PMD, in order to: - Accommodate the new memory hotplug model. - Support non-contiguous Mempool. There are multiple layers for MR search. L0 is to look up the last-hit entry which is pointed by mr_ctrl->mru (Most Recently Used). If L0 misses, L1 is to look up the address in a fixed-sized array by linear search. L0/L1 is in an inline function - mlx5_mr_lookup_cache(). If L1 misses, the bottom-half function is called to look up the address from the bigger local cache of the queue. This is L2 - mlx5_mr_addr2mr_bh() and it is not an inline function. Data structure for L2 is the Binary Tree. If L2 misses, the search falls into the slowest path which takes locks in order to access global device cache (priv->mr.cache) which is also a B-tree and caches the original MR list (priv->mr.mr_list) of the device. Unless the global cache is overflowed, it is all-inclusive of the MR list. This is L3 - mlx5_mr_lookup_dev(). The size of the L3 cache table is limited and can't be expanded on the fly due to deadlock. Refer to the comments in the code for the details - mr_lookup_dev(). If L3 is overflowed, the list will have to be searched directly bypassing the cache although it is slower. If L3 misses, a new MR for the address should be created - mlx5_mr_create(). When it creates a new MR, it tries to register adjacent memsegs as much as possible which are virtually contiguous around the address. This must take two locks - memory_hotplug_lock and priv->mr.rwlock. Due to memory_hotplug_lock, there can't be any allocation/free of memory inside. In the free callback of the memory hotplug event, freed space is searched from the MR list and corresponding bits are cleared from the bitmap of MRs. This can fragment a MR and the MR will have multiple search entries in the caches. Once there's a change by the event, the global cache must be rebuilt and all the per-queue caches will be flushed as well. If memory is frequently freed in run-time, that may cause jitter on dataplane processing in the worst case by incurring MR cache flush and rebuild. But, it would be the least probable scenario. To guarantee the most optimal performance, it is highly recommended to use an EAL option - '--socket-mem'. Then, the reserved memory will be pinned and won't be freed dynamically. And it is also recommended to configure per-lcore cache of Mempool. Even though there're many MRs for a device or MRs are highly fragmented, the cache of Mempool will be much helpful to reduce misses on per-queue caches anyway. '--legacy-mem' is also supported. Signed-off-by: Yongseok Koh <yskoh@mellanox.com>
2018-05-09 11:09:04 +00:00
#include <rte_rwlock.h>
#include <rte_interrupts.h>
#include <rte_errno.h>
#include <rte_flow.h>
#include <mlx5_glue.h>
#include <mlx5_devx_cmds.h>
#include <mlx5_prm.h>
#include <mlx5_nl.h>
#include <mlx5_common_mp.h>
#include <mlx5_common_mr.h>
#include "mlx5_defs.h"
#include "mlx5_utils.h"
#include "mlx5_os.h"
#include "mlx5_autoconf.h"
enum mlx5_ipool_index {
#ifdef HAVE_IBV_FLOW_DV_SUPPORT
MLX5_IPOOL_DECAP_ENCAP = 0, /* Pool for encap/decap resource. */
MLX5_IPOOL_PUSH_VLAN, /* Pool for push vlan resource. */
MLX5_IPOOL_TAG, /* Pool for tag resource. */
MLX5_IPOOL_PORT_ID, /* Pool for port id resource. */
MLX5_IPOOL_JUMP, /* Pool for jump resource. */
#endif
MLX5_IPOOL_MTR, /* Pool for meter resource. */
MLX5_IPOOL_MCP, /* Pool for metadata resource. */
MLX5_IPOOL_HRXQ, /* Pool for hrxq resource. */
MLX5_IPOOL_MLX5_FLOW, /* Pool for mlx5 flow handle. */
MLX5_IPOOL_RTE_FLOW, /* Pool for rte_flow. */
MLX5_IPOOL_MAX,
};
net/mlx5: add reclaim memory mode Currently, when flow destroyed, some memory resources may still be kept as cached to help next time create flow more efficiently. Some system may need the resources to be more flexible with flow create and destroy. After peak time, with millions of flows destroyed, the system would prefer the resources to be reclaimed completely, no cache is needed. Then the resources can be allocated and used by other components. The system is not so sensitive about the flow insertion rate, but more care about the resources. Both DPDK mlx5 PMD driver and the low level component rdma-core have provided the flow resources to be configured cached or not, but there is no APIs or parameters exposed to user to configure the flow resources cache mode. In this case, introduce a new PMD devarg to let user configure the flow resources cache mode will be helpful. This commit is to add a new "reclaim_mem_mode" to help user configure if the destroyed flows' cache resources should be kept or not. Their will be three mode can be chosen: 1. 0(none). It means the flow resources will be cached as usual. The resources will be cached, helpful with flow insertion rate. 2. 1(light). It will only enable the DPDK PMD level resources reclaim. 3. 2(aggressive). Both DPDK PMD level and rdma-core low level will be configured as reclaimed mode. With these three mode, user can configure the resources cache mode with different levels. Signed-off-by: Suanming Mou <suanmingm@mellanox.com> Acked-by: Viacheslav Ovsiienko <viacheslavo@mellanox.com>
2020-06-01 06:09:43 +00:00
/*
* There are three reclaim memory mode supported.
* 0(none) means no memory reclaim.
* 1(light) means only PMD level reclaim.
* 2(aggressive) means both PMD and rdma-core level reclaim.
*/
enum mlx5_reclaim_mem_mode {
MLX5_RCM_NONE, /* Don't reclaim memory. */
MLX5_RCM_LIGHT, /* Reclaim PMD level. */
MLX5_RCM_AGGR, /* Reclaim PMD and rdma-core level. */
};
/* Device attributes used in mlx5 PMD */
struct mlx5_dev_attr {
uint64_t device_cap_flags_ex;
int max_qp_wr;
int max_sge;
int max_cq;
int max_qp;
uint32_t raw_packet_caps;
uint32_t max_rwq_indirection_table_size;
uint32_t max_tso;
uint32_t tso_supported_qpts;
uint64_t flags;
uint64_t comp_mask;
uint32_t sw_parsing_offloads;
uint32_t min_single_stride_log_num_of_bytes;
uint32_t max_single_stride_log_num_of_bytes;
uint32_t min_single_wqe_log_num_of_strides;
uint32_t max_single_wqe_log_num_of_strides;
uint32_t stride_supported_qpts;
uint32_t tunnel_offloads_caps;
char fw_ver[64];
};
/** Data associated with devices to spawn. */
struct mlx5_dev_spawn_data {
uint32_t ifindex; /**< Network interface index. */
uint32_t max_port; /**< Device maximal port index. */
uint32_t phys_port; /**< Device physical port index. */
int pf_bond; /**< bonding device PF index. < 0 - no bonding */
struct mlx5_switch_info info; /**< Switch information. */
void *phys_dev; /**< Associated physical device. */
struct rte_eth_dev *eth_dev; /**< Associated Ethernet device. */
struct rte_pci_device *pci_dev; /**< Backend PCI device. */
};
/** Key string for IPC. */
#define MLX5_MP_NAME "net_mlx5_mp"
LIST_HEAD(mlx5_dev_list, mlx5_dev_ctx_shared);
net/mlx5: add new memory region support This is the new design of Memory Region (MR) for mlx PMD, in order to: - Accommodate the new memory hotplug model. - Support non-contiguous Mempool. There are multiple layers for MR search. L0 is to look up the last-hit entry which is pointed by mr_ctrl->mru (Most Recently Used). If L0 misses, L1 is to look up the address in a fixed-sized array by linear search. L0/L1 is in an inline function - mlx5_mr_lookup_cache(). If L1 misses, the bottom-half function is called to look up the address from the bigger local cache of the queue. This is L2 - mlx5_mr_addr2mr_bh() and it is not an inline function. Data structure for L2 is the Binary Tree. If L2 misses, the search falls into the slowest path which takes locks in order to access global device cache (priv->mr.cache) which is also a B-tree and caches the original MR list (priv->mr.mr_list) of the device. Unless the global cache is overflowed, it is all-inclusive of the MR list. This is L3 - mlx5_mr_lookup_dev(). The size of the L3 cache table is limited and can't be expanded on the fly due to deadlock. Refer to the comments in the code for the details - mr_lookup_dev(). If L3 is overflowed, the list will have to be searched directly bypassing the cache although it is slower. If L3 misses, a new MR for the address should be created - mlx5_mr_create(). When it creates a new MR, it tries to register adjacent memsegs as much as possible which are virtually contiguous around the address. This must take two locks - memory_hotplug_lock and priv->mr.rwlock. Due to memory_hotplug_lock, there can't be any allocation/free of memory inside. In the free callback of the memory hotplug event, freed space is searched from the MR list and corresponding bits are cleared from the bitmap of MRs. This can fragment a MR and the MR will have multiple search entries in the caches. Once there's a change by the event, the global cache must be rebuilt and all the per-queue caches will be flushed as well. If memory is frequently freed in run-time, that may cause jitter on dataplane processing in the worst case by incurring MR cache flush and rebuild. But, it would be the least probable scenario. To guarantee the most optimal performance, it is highly recommended to use an EAL option - '--socket-mem'. Then, the reserved memory will be pinned and won't be freed dynamically. And it is also recommended to configure per-lcore cache of Mempool. Even though there're many MRs for a device or MRs are highly fragmented, the cache of Mempool will be much helpful to reduce misses on per-queue caches anyway. '--legacy-mem' is also supported. Signed-off-by: Yongseok Koh <yskoh@mellanox.com>
2018-05-09 11:09:04 +00:00
/* Shared data between primary and secondary processes. */
net/mlx5: add new memory region support This is the new design of Memory Region (MR) for mlx PMD, in order to: - Accommodate the new memory hotplug model. - Support non-contiguous Mempool. There are multiple layers for MR search. L0 is to look up the last-hit entry which is pointed by mr_ctrl->mru (Most Recently Used). If L0 misses, L1 is to look up the address in a fixed-sized array by linear search. L0/L1 is in an inline function - mlx5_mr_lookup_cache(). If L1 misses, the bottom-half function is called to look up the address from the bigger local cache of the queue. This is L2 - mlx5_mr_addr2mr_bh() and it is not an inline function. Data structure for L2 is the Binary Tree. If L2 misses, the search falls into the slowest path which takes locks in order to access global device cache (priv->mr.cache) which is also a B-tree and caches the original MR list (priv->mr.mr_list) of the device. Unless the global cache is overflowed, it is all-inclusive of the MR list. This is L3 - mlx5_mr_lookup_dev(). The size of the L3 cache table is limited and can't be expanded on the fly due to deadlock. Refer to the comments in the code for the details - mr_lookup_dev(). If L3 is overflowed, the list will have to be searched directly bypassing the cache although it is slower. If L3 misses, a new MR for the address should be created - mlx5_mr_create(). When it creates a new MR, it tries to register adjacent memsegs as much as possible which are virtually contiguous around the address. This must take two locks - memory_hotplug_lock and priv->mr.rwlock. Due to memory_hotplug_lock, there can't be any allocation/free of memory inside. In the free callback of the memory hotplug event, freed space is searched from the MR list and corresponding bits are cleared from the bitmap of MRs. This can fragment a MR and the MR will have multiple search entries in the caches. Once there's a change by the event, the global cache must be rebuilt and all the per-queue caches will be flushed as well. If memory is frequently freed in run-time, that may cause jitter on dataplane processing in the worst case by incurring MR cache flush and rebuild. But, it would be the least probable scenario. To guarantee the most optimal performance, it is highly recommended to use an EAL option - '--socket-mem'. Then, the reserved memory will be pinned and won't be freed dynamically. And it is also recommended to configure per-lcore cache of Mempool. Even though there're many MRs for a device or MRs are highly fragmented, the cache of Mempool will be much helpful to reduce misses on per-queue caches anyway. '--legacy-mem' is also supported. Signed-off-by: Yongseok Koh <yskoh@mellanox.com>
2018-05-09 11:09:04 +00:00
struct mlx5_shared_data {
rte_spinlock_t lock;
/* Global spinlock for primary and secondary processes. */
int init_done; /* Whether primary has done initialization. */
unsigned int secondary_cnt; /* Number of secondary processes init'd. */
net/mlx5: add new memory region support This is the new design of Memory Region (MR) for mlx PMD, in order to: - Accommodate the new memory hotplug model. - Support non-contiguous Mempool. There are multiple layers for MR search. L0 is to look up the last-hit entry which is pointed by mr_ctrl->mru (Most Recently Used). If L0 misses, L1 is to look up the address in a fixed-sized array by linear search. L0/L1 is in an inline function - mlx5_mr_lookup_cache(). If L1 misses, the bottom-half function is called to look up the address from the bigger local cache of the queue. This is L2 - mlx5_mr_addr2mr_bh() and it is not an inline function. Data structure for L2 is the Binary Tree. If L2 misses, the search falls into the slowest path which takes locks in order to access global device cache (priv->mr.cache) which is also a B-tree and caches the original MR list (priv->mr.mr_list) of the device. Unless the global cache is overflowed, it is all-inclusive of the MR list. This is L3 - mlx5_mr_lookup_dev(). The size of the L3 cache table is limited and can't be expanded on the fly due to deadlock. Refer to the comments in the code for the details - mr_lookup_dev(). If L3 is overflowed, the list will have to be searched directly bypassing the cache although it is slower. If L3 misses, a new MR for the address should be created - mlx5_mr_create(). When it creates a new MR, it tries to register adjacent memsegs as much as possible which are virtually contiguous around the address. This must take two locks - memory_hotplug_lock and priv->mr.rwlock. Due to memory_hotplug_lock, there can't be any allocation/free of memory inside. In the free callback of the memory hotplug event, freed space is searched from the MR list and corresponding bits are cleared from the bitmap of MRs. This can fragment a MR and the MR will have multiple search entries in the caches. Once there's a change by the event, the global cache must be rebuilt and all the per-queue caches will be flushed as well. If memory is frequently freed in run-time, that may cause jitter on dataplane processing in the worst case by incurring MR cache flush and rebuild. But, it would be the least probable scenario. To guarantee the most optimal performance, it is highly recommended to use an EAL option - '--socket-mem'. Then, the reserved memory will be pinned and won't be freed dynamically. And it is also recommended to configure per-lcore cache of Mempool. Even though there're many MRs for a device or MRs are highly fragmented, the cache of Mempool will be much helpful to reduce misses on per-queue caches anyway. '--legacy-mem' is also supported. Signed-off-by: Yongseok Koh <yskoh@mellanox.com>
2018-05-09 11:09:04 +00:00
struct mlx5_dev_list mem_event_cb_list;
rte_rwlock_t mem_event_rwlock;
};
/* Per-process data structure, not visible to other processes. */
struct mlx5_local_data {
int init_done; /* Whether a secondary has done initialization. */
};
net/mlx5: add new memory region support This is the new design of Memory Region (MR) for mlx PMD, in order to: - Accommodate the new memory hotplug model. - Support non-contiguous Mempool. There are multiple layers for MR search. L0 is to look up the last-hit entry which is pointed by mr_ctrl->mru (Most Recently Used). If L0 misses, L1 is to look up the address in a fixed-sized array by linear search. L0/L1 is in an inline function - mlx5_mr_lookup_cache(). If L1 misses, the bottom-half function is called to look up the address from the bigger local cache of the queue. This is L2 - mlx5_mr_addr2mr_bh() and it is not an inline function. Data structure for L2 is the Binary Tree. If L2 misses, the search falls into the slowest path which takes locks in order to access global device cache (priv->mr.cache) which is also a B-tree and caches the original MR list (priv->mr.mr_list) of the device. Unless the global cache is overflowed, it is all-inclusive of the MR list. This is L3 - mlx5_mr_lookup_dev(). The size of the L3 cache table is limited and can't be expanded on the fly due to deadlock. Refer to the comments in the code for the details - mr_lookup_dev(). If L3 is overflowed, the list will have to be searched directly bypassing the cache although it is slower. If L3 misses, a new MR for the address should be created - mlx5_mr_create(). When it creates a new MR, it tries to register adjacent memsegs as much as possible which are virtually contiguous around the address. This must take two locks - memory_hotplug_lock and priv->mr.rwlock. Due to memory_hotplug_lock, there can't be any allocation/free of memory inside. In the free callback of the memory hotplug event, freed space is searched from the MR list and corresponding bits are cleared from the bitmap of MRs. This can fragment a MR and the MR will have multiple search entries in the caches. Once there's a change by the event, the global cache must be rebuilt and all the per-queue caches will be flushed as well. If memory is frequently freed in run-time, that may cause jitter on dataplane processing in the worst case by incurring MR cache flush and rebuild. But, it would be the least probable scenario. To guarantee the most optimal performance, it is highly recommended to use an EAL option - '--socket-mem'. Then, the reserved memory will be pinned and won't be freed dynamically. And it is also recommended to configure per-lcore cache of Mempool. Even though there're many MRs for a device or MRs are highly fragmented, the cache of Mempool will be much helpful to reduce misses on per-queue caches anyway. '--legacy-mem' is also supported. Signed-off-by: Yongseok Koh <yskoh@mellanox.com>
2018-05-09 11:09:04 +00:00
extern struct mlx5_shared_data *mlx5_shared_data;
extern struct rte_pci_driver mlx5_driver;
/* Dev ops structs */
extern const struct eth_dev_ops mlx5_os_dev_ops;
extern const struct eth_dev_ops mlx5_os_dev_sec_ops;
extern const struct eth_dev_ops mlx5_os_dev_ops_isolate;
net/mlx5: add new memory region support This is the new design of Memory Region (MR) for mlx PMD, in order to: - Accommodate the new memory hotplug model. - Support non-contiguous Mempool. There are multiple layers for MR search. L0 is to look up the last-hit entry which is pointed by mr_ctrl->mru (Most Recently Used). If L0 misses, L1 is to look up the address in a fixed-sized array by linear search. L0/L1 is in an inline function - mlx5_mr_lookup_cache(). If L1 misses, the bottom-half function is called to look up the address from the bigger local cache of the queue. This is L2 - mlx5_mr_addr2mr_bh() and it is not an inline function. Data structure for L2 is the Binary Tree. If L2 misses, the search falls into the slowest path which takes locks in order to access global device cache (priv->mr.cache) which is also a B-tree and caches the original MR list (priv->mr.mr_list) of the device. Unless the global cache is overflowed, it is all-inclusive of the MR list. This is L3 - mlx5_mr_lookup_dev(). The size of the L3 cache table is limited and can't be expanded on the fly due to deadlock. Refer to the comments in the code for the details - mr_lookup_dev(). If L3 is overflowed, the list will have to be searched directly bypassing the cache although it is slower. If L3 misses, a new MR for the address should be created - mlx5_mr_create(). When it creates a new MR, it tries to register adjacent memsegs as much as possible which are virtually contiguous around the address. This must take two locks - memory_hotplug_lock and priv->mr.rwlock. Due to memory_hotplug_lock, there can't be any allocation/free of memory inside. In the free callback of the memory hotplug event, freed space is searched from the MR list and corresponding bits are cleared from the bitmap of MRs. This can fragment a MR and the MR will have multiple search entries in the caches. Once there's a change by the event, the global cache must be rebuilt and all the per-queue caches will be flushed as well. If memory is frequently freed in run-time, that may cause jitter on dataplane processing in the worst case by incurring MR cache flush and rebuild. But, it would be the least probable scenario. To guarantee the most optimal performance, it is highly recommended to use an EAL option - '--socket-mem'. Then, the reserved memory will be pinned and won't be freed dynamically. And it is also recommended to configure per-lcore cache of Mempool. Even though there're many MRs for a device or MRs are highly fragmented, the cache of Mempool will be much helpful to reduce misses on per-queue caches anyway. '--legacy-mem' is also supported. Signed-off-by: Yongseok Koh <yskoh@mellanox.com>
2018-05-09 11:09:04 +00:00
struct mlx5_counter_ctrl {
/* Name of the counter. */
char dpdk_name[RTE_ETH_XSTATS_NAME_SIZE];
/* Name of the counter on the device table. */
char ctr_name[RTE_ETH_XSTATS_NAME_SIZE];
uint32_t dev:1; /**< Nonzero for dev counters. */
};
struct mlx5_xstats_ctrl {
/* Number of device stats. */
uint16_t stats_n;
/* Number of device stats identified by PMD. */
uint16_t mlx5_stats_n;
/* Index in the device counters table. */
uint16_t dev_table_idx[MLX5_MAX_XSTATS];
uint64_t base[MLX5_MAX_XSTATS];
uint64_t xstats[MLX5_MAX_XSTATS];
uint64_t hw_stats[MLX5_MAX_XSTATS];
struct mlx5_counter_ctrl info[MLX5_MAX_XSTATS];
};
struct mlx5_stats_ctrl {
/* Base for imissed counter. */
uint64_t imissed_base;
uint64_t imissed;
};
/* Default PMD specific parameter value. */
#define MLX5_ARG_UNSET (-1)
#define MLX5_LRO_SUPPORTED(dev) \
(((struct mlx5_priv *)((dev)->data->dev_private))->config.lro.supported)
/* Maximal size of coalesced segment for LRO is set in chunks of 256 Bytes. */
#define MLX5_LRO_SEG_CHUNK_SIZE 256u
/* Maximal size of aggregated LRO packet. */
#define MLX5_MAX_LRO_SIZE (UINT8_MAX * MLX5_LRO_SEG_CHUNK_SIZE)
/* LRO configurations structure. */
struct mlx5_lro_config {
uint32_t supported:1; /* Whether LRO is supported. */
uint32_t timeout; /* User configuration. */
};
/*
* Device configuration structure.
*
* Merged configuration from:
*
* - Device capabilities,
* - User device parameters disabled features.
*/
struct mlx5_dev_config {
unsigned int hw_csum:1; /* Checksum offload is supported. */
unsigned int hw_vlan_strip:1; /* VLAN stripping is supported. */
unsigned int hw_vlan_insert:1; /* VLAN insertion in WQE is supported. */
unsigned int hw_fcs_strip:1; /* FCS stripping is supported. */
unsigned int hw_padding:1; /* End alignment padding is supported. */
unsigned int vf:1; /* This is a VF. */
unsigned int tunnel_en:1;
/* Whether tunnel stateless offloads are supported. */
unsigned int mpls_en:1; /* MPLS over GRE/UDP is enabled. */
unsigned int cqe_comp:1; /* CQE compression is enabled. */
unsigned int cqe_pad:1; /* CQE padding is enabled. */
unsigned int tso:1; /* Whether TSO is supported. */
unsigned int rx_vec_en:1; /* Rx vector is enabled. */
unsigned int mr_ext_memseg_en:1;
/* Whether memseg should be extended for MR creation. */
unsigned int l3_vxlan_en:1; /* Enable L3 VXLAN flow creation. */
unsigned int vf_nl_en:1; /* Enable Netlink requests in VF mode. */
unsigned int dv_esw_en:1; /* Enable E-Switch DV flow. */
unsigned int dv_flow_en:1; /* Enable DV flow. */
net/mlx5: add devarg for extensive metadata support The PMD parameter dv_xmeta_en is added to control extensive metadata support. A nonzero value enables extensive flow metadata support if device is capable and driver supports it. This can enable extensive support of MARK and META item of rte_flow. The newly introduced SET_TAG and SET_META actions do not depend on dv_xmeta_en parameter, because there is no compatibility issue for new entities. The dv_xmeta_en is disabled by default. There are some possible configurations, depending on parameter value: - 0, this is default value, defines the legacy mode, the MARK and META related actions and items operate only within NIC Tx and NIC Rx steering domains, no MARK and META information crosses the domain boundaries. The MARK item is 24 bits wide, the META item is 32 bits wide. - 1, this engages extensive metadata mode, the MARK and META related actions and items operate within all supported steering domains, including FDB, MARK and META information may cross the domain boundaries. The ``MARK`` item is 24 bits wide, the META item width depends on kernel and firmware configurations and might be 0, 16 or 32 bits. Within NIC Tx domain META data width is 32 bits for compatibility, the actual width of data transferred to the FDB domain depends on kernel configuration and may be vary. The actual supported width can be retrieved in runtime by series of rte_flow_validate() trials. - 2, this engages extensive metadata mode, the MARK and META related actions and items operate within all supported steering domains, including FDB, MARK and META information may cross the domain boundaries. The META item is 32 bits wide, the MARK item width depends on kernel and firmware configurations and might be 0, 16 or 24 bits. The actual supported width can be retrieved in runtime by series of rte_flow_validate() trials. If there is no E-Switch configuration the ``dv_xmeta_en`` parameter is ignored and the device is configured to operate in legacy mode (0). Signed-off-by: Yongseok Koh <yskoh@mellanox.com> Signed-off-by: Viacheslav Ovsiienko <viacheslavo@mellanox.com> Acked-by: Matan Azrad <matan@mellanox.com>
2019-11-07 17:09:54 +00:00
unsigned int dv_xmeta_en:2; /* Enable extensive flow metadata. */
unsigned int lacp_by_user:1;
/* Enable user to manage LACP traffic. */
unsigned int swp:1; /* Tx generic tunnel checksum and TSO offload. */
unsigned int devx:1; /* Whether devx interface is available or not. */
unsigned int dest_tir:1; /* Whether advanced DR API is available. */
net/mlx5: add reclaim memory mode Currently, when flow destroyed, some memory resources may still be kept as cached to help next time create flow more efficiently. Some system may need the resources to be more flexible with flow create and destroy. After peak time, with millions of flows destroyed, the system would prefer the resources to be reclaimed completely, no cache is needed. Then the resources can be allocated and used by other components. The system is not so sensitive about the flow insertion rate, but more care about the resources. Both DPDK mlx5 PMD driver and the low level component rdma-core have provided the flow resources to be configured cached or not, but there is no APIs or parameters exposed to user to configure the flow resources cache mode. In this case, introduce a new PMD devarg to let user configure the flow resources cache mode will be helpful. This commit is to add a new "reclaim_mem_mode" to help user configure if the destroyed flows' cache resources should be kept or not. Their will be three mode can be chosen: 1. 0(none). It means the flow resources will be cached as usual. The resources will be cached, helpful with flow insertion rate. 2. 1(light). It will only enable the DPDK PMD level resources reclaim. 3. 2(aggressive). Both DPDK PMD level and rdma-core low level will be configured as reclaimed mode. With these three mode, user can configure the resources cache mode with different levels. Signed-off-by: Suanming Mou <suanmingm@mellanox.com> Acked-by: Viacheslav Ovsiienko <viacheslavo@mellanox.com>
2020-06-01 06:09:43 +00:00
unsigned int reclaim_mode:2; /* Memory reclaim mode. */
struct {
unsigned int enabled:1; /* Whether MPRQ is enabled. */
unsigned int stride_num_n; /* Number of strides. */
unsigned int stride_size_n; /* Size of a stride. */
unsigned int min_stride_size_n; /* Min size of a stride. */
unsigned int max_stride_size_n; /* Max size of a stride. */
unsigned int max_memcpy_len;
/* Maximum packet size to memcpy Rx packets. */
unsigned int min_rxqs_num;
/* Rx queue count threshold to enable MPRQ. */
} mprq; /* Configurations for Multi-Packet RQ. */
int mps; /* Multi-packet send supported mode. */
net/mlx5: control transmit doorbell register mapping The rdma core library can map doorbell register in two ways, depending on the environment variable "MLX5_SHUT_UP_BF": - as regular cached memory, the variable is either missing or set to zero. This type of mapping may cause the significant doorbell register writing latency and requires explicit memory write barrier to mitigate this issue and prevent write combining. - as non-cached memory, the variable is present and set to not "0" value. This type of mapping may cause performance impact under heavy loading conditions but the explicit write memory barrier is not required and it may improve core performance. The new devarg is introduced "tx_db_nc", if this parameter is set to zero, the doorbell register is forced to be mapped to cached memory and requires explicit memory barrier after writing to. If "tx_db_nc" is set to non-zero value the doorbell will be mapped as non-cached memory, not requiring the memory barrier. If "tx_db_nc" is missing the behaviour will be defined by presence of "MLX5_SHUT_UP_BF" in environment. If variable is missed the default value zero will be set for ARM64 hosts and one for others. In run time the code checks the mapping type and provides the memory barrier after writing to tx doorbell register if it is needed. The mapping type is extracted directly from the uar_mmap_offset field in the queue properties. Fixes: 18a1c20044c0 ("net/mlx5: implement Tx burst template") Cc: stable@dpdk.org Signed-off-by: Viacheslav Ovsiienko <viacheslavo@mellanox.com> Acked-by: Matan Azrad <matan@mellanox.com>
2019-11-08 15:07:50 +00:00
int dbnc; /* Skip doorbell register write barrier. */
unsigned int flow_prio; /* Number of flow priorities. */
enum modify_reg flow_mreg_c[MLX5_MREG_C_NUM];
/* Availibility of mreg_c's. */
unsigned int tso_max_payload_sz; /* Maximum TCP payload for TSO. */
unsigned int ind_table_max_size; /* Maximum indirection table size. */
unsigned int max_dump_files_num; /* Maximum dump files per queue. */
unsigned int log_hp_size; /* Single hairpin queue data size in total. */
int txqs_inline; /* Queue number threshold for inlining. */
int txq_inline_min; /* Minimal amount of data bytes to inline. */
int txq_inline_max; /* Max packet size for inlining with SEND. */
int txq_inline_mpw; /* Max packet size for inlining with eMPW. */
int tx_pp; /* Timestamp scheduling granularity in nanoseconds. */
int tx_skew; /* Tx scheduling skew between WQE and data on wire. */
struct mlx5_hca_attr hca_attr; /* HCA attributes. */
struct mlx5_lro_config lro; /* LRO configuration. */
};
/**
* Type of object being allocated.
*/
enum mlx5_verbs_alloc_type {
MLX5_VERBS_ALLOC_TYPE_NONE,
MLX5_VERBS_ALLOC_TYPE_TX_QUEUE,
MLX5_VERBS_ALLOC_TYPE_RX_QUEUE,
};
net/mlx5: add workaround for VLAN in virtual machine On some virtual setups (particularly on ESXi) when we have SR-IOV and E-Switch enabled there is the problem to receive VLAN traffic on VF interfaces. The NIC driver in ESXi hypervisor does not setup E-Switch vport setting correctly and VLAN traffic targeted to VF is dropped. The patch provides the temporary workaround - if the rule containing the VLAN pattern is being installed for VF the VLAN network interface over VF is created, like the command does: ip link add link vf.if name mlx5.wa.1.100 type vlan id 100 The PMD in DPDK maintains the database of created VLAN interfaces for each existing VF and requested VLAN tags. When all of the RTE Flows using the given VLAN tag are removed the created VLAN interface with this VLAN tag is deleted. The name of created VLAN interface follows the format: evmlx.d1.d2, where d1 is VF interface ifindex, d2 - VLAN ifindex Implementation limitations: - mask in rules is ignored, rule must specify VLAN tags exactly, no wildcards (which are implemented by the masks) are allowed - virtual environment is detected via rte_hypervisor() call, and the type of hypervisor is checked. Currently we engage the workaround for ESXi and unrecognized hypervisors (which always happen on platforms other than x86 - it means workaround applied for the Flow over PCI VF). There are no confirmed data the other hypervisors (HyperV, Qemu) need this workaround, we are trying to reduce the list of configurations on those workaround should be applied. Signed-off-by: Viacheslav Ovsiienko <viacheslavo@mellanox.com> Acked-by: Matan Azrad <matan@mellanox.com> Acked-by: Shahaf Shuler <shahafs@mellanox.com>
2019-07-30 09:20:24 +00:00
/* Structure for VF VLAN workaround. */
struct mlx5_vf_vlan {
uint32_t tag:12;
uint32_t created:1;
};
/**
* Verbs allocator needs a context to know in the callback which kind of
* resources it is allocating.
*/
struct mlx5_verbs_alloc_ctx {
enum mlx5_verbs_alloc_type type; /* Kind of object being allocated. */
const void *obj; /* Pointer to the DPDK object. */
};
/* Flow drop context necessary due to Verbs API. */
struct mlx5_drop {
struct mlx5_hrxq *hrxq; /* Hash Rx queue queue. */
struct mlx5_rxq_obj *rxq; /* Rx queue object. */
};
net/mlx5: accelerate DV flow counter transactions The DevX interface exposes a new feature to the PMD that can allocate a batch of counters by one FW command. It can improve the flow transaction rate (with count action). Add a new counter pools mechanism to manage HW counters in the PMD. So, for each flow with counter creation the PMD will try to find a free counter in the PMD pools container and only if there is no a free counter, it will allocate a new DevX batch counters. Currently we cannot support batch counter for a group 0 flow, so create a 2 container types, one which allocates counters one by one and one which allocates X counters by the batch feature. The allocated counters objects are never released back to the HW assuming the flows maximum number will be close to the actual value of the flows number. Later, it can be updated, and dynamic release mechanism can be added. The counters are contained in pools, each pool with 512 counters. The pools are contained in counter containers according to the allocation resolution type - single or batch. The cache memory of the counters statistics is saved as raw data per pool. All the raw data memory is allocated for all the container in one memory allocation and is managed by counter_stats_mem_mng structure which registers all the raw memory to the HW. Each pool points to one raw data structure. The query operation is in pool resolution which updates all the pool counter raw data by one operation. Signed-off-by: Matan Azrad <matan@mellanox.com> Acked-by: Shahaf Shuler <shahafs@mellanox.com>
2019-07-16 14:34:53 +00:00
#define MLX5_COUNTERS_PER_POOL 512
#define MLX5_MAX_PENDING_QUERIES 4
#define MLX5_CNT_CONTAINER_RESIZE 64
#define MLX5_CNT_AGE_OFFSET 0x80000000
#define CNT_SIZE (sizeof(struct mlx5_flow_counter))
#define CNTEXT_SIZE (sizeof(struct mlx5_flow_counter_ext))
#define AGE_SIZE (sizeof(struct mlx5_age_param))
#define MLX5_AGING_TIME_DELAY 7
#define CNT_POOL_TYPE_EXT (1 << 0)
#define CNT_POOL_TYPE_AGE (1 << 1)
#define IS_EXT_POOL(pool) (((pool)->type) & CNT_POOL_TYPE_EXT)
#define IS_AGE_POOL(pool) (((pool)->type) & CNT_POOL_TYPE_AGE)
#define MLX_CNT_IS_AGE(counter) ((counter) & MLX5_CNT_AGE_OFFSET ? 1 : 0)
#define MLX5_CNT_LEN(pool) \
(CNT_SIZE + \
(IS_AGE_POOL(pool) ? AGE_SIZE : 0) + \
(IS_EXT_POOL(pool) ? CNTEXT_SIZE : 0))
#define MLX5_POOL_GET_CNT(pool, index) \
((struct mlx5_flow_counter *) \
((uint8_t *)((pool) + 1) + (index) * (MLX5_CNT_LEN(pool))))
#define MLX5_CNT_ARRAY_IDX(pool, cnt) \
((int)(((uint8_t *)(cnt) - (uint8_t *)((pool) + 1)) / \
MLX5_CNT_LEN(pool)))
/*
* The pool index and offset of counter in the pool array makes up the
* counter index. In case the counter is from pool 0 and offset 0, it
* should plus 1 to avoid index 0, since 0 means invalid counter index
* currently.
*/
#define MLX5_MAKE_CNT_IDX(pi, offset) \
((pi) * MLX5_COUNTERS_PER_POOL + (offset) + 1)
#define MLX5_CNT_TO_CNT_EXT(pool, cnt) \
((struct mlx5_flow_counter_ext *)\
((uint8_t *)((cnt) + 1) + \
(IS_AGE_POOL(pool) ? AGE_SIZE : 0)))
#define MLX5_GET_POOL_CNT_EXT(pool, offset) \
MLX5_CNT_TO_CNT_EXT(pool, MLX5_POOL_GET_CNT((pool), (offset)))
#define MLX5_CNT_TO_AGE(cnt) \
((struct mlx5_age_param *)((cnt) + 1))
/*
* The maximum single counter is 0x800000 as MLX5_CNT_BATCH_OFFSET
* defines. The pool size is 512, pool index should never reach
* INT16_MAX.
*/
#define POOL_IDX_INVALID UINT16_MAX
net/mlx5: accelerate DV flow counter transactions The DevX interface exposes a new feature to the PMD that can allocate a batch of counters by one FW command. It can improve the flow transaction rate (with count action). Add a new counter pools mechanism to manage HW counters in the PMD. So, for each flow with counter creation the PMD will try to find a free counter in the PMD pools container and only if there is no a free counter, it will allocate a new DevX batch counters. Currently we cannot support batch counter for a group 0 flow, so create a 2 container types, one which allocates counters one by one and one which allocates X counters by the batch feature. The allocated counters objects are never released back to the HW assuming the flows maximum number will be close to the actual value of the flows number. Later, it can be updated, and dynamic release mechanism can be added. The counters are contained in pools, each pool with 512 counters. The pools are contained in counter containers according to the allocation resolution type - single or batch. The cache memory of the counters statistics is saved as raw data per pool. All the raw data memory is allocated for all the container in one memory allocation and is managed by counter_stats_mem_mng structure which registers all the raw memory to the HW. Each pool points to one raw data structure. The query operation is in pool resolution which updates all the pool counter raw data by one operation. Signed-off-by: Matan Azrad <matan@mellanox.com> Acked-by: Shahaf Shuler <shahafs@mellanox.com>
2019-07-16 14:34:53 +00:00
struct mlx5_flow_counter_pool;
/*age status*/
enum {
AGE_FREE, /* Initialized state. */
AGE_CANDIDATE, /* Counter assigned to flows. */
AGE_TMOUT, /* Timeout, wait for rte_flow_get_aged_flows and destroy. */
};
#define MLX5_CNT_CONTAINER(sh, batch, age) (&(sh)->cmng.ccont \
[(batch) * 2 + (age)])
enum {
MLX5_CCONT_TYPE_SINGLE,
MLX5_CCONT_TYPE_SINGLE_FOR_AGE,
MLX5_CCONT_TYPE_BATCH,
MLX5_CCONT_TYPE_BATCH_FOR_AGE,
MLX5_CCONT_TYPE_MAX,
};
/* Counter age parameter. */
struct mlx5_age_param {
rte_atomic16_t state; /**< Age state. */
uint16_t port_id; /**< Port id of the counter. */
uint32_t timeout:15; /**< Age timeout in unit of 0.1sec. */
uint32_t expire:16; /**< Expire time(0.1sec) in the future. */
void *context; /**< Flow counter age context. */
};
net/mlx5: accelerate DV flow counter transactions The DevX interface exposes a new feature to the PMD that can allocate a batch of counters by one FW command. It can improve the flow transaction rate (with count action). Add a new counter pools mechanism to manage HW counters in the PMD. So, for each flow with counter creation the PMD will try to find a free counter in the PMD pools container and only if there is no a free counter, it will allocate a new DevX batch counters. Currently we cannot support batch counter for a group 0 flow, so create a 2 container types, one which allocates counters one by one and one which allocates X counters by the batch feature. The allocated counters objects are never released back to the HW assuming the flows maximum number will be close to the actual value of the flows number. Later, it can be updated, and dynamic release mechanism can be added. The counters are contained in pools, each pool with 512 counters. The pools are contained in counter containers according to the allocation resolution type - single or batch. The cache memory of the counters statistics is saved as raw data per pool. All the raw data memory is allocated for all the container in one memory allocation and is managed by counter_stats_mem_mng structure which registers all the raw memory to the HW. Each pool points to one raw data structure. The query operation is in pool resolution which updates all the pool counter raw data by one operation. Signed-off-by: Matan Azrad <matan@mellanox.com> Acked-by: Shahaf Shuler <shahafs@mellanox.com>
2019-07-16 14:34:53 +00:00
struct flow_counter_stats {
uint64_t hits;
uint64_t bytes;
};
struct mlx5_flow_counter_pool;
/* Generic counters information. */
net/mlx5: accelerate DV flow counter transactions The DevX interface exposes a new feature to the PMD that can allocate a batch of counters by one FW command. It can improve the flow transaction rate (with count action). Add a new counter pools mechanism to manage HW counters in the PMD. So, for each flow with counter creation the PMD will try to find a free counter in the PMD pools container and only if there is no a free counter, it will allocate a new DevX batch counters. Currently we cannot support batch counter for a group 0 flow, so create a 2 container types, one which allocates counters one by one and one which allocates X counters by the batch feature. The allocated counters objects are never released back to the HW assuming the flows maximum number will be close to the actual value of the flows number. Later, it can be updated, and dynamic release mechanism can be added. The counters are contained in pools, each pool with 512 counters. The pools are contained in counter containers according to the allocation resolution type - single or batch. The cache memory of the counters statistics is saved as raw data per pool. All the raw data memory is allocated for all the container in one memory allocation and is managed by counter_stats_mem_mng structure which registers all the raw memory to the HW. Each pool points to one raw data structure. The query operation is in pool resolution which updates all the pool counter raw data by one operation. Signed-off-by: Matan Azrad <matan@mellanox.com> Acked-by: Shahaf Shuler <shahafs@mellanox.com>
2019-07-16 14:34:53 +00:00
struct mlx5_flow_counter {
TAILQ_ENTRY(mlx5_flow_counter) next;
/**< Pointer to the next flow counter structure. */
union {
uint64_t hits; /**< Reset value of hits packets. */
struct mlx5_flow_counter_pool *pool; /**< Counter pool. */
};
uint64_t bytes; /**< Reset value of bytes. */
void *action; /**< Pointer to the dv action. */
};
/* Extend counters information for none batch counters. */
struct mlx5_flow_counter_ext {
net/mlx5: accelerate DV flow counter transactions The DevX interface exposes a new feature to the PMD that can allocate a batch of counters by one FW command. It can improve the flow transaction rate (with count action). Add a new counter pools mechanism to manage HW counters in the PMD. So, for each flow with counter creation the PMD will try to find a free counter in the PMD pools container and only if there is no a free counter, it will allocate a new DevX batch counters. Currently we cannot support batch counter for a group 0 flow, so create a 2 container types, one which allocates counters one by one and one which allocates X counters by the batch feature. The allocated counters objects are never released back to the HW assuming the flows maximum number will be close to the actual value of the flows number. Later, it can be updated, and dynamic release mechanism can be added. The counters are contained in pools, each pool with 512 counters. The pools are contained in counter containers according to the allocation resolution type - single or batch. The cache memory of the counters statistics is saved as raw data per pool. All the raw data memory is allocated for all the container in one memory allocation and is managed by counter_stats_mem_mng structure which registers all the raw memory to the HW. Each pool points to one raw data structure. The query operation is in pool resolution which updates all the pool counter raw data by one operation. Signed-off-by: Matan Azrad <matan@mellanox.com> Acked-by: Shahaf Shuler <shahafs@mellanox.com>
2019-07-16 14:34:53 +00:00
uint32_t shared:1; /**< Share counter ID with other flow rules. */
uint32_t batch: 1;
/**< Whether the counter was allocated by batch command. */
uint32_t ref_cnt:30; /**< Reference counter. */
uint32_t id; /**< User counter ID. */
net/mlx5: accelerate DV flow counter transactions The DevX interface exposes a new feature to the PMD that can allocate a batch of counters by one FW command. It can improve the flow transaction rate (with count action). Add a new counter pools mechanism to manage HW counters in the PMD. So, for each flow with counter creation the PMD will try to find a free counter in the PMD pools container and only if there is no a free counter, it will allocate a new DevX batch counters. Currently we cannot support batch counter for a group 0 flow, so create a 2 container types, one which allocates counters one by one and one which allocates X counters by the batch feature. The allocated counters objects are never released back to the HW assuming the flows maximum number will be close to the actual value of the flows number. Later, it can be updated, and dynamic release mechanism can be added. The counters are contained in pools, each pool with 512 counters. The pools are contained in counter containers according to the allocation resolution type - single or batch. The cache memory of the counters statistics is saved as raw data per pool. All the raw data memory is allocated for all the container in one memory allocation and is managed by counter_stats_mem_mng structure which registers all the raw memory to the HW. Each pool points to one raw data structure. The query operation is in pool resolution which updates all the pool counter raw data by one operation. Signed-off-by: Matan Azrad <matan@mellanox.com> Acked-by: Shahaf Shuler <shahafs@mellanox.com>
2019-07-16 14:34:53 +00:00
union { /**< Holds the counters for the rule. */
#if defined(HAVE_IBV_DEVICE_COUNTERS_SET_V42)
struct ibv_counter_set *cs;
#elif defined(HAVE_IBV_DEVICE_COUNTERS_SET_V45)
struct ibv_counters *cs;
#endif
struct mlx5_devx_obj *dcs; /**< Counter Devx object. */
};
};
TAILQ_HEAD(mlx5_counters, mlx5_flow_counter);
/* Generic counter pool structure - query is in pool resolution. */
net/mlx5: accelerate DV flow counter transactions The DevX interface exposes a new feature to the PMD that can allocate a batch of counters by one FW command. It can improve the flow transaction rate (with count action). Add a new counter pools mechanism to manage HW counters in the PMD. So, for each flow with counter creation the PMD will try to find a free counter in the PMD pools container and only if there is no a free counter, it will allocate a new DevX batch counters. Currently we cannot support batch counter for a group 0 flow, so create a 2 container types, one which allocates counters one by one and one which allocates X counters by the batch feature. The allocated counters objects are never released back to the HW assuming the flows maximum number will be close to the actual value of the flows number. Later, it can be updated, and dynamic release mechanism can be added. The counters are contained in pools, each pool with 512 counters. The pools are contained in counter containers according to the allocation resolution type - single or batch. The cache memory of the counters statistics is saved as raw data per pool. All the raw data memory is allocated for all the container in one memory allocation and is managed by counter_stats_mem_mng structure which registers all the raw memory to the HW. Each pool points to one raw data structure. The query operation is in pool resolution which updates all the pool counter raw data by one operation. Signed-off-by: Matan Azrad <matan@mellanox.com> Acked-by: Shahaf Shuler <shahafs@mellanox.com>
2019-07-16 14:34:53 +00:00
struct mlx5_flow_counter_pool {
TAILQ_ENTRY(mlx5_flow_counter_pool) next;
struct mlx5_counters counters[2]; /* Free counter list. */
union {
struct mlx5_devx_obj *min_dcs;
rte_atomic64_t a64_dcs;
};
/* The devx object of the minimum counter ID. */
uint32_t index:29; /* Pool index in container. */
uint32_t type:2; /* Memory type behind the counter array. */
volatile uint32_t query_gen:1; /* Query round. */
rte_spinlock_t sl; /* The pool lock. */
struct mlx5_counter_stats_raw *raw;
struct mlx5_counter_stats_raw *raw_hw; /* The raw on HW working. */
net/mlx5: accelerate DV flow counter transactions The DevX interface exposes a new feature to the PMD that can allocate a batch of counters by one FW command. It can improve the flow transaction rate (with count action). Add a new counter pools mechanism to manage HW counters in the PMD. So, for each flow with counter creation the PMD will try to find a free counter in the PMD pools container and only if there is no a free counter, it will allocate a new DevX batch counters. Currently we cannot support batch counter for a group 0 flow, so create a 2 container types, one which allocates counters one by one and one which allocates X counters by the batch feature. The allocated counters objects are never released back to the HW assuming the flows maximum number will be close to the actual value of the flows number. Later, it can be updated, and dynamic release mechanism can be added. The counters are contained in pools, each pool with 512 counters. The pools are contained in counter containers according to the allocation resolution type - single or batch. The cache memory of the counters statistics is saved as raw data per pool. All the raw data memory is allocated for all the container in one memory allocation and is managed by counter_stats_mem_mng structure which registers all the raw memory to the HW. Each pool points to one raw data structure. The query operation is in pool resolution which updates all the pool counter raw data by one operation. Signed-off-by: Matan Azrad <matan@mellanox.com> Acked-by: Shahaf Shuler <shahafs@mellanox.com>
2019-07-16 14:34:53 +00:00
};
struct mlx5_counter_stats_raw;
/* Memory management structure for group of counter statistics raws. */
struct mlx5_counter_stats_mem_mng {
LIST_ENTRY(mlx5_counter_stats_mem_mng) next;
struct mlx5_counter_stats_raw *raws;
struct mlx5_devx_obj *dm;
void *umem;
net/mlx5: accelerate DV flow counter transactions The DevX interface exposes a new feature to the PMD that can allocate a batch of counters by one FW command. It can improve the flow transaction rate (with count action). Add a new counter pools mechanism to manage HW counters in the PMD. So, for each flow with counter creation the PMD will try to find a free counter in the PMD pools container and only if there is no a free counter, it will allocate a new DevX batch counters. Currently we cannot support batch counter for a group 0 flow, so create a 2 container types, one which allocates counters one by one and one which allocates X counters by the batch feature. The allocated counters objects are never released back to the HW assuming the flows maximum number will be close to the actual value of the flows number. Later, it can be updated, and dynamic release mechanism can be added. The counters are contained in pools, each pool with 512 counters. The pools are contained in counter containers according to the allocation resolution type - single or batch. The cache memory of the counters statistics is saved as raw data per pool. All the raw data memory is allocated for all the container in one memory allocation and is managed by counter_stats_mem_mng structure which registers all the raw memory to the HW. Each pool points to one raw data structure. The query operation is in pool resolution which updates all the pool counter raw data by one operation. Signed-off-by: Matan Azrad <matan@mellanox.com> Acked-by: Shahaf Shuler <shahafs@mellanox.com>
2019-07-16 14:34:53 +00:00
};
/* Raw memory structure for the counter statistics values of a pool. */
struct mlx5_counter_stats_raw {
LIST_ENTRY(mlx5_counter_stats_raw) next;
int min_dcs_id;
struct mlx5_counter_stats_mem_mng *mem_mng;
volatile struct flow_counter_stats *data;
};
TAILQ_HEAD(mlx5_counter_pools, mlx5_flow_counter_pool);
/* Container structure for counter pools. */
struct mlx5_pools_container {
rte_atomic16_t n_valid; /* Number of valid pools. */
net/mlx5: accelerate DV flow counter transactions The DevX interface exposes a new feature to the PMD that can allocate a batch of counters by one FW command. It can improve the flow transaction rate (with count action). Add a new counter pools mechanism to manage HW counters in the PMD. So, for each flow with counter creation the PMD will try to find a free counter in the PMD pools container and only if there is no a free counter, it will allocate a new DevX batch counters. Currently we cannot support batch counter for a group 0 flow, so create a 2 container types, one which allocates counters one by one and one which allocates X counters by the batch feature. The allocated counters objects are never released back to the HW assuming the flows maximum number will be close to the actual value of the flows number. Later, it can be updated, and dynamic release mechanism can be added. The counters are contained in pools, each pool with 512 counters. The pools are contained in counter containers according to the allocation resolution type - single or batch. The cache memory of the counters statistics is saved as raw data per pool. All the raw data memory is allocated for all the container in one memory allocation and is managed by counter_stats_mem_mng structure which registers all the raw memory to the HW. Each pool points to one raw data structure. The query operation is in pool resolution which updates all the pool counter raw data by one operation. Signed-off-by: Matan Azrad <matan@mellanox.com> Acked-by: Shahaf Shuler <shahafs@mellanox.com>
2019-07-16 14:34:53 +00:00
uint16_t n; /* Number of pools. */
uint16_t last_pool_idx; /* Last used pool index */
int min_id; /* The minimum counter ID in the pools. */
int max_id; /* The maximum counter ID in the pools. */
rte_spinlock_t resize_sl; /* The resize lock. */
rte_spinlock_t csl; /* The counter free list lock. */
struct mlx5_counters counters; /* Free counter list. */
net/mlx5: accelerate DV flow counter transactions The DevX interface exposes a new feature to the PMD that can allocate a batch of counters by one FW command. It can improve the flow transaction rate (with count action). Add a new counter pools mechanism to manage HW counters in the PMD. So, for each flow with counter creation the PMD will try to find a free counter in the PMD pools container and only if there is no a free counter, it will allocate a new DevX batch counters. Currently we cannot support batch counter for a group 0 flow, so create a 2 container types, one which allocates counters one by one and one which allocates X counters by the batch feature. The allocated counters objects are never released back to the HW assuming the flows maximum number will be close to the actual value of the flows number. Later, it can be updated, and dynamic release mechanism can be added. The counters are contained in pools, each pool with 512 counters. The pools are contained in counter containers according to the allocation resolution type - single or batch. The cache memory of the counters statistics is saved as raw data per pool. All the raw data memory is allocated for all the container in one memory allocation and is managed by counter_stats_mem_mng structure which registers all the raw memory to the HW. Each pool points to one raw data structure. The query operation is in pool resolution which updates all the pool counter raw data by one operation. Signed-off-by: Matan Azrad <matan@mellanox.com> Acked-by: Shahaf Shuler <shahafs@mellanox.com>
2019-07-16 14:34:53 +00:00
struct mlx5_counter_pools pool_list; /* Counter pool list. */
struct mlx5_flow_counter_pool **pools; /* Counter pool array. */
struct mlx5_counter_stats_mem_mng *mem_mng;
net/mlx5: accelerate DV flow counter transactions The DevX interface exposes a new feature to the PMD that can allocate a batch of counters by one FW command. It can improve the flow transaction rate (with count action). Add a new counter pools mechanism to manage HW counters in the PMD. So, for each flow with counter creation the PMD will try to find a free counter in the PMD pools container and only if there is no a free counter, it will allocate a new DevX batch counters. Currently we cannot support batch counter for a group 0 flow, so create a 2 container types, one which allocates counters one by one and one which allocates X counters by the batch feature. The allocated counters objects are never released back to the HW assuming the flows maximum number will be close to the actual value of the flows number. Later, it can be updated, and dynamic release mechanism can be added. The counters are contained in pools, each pool with 512 counters. The pools are contained in counter containers according to the allocation resolution type - single or batch. The cache memory of the counters statistics is saved as raw data per pool. All the raw data memory is allocated for all the container in one memory allocation and is managed by counter_stats_mem_mng structure which registers all the raw memory to the HW. Each pool points to one raw data structure. The query operation is in pool resolution which updates all the pool counter raw data by one operation. Signed-off-by: Matan Azrad <matan@mellanox.com> Acked-by: Shahaf Shuler <shahafs@mellanox.com>
2019-07-16 14:34:53 +00:00
/* Hold the memory management for the next allocated pools raws. */
};
/* Counter global management structure. */
struct mlx5_flow_counter_mng {
struct mlx5_pools_container ccont[MLX5_CCONT_TYPE_MAX];
net/mlx5: accelerate DV flow counter transactions The DevX interface exposes a new feature to the PMD that can allocate a batch of counters by one FW command. It can improve the flow transaction rate (with count action). Add a new counter pools mechanism to manage HW counters in the PMD. So, for each flow with counter creation the PMD will try to find a free counter in the PMD pools container and only if there is no a free counter, it will allocate a new DevX batch counters. Currently we cannot support batch counter for a group 0 flow, so create a 2 container types, one which allocates counters one by one and one which allocates X counters by the batch feature. The allocated counters objects are never released back to the HW assuming the flows maximum number will be close to the actual value of the flows number. Later, it can be updated, and dynamic release mechanism can be added. The counters are contained in pools, each pool with 512 counters. The pools are contained in counter containers according to the allocation resolution type - single or batch. The cache memory of the counters statistics is saved as raw data per pool. All the raw data memory is allocated for all the container in one memory allocation and is managed by counter_stats_mem_mng structure which registers all the raw memory to the HW. Each pool points to one raw data structure. The query operation is in pool resolution which updates all the pool counter raw data by one operation. Signed-off-by: Matan Azrad <matan@mellanox.com> Acked-by: Shahaf Shuler <shahafs@mellanox.com>
2019-07-16 14:34:53 +00:00
struct mlx5_counters flow_counters; /* Legacy flow counter list. */
uint8_t pending_queries;
uint8_t batch;
uint16_t pool_index;
uint8_t age;
uint8_t query_thread_on;
net/mlx5: accelerate DV flow counter transactions The DevX interface exposes a new feature to the PMD that can allocate a batch of counters by one FW command. It can improve the flow transaction rate (with count action). Add a new counter pools mechanism to manage HW counters in the PMD. So, for each flow with counter creation the PMD will try to find a free counter in the PMD pools container and only if there is no a free counter, it will allocate a new DevX batch counters. Currently we cannot support batch counter for a group 0 flow, so create a 2 container types, one which allocates counters one by one and one which allocates X counters by the batch feature. The allocated counters objects are never released back to the HW assuming the flows maximum number will be close to the actual value of the flows number. Later, it can be updated, and dynamic release mechanism can be added. The counters are contained in pools, each pool with 512 counters. The pools are contained in counter containers according to the allocation resolution type - single or batch. The cache memory of the counters statistics is saved as raw data per pool. All the raw data memory is allocated for all the container in one memory allocation and is managed by counter_stats_mem_mng structure which registers all the raw memory to the HW. Each pool points to one raw data structure. The query operation is in pool resolution which updates all the pool counter raw data by one operation. Signed-off-by: Matan Azrad <matan@mellanox.com> Acked-by: Shahaf Shuler <shahafs@mellanox.com>
2019-07-16 14:34:53 +00:00
LIST_HEAD(mem_mngs, mlx5_counter_stats_mem_mng) mem_mngs;
LIST_HEAD(stat_raws, mlx5_counter_stats_raw) free_stat_raws;
net/mlx5: accelerate DV flow counter transactions The DevX interface exposes a new feature to the PMD that can allocate a batch of counters by one FW command. It can improve the flow transaction rate (with count action). Add a new counter pools mechanism to manage HW counters in the PMD. So, for each flow with counter creation the PMD will try to find a free counter in the PMD pools container and only if there is no a free counter, it will allocate a new DevX batch counters. Currently we cannot support batch counter for a group 0 flow, so create a 2 container types, one which allocates counters one by one and one which allocates X counters by the batch feature. The allocated counters objects are never released back to the HW assuming the flows maximum number will be close to the actual value of the flows number. Later, it can be updated, and dynamic release mechanism can be added. The counters are contained in pools, each pool with 512 counters. The pools are contained in counter containers according to the allocation resolution type - single or batch. The cache memory of the counters statistics is saved as raw data per pool. All the raw data memory is allocated for all the container in one memory allocation and is managed by counter_stats_mem_mng structure which registers all the raw memory to the HW. Each pool points to one raw data structure. The query operation is in pool resolution which updates all the pool counter raw data by one operation. Signed-off-by: Matan Azrad <matan@mellanox.com> Acked-by: Shahaf Shuler <shahafs@mellanox.com>
2019-07-16 14:34:53 +00:00
};
/* Default miss action resource structure. */
struct mlx5_flow_default_miss_resource {
void *action; /* Pointer to the rdma-core action. */
rte_atomic32_t refcnt; /* Default miss action reference counter. */
};
#define MLX5_AGE_EVENT_NEW 1
#define MLX5_AGE_TRIGGER 2
#define MLX5_AGE_SET(age_info, BIT) \
((age_info)->flags |= (1 << (BIT)))
#define MLX5_AGE_GET(age_info, BIT) \
((age_info)->flags & (1 << (BIT)))
#define GET_PORT_AGE_INFO(priv) \
(&((priv)->sh->port[(priv)->dev_port - 1].age_info))
/* Aging information for per port. */
struct mlx5_age_info {
uint8_t flags; /*Indicate if is new event or need be trigered*/
struct mlx5_counters aged_counters; /* Aged flow counter list. */
rte_spinlock_t aged_sl; /* Aged flow counter list lock. */
};
/* Per port data of shared IB device. */
struct mlx5_dev_shared_port {
uint32_t ih_port_id;
uint32_t devx_ih_port_id;
/*
* Interrupt handler port_id. Used by shared interrupt
* handler to find the corresponding rte_eth device
* by IB port index. If value is equal or greater
* RTE_MAX_ETHPORTS it means there is no subhandler
* installed for specified IB port index.
*/
struct mlx5_age_info age_info;
/* Aging information for per port. */
};
/* Table key of the hash organization. */
union mlx5_flow_tbl_key {
struct {
/* Table ID should be at the lowest address. */
uint32_t table_id; /**< ID of the table. */
uint16_t reserved; /**< must be zero for comparison. */
uint8_t domain; /**< 1 - FDB, 0 - NIC TX/RX. */
uint8_t direction; /**< 1 - egress, 0 - ingress. */
};
uint64_t v64; /**< full 64bits value of key */
};
/* Table structure. */
struct mlx5_flow_tbl_resource {
void *obj; /**< Pointer to DR table object. */
rte_atomic32_t refcnt; /**< Reference counter. */
};
#define MLX5_MAX_TABLES UINT16_MAX
#define MLX5_FLOW_TABLE_LEVEL_METER (UINT16_MAX - 3)
#define MLX5_FLOW_TABLE_LEVEL_SUFFIX (UINT16_MAX - 2)
#define MLX5_HAIRPIN_TX_TABLE (UINT16_MAX - 1)
/* Reserve the last two tables for metadata register copy. */
#define MLX5_FLOW_MREG_ACT_TABLE_GROUP (MLX5_MAX_TABLES - 1)
#define MLX5_FLOW_MREG_CP_TABLE_GROUP (MLX5_MAX_TABLES - 2)
/* Tables for metering splits should be added here. */
#define MLX5_MAX_TABLES_EXTERNAL (MLX5_MAX_TABLES - 3)
#define MLX5_MAX_TABLES_FDB UINT16_MAX
/* ID generation structure. */
struct mlx5_flow_id_pool {
uint32_t *free_arr; /**< Pointer to the a array of free values. */
uint32_t base_index;
/**< The next index that can be used without any free elements. */
uint32_t *curr; /**< Pointer to the index to pop. */
uint32_t *last; /**< Pointer to the last element in the empty arrray. */
uint32_t max_id; /**< Maximum id can be allocated from the pool. */
};
/*
* Shared Infiniband device context for Master/Representors
* which belong to same IB device with multiple IB ports.
**/
struct mlx5_dev_ctx_shared {
LIST_ENTRY(mlx5_dev_ctx_shared) next;
uint32_t refcnt;
uint32_t devx:1; /* Opened with DV. */
uint32_t max_port; /* Maximal IB device port index. */
void *ctx; /* Verbs/DV/DevX context. */
void *pd; /* Protection Domain. */
uint32_t pdn; /* Protection Domain number. */
uint32_t tdn; /* Transport Domain number. */
char ibdev_name[DEV_SYSFS_NAME_MAX]; /* SYSFS dev name. */
char ibdev_path[DEV_SYSFS_PATH_MAX]; /* SYSFS dev path for secondary */
struct mlx5_dev_attr device_attr; /* Device properties. */
LIST_ENTRY(mlx5_dev_ctx_shared) mem_event_cb;
/**< Called by memory event callback. */
struct mlx5_mr_share_cache share_cache;
/* Shared DV/DR flow data section. */
pthread_mutex_t dv_mutex; /* DV context mutex. */
uint32_t dv_meta_mask; /* flow META metadata supported mask. */
uint32_t dv_mark_mask; /* flow MARK metadata supported mask. */
uint32_t dv_regc0_mask; /* available bits of metatada reg_c[0]. */
uint32_t dv_refcnt; /* DV/DR data reference counter. */
void *fdb_domain; /* FDB Direct Rules name space handle. */
void *rx_domain; /* RX Direct Rules name space handle. */
void *tx_domain; /* TX Direct Rules name space handle. */
#ifndef RTE_ARCH_64
rte_spinlock_t uar_lock_cq; /* CQs share a common distinct UAR */
rte_spinlock_t uar_lock[MLX5_UAR_PAGE_NUM_MAX];
/* UAR same-page access control required in 32bit implementations. */
#endif
struct mlx5_hlist *flow_tbls;
/* Direct Rules tables for FDB, NIC TX+RX */
void *esw_drop_action; /* Pointer to DR E-Switch drop action. */
void *pop_vlan_action; /* Pointer to DR pop VLAN action. */
uint32_t encaps_decaps; /* Encap/decap action indexed memory list. */
LIST_HEAD(modify_cmd, mlx5_flow_dv_modify_hdr_resource) modify_cmds;
struct mlx5_hlist *tag_table;
uint32_t port_id_action_list; /* List of port ID actions. */
uint32_t push_vlan_action_list; /* List of push VLAN actions. */
net/mlx5: accelerate DV flow counter transactions The DevX interface exposes a new feature to the PMD that can allocate a batch of counters by one FW command. It can improve the flow transaction rate (with count action). Add a new counter pools mechanism to manage HW counters in the PMD. So, for each flow with counter creation the PMD will try to find a free counter in the PMD pools container and only if there is no a free counter, it will allocate a new DevX batch counters. Currently we cannot support batch counter for a group 0 flow, so create a 2 container types, one which allocates counters one by one and one which allocates X counters by the batch feature. The allocated counters objects are never released back to the HW assuming the flows maximum number will be close to the actual value of the flows number. Later, it can be updated, and dynamic release mechanism can be added. The counters are contained in pools, each pool with 512 counters. The pools are contained in counter containers according to the allocation resolution type - single or batch. The cache memory of the counters statistics is saved as raw data per pool. All the raw data memory is allocated for all the container in one memory allocation and is managed by counter_stats_mem_mng structure which registers all the raw memory to the HW. Each pool points to one raw data structure. The query operation is in pool resolution which updates all the pool counter raw data by one operation. Signed-off-by: Matan Azrad <matan@mellanox.com> Acked-by: Shahaf Shuler <shahafs@mellanox.com>
2019-07-16 14:34:53 +00:00
struct mlx5_flow_counter_mng cmng; /* Counters management structure. */
struct mlx5_flow_default_miss_resource default_miss;
/* Default miss action resource structure. */
struct mlx5_indexed_pool *ipool[MLX5_IPOOL_MAX];
/* Memory Pool for mlx5 flow resources. */
struct mlx5_l3t_tbl *cnt_id_tbl; /* Shared counter lookup table. */
/* Shared interrupt handler section. */
struct rte_intr_handle intr_handle; /* Interrupt handler for device. */
struct rte_intr_handle intr_handle_devx; /* DEVX interrupt handler. */
void *devx_comp; /* DEVX async comp obj. */
struct mlx5_devx_obj *tis; /* TIS object. */
struct mlx5_devx_obj *td; /* Transport domain. */
struct mlx5_flow_id_pool *flow_id_pool; /* Flow ID pool. */
struct mlx5dv_devx_uar *tx_uar; /* Tx/packer pacing shared UAR. */
struct mlx5_dev_shared_port port[]; /* per device port data array. */
};
/* Per-process private structure. */
struct mlx5_proc_priv {
size_t uar_table_sz;
/* Size of UAR register table. */
void *uar_table[];
/* Table of UAR registers for each process. */
};
/* MTR profile list. */
TAILQ_HEAD(mlx5_mtr_profiles, mlx5_flow_meter_profile);
/* MTR list. */
TAILQ_HEAD(mlx5_flow_meters, mlx5_flow_meter);
#define MLX5_PROC_PRIV(port_id) \
((struct mlx5_proc_priv *)rte_eth_devices[port_id].process_private)
struct mlx5_priv {
struct rte_eth_dev_data *dev_data; /* Pointer to device data. */
struct mlx5_dev_ctx_shared *sh; /* Shared device context. */
uint32_t dev_port; /* Device port number. */
struct rte_pci_device *pci_dev; /* Backend PCI device. */
struct rte_ether_addr mac[MLX5_MAX_MAC_ADDRESSES]; /* MAC addresses. */
BITFIELD_DECLARE(mac_own, uint64_t, MLX5_MAX_MAC_ADDRESSES);
/* Bit-field of MAC addresses owned by the PMD. */
uint16_t vlan_filter[MLX5_MAX_VLAN_IDS]; /* VLAN filters table. */
unsigned int vlan_filter_n; /* Number of configured VLAN filters. */
/* Device properties. */
uint16_t mtu; /* Configured MTU. */
unsigned int isolated:1; /* Whether isolated mode is enabled. */
unsigned int representor:1; /* Device is a port representor. */
unsigned int master:1; /* Device is a E-Switch master. */
unsigned int dr_shared:1; /* DV/DR data is shared. */
unsigned int counter_fallback:1; /* Use counter fallback management. */
unsigned int mtr_en:1; /* Whether support meter. */
unsigned int mtr_reg_share:1; /* Whether support meter REG_C share. */
uint16_t domain_id; /* Switch domain identifier. */
uint16_t vport_id; /* Associated VF vport index (if any). */
uint32_t vport_meta_tag; /* Used for vport index match ove VF LAG. */
uint32_t vport_meta_mask; /* Used for vport index field match mask. */
int32_t representor_id; /* Port representor identifier. */
int32_t pf_bond; /* >=0 means PF index in bonding configuration. */
unsigned int if_index; /* Associated kernel network device index. */
/* RX/TX queues. */
unsigned int rxqs_n; /* RX queues array size. */
unsigned int txqs_n; /* TX queues array size. */
struct mlx5_rxq_data *(*rxqs)[]; /* RX queues. */
struct mlx5_txq_data *(*txqs)[]; /* TX queues. */
struct rte_mempool *mprq_mp; /* Mempool for Multi-Packet RQ. */
struct rte_eth_rss_conf rss_conf; /* RSS configuration. */
unsigned int (*reta_idx)[]; /* RETA index table. */
unsigned int reta_idx_n; /* RETA index size. */
struct mlx5_drop drop_queue; /* Flow drop queues. */
uint32_t flows; /* RTE Flow rules. */
uint32_t ctrl_flows; /* Control flow rules. */
void *inter_flows; /* Intermediate resources for flow creation. */
void *rss_desc; /* Intermediate rss description resources. */
int flow_idx; /* Intermediate device flow index. */
int flow_nested_idx; /* Intermediate device flow index, nested. */
LIST_HEAD(rxq, mlx5_rxq_ctrl) rxqsctrl; /* DPDK Rx queues. */
LIST_HEAD(rxqobj, mlx5_rxq_obj) rxqsobj; /* Verbs/DevX Rx queues. */
uint32_t hrxqs; /* Verbs Hash Rx queues. */
LIST_HEAD(txq, mlx5_txq_ctrl) txqsctrl; /* DPDK Tx queues. */
LIST_HEAD(txqobj, mlx5_txq_obj) txqsobj; /* Verbs/DevX Tx queues. */
/* Indirection tables. */
LIST_HEAD(ind_tables, mlx5_ind_table_obj) ind_tbls;
/* Pointer to next element. */
rte_atomic32_t refcnt; /**< Reference counter. */
struct ibv_flow_action *verbs_action;
/**< Verbs modify header action object. */
uint8_t ft_type; /**< Flow table type, Rx or Tx. */
uint8_t max_lro_msg_size;
/* Tags resources cache. */
uint32_t link_speed_capa; /* Link speed capabilities. */
struct mlx5_xstats_ctrl xstats_ctrl; /* Extended stats control. */
struct mlx5_stats_ctrl stats_ctrl; /* Stats control. */
struct mlx5_dev_config config; /* Device configuration. */
struct mlx5_verbs_alloc_ctx verbs_alloc_ctx;
/* Context for Verbs allocator. */
int nl_socket_rdma; /* Netlink socket (NETLINK_RDMA). */
int nl_socket_route; /* Netlink socket (NETLINK_ROUTE). */
struct mlx5_dbr_page_list dbrpgs; /* Door-bell pages. */
struct mlx5_nl_vlan_vmwa_context *vmwa_context; /* VLAN WA context. */
net/mlx5: split Rx flows to provide metadata copy Values set by MARK and SET_META actions should be carried over to the VF representor in case of flow miss on Tx path. However, as not all metadata registers are preserved across the different domains (NIC Rx/Tx and E-Switch FDB), as a workaround, those values should be carried by reg_c's which are preserved across domains and copied to STE flow_tag (MARK) and reg_b (META) fields in the last stage of flow steering, in order to scatter those values to flow_tag and flow_table_metadata of CQE. While reg_c[meta] can be copied to reg_b simply by modify-header action (it is supported by hardware), it is not possible to copy reg_c[mark] to the STE flow_tag as flow_tag is not a metadata register and this is not supported by hardware. Instead, it should be manually set by a flow per MARK ID. For this purpose, there should be a dedicated flow table - RX_CP_TBL and all the Rx flow should pass by the table to properly copy values. As the last action of Rx flow steering must be a terminal action such as QUEUE, RSS or DROP, if a user flow has Q/RSS action, the flow must be split in order to pass by the RX_CP_TBL. And the remained Q/RSS action will be performed by another dedicated action table - RX_ACT_TBL. For example, for an ingress flow: pattern, actions_having_QRSS it must be split into two flows. The first one is, pattern, actions_except_QRSS / copy (reg_c[2] := flow_id) / jump to RX_CP_TBL and the second one in RX_ACT_TBL. (if reg_c[2] == flow_id), action_QRSS where flow_id is uniquely allocated and managed identifier. This patch implements the Rx flow splitting and build the RX_ACT_TBL. Also, per each egress flow on NIC Tx, a copy action (reg_c[]= reg_a) should be added in order to transfer metadata from WQE. Signed-off-by: Yongseok Koh <yskoh@mellanox.com> Signed-off-by: Viacheslav Ovsiienko <viacheslavo@mellanox.com> Acked-by: Matan Azrad <matan@mellanox.com>
2019-11-07 17:10:03 +00:00
struct mlx5_flow_id_pool *qrss_id_pool;
struct mlx5_hlist *mreg_cp_tbl;
/* Hash table of Rx metadata register copy table. */
uint8_t mtr_sfx_reg; /* Meter prefix-suffix flow match REG_C. */
uint8_t mtr_color_reg; /* Meter color match REG_C. */
struct mlx5_mtr_profiles flow_meter_profiles; /* MTR profile list. */
struct mlx5_flow_meters flow_meters; /* MTR list. */
uint8_t skip_default_rss_reta; /* Skip configuration of default reta. */
uint8_t fdb_def_rule; /* Whether fdb jump to table 1 is configured. */
struct mlx5_mp_id mp_id; /* ID of a multi-process process */
LIST_HEAD(fdir, mlx5_fdir_flow) fdir_flows; /* fdir flows. */
};
#define PORT_ID(priv) ((priv)->dev_data->port_id)
#define ETH_DEV(priv) (&rte_eth_devices[PORT_ID(priv)])
/* mlx5.c */
int mlx5_getenv_int(const char *);
int mlx5_proc_priv_init(struct rte_eth_dev *dev);
int mlx5_udp_tunnel_port_add(struct rte_eth_dev *dev,
struct rte_eth_udp_tunnel *udp_tunnel);
uint16_t mlx5_eth_find_next(uint16_t port_id, struct rte_pci_device *pci_dev);
void mlx5_dev_close(struct rte_eth_dev *dev);
/* Macro to iterate over all valid ports for mlx5 driver. */
#define MLX5_ETH_FOREACH_DEV(port_id, pci_dev) \
for (port_id = mlx5_eth_find_next(0, pci_dev); \
port_id < RTE_MAX_ETHPORTS; \
port_id = mlx5_eth_find_next(port_id + 1, pci_dev))
int mlx5_args(struct mlx5_dev_config *config, struct rte_devargs *devargs);
struct mlx5_dev_ctx_shared *
mlx5_alloc_shared_dev_ctx(const struct mlx5_dev_spawn_data *spawn,
const struct mlx5_dev_config *config);
void mlx5_free_shared_dev_ctx(struct mlx5_dev_ctx_shared *sh);
void mlx5_free_table_hash_list(struct mlx5_priv *priv);
int mlx5_alloc_table_hash_list(struct mlx5_priv *priv);
void mlx5_set_min_inline(struct mlx5_dev_spawn_data *spawn,
struct mlx5_dev_config *config);
void mlx5_set_metadata_mask(struct rte_eth_dev *dev);
int mlx5_dev_check_sibling_config(struct mlx5_priv *priv,
struct mlx5_dev_config *config);
int mlx5_init_once(void);
int mlx5_dev_configure(struct rte_eth_dev *dev);
int mlx5_dev_infos_get(struct rte_eth_dev *dev, struct rte_eth_dev_info *info);
int mlx5_fw_version_get(struct rte_eth_dev *dev, char *fw_ver, size_t fw_size);
int mlx5_dev_set_mtu(struct rte_eth_dev *dev, uint16_t mtu);
int mlx5_hairpin_cap_get(struct rte_eth_dev *dev,
struct rte_eth_hairpin_cap *cap);
/* mlx5_ethdev.c */
int mlx5_dev_configure(struct rte_eth_dev *dev);
int mlx5_fw_version_get(struct rte_eth_dev *dev, char *fw_ver,
size_t fw_size);
int mlx5_dev_infos_get(struct rte_eth_dev *dev,
struct rte_eth_dev_info *info);
const uint32_t *mlx5_dev_supported_ptypes_get(struct rte_eth_dev *dev);
int mlx5_dev_set_mtu(struct rte_eth_dev *dev, uint16_t mtu);
int mlx5_hairpin_cap_get(struct rte_eth_dev *dev,
struct rte_eth_hairpin_cap *cap);
/* mlx5_ethdev_os.c */
int mlx5_get_ifname(const struct rte_eth_dev *dev, char (*ifname)[IF_NAMESIZE]);
unsigned int mlx5_ifindex(const struct rte_eth_dev *dev);
int mlx5_ifreq(const struct rte_eth_dev *dev, int req, struct ifreq *ifr);
int mlx5_get_mtu(struct rte_eth_dev *dev, uint16_t *mtu);
int mlx5_set_flags(struct rte_eth_dev *dev, unsigned int keep,
unsigned int flags);
int mlx5_set_mtu(struct rte_eth_dev *dev, uint16_t mtu);
int mlx5_read_clock(struct rte_eth_dev *dev, uint64_t *clock);
int mlx5_link_update(struct rte_eth_dev *dev, int wait_to_complete);
int mlx5_force_link_status_change(struct rte_eth_dev *dev, int status);
int mlx5_dev_get_flow_ctrl(struct rte_eth_dev *dev,
struct rte_eth_fc_conf *fc_conf);
int mlx5_dev_set_flow_ctrl(struct rte_eth_dev *dev,
struct rte_eth_fc_conf *fc_conf);
void mlx5_dev_link_status_handler(void *arg);
void mlx5_dev_interrupt_handler(void *arg);
void mlx5_dev_interrupt_handler_devx(void *arg);
void mlx5_dev_interrupt_handler_uninstall(struct rte_eth_dev *dev);
void mlx5_dev_interrupt_handler_install(struct rte_eth_dev *dev);
int mlx5_set_link_down(struct rte_eth_dev *dev);
int mlx5_set_link_up(struct rte_eth_dev *dev);
int mlx5_is_removed(struct rte_eth_dev *dev);
eth_tx_burst_t mlx5_select_tx_function(struct rte_eth_dev *dev);
eth_rx_burst_t mlx5_select_rx_function(struct rte_eth_dev *dev);
struct mlx5_priv *mlx5_port_to_eswitch_info(uint16_t port, bool valid);
struct mlx5_priv *mlx5_dev_to_eswitch_info(struct rte_eth_dev *dev);
int mlx5_sysfs_switch_info(unsigned int ifindex,
struct mlx5_switch_info *info);
net/mlx5: support PF representor On BlueField platform we have the new entity - PF representor. This one represents the PCI PF attached to external host on the side of ARM. The traffic sent by the external host to the NIC via PF will be seem by ARM on this PF representor. This patch refactors port recognizing capability on the base of physical port name. We have two groups of name formats. Legacy name formats are supported by kernels before ver 5.0 (being more precise - before the patch [1]) or before Mellanox OFED 4.6, and new naming formats added by the patch [1]. Legacy naming formats are supported: - missing physical port name (no sysfs/netlink key) at all, master is assumed - decimal digits (for example "12"), representor is assumed, the value is the index of attached VF New naming formats are supported: - "p" followed by decimal digits, for example "p2", master is assumed - "pf" followed by PF index concatenated with "vf" followed by VF index, for example "pf0vf1", representor is assumed. If index of VF is "-1" it is a special case of host PF representor, this representor must be indexed in devargs as 65535, for example representor=[0-3,65535] will allow representors for VF0, VF1, VF2, VF3 and for host PF. Note: do not specify representor=[0-65535], it causes devargs processing error, because number of ports (rte_eth_dev) is limited. Applications should distinguish representors and master devices exclusively by device flag RTE_ETH_DEV_REPRESENTOR and do not rely on switch port_id (mlx5 PMD deduces ones from representor_id) values returned by dev_infos_get() API. [1] https://www.spinics.net/lists/netdev/msg547007.html Linux-tree: c12ecc23 (Or Gerlitz 2018-04-25 17:32 +0300) "net/mlx5e: Move to use common phys port names for vport representors" Signed-off-by: Viacheslav Ovsiienko <viacheslavo@mellanox.com> Acked-by: Shahaf Shuler <shahafs@mellanox.com>
2019-04-16 14:10:28 +00:00
void mlx5_sysfs_check_switch_info(bool device_dir,
struct mlx5_switch_info *switch_info);
void mlx5_translate_port_name(const char *port_name_in,
struct mlx5_switch_info *port_info_out);
void mlx5_intr_callback_unregister(const struct rte_intr_handle *handle,
rte_intr_callback_fn cb_fn, void *cb_arg);
int mlx5_get_module_info(struct rte_eth_dev *dev,
struct rte_eth_dev_module_info *modinfo);
int mlx5_get_module_eeprom(struct rte_eth_dev *dev,
struct rte_dev_eeprom_info *info);
int mlx5_dev_configure_rss_reta(struct rte_eth_dev *dev);
/* mlx5_mac.c */
net: add rte prefix to ether defines Add 'RTE_' prefix to defines: - rename ETHER_ADDR_LEN as RTE_ETHER_ADDR_LEN. - rename ETHER_TYPE_LEN as RTE_ETHER_TYPE_LEN. - rename ETHER_CRC_LEN as RTE_ETHER_CRC_LEN. - rename ETHER_HDR_LEN as RTE_ETHER_HDR_LEN. - rename ETHER_MIN_LEN as RTE_ETHER_MIN_LEN. - rename ETHER_MAX_LEN as RTE_ETHER_MAX_LEN. - rename ETHER_MTU as RTE_ETHER_MTU. - rename ETHER_MAX_VLAN_FRAME_LEN as RTE_ETHER_MAX_VLAN_FRAME_LEN. - rename ETHER_MAX_VLAN_ID as RTE_ETHER_MAX_VLAN_ID. - rename ETHER_MAX_JUMBO_FRAME_LEN as RTE_ETHER_MAX_JUMBO_FRAME_LEN. - rename ETHER_MIN_MTU as RTE_ETHER_MIN_MTU. - rename ETHER_LOCAL_ADMIN_ADDR as RTE_ETHER_LOCAL_ADMIN_ADDR. - rename ETHER_GROUP_ADDR as RTE_ETHER_GROUP_ADDR. - rename ETHER_TYPE_IPv4 as RTE_ETHER_TYPE_IPv4. - rename ETHER_TYPE_IPv6 as RTE_ETHER_TYPE_IPv6. - rename ETHER_TYPE_ARP as RTE_ETHER_TYPE_ARP. - rename ETHER_TYPE_VLAN as RTE_ETHER_TYPE_VLAN. - rename ETHER_TYPE_RARP as RTE_ETHER_TYPE_RARP. - rename ETHER_TYPE_QINQ as RTE_ETHER_TYPE_QINQ. - rename ETHER_TYPE_ETAG as RTE_ETHER_TYPE_ETAG. - rename ETHER_TYPE_1588 as RTE_ETHER_TYPE_1588. - rename ETHER_TYPE_SLOW as RTE_ETHER_TYPE_SLOW. - rename ETHER_TYPE_TEB as RTE_ETHER_TYPE_TEB. - rename ETHER_TYPE_LLDP as RTE_ETHER_TYPE_LLDP. - rename ETHER_TYPE_MPLS as RTE_ETHER_TYPE_MPLS. - rename ETHER_TYPE_MPLSM as RTE_ETHER_TYPE_MPLSM. - rename ETHER_VXLAN_HLEN as RTE_ETHER_VXLAN_HLEN. - rename ETHER_ADDR_FMT_SIZE as RTE_ETHER_ADDR_FMT_SIZE. - rename VXLAN_GPE_TYPE_IPV4 as RTE_VXLAN_GPE_TYPE_IPV4. - rename VXLAN_GPE_TYPE_IPV6 as RTE_VXLAN_GPE_TYPE_IPV6. - rename VXLAN_GPE_TYPE_ETH as RTE_VXLAN_GPE_TYPE_ETH. - rename VXLAN_GPE_TYPE_NSH as RTE_VXLAN_GPE_TYPE_NSH. - rename VXLAN_GPE_TYPE_MPLS as RTE_VXLAN_GPE_TYPE_MPLS. - rename VXLAN_GPE_TYPE_GBP as RTE_VXLAN_GPE_TYPE_GBP. - rename VXLAN_GPE_TYPE_VBNG as RTE_VXLAN_GPE_TYPE_VBNG. - rename ETHER_VXLAN_GPE_HLEN as RTE_ETHER_VXLAN_GPE_HLEN. Do not update the command line library to avoid adding a dependency to librte_net. Signed-off-by: Olivier Matz <olivier.matz@6wind.com> Reviewed-by: Stephen Hemminger <stephen@networkplumber.org> Reviewed-by: Maxime Coquelin <maxime.coquelin@redhat.com> Reviewed-by: Ferruh Yigit <ferruh.yigit@intel.com>
2019-05-21 16:13:05 +00:00
int mlx5_get_mac(struct rte_eth_dev *dev, uint8_t (*mac)[RTE_ETHER_ADDR_LEN]);
void mlx5_mac_addr_remove(struct rte_eth_dev *dev, uint32_t index);
int mlx5_mac_addr_add(struct rte_eth_dev *dev, struct rte_ether_addr *mac,
uint32_t index, uint32_t vmdq);
struct mlx5_nl_vlan_vmwa_context *mlx5_vlan_vmwa_init
(struct rte_eth_dev *dev, uint32_t ifindex);
int mlx5_mac_addr_set(struct rte_eth_dev *dev, struct rte_ether_addr *mac_addr);
int mlx5_set_mc_addr_list(struct rte_eth_dev *dev,
struct rte_ether_addr *mc_addr_set,
uint32_t nb_mc_addr);
/* mlx5_rss.c */
int mlx5_rss_hash_update(struct rte_eth_dev *dev,
struct rte_eth_rss_conf *rss_conf);
int mlx5_rss_hash_conf_get(struct rte_eth_dev *dev,
struct rte_eth_rss_conf *rss_conf);
int mlx5_rss_reta_index_resize(struct rte_eth_dev *dev, unsigned int reta_size);
int mlx5_dev_rss_reta_query(struct rte_eth_dev *dev,
struct rte_eth_rss_reta_entry64 *reta_conf,
uint16_t reta_size);
int mlx5_dev_rss_reta_update(struct rte_eth_dev *dev,
struct rte_eth_rss_reta_entry64 *reta_conf,
uint16_t reta_size);
/* mlx5_rxmode.c */
int mlx5_promiscuous_enable(struct rte_eth_dev *dev);
int mlx5_promiscuous_disable(struct rte_eth_dev *dev);
int mlx5_allmulticast_enable(struct rte_eth_dev *dev);
int mlx5_allmulticast_disable(struct rte_eth_dev *dev);
/* mlx5_stats.c */
int mlx5_stats_get(struct rte_eth_dev *dev, struct rte_eth_stats *stats);
int mlx5_stats_reset(struct rte_eth_dev *dev);
int mlx5_xstats_get(struct rte_eth_dev *dev, struct rte_eth_xstat *stats,
unsigned int n);
int mlx5_xstats_reset(struct rte_eth_dev *dev);
int mlx5_xstats_get_names(struct rte_eth_dev *dev __rte_unused,
struct rte_eth_xstat_name *xstats_names,
unsigned int n);
/* mlx5_vlan.c */
int mlx5_vlan_filter_set(struct rte_eth_dev *dev, uint16_t vlan_id, int on);
void mlx5_vlan_strip_queue_set(struct rte_eth_dev *dev, uint16_t queue, int on);
int mlx5_vlan_offload_set(struct rte_eth_dev *dev, int mask);
void mlx5_vlan_vmwa_exit(struct mlx5_nl_vlan_vmwa_context *ctx);
void mlx5_vlan_vmwa_release(struct rte_eth_dev *dev,
struct mlx5_vf_vlan *vf_vlan);
void mlx5_vlan_vmwa_acquire(struct rte_eth_dev *dev,
struct mlx5_vf_vlan *vf_vlan);
/* mlx5_trigger.c */
int mlx5_dev_start(struct rte_eth_dev *dev);
void mlx5_dev_stop(struct rte_eth_dev *dev);
int mlx5_traffic_enable(struct rte_eth_dev *dev);
void mlx5_traffic_disable(struct rte_eth_dev *dev);
int mlx5_traffic_restart(struct rte_eth_dev *dev);
/* mlx5_flow.c */
int mlx5_flow_discover_mreg_c(struct rte_eth_dev *eth_dev);
bool mlx5_flow_ext_mreg_supported(struct rte_eth_dev *dev);
int mlx5_flow_discover_priorities(struct rte_eth_dev *dev);
void mlx5_flow_print(struct rte_flow *flow);
int mlx5_flow_validate(struct rte_eth_dev *dev,
const struct rte_flow_attr *attr,
const struct rte_flow_item items[],
const struct rte_flow_action actions[],
struct rte_flow_error *error);
struct rte_flow *mlx5_flow_create(struct rte_eth_dev *dev,
const struct rte_flow_attr *attr,
const struct rte_flow_item items[],
const struct rte_flow_action actions[],
struct rte_flow_error *error);
int mlx5_flow_destroy(struct rte_eth_dev *dev, struct rte_flow *flow,
struct rte_flow_error *error);
void mlx5_flow_list_flush(struct rte_eth_dev *dev, uint32_t *list, bool active);
int mlx5_flow_flush(struct rte_eth_dev *dev, struct rte_flow_error *error);
int mlx5_flow_query(struct rte_eth_dev *dev, struct rte_flow *flow,
const struct rte_flow_action *action, void *data,
struct rte_flow_error *error);
int mlx5_flow_isolate(struct rte_eth_dev *dev, int enable,
struct rte_flow_error *error);
int mlx5_dev_filter_ctrl(struct rte_eth_dev *dev,
enum rte_filter_type filter_type,
enum rte_filter_op filter_op,
void *arg);
int mlx5_flow_start(struct rte_eth_dev *dev, uint32_t *list);
void mlx5_flow_stop(struct rte_eth_dev *dev, uint32_t *list);
int mlx5_flow_start_default(struct rte_eth_dev *dev);
void mlx5_flow_stop_default(struct rte_eth_dev *dev);
void mlx5_flow_alloc_intermediate(struct rte_eth_dev *dev);
void mlx5_flow_free_intermediate(struct rte_eth_dev *dev);
int mlx5_flow_verify(struct rte_eth_dev *dev);
int mlx5_ctrl_flow_source_queue(struct rte_eth_dev *dev, uint32_t queue);
int mlx5_ctrl_flow_vlan(struct rte_eth_dev *dev,
struct rte_flow_item_eth *eth_spec,
struct rte_flow_item_eth *eth_mask,
struct rte_flow_item_vlan *vlan_spec,
struct rte_flow_item_vlan *vlan_mask);
int mlx5_ctrl_flow(struct rte_eth_dev *dev,
struct rte_flow_item_eth *eth_spec,
struct rte_flow_item_eth *eth_mask);
int mlx5_flow_lacp_miss(struct rte_eth_dev *dev);
struct rte_flow *mlx5_flow_create_esw_table_zero_flow(struct rte_eth_dev *dev);
int mlx5_flow_create_drop_queue(struct rte_eth_dev *dev);
void mlx5_flow_delete_drop_queue(struct rte_eth_dev *dev);
void mlx5_flow_async_pool_query_handle(struct mlx5_dev_ctx_shared *sh,
uint64_t async_id, int status);
void mlx5_set_query_alarm(struct mlx5_dev_ctx_shared *sh);
void mlx5_flow_query_alarm(void *arg);
uint32_t mlx5_counter_alloc(struct rte_eth_dev *dev);
void mlx5_counter_free(struct rte_eth_dev *dev, uint32_t cnt);
int mlx5_counter_query(struct rte_eth_dev *dev, uint32_t cnt,
bool clear, uint64_t *pkts, uint64_t *bytes);
int mlx5_flow_dev_dump(struct rte_eth_dev *dev, FILE *file,
struct rte_flow_error *error);
void mlx5_flow_rxq_dynf_metadata_set(struct rte_eth_dev *dev);
int mlx5_flow_get_aged_flows(struct rte_eth_dev *dev, void **contexts,
uint32_t nb_contexts, struct rte_flow_error *error);
/* mlx5_mp.c */
int mlx5_mp_primary_handle(const struct rte_mp_msg *mp_msg, const void *peer);
int mlx5_mp_secondary_handle(const struct rte_mp_msg *mp_msg, const void *peer);
void mlx5_mp_req_start_rxtx(struct rte_eth_dev *dev);
void mlx5_mp_req_stop_rxtx(struct rte_eth_dev *dev);
/* mlx5_socket.c */
int mlx5_pmd_socket_init(void);
/* mlx5_flow_meter.c */
int mlx5_flow_meter_ops_get(struct rte_eth_dev *dev, void *arg);
struct mlx5_flow_meter *mlx5_flow_meter_find(struct mlx5_priv *priv,
uint32_t meter_id);
struct mlx5_flow_meter *mlx5_flow_meter_attach
(struct mlx5_priv *priv,
uint32_t meter_id,
const struct rte_flow_attr *attr,
struct rte_flow_error *error);
void mlx5_flow_meter_detach(struct mlx5_flow_meter *fm);
/* mlx5_os.c */
struct rte_pci_driver;
int mlx5_os_get_dev_attr(void *ctx, struct mlx5_dev_attr *dev_attr);
void mlx5_os_free_shared_dr(struct mlx5_priv *priv);
int mlx5_os_open_device(const struct mlx5_dev_spawn_data *spawn,
const struct mlx5_dev_config *config,
struct mlx5_dev_ctx_shared *sh);
int mlx5_os_get_pdn(void *pd, uint32_t *pdn);
int mlx5_os_pci_probe(struct rte_pci_driver *pci_drv __rte_unused,
struct rte_pci_device *pci_dev);
void mlx5_os_dev_shared_handler_install(struct mlx5_dev_ctx_shared *sh);
void mlx5_os_dev_shared_handler_uninstall(struct mlx5_dev_ctx_shared *sh);
int mlx5_os_read_dev_stat(struct mlx5_priv *priv,
const char *ctr_name, uint64_t *stat);
int mlx5_os_read_dev_counters(struct rte_eth_dev *dev, uint64_t *stats);
int mlx5_os_get_stats_n(struct rte_eth_dev *dev);
void mlx5_os_stats_init(struct rte_eth_dev *dev);
void mlx5_os_set_reg_mr_cb(mlx5_reg_mr_t *reg_mr_cb,
mlx5_dereg_mr_t *dereg_mr_cb);
#endif /* RTE_PMD_MLX5_H_ */