ena: merge ena-com v2.5.0 upgrade

Merge commit '2530eb1fa01bf28fbcfcdda58bd41e055dcb2e4a'

Adjust the driver to the upgraded ena-com part twofold:

First update is related to the driver's NUMA awareness.

Allocate I/O queue memory in NUMA domain local to the CPU bound to the
given queue, improving data access time. Since this can result in a
performance hit for unaware users, this is done only when the RSS
option is enabled; in other cases the driver relies on the kernel to
allocate memory by itself.

Information about the first bound CPU is saved in the adapter
structure, so the binding persists after bringing the interface down
and up again.

If there are more buckets than interface queues, the driver will try to
bind different interfaces to different CPUs using a round-robin
algorithm (but it will not bind queues to CPUs which do not have any
RSS buckets associated with them). This is done to better utilize
hardware resources by spreading the load.

Add (read-only) per-queue sysctls in order to provide the following
information:
- queueN.domain: NUMA domain associated with the queue
- queueN.cpu:    CPU affinity of the queue

The second change is for the CSUM_OFFLOAD constant, as the ENA platform
file has removed its definition. To align with that change, it has been
added to the ena_datapath.h file.

Submitted by: Artur Rojek <ar@semihalf.com>
Submitted by: Dawid Gorecki <dgr@semihalf.com>
Obtained from: Semihalf
MFC after: 2 weeks
Sponsored by: Amazon, Inc.
This commit is contained in:
Marcin Wojtas 2022-01-23 20:21:17 +01:00
commit eb4c4f4a2e
6 changed files with 80 additions and 9 deletions

View File

@ -71,6 +71,11 @@ is advertised by the device via the Admin Queue), a dedicated MSI-X
interrupt vector per Tx/Rx queue pair, and CPU cacheline optimized
data placement.
.Pp
When RSS is enabled, each Tx/Rx queue pair is bound to a corresponding
CPU core and its NUMA domain. The order of those bindings is based on
the RSS bucket mapping. For builds with RSS support disabled, the
CPU and NUMA management is left to the kernel.
.Pp
The
.Nm
driver supports industry standard TCP/IP offload features such

View File

@ -42,6 +42,7 @@ __FBSDID("$FreeBSD$");
#include <sys/bus.h>
#include <sys/condvar.h>
#include <sys/domainset.h>
#include <sys/endian.h>
#include <sys/kernel.h>
#include <sys/kthread.h>
@ -170,6 +171,8 @@ static inline long PTR_ERR(const void *ptr)
#define ENA_COM_TIMER_EXPIRED ETIMEDOUT
#define ENA_COM_EIO EIO
#define ENA_NODE_ANY (-1)
#define ENA_MSLEEP(x) pause_sbt("ena", SBT_1MS * (x), SBT_1MS, 0)
#define ENA_USLEEP(x) pause_sbt("ena", SBT_1US * (x), SBT_1US, 0)
#define ENA_UDELAY(x) DELAY(x)
@ -277,7 +280,7 @@ typedef struct ifnet ena_netdev;
void ena_dmamap_callback(void *arg, bus_dma_segment_t *segs, int nseg,
int error);
int ena_dma_alloc(device_t dmadev, bus_size_t size, ena_mem_handle_t *dma,
int mapflags, bus_size_t alignment);
int mapflags, bus_size_t alignment, int domain);
static inline uint32_t
ena_reg_read32(struct ena_bus *bus, bus_size_t offset)
@ -299,16 +302,27 @@ ena_reg_read32(struct ena_bus *bus, bus_size_t offset)
} while (0)
#define ENA_MEM_ALLOC(dmadev, size) malloc(size, M_DEVBUF, M_NOWAIT | M_ZERO)
#define ENA_MEM_ALLOC_NODE(dmadev, size, virt, node, dev_node) (virt = NULL)
#define ENA_MEM_ALLOC_NODE(dmadev, size, virt, node, dev_node) \
do { \
(virt) = malloc_domainset((size), M_DEVBUF, \
(node) < 0 ? DOMAINSET_RR() : DOMAINSET_PREF(node), \
M_NOWAIT | M_ZERO); \
(void)(dev_node); \
} while (0)
#define ENA_MEM_FREE(dmadev, ptr, size) \
do { \
(void)(size); \
free(ptr, M_DEVBUF); \
} while (0)
#define ENA_MEM_ALLOC_COHERENT_NODE_ALIGNED(dmadev, size, virt, phys, \
handle, node, dev_node, alignment) \
dma, node, dev_node, alignment) \
do { \
((virt) = NULL); \
ena_dma_alloc((dmadev), (size), &(dma), 0, (alignment), \
(node)); \
(virt) = (void *)(dma).vaddr; \
(phys) = (dma).paddr; \
(void)(dev_node); \
} while (0)
@ -320,7 +334,8 @@ ena_reg_read32(struct ena_bus *bus, bus_size_t offset)
#define ENA_MEM_ALLOC_COHERENT_ALIGNED(dmadev, size, virt, phys, dma, \
alignment) \
do { \
ena_dma_alloc((dmadev), (size), &(dma), 0, alignment); \
ena_dma_alloc((dmadev), (size), &(dma), 0, (alignment), \
ENA_NODE_ANY); \
(virt) = (void *)(dma).vaddr; \
(phys) = (dma).paddr; \
} while (0)
@ -366,7 +381,6 @@ ena_reg_read32(struct ena_bus *bus, bus_size_t offset)
#define time_after(a,b) ((long)((unsigned long)(b) - (unsigned long)(a)) < 0)
#define VLAN_HLEN sizeof(struct ether_vlan_header)
#define CSUM_OFFLOAD (CSUM_IP|CSUM_TCP|CSUM_UDP)
#define prefetch(x) (void)(x)
#define prefetchw(x) (void)(x)

View File

@ -198,7 +198,7 @@ ena_dmamap_callback(void *arg, bus_dma_segment_t *segs, int nseg, int error)
int
ena_dma_alloc(device_t dmadev, bus_size_t size,
ena_mem_handle_t *dma, int mapflags, bus_size_t alignment)
ena_mem_handle_t *dma, int mapflags, bus_size_t alignment, int domain)
{
struct ena_adapter* adapter = device_get_softc(dmadev);
device_t pdev = adapter->pdev;
@ -229,6 +229,13 @@ ena_dma_alloc(device_t dmadev, bus_size_t size,
goto fail_tag;
}
error = bus_dma_tag_set_domain(dma->tag, domain);
if (unlikely(error != 0)) {
ena_log(pdev, ERR, "bus_dma_tag_set_domain failed: %d\n",
error);
goto fail_map_create;
}
error = bus_dmamem_alloc(dma->tag, (void**) &dma->vaddr,
BUS_DMA_COHERENT | BUS_DMA_ZERO, &dma->map);
if (unlikely(error != 0)) {
@ -1445,6 +1452,8 @@ ena_create_io_queues(struct ena_adapter *adapter)
ctx.queue_size = adapter->requested_tx_ring_size;
ctx.msix_vector = msix_vector;
ctx.qid = ena_qid;
ctx.numa_node = adapter->que[i].domain;
rc = ena_com_create_io_queue(ena_dev, &ctx);
if (rc != 0) {
ena_log(adapter->pdev, ERR,
@ -1462,6 +1471,11 @@ ena_create_io_queues(struct ena_adapter *adapter)
ena_com_destroy_io_queue(ena_dev, ena_qid);
goto err_tx;
}
if (ctx.numa_node >= 0) {
ena_com_update_numa_node(ring->ena_com_io_cq,
ctx.numa_node);
}
}
/* Create RX queues */
@ -1473,6 +1487,8 @@ ena_create_io_queues(struct ena_adapter *adapter)
ctx.queue_size = adapter->requested_rx_ring_size;
ctx.msix_vector = msix_vector;
ctx.qid = ena_qid;
ctx.numa_node = adapter->que[i].domain;
rc = ena_com_create_io_queue(ena_dev, &ctx);
if (unlikely(rc != 0)) {
ena_log(adapter->pdev, ERR,
@ -1491,6 +1507,11 @@ ena_create_io_queues(struct ena_adapter *adapter)
ena_com_destroy_io_queue(ena_dev, ena_qid);
goto err_rx;
}
if (ctx.numa_node >= 0) {
ena_com_update_numa_node(ring->ena_com_io_cq,
ctx.numa_node);
}
}
for (i = 0; i < adapter->num_io_queues; i++) {
@ -1646,12 +1667,22 @@ ena_setup_io_intr(struct ena_adapter *adapter)
#ifdef RSS
int num_buckets = rss_getnumbuckets();
static int last_bind = 0;
int cur_bind;
int idx;
#endif
int irq_idx;
if (adapter->msix_entries == NULL)
return (EINVAL);
#ifdef RSS
if (adapter->first_bind < 0) {
adapter->first_bind = last_bind;
last_bind = (last_bind + adapter->num_io_queues) % num_buckets;
}
cur_bind = adapter->first_bind;
#endif
for (int i = 0; i < adapter->num_io_queues; i++) {
irq_idx = ENA_IO_IRQ_IDX(i);
@ -1666,9 +1697,17 @@ ena_setup_io_intr(struct ena_adapter *adapter)
#ifdef RSS
adapter->que[i].cpu = adapter->irq_tbl[irq_idx].cpu =
rss_getcpu(last_bind);
last_bind = (last_bind + 1) % num_buckets;
rss_getcpu(cur_bind);
cur_bind = (cur_bind + 1) % num_buckets;
CPU_SETOF(adapter->que[i].cpu, &adapter->que[i].cpu_mask);
for (idx = 0; idx < MAXMEMDOM; ++idx) {
if (CPU_ISSET(adapter->que[i].cpu, &cpuset_domain[idx]))
break;
}
adapter->que[i].domain = idx;
#else
adapter->que[i].domain = -1;
#endif
}
@ -3459,6 +3498,7 @@ ena_attach(device_t pdev)
adapter = device_get_softc(pdev);
adapter->pdev = pdev;
adapter->first_bind = -1;
/*
* Set up the timer service - driver is responsible for avoiding

View File

@ -222,6 +222,7 @@ struct ena_que {
int cpu;
cpuset_t cpu_mask;
#endif
int domain;
struct sysctl_oid *oid;
};
@ -439,6 +440,7 @@ struct ena_adapter {
uint32_t buf_ring_size;
/* RSS*/
int first_bind;
struct ena_indir *rss_indir;
uint8_t mac_addr[ETHER_ADDR_LEN];

View File

@ -39,4 +39,6 @@ void ena_qflush(if_t ifp);
int ena_mq_start(if_t ifp, struct mbuf *m);
void ena_deferred_mq_start(void *arg, int pending);
#define CSUM_OFFLOAD (CSUM_IP|CSUM_TCP|CSUM_UDP)
#endif /* ENA_TXRX_H */

View File

@ -208,6 +208,14 @@ ena_sysctl_add_stats(struct ena_adapter *adapter)
adapter->que[i].oid = queue_node;
#ifdef RSS
/* Common stats */
SYSCTL_ADD_INT(ctx, queue_list, OID_AUTO, "cpu",
CTLFLAG_RD, &adapter->que[i].cpu, 0, "CPU affinity");
SYSCTL_ADD_INT(ctx, queue_list, OID_AUTO, "domain",
CTLFLAG_RD, &adapter->que[i].domain, 0, "NUMA domain");
#endif
/* TX specific stats */
tx_node = SYSCTL_ADD_NODE(ctx, queue_list, OID_AUTO,
"tx_ring", CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, "TX ring");