numam-dpdk/drivers/net/af_xdp/rte_eth_af_xdp.c
Ciara Loftus ae70cc6e89 net/af_xdp: use BPF link for XDP programs
Since v0.4.0, if the underlying kernel supports it, libbpf uses 'bpf
link' to manage the XDP programs on the interfaces used by the xsks.
This has two repercussions for the PMD:

1. In the case where the PMD asks libbpf to load the default XDP
   program, the PMD no longer needs to remove it on teardown. This is
   because bpf link handles the unloading under the hood.
2. In the case where the PMD loads a custom program, libbpf expects this
   program to be linked via bpf link prior to creating the socket.

This patch introduces probes for the libbpf version and for kernel
support for bpf link, and orchestrates the loading and unloading of XDP
programs according to the capabilities of the kernel and libbpf. The
libbpf version is checked with meson and pkg-config. The probe for
kernel support mirrors how it is implemented in libbpf: a bpf_link is
created and looked up on the loopback device. If successful, bpf_link
is used for the AF_XDP netdev.

Signed-off-by: Ciara Loftus <ciara.loftus@intel.com>
Signed-off-by: Maciej Fijalkowski <maciej.fijalkowski@intel.com>
2021-11-02 17:36:46 +01:00
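
The probe itself lives in compat.h rather than in this file. As an
illustration only, here is a minimal sketch of how such a probe could
look, mirroring the libbpf approach described above. It assumes the
pre-1.0 libbpf APIs bpf_load_program() and bpf_link_create() and the
BPF_XDP attach type, omits the lookup of any pre-existing link for
brevity, and is not the actual compat.h implementation:

#include <linux/bpf.h>
#include <bpf/bpf.h>
#include <net/if.h>
#include <unistd.h>

/* Return 1 if the kernel accepts an XDP bpf_link, 0 otherwise. */
static int
probe_bpf_link(void)
{
	/* Trivial XDP program equivalent to "return XDP_PASS;" */
	struct bpf_insn insns[] = {
		{ .code = BPF_ALU64 | BPF_MOV | BPF_K,
		  .dst_reg = BPF_REG_0, .imm = XDP_PASS },
		{ .code = BPF_JMP | BPF_EXIT },
	};
	int ifindex_lo = if_nametoindex("lo");
	int prog_fd, link_fd;

	if (ifindex_lo == 0)
		return 0;

	prog_fd = bpf_load_program(BPF_PROG_TYPE_XDP, insns,
				   sizeof(insns) / sizeof(insns[0]),
				   "GPL", 0, NULL, 0);
	if (prog_fd < 0)
		return 0;

	/* Creating a BPF_XDP link only succeeds on kernels that support it. */
	link_fd = bpf_link_create(prog_fd, ifindex_lo, BPF_XDP, NULL);
	close(prog_fd);
	if (link_fd < 0)
		return 0;

	/* Closing the link fd detaches the probe program again. */
	close(link_fd);
	return 1;
}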


/* SPDX-License-Identifier: BSD-3-Clause
* Copyright(c) 2019-2020 Intel Corporation.
*/
#include <unistd.h>
#include <errno.h>
#include <stdlib.h>
#include <string.h>
#include <netinet/in.h>
#include <net/if.h>
#include <sys/socket.h>
#include <sys/ioctl.h>
#include <linux/if_ether.h>
#include <linux/if_xdp.h>
#include <linux/if_link.h>
#include <linux/ethtool.h>
#include <linux/sockios.h>
#include "af_xdp_deps.h"
#include <bpf/xsk.h>
#include <rte_ethdev.h>
#include <ethdev_driver.h>
#include <ethdev_vdev.h>
#include <rte_kvargs.h>
#include <rte_bus_vdev.h>
#include <rte_string_fns.h>
#include <rte_branch_prediction.h>
#include <rte_common.h>
#include <rte_dev.h>
#include <rte_eal.h>
#include <rte_ether.h>
#include <rte_lcore.h>
#include <rte_log.h>
#include <rte_memory.h>
#include <rte_memzone.h>
#include <rte_mempool.h>
#include <rte_mbuf.h>
#include <rte_malloc.h>
#include <rte_ring.h>
#include <rte_spinlock.h>
#include <rte_power_intrinsics.h>
#include "compat.h"
#ifndef SO_PREFER_BUSY_POLL
#define SO_PREFER_BUSY_POLL 69
#endif
#ifndef SO_BUSY_POLL_BUDGET
#define SO_BUSY_POLL_BUDGET 70
#endif
#ifndef SOL_XDP
#define SOL_XDP 283
#endif
#ifndef AF_XDP
#define AF_XDP 44
#endif
#ifndef PF_XDP
#define PF_XDP AF_XDP
#endif
RTE_LOG_REGISTER_DEFAULT(af_xdp_logtype, NOTICE);
#define AF_XDP_LOG(level, fmt, args...) \
rte_log(RTE_LOG_ ## level, af_xdp_logtype, \
"%s(): " fmt, __func__, ##args)
#define ETH_AF_XDP_FRAME_SIZE 2048
#define ETH_AF_XDP_NUM_BUFFERS 4096
#define ETH_AF_XDP_DFLT_NUM_DESCS XSK_RING_CONS__DEFAULT_NUM_DESCS
#define ETH_AF_XDP_DFLT_START_QUEUE_IDX 0
#define ETH_AF_XDP_DFLT_QUEUE_COUNT 1
#define ETH_AF_XDP_DFLT_BUSY_BUDGET 64
#define ETH_AF_XDP_DFLT_BUSY_TIMEOUT 20
#define ETH_AF_XDP_RX_BATCH_SIZE XSK_RING_CONS__DEFAULT_NUM_DESCS
#define ETH_AF_XDP_TX_BATCH_SIZE XSK_RING_CONS__DEFAULT_NUM_DESCS
#define ETH_AF_XDP_ETH_OVERHEAD (RTE_ETHER_HDR_LEN + RTE_ETHER_CRC_LEN)
struct xsk_umem_info {
struct xsk_umem *umem;
struct rte_ring *buf_ring;
const struct rte_memzone *mz;
struct rte_mempool *mb_pool;
void *buffer;
uint8_t refcnt;
uint32_t max_xsks;
};
struct rx_stats {
uint64_t rx_pkts;
uint64_t rx_bytes;
uint64_t rx_dropped;
};
struct pkt_rx_queue {
struct xsk_ring_cons rx;
struct xsk_umem_info *umem;
struct xsk_socket *xsk;
struct rte_mempool *mb_pool;
struct rx_stats stats;
struct xsk_ring_prod fq;
struct xsk_ring_cons cq;
struct pkt_tx_queue *pair;
struct pollfd fds[1];
int xsk_queue_idx;
int busy_budget;
};
struct tx_stats {
uint64_t tx_pkts;
uint64_t tx_bytes;
uint64_t tx_dropped;
};
struct pkt_tx_queue {
struct xsk_ring_prod tx;
struct xsk_umem_info *umem;
struct tx_stats stats;
struct pkt_rx_queue *pair;
int xsk_queue_idx;
};
struct pmd_internals {
int if_index;
char if_name[IFNAMSIZ];
int start_queue_idx;
int queue_cnt;
int max_queue_cnt;
int combined_queue_cnt;
bool shared_umem;
char prog_path[PATH_MAX];
bool custom_prog_configured;
bool use_bpf_link;
struct rte_ether_addr eth_addr;
struct pkt_rx_queue *rx_queues;
struct pkt_tx_queue *tx_queues;
};
#define ETH_AF_XDP_IFACE_ARG "iface"
#define ETH_AF_XDP_START_QUEUE_ARG "start_queue"
#define ETH_AF_XDP_QUEUE_COUNT_ARG "queue_count"
#define ETH_AF_XDP_SHARED_UMEM_ARG "shared_umem"
#define ETH_AF_XDP_PROG_ARG "xdp_prog"
#define ETH_AF_XDP_BUDGET_ARG "busy_budget"
static const char * const valid_arguments[] = {
ETH_AF_XDP_IFACE_ARG,
ETH_AF_XDP_START_QUEUE_ARG,
ETH_AF_XDP_QUEUE_COUNT_ARG,
ETH_AF_XDP_SHARED_UMEM_ARG,
ETH_AF_XDP_PROG_ARG,
ETH_AF_XDP_BUDGET_ARG,
NULL
};
static const struct rte_eth_link pmd_link = {
.link_speed = RTE_ETH_SPEED_NUM_10G,
.link_duplex = RTE_ETH_LINK_FULL_DUPLEX,
.link_status = RTE_ETH_LINK_DOWN,
.link_autoneg = RTE_ETH_LINK_AUTONEG
};
/* List which tracks PMDs to facilitate sharing UMEMs across them. */
struct internal_list {
TAILQ_ENTRY(internal_list) next;
struct rte_eth_dev *eth_dev;
};
TAILQ_HEAD(internal_list_head, internal_list);
static struct internal_list_head internal_list =
TAILQ_HEAD_INITIALIZER(internal_list);
static pthread_mutex_t internal_list_lock = PTHREAD_MUTEX_INITIALIZER;
#if defined(XDP_UMEM_UNALIGNED_CHUNK_FLAG)
static inline int
reserve_fill_queue_zc(struct xsk_umem_info *umem, uint16_t reserve_size,
struct rte_mbuf **bufs, struct xsk_ring_prod *fq)
{
uint32_t idx;
uint16_t i;
if (unlikely(!xsk_ring_prod__reserve(fq, reserve_size, &idx))) {
for (i = 0; i < reserve_size; i++)
rte_pktmbuf_free(bufs[i]);
AF_XDP_LOG(DEBUG, "Failed to reserve enough fq descs.\n");
return -1;
}
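/* Addresses handed to the fill queue are offsets within the umem, which
 * overlays the mbuf mempool: subtracting the umem base address and the
 * mempool object header turns an mbuf pointer into a umem-relative address.
 */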
for (i = 0; i < reserve_size; i++) {
__u64 *fq_addr;
uint64_t addr;
fq_addr = xsk_ring_prod__fill_addr(fq, idx++);
addr = (uint64_t)bufs[i] - (uint64_t)umem->buffer -
umem->mb_pool->header_size;
*fq_addr = addr;
}
xsk_ring_prod__submit(fq, reserve_size);
return 0;
}
#else
static inline int
reserve_fill_queue_cp(struct xsk_umem_info *umem, uint16_t reserve_size,
struct rte_mbuf **bufs __rte_unused,
struct xsk_ring_prod *fq)
{
void *addrs[reserve_size];
uint32_t idx;
uint16_t i;
if (rte_ring_dequeue_bulk(umem->buf_ring, addrs, reserve_size, NULL)
!= reserve_size) {
AF_XDP_LOG(DEBUG, "Failed to get enough buffers for fq.\n");
return -1;
}
if (unlikely(!xsk_ring_prod__reserve(fq, reserve_size, &idx))) {
AF_XDP_LOG(DEBUG, "Failed to reserve enough fq descs.\n");
rte_ring_enqueue_bulk(umem->buf_ring, addrs,
reserve_size, NULL);
return -1;
}
for (i = 0; i < reserve_size; i++) {
__u64 *fq_addr;
fq_addr = xsk_ring_prod__fill_addr(fq, idx++);
*fq_addr = (uint64_t)addrs[i];
}
xsk_ring_prod__submit(fq, reserve_size);
return 0;
}
#endif
static inline int
reserve_fill_queue(struct xsk_umem_info *umem, uint16_t reserve_size,
struct rte_mbuf **bufs, struct xsk_ring_prod *fq)
{
#if defined(XDP_UMEM_UNALIGNED_CHUNK_FLAG)
return reserve_fill_queue_zc(umem, reserve_size, bufs, fq);
#else
return reserve_fill_queue_cp(umem, reserve_size, bufs, fq);
#endif
}
#if defined(XDP_UMEM_UNALIGNED_CHUNK_FLAG)
static uint16_t
af_xdp_rx_zc(void *queue, struct rte_mbuf **bufs, uint16_t nb_pkts)
{
struct pkt_rx_queue *rxq = queue;
struct xsk_ring_cons *rx = &rxq->rx;
struct xsk_ring_prod *fq = &rxq->fq;
struct xsk_umem_info *umem = rxq->umem;
uint32_t idx_rx = 0;
unsigned long rx_bytes = 0;
int i;
struct rte_mbuf *fq_bufs[ETH_AF_XDP_RX_BATCH_SIZE];
nb_pkts = xsk_ring_cons__peek(rx, nb_pkts, &idx_rx);
if (nb_pkts == 0) {
/* we can assume a kernel >= 5.11 is in use if busy polling is
* enabled and thus we can safely use the recvfrom() syscall
* which is only supported for AF_XDP sockets in kernels >=
* 5.11.
*/
if (rxq->busy_budget) {
(void)recvfrom(xsk_socket__fd(rxq->xsk), NULL, 0,
MSG_DONTWAIT, NULL, NULL);
} else if (xsk_ring_prod__needs_wakeup(fq)) {
(void)poll(&rxq->fds[0], 1, 1000);
}
return 0;
}
/* allocate bufs for fill queue replenishment after rx */
if (rte_pktmbuf_alloc_bulk(umem->mb_pool, fq_bufs, nb_pkts)) {
AF_XDP_LOG(DEBUG,
"Failed to get enough buffers for fq.\n");
/* rollback cached_cons which is added by
* xsk_ring_cons__peek
*/
rx->cached_cons -= nb_pkts;
return 0;
}
for (i = 0; i < nb_pkts; i++) {
const struct xdp_desc *desc;
uint64_t addr;
uint32_t len;
uint64_t offset;
desc = xsk_ring_cons__rx_desc(rx, idx_rx++);
addr = desc->addr;
len = desc->len;
offset = xsk_umem__extract_offset(addr);
addr = xsk_umem__extract_addr(addr);
bufs[i] = (struct rte_mbuf *)
xsk_umem__get_data(umem->buffer, addr +
umem->mb_pool->header_size);
bufs[i]->data_off = offset - sizeof(struct rte_mbuf) -
rte_pktmbuf_priv_size(umem->mb_pool) -
umem->mb_pool->header_size;
rte_pktmbuf_pkt_len(bufs[i]) = len;
rte_pktmbuf_data_len(bufs[i]) = len;
rx_bytes += len;
}
xsk_ring_cons__release(rx, nb_pkts);
(void)reserve_fill_queue(umem, nb_pkts, fq_bufs, fq);
/* statistics */
rxq->stats.rx_pkts += nb_pkts;
rxq->stats.rx_bytes += rx_bytes;
return nb_pkts;
}
#else
static uint16_t
af_xdp_rx_cp(void *queue, struct rte_mbuf **bufs, uint16_t nb_pkts)
{
struct pkt_rx_queue *rxq = queue;
struct xsk_ring_cons *rx = &rxq->rx;
struct xsk_umem_info *umem = rxq->umem;
struct xsk_ring_prod *fq = &rxq->fq;
uint32_t idx_rx = 0;
unsigned long rx_bytes = 0;
int i;
uint32_t free_thresh = fq->size >> 1;
struct rte_mbuf *mbufs[ETH_AF_XDP_RX_BATCH_SIZE];
if (xsk_prod_nb_free(fq, free_thresh) >= free_thresh)
(void)reserve_fill_queue(umem, nb_pkts, NULL, fq);
nb_pkts = xsk_ring_cons__peek(rx, nb_pkts, &idx_rx);
if (nb_pkts == 0) {
#if defined(XDP_USE_NEED_WAKEUP)
if (xsk_ring_prod__needs_wakeup(fq))
(void)poll(rxq->fds, 1, 1000);
#endif
return 0;
}
if (unlikely(rte_pktmbuf_alloc_bulk(rxq->mb_pool, mbufs, nb_pkts))) {
/* rollback cached_cons which is added by
* xsk_ring_cons__peek
*/
rx->cached_cons -= nb_pkts;
return 0;
}
for (i = 0; i < nb_pkts; i++) {
const struct xdp_desc *desc;
uint64_t addr;
uint32_t len;
void *pkt;
desc = xsk_ring_cons__rx_desc(rx, idx_rx++);
addr = desc->addr;
len = desc->len;
pkt = xsk_umem__get_data(rxq->umem->mz->addr, addr);
rte_memcpy(rte_pktmbuf_mtod(mbufs[i], void *), pkt, len);
rte_ring_enqueue(umem->buf_ring, (void *)addr);
rte_pktmbuf_pkt_len(mbufs[i]) = len;
rte_pktmbuf_data_len(mbufs[i]) = len;
rx_bytes += len;
bufs[i] = mbufs[i];
}
xsk_ring_cons__release(rx, nb_pkts);
/* statistics */
rxq->stats.rx_pkts += nb_pkts;
rxq->stats.rx_bytes += rx_bytes;
return nb_pkts;
}
#endif
static uint16_t
af_xdp_rx(void *queue, struct rte_mbuf **bufs, uint16_t nb_pkts)
{
#if defined(XDP_UMEM_UNALIGNED_CHUNK_FLAG)
return af_xdp_rx_zc(queue, bufs, nb_pkts);
#else
return af_xdp_rx_cp(queue, bufs, nb_pkts);
#endif
}
static uint16_t
eth_af_xdp_rx(void *queue, struct rte_mbuf **bufs, uint16_t nb_pkts)
{
uint16_t nb_rx;
if (likely(nb_pkts <= ETH_AF_XDP_RX_BATCH_SIZE))
return af_xdp_rx(queue, bufs, nb_pkts);
/* Split larger batch into smaller batches of size
* ETH_AF_XDP_RX_BATCH_SIZE or less.
*/
nb_rx = 0;
while (nb_pkts) {
uint16_t ret, n;
n = (uint16_t)RTE_MIN(nb_pkts, ETH_AF_XDP_RX_BATCH_SIZE);
ret = af_xdp_rx(queue, &bufs[nb_rx], n);
nb_rx = (uint16_t)(nb_rx + ret);
nb_pkts = (uint16_t)(nb_pkts - ret);
if (ret < n)
break;
}
return nb_rx;
}
static void
pull_umem_cq(struct xsk_umem_info *umem, int size, struct xsk_ring_cons *cq)
{
size_t i, n;
uint32_t idx_cq = 0;
n = xsk_ring_cons__peek(cq, size, &idx_cq);
for (i = 0; i < n; i++) {
uint64_t addr;
addr = *xsk_ring_cons__comp_addr(cq, idx_cq++);
#if defined(XDP_UMEM_UNALIGNED_CHUNK_FLAG)
addr = xsk_umem__extract_addr(addr);
rte_pktmbuf_free((struct rte_mbuf *)
xsk_umem__get_data(umem->buffer,
addr + umem->mb_pool->header_size));
#else
rte_ring_enqueue(umem->buf_ring, (void *)addr);
#endif
}
xsk_ring_cons__release(cq, n);
}
static void
kick_tx(struct pkt_tx_queue *txq, struct xsk_ring_cons *cq)
{
struct xsk_umem_info *umem = txq->umem;
pull_umem_cq(umem, XSK_RING_CONS__DEFAULT_NUM_DESCS, cq);
if (tx_syscall_needed(&txq->tx))
while (send(xsk_socket__fd(txq->pair->xsk), NULL,
0, MSG_DONTWAIT) < 0) {
/* something unexpected */
if (errno != EBUSY && errno != EAGAIN && errno != EINTR)
break;
/* pull from completion queue to leave more space */
if (errno == EAGAIN)
pull_umem_cq(umem,
XSK_RING_CONS__DEFAULT_NUM_DESCS,
cq);
}
}
#if defined(XDP_UMEM_UNALIGNED_CHUNK_FLAG)
static uint16_t
af_xdp_tx_zc(void *queue, struct rte_mbuf **bufs, uint16_t nb_pkts)
{
struct pkt_tx_queue *txq = queue;
struct xsk_umem_info *umem = txq->umem;
struct rte_mbuf *mbuf;
unsigned long tx_bytes = 0;
int i;
uint32_t idx_tx;
uint16_t count = 0;
struct xdp_desc *desc;
uint64_t addr, offset;
struct xsk_ring_cons *cq = &txq->pair->cq;
uint32_t free_thresh = cq->size >> 1;
if (xsk_cons_nb_avail(cq, free_thresh) >= free_thresh)
pull_umem_cq(umem, XSK_RING_CONS__DEFAULT_NUM_DESCS, cq);
for (i = 0; i < nb_pkts; i++) {
mbuf = bufs[i];
if (mbuf->pool == umem->mb_pool) {
if (!xsk_ring_prod__reserve(&txq->tx, 1, &idx_tx)) {
kick_tx(txq, cq);
if (!xsk_ring_prod__reserve(&txq->tx, 1,
&idx_tx))
goto out;
}
desc = xsk_ring_prod__tx_desc(&txq->tx, idx_tx);
desc->len = mbuf->pkt_len;
addr = (uint64_t)mbuf - (uint64_t)umem->buffer -
umem->mb_pool->header_size;
offset = rte_pktmbuf_mtod(mbuf, uint64_t) -
(uint64_t)mbuf +
umem->mb_pool->header_size;
offset = offset << XSK_UNALIGNED_BUF_OFFSET_SHIFT;
desc->addr = addr | offset;
count++;
} else {
struct rte_mbuf *local_mbuf =
rte_pktmbuf_alloc(umem->mb_pool);
void *pkt;
if (local_mbuf == NULL)
goto out;
if (!xsk_ring_prod__reserve(&txq->tx, 1, &idx_tx)) {
rte_pktmbuf_free(local_mbuf);
goto out;
}
desc = xsk_ring_prod__tx_desc(&txq->tx, idx_tx);
desc->len = mbuf->pkt_len;
addr = (uint64_t)local_mbuf - (uint64_t)umem->buffer -
umem->mb_pool->header_size;
offset = rte_pktmbuf_mtod(local_mbuf, uint64_t) -
(uint64_t)local_mbuf +
umem->mb_pool->header_size;
pkt = xsk_umem__get_data(umem->buffer, addr + offset);
offset = offset << XSK_UNALIGNED_BUF_OFFSET_SHIFT;
desc->addr = addr | offset;
rte_memcpy(pkt, rte_pktmbuf_mtod(mbuf, void *),
desc->len);
rte_pktmbuf_free(mbuf);
count++;
}
tx_bytes += mbuf->pkt_len;
}
out:
xsk_ring_prod__submit(&txq->tx, count);
kick_tx(txq, cq);
txq->stats.tx_pkts += count;
txq->stats.tx_bytes += tx_bytes;
txq->stats.tx_dropped += nb_pkts - count;
return count;
}
#else
static uint16_t
af_xdp_tx_cp(void *queue, struct rte_mbuf **bufs, uint16_t nb_pkts)
{
struct pkt_tx_queue *txq = queue;
struct xsk_umem_info *umem = txq->umem;
struct rte_mbuf *mbuf;
void *addrs[ETH_AF_XDP_TX_BATCH_SIZE];
unsigned long tx_bytes = 0;
int i;
uint32_t idx_tx;
struct xsk_ring_cons *cq = &txq->pair->cq;
pull_umem_cq(umem, nb_pkts, cq);
nb_pkts = rte_ring_dequeue_bulk(umem->buf_ring, addrs,
nb_pkts, NULL);
if (nb_pkts == 0)
return 0;
if (xsk_ring_prod__reserve(&txq->tx, nb_pkts, &idx_tx) != nb_pkts) {
kick_tx(txq, cq);
rte_ring_enqueue_bulk(umem->buf_ring, addrs, nb_pkts, NULL);
return 0;
}
for (i = 0; i < nb_pkts; i++) {
struct xdp_desc *desc;
void *pkt;
desc = xsk_ring_prod__tx_desc(&txq->tx, idx_tx + i);
mbuf = bufs[i];
desc->len = mbuf->pkt_len;
desc->addr = (uint64_t)addrs[i];
pkt = xsk_umem__get_data(umem->mz->addr,
desc->addr);
rte_memcpy(pkt, rte_pktmbuf_mtod(mbuf, void *), desc->len);
tx_bytes += mbuf->pkt_len;
rte_pktmbuf_free(mbuf);
}
xsk_ring_prod__submit(&txq->tx, nb_pkts);
kick_tx(txq, cq);
txq->stats.tx_pkts += nb_pkts;
txq->stats.tx_bytes += tx_bytes;
return nb_pkts;
}
static uint16_t
af_xdp_tx_cp_batch(void *queue, struct rte_mbuf **bufs, uint16_t nb_pkts)
{
uint16_t nb_tx;
if (likely(nb_pkts <= ETH_AF_XDP_TX_BATCH_SIZE))
return af_xdp_tx_cp(queue, bufs, nb_pkts);
nb_tx = 0;
while (nb_pkts) {
uint16_t ret, n;
/* Split larger batch into smaller batches of size
* ETH_AF_XDP_TX_BATCH_SIZE or less.
*/
n = (uint16_t)RTE_MIN(nb_pkts, ETH_AF_XDP_TX_BATCH_SIZE);
ret = af_xdp_tx_cp(queue, &bufs[nb_tx], n);
nb_tx = (uint16_t)(nb_tx + ret);
nb_pkts = (uint16_t)(nb_pkts - ret);
if (ret < n)
break;
}
return nb_tx;
}
#endif
static uint16_t
eth_af_xdp_tx(void *queue, struct rte_mbuf **bufs, uint16_t nb_pkts)
{
#if defined(XDP_UMEM_UNALIGNED_CHUNK_FLAG)
return af_xdp_tx_zc(queue, bufs, nb_pkts);
#else
return af_xdp_tx_cp_batch(queue, bufs, nb_pkts);
#endif
}
static int
eth_dev_start(struct rte_eth_dev *dev)
{
dev->data->dev_link.link_status = RTE_ETH_LINK_UP;
return 0;
}
/* This function gets called when the current port gets stopped. */
static int
eth_dev_stop(struct rte_eth_dev *dev)
{
dev->data->dev_link.link_status = RTE_ETH_LINK_DOWN;
return 0;
}
/* Find ethdev in list */
static inline struct internal_list *
find_internal_resource(struct pmd_internals *port_int)
{
int found = 0;
struct internal_list *list = NULL;
if (port_int == NULL)
return NULL;
pthread_mutex_lock(&internal_list_lock);
TAILQ_FOREACH(list, &internal_list, next) {
struct pmd_internals *list_int =
list->eth_dev->data->dev_private;
if (list_int == port_int) {
found = 1;
break;
}
}
pthread_mutex_unlock(&internal_list_lock);
if (!found)
return NULL;
return list;
}
/* Check if the netdev,qid context already exists */
static inline bool
ctx_exists(struct pkt_rx_queue *rxq, const char *ifname,
struct pkt_rx_queue *list_rxq, const char *list_ifname)
{
bool exists = false;
if (rxq->xsk_queue_idx == list_rxq->xsk_queue_idx &&
!strncmp(ifname, list_ifname, IFNAMSIZ)) {
AF_XDP_LOG(ERR, "ctx %s,%i already exists, cannot share umem\n",
ifname, rxq->xsk_queue_idx);
exists = true;
}
return exists;
}
/* Get a pointer to an existing UMEM which overlays the rxq's mb_pool */
static inline int
get_shared_umem(struct pkt_rx_queue *rxq, const char *ifname,
struct xsk_umem_info **umem)
{
struct internal_list *list;
struct pmd_internals *internals;
int i = 0, ret = 0;
struct rte_mempool *mb_pool = rxq->mb_pool;
if (mb_pool == NULL)
return ret;
pthread_mutex_lock(&internal_list_lock);
TAILQ_FOREACH(list, &internal_list, next) {
internals = list->eth_dev->data->dev_private;
for (i = 0; i < internals->queue_cnt; i++) {
struct pkt_rx_queue *list_rxq =
&internals->rx_queues[i];
if (rxq == list_rxq)
continue;
if (mb_pool == internals->rx_queues[i].mb_pool) {
if (ctx_exists(rxq, ifname, list_rxq,
internals->if_name)) {
ret = -1;
goto out;
}
if (__atomic_load_n(
&internals->rx_queues[i].umem->refcnt,
__ATOMIC_ACQUIRE)) {
*umem = internals->rx_queues[i].umem;
goto out;
}
}
}
}
out:
pthread_mutex_unlock(&internal_list_lock);
return ret;
}
static int
eth_dev_configure(struct rte_eth_dev *dev)
{
struct pmd_internals *internal = dev->data->dev_private;
/* rx/tx must be paired */
if (dev->data->nb_rx_queues != dev->data->nb_tx_queues)
return -EINVAL;
if (internal->shared_umem) {
struct internal_list *list = NULL;
const char *name = dev->device->name;
/* Ensure PMD is not already inserted into the list */
list = find_internal_resource(internal);
if (list)
return 0;
list = rte_zmalloc_socket(name, sizeof(*list), 0,
dev->device->numa_node);
if (list == NULL)
return -1;
list->eth_dev = dev;
pthread_mutex_lock(&internal_list_lock);
TAILQ_INSERT_TAIL(&internal_list, list, next);
pthread_mutex_unlock(&internal_list_lock);
}
return 0;
}
#define CLB_VAL_IDX 0
static int
eth_monitor_callback(const uint64_t value,
const uint64_t opaque[RTE_POWER_MONITOR_OPAQUE_SZ])
{
const uint64_t v = opaque[CLB_VAL_IDX];
const uint64_t m = (uint32_t)~0;
/* if the value has changed, abort entering power optimized state */
return (value & m) == v ? 0 : -1;
}
static int
eth_get_monitor_addr(void *rx_queue, struct rte_power_monitor_cond *pmc)
{
struct pkt_rx_queue *rxq = rx_queue;
unsigned int *prod = rxq->rx.producer;
const uint32_t cur_val = rxq->rx.cached_prod; /* use cached value */
/* watch for changes in producer ring */
pmc->addr = (void *)prod;
/* store current value */
pmc->opaque[CLB_VAL_IDX] = cur_val;
pmc->fn = eth_monitor_callback;
/* AF_XDP producer ring index is 32-bit */
pmc->size = sizeof(uint32_t);
return 0;
}
static int
eth_dev_info(struct rte_eth_dev *dev, struct rte_eth_dev_info *dev_info)
{
struct pmd_internals *internals = dev->data->dev_private;
dev_info->if_index = internals->if_index;
dev_info->max_mac_addrs = 1;
dev_info->max_rx_queues = internals->queue_cnt;
dev_info->max_tx_queues = internals->queue_cnt;
dev_info->min_mtu = RTE_ETHER_MIN_MTU;
#if defined(XDP_UMEM_UNALIGNED_CHUNK_FLAG)
dev_info->max_rx_pktlen = getpagesize() -
sizeof(struct rte_mempool_objhdr) -
sizeof(struct rte_mbuf) -
RTE_PKTMBUF_HEADROOM - XDP_PACKET_HEADROOM;
#else
dev_info->max_rx_pktlen = ETH_AF_XDP_FRAME_SIZE - XDP_PACKET_HEADROOM;
#endif
dev_info->max_mtu = dev_info->max_rx_pktlen - ETH_AF_XDP_ETH_OVERHEAD;
dev_info->default_rxportconf.burst_size = ETH_AF_XDP_DFLT_BUSY_BUDGET;
dev_info->default_txportconf.burst_size = ETH_AF_XDP_DFLT_BUSY_BUDGET;
dev_info->default_rxportconf.nb_queues = 1;
dev_info->default_txportconf.nb_queues = 1;
dev_info->default_rxportconf.ring_size = ETH_AF_XDP_DFLT_NUM_DESCS;
dev_info->default_txportconf.ring_size = ETH_AF_XDP_DFLT_NUM_DESCS;
return 0;
}
static int
eth_stats_get(struct rte_eth_dev *dev, struct rte_eth_stats *stats)
{
struct pmd_internals *internals = dev->data->dev_private;
struct xdp_statistics xdp_stats;
struct pkt_rx_queue *rxq;
struct pkt_tx_queue *txq;
socklen_t optlen;
int i, ret;
for (i = 0; i < dev->data->nb_rx_queues; i++) {
optlen = sizeof(struct xdp_statistics);
rxq = &internals->rx_queues[i];
txq = rxq->pair;
stats->q_ipackets[i] = rxq->stats.rx_pkts;
stats->q_ibytes[i] = rxq->stats.rx_bytes;
stats->q_opackets[i] = txq->stats.tx_pkts;
stats->q_obytes[i] = txq->stats.tx_bytes;
stats->ipackets += stats->q_ipackets[i];
stats->ibytes += stats->q_ibytes[i];
stats->imissed += rxq->stats.rx_dropped;
stats->oerrors += txq->stats.tx_dropped;
ret = getsockopt(xsk_socket__fd(rxq->xsk), SOL_XDP,
XDP_STATISTICS, &xdp_stats, &optlen);
if (ret != 0) {
AF_XDP_LOG(ERR, "getsockopt() failed for XDP_STATISTICS.\n");
return -1;
}
stats->imissed += xdp_stats.rx_dropped;
stats->opackets += stats->q_opackets[i];
stats->obytes += stats->q_obytes[i];
}
return 0;
}
static int
eth_stats_reset(struct rte_eth_dev *dev)
{
struct pmd_internals *internals = dev->data->dev_private;
int i;
for (i = 0; i < internals->queue_cnt; i++) {
memset(&internals->rx_queues[i].stats, 0,
sizeof(struct rx_stats));
memset(&internals->tx_queues[i].stats, 0,
sizeof(struct tx_stats));
}
return 0;
}
static void
remove_xdp_program(struct pmd_internals *internals)
{
uint32_t curr_prog_id = 0;
if (bpf_get_link_xdp_id(internals->if_index, &curr_prog_id,
XDP_FLAGS_UPDATE_IF_NOEXIST)) {
AF_XDP_LOG(ERR, "bpf_get_link_xdp_id failed\n");
return;
}
bpf_set_link_xdp_fd(internals->if_index, -1,
XDP_FLAGS_UPDATE_IF_NOEXIST);
}
static void
xdp_umem_destroy(struct xsk_umem_info *umem)
{
#if defined(XDP_UMEM_UNALIGNED_CHUNK_FLAG)
umem->mb_pool = NULL;
#else
rte_memzone_free(umem->mz);
umem->mz = NULL;
rte_ring_free(umem->buf_ring);
umem->buf_ring = NULL;
#endif
rte_free(umem);
}
static int
eth_dev_close(struct rte_eth_dev *dev)
{
struct pmd_internals *internals = dev->data->dev_private;
struct pkt_rx_queue *rxq;
int i;
if (rte_eal_process_type() != RTE_PROC_PRIMARY)
return 0;
AF_XDP_LOG(INFO, "Closing AF_XDP ethdev on numa socket %u\n",
rte_socket_id());
for (i = 0; i < internals->queue_cnt; i++) {
rxq = &internals->rx_queues[i];
if (rxq->umem == NULL)
break;
xsk_socket__delete(rxq->xsk);
if (__atomic_sub_fetch(&rxq->umem->refcnt, 1, __ATOMIC_ACQUIRE)
== 0) {
(void)xsk_umem__delete(rxq->umem->umem);
xdp_umem_destroy(rxq->umem);
}
/* free pkt_tx_queue */
rte_free(rxq->pair);
rte_free(rxq);
}
/*
* MAC is not allocated dynamically; setting it to NULL prevents
* rte_eth_dev_release_port from trying to release it.
*/
dev->data->mac_addrs = NULL;
if (!internals->use_bpf_link)
remove_xdp_program(internals);
if (internals->shared_umem) {
struct internal_list *list;
/* Remove ethdev from list used to track and share UMEMs */
list = find_internal_resource(internals);
if (list) {
pthread_mutex_lock(&internal_list_lock);
TAILQ_REMOVE(&internal_list, list, next);
pthread_mutex_unlock(&internal_list_lock);
rte_free(list);
}
}
return 0;
}
static int
eth_link_update(struct rte_eth_dev *dev __rte_unused,
int wait_to_complete __rte_unused)
{
return 0;
}
#if defined(XDP_UMEM_UNALIGNED_CHUNK_FLAG)
static inline uintptr_t get_base_addr(struct rte_mempool *mp, uint64_t *align)
{
struct rte_mempool_memhdr *memhdr;
uintptr_t memhdr_addr, aligned_addr;
memhdr = STAILQ_FIRST(&mp->mem_list);
memhdr_addr = (uintptr_t)memhdr->addr;
aligned_addr = memhdr_addr & ~(getpagesize() - 1);
*align = memhdr_addr - aligned_addr;
return aligned_addr;
}
static struct
xsk_umem_info *xdp_umem_configure(struct pmd_internals *internals,
struct pkt_rx_queue *rxq)
{
struct xsk_umem_info *umem = NULL;
int ret;
struct xsk_umem_config usr_config = {
.fill_size = ETH_AF_XDP_DFLT_NUM_DESCS * 2,
.comp_size = ETH_AF_XDP_DFLT_NUM_DESCS,
.flags = XDP_UMEM_UNALIGNED_CHUNK_FLAG};
void *base_addr = NULL;
struct rte_mempool *mb_pool = rxq->mb_pool;
uint64_t umem_size, align = 0;
if (internals->shared_umem) {
if (get_shared_umem(rxq, internals->if_name, &umem) < 0)
return NULL;
if (umem != NULL &&
__atomic_load_n(&umem->refcnt, __ATOMIC_ACQUIRE) <
umem->max_xsks) {
AF_XDP_LOG(INFO, "%s,qid%i sharing UMEM\n",
internals->if_name, rxq->xsk_queue_idx);
__atomic_fetch_add(&umem->refcnt, 1, __ATOMIC_ACQUIRE);
}
}
if (umem == NULL) {
usr_config.frame_size =
rte_mempool_calc_obj_size(mb_pool->elt_size,
mb_pool->flags, NULL);
usr_config.frame_headroom = mb_pool->header_size +
sizeof(struct rte_mbuf) +
rte_pktmbuf_priv_size(mb_pool) +
RTE_PKTMBUF_HEADROOM;
umem = rte_zmalloc_socket("umem", sizeof(*umem), 0,
rte_socket_id());
if (umem == NULL) {
AF_XDP_LOG(ERR, "Failed to allocate umem info");
return NULL;
}
umem->mb_pool = mb_pool;
base_addr = (void *)get_base_addr(mb_pool, &align);
umem_size = (uint64_t)mb_pool->populated_size *
(uint64_t)usr_config.frame_size +
align;
ret = xsk_umem__create(&umem->umem, base_addr, umem_size,
&rxq->fq, &rxq->cq, &usr_config);
if (ret) {
AF_XDP_LOG(ERR, "Failed to create umem");
goto err;
}
umem->buffer = base_addr;
if (internals->shared_umem) {
umem->max_xsks = mb_pool->populated_size /
ETH_AF_XDP_NUM_BUFFERS;
AF_XDP_LOG(INFO, "Max xsks for UMEM %s: %u\n",
mb_pool->name, umem->max_xsks);
}
__atomic_store_n(&umem->refcnt, 1, __ATOMIC_RELEASE);
}
#else
static struct
xsk_umem_info *xdp_umem_configure(struct pmd_internals *internals,
struct pkt_rx_queue *rxq)
{
struct xsk_umem_info *umem;
const struct rte_memzone *mz;
struct xsk_umem_config usr_config = {
.fill_size = ETH_AF_XDP_DFLT_NUM_DESCS,
.comp_size = ETH_AF_XDP_DFLT_NUM_DESCS,
.frame_size = ETH_AF_XDP_FRAME_SIZE,
.frame_headroom = 0 };
char ring_name[RTE_RING_NAMESIZE];
char mz_name[RTE_MEMZONE_NAMESIZE];
int ret;
uint64_t i;
umem = rte_zmalloc_socket("umem", sizeof(*umem), 0, rte_socket_id());
if (umem == NULL) {
AF_XDP_LOG(ERR, "Failed to allocate umem info");
return NULL;
}
snprintf(ring_name, sizeof(ring_name), "af_xdp_ring_%s_%u",
internals->if_name, rxq->xsk_queue_idx);
umem->buf_ring = rte_ring_create(ring_name,
ETH_AF_XDP_NUM_BUFFERS,
rte_socket_id(),
0x0);
if (umem->buf_ring == NULL) {
AF_XDP_LOG(ERR, "Failed to create rte_ring\n");
goto err;
}
for (i = 0; i < ETH_AF_XDP_NUM_BUFFERS; i++)
rte_ring_enqueue(umem->buf_ring,
(void *)(i * ETH_AF_XDP_FRAME_SIZE));
snprintf(mz_name, sizeof(mz_name), "af_xdp_umem_%s_%u",
internals->if_name, rxq->xsk_queue_idx);
mz = rte_memzone_reserve_aligned(mz_name,
ETH_AF_XDP_NUM_BUFFERS * ETH_AF_XDP_FRAME_SIZE,
rte_socket_id(), RTE_MEMZONE_IOVA_CONTIG,
getpagesize());
if (mz == NULL) {
AF_XDP_LOG(ERR, "Failed to reserve memzone for af_xdp umem.\n");
goto err;
}
ret = xsk_umem__create(&umem->umem, mz->addr,
ETH_AF_XDP_NUM_BUFFERS * ETH_AF_XDP_FRAME_SIZE,
&rxq->fq, &rxq->cq,
&usr_config);
if (ret) {
AF_XDP_LOG(ERR, "Failed to create umem");
goto err;
}
umem->mz = mz;
#endif
return umem;
err:
xdp_umem_destroy(umem);
return NULL;
}
static int
load_custom_xdp_prog(const char *prog_path, int if_index, bool use_bpf_link)
{
int ret, prog_fd = -1;
struct bpf_object *obj;
struct bpf_map *map;
ret = bpf_prog_load(prog_path, BPF_PROG_TYPE_XDP, &obj, &prog_fd);
if (ret) {
AF_XDP_LOG(ERR, "Failed to load program %s\n", prog_path);
return ret;
}
/*
* The loaded program must provision for a map of xsks, such that some
* traffic can be redirected to userspace. When the xsk is created,
* libbpf inserts it into the map.
*/
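/* Illustrative example (not part of this file): a minimal custom XDP
* program usable with this PMD declares an "xsks_map" of type
* BPF_MAP_TYPE_XSKMAP and redirects received frames into it, e.g.
*
*   struct {
*           __uint(type, BPF_MAP_TYPE_XSKMAP);
*           __uint(key_size, sizeof(int));
*           __uint(value_size, sizeof(int));
*           __uint(max_entries, 64);
*   } xsks_map SEC(".maps");
*
*   SEC("xdp")
*   int xdp_sock_prog(struct xdp_md *ctx)
*   {
*           return bpf_redirect_map(&xsks_map, ctx->rx_queue_index, 0);
*   }
*
* The map must be named "xsks_map" because it is looked up by name below;
* the program and section names here are arbitrary.
*/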
map = bpf_object__find_map_by_name(obj, "xsks_map");
if (!map) {
AF_XDP_LOG(ERR, "Failed to find xsks_map in %s\n", prog_path);
return -1;
}
/* Link the program with the given network device */
ret = link_xdp_program(if_index, prog_fd, use_bpf_link);
if (ret) {
AF_XDP_LOG(ERR, "Failed to set prog fd %d on interface\n",
prog_fd);
return -1;
}
AF_XDP_LOG(INFO, "Successfully loaded XDP program %s with fd %d\n",
prog_path, prog_fd);
return 0;
}
/* Detect support for busy polling through setsockopt(). */
static int
configure_preferred_busy_poll(struct pkt_rx_queue *rxq)
{
int sock_opt = 1;
int fd = xsk_socket__fd(rxq->xsk);
int ret = 0;
ret = setsockopt(fd, SOL_SOCKET, SO_PREFER_BUSY_POLL,
(void *)&sock_opt, sizeof(sock_opt));
if (ret < 0) {
AF_XDP_LOG(DEBUG, "Failed to set SO_PREFER_BUSY_POLL\n");
goto err_prefer;
}
sock_opt = ETH_AF_XDP_DFLT_BUSY_TIMEOUT;
ret = setsockopt(fd, SOL_SOCKET, SO_BUSY_POLL, (void *)&sock_opt,
sizeof(sock_opt));
if (ret < 0) {
AF_XDP_LOG(DEBUG, "Failed to set SO_BUSY_POLL\n");
goto err_timeout;
}
sock_opt = rxq->busy_budget;
ret = setsockopt(fd, SOL_SOCKET, SO_BUSY_POLL_BUDGET,
(void *)&sock_opt, sizeof(sock_opt));
if (ret < 0) {
AF_XDP_LOG(DEBUG, "Failed to set SO_BUSY_POLL_BUDGET\n");
} else {
AF_XDP_LOG(INFO, "Busy polling budget set to: %u\n",
rxq->busy_budget);
return 0;
}
/* setsockopt failure - attempt to restore xsk to default state and
* proceed without busy polling support.
*/
sock_opt = 0;
ret = setsockopt(fd, SOL_SOCKET, SO_BUSY_POLL, (void *)&sock_opt,
sizeof(sock_opt));
if (ret < 0) {
AF_XDP_LOG(ERR, "Failed to unset SO_BUSY_POLL\n");
return -1;
}
err_timeout:
sock_opt = 0;
ret = setsockopt(fd, SOL_SOCKET, SO_PREFER_BUSY_POLL,
(void *)&sock_opt, sizeof(sock_opt));
if (ret < 0) {
AF_XDP_LOG(ERR, "Failed to unset SO_PREFER_BUSY_POLL\n");
return -1;
}
err_prefer:
rxq->busy_budget = 0;
return 0;
}
static int
xsk_configure(struct pmd_internals *internals, struct pkt_rx_queue *rxq,
int ring_size)
{
struct xsk_socket_config cfg;
struct pkt_tx_queue *txq = rxq->pair;
int ret = 0;
int reserve_size = ETH_AF_XDP_DFLT_NUM_DESCS;
struct rte_mbuf *fq_bufs[reserve_size];
rxq->umem = xdp_umem_configure(internals, rxq);
if (rxq->umem == NULL)
return -ENOMEM;
txq->umem = rxq->umem;
cfg.rx_size = ring_size;
cfg.tx_size = ring_size;
cfg.libbpf_flags = 0;
cfg.xdp_flags = XDP_FLAGS_UPDATE_IF_NOEXIST;
cfg.bind_flags = 0;
#if defined(XDP_USE_NEED_WAKEUP)
cfg.bind_flags |= XDP_USE_NEED_WAKEUP;
#endif
if (strnlen(internals->prog_path, PATH_MAX) &&
!internals->custom_prog_configured) {
ret = load_custom_xdp_prog(internals->prog_path,
internals->if_index,
internals->use_bpf_link);
if (ret) {
AF_XDP_LOG(ERR, "Failed to load custom XDP program %s\n",
internals->prog_path);
goto err;
}
internals->custom_prog_configured = 1;
}
if (internals->shared_umem)
ret = create_shared_socket(&rxq->xsk, internals->if_name,
rxq->xsk_queue_idx, rxq->umem->umem, &rxq->rx,
&txq->tx, &rxq->fq, &rxq->cq, &cfg);
else
ret = xsk_socket__create(&rxq->xsk, internals->if_name,
rxq->xsk_queue_idx, rxq->umem->umem, &rxq->rx,
&txq->tx, &cfg);
if (ret) {
AF_XDP_LOG(ERR, "Failed to create xsk socket.\n");
goto err;
}
#if defined(XDP_UMEM_UNALIGNED_CHUNK_FLAG)
ret = rte_pktmbuf_alloc_bulk(rxq->umem->mb_pool, fq_bufs, reserve_size);
if (ret) {
AF_XDP_LOG(DEBUG, "Failed to get enough buffers for fq.\n");
goto err;
}
#endif
if (rxq->busy_budget) {
ret = configure_preferred_busy_poll(rxq);
if (ret) {
AF_XDP_LOG(ERR, "Failed configure busy polling.\n");
goto err;
}
}
ret = reserve_fill_queue(rxq->umem, reserve_size, fq_bufs, &rxq->fq);
if (ret) {
xsk_socket__delete(rxq->xsk);
AF_XDP_LOG(ERR, "Failed to reserve fill queue.\n");
goto err;
}
return 0;
err:
if (__atomic_sub_fetch(&rxq->umem->refcnt, 1, __ATOMIC_ACQUIRE) == 0)
xdp_umem_destroy(rxq->umem);
return ret;
}
static int
eth_rx_queue_setup(struct rte_eth_dev *dev,
uint16_t rx_queue_id,
uint16_t nb_rx_desc,
unsigned int socket_id __rte_unused,
const struct rte_eth_rxconf *rx_conf __rte_unused,
struct rte_mempool *mb_pool)
{
struct pmd_internals *internals = dev->data->dev_private;
struct pkt_rx_queue *rxq;
int ret;
rxq = &internals->rx_queues[rx_queue_id];
AF_XDP_LOG(INFO, "Set up rx queue, rx queue id: %d, xsk queue id: %d\n",
rx_queue_id, rxq->xsk_queue_idx);
#ifndef XDP_UMEM_UNALIGNED_CHUNK_FLAG
uint32_t buf_size, data_size;
/* Now get the space available for data in the mbuf */
buf_size = rte_pktmbuf_data_room_size(mb_pool) -
RTE_PKTMBUF_HEADROOM;
data_size = ETH_AF_XDP_FRAME_SIZE;
if (data_size > buf_size) {
AF_XDP_LOG(ERR, "%s: %d bytes will not fit in mbuf (%d bytes)\n",
dev->device->name, data_size, buf_size);
ret = -ENOMEM;
goto err;
}
#endif
rxq->mb_pool = mb_pool;
if (xsk_configure(internals, rxq, nb_rx_desc)) {
AF_XDP_LOG(ERR, "Failed to configure xdp socket\n");
ret = -EINVAL;
goto err;
}
if (!rxq->busy_budget)
AF_XDP_LOG(DEBUG, "Preferred busy polling not enabled\n");
rxq->fds[0].fd = xsk_socket__fd(rxq->xsk);
rxq->fds[0].events = POLLIN;
dev->data->rx_queues[rx_queue_id] = rxq;
return 0;
err:
return ret;
}
static int
eth_tx_queue_setup(struct rte_eth_dev *dev,
uint16_t tx_queue_id,
uint16_t nb_tx_desc __rte_unused,
unsigned int socket_id __rte_unused,
const struct rte_eth_txconf *tx_conf __rte_unused)
{
struct pmd_internals *internals = dev->data->dev_private;
struct pkt_tx_queue *txq;
txq = &internals->tx_queues[tx_queue_id];
dev->data->tx_queues[tx_queue_id] = txq;
return 0;
}
static int
eth_dev_mtu_set(struct rte_eth_dev *dev, uint16_t mtu)
{
struct pmd_internals *internals = dev->data->dev_private;
struct ifreq ifr = { .ifr_mtu = mtu };
int ret;
int s;
s = socket(PF_INET, SOCK_DGRAM, 0);
if (s < 0)
return -EINVAL;
strlcpy(ifr.ifr_name, internals->if_name, IFNAMSIZ);
ret = ioctl(s, SIOCSIFMTU, &ifr);
close(s);
return (ret < 0) ? -errno : 0;
}
static int
eth_dev_change_flags(char *if_name, uint32_t flags, uint32_t mask)
{
struct ifreq ifr;
int ret = 0;
int s;
s = socket(PF_INET, SOCK_DGRAM, 0);
if (s < 0)
return -errno;
strlcpy(ifr.ifr_name, if_name, IFNAMSIZ);
if (ioctl(s, SIOCGIFFLAGS, &ifr) < 0) {
ret = -errno;
goto out;
}
ifr.ifr_flags &= mask;
ifr.ifr_flags |= flags;
if (ioctl(s, SIOCSIFFLAGS, &ifr) < 0) {
ret = -errno;
goto out;
}
out:
close(s);
return ret;
}
static int
eth_dev_promiscuous_enable(struct rte_eth_dev *dev)
{
struct pmd_internals *internals = dev->data->dev_private;
return eth_dev_change_flags(internals->if_name, IFF_PROMISC, ~0);
}
static int
eth_dev_promiscuous_disable(struct rte_eth_dev *dev)
{
struct pmd_internals *internals = dev->data->dev_private;
return eth_dev_change_flags(internals->if_name, 0, ~IFF_PROMISC);
}
static const struct eth_dev_ops ops = {
.dev_start = eth_dev_start,
.dev_stop = eth_dev_stop,
.dev_close = eth_dev_close,
.dev_configure = eth_dev_configure,
.dev_infos_get = eth_dev_info,
.mtu_set = eth_dev_mtu_set,
.promiscuous_enable = eth_dev_promiscuous_enable,
.promiscuous_disable = eth_dev_promiscuous_disable,
.rx_queue_setup = eth_rx_queue_setup,
.tx_queue_setup = eth_tx_queue_setup,
.link_update = eth_link_update,
.stats_get = eth_stats_get,
.stats_reset = eth_stats_reset,
.get_monitor_addr = eth_get_monitor_addr,
};
/** parse busy_budget argument */
static int
parse_budget_arg(const char *key __rte_unused,
const char *value, void *extra_args)
{
int *i = (int *)extra_args;
char *end;
*i = strtol(value, &end, 10);
if (*i < 0 || *i > UINT16_MAX) {
AF_XDP_LOG(ERR, "Invalid busy_budget %i, must be >= 0 and <= %u\n",
*i, UINT16_MAX);
return -EINVAL;
}
return 0;
}
/** parse integer from integer argument */
static int
parse_integer_arg(const char *key __rte_unused,
const char *value, void *extra_args)
{
int *i = (int *)extra_args;
char *end;
*i = strtol(value, &end, 10);
if (*i < 0) {
AF_XDP_LOG(ERR, "Argument has to be positive.\n");
return -EINVAL;
}
return 0;
}
/** parse name argument */
static int
parse_name_arg(const char *key __rte_unused,
const char *value, void *extra_args)
{
char *name = extra_args;
if (strnlen(value, IFNAMSIZ) > IFNAMSIZ - 1) {
AF_XDP_LOG(ERR, "Invalid name %s, should be less than %u bytes.\n",
value, IFNAMSIZ);
return -EINVAL;
}
strlcpy(name, value, IFNAMSIZ);
return 0;
}
/** parse xdp prog argument */
static int
parse_prog_arg(const char *key __rte_unused,
const char *value, void *extra_args)
{
char *path = extra_args;
if (strnlen(value, PATH_MAX) == PATH_MAX) {
AF_XDP_LOG(ERR, "Invalid path %s, should be less than %u bytes.\n",
value, PATH_MAX);
return -EINVAL;
}
if (access(value, F_OK) != 0) {
AF_XDP_LOG(ERR, "Error accessing %s: %s\n",
value, strerror(errno));
return -EINVAL;
}
strlcpy(path, value, PATH_MAX);
return 0;
}
static int
xdp_get_channels_info(const char *if_name, int *max_queues,
int *combined_queues)
{
struct ethtool_channels channels;
struct ifreq ifr;
int fd, ret;
fd = socket(AF_INET, SOCK_DGRAM, 0);
if (fd < 0)
return -1;
channels.cmd = ETHTOOL_GCHANNELS;
ifr.ifr_data = (void *)&channels;
strlcpy(ifr.ifr_name, if_name, IFNAMSIZ);
ret = ioctl(fd, SIOCETHTOOL, &ifr);
if (ret) {
if (errno == EOPNOTSUPP) {
ret = 0;
} else {
ret = -errno;
goto out;
}
}
if (channels.max_combined == 0 || errno == EOPNOTSUPP) {
/* If the device says it has no channels, then all traffic
* is sent to a single stream, so max queues = 1.
*/
*max_queues = 1;
*combined_queues = 1;
} else {
*max_queues = channels.max_combined;
*combined_queues = channels.combined_count;
}
out:
close(fd);
return ret;
}
static int
parse_parameters(struct rte_kvargs *kvlist, char *if_name, int *start_queue,
int *queue_cnt, int *shared_umem, char *prog_path,
int *busy_budget)
{
int ret;
ret = rte_kvargs_process(kvlist, ETH_AF_XDP_IFACE_ARG,
&parse_name_arg, if_name);
if (ret < 0)
goto free_kvlist;
ret = rte_kvargs_process(kvlist, ETH_AF_XDP_START_QUEUE_ARG,
&parse_integer_arg, start_queue);
if (ret < 0)
goto free_kvlist;
ret = rte_kvargs_process(kvlist, ETH_AF_XDP_QUEUE_COUNT_ARG,
&parse_integer_arg, queue_cnt);
if (ret < 0 || *queue_cnt <= 0) {
ret = -EINVAL;
goto free_kvlist;
}
ret = rte_kvargs_process(kvlist, ETH_AF_XDP_SHARED_UMEM_ARG,
&parse_integer_arg, shared_umem);
if (ret < 0)
goto free_kvlist;
ret = rte_kvargs_process(kvlist, ETH_AF_XDP_PROG_ARG,
&parse_prog_arg, prog_path);
if (ret < 0)
goto free_kvlist;
ret = rte_kvargs_process(kvlist, ETH_AF_XDP_BUDGET_ARG,
&parse_budget_arg, busy_budget);
if (ret < 0)
goto free_kvlist;
free_kvlist:
rte_kvargs_free(kvlist);
return ret;
}
static int
get_iface_info(const char *if_name,
struct rte_ether_addr *eth_addr,
int *if_index)
{
struct ifreq ifr;
int sock = socket(AF_INET, SOCK_DGRAM, IPPROTO_IP);
if (sock < 0)
return -1;
strlcpy(ifr.ifr_name, if_name, IFNAMSIZ);
if (ioctl(sock, SIOCGIFINDEX, &ifr))
goto error;
*if_index = ifr.ifr_ifindex;
if (ioctl(sock, SIOCGIFHWADDR, &ifr))
goto error;
rte_memcpy(eth_addr, ifr.ifr_hwaddr.sa_data, RTE_ETHER_ADDR_LEN);
close(sock);
return 0;
error:
close(sock);
return -1;
}
static struct rte_eth_dev *
init_internals(struct rte_vdev_device *dev, const char *if_name,
int start_queue_idx, int queue_cnt, int shared_umem,
const char *prog_path, int busy_budget)
{
const char *name = rte_vdev_device_name(dev);
const unsigned int numa_node = dev->device.numa_node;
struct pmd_internals *internals;
struct rte_eth_dev *eth_dev;
int ret;
int i;
internals = rte_zmalloc_socket(name, sizeof(*internals), 0, numa_node);
if (internals == NULL)
return NULL;
internals->start_queue_idx = start_queue_idx;
internals->queue_cnt = queue_cnt;
strlcpy(internals->if_name, if_name, IFNAMSIZ);
strlcpy(internals->prog_path, prog_path, PATH_MAX);
internals->custom_prog_configured = 0;
internals->use_bpf_link = probe_bpf_link();
#ifndef ETH_AF_XDP_SHARED_UMEM
if (shared_umem) {
AF_XDP_LOG(ERR, "Shared UMEM feature not available. "
"Check kernel and libbpf version\n");
goto err_free_internals;
}
#endif
internals->shared_umem = shared_umem;
if (xdp_get_channels_info(if_name, &internals->max_queue_cnt,
&internals->combined_queue_cnt)) {
AF_XDP_LOG(ERR, "Failed to get channel info of interface: %s\n",
if_name);
goto err_free_internals;
}
if (queue_cnt > internals->combined_queue_cnt) {
AF_XDP_LOG(ERR, "Specified queue count %d is larger than combined queue count %d.\n",
queue_cnt, internals->combined_queue_cnt);
goto err_free_internals;
}
internals->rx_queues = rte_zmalloc_socket(NULL,
sizeof(struct pkt_rx_queue) * queue_cnt,
0, numa_node);
if (internals->rx_queues == NULL) {
AF_XDP_LOG(ERR, "Failed to allocate memory for rx queues.\n");
goto err_free_internals;
}
internals->tx_queues = rte_zmalloc_socket(NULL,
sizeof(struct pkt_tx_queue) * queue_cnt,
0, numa_node);
if (internals->tx_queues == NULL) {
AF_XDP_LOG(ERR, "Failed to allocate memory for tx queues.\n");
goto err_free_rx;
}
for (i = 0; i < queue_cnt; i++) {
internals->tx_queues[i].pair = &internals->rx_queues[i];
internals->rx_queues[i].pair = &internals->tx_queues[i];
internals->rx_queues[i].xsk_queue_idx = start_queue_idx + i;
internals->tx_queues[i].xsk_queue_idx = start_queue_idx + i;
internals->rx_queues[i].busy_budget = busy_budget;
}
ret = get_iface_info(if_name, &internals->eth_addr,
&internals->if_index);
if (ret)
goto err_free_tx;
eth_dev = rte_eth_vdev_allocate(dev, 0);
if (eth_dev == NULL)
goto err_free_tx;
eth_dev->data->dev_private = internals;
eth_dev->data->dev_link = pmd_link;
eth_dev->data->mac_addrs = &internals->eth_addr;
eth_dev->data->dev_flags |= RTE_ETH_DEV_AUTOFILL_QUEUE_XSTATS;
eth_dev->dev_ops = &ops;
eth_dev->rx_pkt_burst = eth_af_xdp_rx;
eth_dev->tx_pkt_burst = eth_af_xdp_tx;
#if defined(XDP_UMEM_UNALIGNED_CHUNK_FLAG)
AF_XDP_LOG(INFO, "Zero copy between umem and mbuf enabled.\n");
#endif
return eth_dev;
err_free_tx:
rte_free(internals->tx_queues);
err_free_rx:
rte_free(internals->rx_queues);
err_free_internals:
rte_free(internals);
return NULL;
}
static int
rte_pmd_af_xdp_probe(struct rte_vdev_device *dev)
{
struct rte_kvargs *kvlist;
char if_name[IFNAMSIZ] = {'\0'};
int xsk_start_queue_idx = ETH_AF_XDP_DFLT_START_QUEUE_IDX;
int xsk_queue_cnt = ETH_AF_XDP_DFLT_QUEUE_COUNT;
int shared_umem = 0;
char prog_path[PATH_MAX] = {'\0'};
int busy_budget = -1;
struct rte_eth_dev *eth_dev = NULL;
const char *name;
AF_XDP_LOG(INFO, "Initializing pmd_af_xdp for %s\n",
rte_vdev_device_name(dev));
name = rte_vdev_device_name(dev);
if (rte_eal_process_type() == RTE_PROC_SECONDARY) {
AF_XDP_LOG(ERR, "Failed to probe %s. "
"AF_XDP PMD does not support secondary processes.\n",
name);
return -ENOTSUP;
}
kvlist = rte_kvargs_parse(rte_vdev_device_args(dev), valid_arguments);
if (kvlist == NULL) {
AF_XDP_LOG(ERR, "Invalid kvargs key\n");
return -EINVAL;
}
if (dev->device.numa_node == SOCKET_ID_ANY)
dev->device.numa_node = rte_socket_id();
if (parse_parameters(kvlist, if_name, &xsk_start_queue_idx,
&xsk_queue_cnt, &shared_umem, prog_path,
&busy_budget) < 0) {
AF_XDP_LOG(ERR, "Invalid kvargs value\n");
return -EINVAL;
}
if (strlen(if_name) == 0) {
AF_XDP_LOG(ERR, "Network interface must be specified\n");
return -EINVAL;
}
busy_budget = busy_budget == -1 ? ETH_AF_XDP_DFLT_BUSY_BUDGET :
busy_budget;
eth_dev = init_internals(dev, if_name, xsk_start_queue_idx,
xsk_queue_cnt, shared_umem, prog_path,
busy_budget);
if (eth_dev == NULL) {
AF_XDP_LOG(ERR, "Failed to init internals\n");
return -1;
}
rte_eth_dev_probing_finish(eth_dev);
return 0;
}
static int
rte_pmd_af_xdp_remove(struct rte_vdev_device *dev)
{
struct rte_eth_dev *eth_dev = NULL;
AF_XDP_LOG(INFO, "Removing AF_XDP ethdev on numa socket %u\n",
rte_socket_id());
if (dev == NULL)
return -1;
/* find the ethdev entry */
eth_dev = rte_eth_dev_allocated(rte_vdev_device_name(dev));
if (eth_dev == NULL)
return 0;
eth_dev_close(eth_dev);
rte_eth_dev_release_port(eth_dev);
return 0;
}
static struct rte_vdev_driver pmd_af_xdp_drv = {
.probe = rte_pmd_af_xdp_probe,
.remove = rte_pmd_af_xdp_remove,
};
RTE_PMD_REGISTER_VDEV(net_af_xdp, pmd_af_xdp_drv);
RTE_PMD_REGISTER_PARAM_STRING(net_af_xdp,
"iface=<string> "
"start_queue=<int> "
"queue_count=<int> "
"shared_umem=<int> "
"xdp_prog=<string> "
"busy_budget=<int>");