numam-dpdk/drivers/net/tap/rte_eth_tap.c
Stephen Hemminger 9b4b4d95bc net/tap: add persist option
The TAP device only lasts as long as the DPDK application that opened
it is running. This behavior is bad if the DPDK application needs
to be updated transparently without disturbing other services
using the tap device.

Add a persist feature to the TAP device. If this flag is set, the
kernel network device remains even after the application has exited.
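
For example (illustrative command; the devarg is named per
ETH_TAP_PERSIST_ARG below, and EAL options vary by setup):

    dpdk-testpmd --vdev=net_tap0,iface=dtap0,persist -- -i

The dtap0 netdev then remains visible to "ip link" after exit.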

Signed-off-by: Stephen Hemminger <stephen@networkplumber.org>
Reviewed-by: Andrew Rybchenko <andrew.rybchenko@oktetlabs.ru>
2022-10-04 16:40:24 +02:00

/* SPDX-License-Identifier: BSD-3-Clause
* Copyright(c) 2016-2017 Intel Corporation
*/
#include <rte_atomic.h>
#include <rte_branch_prediction.h>
#include <rte_byteorder.h>
#include <rte_common.h>
#include <rte_mbuf.h>
#include <ethdev_driver.h>
#include <ethdev_vdev.h>
#include <rte_malloc.h>
#include <bus_vdev_driver.h>
#include <rte_kvargs.h>
#include <rte_net.h>
#include <rte_debug.h>
#include <rte_ip.h>
#include <rte_string_fns.h>
#include <rte_ethdev.h>
#include <rte_errno.h>
#include <rte_cycles.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <sys/socket.h>
#include <sys/ioctl.h>
#include <sys/utsname.h>
#include <sys/mman.h>
#include <errno.h>
#include <signal.h>
#include <stdbool.h>
#include <stdint.h>
#include <stdlib.h>
#include <sys/uio.h>
#include <unistd.h>
#include <arpa/inet.h>
#include <net/if.h>
#include <linux/if_tun.h>
#include <linux/if_ether.h>
#include <fcntl.h>
#include <ctype.h>
#include <tap_rss.h>
#include <rte_eth_tap.h>
#include <tap_flow.h>
#include <tap_netlink.h>
#include <tap_tcmsgs.h>
/* Linux based path to the TUN device */
#define TUN_TAP_DEV_PATH "/dev/net/tun"
#define DEFAULT_TAP_NAME "dtap"
#define DEFAULT_TUN_NAME "dtun"
#define ETH_TAP_IFACE_ARG "iface"
#define ETH_TAP_REMOTE_ARG "remote"
#define ETH_TAP_MAC_ARG "mac"
#define ETH_TAP_MAC_FIXED "fixed"
#define ETH_TAP_PERSIST_ARG "persist"
#define ETH_TAP_USR_MAC_FMT "xx:xx:xx:xx:xx:xx"
#define ETH_TAP_CMP_MAC_FMT "0123456789ABCDEFabcdef"
#define ETH_TAP_MAC_ARG_FMT ETH_TAP_MAC_FIXED "|" ETH_TAP_USR_MAC_FMT
#define TAP_GSO_MBUFS_PER_CORE 128
#define TAP_GSO_MBUF_SEG_SIZE 128
#define TAP_GSO_MBUF_CACHE_SIZE 4
#define TAP_GSO_MBUFS_NUM \
(TAP_GSO_MBUFS_PER_CORE * TAP_GSO_MBUF_CACHE_SIZE)
/* IPC key for queue fds sync */
#define TAP_MP_KEY "tap_mp_sync_queues"
#define TAP_MP_REQ_START_RXTX "tap_mp_req_start_rxtx"
#define TAP_IOV_DEFAULT_MAX 1024
#define TAP_RX_OFFLOAD (RTE_ETH_RX_OFFLOAD_SCATTER | \
RTE_ETH_RX_OFFLOAD_IPV4_CKSUM | \
RTE_ETH_RX_OFFLOAD_UDP_CKSUM | \
RTE_ETH_RX_OFFLOAD_TCP_CKSUM)
#define TAP_TX_OFFLOAD (RTE_ETH_TX_OFFLOAD_MULTI_SEGS | \
RTE_ETH_TX_OFFLOAD_IPV4_CKSUM | \
RTE_ETH_TX_OFFLOAD_UDP_CKSUM | \
RTE_ETH_TX_OFFLOAD_TCP_CKSUM | \
RTE_ETH_TX_OFFLOAD_TCP_TSO)
static int tap_devices_count;
static const char *tuntap_types[ETH_TUNTAP_TYPE_MAX] = {
"UNKNOWN", "TUN", "TAP"
};
static const char *valid_arguments[] = {
ETH_TAP_IFACE_ARG,
ETH_TAP_REMOTE_ARG,
ETH_TAP_MAC_ARG,
ETH_TAP_PERSIST_ARG,
NULL
};
static volatile uint32_t tap_trigger; /* Rx trigger */
static struct rte_eth_link pmd_link = {
.link_speed = RTE_ETH_SPEED_NUM_10G,
.link_duplex = RTE_ETH_LINK_FULL_DUPLEX,
.link_status = RTE_ETH_LINK_DOWN,
.link_autoneg = RTE_ETH_LINK_FIXED,
};
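/*
 * Rx trigger handler. tun_alloc() arms a real-time signal on each queue
 * file descriptor (F_SETSIG + O_ASYNC), so this handler runs whenever the
 * kernel has data pending. pmd_rx_burst() compares tap_trigger against
 * the per-queue trigger_seen value and skips the readv() syscall entirely
 * when nothing new has arrived since the previous poll.
 */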
static void
tap_trigger_cb(int sig __rte_unused)
{
/* Valid trigger values are nonzero */
tap_trigger = (tap_trigger + 1) | 0x80000000;
}
/* Specifies on what netdevices the ioctl should be applied */
enum ioctl_mode {
LOCAL_AND_REMOTE,
LOCAL_ONLY,
REMOTE_ONLY,
};
/* Message header to synchronize queues via IPC */
struct ipc_queues {
char port_name[RTE_DEV_NAME_MAX_LEN];
int rxq_count;
int txq_count;
/*
* The file descriptors are in the dedicated part
* of the Unix message to be translated by the kernel.
*/
};
static int tap_intr_handle_set(struct rte_eth_dev *dev, int set);
/**
* Tun/Tap allocation routine
*
* @param[in] pmd
* Pointer to private structure.
*
* @param[in] is_keepalive
* Keepalive flag
*
* @param[in] persistent
* Mark device as persistent
*
* @return
* -1 on failure, fd on success
*/
static int
tun_alloc(struct pmd_internals *pmd, int is_keepalive, int persistent)
{
struct ifreq ifr;
#ifdef IFF_MULTI_QUEUE
unsigned int features;
#endif
int fd, signo, flags;
memset(&ifr, 0, sizeof(struct ifreq));
/*
* Do not set IFF_NO_PI, as the packet information header is needed
* to check if a received packet has been truncated.
*/
ifr.ifr_flags = (pmd->type == ETH_TUNTAP_TYPE_TAP) ?
IFF_TAP : IFF_TUN | IFF_POINTOPOINT;
strlcpy(ifr.ifr_name, pmd->name, IFNAMSIZ);
fd = open(TUN_TAP_DEV_PATH, O_RDWR);
if (fd < 0) {
TAP_LOG(ERR, "Unable to open %s interface", TUN_TAP_DEV_PATH);
goto error;
}
#ifdef IFF_MULTI_QUEUE
/* Grab the TUN features to verify we can work multi-queue */
if (ioctl(fd, TUNGETFEATURES, &features) < 0) {
TAP_LOG(ERR, "unable to get TUN/TAP features");
goto error;
}
TAP_LOG(DEBUG, "%s Features %08x", TUN_TAP_DEV_PATH, features);
if (features & IFF_MULTI_QUEUE) {
TAP_LOG(DEBUG, " Multi-queue support for %d queues",
RTE_PMD_TAP_MAX_QUEUES);
ifr.ifr_flags |= IFF_MULTI_QUEUE;
} else
#endif
{
ifr.ifr_flags |= IFF_ONE_QUEUE;
TAP_LOG(DEBUG, " Single queue only support");
}
/* Set the TUN/TAP configuration and set the name if needed */
if (ioctl(fd, TUNSETIFF, (void *)&ifr) < 0) {
TAP_LOG(WARNING, "Unable to set TUNSETIFF for %s: %s",
ifr.ifr_name, strerror(errno));
goto error;
}
/* Keep the device after application exit */
if (persistent && ioctl(fd, TUNSETPERSIST, 1) < 0) {
TAP_LOG(WARNING,
"Unable to set persist %s: %s",
ifr.ifr_name, strerror(errno));
goto error;
}
/*
* Name passed to the kernel may be a wildcard like dtun%d,
* so read back the name of the device actually created.
*/
TAP_LOG(DEBUG, "Device name is '%s'", ifr.ifr_name);
strlcpy(pmd->name, ifr.ifr_name, RTE_ETH_NAME_MAX_LEN);
if (is_keepalive) {
/*
* Detach the TUN/TAP keep-alive queue
* to avoid traffic through it
*/
ifr.ifr_flags = IFF_DETACH_QUEUE;
if (ioctl(fd, TUNSETQUEUE, (void *)&ifr) < 0) {
TAP_LOG(WARNING,
"Unable to detach keep-alive queue for %s: %s",
ifr.ifr_name, strerror(errno));
goto error;
}
}
flags = fcntl(fd, F_GETFL);
if (flags == -1) {
TAP_LOG(WARNING,
"Unable to get %s current flags\n",
ifr.ifr_name);
goto error;
}
/* Always set the file descriptor to non-blocking */
flags |= O_NONBLOCK;
if (fcntl(fd, F_SETFL, flags) < 0) {
TAP_LOG(WARNING,
"Unable to set %s to nonblocking: %s",
ifr.ifr_name, strerror(errno));
goto error;
}
/* Find a free realtime signal */
for (signo = SIGRTMIN + 1; signo < SIGRTMAX; signo++) {
struct sigaction sa;
if (sigaction(signo, NULL, &sa) == -1) {
TAP_LOG(WARNING,
"Unable to get current rt-signal %d handler",
signo);
goto error;
}
/* Already have the handler we want on this signal */
if (sa.sa_handler == tap_trigger_cb)
break;
/* Is handler in use by application */
if (sa.sa_handler != SIG_DFL) {
TAP_LOG(DEBUG,
"Skipping used rt-signal %d", signo);
continue;
}
sa = (struct sigaction) {
.sa_flags = SA_RESTART,
.sa_handler = tap_trigger_cb,
};
if (sigaction(signo, &sa, NULL) == -1) {
TAP_LOG(WARNING,
"Unable to set rt-signal %d handler\n", signo);
goto error;
}
/* Found a good signal to use */
TAP_LOG(DEBUG,
"Using rt-signal %d", signo);
break;
}
if (signo == SIGRTMAX) {
TAP_LOG(WARNING, "All rt-signals are in use\n");
/* Disable trigger globally in case of error */
tap_trigger = 0;
TAP_LOG(NOTICE, "No Rx trigger signal available\n");
} else {
/* Enable signal on file descriptor */
if (fcntl(fd, F_SETSIG, signo) < 0) {
TAP_LOG(WARNING, "Unable to set signo %d for fd %d: %s",
signo, fd, strerror(errno));
goto error;
}
if (fcntl(fd, F_SETFL, flags | O_ASYNC) < 0) {
TAP_LOG(WARNING, "Unable to set fcntl flags: %s",
strerror(errno));
goto error;
}
if (fcntl(fd, F_SETOWN, getpid()) < 0) {
TAP_LOG(WARNING, "Unable to set fcntl owner: %s",
strerror(errno));
goto error;
}
}
return fd;
error:
if (fd >= 0)
close(fd);
return -1;
}
static void
tap_verify_csum(struct rte_mbuf *mbuf)
{
uint32_t l2 = mbuf->packet_type & RTE_PTYPE_L2_MASK;
uint32_t l3 = mbuf->packet_type & RTE_PTYPE_L3_MASK;
uint32_t l4 = mbuf->packet_type & RTE_PTYPE_L4_MASK;
unsigned int l2_len = sizeof(struct rte_ether_hdr);
unsigned int l3_len;
uint16_t cksum = 0;
void *l3_hdr;
void *l4_hdr;
struct rte_udp_hdr *udp_hdr;
if (l2 == RTE_PTYPE_L2_ETHER_VLAN)
l2_len += 4;
else if (l2 == RTE_PTYPE_L2_ETHER_QINQ)
l2_len += 8;
/* Don't verify checksum for packets with discontinuous L2 header */
if (unlikely(l2_len + sizeof(struct rte_ipv4_hdr) >
rte_pktmbuf_data_len(mbuf)))
return;
l3_hdr = rte_pktmbuf_mtod_offset(mbuf, void *, l2_len);
if (l3 == RTE_PTYPE_L3_IPV4 || l3 == RTE_PTYPE_L3_IPV4_EXT) {
struct rte_ipv4_hdr *iph = l3_hdr;
l3_len = rte_ipv4_hdr_len(iph);
if (unlikely(l2_len + l3_len > rte_pktmbuf_data_len(mbuf)))
return;
/* check that the total length reported by header is not
* greater than the total received size
*/
if (l2_len + rte_be_to_cpu_16(iph->total_length) >
rte_pktmbuf_data_len(mbuf))
return;
cksum = ~rte_raw_cksum(iph, l3_len);
mbuf->ol_flags |= cksum ?
RTE_MBUF_F_RX_IP_CKSUM_BAD :
RTE_MBUF_F_RX_IP_CKSUM_GOOD;
} else if (l3 == RTE_PTYPE_L3_IPV6) {
struct rte_ipv6_hdr *iph = l3_hdr;
l3_len = sizeof(struct rte_ipv6_hdr);
/* check that the total length reported by header is not
* greater than the total received size
*/
if (l2_len + l3_len + rte_be_to_cpu_16(iph->payload_len) >
rte_pktmbuf_data_len(mbuf))
return;
} else {
/* - RTE_PTYPE_L3_IPV4_EXT_UNKNOWN cannot happen because
* mbuf->packet_type is filled by rte_net_get_ptype() which
* never returns this value.
* - IPv6 extensions are not supported.
*/
return;
}
if (l4 == RTE_PTYPE_L4_UDP || l4 == RTE_PTYPE_L4_TCP) {
int cksum_ok;
l4_hdr = rte_pktmbuf_mtod_offset(mbuf, void *, l2_len + l3_len);
/* Don't verify checksum for multi-segment packets. */
if (mbuf->nb_segs > 1)
return;
if (l3 == RTE_PTYPE_L3_IPV4 || l3 == RTE_PTYPE_L3_IPV4_EXT) {
if (l4 == RTE_PTYPE_L4_UDP) {
udp_hdr = (struct rte_udp_hdr *)l4_hdr;
if (udp_hdr->dgram_cksum == 0) {
/*
* For IPv4, a zero UDP checksum
* indicates that the sender did not
* generate one [RFC 768].
*/
mbuf->ol_flags |= RTE_MBUF_F_RX_L4_CKSUM_NONE;
return;
}
}
cksum_ok = !rte_ipv4_udptcp_cksum_verify(l3_hdr,
l4_hdr);
} else { /* l3 == RTE_PTYPE_L3_IPV6, checked above */
cksum_ok = !rte_ipv6_udptcp_cksum_verify(l3_hdr,
l4_hdr);
}
mbuf->ol_flags |= cksum_ok ?
RTE_MBUF_F_RX_L4_CKSUM_GOOD : RTE_MBUF_F_RX_L4_CKSUM_BAD;
}
}
static void
tap_rxq_pool_free(struct rte_mbuf *pool)
{
struct rte_mbuf *mbuf = pool;
uint16_t nb_segs = 1;
if (mbuf == NULL)
return;
while (mbuf->next) {
mbuf = mbuf->next;
nb_segs++;
}
pool->nb_segs = nb_segs;
rte_pktmbuf_free(pool);
}
/* Callback to handle the rx burst of packets to the correct interface and
* file descriptor(s) in a multi-queue setup.
*/
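/*
 * Rx mbufs are pre-allocated and chained in rxq->pool, and their data
 * buffers are exposed to the kernel through rxq->iovecs (iovecs[0] always
 * points at struct tun_pi). After a successful readv(), each consumed
 * segment is detached from the pool and handed to the caller, and a
 * freshly allocated mbuf takes its place in both the pool chain and the
 * iovec array, keeping the descriptors ready for the next read.
 */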
static uint16_t
pmd_rx_burst(void *queue, struct rte_mbuf **bufs, uint16_t nb_pkts)
{
struct rx_queue *rxq = queue;
struct pmd_process_private *process_private;
uint16_t num_rx;
unsigned long num_rx_bytes = 0;
uint32_t trigger = tap_trigger;
if (trigger == rxq->trigger_seen)
return 0;
process_private = rte_eth_devices[rxq->in_port].process_private;
for (num_rx = 0; num_rx < nb_pkts; ) {
struct rte_mbuf *mbuf = rxq->pool;
struct rte_mbuf *seg = NULL;
struct rte_mbuf *new_tail = NULL;
uint16_t data_off = rte_pktmbuf_headroom(mbuf);
int len;
len = readv(process_private->rxq_fds[rxq->queue_id],
*rxq->iovecs,
1 + (rxq->rxmode->offloads & RTE_ETH_RX_OFFLOAD_SCATTER ?
rxq->nb_rx_desc : 1));
if (len < (int)sizeof(struct tun_pi))
break;
/* Packet couldn't fit in the provided mbuf */
if (unlikely(rxq->pi.flags & TUN_PKT_STRIP)) {
rxq->stats.ierrors++;
continue;
}
len -= sizeof(struct tun_pi);
mbuf->pkt_len = len;
mbuf->port = rxq->in_port;
while (1) {
struct rte_mbuf *buf = rte_pktmbuf_alloc(rxq->mp);
if (unlikely(!buf)) {
rxq->stats.rx_nombuf++;
/* No new buf has been allocated: do nothing */
if (!new_tail || !seg)
goto end;
seg->next = NULL;
tap_rxq_pool_free(mbuf);
goto end;
}
seg = seg ? seg->next : mbuf;
if (rxq->pool == mbuf)
rxq->pool = buf;
if (new_tail)
new_tail->next = buf;
new_tail = buf;
new_tail->next = seg->next;
/* iovecs[0] is reserved for packet info (pi) */
(*rxq->iovecs)[mbuf->nb_segs].iov_len =
buf->buf_len - data_off;
(*rxq->iovecs)[mbuf->nb_segs].iov_base =
(char *)buf->buf_addr + data_off;
seg->data_len = RTE_MIN(seg->buf_len - data_off, len);
seg->data_off = data_off;
len -= seg->data_len;
if (len <= 0)
break;
mbuf->nb_segs++;
/* First segment has headroom, not the others */
data_off = 0;
}
seg->next = NULL;
mbuf->packet_type = rte_net_get_ptype(mbuf, NULL,
RTE_PTYPE_ALL_MASK);
if (rxq->rxmode->offloads & RTE_ETH_RX_OFFLOAD_CHECKSUM)
tap_verify_csum(mbuf);
/* account for the receive frame */
bufs[num_rx++] = mbuf;
num_rx_bytes += mbuf->pkt_len;
}
end:
rxq->stats.ipackets += num_rx;
rxq->stats.ibytes += num_rx_bytes;
if (trigger && num_rx < nb_pkts)
rxq->trigger_seen = trigger;
return num_rx;
}
/* Finalize l4 checksum calculation */
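/*
 * Fold the 32-bit raw sum into 16 bits, add the pseudo header checksum,
 * fold once more and take the one's complement. A result of 0 is stored
 * as 0xffff (the equivalent value in one's complement arithmetic), since
 * a literal zero UDP checksum means "no checksum computed" (RFC 768).
 */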
static void
tap_tx_l4_cksum(uint16_t *l4_cksum, uint16_t l4_phdr_cksum,
uint32_t l4_raw_cksum)
{
if (l4_cksum) {
uint32_t cksum;
cksum = __rte_raw_cksum_reduce(l4_raw_cksum);
cksum += l4_phdr_cksum;
cksum = ((cksum & 0xffff0000) >> 16) + (cksum & 0xffff);
cksum = (~cksum) & 0xffff;
if (cksum == 0)
cksum = 0xffff;
*l4_cksum = cksum;
}
}
/* Accumulate L4 raw checksums */
static void
tap_tx_l4_add_rcksum(char *l4_data, unsigned int l4_len, uint16_t *l4_cksum,
uint32_t *l4_raw_cksum)
{
if (l4_cksum == NULL)
return;
*l4_raw_cksum = __rte_raw_cksum(l4_data, l4_len, *l4_raw_cksum);
}
/* L3 and L4 pseudo headers checksum offloads */
static void
tap_tx_l3_cksum(char *packet, uint64_t ol_flags, unsigned int l2_len,
unsigned int l3_len, unsigned int l4_len, uint16_t **l4_cksum,
uint16_t *l4_phdr_cksum, uint32_t *l4_raw_cksum)
{
void *l3_hdr = packet + l2_len;
if (ol_flags & (RTE_MBUF_F_TX_IP_CKSUM | RTE_MBUF_F_TX_IPV4)) {
struct rte_ipv4_hdr *iph = l3_hdr;
uint16_t cksum;
iph->hdr_checksum = 0;
cksum = rte_raw_cksum(iph, l3_len);
iph->hdr_checksum = (cksum == 0xffff) ? cksum : ~cksum;
}
if (ol_flags & RTE_MBUF_F_TX_L4_MASK) {
void *l4_hdr;
l4_hdr = packet + l2_len + l3_len;
if ((ol_flags & RTE_MBUF_F_TX_L4_MASK) == RTE_MBUF_F_TX_UDP_CKSUM)
*l4_cksum = &((struct rte_udp_hdr *)l4_hdr)->dgram_cksum;
else if ((ol_flags & RTE_MBUF_F_TX_L4_MASK) == RTE_MBUF_F_TX_TCP_CKSUM)
*l4_cksum = &((struct rte_tcp_hdr *)l4_hdr)->cksum;
else
return;
**l4_cksum = 0;
if (ol_flags & RTE_MBUF_F_TX_IPV4)
*l4_phdr_cksum = rte_ipv4_phdr_cksum(l3_hdr, 0);
else
*l4_phdr_cksum = rte_ipv6_phdr_cksum(l3_hdr, 0);
*l4_raw_cksum = __rte_raw_cksum(l4_hdr, l4_len, 0);
}
}
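/*
 * Build one iovec array per mbuf: iovecs[0] carries struct tun_pi, an
 * optional copy of the l2/l3/l4 headers with recomputed checksums comes
 * next, followed by the remaining data segments. The whole frame is then
 * handed to the kernel in a single writev() call.
 */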
static inline int
tap_write_mbufs(struct tx_queue *txq, uint16_t num_mbufs,
struct rte_mbuf **pmbufs,
uint16_t *num_packets, unsigned long *num_tx_bytes)
{
int i;
uint16_t l234_hlen;
struct pmd_process_private *process_private;
process_private = rte_eth_devices[txq->out_port].process_private;
for (i = 0; i < num_mbufs; i++) {
struct rte_mbuf *mbuf = pmbufs[i];
struct iovec iovecs[mbuf->nb_segs + 2];
struct tun_pi pi = { .flags = 0, .proto = 0x00 };
struct rte_mbuf *seg = mbuf;
char m_copy[mbuf->data_len];
int proto;
int n;
int j;
int k; /* current index in iovecs for copying segments */
uint16_t seg_len; /* length of first segment */
uint16_t nb_segs;
uint16_t *l4_cksum; /* l4 checksum (pseudo header + payload) */
uint32_t l4_raw_cksum = 0; /* TCP/UDP payload raw checksum */
uint16_t l4_phdr_cksum = 0; /* TCP/UDP pseudo header checksum */
uint16_t is_cksum = 0; /* in case cksum should be offloaded */
l4_cksum = NULL;
if (txq->type == ETH_TUNTAP_TYPE_TUN) {
/*
* TUN and TAP are created with IFF_NO_PI disabled, i.e. the
* packet information header is present. For the TUN PMD this
* is mandatory, as the kernel's tun.c uses these fields to
* decide whether a packet is IP or non-IP.
*
* Fetch the first byte of data from the mbuf and look at the
* version nibble: if it is 4 or 6, set the protocol field to
* the matching EtherType, otherwise leave it at zero.
*/
char *buff_data = rte_pktmbuf_mtod(seg, void *);
proto = (*buff_data & 0xf0);
pi.proto = (proto == 0x40) ?
rte_cpu_to_be_16(RTE_ETHER_TYPE_IPV4) :
((proto == 0x60) ?
rte_cpu_to_be_16(RTE_ETHER_TYPE_IPV6) :
0x00);
}
k = 0;
iovecs[k].iov_base = &pi;
iovecs[k].iov_len = sizeof(pi);
k++;
nb_segs = mbuf->nb_segs;
if (txq->csum &&
((mbuf->ol_flags & (RTE_MBUF_F_TX_IP_CKSUM | RTE_MBUF_F_TX_IPV4) ||
(mbuf->ol_flags & RTE_MBUF_F_TX_L4_MASK) == RTE_MBUF_F_TX_UDP_CKSUM ||
(mbuf->ol_flags & RTE_MBUF_F_TX_L4_MASK) == RTE_MBUF_F_TX_TCP_CKSUM))) {
is_cksum = 1;
/* Support only packets with at least layer 4
* header included in the first segment
*/
seg_len = rte_pktmbuf_data_len(mbuf);
l234_hlen = mbuf->l2_len + mbuf->l3_len + mbuf->l4_len;
if (seg_len < l234_hlen)
return -1;
/* To change checksums, work on a copy of the l2 and l3
* headers plus the l4 pseudo header
*/
rte_memcpy(m_copy, rte_pktmbuf_mtod(mbuf, void *),
l234_hlen);
tap_tx_l3_cksum(m_copy, mbuf->ol_flags,
mbuf->l2_len, mbuf->l3_len, mbuf->l4_len,
&l4_cksum, &l4_phdr_cksum,
&l4_raw_cksum);
iovecs[k].iov_base = m_copy;
iovecs[k].iov_len = l234_hlen;
k++;
/* Update next iovecs[] beyond l2, l3, l4 headers */
if (seg_len > l234_hlen) {
iovecs[k].iov_len = seg_len - l234_hlen;
iovecs[k].iov_base =
rte_pktmbuf_mtod(seg, char *) +
l234_hlen;
tap_tx_l4_add_rcksum(iovecs[k].iov_base,
iovecs[k].iov_len, l4_cksum,
&l4_raw_cksum);
k++;
nb_segs++;
}
seg = seg->next;
}
for (j = k; j <= nb_segs; j++) {
iovecs[j].iov_len = rte_pktmbuf_data_len(seg);
iovecs[j].iov_base = rte_pktmbuf_mtod(seg, void *);
if (is_cksum)
tap_tx_l4_add_rcksum(iovecs[j].iov_base,
iovecs[j].iov_len, l4_cksum,
&l4_raw_cksum);
seg = seg->next;
}
if (is_cksum)
tap_tx_l4_cksum(l4_cksum, l4_phdr_cksum, l4_raw_cksum);
/* copy the tx frame data */
n = writev(process_private->txq_fds[txq->queue_id], iovecs, j);
if (n <= 0)
return -1;
(*num_packets)++;
(*num_tx_bytes) += rte_pktmbuf_pkt_len(mbuf);
}
return 0;
}
/* Callback to handle sending packets from the tap interface
*/
static uint16_t
pmd_tx_burst(void *queue, struct rte_mbuf **bufs, uint16_t nb_pkts)
{
struct tx_queue *txq = queue;
uint16_t num_tx = 0;
uint16_t num_packets = 0;
unsigned long num_tx_bytes = 0;
uint32_t max_size;
int i;
if (unlikely(nb_pkts == 0))
return 0;
struct rte_mbuf *gso_mbufs[MAX_GSO_MBUFS];
max_size = *txq->mtu + (RTE_ETHER_HDR_LEN + RTE_ETHER_CRC_LEN + 4);
for (i = 0; i < nb_pkts; i++) {
struct rte_mbuf *mbuf_in = bufs[num_tx];
struct rte_mbuf **mbuf;
uint16_t num_mbufs = 0;
uint16_t tso_segsz = 0;
int ret;
int num_tso_mbufs;
uint16_t hdrs_len;
uint64_t tso;
tso = mbuf_in->ol_flags & RTE_MBUF_F_TX_TCP_SEG;
if (tso) {
struct rte_gso_ctx *gso_ctx = &txq->gso_ctx;
/* TCP segmentation implies TCP checksum offload */
mbuf_in->ol_flags |= RTE_MBUF_F_TX_TCP_CKSUM;
/* gso size is calculated without RTE_ETHER_CRC_LEN */
hdrs_len = mbuf_in->l2_len + mbuf_in->l3_len +
mbuf_in->l4_len;
tso_segsz = mbuf_in->tso_segsz + hdrs_len;
if (unlikely(tso_segsz == hdrs_len) ||
tso_segsz > *txq->mtu) {
txq->stats.errs++;
break;
}
gso_ctx->gso_size = tso_segsz;
/* 'mbuf_in' packet to segment */
num_tso_mbufs = rte_gso_segment(mbuf_in,
gso_ctx, /* gso control block */
(struct rte_mbuf **)&gso_mbufs, /* out mbufs */
RTE_DIM(gso_mbufs)); /* max tso mbufs */
/* num_tso_mbufs is the number of newly created mbufs */
if (num_tso_mbufs < 0)
break;
if (num_tso_mbufs >= 1) {
mbuf = gso_mbufs;
num_mbufs = num_tso_mbufs;
} else {
/* 0 means it can be transmitted directly
* without gso.
*/
mbuf = &mbuf_in;
num_mbufs = 1;
}
} else {
/* oversized frame: stats.errs is incremented after the loop */
if (rte_pktmbuf_pkt_len(mbuf_in) > max_size)
break;
/* ret 0 indicates no new mbufs were created */
num_tso_mbufs = 0;
mbuf = &mbuf_in;
num_mbufs = 1;
}
ret = tap_write_mbufs(txq, num_mbufs, mbuf,
&num_packets, &num_tx_bytes);
if (ret == -1) {
txq->stats.errs++;
/* free tso mbufs */
if (num_tso_mbufs > 0)
rte_pktmbuf_free_bulk(mbuf, num_tso_mbufs);
break;
}
num_tx++;
/* free original mbuf */
rte_pktmbuf_free(mbuf_in);
/* free tso mbufs */
if (num_tso_mbufs > 0)
rte_pktmbuf_free_bulk(mbuf, num_tso_mbufs);
}
txq->stats.opackets += num_packets;
txq->stats.errs += nb_pkts - num_tx;
txq->stats.obytes += num_tx_bytes;
return num_tx;
}
static const char *
tap_ioctl_req2str(unsigned long request)
{
switch (request) {
case SIOCSIFFLAGS:
return "SIOCSIFFLAGS";
case SIOCGIFFLAGS:
return "SIOCGIFFLAGS";
case SIOCGIFHWADDR:
return "SIOCGIFHWADDR";
case SIOCSIFHWADDR:
return "SIOCSIFHWADDR";
case SIOCSIFMTU:
return "SIOCSIFMTU";
}
return "UNKNOWN";
}
static int
tap_ioctl(struct pmd_internals *pmd, unsigned long request,
struct ifreq *ifr, int set, enum ioctl_mode mode)
{
short req_flags = ifr->ifr_flags;
int remote = pmd->remote_if_index &&
(mode == REMOTE_ONLY || mode == LOCAL_AND_REMOTE);
if (!pmd->remote_if_index && mode == REMOTE_ONLY)
return 0;
/*
* If there is a remote netdevice, apply ioctl on it, then apply it on
* the tap netdevice.
*/
apply:
if (remote)
strlcpy(ifr->ifr_name, pmd->remote_iface, IFNAMSIZ);
else if (mode == LOCAL_ONLY || mode == LOCAL_AND_REMOTE)
strlcpy(ifr->ifr_name, pmd->name, IFNAMSIZ);
switch (request) {
case SIOCSIFFLAGS:
/* fetch current flags to leave other flags untouched */
if (ioctl(pmd->ioctl_sock, SIOCGIFFLAGS, ifr) < 0)
goto error;
if (set)
ifr->ifr_flags |= req_flags;
else
ifr->ifr_flags &= ~req_flags;
break;
case SIOCGIFFLAGS:
case SIOCGIFHWADDR:
case SIOCSIFHWADDR:
case SIOCSIFMTU:
break;
default:
TAP_LOG(WARNING, "%s: ioctl() called with wrong arg",
pmd->name);
return -EINVAL;
}
if (ioctl(pmd->ioctl_sock, request, ifr) < 0)
goto error;
if (remote-- && mode == LOCAL_AND_REMOTE)
goto apply;
return 0;
error:
TAP_LOG(DEBUG, "%s(%s) failed: %s(%d)", ifr->ifr_name,
tap_ioctl_req2str(request), strerror(errno), errno);
return -errno;
}
static int
tap_link_set_down(struct rte_eth_dev *dev)
{
struct pmd_internals *pmd = dev->data->dev_private;
struct ifreq ifr = { .ifr_flags = IFF_UP };
dev->data->dev_link.link_status = RTE_ETH_LINK_DOWN;
return tap_ioctl(pmd, SIOCSIFFLAGS, &ifr, 0, LOCAL_ONLY);
}
static int
tap_link_set_up(struct rte_eth_dev *dev)
{
struct pmd_internals *pmd = dev->data->dev_private;
struct ifreq ifr = { .ifr_flags = IFF_UP };
dev->data->dev_link.link_status = RTE_ETH_LINK_UP;
return tap_ioctl(pmd, SIOCSIFFLAGS, &ifr, 1, LOCAL_AND_REMOTE);
}
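/*
 * Push the queue file descriptors to secondary processes over the EAL
 * multi-process channel. The fds travel as SCM_RIGHTS ancillary data on
 * the underlying Unix domain socket, so the kernel translates them into
 * valid descriptors in each receiving process.
 */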
static int
tap_mp_req_on_rxtx(struct rte_eth_dev *dev)
{
struct rte_mp_msg msg;
struct ipc_queues *request_param = (struct ipc_queues *)msg.param;
int err;
int fd_iterator = 0;
struct pmd_process_private *process_private = dev->process_private;
int i;
memset(&msg, 0, sizeof(msg));
strlcpy(msg.name, TAP_MP_REQ_START_RXTX, sizeof(msg.name));
strlcpy(request_param->port_name, dev->data->name, sizeof(request_param->port_name));
msg.len_param = sizeof(*request_param);
for (i = 0; i < dev->data->nb_tx_queues; i++) {
msg.fds[fd_iterator++] = process_private->txq_fds[i];
msg.num_fds++;
request_param->txq_count++;
}
for (i = 0; i < dev->data->nb_rx_queues; i++) {
msg.fds[fd_iterator++] = process_private->rxq_fds[i];
msg.num_fds++;
request_param->rxq_count++;
}
err = rte_mp_sendmsg(&msg);
if (err < 0) {
TAP_LOG(ERR, "Failed to send start req to secondary %d",
rte_errno);
return -1;
}
return 0;
}
static int
tap_dev_start(struct rte_eth_dev *dev)
{
int err, i;
if (rte_eal_process_type() == RTE_PROC_PRIMARY)
tap_mp_req_on_rxtx(dev);
err = tap_intr_handle_set(dev, 1);
if (err)
return err;
err = tap_link_set_up(dev);
if (err)
return err;
for (i = 0; i < dev->data->nb_tx_queues; i++)
dev->data->tx_queue_state[i] = RTE_ETH_QUEUE_STATE_STARTED;
for (i = 0; i < dev->data->nb_rx_queues; i++)
dev->data->rx_queue_state[i] = RTE_ETH_QUEUE_STATE_STARTED;
return err;
}
static int
tap_mp_req_start_rxtx(const struct rte_mp_msg *request, __rte_unused const void *peer)
{
struct rte_eth_dev *dev;
const struct ipc_queues *request_param =
(const struct ipc_queues *)request->param;
int fd_iterator;
int queue;
struct pmd_process_private *process_private;
dev = rte_eth_dev_get_by_name(request_param->port_name);
if (!dev) {
TAP_LOG(ERR, "Failed to get dev for %s",
request_param->port_name);
return -1;
}
process_private = dev->process_private;
fd_iterator = 0;
TAP_LOG(DEBUG, "tap_attach rx_q:%d tx_q:%d\n", request_param->rxq_count,
request_param->txq_count);
for (queue = 0; queue < request_param->txq_count; queue++)
process_private->txq_fds[queue] = request->fds[fd_iterator++];
for (queue = 0; queue < request_param->rxq_count; queue++)
process_private->rxq_fds[queue] = request->fds[fd_iterator++];
return 0;
}
/* This function gets called when the current port gets stopped.
*/
static int
tap_dev_stop(struct rte_eth_dev *dev)
{
struct pmd_internals *pmd = dev->data->dev_private;
int i;
for (i = 0; i < dev->data->nb_tx_queues; i++)
dev->data->tx_queue_state[i] = RTE_ETH_QUEUE_STATE_STOPPED;
for (i = 0; i < dev->data->nb_rx_queues; i++)
dev->data->rx_queue_state[i] = RTE_ETH_QUEUE_STATE_STOPPED;
tap_intr_handle_set(dev, 0);
if (!pmd->persist)
tap_link_set_down(dev);
return 0;
}
static int
tap_dev_configure(struct rte_eth_dev *dev)
{
struct pmd_internals *pmd = dev->data->dev_private;
if (dev->data->nb_rx_queues > RTE_PMD_TAP_MAX_QUEUES) {
TAP_LOG(ERR,
"%s: number of rx queues %d exceeds max num of queues %d",
dev->device->name,
dev->data->nb_rx_queues,
RTE_PMD_TAP_MAX_QUEUES);
return -1;
}
if (dev->data->nb_tx_queues > RTE_PMD_TAP_MAX_QUEUES) {
TAP_LOG(ERR,
"%s: number of tx queues %d exceeds max num of queues %d",
dev->device->name,
dev->data->nb_tx_queues,
RTE_PMD_TAP_MAX_QUEUES);
return -1;
}
if (dev->data->nb_rx_queues != dev->data->nb_tx_queues) {
TAP_LOG(ERR,
"%s: number of rx queues %d must be equal to number of tx queues %d",
dev->device->name,
dev->data->nb_rx_queues,
dev->data->nb_tx_queues);
return -1;
}
TAP_LOG(INFO, "%s: %s: TX configured queues number: %u",
dev->device->name, pmd->name, dev->data->nb_tx_queues);
TAP_LOG(INFO, "%s: %s: RX configured queues number: %u",
dev->device->name, pmd->name, dev->data->nb_rx_queues);
return 0;
}
static uint32_t
tap_dev_speed_capa(void)
{
uint32_t speed = pmd_link.link_speed;
uint32_t capa = 0;
if (speed >= RTE_ETH_SPEED_NUM_10M)
capa |= RTE_ETH_LINK_SPEED_10M;
if (speed >= RTE_ETH_SPEED_NUM_100M)
capa |= RTE_ETH_LINK_SPEED_100M;
if (speed >= RTE_ETH_SPEED_NUM_1G)
capa |= RTE_ETH_LINK_SPEED_1G;
if (speed >= RTE_ETH_SPEED_NUM_2_5G)
capa |= RTE_ETH_LINK_SPEED_2_5G;
if (speed >= RTE_ETH_SPEED_NUM_5G)
capa |= RTE_ETH_LINK_SPEED_5G;
if (speed >= RTE_ETH_SPEED_NUM_10G)
capa |= RTE_ETH_LINK_SPEED_10G;
if (speed >= RTE_ETH_SPEED_NUM_20G)
capa |= RTE_ETH_LINK_SPEED_20G;
if (speed >= RTE_ETH_SPEED_NUM_25G)
capa |= RTE_ETH_LINK_SPEED_25G;
if (speed >= RTE_ETH_SPEED_NUM_40G)
capa |= RTE_ETH_LINK_SPEED_40G;
if (speed >= RTE_ETH_SPEED_NUM_50G)
capa |= RTE_ETH_LINK_SPEED_50G;
if (speed >= RTE_ETH_SPEED_NUM_56G)
capa |= RTE_ETH_LINK_SPEED_56G;
if (speed >= RTE_ETH_SPEED_NUM_100G)
capa |= RTE_ETH_LINK_SPEED_100G;
return capa;
}
static int
tap_dev_info(struct rte_eth_dev *dev, struct rte_eth_dev_info *dev_info)
{
struct pmd_internals *internals = dev->data->dev_private;
dev_info->if_index = internals->if_index;
dev_info->max_mac_addrs = 1;
dev_info->max_rx_pktlen = (uint32_t)RTE_ETHER_MAX_VLAN_FRAME_LEN;
dev_info->max_rx_queues = RTE_PMD_TAP_MAX_QUEUES;
dev_info->max_tx_queues = RTE_PMD_TAP_MAX_QUEUES;
dev_info->min_rx_bufsize = 0;
dev_info->speed_capa = tap_dev_speed_capa();
dev_info->rx_queue_offload_capa = TAP_RX_OFFLOAD;
dev_info->rx_offload_capa = dev_info->rx_queue_offload_capa;
dev_info->tx_queue_offload_capa = TAP_TX_OFFLOAD;
dev_info->tx_offload_capa = dev_info->tx_queue_offload_capa;
dev_info->hash_key_size = TAP_RSS_HASH_KEY_SIZE;
/*
* limitation: TAP supports all of IP, UDP and TCP hash
* functions together and not in partial combinations
*/
dev_info->flow_type_rss_offloads = ~TAP_RSS_HF_MASK;
dev_info->dev_capa &= ~RTE_ETH_DEV_CAPA_FLOW_RULE_KEEP;
return 0;
}
static int
tap_stats_get(struct rte_eth_dev *dev, struct rte_eth_stats *tap_stats)
{
unsigned int i, imax;
unsigned long rx_total = 0, tx_total = 0, tx_err_total = 0;
unsigned long rx_bytes_total = 0, tx_bytes_total = 0;
unsigned long rx_nombuf = 0, ierrors = 0;
const struct pmd_internals *pmd = dev->data->dev_private;
/* rx queue statistics */
imax = (dev->data->nb_rx_queues < RTE_ETHDEV_QUEUE_STAT_CNTRS) ?
dev->data->nb_rx_queues : RTE_ETHDEV_QUEUE_STAT_CNTRS;
for (i = 0; i < imax; i++) {
tap_stats->q_ipackets[i] = pmd->rxq[i].stats.ipackets;
tap_stats->q_ibytes[i] = pmd->rxq[i].stats.ibytes;
rx_total += tap_stats->q_ipackets[i];
rx_bytes_total += tap_stats->q_ibytes[i];
rx_nombuf += pmd->rxq[i].stats.rx_nombuf;
ierrors += pmd->rxq[i].stats.ierrors;
}
/* tx queue statistics */
imax = (dev->data->nb_tx_queues < RTE_ETHDEV_QUEUE_STAT_CNTRS) ?
dev->data->nb_tx_queues : RTE_ETHDEV_QUEUE_STAT_CNTRS;
for (i = 0; i < imax; i++) {
tap_stats->q_opackets[i] = pmd->txq[i].stats.opackets;
tap_stats->q_obytes[i] = pmd->txq[i].stats.obytes;
tx_total += tap_stats->q_opackets[i];
tx_err_total += pmd->txq[i].stats.errs;
tx_bytes_total += tap_stats->q_obytes[i];
}
tap_stats->ipackets = rx_total;
tap_stats->ibytes = rx_bytes_total;
tap_stats->ierrors = ierrors;
tap_stats->rx_nombuf = rx_nombuf;
tap_stats->opackets = tx_total;
tap_stats->oerrors = tx_err_total;
tap_stats->obytes = tx_bytes_total;
return 0;
}
static int
tap_stats_reset(struct rte_eth_dev *dev)
{
int i;
struct pmd_internals *pmd = dev->data->dev_private;
for (i = 0; i < RTE_PMD_TAP_MAX_QUEUES; i++) {
pmd->rxq[i].stats.ipackets = 0;
pmd->rxq[i].stats.ibytes = 0;
pmd->rxq[i].stats.ierrors = 0;
pmd->rxq[i].stats.rx_nombuf = 0;
pmd->txq[i].stats.opackets = 0;
pmd->txq[i].stats.errs = 0;
pmd->txq[i].stats.obytes = 0;
}
return 0;
}
static int
tap_dev_close(struct rte_eth_dev *dev)
{
int i;
struct pmd_internals *internals = dev->data->dev_private;
struct pmd_process_private *process_private = dev->process_private;
struct rx_queue *rxq;
if (rte_eal_process_type() != RTE_PROC_PRIMARY) {
rte_free(dev->process_private);
if (tap_devices_count == 1)
rte_mp_action_unregister(TAP_MP_REQ_START_RXTX);
tap_devices_count--;
return 0;
}
if (!internals->persist)
tap_link_set_down(dev);
if (internals->nlsk_fd != -1) {
tap_flow_flush(dev, NULL);
tap_flow_implicit_flush(internals, NULL);
tap_nl_final(internals->nlsk_fd);
internals->nlsk_fd = -1;
}
for (i = 0; i < RTE_PMD_TAP_MAX_QUEUES; i++) {
if (process_private->rxq_fds[i] != -1) {
rxq = &internals->rxq[i];
close(process_private->rxq_fds[i]);
process_private->rxq_fds[i] = -1;
tap_rxq_pool_free(rxq->pool);
rte_free(rxq->iovecs);
rxq->pool = NULL;
rxq->iovecs = NULL;
}
if (process_private->txq_fds[i] != -1) {
close(process_private->txq_fds[i]);
process_private->txq_fds[i] = -1;
}
}
if (internals->remote_if_index) {
/* Restore initial remote state */
int ret = ioctl(internals->ioctl_sock, SIOCSIFFLAGS,
&internals->remote_initial_flags);
if (ret)
TAP_LOG(ERR, "restore remote state failed: %d", ret);
}
rte_mempool_free(internals->gso_ctx_mp);
internals->gso_ctx_mp = NULL;
if (internals->ka_fd != -1) {
close(internals->ka_fd);
internals->ka_fd = -1;
}
/* mac_addrs must not be freed alone because it is part of dev_private */
dev->data->mac_addrs = NULL;
internals = dev->data->dev_private;
TAP_LOG(DEBUG, "Closing %s Ethernet device on numa %u",
tuntap_types[internals->type], rte_socket_id());
rte_intr_instance_free(internals->intr_handle);
if (internals->ioctl_sock != -1) {
close(internals->ioctl_sock);
internals->ioctl_sock = -1;
}
rte_free(dev->process_private);
if (tap_devices_count == 1)
rte_mp_action_unregister(TAP_MP_KEY);
tap_devices_count--;
/*
* Since the TUN/TAP device no longer has any open file descriptors,
* the kernel removes it (unless it was made persistent)
*/
return 0;
}
static void
tap_rx_queue_release(struct rte_eth_dev *dev, uint16_t qid)
{
struct rx_queue *rxq = dev->data->rx_queues[qid];
struct pmd_process_private *process_private;
if (!rxq)
return;
process_private = rte_eth_devices[rxq->in_port].process_private;
if (process_private->rxq_fds[rxq->queue_id] != -1) {
close(process_private->rxq_fds[rxq->queue_id]);
process_private->rxq_fds[rxq->queue_id] = -1;
tap_rxq_pool_free(rxq->pool);
rte_free(rxq->iovecs);
rxq->pool = NULL;
rxq->iovecs = NULL;
}
}
static void
tap_tx_queue_release(struct rte_eth_dev *dev, uint16_t qid)
{
struct tx_queue *txq = dev->data->tx_queues[qid];
struct pmd_process_private *process_private;
if (!txq)
return;
process_private = rte_eth_devices[txq->out_port].process_private;
if (process_private->txq_fds[txq->queue_id] != -1) {
close(process_private->txq_fds[txq->queue_id]);
process_private->txq_fds[txq->queue_id] = -1;
}
}
static int
tap_link_update(struct rte_eth_dev *dev, int wait_to_complete __rte_unused)
{
struct rte_eth_link *dev_link = &dev->data->dev_link;
struct pmd_internals *pmd = dev->data->dev_private;
struct ifreq ifr = { .ifr_flags = 0 };
if (pmd->remote_if_index) {
tap_ioctl(pmd, SIOCGIFFLAGS, &ifr, 0, REMOTE_ONLY);
if (!(ifr.ifr_flags & IFF_UP) ||
!(ifr.ifr_flags & IFF_RUNNING)) {
dev_link->link_status = RTE_ETH_LINK_DOWN;
return 0;
}
}
tap_ioctl(pmd, SIOCGIFFLAGS, &ifr, 0, LOCAL_ONLY);
dev_link->link_status =
((ifr.ifr_flags & IFF_UP) && (ifr.ifr_flags & IFF_RUNNING) ?
RTE_ETH_LINK_UP :
RTE_ETH_LINK_DOWN);
return 0;
}
static int
tap_promisc_enable(struct rte_eth_dev *dev)
{
struct pmd_internals *pmd = dev->data->dev_private;
struct ifreq ifr = { .ifr_flags = IFF_PROMISC };
int ret;
ret = tap_ioctl(pmd, SIOCSIFFLAGS, &ifr, 1, LOCAL_AND_REMOTE);
if (ret != 0)
return ret;
if (pmd->remote_if_index && !pmd->flow_isolate) {
dev->data->promiscuous = 1;
ret = tap_flow_implicit_create(pmd, TAP_REMOTE_PROMISC);
if (ret != 0) {
/* Rollback promisc flag */
tap_ioctl(pmd, SIOCSIFFLAGS, &ifr, 0, LOCAL_AND_REMOTE);
/*
* rte_eth_dev_promiscuous_enable() rolls back
* dev->data->promiscuous in case of failure.
*/
return ret;
}
}
return 0;
}
static int
tap_promisc_disable(struct rte_eth_dev *dev)
{
struct pmd_internals *pmd = dev->data->dev_private;
struct ifreq ifr = { .ifr_flags = IFF_PROMISC };
int ret;
ret = tap_ioctl(pmd, SIOCSIFFLAGS, &ifr, 0, LOCAL_AND_REMOTE);
if (ret != 0)
return ret;
if (pmd->remote_if_index && !pmd->flow_isolate) {
dev->data->promiscuous = 0;
ret = tap_flow_implicit_destroy(pmd, TAP_REMOTE_PROMISC);
if (ret != 0) {
/* Rollback promisc flag */
tap_ioctl(pmd, SIOCSIFFLAGS, &ifr, 1, LOCAL_AND_REMOTE);
/*
* rte_eth_dev_promiscuous_disable() rolls back
* dev->data->promiscuous in case of failure.
*/
return ret;
}
}
return 0;
}
static int
tap_allmulti_enable(struct rte_eth_dev *dev)
{
struct pmd_internals *pmd = dev->data->dev_private;
struct ifreq ifr = { .ifr_flags = IFF_ALLMULTI };
int ret;
ret = tap_ioctl(pmd, SIOCSIFFLAGS, &ifr, 1, LOCAL_AND_REMOTE);
if (ret != 0)
return ret;
if (pmd->remote_if_index && !pmd->flow_isolate) {
dev->data->all_multicast = 1;
ret = tap_flow_implicit_create(pmd, TAP_REMOTE_ALLMULTI);
if (ret != 0) {
/* Rollback allmulti flag */
tap_ioctl(pmd, SIOCSIFFLAGS, &ifr, 0, LOCAL_AND_REMOTE);
/*
* rte_eth_dev_allmulticast_enable() rolls back
* dev->data->all_multicast in case of failure.
*/
return ret;
}
}
return 0;
}
static int
tap_allmulti_disable(struct rte_eth_dev *dev)
{
struct pmd_internals *pmd = dev->data->dev_private;
struct ifreq ifr = { .ifr_flags = IFF_ALLMULTI };
int ret;
ret = tap_ioctl(pmd, SIOCSIFFLAGS, &ifr, 0, LOCAL_AND_REMOTE);
if (ret != 0)
return ret;
if (pmd->remote_if_index && !pmd->flow_isolate) {
dev->data->all_multicast = 0;
ret = tap_flow_implicit_destroy(pmd, TAP_REMOTE_ALLMULTI);
if (ret != 0) {
/* Rollback allmulti flag */
tap_ioctl(pmd, SIOCSIFFLAGS, &ifr, 1, LOCAL_AND_REMOTE);
/*
* rte_eth_dev_allmulticast_disable() rolls back
* dev->data->all_multicast in case of failure.
*/
return ret;
}
}
return 0;
}
static int
tap_mac_set(struct rte_eth_dev *dev, struct rte_ether_addr *mac_addr)
{
struct pmd_internals *pmd = dev->data->dev_private;
enum ioctl_mode mode = LOCAL_ONLY;
struct ifreq ifr;
int ret;
if (pmd->type == ETH_TUNTAP_TYPE_TUN) {
TAP_LOG(ERR, "%s: can't MAC address for TUN",
dev->device->name);
return -ENOTSUP;
}
if (rte_is_zero_ether_addr(mac_addr)) {
TAP_LOG(ERR, "%s: can't set an empty MAC address",
dev->device->name);
return -EINVAL;
}
/* Check the actual current MAC address on the tap netdevice */
ret = tap_ioctl(pmd, SIOCGIFHWADDR, &ifr, 0, LOCAL_ONLY);
if (ret < 0)
return ret;
if (rte_is_same_ether_addr(
(struct rte_ether_addr *)&ifr.ifr_hwaddr.sa_data,
mac_addr))
return 0;
/* Check the current MAC address on the remote */
ret = tap_ioctl(pmd, SIOCGIFHWADDR, &ifr, 0, REMOTE_ONLY);
if (ret < 0)
return ret;
if (!rte_is_same_ether_addr(
(struct rte_ether_addr *)&ifr.ifr_hwaddr.sa_data,
mac_addr))
mode = LOCAL_AND_REMOTE;
ifr.ifr_hwaddr.sa_family = AF_LOCAL;
rte_memcpy(ifr.ifr_hwaddr.sa_data, mac_addr, RTE_ETHER_ADDR_LEN);
ret = tap_ioctl(pmd, SIOCSIFHWADDR, &ifr, 1, mode);
if (ret < 0)
return ret;
rte_memcpy(&pmd->eth_addr, mac_addr, RTE_ETHER_ADDR_LEN);
if (pmd->remote_if_index && !pmd->flow_isolate) {
/* Replace MAC redirection rule after a MAC change */
ret = tap_flow_implicit_destroy(pmd, TAP_REMOTE_LOCAL_MAC);
if (ret < 0) {
TAP_LOG(ERR,
"%s: Couldn't delete MAC redirection rule",
dev->device->name);
return ret;
}
ret = tap_flow_implicit_create(pmd, TAP_REMOTE_LOCAL_MAC);
if (ret < 0) {
TAP_LOG(ERR,
"%s: Couldn't add MAC redirection rule",
dev->device->name);
return ret;
}
}
return 0;
}
static int
tap_gso_ctx_setup(struct rte_gso_ctx *gso_ctx, struct rte_eth_dev *dev)
{
uint32_t gso_types;
char pool_name[64];
struct pmd_internals *pmd = dev->data->dev_private;
int ret;
/* initialize GSO context */
gso_types = RTE_ETH_TX_OFFLOAD_TCP_TSO;
if (!pmd->gso_ctx_mp) {
/*
* Create a private mbuf pool with TAP_GSO_MBUF_SEG_SIZE
* bytes of data room per mbuf; use this pool for both
* direct and indirect mbufs
*/
ret = snprintf(pool_name, sizeof(pool_name), "mp_%s",
dev->device->name);
if (ret < 0 || ret >= (int)sizeof(pool_name)) {
TAP_LOG(ERR,
"%s: failed to create mbuf pool name for device %s,"
"device name too long or output error, ret: %d\n",
pmd->name, dev->device->name, ret);
return -ENAMETOOLONG;
}
pmd->gso_ctx_mp = rte_pktmbuf_pool_create(pool_name,
TAP_GSO_MBUFS_NUM, TAP_GSO_MBUF_CACHE_SIZE, 0,
RTE_PKTMBUF_HEADROOM + TAP_GSO_MBUF_SEG_SIZE,
SOCKET_ID_ANY);
if (!pmd->gso_ctx_mp) {
TAP_LOG(ERR,
"%s: failed to create mbuf pool for device %s\n",
pmd->name, dev->device->name);
return -1;
}
}
gso_ctx->direct_pool = pmd->gso_ctx_mp;
gso_ctx->indirect_pool = pmd->gso_ctx_mp;
gso_ctx->gso_types = gso_types;
gso_ctx->gso_size = 0; /* gso_size is set in tx_burst() per packet */
gso_ctx->flag = 0;
return 0;
}
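/*
 * The Rx and Tx halves of a queue pair share a single TUN/TAP queue file
 * descriptor: whichever side is set up first creates the fd through
 * tun_alloc(), and the other side simply dup()s it.
 */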
static int
tap_setup_queue(struct rte_eth_dev *dev,
struct pmd_internals *internals,
uint16_t qid,
int is_rx)
{
int ret;
int *fd;
int *other_fd;
const char *dir;
struct pmd_internals *pmd = dev->data->dev_private;
struct pmd_process_private *process_private = dev->process_private;
struct rx_queue *rx = &internals->rxq[qid];
struct tx_queue *tx = &internals->txq[qid];
struct rte_gso_ctx *gso_ctx;
if (is_rx) {
fd = &process_private->rxq_fds[qid];
other_fd = &process_private->txq_fds[qid];
dir = "rx";
gso_ctx = NULL;
} else {
fd = &process_private->txq_fds[qid];
other_fd = &process_private->rxq_fds[qid];
dir = "tx";
gso_ctx = &tx->gso_ctx;
}
if (*fd != -1) {
/* fd for this queue already exists */
TAP_LOG(DEBUG, "%s: fd %d for %s queue qid %d exists",
pmd->name, *fd, dir, qid);
gso_ctx = NULL;
} else if (*other_fd != -1) {
/* Only other_fd exists. dup it */
*fd = dup(*other_fd);
if (*fd < 0) {
*fd = -1;
TAP_LOG(ERR, "%s: dup() failed.", pmd->name);
return -1;
}
TAP_LOG(DEBUG, "%s: dup fd %d for %s queue qid %d (%d)",
pmd->name, *other_fd, dir, qid, *fd);
} else {
/* Both RX and TX fds do not exist (equal -1). Create fd */
*fd = tun_alloc(pmd, 0, 0);
if (*fd < 0) {
*fd = -1; /* restore original value */
TAP_LOG(ERR, "%s: tun_alloc() failed.", pmd->name);
return -1;
}
TAP_LOG(DEBUG, "%s: add %s queue for qid %d fd %d",
pmd->name, dir, qid, *fd);
}
tx->mtu = &dev->data->mtu;
rx->rxmode = &dev->data->dev_conf.rxmode;
if (gso_ctx) {
ret = tap_gso_ctx_setup(gso_ctx, dev);
if (ret)
return -1;
}
tx->type = pmd->type;
return *fd;
}
static int
tap_rx_queue_setup(struct rte_eth_dev *dev,
uint16_t rx_queue_id,
uint16_t nb_rx_desc,
unsigned int socket_id,
const struct rte_eth_rxconf *rx_conf __rte_unused,
struct rte_mempool *mp)
{
struct pmd_internals *internals = dev->data->dev_private;
struct pmd_process_private *process_private = dev->process_private;
struct rx_queue *rxq = &internals->rxq[rx_queue_id];
struct rte_mbuf **tmp = &rxq->pool;
long iov_max = sysconf(_SC_IOV_MAX);
if (iov_max <= 0) {
TAP_LOG(WARNING,
"_SC_IOV_MAX is not defined. Using %d as default",
TAP_IOV_DEFAULT_MAX);
iov_max = TAP_IOV_DEFAULT_MAX;
}
uint16_t nb_desc = RTE_MIN(nb_rx_desc, iov_max - 1);
struct iovec (*iovecs)[nb_desc + 1];
int data_off = RTE_PKTMBUF_HEADROOM;
int ret = 0;
int fd;
int i;
if (rx_queue_id >= dev->data->nb_rx_queues || !mp) {
TAP_LOG(WARNING,
"nb_rx_queues %d too small or mempool NULL",
dev->data->nb_rx_queues);
return -1;
}
rxq->mp = mp;
rxq->trigger_seen = 1; /* force initial burst */
rxq->in_port = dev->data->port_id;
rxq->queue_id = rx_queue_id;
rxq->nb_rx_desc = nb_desc;
iovecs = rte_zmalloc_socket(dev->device->name, sizeof(*iovecs), 0,
socket_id);
if (!iovecs) {
TAP_LOG(WARNING,
"%s: Couldn't allocate %d RX descriptors",
dev->device->name, nb_desc);
return -ENOMEM;
}
rxq->iovecs = iovecs;
dev->data->rx_queues[rx_queue_id] = rxq;
fd = tap_setup_queue(dev, internals, rx_queue_id, 1);
if (fd == -1) {
ret = fd;
goto error;
}
(*rxq->iovecs)[0].iov_len = sizeof(struct tun_pi);
(*rxq->iovecs)[0].iov_base = &rxq->pi;
for (i = 1; i <= nb_desc; i++) {
*tmp = rte_pktmbuf_alloc(rxq->mp);
if (!*tmp) {
TAP_LOG(WARNING,
"%s: couldn't allocate memory for queue %d",
dev->device->name, rx_queue_id);
ret = -ENOMEM;
goto error;
}
(*rxq->iovecs)[i].iov_len = (*tmp)->buf_len - data_off;
(*rxq->iovecs)[i].iov_base =
(char *)(*tmp)->buf_addr + data_off;
data_off = 0;
tmp = &(*tmp)->next;
}
TAP_LOG(DEBUG, " RX TUNTAP device name %s, qid %d on fd %d",
internals->name, rx_queue_id,
process_private->rxq_fds[rx_queue_id]);
return 0;
error:
tap_rxq_pool_free(rxq->pool);
rxq->pool = NULL;
rte_free(rxq->iovecs);
rxq->iovecs = NULL;
return ret;
}
static int
tap_tx_queue_setup(struct rte_eth_dev *dev,
uint16_t tx_queue_id,
uint16_t nb_tx_desc __rte_unused,
unsigned int socket_id __rte_unused,
const struct rte_eth_txconf *tx_conf)
{
struct pmd_internals *internals = dev->data->dev_private;
struct pmd_process_private *process_private = dev->process_private;
struct tx_queue *txq;
int ret;
uint64_t offloads;
if (tx_queue_id >= dev->data->nb_tx_queues)
return -1;
dev->data->tx_queues[tx_queue_id] = &internals->txq[tx_queue_id];
txq = dev->data->tx_queues[tx_queue_id];
txq->out_port = dev->data->port_id;
txq->queue_id = tx_queue_id;
offloads = tx_conf->offloads | dev->data->dev_conf.txmode.offloads;
txq->csum = !!(offloads &
(RTE_ETH_TX_OFFLOAD_IPV4_CKSUM |
RTE_ETH_TX_OFFLOAD_UDP_CKSUM |
RTE_ETH_TX_OFFLOAD_TCP_CKSUM));
ret = tap_setup_queue(dev, internals, tx_queue_id, 0);
if (ret == -1)
return -1;
TAP_LOG(DEBUG,
" TX TUNTAP device name %s, qid %d on fd %d csum %s",
internals->name, tx_queue_id,
process_private->txq_fds[tx_queue_id],
txq->csum ? "on" : "off");
return 0;
}
static int
tap_mtu_set(struct rte_eth_dev *dev, uint16_t mtu)
{
struct pmd_internals *pmd = dev->data->dev_private;
struct ifreq ifr = { .ifr_mtu = mtu };
return tap_ioctl(pmd, SIOCSIFMTU, &ifr, 1, LOCAL_AND_REMOTE);
}
static int
tap_set_mc_addr_list(struct rte_eth_dev *dev __rte_unused,
struct rte_ether_addr *mc_addr_set __rte_unused,
uint32_t nb_mc_addr __rte_unused)
{
/*
* Nothing to do actually: the tap has no filtering whatsoever, every
* packet is received.
*/
return 0;
}
static int
tap_nl_msg_handler(struct nlmsghdr *nh, void *arg)
{
struct rte_eth_dev *dev = arg;
struct pmd_internals *pmd = dev->data->dev_private;
struct ifinfomsg *info = NLMSG_DATA(nh);
if (nh->nlmsg_type != RTM_NEWLINK ||
(info->ifi_index != pmd->if_index &&
info->ifi_index != pmd->remote_if_index))
return 0;
return tap_link_update(dev, 0);
}
static void
tap_dev_intr_handler(void *cb_arg)
{
struct rte_eth_dev *dev = cb_arg;
struct pmd_internals *pmd = dev->data->dev_private;
if (rte_intr_fd_get(pmd->intr_handle) >= 0)
tap_nl_recv(rte_intr_fd_get(pmd->intr_handle),
tap_nl_msg_handler, dev);
}
static int
tap_lsc_intr_handle_set(struct rte_eth_dev *dev, int set)
{
struct pmd_internals *pmd = dev->data->dev_private;
int ret;
/* In any case, disable interrupt if the conf is no longer there. */
if (!dev->data->dev_conf.intr_conf.lsc) {
if (rte_intr_fd_get(pmd->intr_handle) != -1)
goto clean;
return 0;
}
if (set) {
rte_intr_fd_set(pmd->intr_handle, tap_nl_init(RTMGRP_LINK));
if (unlikely(rte_intr_fd_get(pmd->intr_handle) == -1))
return -EBADF;
return rte_intr_callback_register(
pmd->intr_handle, tap_dev_intr_handler, dev);
}
clean:
do {
ret = rte_intr_callback_unregister(pmd->intr_handle,
tap_dev_intr_handler, dev);
if (ret >= 0) {
break;
} else if (ret == -EAGAIN) {
rte_delay_ms(100);
} else {
TAP_LOG(ERR, "intr callback unregister failed: %d",
ret);
break;
}
} while (true);
if (rte_intr_fd_get(pmd->intr_handle) >= 0) {
tap_nl_final(rte_intr_fd_get(pmd->intr_handle));
rte_intr_fd_set(pmd->intr_handle, -1);
}
return 0;
}
static int
tap_intr_handle_set(struct rte_eth_dev *dev, int set)
{
int err;
err = tap_lsc_intr_handle_set(dev, set);
if (err < 0) {
if (!set)
tap_rx_intr_vec_set(dev, 0);
return err;
}
err = tap_rx_intr_vec_set(dev, set);
if (err && set)
tap_lsc_intr_handle_set(dev, 0);
return err;
}
static const uint32_t*
tap_dev_supported_ptypes_get(struct rte_eth_dev *dev __rte_unused)
{
static const uint32_t ptypes[] = {
RTE_PTYPE_INNER_L2_ETHER,
RTE_PTYPE_INNER_L2_ETHER_VLAN,
RTE_PTYPE_INNER_L2_ETHER_QINQ,
RTE_PTYPE_INNER_L3_IPV4,
RTE_PTYPE_INNER_L3_IPV4_EXT,
RTE_PTYPE_INNER_L3_IPV6,
RTE_PTYPE_INNER_L3_IPV6_EXT,
RTE_PTYPE_INNER_L4_FRAG,
RTE_PTYPE_INNER_L4_UDP,
RTE_PTYPE_INNER_L4_TCP,
RTE_PTYPE_INNER_L4_SCTP,
RTE_PTYPE_L2_ETHER,
RTE_PTYPE_L2_ETHER_VLAN,
RTE_PTYPE_L2_ETHER_QINQ,
RTE_PTYPE_L3_IPV4,
RTE_PTYPE_L3_IPV4_EXT,
RTE_PTYPE_L3_IPV6_EXT,
RTE_PTYPE_L3_IPV6,
RTE_PTYPE_L4_FRAG,
RTE_PTYPE_L4_UDP,
RTE_PTYPE_L4_TCP,
RTE_PTYPE_L4_SCTP,
};
return ptypes;
}
static int
tap_flow_ctrl_get(struct rte_eth_dev *dev __rte_unused,
struct rte_eth_fc_conf *fc_conf)
{
fc_conf->mode = RTE_ETH_FC_NONE;
return 0;
}
static int
tap_flow_ctrl_set(struct rte_eth_dev *dev __rte_unused,
struct rte_eth_fc_conf *fc_conf)
{
if (fc_conf->mode != RTE_ETH_FC_NONE)
return -ENOTSUP;
return 0;
}
/**
* DPDK callback to update the RSS hash configuration.
*
* @param dev
* Pointer to Ethernet device structure.
* @param[in] rss_conf
* RSS configuration data.
*
* @return
* 0 on success, a negative errno value otherwise and rte_errno is set.
*/
static int
tap_rss_hash_update(struct rte_eth_dev *dev,
struct rte_eth_rss_conf *rss_conf)
{
if (rss_conf->rss_hf & TAP_RSS_HF_MASK) {
rte_errno = EINVAL;
return -rte_errno;
}
if (rss_conf->rss_key && rss_conf->rss_key_len) {
/*
* Currently TAP RSS key is hard coded
* and cannot be updated
*/
TAP_LOG(ERR,
"port %u RSS key cannot be updated",
dev->data->port_id);
rte_errno = EINVAL;
return -rte_errno;
}
return 0;
}
static int
tap_rx_queue_start(struct rte_eth_dev *dev, uint16_t rx_queue_id)
{
dev->data->rx_queue_state[rx_queue_id] = RTE_ETH_QUEUE_STATE_STARTED;
return 0;
}
static int
tap_tx_queue_start(struct rte_eth_dev *dev, uint16_t tx_queue_id)
{
dev->data->tx_queue_state[tx_queue_id] = RTE_ETH_QUEUE_STATE_STARTED;
return 0;
}
static int
tap_rx_queue_stop(struct rte_eth_dev *dev, uint16_t rx_queue_id)
{
dev->data->rx_queue_state[rx_queue_id] = RTE_ETH_QUEUE_STATE_STOPPED;
return 0;
}
static int
tap_tx_queue_stop(struct rte_eth_dev *dev, uint16_t tx_queue_id)
{
dev->data->tx_queue_state[tx_queue_id] = RTE_ETH_QUEUE_STATE_STOPPED;
return 0;
}
static const struct eth_dev_ops ops = {
.dev_start = tap_dev_start,
.dev_stop = tap_dev_stop,
.dev_close = tap_dev_close,
.dev_configure = tap_dev_configure,
.dev_infos_get = tap_dev_info,
.rx_queue_setup = tap_rx_queue_setup,
.tx_queue_setup = tap_tx_queue_setup,
.rx_queue_start = tap_rx_queue_start,
.tx_queue_start = tap_tx_queue_start,
.rx_queue_stop = tap_rx_queue_stop,
.tx_queue_stop = tap_tx_queue_stop,
.rx_queue_release = tap_rx_queue_release,
.tx_queue_release = tap_tx_queue_release,
.flow_ctrl_get = tap_flow_ctrl_get,
.flow_ctrl_set = tap_flow_ctrl_set,
.link_update = tap_link_update,
.dev_set_link_up = tap_link_set_up,
.dev_set_link_down = tap_link_set_down,
.promiscuous_enable = tap_promisc_enable,
.promiscuous_disable = tap_promisc_disable,
.allmulticast_enable = tap_allmulti_enable,
.allmulticast_disable = tap_allmulti_disable,
.mac_addr_set = tap_mac_set,
.mtu_set = tap_mtu_set,
.set_mc_addr_list = tap_set_mc_addr_list,
.stats_get = tap_stats_get,
.stats_reset = tap_stats_reset,
.dev_supported_ptypes_get = tap_dev_supported_ptypes_get,
.rss_hash_update = tap_rss_hash_update,
.flow_ops_get = tap_dev_flow_ops_get,
};
static int
eth_dev_tap_create(struct rte_vdev_device *vdev, const char *tap_name,
char *remote_iface, struct rte_ether_addr *mac_addr,
enum rte_tuntap_type type, int persist)
{
int numa_node = rte_socket_id();
struct rte_eth_dev *dev;
struct pmd_internals *pmd;
struct pmd_process_private *process_private;
const char *tuntap_name = tuntap_types[type];
struct rte_eth_dev_data *data;
struct ifreq ifr;
int i;
TAP_LOG(DEBUG, "%s device on numa %u", tuntap_name, rte_socket_id());
dev = rte_eth_vdev_allocate(vdev, sizeof(*pmd));
if (!dev) {
TAP_LOG(ERR, "%s Unable to allocate device struct",
tuntap_name);
goto error_exit_nodev;
}
process_private = (struct pmd_process_private *)
rte_zmalloc_socket(tap_name, sizeof(struct pmd_process_private),
RTE_CACHE_LINE_SIZE, dev->device->numa_node);
if (process_private == NULL) {
TAP_LOG(ERR, "Failed to alloc memory for process private");
return -1;
}
pmd = dev->data->dev_private;
dev->process_private = process_private;
pmd->dev = dev;
strlcpy(pmd->name, tap_name, sizeof(pmd->name));
pmd->type = type;
pmd->ka_fd = -1;
pmd->nlsk_fd = -1;
pmd->gso_ctx_mp = NULL;
pmd->ioctl_sock = socket(AF_INET, SOCK_DGRAM, 0);
if (pmd->ioctl_sock == -1) {
TAP_LOG(ERR,
"%s Unable to get a socket for management: %s",
tuntap_name, strerror(errno));
goto error_exit;
}
/* Allocate interrupt instance */
pmd->intr_handle = rte_intr_instance_alloc(RTE_INTR_INSTANCE_F_SHARED);
if (pmd->intr_handle == NULL) {
TAP_LOG(ERR, "Failed to allocate intr handle");
goto error_exit;
}
/* Setup some default values */
data = dev->data;
data->dev_private = pmd;
data->dev_flags = RTE_ETH_DEV_INTR_LSC |
RTE_ETH_DEV_AUTOFILL_QUEUE_XSTATS;
data->numa_node = numa_node;
data->dev_link = pmd_link;
data->mac_addrs = &pmd->eth_addr;
/* Set the number of RX and TX queues */
data->nb_rx_queues = 0;
data->nb_tx_queues = 0;
dev->dev_ops = &ops;
dev->rx_pkt_burst = pmd_rx_burst;
dev->tx_pkt_burst = pmd_tx_burst;
rte_intr_type_set(pmd->intr_handle, RTE_INTR_HANDLE_EXT);
rte_intr_fd_set(pmd->intr_handle, -1);
dev->intr_handle = pmd->intr_handle;
/* Presetup the fds to -1 as being not valid */
for (i = 0; i < RTE_PMD_TAP_MAX_QUEUES; i++) {
process_private->rxq_fds[i] = -1;
process_private->txq_fds[i] = -1;
}
if (pmd->type == ETH_TUNTAP_TYPE_TAP) {
if (rte_is_zero_ether_addr(mac_addr))
rte_eth_random_addr((uint8_t *)&pmd->eth_addr);
else
rte_memcpy(&pmd->eth_addr, mac_addr, sizeof(*mac_addr));
}
/*
* Allocate a TUN device keep-alive file descriptor that will only be
* closed when the TUN device itself is closed or removed.
* This keep-alive file descriptor will guarantee that the TUN device
* exists even when all of its queues are closed
*/
pmd->ka_fd = tun_alloc(pmd, 1, persist);
if (pmd->ka_fd == -1) {
TAP_LOG(ERR, "Unable to create %s interface", tuntap_name);
goto error_exit;
}
TAP_LOG(DEBUG, "allocated %s", pmd->name);
ifr.ifr_mtu = dev->data->mtu;
if (tap_ioctl(pmd, SIOCSIFMTU, &ifr, 1, LOCAL_AND_REMOTE) < 0)
goto error_exit;
if (pmd->type == ETH_TUNTAP_TYPE_TAP) {
memset(&ifr, 0, sizeof(struct ifreq));
ifr.ifr_hwaddr.sa_family = AF_LOCAL;
rte_memcpy(ifr.ifr_hwaddr.sa_data, &pmd->eth_addr,
RTE_ETHER_ADDR_LEN);
if (tap_ioctl(pmd, SIOCSIFHWADDR, &ifr, 0, LOCAL_ONLY) < 0)
goto error_exit;
}
/* Make network device persist after application exit */
pmd->persist = persist;
/*
* Set up everything related to rte_flow:
* - netlink socket
* - tap / remote if_index
* - mandatory QDISCs
* - rte_flow actual/implicit lists
* - implicit rules
*/
pmd->nlsk_fd = tap_nl_init(0);
if (pmd->nlsk_fd == -1) {
TAP_LOG(WARNING, "%s: failed to create netlink socket.",
pmd->name);
goto disable_rte_flow;
}
pmd->if_index = if_nametoindex(pmd->name);
if (!pmd->if_index) {
TAP_LOG(ERR, "%s: failed to get if_index.", pmd->name);
goto disable_rte_flow;
}
if (qdisc_create_multiq(pmd->nlsk_fd, pmd->if_index) < 0) {
TAP_LOG(ERR, "%s: failed to create multiq qdisc.",
pmd->name);
goto disable_rte_flow;
}
if (qdisc_create_ingress(pmd->nlsk_fd, pmd->if_index) < 0) {
TAP_LOG(ERR, "%s: failed to create ingress qdisc.",
pmd->name);
goto disable_rte_flow;
}
LIST_INIT(&pmd->flows);
if (strlen(remote_iface)) {
pmd->remote_if_index = if_nametoindex(remote_iface);
if (!pmd->remote_if_index) {
TAP_LOG(ERR, "%s: failed to get %s if_index.",
pmd->name, remote_iface);
goto error_remote;
}
strlcpy(pmd->remote_iface, remote_iface, RTE_ETH_NAME_MAX_LEN);
/* Save state of remote device */
tap_ioctl(pmd, SIOCGIFFLAGS, &pmd->remote_initial_flags, 0, REMOTE_ONLY);
/* Replicate remote MAC address */
if (tap_ioctl(pmd, SIOCGIFHWADDR, &ifr, 0, REMOTE_ONLY) < 0) {
TAP_LOG(ERR, "%s: failed to get %s MAC address.",
pmd->name, pmd->remote_iface);
goto error_remote;
}
rte_memcpy(&pmd->eth_addr, ifr.ifr_hwaddr.sa_data,
RTE_ETHER_ADDR_LEN);
/* The desired MAC is already in ifreq after SIOCGIFHWADDR. */
if (tap_ioctl(pmd, SIOCSIFHWADDR, &ifr, 0, LOCAL_ONLY) < 0) {
TAP_LOG(ERR, "%s: failed to get %s MAC address.",
pmd->name, remote_iface);
goto error_remote;
}
/*
* Flush usually returns negative value because it tries to
* delete every QDISC (and on a running device, one QDISC at
* least is needed). Ignore negative return value.
*/
qdisc_flush(pmd->nlsk_fd, pmd->remote_if_index);
if (qdisc_create_ingress(pmd->nlsk_fd,
pmd->remote_if_index) < 0) {
TAP_LOG(ERR, "%s: failed to create ingress qdisc.",
pmd->remote_iface);
goto error_remote;
}
LIST_INIT(&pmd->implicit_flows);
if (tap_flow_implicit_create(pmd, TAP_REMOTE_TX) < 0 ||
tap_flow_implicit_create(pmd, TAP_REMOTE_LOCAL_MAC) < 0 ||
tap_flow_implicit_create(pmd, TAP_REMOTE_BROADCAST) < 0 ||
tap_flow_implicit_create(pmd, TAP_REMOTE_BROADCASTV6) < 0) {
TAP_LOG(ERR,
"%s: failed to create implicit rules.",
pmd->name);
goto error_remote;
}
}
rte_eth_dev_probing_finish(dev);
return 0;
disable_rte_flow:
TAP_LOG(ERR, " Disabling rte flow support: %s(%d)",
strerror(errno), errno);
if (strlen(remote_iface)) {
TAP_LOG(ERR, "Remote feature requires flow support.");
goto error_exit;
}
rte_eth_dev_probing_finish(dev);
return 0;
error_remote:
TAP_LOG(ERR, " Can't set up remote feature: %s(%d)",
strerror(errno), errno);
tap_flow_implicit_flush(pmd, NULL);
error_exit:
if (pmd->nlsk_fd != -1)
close(pmd->nlsk_fd);
if (pmd->ka_fd != -1)
close(pmd->ka_fd);
if (pmd->ioctl_sock != -1)
close(pmd->ioctl_sock);
/* mac_addrs must not be freed alone because it is part of dev_private */
dev->data->mac_addrs = NULL;
rte_intr_instance_free(pmd->intr_handle);
rte_eth_dev_release_port(dev);
error_exit_nodev:
TAP_LOG(ERR, "%s Unable to initialize %s",
tuntap_name, rte_vdev_device_name(vdev));
return -EINVAL;
}
/* make sure name is a possible Linux network device name */
static bool
is_valid_iface(const char *name)
{
if (*name == '\0')
return false;
if (strnlen(name, IFNAMSIZ) == IFNAMSIZ)
return false;
while (*name) {
if (*name == '/' || *name == ':' || isspace(*name))
return false;
name++;
}
return true;
}
static int
set_interface_name(const char *key __rte_unused,
const char *value,
void *extra_args)
{
char *name = (char *)extra_args;
if (value) {
if (!is_valid_iface(value)) {
TAP_LOG(ERR, "TAP invalid remote interface name (%s)",
value);
return -1;
}
strlcpy(name, value, RTE_ETH_NAME_MAX_LEN);
} else {
/* use dtap%d, which causes the kernel to choose the next available index */
strlcpy(name, DEFAULT_TAP_NAME "%d", RTE_ETH_NAME_MAX_LEN);
}
return 0;
}
static int
set_remote_iface(const char *key __rte_unused,
const char *value,
void *extra_args)
{
char *name = (char *)extra_args;
if (value) {
if (!is_valid_iface(value)) {
TAP_LOG(ERR, "TAP invalid remote interface name (%s)",
value);
return -1;
}
strlcpy(name, value, RTE_ETH_NAME_MAX_LEN);
}
return 0;
}
static int parse_user_mac(struct rte_ether_addr *user_mac,
const char *value)
{
unsigned int index = 0;
char mac_temp[strlen(ETH_TAP_USR_MAC_FMT) + 1], *mac_byte = NULL;
if (user_mac == NULL || value == NULL)
return 0;
strlcpy(mac_temp, value, sizeof(mac_temp));
mac_byte = strtok(mac_temp, ":");
while ((mac_byte != NULL) &&
(strlen(mac_byte) <= 2) &&
(strlen(mac_byte) == strspn(mac_byte,
ETH_TAP_CMP_MAC_FMT))) {
user_mac->addr_bytes[index++] = strtoul(mac_byte, NULL, 16);
mac_byte = strtok(NULL, ":");
}
return index;
}
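/*
 * Handle the "mac" devargs: either the literal "fixed", which assigns
 * 00:64:74:61:70:<n> ("dtap" in ASCII plus a per-device index), or an
 * explicit address in ETH_TAP_USR_MAC_FMT form.
 */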
static int
set_mac_type(const char *key __rte_unused,
const char *value,
void *extra_args)
{
struct rte_ether_addr *user_mac = extra_args;
if (!value)
return 0;
if (!strncasecmp(ETH_TAP_MAC_FIXED, value, strlen(ETH_TAP_MAC_FIXED))) {
static int iface_idx;
/* fixed mac = 00:64:74:61:70:<iface_idx> */
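		/* The last byte is the ASCII digit of iface_idx, so the
		 * documented pattern only holds for the first ten devices. */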
memcpy((char *)user_mac->addr_bytes, "\0dtap",
RTE_ETHER_ADDR_LEN);
user_mac->addr_bytes[RTE_ETHER_ADDR_LEN - 1] =
iface_idx++ + '0';
goto success;
}
	if (parse_user_mac(user_mac, value) != RTE_ETHER_ADDR_LEN)
goto error;
success:
TAP_LOG(DEBUG, "TAP user MAC param (%s)", value);
return 0;
error:
TAP_LOG(ERR, "TAP user MAC (%s) is not in format (%s|%s)",
value, ETH_TAP_MAC_FIXED, ETH_TAP_USR_MAC_FMT);
return -1;
}
/*
 * Open a TUN interface device. The TUN PMD
 * 1) sets tap_type to false,
 * 2) takes the iface devargs as the interface name,
 * 3) reports a fixed 10G link speed, since the interface is virtual.
 */
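/*
 * Example invocation (illustrative):
 *   dpdk-testpmd --vdev=net_tun0,iface=tun0 -- -i
 */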
static int
rte_pmd_tun_probe(struct rte_vdev_device *dev)
{
const char *name, *params;
int ret;
struct rte_kvargs *kvlist = NULL;
char tun_name[RTE_ETH_NAME_MAX_LEN];
char remote_iface[RTE_ETH_NAME_MAX_LEN];
struct rte_eth_dev *eth_dev;
name = rte_vdev_device_name(dev);
params = rte_vdev_device_args(dev);
memset(remote_iface, 0, RTE_ETH_NAME_MAX_LEN);
if (rte_eal_process_type() == RTE_PROC_SECONDARY &&
strlen(params) == 0) {
eth_dev = rte_eth_dev_attach_secondary(name);
if (!eth_dev) {
TAP_LOG(ERR, "Failed to probe %s", name);
return -1;
}
eth_dev->dev_ops = &ops;
eth_dev->device = &dev->device;
rte_eth_dev_probing_finish(eth_dev);
return 0;
}
	/* use tun%d so the kernel picks the next available index */
strlcpy(tun_name, DEFAULT_TUN_NAME "%d", RTE_ETH_NAME_MAX_LEN);
if (params && (params[0] != '\0')) {
TAP_LOG(DEBUG, "parameters (%s)", params);
kvlist = rte_kvargs_parse(params, valid_arguments);
if (kvlist) {
if (rte_kvargs_count(kvlist, ETH_TAP_IFACE_ARG) == 1) {
ret = rte_kvargs_process(kvlist,
ETH_TAP_IFACE_ARG,
&set_interface_name,
tun_name);
if (ret == -1)
goto leave;
}
}
}
pmd_link.link_speed = RTE_ETH_SPEED_NUM_10G;
TAP_LOG(DEBUG, "Initializing pmd_tun for %s", name);
	ret = eth_dev_tap_create(dev, tun_name, remote_iface, NULL,
		ETH_TUNTAP_TYPE_TUN, 0);
leave:
if (ret == -1) {
TAP_LOG(ERR, "Failed to create pmd for %s as %s",
name, tun_name);
}
rte_kvargs_free(kvlist);
return ret;
}
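/*
 * Multi-process queue sharing: the tap queues are plain file descriptors
 * owned by the primary process, so a secondary process cannot reuse the
 * fd numbers directly. Instead it requests them over the rte_mp channel,
 * which transfers the fds between processes.
 */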
/* Runs in the secondary process: request queue fds from the primary. */
static int
tap_mp_attach_queues(const char *port_name, struct rte_eth_dev *dev)
{
int ret;
struct timespec timeout = {.tv_sec = 1, .tv_nsec = 0};
struct rte_mp_msg request, *reply;
struct rte_mp_reply replies;
struct ipc_queues *request_param = (struct ipc_queues *)request.param;
struct ipc_queues *reply_param;
struct pmd_process_private *process_private = dev->process_private;
int queue, fd_iterator;
/* Prepare the request */
memset(&request, 0, sizeof(request));
strlcpy(request.name, TAP_MP_KEY, sizeof(request.name));
strlcpy(request_param->port_name, port_name,
sizeof(request_param->port_name));
request.len_param = sizeof(*request_param);
/* Send request and receive reply */
ret = rte_mp_request_sync(&request, &replies, &timeout);
if (ret < 0 || replies.nb_received != 1) {
TAP_LOG(ERR, "Failed to request queues from primary: %d",
rte_errno);
return -1;
}
reply = &replies.msgs[0];
reply_param = (struct ipc_queues *)reply->param;
TAP_LOG(DEBUG, "Received IPC reply for %s", reply_param->port_name);
/* Attach the queues from received file descriptors */
	if (reply_param->rxq_count + reply_param->txq_count != reply->num_fds) {
		TAP_LOG(ERR, "Unexpected number of fds received");
		free(reply);
		return -1;
	}
dev->data->nb_rx_queues = reply_param->rxq_count;
dev->data->nb_tx_queues = reply_param->txq_count;
fd_iterator = 0;
for (queue = 0; queue < reply_param->rxq_count; queue++)
process_private->rxq_fds[queue] = reply->fds[fd_iterator++];
for (queue = 0; queue < reply_param->txq_count; queue++)
process_private->txq_fds[queue] = reply->fds[fd_iterator++];
free(reply);
return 0;
}
/* Runs in the primary process: send the queue file descriptors to the secondary. */
static int
tap_mp_sync_queues(const struct rte_mp_msg *request, const void *peer)
{
struct rte_eth_dev *dev;
struct pmd_process_private *process_private;
struct rte_mp_msg reply;
const struct ipc_queues *request_param =
(const struct ipc_queues *)request->param;
struct ipc_queues *reply_param =
(struct ipc_queues *)reply.param;
int queue;
/* Get requested port */
TAP_LOG(DEBUG, "Received IPC request for %s", request_param->port_name);
dev = rte_eth_dev_get_by_name(request_param->port_name);
if (!dev) {
TAP_LOG(ERR, "Failed to get port id for %s",
request_param->port_name);
return -1;
}
process_private = dev->process_private;
/* Fill file descriptors for all queues */
reply.num_fds = 0;
reply_param->rxq_count = 0;
	if (dev->data->nb_rx_queues + dev->data->nb_tx_queues >
	    RTE_MP_MAX_FD_NUM) {
TAP_LOG(ERR, "Number of rx/tx queues exceeds max number of fds");
return -1;
}
for (queue = 0; queue < dev->data->nb_rx_queues; queue++) {
reply.fds[reply.num_fds++] = process_private->rxq_fds[queue];
reply_param->rxq_count++;
}
RTE_ASSERT(reply_param->rxq_count == dev->data->nb_rx_queues);
reply_param->txq_count = 0;
for (queue = 0; queue < dev->data->nb_tx_queues; queue++) {
reply.fds[reply.num_fds++] = process_private->txq_fds[queue];
reply_param->txq_count++;
}
RTE_ASSERT(reply_param->txq_count == dev->data->nb_tx_queues);
/* Send reply */
strlcpy(reply.name, request->name, sizeof(reply.name));
strlcpy(reply_param->port_name, request_param->port_name,
sizeof(reply_param->port_name));
reply.len_param = sizeof(*reply_param);
if (rte_mp_reply(&reply, peer) < 0) {
TAP_LOG(ERR, "Failed to reply an IPC request to sync queues");
return -1;
}
return 0;
}
/* Open a TAP interface device. */
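/*
 * Example vdev arguments (illustrative):
 *   --vdev=net_tap0,iface=tap0,mac=fixed
 *   --vdev=net_tap1,iface=tap1,remote=eth0,persist
 */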
static int
rte_pmd_tap_probe(struct rte_vdev_device *dev)
{
const char *name, *params;
int ret;
struct rte_kvargs *kvlist = NULL;
int speed;
char tap_name[RTE_ETH_NAME_MAX_LEN];
char remote_iface[RTE_ETH_NAME_MAX_LEN];
struct rte_ether_addr user_mac = { .addr_bytes = {0} };
struct rte_eth_dev *eth_dev;
int tap_devices_count_increased = 0;
int persist = 0;
name = rte_vdev_device_name(dev);
params = rte_vdev_device_args(dev);
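	/* A secondary process attaches to the existing port and fetches its
	 * queue fds from the primary over IPC instead of creating a tap. */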
if (rte_eal_process_type() == RTE_PROC_SECONDARY) {
eth_dev = rte_eth_dev_attach_secondary(name);
if (!eth_dev) {
TAP_LOG(ERR, "Failed to probe %s", name);
return -1;
}
eth_dev->dev_ops = &ops;
eth_dev->device = &dev->device;
eth_dev->rx_pkt_burst = pmd_rx_burst;
eth_dev->tx_pkt_burst = pmd_tx_burst;
if (!rte_eal_primary_proc_alive(NULL)) {
TAP_LOG(ERR, "Primary process is missing");
return -1;
}
eth_dev->process_private = (struct pmd_process_private *)
rte_zmalloc_socket(name,
sizeof(struct pmd_process_private),
RTE_CACHE_LINE_SIZE,
eth_dev->device->numa_node);
if (eth_dev->process_private == NULL) {
TAP_LOG(ERR,
"Failed to alloc memory for process private");
return -1;
}
ret = tap_mp_attach_queues(name, eth_dev);
if (ret != 0)
return -1;
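		/* Register the start-Rx/Tx IPC action once per process so the
		 * primary can notify this secondary when queues are started. */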
if (!tap_devices_count) {
ret = rte_mp_action_register(TAP_MP_REQ_START_RXTX, tap_mp_req_start_rxtx);
if (ret < 0 && rte_errno != ENOTSUP) {
TAP_LOG(ERR, "tap: Failed to register IPC callback: %s",
strerror(rte_errno));
return -1;
}
}
tap_devices_count++;
rte_eth_dev_probing_finish(eth_dev);
return 0;
}
speed = RTE_ETH_SPEED_NUM_10G;
	/* use tap%d so the kernel picks the next available index */
strlcpy(tap_name, DEFAULT_TAP_NAME "%d", RTE_ETH_NAME_MAX_LEN);
memset(remote_iface, 0, RTE_ETH_NAME_MAX_LEN);
if (params && (params[0] != '\0')) {
TAP_LOG(DEBUG, "parameters (%s)", params);
kvlist = rte_kvargs_parse(params, valid_arguments);
if (kvlist) {
if (rte_kvargs_count(kvlist, ETH_TAP_IFACE_ARG) == 1) {
ret = rte_kvargs_process(kvlist,
ETH_TAP_IFACE_ARG,
&set_interface_name,
tap_name);
if (ret == -1)
goto leave;
}
if (rte_kvargs_count(kvlist, ETH_TAP_REMOTE_ARG) == 1) {
ret = rte_kvargs_process(kvlist,
ETH_TAP_REMOTE_ARG,
&set_remote_iface,
remote_iface);
if (ret == -1)
goto leave;
}
if (rte_kvargs_count(kvlist, ETH_TAP_MAC_ARG) == 1) {
ret = rte_kvargs_process(kvlist,
ETH_TAP_MAC_ARG,
&set_mac_type,
&user_mac);
if (ret == -1)
goto leave;
}
if (rte_kvargs_count(kvlist, ETH_TAP_PERSIST_ARG) == 1)
persist = 1;
}
}
pmd_link.link_speed = speed;
TAP_LOG(DEBUG, "Initializing pmd_tap for %s", name);
	/* Register the IPC callback that supplies queue fds to secondary processes */
if (!tap_devices_count) {
ret = rte_mp_action_register(TAP_MP_KEY, tap_mp_sync_queues);
if (ret < 0 && rte_errno != ENOTSUP) {
TAP_LOG(ERR, "tap: Failed to register IPC callback: %s",
strerror(rte_errno));
goto leave;
}
}
tap_devices_count++;
tap_devices_count_increased = 1;
ret = eth_dev_tap_create(dev, tap_name, remote_iface, &user_mac,
ETH_TUNTAP_TYPE_TAP, persist);
leave:
if (ret == -1) {
TAP_LOG(ERR, "Failed to create pmd for %s as %s",
name, tap_name);
if (tap_devices_count_increased == 1) {
if (tap_devices_count == 1)
rte_mp_action_unregister(TAP_MP_KEY);
tap_devices_count--;
}
}
rte_kvargs_free(kvlist);
return ret;
}
/* Detach a TUN/TAP device. */
static int
rte_pmd_tap_remove(struct rte_vdev_device *dev)
{
struct rte_eth_dev *eth_dev = NULL;
/* find the ethdev entry */
eth_dev = rte_eth_dev_allocated(rte_vdev_device_name(dev));
if (!eth_dev)
return 0;
tap_dev_close(eth_dev);
rte_eth_dev_release_port(eth_dev);
return 0;
}
static struct rte_vdev_driver pmd_tun_drv = {
.probe = rte_pmd_tun_probe,
.remove = rte_pmd_tap_remove,
};
static struct rte_vdev_driver pmd_tap_drv = {
.probe = rte_pmd_tap_probe,
.remove = rte_pmd_tap_remove,
};
RTE_PMD_REGISTER_VDEV(net_tap, pmd_tap_drv);
RTE_PMD_REGISTER_VDEV(net_tun, pmd_tun_drv);
RTE_PMD_REGISTER_ALIAS(net_tap, eth_tap);
RTE_PMD_REGISTER_PARAM_STRING(net_tun,
ETH_TAP_IFACE_ARG "=<string> ");
RTE_PMD_REGISTER_PARAM_STRING(net_tap,
			      ETH_TAP_IFACE_ARG "=<string> "
			      ETH_TAP_MAC_ARG "=" ETH_TAP_MAC_ARG_FMT " "
			      ETH_TAP_REMOTE_ARG "=<string> "
			      ETH_TAP_PERSIST_ARG);
RTE_LOG_REGISTER_DEFAULT(tap_logtype, NOTICE);