numam-dpdk/drivers/net/mlx5/mlx5_nl.c

/* SPDX-License-Identifier: BSD-3-Clause
 * Copyright 2018 6WIND S.A.
 * Copyright 2018 Mellanox Technologies, Ltd
 */

#include <errno.h>
#include <linux/if_link.h>
#include <linux/netlink.h>
#include <linux/rtnetlink.h>
#include <net/if.h>
#include <rdma/rdma_netlink.h>
#include <stdbool.h>
#include <stdint.h>
#include <stdlib.h>
#include <stdalign.h>
#include <string.h>
#include <sys/socket.h>
#include <unistd.h>

#include <rte_errno.h>
#include <rte_malloc.h>
#include <rte_hypervisor.h>

#include "mlx5.h"
#include "mlx5_utils.h"

/* Size of the buffer to receive kernel messages */
#define MLX5_NL_BUF_SIZE (32 * 1024)
/* Send buffer size for the Netlink socket */
#define MLX5_SEND_BUF_SIZE 32768
/* Receive buffer size for the Netlink socket */
#define MLX5_RECV_BUF_SIZE 32768

/** Parameters of VLAN devices created by driver. */
#define MLX5_VMWA_VLAN_DEVICE_PFX "evmlx"
/*
 * Define NDA_RTA as defined in iproute2 sources.
 *
 * see in iproute2 sources file include/libnetlink.h
 */
#ifndef MLX5_NDA_RTA
#define MLX5_NDA_RTA(r) \
	((struct rtattr *)(((char *)(r)) + NLMSG_ALIGN(sizeof(struct ndmsg))))
#endif

/*
 * The following definitions are normally found in rdma/rdma_netlink.h,
 * however they are so recent that most systems do not expose them yet.
 */
#ifndef HAVE_RDMA_NL_NLDEV
#define RDMA_NL_NLDEV 5
#endif
#ifndef HAVE_RDMA_NLDEV_CMD_GET
#define RDMA_NLDEV_CMD_GET 1
#endif
#ifndef HAVE_RDMA_NLDEV_CMD_PORT_GET
#define RDMA_NLDEV_CMD_PORT_GET 5
#endif
#ifndef HAVE_RDMA_NLDEV_ATTR_DEV_INDEX
#define RDMA_NLDEV_ATTR_DEV_INDEX 1
#endif
#ifndef HAVE_RDMA_NLDEV_ATTR_DEV_NAME
#define RDMA_NLDEV_ATTR_DEV_NAME 2
#endif
#ifndef HAVE_RDMA_NLDEV_ATTR_PORT_INDEX
#define RDMA_NLDEV_ATTR_PORT_INDEX 3
#endif
#ifndef HAVE_RDMA_NLDEV_ATTR_NDEV_INDEX
#define RDMA_NLDEV_ATTR_NDEV_INDEX 50
#endif

/* These are normally found in linux/if_link.h. */
#ifndef HAVE_IFLA_NUM_VF
#define IFLA_NUM_VF 21
#endif
#ifndef HAVE_IFLA_EXT_MASK
#define IFLA_EXT_MASK 29
#endif
#ifndef HAVE_IFLA_PHYS_SWITCH_ID
#define IFLA_PHYS_SWITCH_ID 36
#endif
#ifndef HAVE_IFLA_PHYS_PORT_NAME
#define IFLA_PHYS_PORT_NAME 38
#endif

/* Add/remove MAC address through Netlink */
struct mlx5_nl_mac_addr {
	struct rte_ether_addr (*mac)[];
	/**< MAC address handled by the device. */
	int mac_n; /**< Number of addresses in the array. */
};

#define MLX5_NL_CMD_GET_IB_NAME (1 << 0)
#define MLX5_NL_CMD_GET_IB_INDEX (1 << 1)
#define MLX5_NL_CMD_GET_NET_INDEX (1 << 2)
#define MLX5_NL_CMD_GET_PORT_INDEX (1 << 3)

/** Data structure used by mlx5_nl_cmdget_cb(). */
struct mlx5_nl_ifindex_data {
	const char *name; /**< IB device name (in). */
	uint32_t flags; /**< found attribute flags (out). */
	uint32_t ibindex; /**< IB device index (out). */
	uint32_t ifindex; /**< Network interface index (out). */
	uint32_t portnum; /**< IB device max port number (out). */
};

/**
 * Opens a Netlink socket.
 *
 * @param protocol
 *   Netlink protocol (e.g. NETLINK_ROUTE, NETLINK_RDMA).
 *
 * @return
 *   A file descriptor on success, a negative errno value otherwise and
 *   rte_errno is set.
 */
int
mlx5_nl_init(int protocol)
{
	int fd;
	int sndbuf_size = MLX5_SEND_BUF_SIZE;
	int rcvbuf_size = MLX5_RECV_BUF_SIZE;
	struct sockaddr_nl local = {
		.nl_family = AF_NETLINK,
	};
	int ret;

	fd = socket(AF_NETLINK, SOCK_RAW | SOCK_CLOEXEC, protocol);
	if (fd == -1) {
		rte_errno = errno;
		return -rte_errno;
	}
	ret = setsockopt(fd, SOL_SOCKET, SO_SNDBUF, &sndbuf_size, sizeof(int));
	if (ret == -1) {
		rte_errno = errno;
		goto error;
	}
	ret = setsockopt(fd, SOL_SOCKET, SO_RCVBUF, &rcvbuf_size, sizeof(int));
	if (ret == -1) {
		rte_errno = errno;
		goto error;
	}
	ret = bind(fd, (struct sockaddr *)&local, sizeof(local));
	if (ret == -1) {
		rte_errno = errno;
		goto error;
	}
	return fd;
error:
	close(fd);
	return -rte_errno;
}

/**
 * Send a request message to the kernel on the Netlink socket.
 *
 * @param[in] nlsk_fd
 *   Netlink socket file descriptor.
 * @param[in] nh
 *   The Netlink message send to the kernel.
 * @param[in] ssn
 *   Sequence number.
 * @param[in] req
 *   Pointer to the request structure.
 * @param[in] len
 *   Length of the request in bytes.
 *
 * @return
 *   The number of sent bytes on success, a negative errno value otherwise and
 *   rte_errno is set.
 */
static int
mlx5_nl_request(int nlsk_fd, struct nlmsghdr *nh, uint32_t sn, void *req,
		int len)
{
	struct sockaddr_nl sa = {
		.nl_family = AF_NETLINK,
	};
	struct iovec iov[2] = {
		{ .iov_base = nh, .iov_len = sizeof(*nh), },
		{ .iov_base = req, .iov_len = len, },
	};
	struct msghdr msg = {
		.msg_name = &sa,
		.msg_namelen = sizeof(sa),
		.msg_iov = iov,
		.msg_iovlen = 2,
	};
	int send_bytes;

	nh->nlmsg_pid = 0; /* communication with the kernel uses pid 0 */
	nh->nlmsg_seq = sn;
	send_bytes = sendmsg(nlsk_fd, &msg, 0);
	if (send_bytes < 0) {
		rte_errno = errno;
		return -rte_errno;
	}
	return send_bytes;
}

/**
 * Send a message to the kernel on the Netlink socket.
 *
 * @param[in] nlsk_fd
 *   The Netlink socket file descriptor used for communication.
 * @param[in] nh
 *   The Netlink message send to the kernel.
 * @param[in] sn
 *   Sequence number.
 *
 * @return
 *   The number of sent bytes on success, a negative errno value otherwise and
 *   rte_errno is set.
 */
static int
mlx5_nl_send(int nlsk_fd, struct nlmsghdr *nh, uint32_t sn)
{
	struct sockaddr_nl sa = {
		.nl_family = AF_NETLINK,
	};
	struct iovec iov = {
		.iov_base = nh,
		.iov_len = nh->nlmsg_len,
	};
	struct msghdr msg = {
		.msg_name = &sa,
		.msg_namelen = sizeof(sa),
		.msg_iov = &iov,
		.msg_iovlen = 1,
	};
	int send_bytes;

	nh->nlmsg_pid = 0; /* communication with the kernel uses pid 0 */
	nh->nlmsg_seq = sn;
	send_bytes = sendmsg(nlsk_fd, &msg, 0);
	if (send_bytes < 0) {
		rte_errno = errno;
		return -rte_errno;
	}
	return send_bytes;
}

/**
 * Receive a message from the kernel on the Netlink socket, following
 * mlx5_nl_send().
 *
 * @param[in] nlsk_fd
 *   The Netlink socket file descriptor used for communication.
 * @param[in] sn
 *   Sequence number.
 * @param[in] cb
 *   The callback function to call for each Netlink message received.
 * @param[in, out] arg
 *   Custom arguments for the callback.
 *
 * @return
 *   0 on success, a negative errno value otherwise and rte_errno is set.
 */
static int
mlx5_nl_recv(int nlsk_fd, uint32_t sn, int (*cb)(struct nlmsghdr *, void *arg),
	     void *arg)
{
	struct sockaddr_nl sa;
	char buf[MLX5_RECV_BUF_SIZE];
	struct iovec iov = {
		.iov_base = buf,
		.iov_len = sizeof(buf),
	};
	struct msghdr msg = {
		.msg_name = &sa,
		.msg_namelen = sizeof(sa),
		.msg_iov = &iov,
		/* One message at a time */
		.msg_iovlen = 1,
	};
	int multipart = 0;
	int ret = 0;

	do {
		struct nlmsghdr *nh;
		int recv_bytes = 0;

		do {
			recv_bytes = recvmsg(nlsk_fd, &msg, 0);
			if (recv_bytes == -1) {
				rte_errno = errno;
				return -rte_errno;
			}
			nh = (struct nlmsghdr *)buf;
		} while (nh->nlmsg_seq != sn);
		for (;
		     NLMSG_OK(nh, (unsigned int)recv_bytes);
		     nh = NLMSG_NEXT(nh, recv_bytes)) {
			if (nh->nlmsg_type == NLMSG_ERROR) {
				struct nlmsgerr *err_data = NLMSG_DATA(nh);

				if (err_data->error < 0) {
					rte_errno = -err_data->error;
					return -rte_errno;
				}
				/* Ack message. */
				return 0;
			}
			/* Multi-part msgs and their trailing DONE message. */
			if (nh->nlmsg_flags & NLM_F_MULTI) {
				if (nh->nlmsg_type == NLMSG_DONE)
					return 0;
				multipart = 1;
			}
			if (cb) {
				ret = cb(nh, arg);
				if (ret < 0)
					return ret;
			}
		}
	} while (multipart);
	return ret;
}

/**
 * Parse Netlink message to retrieve the bridge MAC address.
 *
 * @param nh
 *   Pointer to Netlink Message Header.
 * @param arg
 *   PMD data register with this callback.
 *
 * @return
 *   0 on success, a negative errno value otherwise and rte_errno is set.
 */
static int
mlx5_nl_mac_addr_cb(struct nlmsghdr *nh, void *arg)
{
	struct mlx5_nl_mac_addr *data = arg;
	struct ndmsg *r = NLMSG_DATA(nh);
	struct rtattr *attribute;
	int len;

	len = nh->nlmsg_len - NLMSG_LENGTH(sizeof(*r));
	for (attribute = MLX5_NDA_RTA(r);
	     RTA_OK(attribute, len);
	     attribute = RTA_NEXT(attribute, len)) {
		if (attribute->rta_type == NDA_LLADDR) {
			if (data->mac_n == MLX5_MAX_MAC_ADDRESSES) {
				DRV_LOG(WARNING,
					"not enough room to finalize the"
					" request");
				rte_errno = ENOMEM;
				return -rte_errno;
			}
#ifndef NDEBUG
			char m[18];

			rte_ether_format_addr(m, 18, RTA_DATA(attribute));
			DRV_LOG(DEBUG, "bridge MAC address %s", m);
#endif
			memcpy(&(*data->mac)[data->mac_n++],
			       RTA_DATA(attribute), RTE_ETHER_ADDR_LEN);
		}
	}
	return 0;
}

/**
 * Get bridge MAC addresses.
 *
 * @param dev
 *   Pointer to Ethernet device.
 * @param mac[out]
 *   Pointer to the array table of MAC addresses to fill.
 *   Its size should be of MLX5_MAX_MAC_ADDRESSES.
 * @param mac_n[out]
 *   Number of entries filled in MAC array.
 *
 * @return
 *   0 on success, a negative errno value otherwise and rte_errno is set.
 */
static int
mlx5_nl_mac_addr_list(struct rte_eth_dev *dev, struct rte_ether_addr (*mac)[],
		      int *mac_n)
{
	struct mlx5_priv *priv = dev->data->dev_private;
	unsigned int iface_idx = mlx5_ifindex(dev);
	struct {
		struct nlmsghdr	hdr;
		struct ifinfomsg ifm;
	} req = {
		.hdr = {
			.nlmsg_len = NLMSG_LENGTH(sizeof(struct ifinfomsg)),
			.nlmsg_type = RTM_GETNEIGH,
			.nlmsg_flags = NLM_F_DUMP | NLM_F_REQUEST,
		},
		.ifm = {
			.ifi_family = PF_BRIDGE,
			.ifi_index = iface_idx,
		},
	};
	struct mlx5_nl_mac_addr data = {
		.mac = mac,
		.mac_n = 0,
	};
	int fd;
	int ret;
	uint32_t sn = priv->nl_sn++;

	if (priv->nl_socket_route == -1)
		return 0;
	fd = priv->nl_socket_route;
	ret = mlx5_nl_request(fd, &req.hdr, sn, &req.ifm,
			      sizeof(struct ifinfomsg));
	if (ret < 0)
		goto error;
	ret = mlx5_nl_recv(fd, sn, mlx5_nl_mac_addr_cb, &data);
	if (ret < 0)
		goto error;
	*mac_n = data.mac_n;
	return 0;
error:
	DRV_LOG(DEBUG, "port %u cannot retrieve MAC address list %s",
		dev->data->port_id, strerror(rte_errno));
	return -rte_errno;
}

/**
 * Modify the MAC address neighbour table with Netlink.
 *
 * @param dev
 *   Pointer to Ethernet device.
 * @param mac
 *   MAC address to consider.
 * @param add
 *   1 to add the MAC address, 0 to remove the MAC address.
 *
 * @return
 *   0 on success, a negative errno value otherwise and rte_errno is set.
 */
static int
mlx5_nl_mac_addr_modify(struct rte_eth_dev *dev, struct rte_ether_addr *mac,
			int add)
{
	struct mlx5_priv *priv = dev->data->dev_private;
	unsigned int iface_idx = mlx5_ifindex(dev);
	struct {
		struct nlmsghdr hdr;
		struct ndmsg ndm;
		struct rtattr rta;
		uint8_t buffer[RTE_ETHER_ADDR_LEN];
	} req = {
		.hdr = {
			.nlmsg_len = NLMSG_LENGTH(sizeof(struct ndmsg)),
			.nlmsg_flags = NLM_F_REQUEST | NLM_F_CREATE |
				NLM_F_EXCL | NLM_F_ACK,
			.nlmsg_type = add ? RTM_NEWNEIGH : RTM_DELNEIGH,
		},
		.ndm = {
			.ndm_family = PF_BRIDGE,
			.ndm_state = NUD_NOARP | NUD_PERMANENT,
			.ndm_ifindex = iface_idx,
			.ndm_flags = NTF_SELF,
		},
		.rta = {
			.rta_type = NDA_LLADDR,
			.rta_len = RTA_LENGTH(RTE_ETHER_ADDR_LEN),
		},
	};
	int fd;
	int ret;
	uint32_t sn = priv->nl_sn++;

	if (priv->nl_socket_route == -1)
		return 0;
	fd = priv->nl_socket_route;
	memcpy(RTA_DATA(&req.rta), mac, RTE_ETHER_ADDR_LEN);
	req.hdr.nlmsg_len = NLMSG_ALIGN(req.hdr.nlmsg_len) +
		RTA_ALIGN(req.rta.rta_len);
	ret = mlx5_nl_send(fd, &req.hdr, sn);
	if (ret < 0)
		goto error;
	ret = mlx5_nl_recv(fd, sn, NULL, NULL);
	if (ret < 0)
		goto error;
	return 0;
error:
	DRV_LOG(DEBUG,
		"port %u cannot %s MAC address %02X:%02X:%02X:%02X:%02X:%02X"
		" %s",
		dev->data->port_id,
		add ? "add" : "remove",
		mac->addr_bytes[0], mac->addr_bytes[1],
		mac->addr_bytes[2], mac->addr_bytes[3],
		mac->addr_bytes[4], mac->addr_bytes[5],
		strerror(rte_errno));
	return -rte_errno;
}

/**
 * Add a MAC address.
 *
 * @param dev
 *   Pointer to Ethernet device.
 * @param mac
 *   MAC address to register.
 * @param index
 *   MAC address index.
 *
 * @return
 *   0 on success, a negative errno value otherwise and rte_errno is set.
 */
int
mlx5_nl_mac_addr_add(struct rte_eth_dev *dev, struct rte_ether_addr *mac,
		     uint32_t index)
{
	struct mlx5_priv *priv = dev->data->dev_private;
	int ret;

	ret = mlx5_nl_mac_addr_modify(dev, mac, 1);
	if (!ret)
		BITFIELD_SET(priv->mac_own, index);
	if (ret == -EEXIST)
		return 0;
	return ret;
}

/**
 * Remove a MAC address.
 *
 * @param dev
 *   Pointer to Ethernet device.
 * @param mac
 *   MAC address to remove.
 * @param index
 *   MAC address index.
 *
 * @return
 *   0 on success, a negative errno value otherwise and rte_errno is set.
 */
int
mlx5_nl_mac_addr_remove(struct rte_eth_dev *dev, struct rte_ether_addr *mac,
			uint32_t index)
{
	struct mlx5_priv *priv = dev->data->dev_private;

	BITFIELD_RESET(priv->mac_own, index);
	return mlx5_nl_mac_addr_modify(dev, mac, 0);
}

/**
 * Synchronize Netlink bridge table to the internal table.
 *
 * @param dev
 *   Pointer to Ethernet device.
 */
void
mlx5_nl_mac_addr_sync(struct rte_eth_dev *dev)
{
	struct rte_ether_addr macs[MLX5_MAX_MAC_ADDRESSES];
	int macs_n = 0;
	int i;
	int ret;

	ret = mlx5_nl_mac_addr_list(dev, &macs, &macs_n);
	if (ret)
		return;
	for (i = 0; i != macs_n; ++i) {
		int j;

		/* Verify the address is not in the array yet. */
		for (j = 0; j != MLX5_MAX_MAC_ADDRESSES; ++j)
			if (rte_is_same_ether_addr(&macs[i],
					       &dev->data->mac_addrs[j]))
				break;
		if (j != MLX5_MAX_MAC_ADDRESSES)
			continue;
		/* Find the first entry available. */
		for (j = 0; j != MLX5_MAX_MAC_ADDRESSES; ++j) {
			if (rte_is_zero_ether_addr(&dev->data->mac_addrs[j])) {
				dev->data->mac_addrs[j] = macs[i];
				break;
			}
		}
	}
}

/**
 * Flush all added MAC addresses.
 *
 * @param dev
 *   Pointer to Ethernet device.
 */
void
mlx5_nl_mac_addr_flush(struct rte_eth_dev *dev)
{
	struct mlx5_priv *priv = dev->data->dev_private;
	int i;

	for (i = MLX5_MAX_MAC_ADDRESSES - 1; i >= 0; --i) {
		struct rte_ether_addr *m = &dev->data->mac_addrs[i];

		if (BITFIELD_ISSET(priv->mac_own, i))
			mlx5_nl_mac_addr_remove(dev, m, i);
	}
}

/**
 * Enable promiscuous / all multicast mode through Netlink.
 *
 * @param dev
 *   Pointer to Ethernet device structure.
 * @param flags
 *   IFF_PROMISC for promiscuous, IFF_ALLMULTI for allmulti.
 * @param enable
 *   Nonzero to enable, disable otherwise.
 *
 * @return
 *   0 on success, a negative errno value otherwise and rte_errno is set.
 */
static int
mlx5_nl_device_flags(struct rte_eth_dev *dev, uint32_t flags, int enable)
{
	struct mlx5_priv *priv = dev->data->dev_private;
	unsigned int iface_idx = mlx5_ifindex(dev);
	struct {
		struct nlmsghdr hdr;
		struct ifinfomsg ifi;
	} req = {
		.hdr = {
			.nlmsg_len = NLMSG_LENGTH(sizeof(struct ifinfomsg)),
			.nlmsg_type = RTM_NEWLINK,
			.nlmsg_flags = NLM_F_REQUEST,
		},
		.ifi = {
			.ifi_flags = enable ? flags : 0,
			.ifi_change = flags,
			.ifi_index = iface_idx,
		},
	};
	int fd;
	int ret;

	assert(!(flags & ~(IFF_PROMISC | IFF_ALLMULTI)));
	if (priv->nl_socket_route < 0)
		return 0;
	fd = priv->nl_socket_route;
	ret = mlx5_nl_send(fd, &req.hdr, priv->nl_sn++);
	if (ret < 0)
		return ret;
	return 0;
}

/**
 * Enable promiscuous mode through Netlink.
 *
 * @param dev
 *   Pointer to Ethernet device structure.
 * @param enable
 *   Nonzero to enable, disable otherwise.
 *
 * @return
 *   0 on success, a negative errno value otherwise and rte_errno is set.
 */
int
mlx5_nl_promisc(struct rte_eth_dev *dev, int enable)
{
	int ret = mlx5_nl_device_flags(dev, IFF_PROMISC, enable);

	if (ret)
		DRV_LOG(DEBUG,
			"port %u cannot %s promisc mode: Netlink error %s",
			dev->data->port_id, enable ? "enable" : "disable",
			strerror(rte_errno));
	return ret;
}

/**
 * Enable all multicast mode through Netlink.
 *
 * @param dev
 *   Pointer to Ethernet device structure.
 * @param enable
 *   Nonzero to enable, disable otherwise.
 *
 * @return
 *   0 on success, a negative errno value otherwise and rte_errno is set.
 */
int
mlx5_nl_allmulti(struct rte_eth_dev *dev, int enable)
{
	int ret = mlx5_nl_device_flags(dev, IFF_ALLMULTI, enable);

	if (ret)
		DRV_LOG(DEBUG,
			"port %u cannot %s allmulti mode: Netlink error %s",
			dev->data->port_id, enable ? "enable" : "disable",
			strerror(rte_errno));
	return ret;
}

/**
 * Process network interface information from Netlink message.
 *
 * @param nh
 *   Pointer to Netlink message header.
 * @param arg
 *   Opaque data pointer for this callback.
 *
 * @return
 *   0 on success, a negative errno value otherwise and rte_errno is set.
 */
static int
mlx5_nl_cmdget_cb(struct nlmsghdr *nh, void *arg)
{
	struct mlx5_nl_ifindex_data *data = arg;
	struct mlx5_nl_ifindex_data local = {
		.flags = 0,
	};
	size_t off = NLMSG_HDRLEN;

	if (nh->nlmsg_type !=
	    RDMA_NL_GET_TYPE(RDMA_NL_NLDEV, RDMA_NLDEV_CMD_GET) &&
	    nh->nlmsg_type !=
	    RDMA_NL_GET_TYPE(RDMA_NL_NLDEV, RDMA_NLDEV_CMD_PORT_GET))
		goto error;
	while (off < nh->nlmsg_len) {
		struct nlattr *na = (void *)((uintptr_t)nh + off);
		void *payload = (void *)((uintptr_t)na + NLA_HDRLEN);

		if (na->nla_len > nh->nlmsg_len - off)
			goto error;
		switch (na->nla_type) {
		case RDMA_NLDEV_ATTR_DEV_INDEX:
			local.ibindex = *(uint32_t *)payload;
			local.flags |= MLX5_NL_CMD_GET_IB_INDEX;
			break;
		case RDMA_NLDEV_ATTR_DEV_NAME:
			if (!strcmp(payload, data->name))
				local.flags |= MLX5_NL_CMD_GET_IB_NAME;
			break;
		case RDMA_NLDEV_ATTR_NDEV_INDEX:
			local.ifindex = *(uint32_t *)payload;
			local.flags |= MLX5_NL_CMD_GET_NET_INDEX;
			break;
		case RDMA_NLDEV_ATTR_PORT_INDEX:
			local.portnum = *(uint32_t *)payload;
			local.flags |= MLX5_NL_CMD_GET_PORT_INDEX;
			break;
		default:
			break;
		}
		off += NLA_ALIGN(na->nla_len);
	}
	/*
	 * It is possible to have multiple messages for all
	 * Infiniband devices in the system with appropriate name.
	 * So we should gather parameters locally and copy to
	 * query context only in case of coinciding device name.
	 */
	if (local.flags & MLX5_NL_CMD_GET_IB_NAME) {
		data->flags = local.flags;
		data->ibindex = local.ibindex;
		data->ifindex = local.ifindex;
		data->portnum = local.portnum;
	}
	return 0;
error:
	rte_errno = EINVAL;
	return -rte_errno;
}

/**
 * Get index of network interface associated with some IB device.
 *
 * This is the only somewhat safe method to avoid resorting to heuristics
 * when faced with port representors. Unfortunately it requires at least
 * Linux 4.17.
 *
 * @param nl
 *   Netlink socket of the RDMA kind (NETLINK_RDMA).
 * @param[in] name
 *   IB device name.
 * @param[in] pindex
 *   IB device port index, starting from 1
 * @return
 *   A valid (nonzero) interface index on success, 0 otherwise and rte_errno
 *   is set.
 */
unsigned int
mlx5_nl_ifindex(int nl, const char *name, uint32_t pindex)
{
	uint32_t seq = random();
	struct mlx5_nl_ifindex_data data = {
		.name = name,
		.flags = 0,
		.ibindex = 0, /* Determined during first pass. */
		.ifindex = 0, /* Determined during second pass. */
	};
	union {
		struct nlmsghdr nh;
		uint8_t buf[NLMSG_HDRLEN +
			    NLA_HDRLEN + NLA_ALIGN(sizeof(data.ibindex)) +
			    NLA_HDRLEN + NLA_ALIGN(sizeof(pindex))];
	} req = {
		.nh = {
			.nlmsg_len = NLMSG_LENGTH(0),
			.nlmsg_type = RDMA_NL_GET_TYPE(RDMA_NL_NLDEV,
						       RDMA_NLDEV_CMD_GET),
			.nlmsg_flags = NLM_F_REQUEST | NLM_F_ACK | NLM_F_DUMP,
		},
	};
	struct nlattr *na;
	int ret;

	ret = mlx5_nl_send(nl, &req.nh, seq);
	if (ret < 0)
		return 0;
	ret = mlx5_nl_recv(nl, seq, mlx5_nl_cmdget_cb, &data);
	if (ret < 0)
		return 0;
	if (!(data.flags & MLX5_NL_CMD_GET_IB_NAME) ||
	    !(data.flags & MLX5_NL_CMD_GET_IB_INDEX))
		goto error;
	data.flags = 0;
	++seq;
	req.nh.nlmsg_type = RDMA_NL_GET_TYPE(RDMA_NL_NLDEV,
					     RDMA_NLDEV_CMD_PORT_GET);
	req.nh.nlmsg_flags = NLM_F_REQUEST | NLM_F_ACK;
	req.nh.nlmsg_len = NLMSG_LENGTH(sizeof(req.buf) - NLMSG_HDRLEN);
	na = (void *)((uintptr_t)req.buf + NLMSG_HDRLEN);
	na->nla_len = NLA_HDRLEN + sizeof(data.ibindex);
	na->nla_type = RDMA_NLDEV_ATTR_DEV_INDEX;
	memcpy((void *)((uintptr_t)na + NLA_HDRLEN),
	       &data.ibindex, sizeof(data.ibindex));
	na = (void *)((uintptr_t)na + NLA_ALIGN(na->nla_len));
	na->nla_len = NLA_HDRLEN + sizeof(pindex);
	na->nla_type = RDMA_NLDEV_ATTR_PORT_INDEX;
	memcpy((void *)((uintptr_t)na + NLA_HDRLEN),
	       &pindex, sizeof(pindex));
	ret = mlx5_nl_send(nl, &req.nh, seq);
	if (ret < 0)
		return 0;
	ret = mlx5_nl_recv(nl, seq, mlx5_nl_cmdget_cb, &data);
	if (ret < 0)
		return 0;
	if (!(data.flags & MLX5_NL_CMD_GET_IB_NAME) ||
	    !(data.flags & MLX5_NL_CMD_GET_IB_INDEX) ||
	    !(data.flags & MLX5_NL_CMD_GET_NET_INDEX) ||
	    !data.ifindex)
		goto error;
	return data.ifindex;
error:
	rte_errno = ENODEV;
	return 0;
}

/**
 * Get the number of physical ports of given IB device.
 *
 * @param nl
 *   Netlink socket of the RDMA kind (NETLINK_RDMA).
 * @param[in] name
 *   IB device name.
 *
 * @return
 *   A valid (nonzero) number of ports on success, 0 otherwise
 *   and rte_errno is set.
 */
unsigned int
mlx5_nl_portnum(int nl, const char *name)
{
	uint32_t seq = random();
	struct mlx5_nl_ifindex_data data = {
		.flags = 0,
		.name = name,
		.ifindex = 0,
		.portnum = 0,
	};
	struct nlmsghdr req = {
		.nlmsg_len = NLMSG_LENGTH(0),
		.nlmsg_type = RDMA_NL_GET_TYPE(RDMA_NL_NLDEV,
					       RDMA_NLDEV_CMD_GET),
		.nlmsg_flags = NLM_F_REQUEST | NLM_F_ACK | NLM_F_DUMP,
	};
	int ret;

	ret = mlx5_nl_send(nl, &req, seq);
	if (ret < 0)
		return 0;
	ret = mlx5_nl_recv(nl, seq, mlx5_nl_cmdget_cb, &data);
	if (ret < 0)
		return 0;
	if (!(data.flags & MLX5_NL_CMD_GET_IB_NAME) ||
	    !(data.flags & MLX5_NL_CMD_GET_IB_INDEX) ||
	    !(data.flags & MLX5_NL_CMD_GET_PORT_INDEX)) {
		rte_errno = ENODEV;
		return 0;
	}
	if (!data.portnum)
		rte_errno = EINVAL;
	return data.portnum;
}

/**
 * Process switch information from Netlink message.
 *
 * @param nh
 *   Pointer to Netlink message header.
 * @param arg
 *   Opaque data pointer for this callback.
 *
 * @return
 *   0 on success, a negative errno value otherwise and rte_errno is set.
 */
static int
mlx5_nl_switch_info_cb(struct nlmsghdr *nh, void *arg)
{
	struct mlx5_switch_info info = {
		.master = 0,
		.representor = 0,
		.name_type = MLX5_PHYS_PORT_NAME_TYPE_NOTSET,
		.port_name = 0,
		.switch_id = 0,
	};
	size_t off = NLMSG_LENGTH(sizeof(struct ifinfomsg));
	bool switch_id_set = false;
	bool num_vf_set = false;

	if (nh->nlmsg_type != RTM_NEWLINK)
		goto error;
	while (off < nh->nlmsg_len) {
		struct rtattr *ra = (void *)((uintptr_t)nh + off);
		void *payload = RTA_DATA(ra);
		unsigned int i;

		if (ra->rta_len > nh->nlmsg_len - off)
			goto error;
		switch (ra->rta_type) {
		case IFLA_NUM_VF:
			num_vf_set = true;
			break;
		case IFLA_PHYS_PORT_NAME:
			mlx5_translate_port_name((char *)payload, &info);
			break;
		case IFLA_PHYS_SWITCH_ID:
			info.switch_id = 0;
			for (i = 0; i < RTA_PAYLOAD(ra); ++i) {
				info.switch_id <<= 8;
				info.switch_id |= ((uint8_t *)payload)[i];
			}
			switch_id_set = true;
			break;
		}
		off += RTA_ALIGN(ra->rta_len);
	}
	if (switch_id_set) {
		/* We have some E-Switch configuration. */
		mlx5_nl_check_switch_info(num_vf_set, &info);
	}
	assert(!(info.master && info.representor));
	memcpy(arg, &info, sizeof(info));
	return 0;
error:
	rte_errno = EINVAL;
	return -rte_errno;
}

/**
 * Get switch information associated with network interface.
 *
 * @param nl
 *   Netlink socket of the ROUTE kind (NETLINK_ROUTE).
 * @param ifindex
 *   Network interface index.
 * @param[out] info
 *   Switch information object, populated in case of success.
 *
 * @return
 *   0 on success, a negative errno value otherwise and rte_errno is set.
 */
int
mlx5_nl_switch_info(int nl, unsigned int ifindex, struct mlx5_switch_info *info)
{
	uint32_t seq = random();
	struct {
		struct nlmsghdr nh;
		struct ifinfomsg info;
		struct rtattr rta;
		uint32_t extmask;
	} req = {
		.nh = {
			.nlmsg_len = NLMSG_LENGTH
					(sizeof(req.info) +
					 RTA_LENGTH(sizeof(uint32_t))),
			.nlmsg_type = RTM_GETLINK,
			.nlmsg_flags = NLM_F_REQUEST | NLM_F_ACK,
		},
		.info = {
			.ifi_family = AF_UNSPEC,
			.ifi_index = ifindex,
		},
		.rta = {
			.rta_type = IFLA_EXT_MASK,
			.rta_len = RTA_LENGTH(sizeof(int32_t)),
		},
		.extmask = RTE_LE32(1),
	};
	int ret;

	ret = mlx5_nl_send(nl, &req.nh, seq);
	if (ret >= 0)
		ret = mlx5_nl_recv(nl, seq, mlx5_nl_switch_info_cb, info);
	if (info->master && info->representor) {
		DRV_LOG(ERR, "ifindex %u device is recognized as master"
			     " and as representor", ifindex);
		rte_errno = ENODEV;
		ret = -rte_errno;
	}
	return ret;
}

/*
 * Delete VLAN network device by ifindex.
 *
 * @param[in] tcf
 *   Context object initialized by mlx5_vlan_vmwa_init().
 * @param[in] ifindex
 *   Interface index of network device to delete.
 */
static void
mlx5_vlan_vmwa_delete(struct mlx5_vlan_vmwa_context *vmwa,
		      uint32_t ifindex)
{
	int ret;
	struct {
		struct nlmsghdr nh;
		struct ifinfomsg info;
	} req = {
		.nh = {
			.nlmsg_len = NLMSG_LENGTH(sizeof(struct ifinfomsg)),
			.nlmsg_type = RTM_DELLINK,
			.nlmsg_flags = NLM_F_REQUEST | NLM_F_ACK,
		},
		.info = {
			.ifi_family = AF_UNSPEC,
			.ifi_index = ifindex,
		},
	};

	if (ifindex) {
		++vmwa->nl_sn;
		if (!vmwa->nl_sn)
			++vmwa->nl_sn;
		ret = mlx5_nl_send(vmwa->nl_socket, &req.nh, vmwa->nl_sn);
		if (ret >= 0)
			ret = mlx5_nl_recv(vmwa->nl_socket,
					   vmwa->nl_sn,
					   NULL, NULL);
		if (ret < 0)
			DRV_LOG(WARNING, "netlink: error deleting"
					 " VLAN WA ifindex %u, %d",
					 ifindex, ret);
	}
}

/* Set of subroutines to build Netlink message. */
static struct nlattr *
nl_msg_tail(struct nlmsghdr *nlh)
{
	return (struct nlattr *)
		(((uint8_t *)nlh) + NLMSG_ALIGN(nlh->nlmsg_len));
}

static void
nl_attr_put(struct nlmsghdr *nlh, int type, const void *data, int alen)
{
	struct nlattr *nla = nl_msg_tail(nlh);

	nla->nla_type = type;
	nla->nla_len = NLMSG_ALIGN(sizeof(struct nlattr) + alen);
	nlh->nlmsg_len = NLMSG_ALIGN(nlh->nlmsg_len) + nla->nla_len;

	if (alen)
		memcpy((uint8_t *)nla + sizeof(struct nlattr), data, alen);
}

static struct nlattr *
nl_attr_nest_start(struct nlmsghdr *nlh, int type)
{
	struct nlattr *nest = (struct nlattr *)nl_msg_tail(nlh);

	nl_attr_put(nlh, type, NULL, 0);
	return nest;
}

static void
nl_attr_nest_end(struct nlmsghdr *nlh, struct nlattr *nest)
{
	nest->nla_len = (uint8_t *)nl_msg_tail(nlh) - (uint8_t *)nest;
}

/*
 * Create network VLAN device with specified VLAN tag.
 *
 * @param[in] tcf
 *   Context object initialized by mlx5_vlan_vmwa_init().
 * @param[in] ifindex
 *   Base network interface index.
 * @param[in] tag
 *   VLAN tag for VLAN network device to create.
 */
static uint32_t
mlx5_vlan_vmwa_create(struct mlx5_vlan_vmwa_context *vmwa,
		      uint32_t ifindex,
		      uint16_t tag)
{
	struct nlmsghdr *nlh;
	struct ifinfomsg *ifm;
	char name[sizeof(MLX5_VMWA_VLAN_DEVICE_PFX) + 32];

	alignas(RTE_CACHE_LINE_SIZE)
	uint8_t buf[NLMSG_ALIGN(sizeof(struct nlmsghdr)) +
		    NLMSG_ALIGN(sizeof(struct ifinfomsg)) +
		    NLMSG_ALIGN(sizeof(struct nlattr)) * 8 +
		    NLMSG_ALIGN(sizeof(uint32_t)) +
		    NLMSG_ALIGN(sizeof(name)) +
		    NLMSG_ALIGN(sizeof("vlan")) +
		    NLMSG_ALIGN(sizeof(uint32_t)) +
		    NLMSG_ALIGN(sizeof(uint16_t)) + 16];
	struct nlattr *na_info;
	struct nlattr *na_vlan;
	int ret;

	memset(buf, 0, sizeof(buf));
	++vmwa->nl_sn;
	if (!vmwa->nl_sn)
		++vmwa->nl_sn;
	nlh = (struct nlmsghdr *)buf;
	nlh->nlmsg_len = sizeof(struct nlmsghdr);
	nlh->nlmsg_type = RTM_NEWLINK;
	nlh->nlmsg_flags = NLM_F_REQUEST | NLM_F_CREATE |
			   NLM_F_EXCL | NLM_F_ACK;
	ifm = (struct ifinfomsg *)nl_msg_tail(nlh);
	nlh->nlmsg_len += sizeof(struct ifinfomsg);
	ifm->ifi_family = AF_UNSPEC;
	ifm->ifi_type = 0;
	ifm->ifi_index = 0;
	ifm->ifi_flags = IFF_UP;
	ifm->ifi_change = 0xffffffff;
	nl_attr_put(nlh, IFLA_LINK, &ifindex, sizeof(ifindex));
	ret = snprintf(name, sizeof(name), "%s.%u.%u",
		       MLX5_VMWA_VLAN_DEVICE_PFX, ifindex, tag);
	nl_attr_put(nlh, IFLA_IFNAME, name, ret + 1);
	na_info = nl_attr_nest_start(nlh, IFLA_LINKINFO);
	nl_attr_put(nlh, IFLA_INFO_KIND, "vlan", sizeof("vlan"));
	na_vlan = nl_attr_nest_start(nlh, IFLA_INFO_DATA);
	nl_attr_put(nlh, IFLA_VLAN_ID, &tag, sizeof(tag));
	nl_attr_nest_end(nlh, na_vlan);
	nl_attr_nest_end(nlh, na_info);
	assert(sizeof(buf) >= nlh->nlmsg_len);
	ret = mlx5_nl_send(vmwa->nl_socket, nlh, vmwa->nl_sn);
	if (ret >= 0)
		ret = mlx5_nl_recv(vmwa->nl_socket, vmwa->nl_sn, NULL, NULL);
	if (ret < 0) {
		DRV_LOG(WARNING,
			"netlink: VLAN %s create failure (%d)",
			name, ret);
	}
	// Try to get ifindex of created or pre-existing device.
	ret = if_nametoindex(name);
	if (!ret) {
		DRV_LOG(WARNING,
			"VLAN %s failed to get index (%d)",
			name, errno);
		return 0;
	}
	return ret;
}

/*
 * Release VLAN network device, created for VM workaround.
 *
 * @param[in] dev
 *   Ethernet device object, Netlink context provider.
 * @param[in] vlan
 *   Object representing the network device to release.
 */
void mlx5_vlan_vmwa_release(struct rte_eth_dev *dev,
			    struct mlx5_vf_vlan *vlan)
{
	struct mlx5_priv *priv = dev->data->dev_private;
	struct mlx5_vlan_vmwa_context *vmwa = priv->vmwa_context;
	struct mlx5_vlan_dev *vlan_dev = &vmwa->vlan_dev[0];

	assert(vlan->created);
	assert(priv->vmwa_context);
	if (!vlan->created || !vmwa)
		return;
	vlan->created = 0;
	assert(vlan_dev[vlan->tag].refcnt);
	if (--vlan_dev[vlan->tag].refcnt == 0 &&
	    vlan_dev[vlan->tag].ifindex) {
		mlx5_vlan_vmwa_delete(vmwa, vlan_dev[vlan->tag].ifindex);
		vlan_dev[vlan->tag].ifindex = 0;
	}
}

/**
 * Acquire VLAN interface with specified tag for VM workaround.
 *
 * @param[in] dev
 *   Ethernet device object, Netlink context provider.
 * @param[in] vlan
 *   Object representing the network device to acquire.
 */
void mlx5_vlan_vmwa_acquire(struct rte_eth_dev *dev,
			    struct mlx5_vf_vlan *vlan)
{
	struct mlx5_priv *priv = dev->data->dev_private;
	struct mlx5_vlan_vmwa_context *vmwa = priv->vmwa_context;
	struct mlx5_vlan_dev *vlan_dev = &vmwa->vlan_dev[0];

	assert(!vlan->created);
	assert(priv->vmwa_context);
	if (vlan->created || !vmwa)
		return;
	if (vlan_dev[vlan->tag].refcnt == 0) {
		assert(!vlan_dev[vlan->tag].ifindex);
		vlan_dev[vlan->tag].ifindex =
			mlx5_vlan_vmwa_create(vmwa,
					      vmwa->vf_ifindex,
					      vlan->tag);
	}
	if (vlan_dev[vlan->tag].ifindex) {
		vlan_dev[vlan->tag].refcnt++;
		vlan->created = 1;
	}
}

/*
 * Create per ethernet device VLAN VM workaround context
 */
struct mlx5_vlan_vmwa_context *
mlx5_vlan_vmwa_init(struct rte_eth_dev *dev,
		    uint32_t ifindex)
{
	struct mlx5_priv *priv = dev->data->dev_private;
	struct mlx5_dev_config *config = &priv->config;
	struct mlx5_vlan_vmwa_context *vmwa;
	enum rte_hypervisor hv_type;

	/* Do not engage workaround over PF. */
	if (!config->vf)
		return NULL;
	/* Check whether there is desired virtual environment */
	hv_type = rte_hypervisor_get();
	switch (hv_type) {
	case RTE_HYPERVISOR_UNKNOWN:
	case RTE_HYPERVISOR_VMWARE:
		/*
		 * The "white list" of configurations
		 * to engage the workaround.
		 */
		break;
	default:
		/*
		 * The configuration is not found in the "white list".
		 * We should not engage the VLAN workaround.
		 */
		return NULL;
	}
	vmwa = rte_zmalloc(__func__, sizeof(*vmwa), sizeof(uint32_t));
	if (!vmwa) {
		DRV_LOG(WARNING,
			"Can not allocate memory"
			" for VLAN workaround context");
		return NULL;
	}
	vmwa->nl_socket = mlx5_nl_init(NETLINK_ROUTE);
	if (vmwa->nl_socket < 0) {
		DRV_LOG(WARNING,
			"Can not create Netlink socket"
			" for VLAN workaround context");
		rte_free(vmwa);
		return NULL;
	}
	vmwa->nl_sn = random();
	vmwa->vf_ifindex = ifindex;
	vmwa->dev = dev;
	/* Cleanup for existing VLAN devices. */
	return vmwa;
}

/*
 * Destroy per ethernet device VLAN VM workaround context
 */
void mlx5_vlan_vmwa_exit(struct mlx5_vlan_vmwa_context *vmwa)
{
	unsigned int i;

	/* Delete all remaining VLAN devices. */
	for (i = 0; i < RTE_DIM(vmwa->vlan_dev); i++) {
		if (vmwa->vlan_dev[i].ifindex)
			mlx5_vlan_vmwa_delete(vmwa, vmwa->vlan_dev[i].ifindex);
	}
	if (vmwa->nl_socket >= 0)
		close(vmwa->nl_socket);
	rte_free(vmwa);
}