vhost: broadcast RARP by injecting in receiving mbuf array
Broadcast the RARP packet by injecting it into the receiving mbuf array in
rte_vhost_dequeue_burst().
Commit 33226236a3
("vhost: handle request to send RARP") iterates over
all host interfaces and then broadcasts the RARP packet through each of them. It did
notify the switches about the new location of the migrated VM; however,
the MAC learning table in the target host ends up wrong (at least in my
test with OVS):
$ ovs-appctl fdb/show ovsbr0
port VLAN MAC Age
1 0 b6:3c:72:71:cd:4d 10
LOCAL 0 b6:3c:72:71:cd:4e 10
LOCAL 0 52:54:00:12:34:68 9
1 0 56:f6:64:2c:bc:c0 1
Where 52:54:00:12:34:68 is the MAC of the VM. As you can see from the
above, the port learned is "LOCAL", which is the "ovsbr0" port. That
is reasonable, since we did indeed send the packet through the "ovsbr0" interface.
The wrong MAC table causes all packets destined for the VM to go to "ovsbr0"
in the end, so all of them are lost until the guest
sends an ARP request (or reply) that refreshes the MAC learning table.
Jianfeng then came up with a solution that I had thought of first but had NAKed
myself, out of concern that it had potential issues [0]. The solution is as the title
states: broadcast the RARP packet by injecting it into the receiving mbuf
array in rte_vhost_dequeue_burst(). Having the idea brought up again made me
think twice about it; the concern now looks like a false one to me. I have also done
a rough verification: it worked as expected.
[0]: http://dpdk.org/ml/archives/dev/2016-February/033527.html
Another note is that while preparing this version, I found that DPDK already has
some ARP-related structures and macros defined. So, use them here instead of
the ones from the standard header files.
Cc: Thibaut Collet <thibaut.collet@6wind.com>
Suggested-by: Jianfeng Tan <jianfeng.tan@intel.com>
Signed-off-by: Yuanhan Liu <yuanhan.liu@linux.intel.com>
This commit is contained in:
parent
e7d088780e
commit
bb66588304
@ -49,6 +49,7 @@
|
||||
|
||||
#include <rte_memory.h>
|
||||
#include <rte_mempool.h>
|
||||
#include <rte_ether.h>
|
||||
|
||||
struct rte_mbuf;
|
||||
|
||||
@ -133,7 +134,9 @@ struct virtio_net {
|
||||
void *priv; /**< private context */
|
||||
uint64_t log_size; /**< Size of log area */
|
||||
uint64_t log_base; /**< Where dirty pages are logged */
|
||||
uint64_t reserved[62]; /**< Reserve some spaces for future extension. */
|
||||
struct ether_addr mac; /**< MAC address */
|
||||
rte_atomic16_t broadcast_rarp; /**< A flag to tell if we need broadcast rarp packet */
|
||||
uint64_t reserved[61]; /**< Reserve some spaces for future extension. */
|
||||
struct vhost_virtqueue *virtqueue[VHOST_MAX_QUEUE_PAIRS * 2]; /**< Contains all virtqueue information. */
|
||||
} __rte_cache_aligned;
|
||||
|
||||
|
@ -43,6 +43,7 @@
|
||||
#include <rte_tcp.h>
|
||||
#include <rte_udp.h>
|
||||
#include <rte_sctp.h>
|
||||
#include <rte_arp.h>
|
||||
|
||||
#include "vhost-net.h"
|
||||
|
||||
@ -761,11 +762,50 @@ vhost_dequeue_offload(struct virtio_net_hdr *hdr, struct rte_mbuf *m)
|
||||
}
|
||||
}
|
||||
|
||||
#define RARP_PKT_SIZE 64
|
||||
|
||||
static int
|
||||
make_rarp_packet(struct rte_mbuf *rarp_mbuf, const struct ether_addr *mac)
|
||||
{
|
||||
struct ether_hdr *eth_hdr;
|
||||
struct arp_hdr *rarp;
|
||||
|
||||
if (rarp_mbuf->buf_len < 64) {
|
||||
RTE_LOG(WARNING, VHOST_DATA,
|
||||
"failed to make RARP; mbuf size too small %u (< %d)\n",
|
||||
rarp_mbuf->buf_len, RARP_PKT_SIZE);
|
||||
return -1;
|
||||
}
|
||||
|
||||
/* Ethernet header. */
|
||||
eth_hdr = rte_pktmbuf_mtod_offset(rarp_mbuf, struct ether_hdr *, 0);
|
||||
memset(eth_hdr->d_addr.addr_bytes, 0xff, ETHER_ADDR_LEN);
|
||||
ether_addr_copy(mac, ð_hdr->s_addr);
|
||||
eth_hdr->ether_type = htons(ETHER_TYPE_RARP);
|
||||
|
||||
/* RARP header. */
|
||||
rarp = (struct arp_hdr *)(eth_hdr + 1);
|
||||
rarp->arp_hrd = htons(ARP_HRD_ETHER);
|
||||
rarp->arp_pro = htons(ETHER_TYPE_IPv4);
|
||||
rarp->arp_hln = ETHER_ADDR_LEN;
|
||||
rarp->arp_pln = 4;
|
||||
rarp->arp_op = htons(ARP_OP_REVREQUEST);
|
||||
|
||||
ether_addr_copy(mac, &rarp->arp_data.arp_sha);
|
||||
ether_addr_copy(mac, &rarp->arp_data.arp_tha);
|
||||
memset(&rarp->arp_data.arp_sip, 0x00, 4);
|
||||
memset(&rarp->arp_data.arp_tip, 0x00, 4);
|
||||
|
||||
rarp_mbuf->pkt_len = rarp_mbuf->data_len = RARP_PKT_SIZE;
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
uint16_t
|
||||
rte_vhost_dequeue_burst(struct virtio_net *dev, uint16_t queue_id,
|
||||
struct rte_mempool *mbuf_pool, struct rte_mbuf **pkts, uint16_t count)
|
||||
{
|
||||
struct rte_mbuf *m, *prev;
|
||||
struct rte_mbuf *m, *prev, *rarp_mbuf = NULL;
|
||||
struct vhost_virtqueue *vq;
|
||||
struct vring_desc *desc;
|
||||
uint64_t vb_addr = 0;
|
||||
@ -788,11 +828,34 @@ rte_vhost_dequeue_burst(struct virtio_net *dev, uint16_t queue_id,
|
||||
if (unlikely(vq->enabled == 0))
|
||||
return 0;
|
||||
|
||||
/*
|
||||
* Construct a RARP broadcast packet, and inject it to the "pkts"
|
||||
* array, so that it looks as if the guest actually sent such a packet.
|
||||
*
|
||||
* Check user_send_rarp() for more information.
|
||||
*/
|
||||
if (unlikely(rte_atomic16_cmpset((volatile uint16_t *)
|
||||
&dev->broadcast_rarp.cnt, 1, 0))) {
|
||||
rarp_mbuf = rte_pktmbuf_alloc(mbuf_pool);
|
||||
if (rarp_mbuf == NULL) {
|
||||
RTE_LOG(ERR, VHOST_DATA,
|
||||
"Failed to allocate memory for mbuf.\n");
|
||||
return 0;
|
||||
}
|
||||
|
||||
if (make_rarp_packet(rarp_mbuf, &dev->mac)) {
|
||||
rte_pktmbuf_free(rarp_mbuf);
|
||||
rarp_mbuf = NULL;
|
||||
} else {
|
||||
count -= 1;
|
||||
}
|
||||
}
|
||||
|
||||
avail_idx = *((volatile uint16_t *)&vq->avail->idx);
|
||||
|
||||
/* If there are no available buffers then return. */
|
||||
if (vq->last_used_idx == avail_idx)
|
||||
return 0;
|
||||
goto out;
|
||||
|
||||
LOG_DEBUG(VHOST_DATA, "%s (%"PRIu64")\n", __func__,
|
||||
dev->device_fh);
|
||||
@ -983,8 +1046,21 @@ rte_vhost_dequeue_burst(struct virtio_net *dev, uint16_t queue_id,
|
||||
vq->used->idx += entry_success;
|
||||
vhost_log_used_vring(dev, vq, offsetof(struct vring_used, idx),
|
||||
sizeof(vq->used->idx));
|
||||
|
||||
/* Kick guest if required. */
|
||||
if (!(vq->avail->flags & VRING_AVAIL_F_NO_INTERRUPT))
|
||||
eventfd_write(vq->callfd, (eventfd_t)1);
|
||||
|
||||
out:
|
||||
if (unlikely(rarp_mbuf != NULL)) {
|
||||
/*
|
||||
* Inject it to the head of "pkts" array, so that switch's mac
|
||||
* learning table will get updated first.
|
||||
*/
|
||||
memmove(&pkts[1], pkts, entry_success * sizeof(m));
|
||||
pkts[0] = rarp_mbuf;
|
||||
entry_success += 1;
|
||||
}
|
||||
|
||||
return entry_success;
|
||||
}
|
||||
|
@ -437,7 +437,7 @@ vserver_message_handler(int connfd, void *dat, int *remove)
|
||||
user_set_vring_enable(ctx, &msg.payload.state);
|
||||
break;
|
||||
case VHOST_USER_SEND_RARP:
|
||||
user_send_rarp(&msg);
|
||||
user_send_rarp(ctx, &msg);
|
||||
break;
|
||||
|
||||
default:
|
||||
|
@ -39,12 +39,6 @@
|
||||
#include <sys/mman.h>
|
||||
#include <sys/types.h>
|
||||
#include <sys/stat.h>
|
||||
#include <sys/ioctl.h>
|
||||
#include <sys/socket.h>
|
||||
#include <net/ethernet.h>
|
||||
#include <netinet/in.h>
|
||||
#include <netinet/if_ether.h>
|
||||
#include <linux/if_packet.h>
|
||||
|
||||
#include <rte_common.h>
|
||||
#include <rte_log.h>
|
||||
@ -415,120 +409,38 @@ user_set_log_base(struct vhost_device_ctx ctx,
|
||||
return 0;
|
||||
}
|
||||
|
||||
#define RARP_BUF_SIZE 64
|
||||
|
||||
/*
 * Build a broadcast RARP request for "mac" in the caller's buffer.
 *
 * Writes an Ethernet header (all-ones destination, "mac" as source,
 * ethertype RARP) followed by a struct ether_arp whose sender and target
 * hardware addresses are both "mac" and whose protocol addresses are zero.
 * Bytes of "buf" after those two structures are left untouched.
 *
 * buf: destination buffer; must be large enough to hold both structures.
 * mac: ETH_ALEN-byte MAC address to announce.
 */
static void
make_rarp_packet(uint8_t *buf, uint8_t *mac)
{
	struct ether_header *eth_hdr;
	struct ether_arp *rarp;

	/* Ethernet header. */
	eth_hdr = (struct ether_header *)buf;
	memset(&eth_hdr->ether_dhost, 0xff, ETH_ALEN);
	memcpy(&eth_hdr->ether_shost, mac, ETH_ALEN);
	eth_hdr->ether_type = htons(ETH_P_RARP);

	/* RARP header, placed directly after the Ethernet header. */
	rarp = (struct ether_arp *)(eth_hdr + 1);
	rarp->ea_hdr.ar_hrd = htons(ARPHRD_ETHER);
	rarp->ea_hdr.ar_pro = htons(ETHERTYPE_IP);
	rarp->ea_hdr.ar_hln = ETH_ALEN;
	rarp->ea_hdr.ar_pln = 4;
	rarp->ea_hdr.ar_op  = htons(ARPOP_RREQUEST);

	/* Sender and target hardware address are both our own MAC. */
	memcpy(&rarp->arp_sha, mac, ETH_ALEN);
	memset(&rarp->arp_spa, 0x00, 4);
	memcpy(&rarp->arp_tha, mac, ETH_ALEN);
	memset(&rarp->arp_tpa, 0x00, 4);
}
|
||||
|
||||
|
||||
static void
|
||||
send_rarp(const char *ifname, uint8_t *rarp)
|
||||
{
|
||||
int fd;
|
||||
struct ifreq ifr;
|
||||
struct sockaddr_ll addr;
|
||||
|
||||
fd = socket(AF_PACKET, SOCK_RAW, 0);
|
||||
if (fd < 0) {
|
||||
perror("socket failed");
|
||||
return;
|
||||
}
|
||||
|
||||
memset(&ifr, 0, sizeof(struct ifreq));
|
||||
strncpy(ifr.ifr_name, ifname, IFNAMSIZ);
|
||||
if (ioctl(fd, SIOCGIFINDEX, &ifr) < 0) {
|
||||
perror("failed to get interface index");
|
||||
close(fd);
|
||||
return;
|
||||
}
|
||||
|
||||
addr.sll_ifindex = ifr.ifr_ifindex;
|
||||
addr.sll_halen = ETH_ALEN;
|
||||
|
||||
if (sendto(fd, rarp, RARP_BUF_SIZE, 0,
|
||||
(const struct sockaddr*)&addr, sizeof(addr)) < 0) {
|
||||
perror("send rarp packet failed");
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
/*
|
||||
* Broadcast a RARP message to all interfaces, to update
|
||||
* switch's mac table
|
||||
* An rarp packet is constructed and broadcasted to notify switches about
|
||||
* the new location of the migrated VM, so that packets from outside will
|
||||
* not be lost after migration.
|
||||
*
|
||||
* However, we don't actually "send" a rarp packet here, instead, we set
|
||||
* a flag 'broadcast_rarp' to let rte_vhost_dequeue_burst() inject it.
|
||||
*/
|
||||
int
|
||||
user_send_rarp(struct VhostUserMsg *msg)
|
||||
user_send_rarp(struct vhost_device_ctx ctx, struct VhostUserMsg *msg)
|
||||
{
|
||||
struct virtio_net *dev;
|
||||
uint8_t *mac = (uint8_t *)&msg->payload.u64;
|
||||
uint8_t rarp[RARP_BUF_SIZE];
|
||||
struct ifconf ifc = {0, };
|
||||
struct ifreq *ifr;
|
||||
int nr = 16;
|
||||
int fd;
|
||||
uint32_t i;
|
||||
|
||||
dev = get_device(ctx);
|
||||
if (!dev)
|
||||
return -1;
|
||||
|
||||
RTE_LOG(DEBUG, VHOST_CONFIG,
|
||||
":: mac: %02x:%02x:%02x:%02x:%02x:%02x\n",
|
||||
mac[0], mac[1], mac[2], mac[3], mac[4], mac[5]);
|
||||
|
||||
make_rarp_packet(rarp, mac);
|
||||
memcpy(dev->mac.addr_bytes, mac, 6);
|
||||
|
||||
/*
|
||||
* Get all interfaces
|
||||
* Set the flag to inject a RARP broadcast packet at
|
||||
* rte_vhost_dequeue_burst().
|
||||
*
|
||||
* rte_smp_wmb() is for making sure the mac is copied
|
||||
* before the flag is set.
|
||||
*/
|
||||
fd = socket(AF_INET, SOCK_DGRAM, 0);
|
||||
if (fd < 0) {
|
||||
perror("failed to create AF_INET socket");
|
||||
return -1;
|
||||
}
|
||||
|
||||
again:
|
||||
ifc.ifc_len = sizeof(*ifr) * nr;
|
||||
ifc.ifc_buf = realloc(ifc.ifc_buf, ifc.ifc_len);
|
||||
|
||||
if (ioctl(fd, SIOCGIFCONF, &ifc) < 0) {
|
||||
perror("failed at SIOCGIFCONF");
|
||||
close(fd);
|
||||
return -1;
|
||||
}
|
||||
|
||||
if (ifc.ifc_len == (int)sizeof(struct ifreq) * nr) {
|
||||
/*
|
||||
* current ifc_buf is not big enough to hold
|
||||
* all interfaces; double it and try again.
|
||||
*/
|
||||
nr *= 2;
|
||||
goto again;
|
||||
}
|
||||
|
||||
ifr = (struct ifreq *)ifc.ifc_buf;
|
||||
for (i = 0; i < ifc.ifc_len / sizeof(struct ifreq); i++)
|
||||
send_rarp(ifr[i].ifr_name, rarp);
|
||||
|
||||
close(fd);
|
||||
rte_smp_wmb();
|
||||
rte_atomic16_set(&dev->broadcast_rarp, 1);
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
@ -54,7 +54,7 @@ void user_set_vring_kick(struct vhost_device_ctx, struct VhostUserMsg *);
|
||||
void user_set_protocol_features(struct vhost_device_ctx ctx,
|
||||
uint64_t protocol_features);
|
||||
int user_set_log_base(struct vhost_device_ctx ctx, struct VhostUserMsg *);
|
||||
int user_send_rarp(struct VhostUserMsg *);
|
||||
int user_send_rarp(struct vhost_device_ctx ctx, struct VhostUserMsg *);
|
||||
|
||||
int user_get_vring_base(struct vhost_device_ctx, struct vhost_vring_state *);
|
||||
|
||||
|
Loading…
Reference in New Issue
Block a user