From b23ffbaa821bc5676e7f4d60c717f445b85f990d Mon Sep 17 00:00:00 2001
From: Intel
Date: Wed, 18 Sep 2013 12:00:00 +0200
Subject: [PATCH] kni: add vhost backend

Add an optional vhost backend to KNI: each KNI device can attach to
vhost-net as a raw socket backend. The backend is compiled in only when
CONFIG_RTE_KNI_VHOST is enabled, and is switched on per device through
the new sock_en sysfs attribute; the resulting socket descriptor is
exported through the sock_fd sysfs attribute.

Signed-off-by: Intel
---
 config/defconfig_i686-default-linuxapp-gcc   |   5 +
 config/defconfig_i686-default-linuxapp-icc   |   5 +
 config/defconfig_x86_64-default-linuxapp-gcc |   5 +
 config/defconfig_x86_64-default-linuxapp-icc |   5 +
 lib/librte_eal/linuxapp/kni/Makefile         |   1 +
 lib/librte_eal/linuxapp/kni/kni_dev.h        |  47 +-
 lib/librte_eal/linuxapp/kni/kni_fifo.h       |  14 +
 lib/librte_eal/linuxapp/kni/kni_misc.c       |  22 +
 lib/librte_eal/linuxapp/kni/kni_net.c        |  13 +
 lib/librte_eal/linuxapp/kni/kni_vhost.c      | 764 +++++++++++++++++++++++++++
 10 files changed, 880 insertions(+), 1 deletion(-)
 create mode 100644 lib/librte_eal/linuxapp/kni/kni_vhost.c

diff --git a/config/defconfig_i686-default-linuxapp-gcc b/config/defconfig_i686-default-linuxapp-gcc
index a9b9ac8690..17419cb59b 100644
--- a/config/defconfig_i686-default-linuxapp-gcc
+++ b/config/defconfig_i686-default-linuxapp-gcc
@@ -277,6 +277,11 @@ CONFIG_RTE_SCHED_PORT_N_GRINDERS=8
 CONFIG_RTE_LIBRTE_KNI=y
 CONFIG_RTE_LIBRTE_KNI_DEBUG=n
 CONFIG_RTE_KNI_KO_DEBUG=n
+CONFIG_RTE_KNI_VHOST=n
+CONFIG_RTE_KNI_VHOST_MAX_CACHE_SIZE=1024
+CONFIG_RTE_KNI_VHOST_VNET_HDR_EN=n
+CONFIG_RTE_KNI_VHOST_DEBUG_RX=n
+CONFIG_RTE_KNI_VHOST_DEBUG_TX=n
 
 #
 # Enable warning directives
diff --git a/config/defconfig_i686-default-linuxapp-icc b/config/defconfig_i686-default-linuxapp-icc
index f8eb7a621b..8be04a440b 100644
--- a/config/defconfig_i686-default-linuxapp-icc
+++ b/config/defconfig_i686-default-linuxapp-icc
@@ -277,6 +277,11 @@ CONFIG_RTE_SCHED_PORT_N_GRINDERS=8
 CONFIG_RTE_LIBRTE_KNI=y
 CONFIG_RTE_LIBRTE_KNI_DEBUG=n
 CONFIG_RTE_KNI_KO_DEBUG=n
+CONFIG_RTE_KNI_VHOST=n
+CONFIG_RTE_KNI_VHOST_MAX_CACHE_SIZE=1024
+CONFIG_RTE_KNI_VHOST_VNET_HDR_EN=n
+CONFIG_RTE_KNI_VHOST_DEBUG_RX=n
+CONFIG_RTE_KNI_VHOST_DEBUG_TX=n
 
 #
 # Enable warning directives
diff --git a/config/defconfig_x86_64-default-linuxapp-gcc b/config/defconfig_x86_64-default-linuxapp-gcc
index ce9619ccbe..c70a478d76 100644
--- a/config/defconfig_x86_64-default-linuxapp-gcc
+++ b/config/defconfig_x86_64-default-linuxapp-gcc
@@ -289,6 +289,11 @@ CONFIG_RTE_SCHED_PORT_N_GRINDERS=8
 CONFIG_RTE_LIBRTE_KNI=y
 CONFIG_RTE_LIBRTE_KNI_DEBUG=n
 CONFIG_RTE_KNI_KO_DEBUG=n
+CONFIG_RTE_KNI_VHOST=n
+CONFIG_RTE_KNI_VHOST_MAX_CACHE_SIZE=1024
+CONFIG_RTE_KNI_VHOST_VNET_HDR_EN=n
+CONFIG_RTE_KNI_VHOST_DEBUG_RX=n
+CONFIG_RTE_KNI_VHOST_DEBUG_TX=n
 
 #
 # Enable warning directives
diff --git a/config/defconfig_x86_64-default-linuxapp-icc b/config/defconfig_x86_64-default-linuxapp-icc
index f176e56341..034e4ed0ee 100644
--- a/config/defconfig_x86_64-default-linuxapp-icc
+++ b/config/defconfig_x86_64-default-linuxapp-icc
@@ -277,6 +277,11 @@ CONFIG_RTE_SCHED_PORT_N_GRINDERS=8
 CONFIG_RTE_LIBRTE_KNI=y
 CONFIG_RTE_LIBRTE_KNI_DEBUG=n
 CONFIG_RTE_KNI_KO_DEBUG=n
+CONFIG_RTE_KNI_VHOST=n
+CONFIG_RTE_KNI_VHOST_MAX_CACHE_SIZE=1024
+CONFIG_RTE_KNI_VHOST_VNET_HDR_EN=n
+CONFIG_RTE_KNI_VHOST_DEBUG_RX=n
+CONFIG_RTE_KNI_VHOST_DEBUG_TX=n
 
 #
 # Enable warning directives
diff --git a/lib/librte_eal/linuxapp/kni/Makefile b/lib/librte_eal/linuxapp/kni/Makefile
index 48d1fbd58d..27410a6f86 100644
--- a/lib/librte_eal/linuxapp/kni/Makefile
+++ b/lib/librte_eal/linuxapp/kni/Makefile
@@ -79,5 +79,6 @@ SRCS-y += ethtool/igb/igb_vmdq.c
 SRCS-y += kni_misc.c
 SRCS-y += kni_net.c
 SRCS-y += kni_ethtool.c
+SRCS-$(CONFIG_RTE_KNI_VHOST) += kni_vhost.c
 
 include $(RTE_SDK)/mk/rte.module.mk
diff --git a/lib/librte_eal/linuxapp/kni/kni_dev.h b/lib/librte_eal/linuxapp/kni/kni_dev.h
index 1d7f066fec..34873f2990 100644
--- a/lib/librte_eal/linuxapp/kni/kni_dev.h
+++ b/lib/librte_eal/linuxapp/kni/kni_dev.h
@@ -32,6 +32,10 @@
 #include <linux/spinlock.h>
 #include <linux/list.h>
 
+#ifdef RTE_KNI_VHOST
+#include <net/sock.h>
+#endif
+
 #include <exec-env/rte_kni_common.h>
 #define KNI_KTHREAD_RESCHEDULE_INTERVAL 5 /* us */
 
@@ -91,8 +95,16 @@ struct kni_dev {
 
 	/* synchro for request processing */
 	unsigned long synchro;
-};
 
+#ifdef RTE_KNI_VHOST
+	struct kni_vhost_queue *vhost_queue;
+	volatile enum {
+		BE_STOP = 0x1,
+		BE_START = 0x2,
+		BE_FINISH = 0x4,
+	} vq_status;
+#endif
+};
 
 #define KNI_ERR(args...) printk(KERN_DEBUG "KNI: Error: " args)
 #define KNI_PRINT(args...) printk(KERN_DEBUG "KNI: " args)
@@ -102,4 +114,37 @@ struct kni_dev {
 #define KNI_DBG(args...)
 #endif
 
+#ifdef RTE_KNI_VHOST
+unsigned int
+kni_poll(struct file *file, struct socket *sock, poll_table *wait);
+int kni_chk_vhost_rx(struct kni_dev *kni);
+int kni_vhost_init(struct kni_dev *kni);
+int kni_vhost_backend_release(struct kni_dev *kni);
+
+struct kni_vhost_queue {
+	struct sock sk;
+	struct socket *sock;
+	int vnet_hdr_sz;
+	struct kni_dev *kni;
+	int sockfd;
+	unsigned int flags;
+	struct sk_buff *cache;
+	struct rte_kni_fifo *fifo;
+};
+
+#endif
+
+#ifdef RTE_KNI_VHOST_DEBUG_RX
+#define KNI_DBG_RX(args...) printk(KERN_DEBUG "KNI RX: " args)
+#else
+#define KNI_DBG_RX(args...)
+#endif
+
+#ifdef RTE_KNI_VHOST_DEBUG_TX
+#define KNI_DBG_TX(args...) printk(KERN_DEBUG "KNI TX: " args)
+#else
+#define KNI_DBG_TX(args...)
+#endif
+
 #endif
diff --git a/lib/librte_eal/linuxapp/kni/kni_fifo.h b/lib/librte_eal/linuxapp/kni/kni_fifo.h
index 2022cb2743..521da0fc1d 100644
--- a/lib/librte_eal/linuxapp/kni/kni_fifo.h
+++ b/lib/librte_eal/linuxapp/kni/kni_fifo.h
@@ -91,4 +91,18 @@ kni_fifo_free_count(struct rte_kni_fifo *fifo)
 	return (fifo->read - fifo->write - 1) & (fifo->len - 1);
 }
 
+#ifdef RTE_KNI_VHOST
+/**
+ * Initializes the kni fifo structure
+ */
+static inline void
+kni_fifo_init(struct rte_kni_fifo *fifo, unsigned size)
+{
+	fifo->write = 0;
+	fifo->read = 0;
+	fifo->len = size;
+	fifo->elem_size = sizeof(void *);
+}
+#endif
+
 #endif /* _KNI_FIFO_H_ */
diff --git a/lib/librte_eal/linuxapp/kni/kni_misc.c b/lib/librte_eal/linuxapp/kni/kni_misc.c
index 79ab1f9b92..36bfaac4c1 100644
--- a/lib/librte_eal/linuxapp/kni/kni_misc.c
+++ b/lib/librte_eal/linuxapp/kni/kni_misc.c
@@ -193,6 +193,9 @@ kni_release(struct inode *inode, struct file *file)
 			dev->pthread = NULL;
 		}
 
+#ifdef RTE_KNI_VHOST
+		kni_vhost_backend_release(dev);
+#endif
 		kni_dev_remove(dev);
 		list_del(&dev->list);
 	}
@@ -217,7 +220,11 @@ kni_thread_single(void *unused)
 		for (j = 0; j < KNI_RX_LOOP_NUM; j++) {
 			list_for_each_entry_safe(dev, n,
 					&kni_list_head, list) {
+#ifdef RTE_KNI_VHOST
+				kni_chk_vhost_rx(dev);
+#else
 				kni_net_rx(dev);
+#endif
 				kni_net_poll_resp(dev);
 			}
 		}
@@ -238,7 +245,11 @@ kni_thread_multiple(void *param)
 	while (!kthread_should_stop()) {
 		for (j = 0; j < KNI_RX_LOOP_NUM; j++) {
+#ifdef RTE_KNI_VHOST
+			kni_chk_vhost_rx(dev);
+#else
 			kni_net_rx(dev);
+#endif
 			kni_net_poll_resp(dev);
 		}
 		schedule_timeout_interruptible(usecs_to_jiffies( \
@@ -361,6 +372,10 @@ kni_ioctl_create(unsigned int ioctl_num, unsigned long ioctl_param)
 	kni->mbuf_kva = phys_to_virt(dev_info.mbuf_phys);
 	kni->mbuf_va = dev_info.mbuf_va;
 
+#ifdef RTE_KNI_VHOST
+	kni->vhost_queue = NULL;
+	kni->vq_status = BE_STOP;
+#endif
 	kni->mbuf_size = dev_info.mbuf_size;
 
 	KNI_PRINT("tx_phys: 0x%016llx, tx_q addr: 0x%p\n",
@@ -443,6 +458,10 @@ kni_ioctl_create(unsigned int ioctl_num, unsigned long ioctl_param)
 		return -ENODEV;
 	}
 
+#ifdef RTE_KNI_VHOST
+	kni_vhost_init(kni);
+#endif
+
 	/**
 	 * Create a new kernel thread for multiple mode, set its core affinity,
 	 * and finally wake it up.
@@ -497,6 +516,9 @@ kni_ioctl_release(unsigned int ioctl_num, unsigned long ioctl_param)
 			dev->pthread = NULL;
 		}
 
+#ifdef RTE_KNI_VHOST
+		kni_vhost_backend_release(dev);
+#endif
 		kni_dev_remove(dev);
 		list_del(&dev->list);
 		ret = 0;
diff --git a/lib/librte_eal/linuxapp/kni/kni_net.c b/lib/librte_eal/linuxapp/kni/kni_net.c
index f98a2ce466..ed62c3d491 100644
--- a/lib/librte_eal/linuxapp/kni/kni_net.c
+++ b/lib/librte_eal/linuxapp/kni/kni_net.c
@@ -379,6 +379,18 @@ kni_net_rx(struct kni_dev *kni)
 /*
  * Transmit a packet (called by the kernel)
  */
+#ifdef RTE_KNI_VHOST
+static int
+kni_net_tx(struct sk_buff *skb, struct net_device *dev)
+{
+	struct kni_dev *kni = netdev_priv(dev);
+
+	dev_kfree_skb(skb);
+	kni->stats.tx_dropped++;
+
+	return NETDEV_TX_OK;
+}
+#else
 static int
 kni_net_tx(struct sk_buff *skb, struct net_device *dev)
 {
@@ -451,6 +463,7 @@ kni_net_tx(struct sk_buff *skb, struct net_device *dev)
 
 	return NETDEV_TX_OK;
 }
+#endif
 
 /*
  * Deal with a transmit timeout.
diff --git a/lib/librte_eal/linuxapp/kni/kni_vhost.c b/lib/librte_eal/linuxapp/kni/kni_vhost.c
new file mode 100644
index 0000000000..fefc98ce2b
--- /dev/null
+++ b/lib/librte_eal/linuxapp/kni/kni_vhost.c
@@ -0,0 +1,764 @@
+/*-
+ * GPL LICENSE SUMMARY
+ *
+ *   Copyright(c) 2010-2013 Intel Corporation. All rights reserved.
+ *
+ *   This program is free software; you can redistribute it and/or modify
+ *   it under the terms of version 2 of the GNU General Public License as
+ *   published by the Free Software Foundation.
+ *
+ *   This program is distributed in the hope that it will be useful, but
+ *   WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ *   General Public License for more details.
+ *
+ *   You should have received a copy of the GNU General Public License
+ *   along with this program; if not, write to the Free Software
+ *   Foundation, Inc., 51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA.
+ *   The full GNU General Public License is included in this distribution
+ *   in the file called LICENSE.GPL.
+ *
+ *   Contact Information:
+ *   Intel Corporation
+ */
+
+#include <linux/module.h>
+#include <linux/net.h>
+#include <net/sock.h>
+#include <linux/virtio_net.h>
+#include <linux/wait.h>
+#include <linux/mm.h>
+#include <linux/nsproxy.h>
+#include <linux/sched.h>
+#include <linux/if_tun.h>
+
+#include "kni_dev.h"
+#include "kni_fifo.h"
+
+#define RX_BURST_SZ 4
+
+extern void put_unused_fd(unsigned int fd);
+
+static struct proto kni_raw_proto = {
+	.name = "kni_vhost",
+	.owner = THIS_MODULE,
+	.obj_size = sizeof(struct kni_vhost_queue),
+};
+
+static inline int
+kni_vhost_net_tx(struct kni_dev *kni, struct iovec *iov,
+		 unsigned offset, unsigned len)
+{
+	struct rte_kni_mbuf *pkt_kva = NULL;
+	struct rte_kni_mbuf *pkt_va = NULL;
+	int ret;
+
+	KNI_DBG_TX("tx offset=%d, len=%d, iovlen=%d\n",
+		   offset, len, (int)iov->iov_len);
+
+	/**
+	 * Check if it has at least one free entry in tx_q and
+	 * one entry in alloc_q.
+	 */
+	if (kni_fifo_free_count(kni->tx_q) == 0 ||
+	    kni_fifo_count(kni->alloc_q) == 0) {
+		/**
+		 * If no free entry in tx_q or no entry in alloc_q,
+		 * drops skb and goes out.
+		 */
+		goto drop;
+	}
+
+	/* dequeue a mbuf from alloc_q */
+	ret = kni_fifo_get(kni->alloc_q, (void **)&pkt_va, 1);
+	if (likely(ret == 1)) {
+		void *data_kva;
+
+		pkt_kva = (void *)pkt_va - kni->mbuf_va + kni->mbuf_kva;
+		data_kva = pkt_kva->data - kni->mbuf_va + kni->mbuf_kva;
+
+		memcpy_fromiovecend(data_kva, iov, offset, len);
+		if (unlikely(len < ETH_ZLEN)) {
+			memset(data_kva + len, 0, ETH_ZLEN - len);
+			len = ETH_ZLEN;
+		}
+		pkt_kva->pkt_len = len;
+		pkt_kva->data_len = len;
+
+		/* enqueue mbuf into tx_q */
+		ret = kni_fifo_put(kni->tx_q, (void **)&pkt_va, 1);
+		if (unlikely(ret != 1)) {
+			/* Failing should not happen */
+			KNI_ERR("Fail to enqueue mbuf into tx_q\n");
+			goto drop;
+		}
+	} else {
+		/* Failing should not happen */
+		KNI_ERR("Fail to dequeue mbuf from alloc_q\n");
+		goto drop;
+	}
+
+	/* update statistics */
+	kni->stats.tx_bytes += len;
+	kni->stats.tx_packets++;
+
+	return 0;
+
+drop:
+	/* update statistics */
+	kni->stats.tx_dropped++;
+
+	return 0;
+}
+
+static inline int
+kni_vhost_net_rx(struct kni_dev *kni, struct iovec *iov,
+		 unsigned offset, unsigned len)
+{
+	uint32_t pkt_len;
+	struct rte_kni_mbuf *kva;
+	struct rte_kni_mbuf *va;
+	void *data_kva;
+	struct sk_buff *skb;
+	struct kni_vhost_queue *q = kni->vhost_queue;
+
+	if (unlikely(q == NULL))
+		return 0;
+
+	/* ensure at least one entry in free_q */
+	if (unlikely(kni_fifo_free_count(kni->free_q) == 0))
+		return 0;
+
+	skb = skb_dequeue(&q->sk.sk_receive_queue);
+	if (unlikely(skb == NULL))
+		return 0;
+
+	kva = (struct rte_kni_mbuf *)skb->data;
+
+	/* free skb to cache */
+	skb->data = NULL;
+	if (unlikely(1 != kni_fifo_put(q->fifo, (void **)&skb, 1)))
+		/* Failing should not happen */
+		KNI_ERR("Fail to enqueue entries into rx cache fifo\n");
+
+	pkt_len = kva->data_len;
+	if (unlikely(pkt_len > len))
+		goto drop;
+
+	KNI_DBG_RX("rx offset=%d, len=%d, pkt_len=%d, iovlen=%d\n",
+		   offset, len, pkt_len, (int)iov->iov_len);
+
+	data_kva = kva->data - kni->mbuf_va + kni->mbuf_kva;
+	if (unlikely(memcpy_toiovecend(iov, data_kva, offset, pkt_len)))
+		goto drop;
+
+	/* Update statistics */
+	kni->stats.rx_bytes += pkt_len;
+	kni->stats.rx_packets++;
+
+	/* enqueue mbufs into free_q */
+	va = (void *)kva - kni->mbuf_kva + kni->mbuf_va;
+	if (unlikely(1 != kni_fifo_put(kni->free_q, (void **)&va, 1)))
+		/* Failing should not happen */
+		KNI_ERR("Fail to enqueue entries into free_q\n");
+
+	KNI_DBG_RX("receive done %d\n", pkt_len);
+
+	return pkt_len;
+
+drop:
+	/* Update drop statistics */
+	kni->stats.rx_dropped++;
+
+	return 0;
+}
+
+static unsigned int
+kni_sock_poll(struct file *file, struct socket *sock, poll_table *wait)
+{
+	struct kni_vhost_queue *q =
+		container_of(sock->sk, struct kni_vhost_queue, sk);
+	struct kni_dev *kni;
+	unsigned int mask = 0;
+
+	if (unlikely(q == NULL || q->kni == NULL))
+		return POLLERR;
+
+	kni = q->kni;
+	KNI_DBG("start kni_poll on group %d, wq 0x%16llx\n",
+		kni->group_id, (uint64_t)sock->wq);
+
+	poll_wait(file, &sock->wq->wait, wait);
+
+	if (kni_fifo_count(kni->rx_q) > 0)
+		mask |= POLLIN | POLLRDNORM;
+
+	if (sock_writeable(&q->sk) ||
+	    (!test_and_set_bit(SOCK_ASYNC_NOSPACE, &q->sock->flags) &&
+	     sock_writeable(&q->sk)))
+		mask |= POLLOUT | POLLWRNORM;
+
+	return mask;
+}
+
+static inline void
+kni_vhost_enqueue(struct kni_dev *kni, struct kni_vhost_queue *q,
+		  struct sk_buff *skb, struct rte_kni_mbuf *va)
+{
+	struct rte_kni_mbuf *kva;
+
+	kva = (void *)va - kni->mbuf_va + kni->mbuf_kva;
+	skb->data = (unsigned char *)kva;
+	skb->len = kva->data_len;
+	skb_queue_tail(&q->sk.sk_receive_queue, skb);
+}
+
+static inline void
+kni_vhost_enqueue_burst(struct kni_dev *kni, struct kni_vhost_queue *q,
+			struct sk_buff **skb, struct rte_kni_mbuf **va)
+{
+	int i;
+
+	for (i = 0; i < RX_BURST_SZ; skb++, va++, i++)
+		kni_vhost_enqueue(kni, q, *skb, *va);
+}
+
+int
+kni_chk_vhost_rx(struct kni_dev *kni)
+{
+	struct kni_vhost_queue *q = kni->vhost_queue;
+	unsigned nb_in, nb_mbuf, nb_skb;
+	const unsigned BURST_MASK = RX_BURST_SZ - 1;
+	unsigned nb_burst, nb_backlog, i;
+	struct sk_buff *skb[RX_BURST_SZ];
+	struct rte_kni_mbuf *va[RX_BURST_SZ];
+
+	if (unlikely(BE_STOP & kni->vq_status)) {
+		kni->vq_status |= BE_FINISH;
+		return 0;
+	}
+
+	if (unlikely(q == NULL))
+		return 0;
+
+	nb_skb = kni_fifo_count(q->fifo);
+	nb_mbuf = kni_fifo_count(kni->rx_q);
+
+	nb_in = min(nb_mbuf, nb_skb);
+	nb_in = min(nb_in, (unsigned)RX_BURST_SZ);
+	nb_burst = (nb_in & ~BURST_MASK);
+	nb_backlog = (nb_in & BURST_MASK);
+
+	/* enqueue skb_queue per BURST_SIZE bulk */
+	if (0 != nb_burst) {
+		if (unlikely(RX_BURST_SZ != kni_fifo_get(
+				kni->rx_q, (void **)&va, RX_BURST_SZ)))
+			goto except;
+
+		if (unlikely(RX_BURST_SZ != kni_fifo_get(
+				q->fifo, (void **)&skb, RX_BURST_SZ)))
+			goto except;
+
+		kni_vhost_enqueue_burst(kni, q, skb, va);
+	}
+
+	/* all leftover, do one by one */
+	for (i = 0; i < nb_backlog; ++i) {
+		if (unlikely(1 != kni_fifo_get(
+				kni->rx_q, (void **)&va, 1)))
+			goto except;
+
+		if (unlikely(1 != kni_fifo_get(
+				q->fifo, (void **)&skb, 1)))
+			goto except;
+
+		kni_vhost_enqueue(kni, q, *skb, *va);
+	}
+
+	/* Ondemand wake up */
+	if ((nb_in == RX_BURST_SZ) || (nb_skb == 0) ||
+	    ((nb_mbuf < RX_BURST_SZ) && (nb_mbuf != 0))) {
+		wake_up_interruptible_poll(sk_sleep(&q->sk),
+				POLLIN | POLLRDNORM | POLLRDBAND);
+		KNI_DBG_RX("RX CHK KICK nb_mbuf %d, nb_skb %d, nb_in %d\n",
+			   nb_mbuf, nb_skb, nb_in);
+	}
+
+	return 0;
+
+except:
+	/* Failing should not happen */
+	KNI_ERR("Fail to enqueue fifo, it shouldn't happen\n");
+	BUG_ON(1);
+
+	return 0;
+}
+
+static int
+kni_sock_sndmsg(struct kiocb *iocb, struct socket *sock,
+		struct msghdr *m, size_t total_len)
+{
+	struct kni_vhost_queue *q =
+		container_of(sock->sk, struct kni_vhost_queue, sk);
+	int vnet_hdr_len = 0;
+	unsigned long len = total_len;
+
+	if (unlikely(q == NULL || q->kni == NULL))
+		return 0;
+
+	KNI_DBG_TX("kni_sndmsg len %ld, flags 0x%08x, nb_iov %d\n",
+		   len, q->flags, (int)m->msg_iovlen);
+
+#ifdef RTE_KNI_VHOST_VNET_HDR_EN
+	if (likely(q->flags & IFF_VNET_HDR)) {
+		vnet_hdr_len = q->vnet_hdr_sz;
+		if (unlikely(len < vnet_hdr_len))
+			return -EINVAL;
+		len -= vnet_hdr_len;
+	}
+#endif
+
+	if (unlikely(len < ETH_HLEN + q->vnet_hdr_sz))
+		return -EINVAL;
+
+	return kni_vhost_net_tx(q->kni, m->msg_iov, vnet_hdr_len, len);
+}
+
+static int
+kni_sock_rcvmsg(struct kiocb *iocb, struct socket *sock,
+		struct msghdr *m, size_t len, int flags)
+{
+	int vnet_hdr_len = 0;
+	int pkt_len = 0;
+	struct kni_vhost_queue *q =
+		container_of(sock->sk, struct kni_vhost_queue, sk);
+	static struct virtio_net_hdr
+		__attribute__ ((unused)) vnet_hdr = {
+		.flags = 0,
+		.gso_type = VIRTIO_NET_HDR_GSO_NONE
+	};
+
+	if (unlikely(q == NULL || q->kni == NULL))
+		return 0;
+
+#ifdef RTE_KNI_VHOST_VNET_HDR_EN
+	if (likely(q->flags & IFF_VNET_HDR)) {
+		vnet_hdr_len = q->vnet_hdr_sz;
+		/* len is size_t (unsigned), so check before subtracting */
+		if (unlikely(len < (size_t)vnet_hdr_len))
+			return -EINVAL;
+		len -= vnet_hdr_len;
+	}
+#endif
+
+	if (unlikely(0 == (pkt_len = kni_vhost_net_rx(q->kni,
+			m->msg_iov, vnet_hdr_len, len))))
+		return 0;
+
+#ifdef RTE_KNI_VHOST_VNET_HDR_EN
+	/* no need to copy hdr when no pkt received */
+	if (unlikely(memcpy_toiovecend(m->msg_iov,
+			(void *)&vnet_hdr, 0, vnet_hdr_len)))
+		return -EFAULT;
+#endif
+	KNI_DBG_RX("kni_rcvmsg expect_len %ld, flags 0x%08x, pkt_len %d\n",
+		   (unsigned long)len, q->flags, pkt_len);
+
+	return pkt_len + vnet_hdr_len;
+}
+
+/* dummy tap like ioctl */
+static int
+kni_sock_ioctl(struct socket *sock, unsigned int cmd,
+	       unsigned long arg)
+{
+	void __user *argp = (void __user *)arg;
+	struct ifreq __user *ifr = argp;
+	unsigned int __user *up = argp;
+	struct kni_vhost_queue *q =
+		container_of(sock->sk, struct kni_vhost_queue, sk);
+	struct kni_dev *kni;
+	unsigned int u;
+	int __user *sp = argp;
+	int s;
+	int ret;
+
+	KNI_DBG("tap ioctl cmd 0x%08x\n", cmd);
+
+	switch (cmd) {
+	case TUNSETIFF:
+		KNI_DBG("TUNSETIFF\n");
+		/* ignore the name, just look at flags */
+		if (get_user(u, &ifr->ifr_flags))
+			return -EFAULT;
+
+		ret = 0;
+		if ((u & ~IFF_VNET_HDR) != (IFF_NO_PI | IFF_TAP))
+			ret = -EINVAL;
+		else
+			q->flags = u;
+
+		return ret;
+
+	case TUNGETIFF:
+		KNI_DBG("TUNGETIFF\n");
+		rcu_read_lock_bh();
+		kni = rcu_dereference_bh(q->kni);
+		if (kni)
+			dev_hold(kni->net_dev);
+		rcu_read_unlock_bh();
+
+		if (!kni)
+			return -ENOLINK;
+
+		ret = 0;
+		if (copy_to_user(&ifr->ifr_name, kni->net_dev->name,
+				 IFNAMSIZ) ||
+		    put_user(q->flags, &ifr->ifr_flags))
+			ret = -EFAULT;
+		dev_put(kni->net_dev);
+		return ret;
+
+	case TUNGETFEATURES:
+		KNI_DBG("TUNGETFEATURES\n");
+		u = IFF_TAP | IFF_NO_PI;
+#ifdef RTE_KNI_VHOST_VNET_HDR_EN
+		u |= IFF_VNET_HDR;
+#endif
+		if (put_user(u, up))
+			return -EFAULT;
+		return 0;
+
+	case TUNSETSNDBUF:
+		KNI_DBG("TUNSETSNDBUF\n");
+		if (get_user(u, up))
+			return -EFAULT;
+
+		q->sk.sk_sndbuf = u;
+		return 0;
+
+	case TUNGETVNETHDRSZ:
+		s = q->vnet_hdr_sz;
+		if (put_user(s, sp))
+			return -EFAULT;
+		KNI_DBG("TUNGETVNETHDRSZ %d\n", s);
+		return 0;
+
+	case TUNSETVNETHDRSZ:
+		if (get_user(s, sp))
+			return -EFAULT;
+		if (s < (int)sizeof(struct virtio_net_hdr))
+			return -EINVAL;
+
+		KNI_DBG("TUNSETVNETHDRSZ %d\n", s);
+		q->vnet_hdr_sz = s;
+		return 0;
+
+	case TUNSETOFFLOAD:
+		KNI_DBG("TUNSETOFFLOAD %lx\n", arg);
+#ifdef RTE_KNI_VHOST_VNET_HDR_EN
+		/* not support any offload yet */
+		if (!(q->flags & IFF_VNET_HDR))
+			return -EINVAL;
+
+		return 0;
+#else
+		return -EINVAL;
+#endif
+
+	default:
+		KNI_DBG("NOT SUPPORT\n");
+		return -EINVAL;
+	}
+}
+
+static int
+kni_sock_compat_ioctl(struct socket *sock, unsigned int cmd,
+		      unsigned long arg)
+{
+	/* 32 bits app on 64 bits OS to be supported later */
+	KNI_PRINT("Not implemented.\n");
+
+	return -EINVAL;
+}
+
+#define KNI_VHOST_WAIT_WQ_SAFE()			\
+do {							\
+	while ((BE_FINISH | BE_STOP) == kni->vq_status)	\
+		msleep(1);				\
+} while (0)
+
+static int
+kni_sock_release(struct socket *sock)
+{
+	struct kni_vhost_queue *q =
+		container_of(sock->sk, struct kni_vhost_queue, sk);
+	struct kni_dev *kni;
+
+	if (q == NULL)
+		return 0;
+
+	if (NULL != (kni = q->kni)) {
+		kni->vq_status = BE_STOP;
+		KNI_VHOST_WAIT_WQ_SAFE();
+		kni->vhost_queue = NULL;
+		q->kni = NULL;
+	}
+
+	if (q->sockfd != -1)
+		q->sockfd = -1;
+
+	sk_set_socket(&q->sk, NULL);
+	sock->sk = NULL;
+
+	sock_put(&q->sk);
+
+	KNI_DBG("dummy sock release done\n");
+
+	return 0;
+}
+
+static int
+kni_sock_getname(struct socket *sock, struct sockaddr *addr,
+		 int *sockaddr_len, int peer)
+{
+	KNI_DBG("dummy sock getname\n");
+	((struct sockaddr_ll *)addr)->sll_family = AF_PACKET;
+	return 0;
+}
+
+static const struct proto_ops kni_socket_ops = {
+	.getname = kni_sock_getname,
+	.sendmsg = kni_sock_sndmsg,
+	.recvmsg = kni_sock_rcvmsg,
+	.release = kni_sock_release,
+	.poll = kni_sock_poll,
+	.ioctl = kni_sock_ioctl,
+	.compat_ioctl = kni_sock_compat_ioctl,
+};
+
+static void
+kni_sk_write_space(struct sock *sk)
+{
+	wait_queue_head_t *wqueue;
+
+	if (!sock_writeable(sk) ||
+	    !test_and_clear_bit(SOCK_ASYNC_NOSPACE,
+				&sk->sk_socket->flags))
+		return;
+	wqueue = sk_sleep(sk);
+	if (wqueue && waitqueue_active(wqueue))
+		wake_up_interruptible_poll(
+			wqueue, POLLOUT | POLLWRNORM | POLLWRBAND);
+}
+
+static void
+kni_sk_destruct(struct sock *sk)
+{
+	struct kni_vhost_queue *q =
+		container_of(sk, struct kni_vhost_queue, sk);
+
+	if (!q)
+		return;
+
+	/* make sure there's no packet in buffer */
+	while (skb_dequeue(&sk->sk_receive_queue) != NULL)
+		;
+
+	mb();
+
+	if (q->fifo != NULL) {
+		kfree(q->fifo);
+		q->fifo = NULL;
+	}
+
+	if (q->cache != NULL) {
+		kfree(q->cache);
+		q->cache = NULL;
+	}
+}
+
+static int
+kni_vhost_backend_init(struct kni_dev *kni)
+{
+	struct kni_vhost_queue *q;
+	struct net *net = current->nsproxy->net_ns;
+	int err, i, sockfd;
+	struct rte_kni_fifo *fifo;
+	struct sk_buff *elem;
+
+	if (kni->vhost_queue != NULL)
+		return -1;
+
+	if (!(q = (struct kni_vhost_queue *)sk_alloc(
+			net, AF_UNSPEC, GFP_KERNEL, &kni_raw_proto)))
+		return -ENOMEM;
+
+	err = sock_create_lite(AF_UNSPEC, SOCK_RAW, IPPROTO_RAW, &q->sock);
+	if (err)
+		goto free_sk;
+
+	sockfd = sock_map_fd(q->sock, 0);
+	if (sockfd < 0) {
+		err = sockfd;
+		goto free_sock;
+	}
+
+	/* cache init */
+	q->cache = kzalloc(RTE_KNI_VHOST_MAX_CACHE_SIZE *
+			   sizeof(struct sk_buff), GFP_KERNEL);
+	if (!q->cache) {
+		err = -ENOMEM;
+		goto free_fd;
+	}
+
+	fifo = kzalloc(RTE_KNI_VHOST_MAX_CACHE_SIZE * sizeof(void *)
+		       + sizeof(struct rte_kni_fifo), GFP_KERNEL);
+	if (!fifo) {
+		err = -ENOMEM;
+		goto free_cache;
+	}
+
+	kni_fifo_init(fifo, RTE_KNI_VHOST_MAX_CACHE_SIZE);
+
+	for (i = 0; i < RTE_KNI_VHOST_MAX_CACHE_SIZE; i++) {
+		elem = &q->cache[i];
+		kni_fifo_put(fifo, (void **)&elem, 1);
+	}
+	q->fifo = fifo;
+
+	/* store sockfd in vhost_queue */
+	q->sockfd = sockfd;
+
+	/* init socket */
+	q->sock->type = SOCK_RAW;
+	q->sock->state = SS_CONNECTED;
+	q->sock->ops = &kni_socket_ops;
+	sock_init_data(q->sock, &q->sk);
+
+	/* init sock data */
+	q->sk.sk_write_space = kni_sk_write_space;
+	q->sk.sk_destruct = kni_sk_destruct;
+	q->flags = IFF_NO_PI | IFF_TAP;
+	q->vnet_hdr_sz = sizeof(struct virtio_net_hdr);
+#ifdef RTE_KNI_VHOST_VNET_HDR_EN
+	q->flags |= IFF_VNET_HDR;
+#endif
+
+	/* bind kni_dev with vhost_queue */
+	q->kni = kni;
+	kni->vhost_queue = q;
+
+	wmb();
+
+	kni->vq_status = BE_START;
+
+	KNI_DBG("backend init sockfd=%d, sock->wq=0x%16llx,"
+		"sk->sk_wq=0x%16llx",
+		q->sockfd, (uint64_t)q->sock->wq,
+		(uint64_t)q->sk.sk_wq);
+
+	return 0;
+
+free_cache:
+	kfree(q->cache);
+	q->cache = NULL;
+
+free_fd:
+	put_unused_fd(sockfd);
+
+free_sock:
+	q->kni = NULL;
+	kni->vhost_queue = NULL;
+	kni->vq_status |= BE_FINISH;
+	/* clear ops first so release does not recurse into our hooks */
+	q->sock->ops = NULL;
+	sock_release(q->sock);
+	q->sock = NULL;
+
+free_sk:
+	sk_free((struct sock *)q);
+
+	return err;
+}
+
+/* kni vhost sock sysfs */
+static ssize_t
+show_sock_fd(struct device *dev, struct device_attribute *attr,
+	     char *buf)
+{
+	struct net_device *net_dev = container_of(dev, struct net_device, dev);
+	struct kni_dev *kni = netdev_priv(net_dev);
+	int sockfd = -1;
+
+	if (kni->vhost_queue != NULL)
+		sockfd = kni->vhost_queue->sockfd;
+	return snprintf(buf, 10, "%d\n", sockfd);
+}
+
+static ssize_t
+show_sock_en(struct device *dev, struct device_attribute *attr,
+	     char *buf)
+{
+	struct net_device *net_dev = container_of(dev, struct net_device, dev);
+	struct kni_dev *kni = netdev_priv(net_dev);
+
+	return snprintf(buf, 10, "%u\n", (kni->vhost_queue == NULL ? 0 : 1));
+}
+
+static ssize_t
+set_sock_en(struct device *dev, struct device_attribute *attr,
+	    const char *buf, size_t count)
+{
+	struct net_device *net_dev = container_of(dev, struct net_device, dev);
+	struct kni_dev *kni = netdev_priv(net_dev);
+	unsigned long en;
+	int err = 0;
+
+	if (0 != strict_strtoul(buf, 0, &en))
+		return -EINVAL;
+
+	if (en)
+		err = kni_vhost_backend_init(kni);
+
+	return err ? err : count;
+}
+
+static DEVICE_ATTR(sock_fd, S_IRUGO | S_IRUSR, show_sock_fd, NULL);
+static DEVICE_ATTR(sock_en, S_IRUGO | S_IWUSR, show_sock_en, set_sock_en);
+static struct attribute *dev_attrs[] = {
+	&dev_attr_sock_fd.attr,
+	&dev_attr_sock_en.attr,
+	NULL,
+};
+
+static const struct attribute_group dev_attr_grp = {
+	.attrs = dev_attrs,
+};
+
+int
+kni_vhost_backend_release(struct kni_dev *kni)
+{
+	struct kni_vhost_queue *q = kni->vhost_queue;
+
+	if (q == NULL)
+		return 0;
+
+	/* detach from kni */
+	q->kni = NULL;
+
+	KNI_DBG("release backend done\n");
+
+	return 0;
+}
+
+int
+kni_vhost_init(struct kni_dev *kni)
+{
+	struct net_device *dev = kni->net_dev;
+
+	if (sysfs_create_group(&dev->dev.kobj, &dev_attr_grp))
+		sysfs_remove_group(&dev->dev.kobj, &dev_attr_grp);
+
+	kni->vq_status = BE_STOP;
+
+	KNI_DBG("kni_vhost_init done\n");
+
+	return 0;
+}
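
For reviewers, a minimal userspace sketch of how the new sysfs knobs are meant
to be consumed. This is an illustration only, not part of the patch: the
interface name "vEth0" and the vring index are assumptions, and the full
vhost-net setup (VHOST_SET_MEM_TABLE, vring configuration) is omitted. Note
that kni_vhost_backend_init() installs the socket with sock_map_fd() into the
file table of the process that writes sock_en, so the write and the vhost
ioctls must happen in the same process.

#include <fcntl.h>
#include <stdio.h>
#include <stdlib.h>
#include <sys/ioctl.h>
#include <unistd.h>
#include <linux/vhost.h>

int main(void)
{
	char buf[16];
	int fd, sock_fd, vhost_fd;
	struct vhost_vring_file backend;

	/* Writing 1 to sock_en makes the module create the raw socket;
	 * the descriptor lands in this process' fd table. */
	fd = open("/sys/class/net/vEth0/sock_en", O_WRONLY);
	if (fd < 0 || write(fd, "1", 1) != 1)
		return 1;
	close(fd);

	/* sock_fd reports the descriptor number of that socket. */
	fd = open("/sys/class/net/vEth0/sock_fd", O_RDONLY);
	if (fd < 0 || read(fd, buf, sizeof(buf)) <= 0)
		return 1;
	close(fd);
	sock_fd = atoi(buf);

	/* Hand the socket to vhost-net as a queue backend, the same way
	 * QEMU hands over a tap fd. Memory table and vring setup would
	 * normally precede this step. */
	vhost_fd = open("/dev/vhost-net", O_RDWR);
	if (vhost_fd < 0)
		return 1;
	if (ioctl(vhost_fd, VHOST_SET_OWNER, NULL) < 0)
		return 1;
	backend.index = 0;	/* assumed RX vring index */
	backend.fd = sock_fd;
	if (ioctl(vhost_fd, VHOST_NET_SET_BACKEND, &backend) < 0)
		return 1;

	printf("vhost-net attached to KNI socket fd %d\n", sock_fd);
	return 0;
}

This is also why sock_fd is read-only and sock_en has no "disable" path in
this patch: teardown happens through kni_sock_release() when the descriptor
is closed.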