5569dd7d90
The KNI kthreads seem to be re-scheduled at a granularity of roughly 1 millisecond right now, which seems to be insufficient for performing tests involving a lot of control plane traffic. Even if KNI_KTHREAD_RESCHEDULE_INTERVAL is set to 5 microseconds, the existing code cannot reschedule at the desired granularity, due to the precision constraints of schedule_timeout_interruptible().

In our use case, we leverage the Linux kernel for the control plane, and it is not uncommon to see 60K - 100K pps for some signaling protocols. Since we are not in atomic context, usleep_range() seems more appropriate for introducing small controlled delays, in the range of 5-10 microseconds. Upon reading the existing code, it would seem that this was the original intent; sub-millisecond delays are not feasible with a call to schedule_timeout_interruptible():

  #define KNI_KTHREAD_RESCHEDULE_INTERVAL 5 /* us */

  schedule_timeout_interruptible(
          usecs_to_jiffies(KNI_KTHREAD_RESCHEDULE_INTERVAL));

Below is a brief comparison between the existing implementation, which uses schedule_timeout_interruptible(), and usleep_range(). We measure the CPU usage of the kni_single kthread and the RTT between two KNI interfaces created on top of vmxnet3 adapters connected by a vSwitch.

  insmod rte_kni.ko kthread_mode=single carrier=on

schedule_timeout_interruptible(usecs_to_jiffies(5)), kni_single CPU usage: 2-4%

  [root@localhost ~]# ping 1.1.1.2 -I eth1
  PING 1.1.1.2 (1.1.1.2) from 1.1.1.1 eth1: 56(84) bytes of data.
  64 bytes from 1.1.1.2: icmp_seq=1 ttl=64 time=2.70 ms
  64 bytes from 1.1.1.2: icmp_seq=2 ttl=64 time=1.00 ms
  64 bytes from 1.1.1.2: icmp_seq=3 ttl=64 time=1.99 ms
  64 bytes from 1.1.1.2: icmp_seq=4 ttl=64 time=0.985 ms
  64 bytes from 1.1.1.2: icmp_seq=5 ttl=64 time=1.00 ms

usleep_range(5, 10), kni_single CPU usage: 50%

  64 bytes from 1.1.1.2: icmp_seq=1 ttl=64 time=0.338 ms
  64 bytes from 1.1.1.2: icmp_seq=2 ttl=64 time=0.150 ms
  64 bytes from 1.1.1.2: icmp_seq=3 ttl=64 time=0.123 ms
  64 bytes from 1.1.1.2: icmp_seq=4 ttl=64 time=0.139 ms
  64 bytes from 1.1.1.2: icmp_seq=5 ttl=64 time=0.159 ms

usleep_range(20, 50), kni_single CPU usage: 24%

  64 bytes from 1.1.1.2: icmp_seq=1 ttl=64 time=0.202 ms
  64 bytes from 1.1.1.2: icmp_seq=2 ttl=64 time=0.170 ms
  64 bytes from 1.1.1.2: icmp_seq=3 ttl=64 time=0.171 ms
  64 bytes from 1.1.1.2: icmp_seq=4 ttl=64 time=0.248 ms
  64 bytes from 1.1.1.2: icmp_seq=5 ttl=64 time=0.185 ms

usleep_range(50, 100), kni_single CPU usage: 13%

  64 bytes from 1.1.1.2: icmp_seq=1 ttl=64 time=0.537 ms
  64 bytes from 1.1.1.2: icmp_seq=2 ttl=64 time=0.257 ms
  64 bytes from 1.1.1.2: icmp_seq=3 ttl=64 time=0.231 ms
  64 bytes from 1.1.1.2: icmp_seq=4 ttl=64 time=0.143 ms
  64 bytes from 1.1.1.2: icmp_seq=5 ttl=64 time=0.200 ms

usleep_range(100, 200), kni_single CPU usage: 7%

  64 bytes from 1.1.1.2: icmp_seq=1 ttl=64 time=0.716 ms
  64 bytes from 1.1.1.2: icmp_seq=2 ttl=64 time=0.167 ms
  64 bytes from 1.1.1.2: icmp_seq=3 ttl=64 time=0.459 ms
  64 bytes from 1.1.1.2: icmp_seq=4 ttl=64 time=0.455 ms
  64 bytes from 1.1.1.2: icmp_seq=5 ttl=64 time=0.252 ms

usleep_range(1000, 1100), kni_single CPU usage: 2%

  64 bytes from 1.1.1.2: icmp_seq=1 ttl=64 time=2.22 ms
  64 bytes from 1.1.1.2: icmp_seq=2 ttl=64 time=1.17 ms
  64 bytes from 1.1.1.2: icmp_seq=3 ttl=64 time=1.17 ms
  64 bytes from 1.1.1.2: icmp_seq=4 ttl=64 time=1.17 ms
  64 bytes from 1.1.1.2: icmp_seq=5 ttl=64 time=1.15 ms

Upon testing, usleep_range(1000, 1100) seems roughly equivalent in latency and CPU usage to the schedule_timeout_interruptible() variant, while usleep_range(100, 200) seems to give a decent trade-off between latency and CPU usage, while still allowing users to tweak the limits for improved precision if their use case requires it.

Interestingly, disabling RTE_KNI_PREEMPT_DEFAULT leads to a soft lockup on my kernel:

  Kernel panic - not syncing: softlockup: hung tasks
  CPU: 0 PID: 1226 Comm: kni_single Tainted: G W O 3.10 #1
  <IRQ>  [<ffffffff814f84de>] dump_stack+0x19/0x1b
   [<ffffffff814f7891>] panic+0xcd/0x1e0
   [<ffffffff810993b0>] watchdog_timer_fn+0x160/0x160
   [<ffffffff810644b2>] __run_hrtimer.isra.4+0x42/0xd0
   [<ffffffff81064b57>] hrtimer_interrupt+0xe7/0x1f0
   [<ffffffff8102cd57>] smp_apic_timer_interrupt+0x67/0xa0
   [<ffffffff8150321d>] apic_timer_interrupt+0x6d/0x80

This patch also removes that option.

References:
[1] https://www.kernel.org/doc/Documentation/timers/timers-howto.txt

Signed-off-by: Tudor Cornea <tudor.cornea@gmail.com>
Acked-by: Padraig Connolly <Padraig.J.Connolly@intel.com>
Reviewed-by: Ferruh Yigit <ferruh.yigit@intel.com>
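
For reference, the sketch below illustrates the shape of the change the measurements above compare. It is a minimal sketch only: the kthread loop itself lives outside the header shown below, and the names min_scheduling_interval / max_scheduling_interval (and the 100/200 us defaults) are illustrative assumptions chosen to match the trade-off discussed above, not declarations taken from this file.

#include <linux/delay.h>	/* usleep_range() */
#include <linux/kthread.h>
#include <linux/module.h>

/* Illustrative module parameters bounding the sleep window, in us. */
static unsigned int min_scheduling_interval = 100;
static unsigned int max_scheduling_interval = 200;
module_param(min_scheduling_interval, uint, 0644);
module_param(max_scheduling_interval, uint, 0644);

/* Simplified single-kthread loop; a real module starts it with kthread_run(). */
static int kni_thread_single(void *data)
{
	while (!kthread_should_stop()) {
		/* ... burst RX/TX work for each registered KNI device ... */

		/*
		 * Before: usecs_to_jiffies(5) rounds up to one jiffy,
		 * i.e. a floor of ~1 ms at HZ=1000:
		 *
		 *	schedule_timeout_interruptible(usecs_to_jiffies(
		 *		KNI_KTHREAD_RESCHEDULE_INTERVAL));
		 *
		 * After: an hrtimer-backed sleep that honors
		 * microsecond-scale bounds.
		 */
		usleep_range(min_scheduling_interval,
			     max_scheduling_interval);
	}

	return 0;
}

Per the timers-howto document in [1], usleep_range() is backed by hrtimers and may wake anywhere inside the [min, max] window, which is what makes the upper bound a useful power/latency knob. With module parameters like the assumed ones above, the trade-off can be picked at load time, e.g. insmod rte_kni.ko kthread_mode=single min_scheduling_interval=100 max_scheduling_interval=200.
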
kni_dev.h (136 lines, 2.8 KiB, C)
/* SPDX-License-Identifier: GPL-2.0 */
/*
 * Copyright(c) 2010-2014 Intel Corporation.
 */

#ifndef _KNI_DEV_H_
#define _KNI_DEV_H_

#ifdef pr_fmt
#undef pr_fmt
#endif
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

#define KNI_VERSION "1.0"

#include "compat.h"

#include <linux/if.h>
#include <linux/wait.h>
#ifdef HAVE_SIGNAL_FUNCTIONS_OWN_HEADER
#include <linux/sched/signal.h>
#else
#include <linux/sched.h>
#endif
#include <linux/netdevice.h>
#include <linux/spinlock.h>
#include <linux/list.h>

#include <rte_kni_common.h>
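/*
 * Upper bound, in microseconds, accepted for the kthread rescheduling
 * interval that users can tweak (per the commit message above).
 */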
#define KNI_KTHREAD_MAX_RESCHEDULE_INTERVAL 1000000 /* us */

#define MBUF_BURST_SZ 32

/* Default carrier state for created KNI network interfaces */
extern uint32_t kni_dflt_carrier;

/* Request processing support for bifurcated drivers. */
extern uint32_t bifurcated_support;

/**
 * A structure describing the private information for a kni device.
 */
struct kni_dev {
	/* kni list */
	struct list_head list;

	uint8_t iova_mode;

	uint32_t core_id;            /* Core ID to bind */
	char name[RTE_KNI_NAMESIZE]; /* Network device name */
	struct task_struct *pthread;

	/* wait queue for req/resp */
	wait_queue_head_t wq;
	struct mutex sync_lock;

	/* kni device */
	struct net_device *net_dev;

	/* queue for packets to be sent out */
	struct rte_kni_fifo *tx_q;

	/* queue for the packets received */
	struct rte_kni_fifo *rx_q;

	/* queue for the allocated mbufs that can be used to save sk buffs */
	struct rte_kni_fifo *alloc_q;

	/* free queue for the mbufs to be freed */
	struct rte_kni_fifo *free_q;

	/* request queue */
	struct rte_kni_fifo *req_q;

	/* response queue */
	struct rte_kni_fifo *resp_q;

	void *sync_kva;
	void *sync_va;

	void *mbuf_kva;
	void *mbuf_va;

	/* mbuf size */
	uint32_t mbuf_size;

	/* buffers */
	void *pa[MBUF_BURST_SZ];
	void *va[MBUF_BURST_SZ];
	void *alloc_pa[MBUF_BURST_SZ];
	void *alloc_va[MBUF_BURST_SZ];

	struct task_struct *usr_tsk;
};

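/*
 * IOVA-to-KVA translation helpers, used when iova_mode is set and the
 * addresses shared by the user-space application are I/O virtual
 * addresses. The page backing the IOVA is pinned briefly with
 * get_user_pages_remote(), its physical address is derived with
 * page_to_phys(), and phys_to_virt() then yields a kernel virtual
 * address through the linear mapping.
 */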
#ifdef HAVE_IOVA_TO_KVA_MAPPING_SUPPORT
static inline phys_addr_t iova_to_phys(struct task_struct *tsk,
				       unsigned long iova)
{
	phys_addr_t offset, phys_addr;
	struct page *page = NULL;
	long ret;

	offset = iova & (PAGE_SIZE - 1);

	/* Read one page struct info */
#ifdef HAVE_TSK_IN_GUP
	ret = get_user_pages_remote(tsk, tsk->mm, iova, 1,
				    FOLL_TOUCH, &page, NULL, NULL);
#else
	ret = get_user_pages_remote(tsk->mm, iova, 1,
				    FOLL_TOUCH, &page, NULL, NULL);
#endif
	if (ret < 0)
		return 0;

	phys_addr = page_to_phys(page) | offset;
	put_page(page);

	return phys_addr;
}

static inline void *iova_to_kva(struct task_struct *tsk, unsigned long iova)
{
	return phys_to_virt(iova_to_phys(tsk, iova));
}
#endif

void kni_net_release_fifo_phy(struct kni_dev *kni);
void kni_net_rx(struct kni_dev *kni);
void kni_net_init(struct net_device *dev);
void kni_net_config_lo_mode(char *lo_str);
void kni_net_poll_resp(struct kni_dev *kni);

#endif