vdpa/mlx5: control completion queue event mode

The CQ polling is necessary in order to manage guest notifications when
the guest doesn't work with poll mode (callfd != -1).

The CQ polling scheduling method can affect the host CPU utilization and
the traffic bandwidth.

Define 3 modes to control the CQ polling scheduling:

1. A timer thread which automatically adjusts its delays to the incoming
   traffic rate.
2. A timer thread with fixed delay time.
3. Interrupts: Each CQE burst arms the CQ in order to get an interrupt
   event in the next traffic burst.

When traffic stops, mode 3 is used automatically.

Interrupt management consumes many CPU cycles but forwards traffic
events to the guest very quickly.

A timer thread saves the interrupt overhead but may add delay to the
guest notification.

Add device arguments to control the mode.

Signed-off-by: Matan Azrad <matan@mellanox.com>
Reviewed-by: Maxime Coquelin <maxime.coquelin@redhat.com>
This commit is contained in:
Matan Azrad 2020-06-29 14:01:56 +00:00 committed by Ferruh Yigit
parent c5f714e50b
commit edc6391e45
4 changed files with 153 additions and 19 deletions

View File

@ -106,8 +106,40 @@ Run-time configuration
- **ethtool** operations on related kernel interfaces also affect the PMD.
Driver options
^^^^^^^^^^^^^^
- ``class`` parameter [string]
Select the class of the driver that should probe the device.
`vdpa` for the mlx5 vDPA driver.
- ``event_mode`` parameter [int]
- 0, Completion queue scheduling will be managed by a timer thread which
automatically adjusts its delays to the incoming traffic rate.
- 1, Completion queue scheduling will be managed by a timer thread with fixed
delay time.
- 2, Completion queue scheduling will be managed by interrupts. Each CQ burst
arms the CQ in order to get an interrupt event in the next traffic burst.
- Default mode is 0.
- ``event_us`` parameter [int]
Per-mode parameter in microseconds, relevant only for event modes 0 and 1:
- 0, A nonzero value sets the timer step in microseconds. The timer thread's
dynamic delay changes in steps of this value. Default value is 1us.
- 1, A nonzero value to set fixed timer delay in micro-seconds. Default value
is 100us.
- ``no_traffic_time`` parameter [int]
A nonzero value defines the traffic-off time, in seconds, after which the
driver moves to no-traffic mode. In this mode the timer events are stopped and
interrupts are armed on the device so that the driver is notified when traffic
resumes. Default value is 2s.

View File

@ -43,6 +43,7 @@
#define MLX5_VDPA_MAX_RETRIES 20
#define MLX5_VDPA_USEC 1000
/*
 * Default value, in seconds, of the "no_traffic_time" device argument: the
 * idle period after which the poll thread stops and the CQs are armed.
 */
#define MLX5_VDPA_DEFAULT_NO_TRAFFIC_TIME_S 2LLU
TAILQ_HEAD(mlx5_vdpa_privs, mlx5_vdpa_priv) priv_list =
TAILQ_HEAD_INITIALIZER(priv_list);
@ -624,6 +625,61 @@ mlx5_vdpa_roce_disable(struct rte_pci_addr *addr, struct ibv_device **ibv)
return -rte_errno;
}
/**
 * rte_kvargs callback parsing one vDPA device argument into the private data.
 *
 * The "class" key is accepted and ignored. Any other value must be a complete
 * non-negative integer (decimal, octal or hexadecimal, as accepted by
 * strtoull() with base 0). Recognized keys are "event_mode", "event_us" and
 * "no_traffic_time"; an unrecognized key or an out-of-range value only logs a
 * warning and leaves the current setting untouched.
 *
 * @param[in] key
 *   Device argument name.
 * @param[in] val
 *   Device argument value as a string.
 * @param[in, out] opaque
 *   Pointer to the struct mlx5_vdpa_priv to update.
 *
 * @return
 *   0 when the pair was consumed or ignored, a negative errno value when
 *   the value is missing or is not a valid integer.
 */
static int
mlx5_vdpa_args_check_handler(const char *key, const char *val, void *opaque)
{
	struct mlx5_vdpa_priv *priv = opaque;
	unsigned long long tmp;
	char *end = NULL;
	int err;

	if (strcmp(key, "class") == 0)
		return 0;
	if (val == NULL) {
		DRV_LOG(WARNING, "%s: missing integer value.", key);
		return -EINVAL;
	}
	errno = 0;
	tmp = strtoull(val, &end, 0);
	/* Save errno now: the logging call below may overwrite it. */
	err = errno;
	/*
	 * strtoull() reports neither "no digits" nor trailing garbage through
	 * errno, and it silently wraps negative input to a huge value.
	 */
	if (err == 0 &&
	    (end == val || *end != '\0' || strchr(val, '-') != NULL))
		err = EINVAL;
	if (err) {
		DRV_LOG(WARNING, "%s: \"%s\" is an invalid integer.", key, val);
		return -err;
	}
	if (strcmp(key, "event_mode") == 0) {
		if (tmp <= MLX5_VDPA_EVENT_MODE_ONLY_INTERRUPT)
			priv->event_mode = (int)tmp;
		else
			DRV_LOG(WARNING, "Invalid event_mode %s.", val);
	} else if (strcmp(key, "event_us") == 0) {
		/* Refuse values that the 32-bit field would truncate. */
		if (tmp <= UINT32_MAX)
			priv->event_us = (uint32_t)tmp;
		else
			DRV_LOG(WARNING, "Invalid event_us %s.", val);
	} else if (strcmp(key, "no_traffic_time") == 0) {
		if (tmp <= UINT32_MAX)
			priv->no_traffic_time_s = (uint32_t)tmp;
		else
			DRV_LOG(WARNING, "Invalid no_traffic_time %s.", val);
	} else {
		DRV_LOG(WARNING, "Invalid key %s.", key);
	}
	return 0;
}
/**
 * Fill the event-related configuration of a vDPA device.
 *
 * Defaults are set first (dynamic timer mode, default no-traffic time), then
 * overridden by the device arguments when they are present and parsable.
 * When "event_us" is left at zero it takes the default matching the selected
 * event mode. This fallback and the debug logs are applied on every path,
 * including when there are no device arguments or their parsing fails.
 *
 * @param[in] devargs
 *   Device arguments, may be NULL.
 * @param[out] priv
 *   Private data whose event_mode, event_us and no_traffic_time_s are set.
 */
static void
mlx5_vdpa_config_get(struct rte_devargs *devargs, struct mlx5_vdpa_priv *priv)
{
	struct rte_kvargs *kvlist = NULL;

	priv->event_mode = MLX5_VDPA_EVENT_MODE_DYNAMIC_TIMER;
	priv->event_us = 0;
	priv->no_traffic_time_s = MLX5_VDPA_DEFAULT_NO_TRAFFIC_TIME_S;
	if (devargs != NULL)
		kvlist = rte_kvargs_parse(devargs->args, NULL);
	if (kvlist != NULL) {
		/* Best effort: a rejected argument keeps the values set so far. */
		rte_kvargs_process(kvlist, NULL, mlx5_vdpa_args_check_handler,
				   priv);
		rte_kvargs_free(kvlist);
	}
	/* Zero means "not configured": pick the default of the chosen mode. */
	if (!priv->event_us) {
		if (priv->event_mode == MLX5_VDPA_EVENT_MODE_DYNAMIC_TIMER)
			priv->event_us = MLX5_VDPA_DEFAULT_TIMER_STEP_US;
		else if (priv->event_mode == MLX5_VDPA_EVENT_MODE_FIXED_TIMER)
			priv->event_us = MLX5_VDPA_DEFAULT_TIMER_DELAY_US;
	}
	DRV_LOG(DEBUG, "event mode is %d.", priv->event_mode);
	DRV_LOG(DEBUG, "event_us is %u us.", priv->event_us);
	DRV_LOG(DEBUG, "no traffic time is %u s.", priv->no_traffic_time_s);
}
/**
* DPDK callback to register a PCI device.
*
@ -713,6 +769,7 @@ mlx5_vdpa_pci_probe(struct rte_pci_driver *pci_drv __rte_unused,
rte_errno = rte_errno ? rte_errno : EINVAL;
goto error;
}
mlx5_vdpa_config_get(pci_dev->device.devargs, priv);
SLIST_INIT(&priv->mr_list);
pthread_mutex_lock(&priv_list_lock);
TAILQ_INSERT_TAIL(&priv_list, priv, next);

View File

@ -36,6 +36,9 @@
#define VIRTIO_F_RING_PACKED 34
#endif
/*
 * Default delay, in microseconds, of the fixed timer mode; also used as the
 * initial delay of the dynamic timer mode.
 */
#define MLX5_VDPA_DEFAULT_TIMER_DELAY_US 100u
/* Default step, in microseconds, added to the dynamic timer delay. */
#define MLX5_VDPA_DEFAULT_TIMER_STEP_US 1u
struct mlx5_vdpa_cq {
uint16_t log_desc_n;
uint32_t cq_ci:24;
@ -101,6 +104,12 @@ struct mlx5_vdpa_steer {
} rss[7];
};
/* CQ event scheduling modes; the values are those of the "event_mode" devarg. */
enum {
	/* Timer thread whose delay adapts to the traffic rate. */
	MLX5_VDPA_EVENT_MODE_DYNAMIC_TIMER = 0,
	/* Timer thread with a fixed delay. */
	MLX5_VDPA_EVENT_MODE_FIXED_TIMER = 1,
	/* No timer thread: CQs are re-armed and served by interrupts only. */
	MLX5_VDPA_EVENT_MODE_ONLY_INTERRUPT = 2
};
struct mlx5_vdpa_priv {
TAILQ_ENTRY(mlx5_vdpa_priv) next;
uint8_t configured;
@ -110,7 +119,10 @@ struct mlx5_vdpa_priv {
pthread_mutex_t timer_lock;
pthread_cond_t timer_cond;
volatile uint8_t timer_on;
int event_mode;
uint32_t event_us;
uint32_t timer_delay_us;
uint32_t no_traffic_time_s;
struct rte_vdpa_device *vdev; /* vDPA device. */
int vid; /* vhost device id. */
struct ibv_context *ctx; /* Device context. */

View File

@ -20,9 +20,6 @@
#include <sched.h>

#include "mlx5_vdpa.h"
#define MLX5_VDPA_DEFAULT_TIMER_DELAY_US 500u
#define MLX5_VDPA_NO_TRAFFIC_TIME_S 2LLU
void
mlx5_vdpa_event_qp_global_release(struct mlx5_vdpa_priv *priv)
{
@ -175,7 +172,8 @@ mlx5_vdpa_cq_create(struct mlx5_vdpa_priv *priv, uint16_t log_desc_n,
rte_errno = errno;
goto error;
}
if (callfd != -1) {
if (callfd != -1 &&
priv->event_mode != MLX5_VDPA_EVENT_MODE_ONLY_INTERRUPT) {
ret = mlx5_glue->devx_subscribe_devx_event_fd(priv->eventc,
callfd,
cq->cq->obj, 0);
@ -253,21 +251,43 @@ mlx5_vdpa_arm_all_cqs(struct mlx5_vdpa_priv *priv)
}
}
/**
 * Pause the poll thread between two CQ polling rounds.
 *
 * In dynamic timer mode the delay is adjusted first, according to max:
 * it grows by priv->event_us when max is 0, is kept when max is 1, and is
 * divided by max otherwise. In the other modes the delay is left as is.
 *
 * @param[in, out] priv
 *   Device private data; timer_delay_us may be updated.
 * @param[in] max
 *   Value computed by the poll thread for the last round (the largest
 *   per-CQ "comp" value it observed).
 */
static void
mlx5_vdpa_timer_sleep(struct mlx5_vdpa_priv *priv, uint32_t max)
{
	if (priv->event_mode == MLX5_VDPA_EVENT_MODE_DYNAMIC_TIMER) {
		switch (max) {
		case 0:
			priv->timer_delay_us += priv->event_us;
			break;
		case 1:
			break;
		default:
			priv->timer_delay_us /= max;
			break;
		}
	}
	if (priv->timer_delay_us)
		usleep(priv->timer_delay_us);
	else
		/*
		 * The division above can bring the delay down to zero, and
		 * usleep(0) has no effect: give up the CPU explicitly so the
		 * poll thread does not spin without ever yielding.
		 */
		sched_yield();
}
static void *
mlx5_vdpa_poll_handle(void *arg)
{
struct mlx5_vdpa_priv *priv = arg;
int i;
struct mlx5_vdpa_cq *cq;
uint32_t total;
uint32_t max;
uint64_t current_tic;
pthread_mutex_lock(&priv->timer_lock);
while (!priv->timer_on)
pthread_cond_wait(&priv->timer_cond, &priv->timer_lock);
pthread_mutex_unlock(&priv->timer_lock);
priv->timer_delay_us = priv->event_mode ==
MLX5_VDPA_EVENT_MODE_DYNAMIC_TIMER ?
MLX5_VDPA_DEFAULT_TIMER_DELAY_US :
priv->event_us;
while (1) {
total = 0;
max = 0;
for (i = 0; i < priv->nr_virtqs; i++) {
cq = &priv->virtqs[i].eqp.cq;
if (cq->cq && !cq->armed) {
@ -278,15 +298,16 @@ mlx5_vdpa_poll_handle(void *arg)
if (cq->callfd != -1)
eventfd_write(cq->callfd,
(eventfd_t)1);
total += comp;
if (comp > max)
max = comp;
}
}
}
current_tic = rte_rdtsc();
if (!total) {
if (!max) {
/* No traffic ? stop timer and load interrupts. */
if (current_tic - priv->last_traffic_tic >=
rte_get_timer_hz() * MLX5_VDPA_NO_TRAFFIC_TIME_S) {
rte_get_timer_hz() * priv->no_traffic_time_s) {
DRV_LOG(DEBUG, "Device %s traffic was stopped.",
priv->vdev->device->name);
mlx5_vdpa_arm_all_cqs(priv);
@ -296,12 +317,16 @@ mlx5_vdpa_poll_handle(void *arg)
pthread_cond_wait(&priv->timer_cond,
&priv->timer_lock);
pthread_mutex_unlock(&priv->timer_lock);
priv->timer_delay_us = priv->event_mode ==
MLX5_VDPA_EVENT_MODE_DYNAMIC_TIMER ?
MLX5_VDPA_DEFAULT_TIMER_DELAY_US :
priv->event_us;
continue;
}
} else {
priv->last_traffic_tic = current_tic;
}
usleep(priv->timer_delay_us);
mlx5_vdpa_timer_sleep(priv, max);
}
return NULL;
}
@ -327,6 +352,13 @@ mlx5_vdpa_interrupt_handler(void *cb_arg)
struct mlx5_vdpa_virtq, eqp);
mlx5_vdpa_cq_poll(cq);
if (priv->event_mode == MLX5_VDPA_EVENT_MODE_ONLY_INTERRUPT) {
mlx5_vdpa_cq_arm(priv, cq);
/* Notify guest for descs consuming. */
if (cq->callfd != -1)
eventfd_write(cq->callfd, (eventfd_t)1);
return;
}
/* Don't arm again - timer will take control. */
DRV_LOG(DEBUG, "Device %s virtq %d cq %d event was captured."
" Timer is %s, cq ci is %u.\n",
@ -356,15 +388,16 @@ mlx5_vdpa_cqe_event_setup(struct mlx5_vdpa_priv *priv)
if (!priv->eventc)
/* All virtqs are in poll mode. */
return 0;
pthread_mutex_init(&priv->timer_lock, NULL);
pthread_cond_init(&priv->timer_cond, NULL);
priv->timer_on = 0;
priv->timer_delay_us = MLX5_VDPA_DEFAULT_TIMER_DELAY_US;
ret = pthread_create(&priv->timer_tid, NULL, mlx5_vdpa_poll_handle,
(void *)priv);
if (ret) {
DRV_LOG(ERR, "Failed to create timer thread.");
return -1;
if (priv->event_mode != MLX5_VDPA_EVENT_MODE_ONLY_INTERRUPT) {
pthread_mutex_init(&priv->timer_lock, NULL);
pthread_cond_init(&priv->timer_cond, NULL);
priv->timer_on = 0;
ret = pthread_create(&priv->timer_tid, NULL,
mlx5_vdpa_poll_handle, (void *)priv);
if (ret) {
DRV_LOG(ERR, "Failed to create timer thread.");
return -1;
}
}
flags = fcntl(priv->eventc->fd, F_GETFL);
ret = fcntl(priv->eventc->fd, F_SETFL, flags | O_NONBLOCK);