numam-spdk/lib/vhost/vhost_internal.h

404 lines
13 KiB
C
Raw Normal View History

/*-
* BSD LICENSE
*
* Copyright (c) Intel Corporation.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
*
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in
* the documentation and/or other materials provided with the
* distribution.
* * Neither the name of Intel Corporation nor the names of its
* contributors may be used to endorse or promote products derived
* from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
* A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
* OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#ifndef SPDK_VHOST_INTERNAL_H
#define SPDK_VHOST_INTERNAL_H
#include "spdk/stdinc.h"
#include <rte_vhost.h>
#include "spdk_internal/log.h"
#include "spdk/event.h"
#include "spdk/rpc.h"
#include "spdk/config.h"
/* Cache line size used for aligning structures in this header; taken from DPDK. */
#define SPDK_CACHE_LINE_SIZE RTE_CACHE_LINE_SIZE
/*
 * Fallback definitions of feature bit numbers. Each one is only defined
 * here when the headers included above do not already provide it.
 */
#ifndef VHOST_USER_F_PROTOCOL_FEATURES
#define VHOST_USER_F_PROTOCOL_FEATURES 30
#endif
#ifndef VIRTIO_F_VERSION_1
#define VIRTIO_F_VERSION_1 32
#endif
#ifndef VIRTIO_BLK_F_MQ
#define VIRTIO_BLK_F_MQ 12 /* support more than one vq */
#endif
#ifndef VIRTIO_BLK_F_CONFIG_WCE
#define VIRTIO_BLK_F_CONFIG_WCE 11
#endif
/*
 * Number of entries in the per-session virtqueue array
 * (see struct spdk_vhost_session).
 */
#define SPDK_VHOST_MAX_VQUEUES 256
/* NOTE(review): presumably the largest supported virtqueue size, in descriptors - confirm against users. */
#define SPDK_VHOST_MAX_VQ_SIZE 1024
/* NOTE(review): presumably the maximum number of SCSI devices per vhost-scsi controller - confirm. */
#define SPDK_VHOST_SCSI_CTRLR_MAX_DEVS 8
/* NOTE(review): presumably the maximum number of iovec entries per request - confirm against users. */
#define SPDK_VHOST_IOVS_MAX 129
/*
 * Interval, in milliseconds, at which stats are checked for
 * interrupt coalescing.
 */
#define SPDK_VHOST_STATS_CHECK_INTERVAL_MS 10
/*
 * Default IOPS threshold at which interrupts start to be coalesced.
 */
#define SPDK_VHOST_VQ_IOPS_COALESCING_THRESHOLD 60000
/*
 * Default base coalescing delay, in microseconds.
 * Coalescing is currently not used by default; setting this to a
 * value > 0 here or by RPC will enable it.
 */
#define SPDK_VHOST_COALESCING_DELAY_BASE_US 0
/*
 * Bitmask of virtio/vhost feature bits common to vhost devices.
 * Each term converts a feature bit number into its 64-bit mask.
 * NOTE(review): how backends combine this with their own
 * virtio_features is not visible in this header - confirm in the
 * backend sources.
 */
#define SPDK_VHOST_FEATURES ((1ULL << VHOST_F_LOG_ALL) | \
(1ULL << VHOST_USER_F_PROTOCOL_FEATURES) | \
(1ULL << VIRTIO_F_VERSION_1) | \
(1ULL << VIRTIO_F_NOTIFY_ON_EMPTY) | \
(1ULL << VIRTIO_RING_F_EVENT_IDX) | \
(1ULL << VIRTIO_RING_F_INDIRECT_DESC))
/*
 * Bitmask of feature bits that are disabled. Note that both bits listed
 * here are also present in SPDK_VHOST_FEATURES.
 * NOTE(review): presumably these are masked out before negotiation -
 * confirm where disabled_features is applied.
 */
#define SPDK_VHOST_DISABLED_FEATURES ((1ULL << VIRTIO_RING_F_EVENT_IDX) | \
(1ULL << VIRTIO_F_NOTIFY_ON_EMPTY))
/*
 * Forward declaration. This header only ever handles poll groups through
 * pointers, so the full definition is not needed here.
 */
struct vhost_poll_group;
/**
 * Per-virtqueue state kept by SPDK on top of the rte_vhost vring.
 * Instances are aligned to SPDK_CACHE_LINE_SIZE.
 */
struct spdk_vhost_virtqueue {
	struct rte_vhost_vring vring;

	uint16_t last_avail_idx;
	uint16_t last_used_idx;

	void *tasks;

	/* Number of requests counted since the last stats check */
	uint32_t req_cnt;

	/* Number of requests counted since the last event */
	uint16_t used_req_cnt;

	/* Amount of time by which an interrupt is delayed */
	uint32_t irq_delay_time;

	/* Next point in time at which an event has to be sent */
	uint64_t next_event_time;

	/* Associated vhost_virtqueue in the virtio device's virtqueue list */
	uint32_t vring_idx;
} __attribute__((aligned(SPDK_CACHE_LINE_SIZE)));
struct spdk_vhost_session {
struct spdk_vhost_dev *vdev;
/* rte_vhost connection ID. */
int vid;
/* Unique session ID. */
uint64_t id;
/* Unique session name. */
char *name;
struct vhost_poll_group *poll_group;
bool initialized;
bool started;
bool needs_restart;
vhost/compat: start polling queues prematurely rte_vhost requires all queues to be fully initialized in order to start I/O processing. This behavior is not compliant with the vhost-user specification and doesn't work with QEMU 2.12+, which will only initialize 1 I/O queue for the SeaBIOS boot. Theoretically, we should start polling each virtqueue individually after receiving its SET_VRING_KICK message, but rte_vhost is not designed to poll individual queues. So we use a workaround to detect when a vhost session could be potentially at that SeaBIOS stage and we mark it to start polling as soon as its first virtqueue gets initialized. This doesn't hurt any non-QEMU vhost slaves and allows QEMU 2.12+ to boot correctly. SET_FEATURES could be sent at any time, but QEMU will send it at least once on SeaBIOS initialization - whenever powered-up or rebooted. Vhost sessions are still mostly started/stopped from within rte_vhost callbacks, but now there's additional concept of "forced" polling, in which SPDK starts sessions manually, while rte_vhost still thinks the sessions are stopped. This can potentially lead to cases where a session is "started" twice, or gets destroyed while it's still being polled (by force). Those cases also need to be handled within this patch. Change-Id: I70636d63e27914906ddece59cec34f1dd37ec5cd Signed-off-by: Darek Stojaczyk <dariusz.stojaczyk@intel.com> Reviewed-on: https://review.gerrithub.io/c/spdk/spdk/+/446086 Tested-by: SPDK CI Jenkins <sys_sgci@intel.com> Reviewed-by: Changpeng Liu <changpeng.liu@intel.com> Reviewed-by: Jim Harris <james.r.harris@intel.com>
2019-03-04 22:44:43 +00:00
bool forced_polling;
struct rte_vhost_memory *mem;
int task_cnt;
uint16_t max_queues;
uint64_t negotiated_features;
/* Local copy of device coalescing settings. */
uint32_t coalescing_delay_time_base;
uint32_t coalescing_io_rate_threshold;
/* Next time when stats for event coalescing will be checked. */
uint64_t next_stats_check_time;
/* Interval used for event coalescing checking. */
uint64_t stats_check_interval;
struct spdk_vhost_virtqueue virtqueue[SPDK_VHOST_MAX_VQUEUES];
TAILQ_ENTRY(spdk_vhost_session) tailq;
};
/**
 * A vhost device (controller) together with the list of sessions
 * currently connected to it.
 */
struct spdk_vhost_dev {
	char *name;
	char *path;

	struct spdk_cpuset *cpumask;
	bool registered;

	const struct spdk_vhost_dev_backend *backend;

	/*
	 * Original values used to set up coalescing, saved to avoid
	 * integer rounding issues during save/load config.
	 */
	uint32_t coalescing_delay_us;
	uint32_t coalescing_iops_threshold;

	/* Sessions currently connected to the device */
	TAILQ_HEAD(, spdk_vhost_session) vsessions;

	/* Session counter; only ever incremented */
	uint64_t vsessions_num;

	/* How many sessions are started and actively polled */
	uint32_t active_session_num;

	/* How many asynchronous operations are still pending */
	uint32_t pending_async_op_num;

	TAILQ_ENTRY(spdk_vhost_dev) tailq;
};
/**
 * Callback type invoked with a vhost device and one of its sessions,
 * e.g. by vhost_dev_foreach_session().
 *
 * \param vdev vhost device.
 * \param vsession vhost session.
 * \param arg user-provided parameter.
 *
 * \return negative values will break the foreach call, meaning
 * the function won't be called again. Return codes zero and
 * positive don't have any effect.
 */
typedef int (*spdk_vhost_session_fn)(struct spdk_vhost_dev *vdev,
struct spdk_vhost_session *vsession,
void *arg);
/**
 * Callback type invoked with a vhost device only, e.g. as the completion
 * callback of vhost_dev_foreach_session().
 *
 * \param vdev vhost device.
 * \param arg user-provided parameter.
 */
typedef void (*spdk_vhost_dev_fn)(struct spdk_vhost_dev *vdev, void *arg);
/**
 * Backend description of a vhost device type: its feature masks, the size
 * of its per-session context and the set of callbacks it provides.
 */
struct spdk_vhost_dev_backend {
	/* Feature bitmasks specific to this backend. */
	uint64_t virtio_features;
	uint64_t disabled_features;

	/**
	 * Size of the additional per-session context data that is
	 * allocated whenever a new client connects.
	 */
	size_t session_ctx_size;

	/* Session start/stop callbacks. */
	int (*start_session)(struct spdk_vhost_session *vsession);
	int (*stop_session)(struct spdk_vhost_session *vsession);

	/* Device config space accessors. */
	int (*vhost_get_config)(struct spdk_vhost_dev *vdev, uint8_t *config, uint32_t len);
	int (*vhost_set_config)(struct spdk_vhost_dev *vdev, uint8_t *config,
				uint32_t offset, uint32_t size, uint32_t flags);

	/* JSON output callbacks. */
	void (*dump_info_json)(struct spdk_vhost_dev *vdev, struct spdk_json_write_ctx *w);
	void (*write_config_json)(struct spdk_vhost_dev *vdev, struct spdk_json_write_ctx *w);

	/* Device removal callback. */
	int (*remove_device)(struct spdk_vhost_dev *vdev);
};
/*
 * NOTE(review): judging by the name, translates the guest physical address
 * range starting at addr and spanning len bytes into a local virtual
 * address for the given session. Confirm the value returned on failure
 * against the implementation.
 */
void *vhost_gpa_to_vva(struct spdk_vhost_session *vsession, uint64_t addr, uint64_t len);
/*
 * NOTE(review): presumably copies up to reqs_len request indices from the
 * avail ring of vq into reqs and returns how many were copied - confirm
 * against the implementation.
 */
uint16_t vhost_vq_avail_ring_get(struct spdk_vhost_virtqueue *vq, uint16_t *reqs,
uint16_t reqs_len);
/**
 * Get a virtio descriptor at given index in given virtqueue.
 * The descriptor will provide access to the entire descriptor
 * chain. The subsequent descriptors are accessible via
 * \c vhost_vring_desc_get_next.
 *
 * \param vsession vhost session
 * \param vq virtqueue
 * \param req_idx descriptor index
 * \param desc pointer to be set to the descriptor
 * \param desc_table descriptor table to be used with
 * \c vhost_vring_desc_get_next. This might be either
 * default virtqueue descriptor table or per-chain indirect
 * table.
 * \param desc_table_size size of the *desc_table*
 * \return 0 on success, -1 if given index is invalid.
 * If -1 is returned, the content of params is undefined.
 */
int vhost_vq_get_desc(struct spdk_vhost_session *vsession, struct spdk_vhost_virtqueue *vq,
uint16_t req_idx, struct vring_desc **desc, struct vring_desc **desc_table,
uint32_t *desc_table_size);
/**
 * Send IRQ/call client (if pending) for \c vq.
 *
 * \param vsession vhost session
 * \param vq virtqueue
 * \return
 * 0 - if no interrupt was signalled
 * 1 - if interrupt was signalled
 */
int vhost_vq_used_signal(struct spdk_vhost_session *vsession, struct spdk_vhost_virtqueue *vq);
/**
 * Send IRQs for all queues of the session that need to be signaled.
 *
 * \param vsession vhost session
 */
void vhost_session_used_signal(struct spdk_vhost_session *vsession);
/*
 * NOTE(review): presumably places an entry with the given descriptor id
 * and written length len on the used ring of vq - confirm against the
 * implementation, including whether the client gets signalled here.
 */
void vhost_vq_used_ring_enqueue(struct spdk_vhost_session *vsession,
struct spdk_vhost_virtqueue *vq,
uint16_t id, uint32_t len);
/**
 * Get subsequent descriptor from given table.
 *
 * \param desc in/out: on entry points to the current descriptor; on
 * return it is set to the next descriptor (NULL in case this is the
 * last descriptor in the chain or the next desc is invalid)
 * \param desc_table descriptor table
 * \param desc_table_size size of the *desc_table*
 * \return 0 on success, -1 if given index is invalid.
 * The *desc* param will be set regardless of the
 * return value.
 */
int vhost_vring_desc_get_next(struct vring_desc **desc,
struct vring_desc *desc_table, uint32_t desc_table_size);
/*
 * NOTE(review): presumably reports whether cur_desc is a device-writable
 * descriptor - confirm against the implementation.
 */
bool vhost_vring_desc_is_wr(struct vring_desc *cur_desc);
/*
 * NOTE(review): presumably appends the buffer described by desc to the
 * iov array, using and advancing *iov_index - confirm the return
 * convention against the implementation.
 */
int vhost_vring_desc_to_iov(struct spdk_vhost_session *vsession, struct iovec *iov,
uint16_t *iov_index, const struct vring_desc *desc);
static inline bool __attribute__((always_inline))
vhost_dev_has_feature(struct spdk_vhost_session *vsession, unsigned feature_id)
{
2018-12-13 10:51:34 +00:00
return vsession->negotiated_features & (1ULL << feature_id);
}
/*
 * Device registration and per-backend construction entry points.
 * NOTE(review): return conventions (presumably 0 on success, negative on
 * failure) and the format of mask_str are not visible in this header -
 * confirm against the implementations.
 */
int vhost_dev_register(struct spdk_vhost_dev *vdev, const char *name, const char *mask_str,
const struct spdk_vhost_dev_backend *backend);
int vhost_dev_unregister(struct spdk_vhost_dev *vdev);
int vhost_scsi_controller_construct(void);
int vhost_blk_controller_construct(void);
/* Writes information about vdev to the JSON write context w. */
void vhost_dump_info_json(struct spdk_vhost_dev *vdev, struct spdk_json_write_ctx *w);
/**
 * Call a function for each session of the provided vhost device.
 * The function will be called one-by-one on each session's thread.
 *
 * \param dev vhost device
 * \param fn function to call on each session's thread. A negative
 * return value breaks the iteration (see spdk_vhost_session_fn).
 * \param cpl_fn function to be called at the end of the iteration on
 * the vhost management thread.
 * Optional, can be NULL.
 * \param arg additional argument passed to both callbacks
 */
void vhost_dev_foreach_session(struct spdk_vhost_dev *dev,
spdk_vhost_session_fn fn,
spdk_vhost_dev_fn cpl_fn,
void *arg);
/**
 * Call a function for the session using the provided poll group and
 * block until either vhost_session_start_done() or
 * vhost_session_stop_done() is called.
 * NOTE(review): earlier wording said the function is called "on the
 * provided lcore"; the parameter is now a poll group - confirm the exact
 * execution context against the implementation.
 *
 * This must be called under the global vhost mutex, which this function
 * will unlock for the time it's waiting. It's meant to be called only
 * from start/stop session callbacks.
 *
 * \param pg designated session's poll group
 * \param vsession vhost session
 * \param cb_fn the function to call. The void *arg parameter in cb_fn
 * is always NULL.
 * \param timeout_sec timeout in seconds. This function will still
 * block after the timeout expires, but will print the provided errmsg.
 * \param errmsg error message to print once the timeout expires
 * \return the response code passed to vhost_session_start_done() or
 * vhost_session_stop_done().
 */
int vhost_session_send_event(struct vhost_poll_group *pg,
struct spdk_vhost_session *vsession,
spdk_vhost_session_fn cb_fn, unsigned timeout_sec,
const char *errmsg);
/**
 * Finish a blocking vhost_session_send_event() call and finally
 * start the session. This must be called on the target lcore, which
 * will now receive all session-related messages (e.g. from
 * vhost_dev_foreach_session()).
 *
 * Must be called under the global vhost lock.
 *
 * \param vsession vhost session
 * \param response return code
 */
void vhost_session_start_done(struct spdk_vhost_session *vsession, int response);
/**
 * Finish a blocking vhost_session_send_event() call and finally
 * stop the session. This must be called on the session's lcore which
 * used to receive all session-related messages (e.g. from
 * vhost_dev_foreach_session()). After this call, the session-
 * related messages will be once again processed by any arbitrary thread.
 *
 * Must be called under the global vhost lock.
 *
 * \param vsession vhost session
 * \param response return code
 */
void vhost_session_stop_done(struct spdk_vhost_session *vsession, int response);
/*
 * Look up a session by its rte_vhost connection ID (the vid field of
 * struct spdk_vhost_session).
 * NOTE(review): presumably returns NULL when no session matches - confirm.
 */
struct spdk_vhost_session *vhost_session_find_by_vid(int vid);
/*
 * Install rte_vhost compatibility hooks for a session / a device.
 * NOTE(review): what the hooks do is not visible in this header - see
 * the implementation.
 */
void vhost_session_install_rte_compat_hooks(struct spdk_vhost_session *vsession);
void vhost_dev_install_rte_compat_hooks(struct spdk_vhost_dev *vdev);
/*
 * Acquire / release a poll group.
 * NOTE(review): presumably the get call picks a poll group allowed by
 * cpumask and every get must be balanced by a put - confirm.
 */
struct vhost_poll_group *vhost_get_poll_group(struct spdk_cpuset *cpumask);
void vhost_put_poll_group(struct vhost_poll_group *pg);
/* NOTE(review): presumably removes the given vhost controller; return convention not visible here - confirm. */
int remove_vhost_controller(struct spdk_vhost_dev *vdev);
/*
 * vhost-nvme declarations. These are only available when
 * SPDK_CONFIG_VHOST_INTERNAL_LIB is defined.
 * NOTE(review): semantics and return conventions of these functions are
 * not visible in this header - confirm against the implementations.
 */
#ifdef SPDK_CONFIG_VHOST_INTERNAL_LIB
/* Operations addressed by rte_vhost connection ID (vid). */
int vhost_nvme_admin_passthrough(int vid, void *cmd, void *cqe, void *buf);
int vhost_nvme_set_cq_call(int vid, uint16_t qid, int fd);
int vhost_nvme_set_bar_mr(int vid, void *bar_addr, uint64_t bar_size);
int vhost_nvme_get_cap(int vid, uint64_t *cap);
/* Controller/device management. */
int vhost_nvme_controller_construct(void);
int vhost_nvme_dev_construct(const char *name, const char *cpumask, uint32_t io_queues);
int vhost_nvme_dev_remove(struct spdk_vhost_dev *vdev);
int vhost_nvme_dev_add_ns(struct spdk_vhost_dev *vdev,
const char *bdev_name);
#endif
#endif /* SPDK_VHOST_INTERNAL_H */