numam-spdk/lib/vhost/vhost_blk.c
Liu Xiaodong ba32ee2b29 vhost: set timeout for session's stop_poller
If there is still some inflight IO which prevents
vhost_session_stop_done(), stop_poller can retry for
up to 4 seconds and then call vhost_session_stop_done()
with -ETIMEDOUT.

This avoids endless blocking of the ctrl pthread if there
is no response from the vhost session or its backend bdev.
The SPDK vhost target can then still serve all other vhost
devices and operations besides the failed one.

Change-Id: I2fc78b4da926c936a2e42dc0e66ce1c60001330d
Signed-off-by: Liu Xiaodong <xiaodong.liu@intel.com>
Reviewed-on: https://review.spdk.io/gerrit/c/spdk/spdk/+/10393
Community-CI: Broadcom CI <spdk-ci.pdl@broadcom.com>
Tested-by: SPDK CI Jenkins <sys_sgci@intel.com>
Reviewed-by: Changpeng Liu <changpeng.liu@intel.com>
Reviewed-by: Jim Harris <james.r.harris@intel.com>
2022-01-07 15:24:12 +00:00


/*-
* BSD LICENSE
*
* Copyright (c) Intel Corporation.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
*
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in
* the documentation and/or other materials provided with the
* distribution.
* * Neither the name of Intel Corporation nor the names of its
* contributors may be used to endorse or promote products derived
* from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
* A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
* OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include <linux/virtio_blk.h>
#include "spdk/env.h"
#include "spdk/bdev.h"
#include "spdk/bdev_module.h"
#include "spdk/thread.h"
#include "spdk/likely.h"
#include "spdk/string.h"
#include "spdk/util.h"
#include "spdk/vhost.h"
#include "vhost_internal.h"
#include <rte_version.h>
/* Minimal set of features supported by every SPDK VHOST-BLK device */
#define SPDK_VHOST_BLK_FEATURES_BASE (SPDK_VHOST_FEATURES | \
(1ULL << VIRTIO_BLK_F_SIZE_MAX) | (1ULL << VIRTIO_BLK_F_SEG_MAX) | \
(1ULL << VIRTIO_BLK_F_GEOMETRY) | (1ULL << VIRTIO_BLK_F_BLK_SIZE) | \
(1ULL << VIRTIO_BLK_F_TOPOLOGY) | (1ULL << VIRTIO_BLK_F_BARRIER) | \
(1ULL << VIRTIO_BLK_F_SCSI) | (1ULL << VIRTIO_BLK_F_CONFIG_WCE) | \
(1ULL << VIRTIO_BLK_F_MQ))
/* Features that are not supported */
#define SPDK_VHOST_BLK_DISABLED_FEATURES (SPDK_VHOST_DISABLED_FEATURES | \
(1ULL << VIRTIO_BLK_F_GEOMETRY) | (1ULL << VIRTIO_BLK_F_CONFIG_WCE) | \
(1ULL << VIRTIO_BLK_F_BARRIER) | (1ULL << VIRTIO_BLK_F_SCSI))
/* Vhost-blk supported protocol features */
#define SPDK_VHOST_BLK_PROTOCOL_FEATURES ((1ULL << VHOST_USER_PROTOCOL_F_CONFIG) | \
(1ULL << VHOST_USER_PROTOCOL_F_INFLIGHT_SHMFD))
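/* VHOST_USER_PROTOCOL_F_CONFIG lets the vhost-user master query the virtio_blk_config
* space from this target, and VHOST_USER_PROTOCOL_F_INFLIGHT_SHMFD enables the shared
* inflight-descriptor region that is replayed on reconnect (see submit_inflight_desc()).
*/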
struct spdk_vhost_blk_task {
struct spdk_bdev_io *bdev_io;
struct spdk_vhost_blk_session *bvsession;
struct spdk_vhost_virtqueue *vq;
volatile uint8_t *status;
uint16_t req_idx;
uint16_t num_descs;
uint16_t buffer_id;
uint16_t inflight_head;
/* for io wait */
struct spdk_bdev_io_wait_entry bdev_io_wait;
/* If set, the task is currently used for I/O processing. */
bool used;
/** Number of bytes that were written. */
uint32_t used_len;
uint16_t iovcnt;
struct iovec iovs[SPDK_VHOST_IOVS_MAX];
/** Size of whole payload in bytes */
uint32_t payload_size;
};
struct spdk_vhost_blk_dev {
struct spdk_vhost_dev vdev;
struct spdk_bdev *bdev;
struct spdk_bdev_desc *bdev_desc;
/* dummy_io_channel is used to hold a bdev reference */
struct spdk_io_channel *dummy_io_channel;
bool readonly;
};
struct spdk_vhost_blk_session {
/* The parent session must be the very first field in this struct */
struct spdk_vhost_session vsession;
struct spdk_vhost_blk_dev *bvdev;
struct spdk_poller *requestq_poller;
struct spdk_io_channel *io_channel;
struct spdk_poller *stop_poller;
};
/* forward declaration */
static const struct spdk_vhost_dev_backend vhost_blk_device_backend;
static int
process_blk_request(struct spdk_vhost_blk_task *task,
struct spdk_vhost_blk_session *bvsession);
static struct spdk_vhost_blk_session *
to_blk_session(struct spdk_vhost_session *vsession)
{
assert(vsession->vdev->backend == &vhost_blk_device_backend);
return (struct spdk_vhost_blk_session *)vsession;
}
static void
blk_task_finish(struct spdk_vhost_blk_task *task)
{
assert(task->bvsession->vsession.task_cnt > 0);
task->bvsession->vsession.task_cnt--;
task->used = false;
}
static void
blk_task_init(struct spdk_vhost_blk_task *task)
{
task->used = true;
task->iovcnt = SPDK_COUNTOF(task->iovs);
task->status = NULL;
task->used_len = 0;
task->payload_size = 0;
}
static void
blk_task_enqueue(struct spdk_vhost_blk_task *task)
{
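/* Completion path: a packed ring returns descriptors by buffer_id and descriptor count,
* while a split ring returns them by the request's head index in the used ring.
*/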
if (task->vq->packed.packed_ring) {
vhost_vq_packed_ring_enqueue(&task->bvsession->vsession, task->vq,
task->num_descs,
task->buffer_id, task->used_len,
task->inflight_head);
} else {
vhost_vq_used_ring_enqueue(&task->bvsession->vsession, task->vq,
task->req_idx, task->used_len);
}
}
static void
invalid_blk_request(struct spdk_vhost_blk_task *task, uint8_t status)
{
if (task->status) {
*task->status = status;
}
blk_task_enqueue(task);
blk_task_finish(task);
SPDK_DEBUGLOG(vhost_blk_data, "Invalid request (status=%" PRIu8")\n", status);
}
/*
* Process the task's descriptor chain and set up the data-related fields.
* Returns 0 on success and stores the total size of the supplied buffers in *length.
*
* FIXME: Make this function return rd_cnt and wr_cnt
*/
static int
blk_iovs_split_queue_setup(struct spdk_vhost_blk_session *bvsession,
struct spdk_vhost_virtqueue *vq,
uint16_t req_idx, struct iovec *iovs, uint16_t *iovs_cnt, uint32_t *length)
{
struct spdk_vhost_session *vsession = &bvsession->vsession;
struct spdk_vhost_dev *vdev = vsession->vdev;
struct vring_desc *desc, *desc_table;
uint16_t out_cnt = 0, cnt = 0;
uint32_t desc_table_size, len = 0;
uint32_t desc_handled_cnt;
int rc;
rc = vhost_vq_get_desc(vsession, vq, req_idx, &desc, &desc_table, &desc_table_size);
if (rc != 0) {
SPDK_ERRLOG("%s: invalid descriptor at index %"PRIu16".\n", vdev->name, req_idx);
return -1;
}
desc_handled_cnt = 0;
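/* Walk the descriptor chain, translating each guest buffer into host iovecs.
* Count writable descriptors so we can verify that a response buffer was supplied.
*/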
while (1) {
/*
* Maximum cnt reached?
* Should not happen if request is well formatted, otherwise this is a BUG.
*/
if (spdk_unlikely(cnt == *iovs_cnt)) {
SPDK_DEBUGLOG(vhost_blk, "%s: max IOVs in request reached (req_idx = %"PRIu16").\n",
vsession->name, req_idx);
return -1;
}
if (spdk_unlikely(vhost_vring_desc_to_iov(vsession, iovs, &cnt, desc))) {
SPDK_DEBUGLOG(vhost_blk, "%s: invalid descriptor %" PRIu16" (req_idx = %"PRIu16").\n",
vsession->name, req_idx, cnt);
return -1;
}
len += desc->len;
out_cnt += vhost_vring_desc_is_wr(desc);
rc = vhost_vring_desc_get_next(&desc, desc_table, desc_table_size);
if (rc != 0) {
SPDK_ERRLOG("%s: descriptor chain at index %"PRIu16" terminated unexpectedly.\n",
vsession->name, req_idx);
return -1;
} else if (desc == NULL) {
break;
}
desc_handled_cnt++;
if (spdk_unlikely(desc_handled_cnt > desc_table_size)) {
/* Break a cycle and report an error, if any. */
SPDK_ERRLOG("%s: found a cycle in the descriptor chain: desc_table_size = %d, desc_handled_cnt = %d.\n",
vsession->name, desc_table_size, desc_handled_cnt);
return -1;
}
}
/*
* There must be at least two descriptors.
* The first contains the request, so it must be readable.
* The last descriptor contains the buffer for the response, so it must be writable.
*/
if (spdk_unlikely(out_cnt == 0 || cnt < 2)) {
return -1;
}
*length = len;
*iovs_cnt = cnt;
return 0;
}
static int
blk_iovs_packed_desc_setup(struct spdk_vhost_session *vsession,
struct spdk_vhost_virtqueue *vq, uint16_t req_idx,
struct vring_packed_desc *desc_table, uint16_t desc_table_size,
struct iovec *iovs, uint16_t *iovs_cnt, uint32_t *length)
{
struct vring_packed_desc *desc;
uint16_t cnt = 0, out_cnt = 0;
uint32_t len = 0;
if (desc_table == NULL) {
desc = &vq->vring.desc_packed[req_idx];
} else {
req_idx = 0;
desc = desc_table;
}
while (1) {
/*
* Maximum cnt reached?
* Should not happen if request is well formatted, otherwise this is a BUG.
*/
if (spdk_unlikely(cnt == *iovs_cnt)) {
SPDK_ERRLOG("%s: max IOVs in request reached (req_idx = %"PRIu16").\n",
vsession->name, req_idx);
return -EINVAL;
}
if (spdk_unlikely(vhost_vring_packed_desc_to_iov(vsession, iovs, &cnt, desc))) {
SPDK_ERRLOG("%s: invalid descriptor %" PRIu16" (req_idx = %"PRIu16").\n",
vsession->name, req_idx, cnt);
return -EINVAL;
}
len += desc->len;
out_cnt += vhost_vring_packed_desc_is_wr(desc);
/* A NULL desc means we have reached the last desc of this request */
vhost_vring_packed_desc_get_next(&desc, &req_idx, vq, desc_table, desc_table_size);
if (desc == NULL) {
break;
}
}
/*
* There must be at least two descriptors.
* The first contains the request, so it must be readable.
* The last descriptor contains the buffer for the response, so it must be writable.
*/
if (spdk_unlikely(out_cnt == 0 || cnt < 2)) {
return -EINVAL;
}
*length = len;
*iovs_cnt = cnt;
return 0;
}
static int
blk_iovs_packed_queue_setup(struct spdk_vhost_blk_session *bvsession,
struct spdk_vhost_virtqueue *vq, uint16_t req_idx,
struct iovec *iovs, uint16_t *iovs_cnt, uint32_t *length)
{
struct spdk_vhost_session *vsession = &bvsession->vsession;
struct spdk_vhost_dev *vdev = vsession->vdev;
struct vring_packed_desc *desc = NULL, *desc_table;
uint32_t desc_table_size;
int rc;
rc = vhost_vq_get_desc_packed(vsession, vq, req_idx, &desc,
&desc_table, &desc_table_size);
if (spdk_unlikely(rc != 0)) {
SPDK_ERRLOG("%s: Invalid descriptor at index %"PRIu16".\n", vdev->name, req_idx);
return rc;
}
return blk_iovs_packed_desc_setup(vsession, vq, req_idx, desc_table, desc_table_size,
iovs, iovs_cnt, length);
}
static int
blk_iovs_inflight_queue_setup(struct spdk_vhost_blk_session *bvsession,
struct spdk_vhost_virtqueue *vq, uint16_t req_idx,
struct iovec *iovs, uint16_t *iovs_cnt, uint32_t *length)
{
struct spdk_vhost_session *vsession = &bvsession->vsession;
struct spdk_vhost_dev *vdev = vsession->vdev;
spdk_vhost_inflight_desc *inflight_desc;
struct vring_packed_desc *desc_table;
uint16_t out_cnt = 0, cnt = 0;
uint32_t desc_table_size, len = 0;
int rc = 0;
rc = vhost_inflight_queue_get_desc(vsession, vq->vring_inflight.inflight_packed->desc,
req_idx, &inflight_desc, &desc_table, &desc_table_size);
if (spdk_unlikely(rc != 0)) {
SPDK_ERRLOG("%s: Invalid descriptor at index %"PRIu16".\n", vdev->name, req_idx);
return rc;
}
if (desc_table != NULL) {
return blk_iovs_packed_desc_setup(vsession, vq, req_idx, desc_table, desc_table_size,
iovs, iovs_cnt, length);
}
while (1) {
/*
* Maximum cnt reached?
* Should not happen if request is well formatted, otherwise this is a BUG.
*/
if (spdk_unlikely(cnt == *iovs_cnt)) {
SPDK_ERRLOG("%s: max IOVs in request reached (req_idx = %"PRIu16").\n",
vsession->name, req_idx);
return -EINVAL;
}
if (spdk_unlikely(vhost_vring_inflight_desc_to_iov(vsession, iovs, &cnt, inflight_desc))) {
SPDK_ERRLOG("%s: invalid descriptor %" PRIu16" (req_idx = %"PRIu16").\n",
vsession->name, req_idx, cnt);
return -EINVAL;
}
len += inflight_desc->len;
out_cnt += vhost_vring_inflight_desc_is_wr(inflight_desc);
/* A descriptor without F_NEXT is the last desc of this request */
if ((inflight_desc->flags & VRING_DESC_F_NEXT) == 0) {
break;
}
inflight_desc = &vq->vring_inflight.inflight_packed->desc[inflight_desc->next];
}
/*
* There must be at least two descriptors.
* The first contains the request, so it must be readable.
* The last descriptor contains the buffer for the response, so it must be writable.
*/
if (spdk_unlikely(out_cnt == 0 || cnt < 2)) {
return -EINVAL;
}
*length = len;
*iovs_cnt = cnt;
return 0;
}
static void
blk_request_finish(bool success, struct spdk_vhost_blk_task *task)
{
*task->status = success ? VIRTIO_BLK_S_OK : VIRTIO_BLK_S_IOERR;
blk_task_enqueue(task);
SPDK_DEBUGLOG(vhost_blk, "Finished task (%p) req_idx=%d\n status: %s\n", task,
task->req_idx, success ? "OK" : "FAIL");
blk_task_finish(task);
}
static void
blk_request_complete_cb(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg)
{
struct spdk_vhost_blk_task *task = cb_arg;
spdk_bdev_free_io(bdev_io);
blk_request_finish(success, task);
}
static void
blk_request_resubmit(void *arg)
{
struct spdk_vhost_blk_task *task = (struct spdk_vhost_blk_task *)arg;
int rc = 0;
rc = process_blk_request(task, task->bvsession);
if (rc == 0) {
SPDK_DEBUGLOG(vhost_blk, "====== Task %p resubmitted ======\n", task);
} else {
SPDK_DEBUGLOG(vhost_blk, "====== Task %p failed ======\n", task);
}
}
static inline void
blk_request_queue_io(struct spdk_vhost_blk_task *task)
{
int rc;
struct spdk_vhost_blk_session *bvsession = task->bvsession;
struct spdk_bdev *bdev = bvsession->bvdev->bdev;
task->bdev_io_wait.bdev = bdev;
task->bdev_io_wait.cb_fn = blk_request_resubmit;
task->bdev_io_wait.cb_arg = task;
rc = spdk_bdev_queue_io_wait(bdev, bvsession->io_channel, &task->bdev_io_wait);
if (rc != 0) {
SPDK_ERRLOG("%s: failed to queue I/O, rc=%d\n", bvsession->vsession.name, rc);
invalid_blk_request(task, VIRTIO_BLK_S_IOERR);
}
}
static int
process_blk_request(struct spdk_vhost_blk_task *task,
struct spdk_vhost_blk_session *bvsession)
{
struct spdk_vhost_blk_dev *bvdev = bvsession->bvdev;
const struct virtio_blk_outhdr *req;
struct virtio_blk_discard_write_zeroes *desc;
struct iovec *iov;
uint32_t type;
uint64_t flush_bytes;
uint32_t payload_len;
int rc;
iov = &task->iovs[0];
if (spdk_unlikely(iov->iov_len != sizeof(*req))) {
SPDK_DEBUGLOG(vhost_blk,
"First descriptor size is %zu but expected %zu (req_idx = %"PRIu16").\n",
iov->iov_len, sizeof(*req), task->req_idx);
invalid_blk_request(task, VIRTIO_BLK_S_UNSUPP);
return -1;
}
req = iov->iov_base;
iov = &task->iovs[task->iovcnt - 1];
if (spdk_unlikely(iov->iov_len != 1)) {
SPDK_DEBUGLOG(vhost_blk,
"Last descriptor size is %zu but expected %d (req_idx = %"PRIu16").\n",
iov->iov_len, 1, task->req_idx);
invalid_blk_request(task, VIRTIO_BLK_S_UNSUPP);
return -1;
}
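/* The first iov held the virtio_blk_outhdr and the last iov holds the one-byte status;
* strip both so that the remaining task->iovcnt entries starting at task->iovs[1]
* describe only the data payload.
*/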
payload_len = task->payload_size;
task->status = iov->iov_base;
payload_len -= sizeof(*req) + sizeof(*task->status);
task->iovcnt -= 2;
type = req->type;
#ifdef VIRTIO_BLK_T_BARRIER
/* Don't care about barrier for now (as QEMU's virtio-blk does). */
type &= ~VIRTIO_BLK_T_BARRIER;
#endif
switch (type) {
case VIRTIO_BLK_T_IN:
case VIRTIO_BLK_T_OUT:
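/* virtio-blk always expresses offsets and lengths in 512-byte sectors,
* regardless of the backing bdev's block size, hence the 512-byte checks below.
*/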
if (spdk_unlikely(payload_len == 0 || (payload_len & (512 - 1)) != 0)) {
SPDK_ERRLOG("%s - passed IO buffer is not multiple of 512b (req_idx = %"PRIu16").\n",
type ? "WRITE" : "READ", task->req_idx);
invalid_blk_request(task, VIRTIO_BLK_S_UNSUPP);
return -1;
}
if (type == VIRTIO_BLK_T_IN) {
task->used_len = payload_len + sizeof(*task->status);
rc = spdk_bdev_readv(bvdev->bdev_desc, bvsession->io_channel,
&task->iovs[1], task->iovcnt, req->sector * 512,
payload_len, blk_request_complete_cb, task);
} else if (!bvdev->readonly) {
task->used_len = sizeof(*task->status);
rc = spdk_bdev_writev(bvdev->bdev_desc, bvsession->io_channel,
&task->iovs[1], task->iovcnt, req->sector * 512,
payload_len, blk_request_complete_cb, task);
} else {
SPDK_DEBUGLOG(vhost_blk, "Device is in read-only mode!\n");
rc = -1;
}
if (rc) {
if (rc == -ENOMEM) {
SPDK_DEBUGLOG(vhost_blk, "No memory, start to queue io.\n");
blk_request_queue_io(task);
} else {
invalid_blk_request(task, VIRTIO_BLK_S_IOERR);
return -1;
}
}
break;
case VIRTIO_BLK_T_DISCARD:
desc = task->iovs[1].iov_base;
if (payload_len != sizeof(*desc)) {
SPDK_NOTICELOG("Invalid discard payload size: %u\n", payload_len);
invalid_blk_request(task, VIRTIO_BLK_S_IOERR);
return -1;
}
if (desc->flags & VIRTIO_BLK_WRITE_ZEROES_FLAG_UNMAP) {
SPDK_ERRLOG("UNMAP flag is only used for WRITE ZEROES command\n");
invalid_blk_request(task, VIRTIO_BLK_S_UNSUPP);
return -1;
}
rc = spdk_bdev_unmap(bvdev->bdev_desc, bvsession->io_channel,
desc->sector * 512, desc->num_sectors * 512,
blk_request_complete_cb, task);
if (rc) {
if (rc == -ENOMEM) {
SPDK_DEBUGLOG(vhost_blk, "No memory, start to queue io.\n");
blk_request_queue_io(task);
} else {
invalid_blk_request(task, VIRTIO_BLK_S_IOERR);
return -1;
}
}
break;
case VIRTIO_BLK_T_WRITE_ZEROES:
desc = task->iovs[1].iov_base;
if (payload_len != sizeof(*desc)) {
SPDK_NOTICELOG("Invalid write zeroes payload size: %u\n", payload_len);
invalid_blk_request(task, VIRTIO_BLK_S_IOERR);
return -1;
}
/* The guest asks us to unmap this range. SPDK doesn't support that, and the kernel
* driver sets this flag by default without checking whether the unmap feature was
* negotiated. The flag isn't mandatory, so just print a warning.
*/
if (desc->flags & VIRTIO_BLK_WRITE_ZEROES_FLAG_UNMAP) {
SPDK_WARNLOG("Ignore the unmap flag for WRITE ZEROES from %"PRIx64", len %"PRIx64"\n",
(uint64_t)desc->sector * 512, (uint64_t)desc->num_sectors * 512);
}
rc = spdk_bdev_write_zeroes(bvdev->bdev_desc, bvsession->io_channel,
desc->sector * 512, desc->num_sectors * 512,
blk_request_complete_cb, task);
if (rc) {
if (rc == -ENOMEM) {
SPDK_DEBUGLOG(vhost_blk, "No memory, start to queue io.\n");
blk_request_queue_io(task);
} else {
invalid_blk_request(task, VIRTIO_BLK_S_IOERR);
return -1;
}
}
break;
case VIRTIO_BLK_T_FLUSH:
flush_bytes = spdk_bdev_get_num_blocks(bvdev->bdev) * spdk_bdev_get_block_size(bvdev->bdev);
if (req->sector != 0) {
SPDK_NOTICELOG("sector must be zero for flush command\n");
invalid_blk_request(task, VIRTIO_BLK_S_IOERR);
return -1;
}
rc = spdk_bdev_flush(bvdev->bdev_desc, bvsession->io_channel,
0, flush_bytes,
blk_request_complete_cb, task);
if (rc) {
if (rc == -ENOMEM) {
SPDK_DEBUGLOG(vhost_blk, "No memory, start to queue io.\n");
blk_request_queue_io(task);
} else {
invalid_blk_request(task, VIRTIO_BLK_S_IOERR);
return -1;
}
}
break;
case VIRTIO_BLK_T_GET_ID:
if (!task->iovcnt || !payload_len) {
invalid_blk_request(task, VIRTIO_BLK_S_UNSUPP);
return -1;
}
task->used_len = spdk_min((size_t)VIRTIO_BLK_ID_BYTES, task->iovs[1].iov_len);
spdk_strcpy_pad(task->iovs[1].iov_base, spdk_bdev_get_name(bvdev->bdev),
task->used_len, ' ');
blk_request_finish(true, task);
break;
default:
SPDK_DEBUGLOG(vhost_blk, "Not supported request type '%"PRIu32"'.\n", type);
invalid_blk_request(task, VIRTIO_BLK_S_UNSUPP);
return -1;
}
return 0;
}
static void
process_blk_task(struct spdk_vhost_virtqueue *vq, uint16_t req_idx)
{
struct spdk_vhost_blk_task *task;
int rc;
assert(vq->packed.packed_ring == false);
task = &((struct spdk_vhost_blk_task *)vq->tasks)[req_idx];
if (spdk_unlikely(task->used)) {
SPDK_ERRLOG("%s: request with idx '%"PRIu16"' is already pending.\n",
task->bvsession->vsession.name, req_idx);
task->used_len = 0;
blk_task_enqueue(task);
return;
}
task->bvsession->vsession.task_cnt++;
blk_task_init(task);
rc = blk_iovs_split_queue_setup(task->bvsession, vq, task->req_idx, task->iovs, &task->iovcnt,
&task->payload_size);
if (rc) {
SPDK_DEBUGLOG(vhost_blk, "Invalid request (req_idx = %"PRIu16").\n", task->req_idx);
/* Only READ and WRITE are supported for now. */
invalid_blk_request(task, VIRTIO_BLK_S_UNSUPP);
return;
}
if (process_blk_request(task, task->bvsession) == 0) {
SPDK_DEBUGLOG(vhost_blk, "====== Task %p req_idx %d submitted ======\n", task,
req_idx);
} else {
SPDK_ERRLOG("====== Task %p req_idx %d failed ======\n", task, req_idx);
}
}
static void
process_packed_blk_task(struct spdk_vhost_virtqueue *vq, uint16_t req_idx)
{
struct spdk_vhost_blk_task *task;
uint16_t task_idx = req_idx, num_descs;
int rc;
assert(vq->packed.packed_ring);
/* The packed ring uses the buffer_id as the task_idx to get the task struct.
* The kernel driver uses vq->free_head to set the buffer_id, so the value must be
* in the range of 0 ~ vring.size and the free_head value is unique among the
* outstanding requests.
* We can't use req_idx as the task_idx because a descriptor slot can be reused in
* the next phase even when the request from the previous phase has not completed.
* For example, at phase 0 last_used_idx was 2 and desc0 was not completed. After
* moving to phase 1, last_avail_idx is updated to 1. In this case req_idx cannot
* be used as task_idx because we would find task[0]->used still true at phase 1.
* The split queue is quite different: a descriptor is inserted into the free list
* when the device completes the request and the driver takes descriptors from that
* free list, which ensures the req_idx is unique among the outstanding requests.
*/
task_idx = vhost_vring_packed_desc_get_buffer_id(vq, req_idx, &num_descs);
task = &((struct spdk_vhost_blk_task *)vq->tasks)[task_idx];
if (spdk_unlikely(task->used)) {
SPDK_ERRLOG("%s: request with idx '%"PRIu16"' is already pending.\n",
task->bvsession->vsession.name, task_idx);
task->used_len = 0;
blk_task_enqueue(task);
return;
}
task->req_idx = req_idx;
task->num_descs = num_descs;
task->buffer_id = task_idx;
rte_vhost_set_inflight_desc_packed(task->bvsession->vsession.vid, vq->vring_idx,
req_idx, (req_idx + num_descs - 1) % vq->vring.size,
&task->inflight_head);
task->bvsession->vsession.task_cnt++;
blk_task_init(task);
rc = blk_iovs_packed_queue_setup(task->bvsession, vq, task->req_idx, task->iovs, &task->iovcnt,
&task->payload_size);
if (rc) {
SPDK_DEBUGLOG(vhost_blk, "Invalid request (req_idx = %"PRIu16").\n", task->req_idx);
/* Only READ and WRITE are supported for now. */
invalid_blk_request(task, VIRTIO_BLK_S_UNSUPP);
return;
}
if (process_blk_request(task, task->bvsession) == 0) {
SPDK_DEBUGLOG(vhost_blk, "====== Task %p req_idx %d submitted ======\n", task,
task_idx);
} else {
SPDK_ERRLOG("====== Task %p req_idx %d failed ======\n", task, task_idx);
}
}
static void
process_packed_inflight_blk_task(struct spdk_vhost_virtqueue *vq,
uint16_t req_idx)
{
spdk_vhost_inflight_desc *desc_array = vq->vring_inflight.inflight_packed->desc;
spdk_vhost_inflight_desc *desc = &desc_array[req_idx];
struct spdk_vhost_blk_task *task;
uint16_t task_idx, num_descs;
int rc;
task_idx = desc_array[desc->last].id;
num_descs = desc->num;
/* In packed ring reconnection, we use the last_used_idx as the
* initial value. So when we process the inflight descs we still
* need to update the available ring index.
*/
vq->last_avail_idx += num_descs;
if (vq->last_avail_idx >= vq->vring.size) {
vq->last_avail_idx -= vq->vring.size;
vq->packed.avail_phase = !vq->packed.avail_phase;
}
task = &((struct spdk_vhost_blk_task *)vq->tasks)[task_idx];
if (spdk_unlikely(task->used)) {
SPDK_ERRLOG("%s: request with idx '%"PRIu16"' is already pending.\n",
task->bvsession->vsession.name, task_idx);
task->used_len = 0;
blk_task_enqueue(task);
return;
}
task->req_idx = req_idx;
task->num_descs = num_descs;
task->buffer_id = task_idx;
/* It's for cleaning inflight entries */
task->inflight_head = req_idx;
task->bvsession->vsession.task_cnt++;
blk_task_init(task);
rc = blk_iovs_inflight_queue_setup(task->bvsession, vq, task->req_idx, task->iovs, &task->iovcnt,
&task->payload_size);
if (rc) {
SPDK_DEBUGLOG(vhost_blk, "Invalid request (req_idx = %"PRIu16").\n", task->req_idx);
/* Only READ and WRITE are supported for now. */
invalid_blk_request(task, VIRTIO_BLK_S_UNSUPP);
return;
}
if (process_blk_request(task, task->bvsession) == 0) {
SPDK_DEBUGLOG(vhost_blk, "====== Task %p req_idx %d submitted ======\n", task,
task_idx);
} else {
SPDK_ERRLOG("====== Task %p req_idx %d failed ======\n", task, task_idx);
}
}
static void
submit_inflight_desc(struct spdk_vhost_blk_session *bvsession,
struct spdk_vhost_virtqueue *vq)
{
struct spdk_vhost_session *vsession;
spdk_vhost_resubmit_info *resubmit;
spdk_vhost_resubmit_desc *resubmit_list;
uint16_t req_idx;
int i;
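/* After a reconnection, rte_vhost exposes descriptors that were still inflight when
* the previous connection went down; resubmit those before processing new requests.
*/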
resubmit = vq->vring_inflight.resubmit_inflight;
if (spdk_likely(resubmit == NULL || resubmit->resubmit_list == NULL ||
resubmit->resubmit_num == 0)) {
return;
}
resubmit_list = resubmit->resubmit_list;
vsession = &bvsession->vsession;
for (i = resubmit->resubmit_num - 1; i >= 0; --i) {
req_idx = resubmit_list[i].index;
SPDK_DEBUGLOG(vhost_blk, "====== Start processing resubmit request idx %"PRIu16"======\n",
req_idx);
if (spdk_unlikely(req_idx >= vq->vring.size)) {
SPDK_ERRLOG("%s: request idx '%"PRIu16"' exceeds virtqueue size (%"PRIu16").\n",
vsession->name, req_idx, vq->vring.size);
vhost_vq_used_ring_enqueue(vsession, vq, req_idx, 0);
continue;
}
if (vq->packed.packed_ring) {
process_packed_inflight_blk_task(vq, req_idx);
} else {
process_blk_task(vq, req_idx);
}
}
resubmit->resubmit_num = 0;
}
static void
process_vq(struct spdk_vhost_blk_session *bvsession, struct spdk_vhost_virtqueue *vq)
{
struct spdk_vhost_session *vsession = &bvsession->vsession;
uint16_t reqs[SPDK_VHOST_VQ_MAX_SUBMISSIONS];
uint16_t reqs_cnt, i;
submit_inflight_desc(bvsession, vq);
reqs_cnt = vhost_vq_avail_ring_get(vq, reqs, SPDK_COUNTOF(reqs));
if (!reqs_cnt) {
return;
}
for (i = 0; i < reqs_cnt; i++) {
SPDK_DEBUGLOG(vhost_blk, "====== Starting processing request idx %"PRIu16"======\n",
reqs[i]);
if (spdk_unlikely(reqs[i] >= vq->vring.size)) {
SPDK_ERRLOG("%s: request idx '%"PRIu16"' exceeds virtqueue size (%"PRIu16").\n",
vsession->name, reqs[i], vq->vring.size);
vhost_vq_used_ring_enqueue(vsession, vq, reqs[i], 0);
continue;
}
rte_vhost_set_inflight_desc_split(vsession->vid, vq->vring_idx, reqs[i]);
process_blk_task(vq, reqs[i]);
}
}
static void
process_packed_vq(struct spdk_vhost_blk_session *bvsession, struct spdk_vhost_virtqueue *vq)
{
uint16_t i = 0;
submit_inflight_desc(bvsession, vq);
while (i++ < SPDK_VHOST_VQ_MAX_SUBMISSIONS &&
vhost_vq_packed_ring_is_avail(vq)) {
SPDK_DEBUGLOG(vhost_blk, "====== Starting processing request idx %"PRIu16"======\n",
vq->last_avail_idx);
process_packed_blk_task(vq, vq->last_avail_idx);
}
}
static int
_vdev_vq_worker(struct spdk_vhost_virtqueue *vq)
{
struct spdk_vhost_session *vsession = vq->vsession;
struct spdk_vhost_blk_session *bvsession = to_blk_session(vsession);
bool packed_ring;
packed_ring = vq->packed.packed_ring;
if (packed_ring) {
process_packed_vq(bvsession, vq);
} else {
process_vq(bvsession, vq);
}
vhost_session_vq_used_signal(vq);
return SPDK_POLLER_BUSY;
}
static int
vdev_vq_worker(void *arg)
{
struct spdk_vhost_virtqueue *vq = arg;
return _vdev_vq_worker(vq);
}
static int
vdev_worker(void *arg)
{
struct spdk_vhost_blk_session *bvsession = arg;
struct spdk_vhost_session *vsession = &bvsession->vsession;
uint16_t q_idx;
for (q_idx = 0; q_idx < vsession->max_queues; q_idx++) {
_vdev_vq_worker(&vsession->virtqueue[q_idx]);
}
return SPDK_POLLER_BUSY;
}
static void
no_bdev_process_vq(struct spdk_vhost_blk_session *bvsession, struct spdk_vhost_virtqueue *vq)
{
struct spdk_vhost_session *vsession = &bvsession->vsession;
struct iovec iovs[SPDK_VHOST_IOVS_MAX];
uint32_t length;
uint16_t iovcnt, req_idx;
if (vhost_vq_avail_ring_get(vq, &req_idx, 1) != 1) {
return;
}
iovcnt = SPDK_COUNTOF(iovs);
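/* The backing bdev is gone, so fail the request: write VIRTIO_BLK_S_IOERR into the
* one-byte response buffer (the last descriptor) and complete it with no data written.
*/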
if (blk_iovs_split_queue_setup(bvsession, vq, req_idx, iovs, &iovcnt, &length) == 0) {
*(volatile uint8_t *)iovs[iovcnt - 1].iov_base = VIRTIO_BLK_S_IOERR;
SPDK_DEBUGLOG(vhost_blk_data, "Aborting request %" PRIu16"\n", req_idx);
}
vhost_vq_used_ring_enqueue(vsession, vq, req_idx, 0);
}
static void
no_bdev_process_packed_vq(struct spdk_vhost_blk_session *bvsession, struct spdk_vhost_virtqueue *vq)
{
struct spdk_vhost_session *vsession = &bvsession->vsession;
struct spdk_vhost_blk_task *task;
uint32_t length;
uint16_t req_idx = vq->last_avail_idx;
uint16_t task_idx, num_descs;
if (!vhost_vq_packed_ring_is_avail(vq)) {
return;
}
task_idx = vhost_vring_packed_desc_get_buffer_id(vq, req_idx, &num_descs);
task = &((struct spdk_vhost_blk_task *)vq->tasks)[task_idx];
if (spdk_unlikely(task->used)) {
SPDK_ERRLOG("%s: request with idx '%"PRIu16"' is already pending.\n",
vsession->name, req_idx);
vhost_vq_packed_ring_enqueue(vsession, vq, num_descs,
task->buffer_id, task->used_len,
task->inflight_head);
return;
}
task->req_idx = req_idx;
task->num_descs = num_descs;
task->buffer_id = task_idx;
blk_task_init(task);
if (blk_iovs_packed_queue_setup(bvsession, vq, task->req_idx, task->iovs, &task->iovcnt,
&length)) {
*(volatile uint8_t *)(task->iovs[task->iovcnt - 1].iov_base) = VIRTIO_BLK_S_IOERR;
SPDK_DEBUGLOG(vhost_blk_data, "Aborting request %" PRIu16"\n", req_idx);
}
task->used = false;
vhost_vq_packed_ring_enqueue(vsession, vq, num_descs,
task->buffer_id, task->used_len,
task->inflight_head);
}
static int
_no_bdev_vdev_vq_worker(struct spdk_vhost_virtqueue *vq)
{
struct spdk_vhost_session *vsession = vq->vsession;
struct spdk_vhost_blk_session *bvsession = to_blk_session(vsession);
bool packed_ring;
packed_ring = vq->packed.packed_ring;
if (packed_ring) {
no_bdev_process_packed_vq(bvsession, vq);
} else {
no_bdev_process_vq(bvsession, vq);
}
vhost_session_vq_used_signal(vq);
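/* Once every outstanding task has completed, drop the session's I/O channel; the bdev
* has been hot-removed and keeping the channel would keep its io device alive.
*/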
if (vsession->task_cnt == 0 && bvsession->io_channel) {
spdk_put_io_channel(bvsession->io_channel);
bvsession->io_channel = NULL;
}
return SPDK_POLLER_BUSY;
}
static int
no_bdev_vdev_vq_worker(void *arg)
{
struct spdk_vhost_virtqueue *vq = arg;
return _no_bdev_vdev_vq_worker(vq);
}
static int
no_bdev_vdev_worker(void *arg)
{
struct spdk_vhost_blk_session *bvsession = arg;
struct spdk_vhost_session *vsession = &bvsession->vsession;
uint16_t q_idx;
for (q_idx = 0; q_idx < vsession->max_queues; q_idx++) {
_no_bdev_vdev_vq_worker(&vsession->virtqueue[q_idx]);
}
return SPDK_POLLER_BUSY;
}
static void
vhost_blk_session_unregister_interrupts(struct spdk_vhost_blk_session *bvsession)
{
struct spdk_vhost_session *vsession = &bvsession->vsession;
struct spdk_vhost_virtqueue *vq;
int i;
SPDK_DEBUGLOG(vhost_blk, "unregister virtqueues interrupt\n");
for (i = 0; i < vsession->max_queues; i++) {
vq = &vsession->virtqueue[i];
if (vq->intr == NULL) {
break;
}
SPDK_DEBUGLOG(vhost_blk, "unregister vq[%d]'s kickfd is %d\n",
i, vq->vring.kickfd);
spdk_interrupt_unregister(&vq->intr);
}
}
static int
vhost_blk_session_register_interrupts(struct spdk_vhost_blk_session *bvsession,
spdk_interrupt_fn fn, const char *name)
{
struct spdk_vhost_session *vsession = &bvsession->vsession;
struct spdk_vhost_virtqueue *vq = NULL;
int i;
SPDK_DEBUGLOG(vhost_blk, "Register virtqueues interrupt\n");
for (i = 0; i < vsession->max_queues; i++) {
vq = &vsession->virtqueue[i];
SPDK_DEBUGLOG(vhost_blk, "Register vq[%d]'s kickfd is %d\n",
i, vq->vring.kickfd);
vq->intr = spdk_interrupt_register(vq->vring.kickfd, fn, vq, name);
if (vq->intr == NULL) {
SPDK_ERRLOG("Fail to register req notifier handler.\n");
goto err;
}
}
return 0;
err:
vhost_blk_session_unregister_interrupts(bvsession);
return -1;
}
static void
vhost_blk_poller_set_interrupt_mode(struct spdk_poller *poller, void *cb_arg, bool interrupt_mode)
{
struct spdk_vhost_blk_session *bvsession = cb_arg;
vhost_session_set_interrupt_mode(&bvsession->vsession, interrupt_mode);
}
static struct spdk_vhost_blk_dev *
to_blk_dev(struct spdk_vhost_dev *vdev)
{
if (vdev == NULL) {
return NULL;
}
if (vdev->backend != &vhost_blk_device_backend) {
SPDK_ERRLOG("%s: not a vhost-blk device\n", vdev->name);
return NULL;
}
return SPDK_CONTAINEROF(vdev, struct spdk_vhost_blk_dev, vdev);
}
static int
vhost_session_bdev_resize_cb(struct spdk_vhost_dev *vdev,
struct spdk_vhost_session *vsession,
void *ctx)
{
#if RTE_VERSION >= RTE_VERSION_NUM(20, 02, 0, 0)
SPDK_NOTICELOG("bdev send slave msg to vid(%d)\n", vsession->vid);
rte_vhost_slave_config_change(vsession->vid, false);
#else
SPDK_NOTICELOG("bdev does not support resize until DPDK submodule version >= 20.02\n");
#endif
return 0;
}
static void
blk_resize_cb(void *resize_ctx)
{
struct spdk_vhost_blk_dev *bvdev = resize_ctx;
spdk_vhost_lock();
vhost_dev_foreach_session(&bvdev->vdev, vhost_session_bdev_resize_cb,
NULL, NULL);
spdk_vhost_unlock();
}
static void
vhost_dev_bdev_remove_cpl_cb(struct spdk_vhost_dev *vdev, void *ctx)
{
/* All sessions have been notified, time to close the bdev */
struct spdk_vhost_blk_dev *bvdev = to_blk_dev(vdev);
assert(bvdev != NULL);
spdk_put_io_channel(bvdev->dummy_io_channel);
spdk_bdev_close(bvdev->bdev_desc);
bvdev->bdev_desc = NULL;
bvdev->bdev = NULL;
}
static int
vhost_session_bdev_remove_cb(struct spdk_vhost_dev *vdev,
struct spdk_vhost_session *vsession,
void *ctx)
{
struct spdk_vhost_blk_session *bvsession;
int rc;
bvsession = to_blk_session(vsession);
if (bvsession->requestq_poller) {
spdk_poller_unregister(&bvsession->requestq_poller);
if (vsession->virtqueue[0].intr) {
vhost_blk_session_unregister_interrupts(bvsession);
rc = vhost_blk_session_register_interrupts(bvsession, no_bdev_vdev_vq_worker,
"no_bdev_vdev_vq_worker");
if (rc) {
SPDK_ERRLOG("%s: Interrupt register failed\n", vsession->name);
return rc;
}
}
bvsession->requestq_poller = SPDK_POLLER_REGISTER(no_bdev_vdev_worker, bvsession, 0);
spdk_poller_register_interrupt(bvsession->requestq_poller, vhost_blk_poller_set_interrupt_mode,
bvsession);
}
return 0;
}
static void
bdev_remove_cb(void *remove_ctx)
{
struct spdk_vhost_blk_dev *bvdev = remove_ctx;
SPDK_WARNLOG("%s: hot-removing bdev - all further requests will fail.\n",
bvdev->vdev.name);
spdk_vhost_lock();
vhost_dev_foreach_session(&bvdev->vdev, vhost_session_bdev_remove_cb,
vhost_dev_bdev_remove_cpl_cb, NULL);
spdk_vhost_unlock();
}
static void
bdev_event_cb(enum spdk_bdev_event_type type, struct spdk_bdev *bdev,
void *event_ctx)
{
SPDK_DEBUGLOG(vhost_blk, "Bdev event: type %d, name %s\n",
type,
bdev->name);
switch (type) {
case SPDK_BDEV_EVENT_REMOVE:
SPDK_NOTICELOG("bdev name (%s) received event(SPDK_BDEV_EVENT_REMOVE)\n", bdev->name);
bdev_remove_cb(event_ctx);
break;
case SPDK_BDEV_EVENT_RESIZE:
SPDK_NOTICELOG("bdev name (%s) received event(SPDK_BDEV_EVENT_RESIZE)\n", bdev->name);
blk_resize_cb(event_ctx);
break;
default:
SPDK_NOTICELOG("Unsupported bdev event: type %d\n", type);
break;
}
}
static void
free_task_pool(struct spdk_vhost_blk_session *bvsession)
{
struct spdk_vhost_session *vsession = &bvsession->vsession;
struct spdk_vhost_virtqueue *vq;
uint16_t i;
for (i = 0; i < vsession->max_queues; i++) {
vq = &vsession->virtqueue[i];
if (vq->tasks == NULL) {
continue;
}
spdk_free(vq->tasks);
vq->tasks = NULL;
}
}
static int
alloc_task_pool(struct spdk_vhost_blk_session *bvsession)
{
struct spdk_vhost_session *vsession = &bvsession->vsession;
struct spdk_vhost_virtqueue *vq;
struct spdk_vhost_blk_task *task;
uint32_t task_cnt;
uint16_t i;
uint32_t j;
for (i = 0; i < vsession->max_queues; i++) {
vq = &vsession->virtqueue[i];
if (vq->vring.desc == NULL) {
continue;
}
task_cnt = vq->vring.size;
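/* One task per descriptor slot: req_idx (split ring) or buffer_id (packed ring)
* indexes directly into this array.
*/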
if (task_cnt > SPDK_VHOST_MAX_VQ_SIZE) {
/* sanity check */
SPDK_ERRLOG("%s: virtqueue %"PRIu16" is too big. (size = %"PRIu32", max = %"PRIu32")\n",
vsession->name, i, task_cnt, SPDK_VHOST_MAX_VQ_SIZE);
free_task_pool(bvsession);
return -1;
}
vq->tasks = spdk_zmalloc(sizeof(struct spdk_vhost_blk_task) * task_cnt,
SPDK_CACHE_LINE_SIZE, NULL,
SPDK_ENV_LCORE_ID_ANY, SPDK_MALLOC_DMA);
if (vq->tasks == NULL) {
SPDK_ERRLOG("%s: failed to allocate %"PRIu32" tasks for virtqueue %"PRIu16"\n",
vsession->name, task_cnt, i);
free_task_pool(bvsession);
return -1;
}
for (j = 0; j < task_cnt; j++) {
task = &((struct spdk_vhost_blk_task *)vq->tasks)[j];
task->bvsession = bvsession;
task->req_idx = j;
task->vq = vq;
}
}
return 0;
}
static int
vhost_blk_start_cb(struct spdk_vhost_dev *vdev,
struct spdk_vhost_session *vsession, void *unused)
{
struct spdk_vhost_blk_session *bvsession = to_blk_session(vsession);
struct spdk_vhost_blk_dev *bvdev;
int i, rc = 0;
bvdev = to_blk_dev(vdev);
assert(bvdev != NULL);
bvsession->bvdev = bvdev;
/* validate all I/O queues are in a contiguous index range */
for (i = 0; i < vsession->max_queues; i++) {
/* vring.desc and vring.desc_packed are in a union struct,
* so checking q->vring.desc also covers q->vring.desc_packed.
*/
if (vsession->virtqueue[i].vring.desc == NULL) {
SPDK_ERRLOG("%s: queue %"PRIu32" is empty\n", vsession->name, i);
rc = -1;
goto out;
}
}
rc = alloc_task_pool(bvsession);
if (rc != 0) {
SPDK_ERRLOG("%s: failed to alloc task pool.\n", vsession->name);
goto out;
}
if (bvdev->bdev) {
bvsession->io_channel = spdk_bdev_get_io_channel(bvdev->bdev_desc);
if (!bvsession->io_channel) {
free_task_pool(bvsession);
SPDK_ERRLOG("%s: I/O channel allocation failed\n", vsession->name);
rc = -1;
goto out;
}
}
if (spdk_interrupt_mode_is_enabled()) {
if (bvdev->bdev) {
rc = vhost_blk_session_register_interrupts(bvsession,
vdev_vq_worker,
"vdev_vq_worker");
} else {
rc = vhost_blk_session_register_interrupts(bvsession,
no_bdev_vdev_vq_worker,
"no_bdev_vdev_vq_worker");
}
if (rc) {
SPDK_ERRLOG("%s: Interrupt register failed\n", vsession->name);
goto out;
}
}
if (bvdev->bdev) {
bvsession->requestq_poller = SPDK_POLLER_REGISTER(vdev_worker, bvsession, 0);
} else {
bvsession->requestq_poller = SPDK_POLLER_REGISTER(no_bdev_vdev_worker, bvsession, 0);
}
SPDK_INFOLOG(vhost, "%s: started poller on lcore %d\n",
vsession->name, spdk_env_get_current_core());
spdk_poller_register_interrupt(bvsession->requestq_poller, vhost_blk_poller_set_interrupt_mode,
bvsession);
out:
vhost_session_start_done(vsession, rc);
return rc;
}
static int
vhost_blk_start(struct spdk_vhost_session *vsession)
{
return vhost_session_send_event(vsession, vhost_blk_start_cb,
3, "start session");
}
static int
destroy_session_poller_cb(void *arg)
{
struct spdk_vhost_blk_session *bvsession = arg;
struct spdk_vhost_session *vsession = &bvsession->vsession;
int i;
if (vsession->task_cnt > 0 || spdk_vhost_trylock() != 0) {
assert(vsession->stop_retry_count > 0);
vsession->stop_retry_count--;
if (vsession->stop_retry_count == 0) {
SPDK_ERRLOG("%s: Timedout when destroy session (task_cnt %d)\n", vsession->name,
vsession->task_cnt);
spdk_poller_unregister(&bvsession->stop_poller);
vhost_session_stop_done(vsession, -ETIMEDOUT);
}
return SPDK_POLLER_BUSY;
}
for (i = 0; i < vsession->max_queues; i++) {
vsession->virtqueue[i].next_event_time = 0;
vhost_vq_used_signal(vsession, &vsession->virtqueue[i]);
}
SPDK_INFOLOG(vhost, "%s: stopping poller on lcore %d\n",
vsession->name, spdk_env_get_current_core());
if (bvsession->io_channel) {
spdk_put_io_channel(bvsession->io_channel);
bvsession->io_channel = NULL;
}
free_task_pool(bvsession);
spdk_poller_unregister(&bvsession->stop_poller);
vhost_session_stop_done(vsession, 0);
spdk_vhost_unlock();
return SPDK_POLLER_BUSY;
}
static int
vhost_blk_stop_cb(struct spdk_vhost_dev *vdev,
struct spdk_vhost_session *vsession, void *unused)
{
struct spdk_vhost_blk_session *bvsession = to_blk_session(vsession);
spdk_poller_unregister(&bvsession->requestq_poller);
if (vsession->virtqueue[0].intr) {
vhost_blk_session_unregister_interrupts(bvsession);
}
/* vhost_session_send_event() times out after 3 seconds, so let the stop poller retry
* for up to 4 seconds: 4000 iterations at a 1000-usec poll period.
*/
bvsession->vsession.stop_retry_count = 4000;
bvsession->stop_poller = SPDK_POLLER_REGISTER(destroy_session_poller_cb,
bvsession, 1000);
return 0;
}
static int
vhost_blk_stop(struct spdk_vhost_session *vsession)
{
return vhost_session_send_event(vsession, vhost_blk_stop_cb,
3, "stop session");
}
static void
vhost_blk_dump_info_json(struct spdk_vhost_dev *vdev, struct spdk_json_write_ctx *w)
{
struct spdk_vhost_blk_dev *bvdev;
bvdev = to_blk_dev(vdev);
assert(bvdev != NULL);
spdk_json_write_named_object_begin(w, "block");
spdk_json_write_named_bool(w, "readonly", bvdev->readonly);
spdk_json_write_name(w, "bdev");
if (bvdev->bdev) {
spdk_json_write_string(w, spdk_bdev_get_name(bvdev->bdev));
} else {
spdk_json_write_null(w);
}
spdk_json_write_object_end(w);
}
static void
vhost_blk_write_config_json(struct spdk_vhost_dev *vdev, struct spdk_json_write_ctx *w)
{
struct spdk_vhost_blk_dev *bvdev;
bvdev = to_blk_dev(vdev);
assert(bvdev != NULL);
if (!bvdev->bdev) {
return;
}
spdk_json_write_object_begin(w);
spdk_json_write_named_string(w, "method", "vhost_create_blk_controller");
spdk_json_write_named_object_begin(w, "params");
spdk_json_write_named_string(w, "ctrlr", vdev->name);
spdk_json_write_named_string(w, "dev_name", spdk_bdev_get_name(bvdev->bdev));
spdk_json_write_named_string(w, "cpumask",
spdk_cpuset_fmt(spdk_thread_get_cpumask(vdev->thread)));
spdk_json_write_named_bool(w, "readonly", bvdev->readonly);
spdk_json_write_object_end(w);
spdk_json_write_object_end(w);
}
static int vhost_blk_destroy(struct spdk_vhost_dev *dev);
static int
vhost_blk_get_config(struct spdk_vhost_dev *vdev, uint8_t *config,
uint32_t len)
{
struct virtio_blk_config blkcfg;
struct spdk_vhost_blk_dev *bvdev;
struct spdk_bdev *bdev;
uint32_t blk_size;
uint64_t blkcnt;
memset(&blkcfg, 0, sizeof(blkcfg));
bvdev = to_blk_dev(vdev);
assert(bvdev != NULL);
bdev = bvdev->bdev;
if (bdev == NULL) {
/* We can't just return -1 here as this GET_CONFIG message might
* be caused by a QEMU VM reboot. Returning -1 will indicate an
* error to QEMU, who might then decide to terminate itself.
* We don't want that. A simple reboot shouldn't break the system.
*
* Presenting a block device with block size 0 and block count 0
* doesn't cause any problems on QEMU side and the virtio-pci
* device is even still available inside the VM, but there will
* be no block device created for it - the kernel drivers will
* silently reject it.
*/
blk_size = 0;
blkcnt = 0;
} else {
blk_size = spdk_bdev_get_block_size(bdev);
blkcnt = spdk_bdev_get_num_blocks(bdev);
if (spdk_bdev_get_buf_align(bdev) > 1) {
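/* When the bdev requires aligned buffers the data may have to be copied into a bounce
* buffer, so advertise limits that fit a single large bdev buffer and its child iovs
* (-2 for the REQ/RESP descriptors and -1 for a possible memory-region boundary split).
*/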
blkcfg.size_max = SPDK_BDEV_LARGE_BUF_MAX_SIZE;
blkcfg.seg_max = spdk_min(SPDK_VHOST_IOVS_MAX - 2 - 1, BDEV_IO_NUM_CHILD_IOV - 2 - 1);
} else {
blkcfg.size_max = 131072;
/* -2 for REQ and RESP and -1 for region boundary splitting */
blkcfg.seg_max = SPDK_VHOST_IOVS_MAX - 2 - 1;
}
}
blkcfg.blk_size = blk_size;
/* minimum I/O size in blocks */
blkcfg.min_io_size = 1;
/* expressed in 512 Bytes sectors */
blkcfg.capacity = (blkcnt * blk_size) / 512;
/* QEMU can overwrite this value when started */
blkcfg.num_queues = SPDK_VHOST_MAX_VQUEUES;
if (bdev && spdk_bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_UNMAP)) {
/* 16MiB, expressed in 512 Bytes */
blkcfg.max_discard_sectors = 32768;
blkcfg.max_discard_seg = 1;
blkcfg.discard_sector_alignment = blk_size / 512;
}
if (bdev && spdk_bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_WRITE_ZEROES)) {
blkcfg.max_write_zeroes_sectors = 32768;
blkcfg.max_write_zeroes_seg = 1;
}
memcpy(config, &blkcfg, spdk_min(len, sizeof(blkcfg)));
return 0;
}
static const struct spdk_vhost_dev_backend vhost_blk_device_backend = {
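/* session_ctx_size is the extra per-session context this backend needs on top of the
* generic struct spdk_vhost_session that is embedded at the start of every session.
*/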
.session_ctx_size = sizeof(struct spdk_vhost_blk_session) - sizeof(struct spdk_vhost_session),
.start_session = vhost_blk_start,
.stop_session = vhost_blk_stop,
.vhost_get_config = vhost_blk_get_config,
.dump_info_json = vhost_blk_dump_info_json,
.write_config_json = vhost_blk_write_config_json,
.remove_device = vhost_blk_destroy,
};
int
spdk_vhost_blk_construct(const char *name, const char *cpumask, const char *dev_name,
bool readonly, bool packed_ring)
{
struct spdk_vhost_blk_dev *bvdev = NULL;
struct spdk_vhost_dev *vdev;
struct spdk_bdev *bdev;
int ret = 0;
spdk_vhost_lock();
bvdev = calloc(1, sizeof(*bvdev));
if (bvdev == NULL) {
ret = -ENOMEM;
goto out;
}
ret = spdk_bdev_open_ext(dev_name, true, bdev_event_cb, bvdev, &bvdev->bdev_desc);
if (ret != 0) {
SPDK_ERRLOG("%s: could not open bdev '%s', error=%d\n",
name, dev_name, ret);
goto out;
}
bdev = spdk_bdev_desc_get_bdev(bvdev->bdev_desc);
vdev = &bvdev->vdev;
vdev->virtio_features = SPDK_VHOST_BLK_FEATURES_BASE;
vdev->disabled_features = SPDK_VHOST_BLK_DISABLED_FEATURES;
vdev->protocol_features = SPDK_VHOST_BLK_PROTOCOL_FEATURES;
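/* Advertise VIRTIO_F_RING_PACKED only when the controller was created with packed_ring. */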
vdev->virtio_features |= (uint64_t)packed_ring << VIRTIO_F_RING_PACKED;
if (spdk_bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_UNMAP)) {
vdev->virtio_features |= (1ULL << VIRTIO_BLK_F_DISCARD);
}
if (spdk_bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_WRITE_ZEROES)) {
vdev->virtio_features |= (1ULL << VIRTIO_BLK_F_WRITE_ZEROES);
}
if (readonly) {
vdev->virtio_features |= (1ULL << VIRTIO_BLK_F_RO);
}
if (spdk_bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_FLUSH)) {
vdev->virtio_features |= (1ULL << VIRTIO_BLK_F_FLUSH);
}
/*
* When starting QEMU with vhost-user-blk multiqueue, the vhost device will be
* started/stopped many times, in proportion to the number of queues, as the
* vhost-user backend doesn't know the exact number of queues used by this
* device. The target has to stop and restart the device each time it gets a
* valid IO queue.
* While stopping and starting the vhost device, the backend bdev io device
* would be deleted and created repeatedly.
* Hold a bdev reference in struct spdk_vhost_blk_dev so that the io device
* is not deleted.
*/
bvdev->dummy_io_channel = spdk_bdev_get_io_channel(bvdev->bdev_desc);
bvdev->bdev = bdev;
bvdev->readonly = readonly;
ret = vhost_dev_register(vdev, name, cpumask, &vhost_blk_device_backend);
if (ret != 0) {
spdk_put_io_channel(bvdev->dummy_io_channel);
spdk_bdev_close(bvdev->bdev_desc);
goto out;
}
SPDK_INFOLOG(vhost, "%s: using bdev '%s'\n", name, dev_name);
out:
if (ret != 0 && bvdev) {
free(bvdev);
}
spdk_vhost_unlock();
return ret;
}
static int
vhost_blk_destroy(struct spdk_vhost_dev *vdev)
{
struct spdk_vhost_blk_dev *bvdev = to_blk_dev(vdev);
int rc;
assert(bvdev != NULL);
rc = vhost_dev_unregister(&bvdev->vdev);
if (rc != 0) {
return rc;
}
/* If the bdev has been removed, there is no need to call spdk_put_io_channel(). */
if (bvdev->bdev) {
spdk_put_io_channel(bvdev->dummy_io_channel);
}
if (bvdev->bdev_desc) {
spdk_bdev_close(bvdev->bdev_desc);
bvdev->bdev_desc = NULL;
}
bvdev->bdev = NULL;
free(bvdev);
return 0;
}
SPDK_LOG_REGISTER_COMPONENT(vhost_blk)
SPDK_LOG_REGISTER_COMPONENT(vhost_blk_data)