numam-spdk/lib/vhost/vhost_blk.c
Shuhei Matsumoto baf3e12c09 lib/vhost: Replace poll group per session by thread per controller
Currently each controller is associated with one of the poll groups,
and all of a controller's sessions are associated with the poll group
of that controller.

The vhost poll group does not have any polling loop of its own, yet
its usage was very complex.

A controller is associated with a poll group based on the specified
cpumask, and one poll group is created per CPU core.

This is the same as the association of a thread with a CPU core.

So in this patch, replace the poll group per session with a thread
per controller.

Signed-off-by: Shuhei Matsumoto <shuhei.matsumoto.xt@hitachi.com>
Change-Id: Ifa1e136caae11959f7b097b06a22910bc2169b30
Reviewed-on: https://review.spdk.io/gerrit/c/spdk/spdk/+/1146
Tested-by: SPDK CI Jenkins <sys_sgci@intel.com>
Reviewed-by: Ben Walker <benjamin.walker@intel.com>
Reviewed-by: Changpeng Liu <changpeng.liu@intel.com>
Reviewed-by: Darek Stojaczyk <dariusz.stojaczyk@intel.com>
2020-03-10 09:12:12 +00:00


/*-
* BSD LICENSE
*
* Copyright (c) Intel Corporation.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
*
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in
* the documentation and/or other materials provided with the
* distribution.
* * Neither the name of Intel Corporation nor the names of its
* contributors may be used to endorse or promote products derived
* from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
* A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
* OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include <linux/virtio_blk.h>
#include "spdk/env.h"
#include "spdk/bdev.h"
#include "spdk/bdev_module.h"
#include "spdk/conf.h"
#include "spdk/thread.h"
#include "spdk/likely.h"
#include "spdk/string.h"
#include "spdk/util.h"
#include "spdk/vhost.h"
#include "vhost_internal.h"
/* Minimal set of features supported by every SPDK VHOST-BLK device */
#define SPDK_VHOST_BLK_FEATURES_BASE (SPDK_VHOST_FEATURES | \
(1ULL << VIRTIO_BLK_F_SIZE_MAX) | (1ULL << VIRTIO_BLK_F_SEG_MAX) | \
(1ULL << VIRTIO_BLK_F_GEOMETRY) | (1ULL << VIRTIO_BLK_F_BLK_SIZE) | \
(1ULL << VIRTIO_BLK_F_TOPOLOGY) | (1ULL << VIRTIO_BLK_F_BARRIER) | \
(1ULL << VIRTIO_BLK_F_SCSI) | (1ULL << VIRTIO_BLK_F_CONFIG_WCE) | \
(1ULL << VIRTIO_BLK_F_MQ))
/* Not supported features */
#define SPDK_VHOST_BLK_DISABLED_FEATURES (SPDK_VHOST_DISABLED_FEATURES | \
(1ULL << VIRTIO_BLK_F_GEOMETRY) | (1ULL << VIRTIO_BLK_F_CONFIG_WCE) | \
(1ULL << VIRTIO_BLK_F_BARRIER) | (1ULL << VIRTIO_BLK_F_SCSI))
/* Protocol features supported by vhost-blk */
#ifndef SPDK_CONFIG_VHOST_INTERNAL_LIB
#define SPDK_VHOST_BLK_PROTOCOL_FEATURES ((1ULL << VHOST_USER_PROTOCOL_F_CONFIG) | \
(1ULL << VHOST_USER_PROTOCOL_F_INFLIGHT_SHMFD))
#else
#define SPDK_VHOST_BLK_PROTOCOL_FEATURES (1ULL << VHOST_USER_PROTOCOL_F_CONFIG)
#endif
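/*
* Per-request context. One task is preallocated for every descriptor
* index of each virtqueue (see alloc_task_pool()), so tasks are simply
* looked up by req_idx and no allocation happens on the I/O path.
*/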
struct spdk_vhost_blk_task {
struct spdk_bdev_io *bdev_io;
struct spdk_vhost_blk_session *bvsession;
struct spdk_vhost_virtqueue *vq;
volatile uint8_t *status;
uint16_t req_idx;
/* for io wait */
struct spdk_bdev_io_wait_entry bdev_io_wait;
/* If set, the task is currently used for I/O processing. */
bool used;
/** Number of bytes that were written. */
uint32_t used_len;
uint16_t iovcnt;
struct iovec iovs[SPDK_VHOST_IOVS_MAX];
};
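/* A vhost-blk controller: one vhost device backed by a single bdev. */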
struct spdk_vhost_blk_dev {
struct spdk_vhost_dev vdev;
struct spdk_bdev *bdev;
struct spdk_bdev_desc *bdev_desc;
bool readonly;
};
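/* Per-connection state; its pollers run on the controller's thread. */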
struct spdk_vhost_blk_session {
/* The parent session must be the very first field in this struct */
struct spdk_vhost_session vsession;
struct spdk_vhost_blk_dev *bvdev;
struct spdk_poller *requestq_poller;
struct spdk_io_channel *io_channel;
struct spdk_poller *stop_poller;
};
/* forward declaration */
static const struct spdk_vhost_dev_backend vhost_blk_device_backend;
static int
process_blk_request(struct spdk_vhost_blk_task *task,
struct spdk_vhost_blk_session *bvsession,
struct spdk_vhost_virtqueue *vq);
static void
blk_task_finish(struct spdk_vhost_blk_task *task)
{
assert(task->bvsession->vsession.task_cnt > 0);
task->bvsession->vsession.task_cnt--;
task->used = false;
}
static void
blk_task_init(struct spdk_vhost_blk_task *task)
{
task->used = true;
task->iovcnt = SPDK_COUNTOF(task->iovs);
task->status = NULL;
task->used_len = 0;
}
static void
invalid_blk_request(struct spdk_vhost_blk_task *task, uint8_t status)
{
if (task->status) {
*task->status = status;
}
vhost_vq_used_ring_enqueue(&task->bvsession->vsession, task->vq, task->req_idx,
task->used_len);
blk_task_finish(task);
SPDK_DEBUGLOG(SPDK_LOG_VHOST_BLK_DATA, "Invalid request (status=%" PRIu8")\n", status);
}
/*
* Process the task's descriptor chain and set up data-related fields.
* Return
* total size of supplied buffers
*
* FIXME: Make this function return rd_cnt and wr_cnt
*/
static int
blk_iovs_setup(struct spdk_vhost_blk_session *bvsession, struct spdk_vhost_virtqueue *vq,
uint16_t req_idx, struct iovec *iovs, uint16_t *iovs_cnt, uint32_t *length)
{
struct spdk_vhost_session *vsession = &bvsession->vsession;
struct spdk_vhost_dev *vdev = vsession->vdev;
struct vring_desc *desc, *desc_table;
uint16_t out_cnt = 0, cnt = 0;
uint32_t desc_table_size, len = 0;
uint32_t desc_handled_cnt;
int rc;
rc = vhost_vq_get_desc(vsession, vq, req_idx, &desc, &desc_table, &desc_table_size);
if (rc != 0) {
SPDK_ERRLOG("%s: invalid descriptor at index %"PRIu16".\n", vdev->name, req_idx);
return -1;
}
desc_handled_cnt = 0;
while (1) {
/*
* Maximum cnt reached?
* Should not happen if the request is well formed, otherwise this is a BUG.
*/
if (spdk_unlikely(cnt == *iovs_cnt)) {
SPDK_DEBUGLOG(SPDK_LOG_VHOST_BLK, "%s: max IOVs in request reached (req_idx = %"PRIu16").\n",
vsession->name, req_idx);
return -1;
}
if (spdk_unlikely(vhost_vring_desc_to_iov(vsession, iovs, &cnt, desc))) {
SPDK_DEBUGLOG(SPDK_LOG_VHOST_BLK, "%s: invalid descriptor %" PRIu16" (req_idx = %"PRIu16").\n",
vsession->name, req_idx, cnt);
return -1;
}
len += desc->len;
out_cnt += vhost_vring_desc_is_wr(desc);
rc = vhost_vring_desc_get_next(&desc, desc_table, desc_table_size);
if (rc != 0) {
SPDK_ERRLOG("%s: descriptor chain at index %"PRIu16" terminated unexpectedly.\n",
vsession->name, req_idx);
return -1;
} else if (desc == NULL) {
break;
}
desc_handled_cnt++;
if (spdk_unlikely(desc_handled_cnt > desc_table_size)) {
/* Break a cycle and report an error, if any. */
SPDK_ERRLOG("%s: found a cycle in the descriptor chain: desc_table_size = %d, desc_handled_cnt = %d.\n",
vsession->name, desc_table_size, desc_handled_cnt);
return -1;
}
}
/*
* There must be at least two descriptors.
* The first one contains the request, so it must be readable.
* The last descriptor contains the buffer for the response, so it must be writable.
*/
if (spdk_unlikely(out_cnt == 0 || cnt < 2)) {
return -1;
}
*length = len;
*iovs_cnt = cnt;
return 0;
}
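/* Complete a request: record the status byte and return the descriptor
* chain to the used ring. */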
static void
blk_request_finish(bool success, struct spdk_vhost_blk_task *task)
{
*task->status = success ? VIRTIO_BLK_S_OK : VIRTIO_BLK_S_IOERR;
vhost_vq_used_ring_enqueue(&task->bvsession->vsession, task->vq, task->req_idx,
task->used_len);
SPDK_DEBUGLOG(SPDK_LOG_VHOST_BLK, "Finished task (%p) req_idx=%d\n status: %s\n", task,
task->req_idx, success ? "OK" : "FAIL");
blk_task_finish(task);
}
static void
blk_request_complete_cb(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg)
{
struct spdk_vhost_blk_task *task = cb_arg;
spdk_bdev_free_io(bdev_io);
blk_request_finish(success, task);
}
static void
blk_request_resubmit(void *arg)
{
struct spdk_vhost_blk_task *task = (struct spdk_vhost_blk_task *)arg;
int rc = 0;
blk_task_init(task);
rc = process_blk_request(task, task->bvsession, task->vq);
if (rc == 0) {
SPDK_DEBUGLOG(SPDK_LOG_VHOST_BLK, "====== Task %p resubmitted ======\n", task);
} else {
SPDK_DEBUGLOG(SPDK_LOG_VHOST_BLK, "====== Task %p failed ======\n", task);
}
}
static inline void
blk_request_queue_io(struct spdk_vhost_blk_task *task)
{
int rc;
struct spdk_vhost_blk_session *bvsession = task->bvsession;
struct spdk_bdev *bdev = bvsession->bvdev->bdev;
task->bdev_io_wait.bdev = bdev;
task->bdev_io_wait.cb_fn = blk_request_resubmit;
task->bdev_io_wait.cb_arg = task;
rc = spdk_bdev_queue_io_wait(bdev, bvsession->io_channel, &task->bdev_io_wait);
if (rc != 0) {
SPDK_ERRLOG("%s: failed to queue I/O, rc=%d\n", bvsession->vsession.name, rc);
invalid_blk_request(task, VIRTIO_BLK_S_IOERR);
}
}
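/*
* Parse the descriptor chain at req_idx and dispatch the request to the
* bdev layer. The first descriptor carries the virtio_blk_outhdr, the
* last one the single status byte; everything in between is payload.
*/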
static int
process_blk_request(struct spdk_vhost_blk_task *task,
struct spdk_vhost_blk_session *bvsession,
struct spdk_vhost_virtqueue *vq)
{
struct spdk_vhost_blk_dev *bvdev = bvsession->bvdev;
const struct virtio_blk_outhdr *req;
struct virtio_blk_discard_write_zeroes *desc;
struct iovec *iov;
uint32_t type;
uint32_t payload_len;
uint64_t flush_bytes;
int rc;
if (blk_iovs_setup(bvsession, vq, task->req_idx, task->iovs, &task->iovcnt, &payload_len)) {
SPDK_DEBUGLOG(SPDK_LOG_VHOST_BLK, "Invalid request (req_idx = %"PRIu16").\n", task->req_idx);
/* Only READ and WRITE are supported for now. */
invalid_blk_request(task, VIRTIO_BLK_S_UNSUPP);
return -1;
}
iov = &task->iovs[0];
if (spdk_unlikely(iov->iov_len != sizeof(*req))) {
SPDK_DEBUGLOG(SPDK_LOG_VHOST_BLK,
"First descriptor size is %zu but expected %zu (req_idx = %"PRIu16").\n",
iov->iov_len, sizeof(*req), task->req_idx);
invalid_blk_request(task, VIRTIO_BLK_S_UNSUPP);
return -1;
}
req = iov->iov_base;
iov = &task->iovs[task->iovcnt - 1];
if (spdk_unlikely(iov->iov_len != 1)) {
SPDK_DEBUGLOG(SPDK_LOG_VHOST_BLK,
"Last descriptor size is %zu but expected %d (req_idx = %"PRIu16").\n",
iov->iov_len, 1, task->req_idx);
invalid_blk_request(task, VIRTIO_BLK_S_UNSUPP);
return -1;
}
task->status = iov->iov_base;
payload_len -= sizeof(*req) + sizeof(*task->status);
task->iovcnt -= 2;
type = req->type;
#ifdef VIRTIO_BLK_T_BARRIER
/* Don't care about barrier for now (as QEMU's virtio-blk does). */
type &= ~VIRTIO_BLK_T_BARRIER;
#endif
switch (type) {
case VIRTIO_BLK_T_IN:
case VIRTIO_BLK_T_OUT:
if (spdk_unlikely(payload_len == 0 || (payload_len & (512 - 1)) != 0)) {
SPDK_ERRLOG("%s - passed IO buffer is not multiple of 512b (req_idx = %"PRIu16").\n",
type ? "WRITE" : "READ", task->req_idx);
invalid_blk_request(task, VIRTIO_BLK_S_UNSUPP);
return -1;
}
if (type == VIRTIO_BLK_T_IN) {
task->used_len = payload_len + sizeof(*task->status);
rc = spdk_bdev_readv(bvdev->bdev_desc, bvsession->io_channel,
&task->iovs[1], task->iovcnt, req->sector * 512,
payload_len, blk_request_complete_cb, task);
} else if (!bvdev->readonly) {
task->used_len = sizeof(*task->status);
rc = spdk_bdev_writev(bvdev->bdev_desc, bvsession->io_channel,
&task->iovs[1], task->iovcnt, req->sector * 512,
payload_len, blk_request_complete_cb, task);
} else {
SPDK_DEBUGLOG(SPDK_LOG_VHOST_BLK, "Device is in read-only mode!\n");
rc = -1;
}
if (rc) {
if (rc == -ENOMEM) {
SPDK_DEBUGLOG(SPDK_LOG_VHOST_BLK, "No memory, start to queue io.\n");
blk_request_queue_io(task);
} else {
invalid_blk_request(task, VIRTIO_BLK_S_IOERR);
return -1;
}
}
break;
case VIRTIO_BLK_T_DISCARD:
desc = task->iovs[1].iov_base;
if (payload_len != sizeof(*desc)) {
SPDK_NOTICELOG("Invalid discard payload size: %u\n", payload_len);
invalid_blk_request(task, VIRTIO_BLK_S_IOERR);
return -1;
}
rc = spdk_bdev_unmap(bvdev->bdev_desc, bvsession->io_channel,
desc->sector * 512, desc->num_sectors * 512,
blk_request_complete_cb, task);
if (rc) {
if (rc == -ENOMEM) {
SPDK_DEBUGLOG(SPDK_LOG_VHOST_BLK, "No memory, start to queue io.\n");
blk_request_queue_io(task);
} else {
invalid_blk_request(task, VIRTIO_BLK_S_IOERR);
return -1;
}
}
break;
case VIRTIO_BLK_T_WRITE_ZEROES:
desc = task->iovs[1].iov_base;
if (payload_len != sizeof(*desc)) {
SPDK_NOTICELOG("Invalid write zeroes payload size: %u\n", payload_len);
invalid_blk_request(task, VIRTIO_BLK_S_IOERR);
return -1;
}
/* Zeroing and unmapping the range is not supported by SPDK. */
if (desc->flags & VIRTIO_BLK_WRITE_ZEROES_FLAG_UNMAP) {
SPDK_NOTICELOG("Can't support Write Zeroes with Unmap flag\n");
invalid_blk_request(task, VIRTIO_BLK_S_UNSUPP);
return -1;
}
rc = spdk_bdev_write_zeroes(bvdev->bdev_desc, bvsession->io_channel,
desc->sector * 512, desc->num_sectors * 512,
blk_request_complete_cb, task);
if (rc) {
if (rc == -ENOMEM) {
SPDK_DEBUGLOG(SPDK_LOG_VHOST_BLK, "No memory, start to queue io.\n");
blk_request_queue_io(task);
} else {
invalid_blk_request(task, VIRTIO_BLK_S_IOERR);
return -1;
}
}
break;
case VIRTIO_BLK_T_FLUSH:
flush_bytes = spdk_bdev_get_num_blocks(bvdev->bdev) * spdk_bdev_get_block_size(bvdev->bdev);
if (req->sector != 0) {
SPDK_NOTICELOG("sector must be zero for flush command\n");
invalid_blk_request(task, VIRTIO_BLK_S_IOERR);
return -1;
}
rc = spdk_bdev_flush(bvdev->bdev_desc, bvsession->io_channel,
0, flush_bytes,
blk_request_complete_cb, task);
if (rc) {
if (rc == -ENOMEM) {
SPDK_DEBUGLOG(SPDK_LOG_VHOST_BLK, "No memory, start to queue io.\n");
blk_request_queue_io(task);
} else {
invalid_blk_request(task, VIRTIO_BLK_S_IOERR);
return -1;
}
}
break;
case VIRTIO_BLK_T_GET_ID:
if (!task->iovcnt || !payload_len) {
invalid_blk_request(task, VIRTIO_BLK_S_UNSUPP);
return -1;
}
task->used_len = spdk_min((size_t)VIRTIO_BLK_ID_BYTES, task->iovs[1].iov_len);
spdk_strcpy_pad(task->iovs[1].iov_base, spdk_bdev_get_product_name(bvdev->bdev),
task->used_len, ' ');
blk_request_finish(true, task);
break;
default:
SPDK_DEBUGLOG(SPDK_LOG_VHOST_BLK, "Not supported request type '%"PRIu32"'.\n", type);
invalid_blk_request(task, VIRTIO_BLK_S_UNSUPP);
return -1;
}
return 0;
}
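/* Resubmit requests that were still inflight when the previous
* connection was interrupted, as recorded in the inflight shared
* memory region (VHOST_USER_PROTOCOL_F_INFLIGHT_SHMFD). */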
static void
submit_inflight_desc(struct spdk_vhost_blk_session *bvsession,
struct spdk_vhost_virtqueue *vq)
{
struct spdk_vhost_blk_dev *bvdev = bvsession->bvdev;
struct spdk_vhost_blk_task *task;
struct spdk_vhost_session *vsession = &bvsession->vsession;
spdk_vhost_resubmit_info *resubmit = vq->vring_inflight.resubmit_inflight;
spdk_vhost_resubmit_desc *resubmit_list;
int rc;
uint16_t req_idx;
if (spdk_likely(resubmit == NULL || resubmit->resubmit_list == NULL)) {
return;
}
resubmit_list = resubmit->resubmit_list;
while (resubmit->resubmit_num-- > 0) {
req_idx = resubmit_list[resubmit->resubmit_num].index;
SPDK_DEBUGLOG(SPDK_LOG_VHOST_BLK, "====== Start processing request idx %"PRIu16"======\n",
req_idx);
if (spdk_unlikely(req_idx >= vq->vring.size)) {
SPDK_ERRLOG("%s: request idx '%"PRIu16"' exceeds virtqueue size (%"PRIu16").\n",
bvdev->vdev.name, req_idx, vq->vring.size);
vhost_vq_used_ring_enqueue(vsession, vq, req_idx, 0);
continue;
}
task = &((struct spdk_vhost_blk_task *)vq->tasks)[req_idx];
if (spdk_unlikely(task->used)) {
SPDK_ERRLOG("%s: request with idx '%"PRIu16"' is already pending.\n",
bvdev->vdev.name, req_idx);
vhost_vq_used_ring_enqueue(vsession, vq, req_idx, 0);
continue;
}
vsession->task_cnt++;
blk_task_init(task);
rc = process_blk_request(task, bvsession, vq);
if (rc == 0) {
SPDK_DEBUGLOG(SPDK_LOG_VHOST_BLK, "====== Task %p req_idx %d submitted ======\n", task,
req_idx);
} else {
SPDK_DEBUGLOG(SPDK_LOG_VHOST_BLK, "====== Task %p req_idx %d failed ======\n", task,
req_idx);
}
}
free(resubmit_list);
resubmit->resubmit_list = NULL;
}
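/* Pull up to 32 new requests from the available ring of one virtqueue
* and submit them. */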
static void
process_vq(struct spdk_vhost_blk_session *bvsession, struct spdk_vhost_virtqueue *vq)
{
struct spdk_vhost_blk_task *task;
struct spdk_vhost_session *vsession = &bvsession->vsession;
int rc;
uint16_t reqs[32];
uint16_t reqs_cnt, i;
uint16_t vq_idx = vq->vring_idx;
submit_inflight_desc(bvsession, vq);
reqs_cnt = vhost_vq_avail_ring_get(vq, reqs, SPDK_COUNTOF(reqs));
if (!reqs_cnt) {
return;
}
for (i = 0; i < reqs_cnt; i++) {
SPDK_DEBUGLOG(SPDK_LOG_VHOST_BLK, "====== Starting processing request idx %"PRIu16"======\n",
reqs[i]);
if (spdk_unlikely(reqs[i] >= vq->vring.size)) {
SPDK_ERRLOG("%s: request idx '%"PRIu16"' exceeds virtqueue size (%"PRIu16").\n",
vsession->name, reqs[i], vq->vring.size);
vhost_vq_used_ring_enqueue(vsession, vq, reqs[i], 0);
continue;
}
rte_vhost_set_inflight_desc_split(vsession->vid, vq_idx, reqs[i]);
task = &((struct spdk_vhost_blk_task *)vq->tasks)[reqs[i]];
if (spdk_unlikely(task->used)) {
SPDK_ERRLOG("%s: request with idx '%"PRIu16"' is already pending.\n",
vsession->name, reqs[i]);
vhost_vq_used_ring_enqueue(vsession, vq, reqs[i], 0);
continue;
}
vsession->task_cnt++;
blk_task_init(task);
rc = process_blk_request(task, bvsession, vq);
if (rc == 0) {
SPDK_DEBUGLOG(SPDK_LOG_VHOST_BLK, "====== Task %p req_idx %d submitted ======\n", task,
reqs[i]);
} else {
SPDK_DEBUGLOG(SPDK_LOG_VHOST_BLK, "====== Task %p req_idx %d failed ======\n", task, reqs[i]);
}
}
}
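/* Request-queue poller body: process every virtqueue of the session,
* then signal used-ring events to the guest. */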
static int
vdev_worker(void *arg)
{
struct spdk_vhost_blk_session *bvsession = arg;
struct spdk_vhost_session *vsession = &bvsession->vsession;
uint16_t q_idx;
for (q_idx = 0; q_idx < vsession->max_queues; q_idx++) {
process_vq(bvsession, &vsession->virtqueue[q_idx]);
}
vhost_session_used_signal(vsession);
return -1;
}
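/* Variant of process_vq used when no bdev is present (e.g. after
* hot-removal): each request is immediately failed with
* VIRTIO_BLK_S_IOERR. */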
static void
no_bdev_process_vq(struct spdk_vhost_blk_session *bvsession, struct spdk_vhost_virtqueue *vq)
{
struct spdk_vhost_session *vsession = &bvsession->vsession;
struct iovec iovs[SPDK_VHOST_IOVS_MAX];
uint32_t length;
uint16_t iovcnt, req_idx;
if (vhost_vq_avail_ring_get(vq, &req_idx, 1) != 1) {
return;
}
iovcnt = SPDK_COUNTOF(iovs);
if (blk_iovs_setup(bvsession, vq, req_idx, iovs, &iovcnt, &length) == 0) {
*(volatile uint8_t *)iovs[iovcnt - 1].iov_base = VIRTIO_BLK_S_IOERR;
SPDK_DEBUGLOG(SPDK_LOG_VHOST_BLK_DATA, "Aborting request %" PRIu16"\n", req_idx);
}
vhost_vq_used_ring_enqueue(vsession, vq, req_idx, 0);
}
static int
no_bdev_vdev_worker(void *arg)
{
struct spdk_vhost_blk_session *bvsession = arg;
struct spdk_vhost_session *vsession = &bvsession->vsession;
uint16_t q_idx;
for (q_idx = 0; q_idx < vsession->max_queues; q_idx++) {
no_bdev_process_vq(bvsession, &vsession->virtqueue[q_idx]);
}
vhost_session_used_signal(vsession);
if (vsession->task_cnt == 0 && bvsession->io_channel) {
spdk_put_io_channel(bvsession->io_channel);
bvsession->io_channel = NULL;
}
return -1;
}
static struct spdk_vhost_blk_session *
to_blk_session(struct spdk_vhost_session *vsession)
{
assert(vsession->vdev->backend == &vhost_blk_device_backend);
return (struct spdk_vhost_blk_session *)vsession;
}
static struct spdk_vhost_blk_dev *
to_blk_dev(struct spdk_vhost_dev *vdev)
{
if (vdev == NULL) {
return NULL;
}
if (vdev->backend != &vhost_blk_device_backend) {
SPDK_ERRLOG("%s: not a vhost-blk device\n", vdev->name);
return NULL;
}
return SPDK_CONTAINEROF(vdev, struct spdk_vhost_blk_dev, vdev);
}
struct spdk_bdev *
spdk_vhost_blk_get_dev(struct spdk_vhost_dev *vdev)
{
struct spdk_vhost_blk_dev *bvdev = to_blk_dev(vdev);
assert(bvdev != NULL);
return bvdev->bdev;
}
static void
vhost_dev_bdev_remove_cpl_cb(struct spdk_vhost_dev *vdev, void *ctx)
{
/* All sessions have been notified, time to close the bdev */
struct spdk_vhost_blk_dev *bvdev = to_blk_dev(vdev);
assert(bvdev != NULL);
spdk_bdev_close(bvdev->bdev_desc);
bvdev->bdev_desc = NULL;
bvdev->bdev = NULL;
}
static int
vhost_session_bdev_remove_cb(struct spdk_vhost_dev *vdev,
struct spdk_vhost_session *vsession,
void *ctx)
{
struct spdk_vhost_blk_session *bvsession;
bvsession = (struct spdk_vhost_blk_session *)vsession;
if (bvsession->requestq_poller) {
spdk_poller_unregister(&bvsession->requestq_poller);
bvsession->requestq_poller = spdk_poller_register(no_bdev_vdev_worker, bvsession, 0);
}
return 0;
}
static void
bdev_remove_cb(void *remove_ctx)
{
struct spdk_vhost_blk_dev *bvdev = remove_ctx;
SPDK_WARNLOG("%s: hot-removing bdev - all further requests will fail.\n",
bvdev->vdev.name);
spdk_vhost_lock();
vhost_dev_foreach_session(&bvdev->vdev, vhost_session_bdev_remove_cb,
vhost_dev_bdev_remove_cpl_cb, NULL);
spdk_vhost_unlock();
}
static void
free_task_pool(struct spdk_vhost_blk_session *bvsession)
{
struct spdk_vhost_session *vsession = &bvsession->vsession;
struct spdk_vhost_virtqueue *vq;
uint16_t i;
for (i = 0; i < vsession->max_queues; i++) {
vq = &vsession->virtqueue[i];
if (vq->tasks == NULL) {
continue;
}
spdk_free(vq->tasks);
vq->tasks = NULL;
}
}
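/* Preallocate one task per descriptor for each initialized virtqueue. */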
static int
alloc_task_pool(struct spdk_vhost_blk_session *bvsession)
{
struct spdk_vhost_session *vsession = &bvsession->vsession;
struct spdk_vhost_virtqueue *vq;
struct spdk_vhost_blk_task *task;
uint32_t task_cnt;
uint16_t i;
uint32_t j;
for (i = 0; i < vsession->max_queues; i++) {
vq = &vsession->virtqueue[i];
if (vq->vring.desc == NULL) {
continue;
}
task_cnt = vq->vring.size;
if (task_cnt > SPDK_VHOST_MAX_VQ_SIZE) {
/* sanity check */
SPDK_ERRLOG("%s: virtuque %"PRIu16" is too big. (size = %"PRIu32", max = %"PRIu32")\n",
vsession->name, i, task_cnt, SPDK_VHOST_MAX_VQ_SIZE);
free_task_pool(bvsession);
return -1;
}
vq->tasks = spdk_zmalloc(sizeof(struct spdk_vhost_blk_task) * task_cnt,
SPDK_CACHE_LINE_SIZE, NULL,
SPDK_ENV_LCORE_ID_ANY, SPDK_MALLOC_DMA);
if (vq->tasks == NULL) {
SPDK_ERRLOG("%s: failed to allocate %"PRIu32" tasks for virtqueue %"PRIu16"\n",
vsession->name, task_cnt, i);
free_task_pool(bvsession);
return -1;
}
for (j = 0; j < task_cnt; j++) {
task = &((struct spdk_vhost_blk_task *)vq->tasks)[j];
task->bvsession = bvsession;
task->req_idx = j;
task->vq = vq;
}
}
return 0;
}
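/* Session start-up callback, run via vhost_session_send_event():
* allocates the task pool and I/O channel and registers the
* request-queue poller. */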
static int
vhost_blk_start_cb(struct spdk_vhost_dev *vdev,
struct spdk_vhost_session *vsession, void *unused)
{
struct spdk_vhost_blk_session *bvsession = to_blk_session(vsession);
struct spdk_vhost_blk_dev *bvdev;
int i, rc = 0;
bvdev = to_blk_dev(vdev);
assert(bvdev != NULL);
bvsession->bvdev = bvdev;
/* validate all I/O queues are in a contiguous index range */
for (i = 0; i < vsession->max_queues; i++) {
if (vsession->virtqueue[i].vring.desc == NULL) {
SPDK_ERRLOG("%s: queue %"PRIu32" is empty\n", vsession->name, i);
rc = -1;
goto out;
}
}
rc = alloc_task_pool(bvsession);
if (rc != 0) {
SPDK_ERRLOG("%s: failed to alloc task pool.\n", vsession->name);
goto out;
}
if (bvdev->bdev) {
bvsession->io_channel = spdk_bdev_get_io_channel(bvdev->bdev_desc);
if (!bvsession->io_channel) {
free_task_pool(bvsession);
SPDK_ERRLOG("%s: I/O channel allocation failed\n", vsession->name);
rc = -1;
goto out;
}
}
bvsession->requestq_poller = spdk_poller_register(bvdev->bdev ? vdev_worker : no_bdev_vdev_worker,
bvsession, 0);
SPDK_INFOLOG(SPDK_LOG_VHOST, "%s: started poller on lcore %d\n",
vsession->name, spdk_env_get_current_core());
out:
vhost_session_start_done(vsession, rc);
return rc;
}
static int
vhost_blk_start(struct spdk_vhost_session *vsession)
{
return vhost_session_send_event(vsession, vhost_blk_start_cb,
3, "start session");
}
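/* Poller armed during session stop: once all outstanding tasks have
* completed, it releases the session's I/O channel and task pool and
* signals that the stop is done. */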
static int
destroy_session_poller_cb(void *arg)
{
struct spdk_vhost_blk_session *bvsession = arg;
struct spdk_vhost_session *vsession = &bvsession->vsession;
int i;
if (vsession->task_cnt > 0) {
return -1;
}
if (spdk_vhost_trylock() != 0) {
return -1;
}
for (i = 0; i < vsession->max_queues; i++) {
vsession->virtqueue[i].next_event_time = 0;
vhost_vq_used_signal(vsession, &vsession->virtqueue[i]);
}
SPDK_INFOLOG(SPDK_LOG_VHOST, "%s: stopping poller on lcore %d\n",
vsession->name, spdk_env_get_current_core());
if (bvsession->io_channel) {
spdk_put_io_channel(bvsession->io_channel);
bvsession->io_channel = NULL;
}
free_task_pool(bvsession);
spdk_poller_unregister(&bvsession->stop_poller);
vhost_session_stop_done(vsession, 0);
spdk_vhost_unlock();
return -1;
}
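/* Session stop callback: stop processing new requests and let
* destroy_session_poller_cb finish tear-down once inflight I/O
* drains. */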
static int
vhost_blk_stop_cb(struct spdk_vhost_dev *vdev,
struct spdk_vhost_session *vsession, void *unused)
{
struct spdk_vhost_blk_session *bvsession = to_blk_session(vsession);
spdk_poller_unregister(&bvsession->requestq_poller);
bvsession->stop_poller = spdk_poller_register(destroy_session_poller_cb,
bvsession, 1000);
return 0;
}
static int
vhost_blk_stop(struct spdk_vhost_session *vsession)
{
return vhost_session_send_event(vsession, vhost_blk_stop_cb,
3, "stop session");
}
static void
vhost_blk_dump_info_json(struct spdk_vhost_dev *vdev, struct spdk_json_write_ctx *w)
{
struct spdk_bdev *bdev = spdk_vhost_blk_get_dev(vdev);
struct spdk_vhost_blk_dev *bvdev;
bvdev = to_blk_dev(vdev);
assert(bvdev != NULL);
spdk_json_write_named_object_begin(w, "block");
spdk_json_write_named_bool(w, "readonly", bvdev->readonly);
spdk_json_write_name(w, "bdev");
if (bdev) {
spdk_json_write_string(w, spdk_bdev_get_name(bdev));
} else {
spdk_json_write_null(w);
}
spdk_json_write_object_end(w);
}
static void
vhost_blk_write_config_json(struct spdk_vhost_dev *vdev, struct spdk_json_write_ctx *w)
{
struct spdk_vhost_blk_dev *bvdev;
bvdev = to_blk_dev(vdev);
assert(bvdev != NULL);
if (!bvdev->bdev) {
return;
}
spdk_json_write_object_begin(w);
spdk_json_write_named_string(w, "method", "vhost_create_blk_controller");
spdk_json_write_named_object_begin(w, "params");
spdk_json_write_named_string(w, "ctrlr", vdev->name);
spdk_json_write_named_string(w, "dev_name", spdk_bdev_get_name(bvdev->bdev));
spdk_json_write_named_string(w, "cpumask",
spdk_cpuset_fmt(spdk_thread_get_cpumask(vdev->thread)));
spdk_json_write_named_bool(w, "readonly", bvdev->readonly);
spdk_json_write_object_end(w);
spdk_json_write_object_end(w);
}
static int vhost_blk_destroy(struct spdk_vhost_dev *dev);
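/* Fill the virtio_blk_config area returned to the master for the
* GET_CONFIG message. */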
static int
vhost_blk_get_config(struct spdk_vhost_dev *vdev, uint8_t *config,
uint32_t len)
{
struct virtio_blk_config blkcfg;
struct spdk_vhost_blk_dev *bvdev;
struct spdk_bdev *bdev;
uint32_t blk_size;
uint64_t blkcnt;
memset(&blkcfg, 0, sizeof(blkcfg));
bvdev = to_blk_dev(vdev);
assert(bvdev != NULL);
bdev = bvdev->bdev;
if (bdev == NULL) {
/* We can't just return -1 here as this GET_CONFIG message might
* be caused by a QEMU VM reboot. Returning -1 will indicate an
* error to QEMU, which might then decide to terminate itself.
* We don't want that. A simple reboot shouldn't break the system.
*
* Presenting a block device with block size 0 and block count 0
* doesn't cause any problems on the QEMU side and the virtio-pci
* device is even still available inside the VM, but there will
* be no block device created for it - the kernel drivers will
* silently reject it.
*/
blk_size = 0;
blkcnt = 0;
} else {
blk_size = spdk_bdev_get_block_size(bdev);
blkcnt = spdk_bdev_get_num_blocks(bdev);
if (spdk_bdev_get_buf_align(bdev) > 1) {
blkcfg.size_max = SPDK_BDEV_LARGE_BUF_MAX_SIZE;
blkcfg.seg_max = spdk_min(SPDK_VHOST_IOVS_MAX - 2 - 1, BDEV_IO_NUM_CHILD_IOV - 2 - 1);
} else {
blkcfg.size_max = 131072;
/* -2 for REQ and RESP and -1 for region boundary splitting */
blkcfg.seg_max = SPDK_VHOST_IOVS_MAX - 2 - 1;
}
}
blkcfg.blk_size = blk_size;
/* minimum I/O size in blocks */
blkcfg.min_io_size = 1;
/* expressed in 512-byte sectors */
blkcfg.capacity = (blkcnt * blk_size) / 512;
/* QEMU can overwrite this value when started */
blkcfg.num_queues = SPDK_VHOST_MAX_VQUEUES;
if (bdev && spdk_bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_UNMAP)) {
/* 16 MiB, expressed in 512-byte sectors */
blkcfg.max_discard_sectors = 32768;
blkcfg.max_discard_seg = 1;
blkcfg.discard_sector_alignment = blk_size / 512;
}
if (bdev && spdk_bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_WRITE_ZEROES)) {
blkcfg.max_write_zeroes_sectors = 32768;
blkcfg.max_write_zeroes_seg = 1;
}
memcpy(config, &blkcfg, spdk_min(len, sizeof(blkcfg)));
return 0;
}
static const struct spdk_vhost_dev_backend vhost_blk_device_backend = {
.session_ctx_size = sizeof(struct spdk_vhost_blk_session) - sizeof(struct spdk_vhost_session),
.start_session = vhost_blk_start,
.stop_session = vhost_blk_stop,
.vhost_get_config = vhost_blk_get_config,
.dump_info_json = vhost_blk_dump_info_json,
.write_config_json = vhost_blk_write_config_json,
.remove_device = vhost_blk_destroy,
};
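/* Create controllers defined in [VhostBlkX] sections of the legacy
* INI-style configuration file. */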
int
vhost_blk_controller_construct(void)
{
struct spdk_conf_section *sp;
unsigned ctrlr_num;
char *bdev_name;
char *cpumask;
char *name;
bool readonly;
for (sp = spdk_conf_first_section(NULL); sp != NULL; sp = spdk_conf_next_section(sp)) {
if (!spdk_conf_section_match_prefix(sp, "VhostBlk")) {
continue;
}
if (sscanf(spdk_conf_section_get_name(sp), "VhostBlk%u", &ctrlr_num) != 1) {
SPDK_ERRLOG("Section '%s' has non-numeric suffix.\n",
spdk_conf_section_get_name(sp));
return -1;
}
name = spdk_conf_section_get_val(sp, "Name");
if (name == NULL) {
SPDK_ERRLOG("VhostBlk%u: missing Name\n", ctrlr_num);
return -1;
}
cpumask = spdk_conf_section_get_val(sp, "Cpumask");
readonly = spdk_conf_section_get_boolval(sp, "ReadOnly", false);
bdev_name = spdk_conf_section_get_val(sp, "Dev");
if (bdev_name == NULL) {
continue;
}
if (spdk_vhost_blk_construct(name, cpumask, bdev_name, readonly) < 0) {
return -1;
}
}
return 0;
}
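/* Create and register a vhost-blk controller: open the named bdev,
* advertise feature bits based on its capabilities, and register the
* vhost device on a thread matching the given cpumask. */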
int
spdk_vhost_blk_construct(const char *name, const char *cpumask, const char *dev_name, bool readonly)
{
struct spdk_vhost_blk_dev *bvdev = NULL;
struct spdk_vhost_dev *vdev;
struct spdk_bdev *bdev;
int ret = 0;
spdk_vhost_lock();
bdev = spdk_bdev_get_by_name(dev_name);
if (bdev == NULL) {
SPDK_ERRLOG("%s: bdev '%s' not found\n",
name, dev_name);
ret = -ENODEV;
goto out;
}
bvdev = calloc(1, sizeof(*bvdev));
if (bvdev == NULL) {
ret = -ENOMEM;
goto out;
}
vdev = &bvdev->vdev;
vdev->virtio_features = SPDK_VHOST_BLK_FEATURES_BASE;
vdev->disabled_features = SPDK_VHOST_BLK_DISABLED_FEATURES;
vdev->protocol_features = SPDK_VHOST_BLK_PROTOCOL_FEATURES;
if (spdk_bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_UNMAP)) {
vdev->virtio_features |= (1ULL << VIRTIO_BLK_F_DISCARD);
}
if (spdk_bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_WRITE_ZEROES)) {
vdev->virtio_features |= (1ULL << VIRTIO_BLK_F_WRITE_ZEROES);
}
if (readonly) {
vdev->virtio_features |= (1ULL << VIRTIO_BLK_F_RO);
}
if (spdk_bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_FLUSH)) {
vdev->virtio_features |= (1ULL << VIRTIO_BLK_F_FLUSH);
}
ret = spdk_bdev_open(bdev, true, bdev_remove_cb, bvdev, &bvdev->bdev_desc);
if (ret != 0) {
SPDK_ERRLOG("%s: could not open bdev '%s', error=%d\n",
name, dev_name, ret);
goto out;
}
bvdev->bdev = bdev;
bvdev->readonly = readonly;
ret = vhost_dev_register(vdev, name, cpumask, &vhost_blk_device_backend);
if (ret != 0) {
spdk_bdev_close(bvdev->bdev_desc);
goto out;
}
SPDK_INFOLOG(SPDK_LOG_VHOST, "%s: using bdev '%s'\n", name, dev_name);
out:
if (ret != 0 && bvdev) {
free(bvdev);
}
spdk_vhost_unlock();
return ret;
}
static int
vhost_blk_destroy(struct spdk_vhost_dev *vdev)
{
struct spdk_vhost_blk_dev *bvdev = to_blk_dev(vdev);
int rc;
assert(bvdev != NULL);
rc = vhost_dev_unregister(&bvdev->vdev);
if (rc != 0) {
return rc;
}
if (bvdev->bdev_desc) {
spdk_bdev_close(bvdev->bdev_desc);
bvdev->bdev_desc = NULL;
}
bvdev->bdev = NULL;
free(bvdev);
return 0;
}
SPDK_LOG_REGISTER_COMPONENT("vhost_blk", SPDK_LOG_VHOST_BLK)
SPDK_LOG_REGISTER_COMPONENT("vhost_blk_data", SPDK_LOG_VHOST_BLK_DATA)