diff --git a/etc/spdk/vhost.conf.in b/etc/spdk/vhost.conf.in index a14a362f86..b7f1b8ec5c 100644 --- a/etc/spdk/vhost.conf.in +++ b/etc/spdk/vhost.conf.in @@ -138,3 +138,17 @@ # this cpumask. By default, it not specified, will use any core in the # SPDK process. #Cpumask 0x1 + +#[VhostNvme0] + # Define name for controller + #Name vhost.0 + #NumberOfQueues 2 + # Use first partition from the first NVMe device + #Namespace Nvme0n1p0 + # Use second partition from the first NVMe device + #Namespace Nvme0n1p1 + + # Start the poller for this vhost controller on one of the cores in + # this cpumask. By default, if not specified, will use any core in the + # SPDK process. + #Cpumask 0x1
diff --git a/lib/vhost/Makefile b/lib/vhost/Makefile index 09581b4185..b46978e2b0 100644 --- a/lib/vhost/Makefile +++ b/lib/vhost/Makefile @@ -38,7 +38,7 @@ CFLAGS += -I. CFLAGS += -Irte_vhost CFLAGS += $(ENV_CFLAGS) -C_SRCS = vhost.c vhost_rpc.c vhost_scsi.c vhost_blk.c +C_SRCS = vhost.c vhost_rpc.c vhost_scsi.c vhost_blk.c vhost_nvme.c LIBNAME = vhost
diff --git a/lib/vhost/rte_vhost/rte_vhost.h b/lib/vhost/rte_vhost/rte_vhost.h index 08380cca6b..2e138bec53 100644 --- a/lib/vhost/rte_vhost/rte_vhost.h +++ b/lib/vhost/rte_vhost/rte_vhost.h @@ -104,6 +104,9 @@ struct vhost_device_ops { * is used to inform the application on such change. */ int (*features_changed)(int vid, uint64_t features); + int (*vhost_nvme_admin_passthrough)(int vid, void *cmd, void *cqe, void *buf); + int (*vhost_nvme_set_cq_call)(int vid, uint16_t qid, int fd); + int (*vhost_nvme_get_cap)(int vid, uint64_t *cap); int (*new_connection)(int vid); void (*destroy_connection)(int vid);
diff --git a/lib/vhost/rte_vhost/vhost.h b/lib/vhost/rte_vhost/vhost.h index 4e741e721c..b0a0201d36 100644 --- a/lib/vhost/rte_vhost/vhost.h +++ b/lib/vhost/rte_vhost/vhost.h @@ -178,6 +178,7 @@ struct virtio_net { uint64_t negotiated_features; uint64_t protocol_features; int vid; + uint32_t is_nvme; uint32_t flags; uint16_t vhost_hlen; /* to tell if we need broadcast rarp packet */
diff --git a/lib/vhost/rte_vhost/vhost_user.c b/lib/vhost/rte_vhost/vhost_user.c index bc38e3d61e..a1f620fa0f 100644 --- a/lib/vhost/rte_vhost/vhost_user.c +++ b/lib/vhost/rte_vhost/vhost_user.c @@ -35,6 +35,7 @@ #include #include #include +#include #include #include #include @@ -78,6 +79,11 @@ static const char *vhost_message_str[VHOST_USER_MAX] = { [VHOST_USER_NET_SET_MTU] = "VHOST_USER_NET_SET_MTU", [VHOST_USER_GET_CONFIG] = "VHOST_USER_GET_CONFIG", [VHOST_USER_SET_CONFIG] = "VHOST_USER_SET_CONFIG", + [VHOST_USER_NVME_ADMIN] = "VHOST_USER_NVME_ADMIN", + [VHOST_USER_NVME_SET_CQ_CALL] = "VHOST_USER_NVME_SET_CQ_CALL", + [VHOST_USER_NVME_GET_CAP] = "VHOST_USER_NVME_GET_CAP", + [VHOST_USER_NVME_START_STOP] = "VHOST_USER_NVME_START_STOP", + [VHOST_USER_NVME_IO_CMD] = "VHOST_USER_NVME_IO_CMD" }; static uint64_t @@ -548,6 +554,14 @@ vhost_user_set_mem_table(struct virtio_net *dev, struct VhostUserMsg *pmsg) memcpy(&dev->mem_table, &pmsg->payload.memory, sizeof(dev->mem_table)); memcpy(dev->mem_table_fds, pmsg->fds, sizeof(dev->mem_table_fds)); dev->has_new_mem_table = 1; + /* vhost-user-nvme will not send * set vring addr message, enable * memory address table now.
+ */ + if (dev->has_new_mem_table && dev->is_nvme) { + vhost_setup_mem_table(dev); + dev->has_new_mem_table = 0; + } return 0; } @@ -1040,12 +1054,59 @@ vhost_user_check_and_alloc_queue_pair(struct virtio_net *dev, VhostUserMsg *msg) return alloc_vring_queue(dev, vring_idx); } +static int +vhost_user_nvme_io_request_passthrough(struct virtio_net *dev, + uint16_t qid, uint16_t tail_head, + bool is_submission_queue) +{ + return -1; +} + +static int +vhost_user_nvme_admin_passthrough(struct virtio_net *dev, + void *cmd, void *cqe, void *buf) +{ + if (dev->notify_ops->vhost_nvme_admin_passthrough) { + return dev->notify_ops->vhost_nvme_admin_passthrough(dev->vid, cmd, cqe, buf); + } + + return -1; +} + +static int +vhost_user_nvme_set_cq_call(struct virtio_net *dev, uint16_t qid, int fd) +{ + if (dev->notify_ops->vhost_nvme_set_cq_call) { + return dev->notify_ops->vhost_nvme_set_cq_call(dev->vid, qid, fd); + } + + return -1; +} + +static int +vhost_user_nvme_get_cap(struct virtio_net *dev, uint64_t *cap) +{ + if (dev->notify_ops->vhost_nvme_get_cap) { + return dev->notify_ops->vhost_nvme_get_cap(dev->vid, cap); + } + + return -1; +} + int vhost_user_msg_handler(int vid, int fd) { struct virtio_net *dev; struct VhostUserMsg msg; + struct vhost_vring_file file; int ret; + uint64_t cap; + uint64_t enable; + uint8_t cqe[16]; + uint8_t cmd[64]; + uint8_t buf[4096]; + uint16_t qid, tail_head; + bool is_submission_queue; dev = get_device(vid); if (dev == NULL) @@ -1106,6 +1167,60 @@ vhost_user_msg_handler(int vid, int fd) ret = 0; } break; + case VHOST_USER_NVME_ADMIN: + if (!dev->is_nvme) { + dev->is_nvme = 1; + } + memcpy(cmd, &msg.payload.nvme.cmd, 64); + ret = vhost_user_nvme_admin_passthrough(dev, cmd, cqe, buf); + memcpy(&msg.payload.nvme.cmd, &cqe, 16); + msg.size = 16; + /* NVMe Identify Command */ + if (cmd[0] == 0x06) { + memcpy(msg.payload.nvme.buf, &buf, 4096); + msg.size += 4096; + } else if (cmd[0] == 0x09 || cmd[0] == 0x0a) { + memcpy(&msg.payload.nvme.buf, &buf, 4); + msg.size += 4096; + + } + send_vhost_message(fd, &msg); + break; + case VHOST_USER_NVME_SET_CQ_CALL: + file.index = msg.payload.u64 & VHOST_USER_VRING_IDX_MASK; + file.fd = msg.fds[0]; + ret = vhost_user_nvme_set_cq_call(dev, file.index, file.fd); + break; + case VHOST_USER_NVME_GET_CAP: + ret = vhost_user_nvme_get_cap(dev, &cap); + if (!ret) + msg.payload.u64 = cap; + else + msg.payload.u64 = 0; + msg.size = sizeof(msg.payload.u64); + send_vhost_message(fd, &msg); + break; + case VHOST_USER_NVME_START_STOP: + enable = msg.payload.u64; + /* device must be started before set cq call */ + if (enable) { + if (!(dev->flags & VIRTIO_DEV_RUNNING)) { + if (dev->notify_ops->new_device(dev->vid) == 0) + dev->flags |= VIRTIO_DEV_RUNNING; + } + } else { + if (dev->flags & VIRTIO_DEV_RUNNING) { + dev->flags &= ~VIRTIO_DEV_RUNNING; + dev->notify_ops->destroy_device(dev->vid); + } + } + break; + case VHOST_USER_NVME_IO_CMD: + qid = msg.payload.nvme_io.qid; + tail_head = msg.payload.nvme_io.tail_head; + is_submission_queue = (msg.payload.nvme_io.queue_type == VHOST_USER_NVME_SUBMISSION_QUEUE) ? 
true : false; + vhost_user_nvme_io_request_passthrough(dev, qid, tail_head, is_submission_queue); + break; case VHOST_USER_GET_FEATURES: msg.payload.u64 = vhost_user_get_features(dev); msg.size = sizeof(msg.payload.u64); diff --git a/lib/vhost/rte_vhost/vhost_user.h b/lib/vhost/rte_vhost/vhost_user.h index 1f3941c8c6..5b18a3c5be 100644 --- a/lib/vhost/rte_vhost/vhost_user.h +++ b/lib/vhost/rte_vhost/vhost_user.h @@ -84,6 +84,11 @@ typedef enum VhostUserRequest { VHOST_USER_NET_SET_MTU = 20, VHOST_USER_GET_CONFIG = 24, VHOST_USER_SET_CONFIG = 25, + VHOST_USER_NVME_ADMIN = 27, + VHOST_USER_NVME_SET_CQ_CALL = 28, + VHOST_USER_NVME_GET_CAP = 29, + VHOST_USER_NVME_START_STOP = 30, + VHOST_USER_NVME_IO_CMD = 31, VHOST_USER_MAX } VhostUserRequest; @@ -119,6 +124,17 @@ typedef struct VhostUserConfig { uint8_t region[VHOST_USER_MAX_CONFIG_SIZE]; } VhostUserConfig; +enum VhostUserNvmeQueueTypes { + VHOST_USER_NVME_SUBMISSION_QUEUE = 1, + VHOST_USER_NVME_COMPLETION_QUEUE = 2, +}; + +typedef struct VhostUserNvmeIO { + enum VhostUserNvmeQueueTypes queue_type; + uint32_t qid; + uint32_t tail_head; +} VhostUserNvmeIO; + typedef struct VhostUserMsg { VhostUserRequest request; @@ -136,6 +152,14 @@ typedef struct VhostUserMsg { VhostUserMemory memory; VhostUserLog log; VhostUserConfig config; + struct nvme { + union { + uint8_t req[64]; + uint8_t cqe[16]; + } cmd; + uint8_t buf[4096]; + } nvme; + struct VhostUserNvmeIO nvme_io; } payload; int fds[VHOST_MEMORY_MAX_NREGIONS]; } __attribute((packed)) VhostUserMsg; diff --git a/lib/vhost/vhost.c b/lib/vhost/vhost.c index 13cdd23def..b950eec1cb 100644 --- a/lib/vhost/vhost.c +++ b/lib/vhost/vhost.c @@ -79,6 +79,9 @@ const struct vhost_device_ops g_spdk_vhost_ops = { .set_config = set_config, .new_connection = new_connection, .destroy_connection = destroy_connection, + .vhost_nvme_admin_passthrough = spdk_vhost_nvme_admin_passthrough, + .vhost_nvme_set_cq_call = spdk_vhost_nvme_set_cq_call, + .vhost_nvme_get_cap = spdk_vhost_nvme_get_cap, }; static TAILQ_HEAD(, spdk_vhost_dev) g_spdk_vhost_devices = TAILQ_HEAD_INITIALIZER( @@ -534,6 +537,7 @@ spdk_vhost_dev_mem_unregister(struct spdk_vhost_dev *vdev) assert(false); } } + } static void @@ -882,7 +886,6 @@ spdk_vhost_event_send(struct spdk_vhost_dev *vdev, spdk_vhost_event_fn cb_fn, ev_ctx.vdev = vdev; ev_ctx.cb_fn = cb_fn; - ev = spdk_event_allocate(vdev->lcore, spdk_vhost_event_cb, &ev_ctx, NULL); assert(ev); spdk_event_call(ev); @@ -1290,6 +1293,12 @@ spdk_vhost_init(void) return -1; } + ret = spdk_vhost_nvme_controller_construct(); + if (ret != 0) { + SPDK_ERRLOG("Cannot construct vhost NVMe controllers\n"); + return -1; + } + return 0; } diff --git a/lib/vhost/vhost_internal.h b/lib/vhost/vhost_internal.h index cb32d33188..47dceab46a 100644 --- a/lib/vhost/vhost_internal.h +++ b/lib/vhost/vhost_internal.h @@ -254,5 +254,14 @@ void spdk_vhost_dump_config_json(struct spdk_vhost_dev *vdev, struct spdk_json_w void spdk_vhost_dev_backend_event_done(void *event_ctx, int response); void spdk_vhost_lock(void); void spdk_vhost_unlock(void); +int spdk_remove_vhost_controller(struct spdk_vhost_dev *vdev); +int spdk_vhost_nvme_admin_passthrough(int vid, void *cmd, void *cqe, void *buf); +int spdk_vhost_nvme_set_cq_call(int vid, uint16_t qid, int fd); +int spdk_vhost_nvme_get_cap(int vid, uint64_t *cap); +int spdk_vhost_nvme_controller_construct(void); +int spdk_vhost_nvme_dev_construct(const char *name, const char *cpumask, uint32_t io_queues); +int spdk_vhost_nvme_dev_remove(struct spdk_vhost_dev *vdev); +int 
spdk_vhost_nvme_dev_add_ns(struct spdk_vhost_dev *vdev, + const char *bdev_name); #endif /* SPDK_VHOST_INTERNAL_H */ diff --git a/lib/vhost/vhost_nvme.c b/lib/vhost/vhost_nvme.c new file mode 100644 index 0000000000..1701035eac --- /dev/null +++ b/lib/vhost/vhost_nvme.c @@ -0,0 +1,1296 @@ +/*- + * BSD LICENSE + * + * Copyright(c) Intel Corporation. All rights reserved. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include "spdk/stdinc.h" + +#include "spdk/nvme.h" +#include "spdk/env.h" +#include "spdk/conf.h" +#include "spdk/util.h" +#include "spdk/string.h" +#include "spdk/io_channel.h" +#include "spdk/barrier.h" +#include "spdk/vhost.h" +#include "spdk/bdev.h" +#include "spdk/version.h" +#include "spdk/nvme_spec.h" +#include "spdk/likely.h" + +#include "vhost_internal.h" + +#define MAX_IO_QUEUES 31 +#define MAX_IOVS 64 +#define MAX_NAMESPACE 8 +#define MAX_QUEUE_ENTRIES_SUPPORTED 256 +#define MAX_BATCH_IO 8 + +struct spdk_vhost_nvme_sq { + uint16_t sqid; + uint16_t size; + uint16_t cqid; + bool valid; + struct spdk_nvme_cmd *sq_cmd; + uint16_t sq_head; + uint16_t sq_tail; +}; + +struct spdk_vhost_nvme_cq { + uint8_t phase; + uint16_t size; + uint16_t cqid; + bool valid; + volatile struct spdk_nvme_cpl *cq_cqe; + uint16_t cq_head; + uint16_t last_signaled_cq_head; + bool irq_enabled; + int virq; +}; + +struct spdk_vhost_nvme_ns { + struct spdk_bdev *bdev; + uint32_t block_size; + uint64_t capacity; + uint32_t nsid; + uint32_t active_ns; + struct spdk_bdev_desc *bdev_desc; + struct spdk_io_channel *bdev_io_channel; + struct spdk_nvme_ns_data nsdata; +}; + +struct spdk_vhost_nvme_task { + struct spdk_nvme_cmd cmd; + struct spdk_vhost_nvme_dev *nvme; + uint16_t sqid; + uint16_t cqid; + + /** array of iovecs to transfer. */ + struct iovec iovs[MAX_IOVS]; + + /** Number of iovecs in iovs array. */ + int iovcnt; + + /** Current iovec position. */ + int iovpos; + + /** Offset in current iovec. */ + uint32_t iov_offset; + + /* parent pointer. 
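 * Set only for DSM (unmap) child tasks; each child decrements the parent's num_children when it completes.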
*/ + struct spdk_vhost_nvme_task *parent; + bool success; + uint32_t num_children; + STAILQ_ENTRY(spdk_vhost_nvme_task) stailq; +}; + +struct spdk_vhost_nvme_dev { + struct spdk_vhost_dev vdev; + + uint32_t num_io_queues; + union spdk_nvme_cap_register cap; + union spdk_nvme_cc_register cc; + union spdk_nvme_csts_register csts; + struct spdk_nvme_ctrlr_data cdata; + + uint32_t num_sqs; + uint32_t num_cqs; + + struct rte_vhost_memory *mem; + + uint32_t num_ns; + struct spdk_vhost_nvme_ns ns[MAX_NAMESPACE]; + + volatile uint32_t *dbbuf_dbs; + volatile uint32_t *dbbuf_eis; + struct spdk_vhost_nvme_sq sq_queue[MAX_IO_QUEUES + 1]; + struct spdk_vhost_nvme_cq cq_queue[MAX_IO_QUEUES + 1]; + + TAILQ_ENTRY(spdk_vhost_nvme_dev) tailq; + STAILQ_HEAD(, spdk_vhost_nvme_task) free_tasks; + struct spdk_poller *requestq_poller; +}; + +static const struct spdk_vhost_dev_backend spdk_vhost_nvme_device_backend; + +/* + * Report the SPDK version as the firmware revision. + * SPDK_VERSION_STRING won't fit into FR (only 8 bytes), so try to fit the most important parts. + */ +#define FW_VERSION SPDK_VERSION_MAJOR_STRING SPDK_VERSION_MINOR_STRING SPDK_VERSION_PATCH_STRING + +static struct spdk_vhost_nvme_dev * +to_nvme_dev(struct spdk_vhost_dev *vdev) +{ + if (vdev == NULL) { + return NULL; + } + + if (vdev->backend != &spdk_vhost_nvme_device_backend) { + SPDK_ERRLOG("%s: not a vhost-nvme device\n", vdev->name); + return NULL; + } + + return SPDK_CONTAINEROF(vdev, struct spdk_vhost_nvme_dev, vdev); +} + +static TAILQ_HEAD(, spdk_vhost_nvme_dev) g_nvme_ctrlrs = TAILQ_HEAD_INITIALIZER(g_nvme_ctrlrs); + +static inline unsigned int sq_offset(unsigned int qid, uint32_t db_stride) +{ + return qid * 2 * db_stride; +} + +static inline unsigned int cq_offset(unsigned int qid, uint32_t db_stride) +{ + return (qid * 2 + 1) * db_stride; +} + +static void +nvme_inc_cq_head(struct spdk_vhost_nvme_cq *cq) +{ + cq->cq_head++; + if (cq->cq_head >= cq->size) { + cq->cq_head = 0; + cq->phase = !cq->phase; + } +} + +static void +nvme_inc_sq_head(struct spdk_vhost_nvme_sq *sq) +{ + sq->sq_head = (sq->sq_head + 1) % sq->size; +} + +static struct spdk_vhost_nvme_sq * +spdk_vhost_nvme_get_sq_from_qid(struct spdk_vhost_nvme_dev *dev, uint16_t qid) +{ + if (spdk_unlikely(!qid || qid > MAX_IO_QUEUES)) { + return NULL; + } + + return &dev->sq_queue[qid]; +} + +static struct spdk_vhost_nvme_cq * +spdk_vhost_nvme_get_cq_from_qid(struct spdk_vhost_nvme_dev *dev, uint16_t qid) +{ + if (spdk_unlikely(!qid || qid > MAX_IO_QUEUES)) { + return NULL; + } + + return &dev->cq_queue[qid]; +} + +static int +spdk_nvme_map_prps(struct spdk_vhost_nvme_dev *nvme, struct spdk_nvme_cmd *cmd, + struct spdk_vhost_nvme_task *task, uint32_t len) +{ + uint64_t prp1, prp2; + uintptr_t vva; + uint32_t i; + uint32_t residue_len, mps = 4096; + uint64_t *prp_list; + + prp1 = cmd->dptr.prp.prp1; + prp2 = cmd->dptr.prp.prp2; + + vva = (uintptr_t)rte_vhost_gpa_to_vva(nvme->mem, prp1); + if (spdk_unlikely(vva == 0)) { + SPDK_ERRLOG("GPA to VVA failed\n"); + return -1; + } + task->iovs[0].iov_base = (void *)vva; + /* PRP1 may started with unaligned page address */ + residue_len = mps - (prp1 % mps); + residue_len = spdk_min(len, residue_len); + task->iovs[0].iov_len = residue_len; + + len -= residue_len; + + if (len) { + if (len <= mps) { + /* 2 PRP used */ + task->iovcnt = 2; + assert(prp2 != 0); + vva = (uintptr_t)rte_vhost_gpa_to_vva(nvme->mem, prp2); + if (spdk_unlikely(vva == 0)) { + return -1; + } + task->iovs[1].iov_base = (void *)vva; + task->iovs[1].iov_len = len; 
+ } else { + /* PRP list used */ + assert(prp2 != 0); + vva = (uintptr_t)rte_vhost_gpa_to_vva(nvme->mem, prp2); + if (spdk_unlikely(vva == 0)) { + return -1; + } + prp_list = (uint64_t *)vva; + i = 0; + while (len != 0) { + residue_len = spdk_min(len, mps); + vva = (uintptr_t)rte_vhost_gpa_to_vva(nvme->mem, prp_list[i]); + if (spdk_unlikely(vva == 0)) { + return -1; + } + task->iovs[i + 1].iov_base = (void *)vva; + task->iovs[i + 1].iov_len = residue_len; + len -= residue_len; + i++; + } + task->iovcnt = i + 1; + } + } else { + /* 1 PRP used */ + task->iovcnt = 1; + } + + return 0; +} + +static void +blk_request_complete_cb(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg) +{ + struct spdk_vhost_nvme_task *task = cb_arg; + struct spdk_vhost_nvme_dev *nvme = task->nvme; + uint16_t cqid; + struct spdk_nvme_cpl cqe = {0}; + struct spdk_vhost_nvme_cq *cq; + struct spdk_vhost_nvme_sq *sq; + struct spdk_nvme_cmd *cmd = &task->cmd; + uint32_t cq_head; + int sc, sct; + + if (spdk_likely(bdev_io)) { + spdk_bdev_free_io(bdev_io); + } + + cqid = task->cqid; + cq = spdk_vhost_nvme_get_cq_from_qid(nvme, cqid); + sq = spdk_vhost_nvme_get_sq_from_qid(nvme, task->sqid); + if (spdk_unlikely(!cq || !sq)) { + spdk_bdev_free_io(bdev_io); + return; + } + + cqe.sqid = task->sqid; + cqe.sqhd = sq->sq_head; + cqe.cid = cmd->cid; + cqe.status.sct = 0; + cqe.status.sc = 0; + if (spdk_unlikely(!success)) { + spdk_bdev_io_get_nvme_status(bdev_io, &sct, &sc); + cqe.status.sct = sct; + cqe.status.sc = sc; + cqe.status.dnr = 1; + SPDK_ERRLOG("I/O error, sector %u\n", cmd->cdw10); + } + cqe.status.p = !cq->phase; + cq->cq_cqe[cq->cq_head] = cqe; + spdk_smp_wmb(); + cq->cq_cqe[cq->cq_head].status.p = cq->phase; + + nvme_inc_cq_head(cq); + + /* completion */ + cq_head = nvme->dbbuf_dbs[cq_offset(cqid, 1)]; + if (cq_head != cq->last_signaled_cq_head) { + cq->last_signaled_cq_head = (uint16_t)cq_head; + /* MMIO Controll */ + nvme->dbbuf_eis[cq_offset(cqid, 1)] = (uint32_t)(cq_head - 1); + } + + if (cq->irq_enabled && (cq->cq_head != cq_head)) { + eventfd_write(cq->virq, (eventfd_t)1); + } + + STAILQ_INSERT_TAIL(&nvme->free_tasks, task, stailq); +} + +static void +blk_unmap_complete_cb(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg) +{ + struct spdk_vhost_nvme_task *child = cb_arg; + struct spdk_vhost_nvme_task *task = child->parent; + struct spdk_vhost_nvme_dev *nvme = task->nvme; + + if (bdev_io) { + spdk_bdev_free_io(bdev_io); + } + + if (task) { + task->num_children--; + if (!success) { + task->success = false; + } + if (!task->num_children) { + blk_request_complete_cb(NULL, task->success, task); + } + } + STAILQ_INSERT_TAIL(&nvme->free_tasks, child, stailq); +} + +static struct spdk_vhost_nvme_ns * +spdk_vhost_nvme_get_ns_from_nsid(struct spdk_vhost_nvme_dev *dev, uint32_t nsid) +{ + if (spdk_unlikely(!nsid || nsid > dev->num_ns)) { + return NULL; + } + + return &dev->ns[nsid - 1]; +} + +static int +spdk_nvme_process_sq(struct spdk_vhost_nvme_dev *nvme, struct spdk_vhost_nvme_sq *sq, + struct spdk_vhost_nvme_task *task) +{ + struct spdk_vhost_nvme_task *child; + struct spdk_nvme_cmd *cmd = &task->cmd; + struct spdk_vhost_nvme_ns *ns; + int ret = -1; + uint32_t len, nlba, block_size; + uint64_t slba; + struct spdk_nvme_dsm_range *range; + uint16_t i, num_ranges = 0; + + task->nvme = nvme; + + ns = spdk_vhost_nvme_get_ns_from_nsid(nvme, cmd->nsid); + if (spdk_unlikely(!ns)) { + blk_request_complete_cb(NULL, false, task); + return -1; + } + + block_size = ns->block_size; + task->success = true; + 
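/* no children yet; record the queue pair so the completion path posts the CQE to the right CQ */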
task->num_children = 0; + task->cqid = sq->cqid; + task->sqid = sq->sqid; + + if (spdk_unlikely(!ns->active_ns)) { + blk_request_complete_cb(NULL, false, task); + return -1; + } + + /* valid only for Read/Write commands */ + nlba = (cmd->cdw12 & 0xffff) + 1; + slba = cmd->cdw11; + slba = (slba << 32) | cmd->cdw10; + + if (cmd->opc == SPDK_NVME_OPC_READ || cmd->opc == SPDK_NVME_OPC_WRITE || + cmd->opc == SPDK_NVME_OPC_DATASET_MANAGEMENT) { + + assert(cmd->psdt == SPDK_NVME_PSDT_PRP); + + if (cmd->opc == SPDK_NVME_OPC_DATASET_MANAGEMENT) { + num_ranges = (cmd->cdw10 & 0xff) + 1; + len = num_ranges * sizeof(struct spdk_nvme_dsm_range); + } else { + len = nlba * block_size; + } + + ret = spdk_nvme_map_prps(nvme, cmd, task, len); + if (spdk_unlikely(ret != 0)) { + SPDK_ERRLOG("nvme command map prps failed\n"); + blk_request_complete_cb(NULL, false, task); + return -1; + } + } + + switch (cmd->opc) { + case SPDK_NVME_OPC_READ: + ret = spdk_bdev_readv(ns->bdev_desc, ns->bdev_io_channel, + task->iovs, task->iovcnt, slba * block_size, + nlba * block_size, blk_request_complete_cb, task); + break; + case SPDK_NVME_OPC_WRITE: + ret = spdk_bdev_writev(ns->bdev_desc, ns->bdev_io_channel, + task->iovs, task->iovcnt, slba * block_size, + nlba * block_size, blk_request_complete_cb, task); + break; + case SPDK_NVME_OPC_FLUSH: + ret = spdk_bdev_flush(ns->bdev_desc, ns->bdev_io_channel, + 0, ns->capacity, + blk_request_complete_cb, task); + break; + case SPDK_NVME_OPC_DATASET_MANAGEMENT: + range = (struct spdk_nvme_dsm_range *)task->iovs[0].iov_base; + for (i = 0; i < num_ranges; i++) { + if (!STAILQ_EMPTY(&nvme->free_tasks)) { + child = STAILQ_FIRST(&nvme->free_tasks); + STAILQ_REMOVE_HEAD(&nvme->free_tasks, stailq); + } else { + SPDK_ERRLOG("No free task now\n"); + ret = -1; + break; + } + task->num_children++; + child->parent = task; + ret = spdk_bdev_unmap(ns->bdev_desc, ns->bdev_io_channel, + range[i].starting_lba * block_size, + range[i].length * block_size, + blk_unmap_complete_cb, child); + if (ret) { + STAILQ_INSERT_TAIL(&nvme->free_tasks, child, stailq); + break; + } + } + break; + default: + ret = -1; + break; + } + + if (spdk_unlikely(ret)) { + /* post error status to cqe */ + SPDK_ERRLOG("Error Submission For Command %u, ret %d\n", cmd->opc, ret); + blk_request_complete_cb(NULL, false, task); + } + + return ret; +} + +static int +nvme_worker(void *arg) +{ + struct spdk_vhost_nvme_dev *nvme = (struct spdk_vhost_nvme_dev *)arg; + struct spdk_vhost_nvme_sq *sq; + struct spdk_vhost_nvme_task *task; + uint32_t qid, dbbuf_sq; + int ret; + int count = -1; + + if (spdk_unlikely(!nvme->num_sqs)) { + return -1; + } + + /* worker thread can't start before the admin doorbell + * buffer config command + */ + if (spdk_unlikely(!nvme->dbbuf_dbs)) { + return -1; + } + + /* Submission Queue */ + for (qid = 1; qid <= MAX_IO_QUEUES; qid++) { + + sq = spdk_vhost_nvme_get_sq_from_qid(nvme, qid); + if (!sq->valid) { + continue; + } + + dbbuf_sq = nvme->dbbuf_dbs[sq_offset(qid, 1)]; + sq->sq_tail = (uint16_t)dbbuf_sq; + count = 0; + + while (sq->sq_head != sq->sq_tail) { + if (spdk_unlikely(!sq->sq_cmd)) { + break; + } + if (spdk_likely(!STAILQ_EMPTY(&nvme->free_tasks))) { + task = STAILQ_FIRST(&nvme->free_tasks); + STAILQ_REMOVE_HEAD(&nvme->free_tasks, stailq); + } else { + return -1; + } + + task->cmd = sq->sq_cmd[sq->sq_head]; + nvme_inc_sq_head(sq); + + /* processing IO */ + ret = spdk_nvme_process_sq(nvme, sq, task); + if (spdk_unlikely(ret)) { + SPDK_ERRLOG("QID %u CID %u, SQ HEAD %u, DBBUF SQ TAIL %u\n", qid, 
task->cmd.cid, sq->sq_head, + sq->sq_tail); + } + + /* MMIO Control */ + nvme->dbbuf_eis[sq_offset(qid, 1)] = (uint32_t)(sq->sq_head - 1); + + /* Maximum batch I/Os to pick up at once */ + if (count++ == MAX_BATCH_IO) { + break; + } + } + } + + return count; +} + +static int +vhost_nvme_doorbell_buffer_config(struct spdk_vhost_nvme_dev *nvme, + struct spdk_nvme_cmd *cmd, struct spdk_nvme_cpl *cpl) +{ + uint64_t dbs_dma_addr, eis_dma_addr; + + dbs_dma_addr = cmd->dptr.prp.prp1; + eis_dma_addr = cmd->dptr.prp.prp2; + + if ((dbs_dma_addr % 4096) || (eis_dma_addr % 4096)) { + return -1; + } + /* Guest Physical Address to Host Virtual Address */ + nvme->dbbuf_dbs = (void *)(uintptr_t)rte_vhost_gpa_to_vva(nvme->mem, dbs_dma_addr); + nvme->dbbuf_eis = (void *)(uintptr_t)rte_vhost_gpa_to_vva(nvme->mem, eis_dma_addr); + if (!nvme->dbbuf_dbs || !nvme->dbbuf_eis) { + return -1; + } + /* zeroed the doorbell buffer memory */ + memset((void *)nvme->dbbuf_dbs, 0, sizeof((nvme->num_sqs + 1) * 8)); + memset((void *)nvme->dbbuf_eis, 0, sizeof((nvme->num_sqs + 1) * 8)); + + cpl->status.sc = 0; + cpl->status.sct = 0; + return 0; +} + +static int +vhost_nvme_create_io_sq(struct spdk_vhost_nvme_dev *nvme, + struct spdk_nvme_cmd *cmd, struct spdk_nvme_cpl *cpl) +{ + uint16_t qid, qsize, cqid; + uint64_t dma_addr; + struct spdk_vhost_nvme_sq *sq; + + /* physical contiguous */ + if (!(cmd->cdw11 & 0x1)) { + return -1; + } + + cqid = (cmd->cdw11 >> 16) & 0xffff; + qid = cmd->cdw10 & 0xffff; + qsize = (cmd->cdw10 >> 16) & 0xffff; + dma_addr = cmd->dptr.prp.prp1; + if (!dma_addr || dma_addr % 4096) { + return -1; + } + + sq = spdk_vhost_nvme_get_sq_from_qid(nvme, qid); + if (!sq) { + return -1; + } + sq->sqid = qid; + sq->cqid = cqid; + sq->size = qsize + 1; + sq->sq_head = sq->sq_tail = 0; + sq->sq_cmd = (void *)(uintptr_t)rte_vhost_gpa_to_vva(nvme->mem, dma_addr); + if (!sq->sq_cmd) { + return -1; + } + nvme->num_sqs++; + sq->valid = true; + + cpl->status.sc = 0; + cpl->status.sct = 0; + return 0; +} + +static int +vhost_nvme_delete_io_sq(struct spdk_vhost_nvme_dev *nvme, + struct spdk_nvme_cmd *cmd, struct spdk_nvme_cpl *cpl) +{ + uint16_t qid; + struct spdk_vhost_nvme_sq *sq; + + qid = cmd->cdw10 & 0xffff; + sq = spdk_vhost_nvme_get_sq_from_qid(nvme, qid); + if (!sq) { + return -1; + } + + /* We didn't see scenarios when deleting submission + * queue while I/O is running against the submisson + * queue for now, otherwise, we must ensure the poller + * will not run with this submission queue. 
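 * For now, deletion just marks the queue invalid and clears its state.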
+ */ + nvme->num_sqs--; + sq->valid = false; + + memset(sq, 0, sizeof(*sq)); + sq->sq_cmd = NULL; + + cpl->status.sc = 0; + cpl->status.sct = 0; + + return 0; +} + +static int +vhost_nvme_create_io_cq(struct spdk_vhost_nvme_dev *nvme, + struct spdk_nvme_cmd *cmd, struct spdk_nvme_cpl *cpl) +{ + uint16_t qsize, qid; + uint64_t dma_addr; + struct spdk_vhost_nvme_cq *cq; + + /* physical contiguous */ + assert(cmd->cdw11 & 0x1); + qid = cmd->cdw10 & 0xffff; + qsize = (cmd->cdw10 >> 16) & 0xffff; + dma_addr = cmd->dptr.prp.prp1; + if (!dma_addr || dma_addr % 4096) { + return -1; + } + + cq = spdk_vhost_nvme_get_cq_from_qid(nvme, qid); + if (!cq) { + return -1; + } + cq->cqid = qid; + cq->size = qsize + 1; + cq->phase = 1; + cq->irq_enabled = (cmd->cdw11 >> 1) & 0x1; + /* Setup virq through vhost messages */ + cq->virq = -1; + cq->cq_head = 0; + cq->last_signaled_cq_head = 0; + cq->cq_cqe = (void *)(uintptr_t)rte_vhost_gpa_to_vva(nvme->mem, dma_addr); + if (!cq->cq_cqe) { + return -1; + } + nvme->num_cqs++; + cq->valid = true; + + cpl->status.sc = 0; + cpl->status.sct = 0; + return 0; +} + +static int +vhost_nvme_delete_io_cq(struct spdk_vhost_nvme_dev *nvme, + struct spdk_nvme_cmd *cmd, struct spdk_nvme_cpl *cpl) +{ + uint16_t qid; + struct spdk_vhost_nvme_cq *cq; + + qid = cmd->cdw10 & 0xffff; + cq = spdk_vhost_nvme_get_cq_from_qid(nvme, qid); + if (!cq) { + return -1; + } + nvme->num_cqs--; + cq->valid = false; + + memset(cq, 0, sizeof(*cq)); + cq->cq_cqe = NULL; + + cpl->status.sc = 0; + cpl->status.sct = 0; + return 0; +} + +static struct spdk_vhost_nvme_dev * +spdk_vhost_nvme_get_by_name(int vid) +{ + struct spdk_vhost_nvme_dev *nvme; + + TAILQ_FOREACH(nvme, &g_nvme_ctrlrs, tailq) { + if (nvme->vdev.vid == vid) { + return nvme; + } + } + + return NULL; +} + +int +spdk_vhost_nvme_get_cap(int vid, uint64_t *cap) +{ + struct spdk_vhost_nvme_dev *nvme; + + nvme = spdk_vhost_nvme_get_by_name(vid); + if (!nvme) { + return -1; + } + + *cap = nvme->cap.raw; + return 0; +} + +int +spdk_vhost_nvme_admin_passthrough(int vid, void *cmd, void *cqe, void *buf) +{ + struct spdk_nvme_cmd *req = (struct spdk_nvme_cmd *)cmd; + struct spdk_nvme_cpl *cpl = (struct spdk_nvme_cpl *)cqe; + struct spdk_vhost_nvme_ns *ns; + int ret = 0; + struct spdk_vhost_nvme_dev *nvme; + uint32_t cq_head, sq_tail; + uint32_t dw0; + + nvme = spdk_vhost_nvme_get_by_name(vid); + if (!nvme) { + return -1; + } + + SPDK_DEBUGLOG(SPDK_LOG_VHOST_NVME, "Admin Command Opcode %u\n", req->opc); + switch (req->opc) { + case SPDK_NVME_OPC_IDENTIFY: + if (req->cdw10 == SPDK_NVME_IDENTIFY_CTRLR) { + memcpy(buf, &nvme->cdata, sizeof(struct spdk_nvme_ctrlr_data)); + + } else if (req->cdw10 == SPDK_NVME_IDENTIFY_NS) { + ns = spdk_vhost_nvme_get_ns_from_nsid(nvme, req->nsid); + if (!ns) { + cpl->status.sc = SPDK_NVME_SC_NAMESPACE_ID_UNAVAILABLE; + cpl->status.sct = SPDK_NVME_SCT_COMMAND_SPECIFIC; + break; + } + memcpy(buf, &ns->nsdata, sizeof(struct spdk_nvme_ns_data)); + } + /* successfully */ + cpl->status.sc = 0; + cpl->status.sct = 0; + break; + case SPDK_NVME_OPC_CREATE_IO_CQ: + ret = vhost_nvme_create_io_cq(nvme, req, cpl); + break; + case SPDK_NVME_OPC_DELETE_IO_CQ: + ret = vhost_nvme_delete_io_cq(nvme, req, cpl); + break; + case SPDK_NVME_OPC_CREATE_IO_SQ: + ret = vhost_nvme_create_io_sq(nvme, req, cpl); + break; + case SPDK_NVME_OPC_DELETE_IO_SQ: + ret = vhost_nvme_delete_io_sq(nvme, req, cpl); + break; + case SPDK_NVME_OPC_GET_FEATURES: + case SPDK_NVME_OPC_SET_FEATURES: + if (req->cdw10 == SPDK_NVME_FEAT_NUMBER_OF_QUEUES) { + 
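/* report the fixed number of I/O queue pairs; the returned value is zero-based per the NVMe spec */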
cpl->status.sc = 0; + cpl->status.sct = 0; + dw0 = (nvme->num_io_queues - 1) | ((nvme->num_io_queues - 1) << 16); + memcpy(buf, &dw0, 4); + } else { + cpl->status.sc = SPDK_NVME_SC_INVALID_FIELD; + cpl->status.sct = SPDK_NVME_SCT_GENERIC; + } + break; + case SPDK_NVME_OPC_DOORBELL_BUFFER_CONFIG: + ret = vhost_nvme_doorbell_buffer_config(nvme, req, cpl); + break; + case SPDK_NVME_OPC_ABORT: + sq_tail = nvme->dbbuf_dbs[sq_offset(1, 1)] & 0xffffu; + cq_head = nvme->dbbuf_dbs[cq_offset(1, 1)] & 0xffffu; + SPDK_NOTICELOG("ABORT: CID %u, SQ_TAIL %u, CQ_HEAD %u\n", + (req->cdw10 >> 16) & 0xffffu, sq_tail, cq_head); + /* TODO: ABORT failed fow now */ + cpl->cdw0 = 1; + cpl->status.sc = 0; + cpl->status.sct = 0; + break; + } + + if (ret) { + SPDK_ERRLOG("Admin Passthrough Faild with %u\n", req->opc); + } + + return 0; +} + +int +spdk_vhost_nvme_set_cq_call(int vid, uint16_t qid, int fd) +{ + struct spdk_vhost_nvme_dev *nvme; + struct spdk_vhost_nvme_cq *cq; + + nvme = spdk_vhost_nvme_get_by_name(vid); + if (!nvme) { + return -1; + } + + cq = spdk_vhost_nvme_get_cq_from_qid(nvme, qid); + if (!cq) { + return -1; + } + if (cq->irq_enabled) { + cq->virq = fd; + } else { + SPDK_ERRLOG("NVMe Qid %d Disabled IRQ\n", qid); + } + + return 0; +} + +static void +free_task_pool(struct spdk_vhost_nvme_dev *nvme) +{ + struct spdk_vhost_nvme_task *task; + + while (!STAILQ_EMPTY(&nvme->free_tasks)) { + task = STAILQ_FIRST(&nvme->free_tasks); + STAILQ_REMOVE_HEAD(&nvme->free_tasks, stailq); + spdk_dma_free(task); + } +} + +static int +alloc_task_pool(struct spdk_vhost_nvme_dev *nvme) +{ + uint32_t entries, i; + struct spdk_vhost_nvme_task *task; + + entries = nvme->num_io_queues * MAX_QUEUE_ENTRIES_SUPPORTED; + + for (i = 0; i < entries; i++) { + task = spdk_dma_zmalloc(sizeof(struct spdk_vhost_nvme_task), + SPDK_CACHE_LINE_SIZE, NULL); + if (task == NULL) { + SPDK_ERRLOG("Controller %s alloc task pool failed\n", + nvme->vdev.name); + free_task_pool(nvme); + return -1; + } + STAILQ_INSERT_TAIL(&nvme->free_tasks, task, stailq); + } + + return 0; +} + +/* new device means enable the + * virtual NVMe controller + */ +static int +spdk_vhost_nvme_start_device(struct spdk_vhost_dev *vdev, void *event_ctx) +{ + struct spdk_vhost_nvme_dev *nvme = to_nvme_dev(vdev); + struct spdk_vhost_nvme_ns *ns_dev; + uint32_t i; + + spdk_vhost_dev_mem_register(vdev); + nvme->mem = vdev->mem; + + if (alloc_task_pool(nvme)) { + return -1; + } + + SPDK_NOTICELOG("Start Device %u, Path %s, lcore %d\n", vdev->vid, + vdev->path, vdev->lcore); + + for (i = 0; i < nvme->num_ns; i++) { + ns_dev = &nvme->ns[i]; + ns_dev->bdev_io_channel = spdk_bdev_get_io_channel(ns_dev->bdev_desc); + if (!ns_dev->bdev_io_channel) { + return -1; + } + } + + /* Start the NVMe Poller */ + nvme->requestq_poller = spdk_poller_register(nvme_worker, nvme, 0); + + spdk_vhost_dev_backend_event_done(event_ctx, 0); + return 0; +} + +static void +spdk_vhost_nvme_deactive_ns(struct spdk_vhost_nvme_ns *ns) +{ + ns->active_ns = 0; + spdk_bdev_close(ns->bdev_desc); + ns->bdev_desc = NULL; + ns->bdev = NULL; +} + +static void +bdev_remove_cb(void *remove_ctx) +{ + struct spdk_vhost_nvme_ns *ns = remove_ctx; + + SPDK_NOTICELOG("Removing NS %u, Block Device %s\n", + ns->nsid, spdk_bdev_get_name(ns->bdev)); + + spdk_vhost_nvme_deactive_ns(ns); +} + +struct spdk_vhost_dev_destroy_ctx { + struct spdk_vhost_nvme_dev *bvdev; + struct spdk_poller *poller; + void *event_ctx; +}; + +static int +destroy_device_poller_cb(void *arg) +{ + struct spdk_vhost_dev_destroy_ctx *ctx = arg; + struct 
spdk_vhost_nvme_dev *nvme = ctx->bvdev; + struct spdk_vhost_nvme_dev *dev, *tmp; + struct spdk_vhost_nvme_ns *ns_dev; + uint32_t i; + + SPDK_DEBUGLOG(SPDK_LOG_VHOST_NVME, "Destroy device poller callback\n"); + + TAILQ_FOREACH_SAFE(dev, &g_nvme_ctrlrs, tailq, tmp) { + if (dev == nvme) { + for (i = 0; i < nvme->num_ns; i++) { + ns_dev = &nvme->ns[i]; + if (ns_dev->bdev_io_channel) { + spdk_put_io_channel(ns_dev->bdev_io_channel); + ns_dev->bdev_io_channel = NULL; + } + } + nvme->num_sqs = 0; + nvme->num_cqs = 0; + nvme->dbbuf_dbs = NULL; + nvme->dbbuf_eis = NULL; + spdk_vhost_dev_mem_unregister(&nvme->vdev); + } + } + + spdk_poller_unregister(&ctx->poller); + spdk_vhost_dev_backend_event_done(ctx->event_ctx, 0); + free(ctx); + + return -1; +} + +/* Disable NVMe controller + */ +static int +spdk_vhost_nvme_stop_device(struct spdk_vhost_dev *vdev, void *event_ctx) +{ + struct spdk_vhost_nvme_dev *nvme = to_nvme_dev(vdev); + struct spdk_vhost_dev_destroy_ctx *destroy_ctx; + + free_task_pool(nvme); + SPDK_NOTICELOG("Stopping Device %u, Path %s\n", vdev->vid, vdev->path); + + destroy_ctx = malloc(sizeof(*destroy_ctx)); + if (destroy_ctx == NULL) { + SPDK_ERRLOG("Failed to alloc memory for destroying device.\n"); + goto err; + } + + destroy_ctx->bvdev = nvme; + destroy_ctx->event_ctx = event_ctx; + + spdk_poller_unregister(&nvme->requestq_poller); + destroy_ctx->poller = spdk_poller_register(destroy_device_poller_cb, destroy_ctx, 1000); + + return 0; + +err: + spdk_vhost_dev_backend_event_done(event_ctx, -1); + return -1; +} + +static void +spdk_vhost_nvme_dump_config_json(struct spdk_vhost_dev *vdev, struct spdk_json_write_ctx *w) +{ + struct spdk_vhost_nvme_dev *nvme = to_nvme_dev(vdev); + struct spdk_vhost_nvme_ns *ns_dev; + struct spdk_bdev *bdev; + uint32_t i; + + spdk_json_write_name(w, "namespaces"); + spdk_json_write_object_begin(w); + + for (i = 0; i < nvme->num_ns; i++) { + ns_dev = &nvme->ns[i]; + if (!ns_dev->active_ns) { + continue; + } + bdev = ns_dev->bdev; + + spdk_json_write_name(w, "nsid"); + spdk_json_write_uint32(w, ns_dev->nsid); + + spdk_json_write_name(w, "bdev"); + if (bdev) { + spdk_json_write_string(w, spdk_bdev_get_name(bdev)); + } else { + spdk_json_write_null(w); + } + } + + spdk_json_write_object_end(w); +} + +static const struct spdk_vhost_dev_backend spdk_vhost_nvme_device_backend = { + .start_device = spdk_vhost_nvme_start_device, + .stop_device = spdk_vhost_nvme_stop_device, + .dump_config_json = spdk_vhost_nvme_dump_config_json, + .remove_device = spdk_vhost_nvme_dev_remove, +}; + +static int +spdk_vhost_nvme_ns_identify_update(struct spdk_vhost_nvme_dev *dev) +{ + struct spdk_nvme_ctrlr_data *cdata = &dev->cdata; + struct spdk_nvme_ns_data *nsdata; + uint64_t num_blocks; + uint32_t i; + + /* Identify Namespace */ + cdata->nn = dev->num_ns; + for (i = 0; i < dev->num_ns; i++) { + nsdata = &dev->ns[i].nsdata; + if (dev->ns[i].active_ns) { + num_blocks = spdk_bdev_get_num_blocks(dev->ns[i].bdev); + nsdata->nsze = num_blocks; + /* ncap must be non-zero for active Namespace */ + nsdata->ncap = num_blocks; + nsdata->nuse = num_blocks; + nsdata->nlbaf = 0; + nsdata->flbas.format = 0; + nsdata->lbaf[0].lbads = spdk_u32log2(spdk_bdev_get_block_size(dev->ns[i].bdev)); + nsdata->noiob = spdk_bdev_get_optimal_io_boundary(dev->ns[i].bdev); + dev->ns[i].block_size = spdk_bdev_get_block_size(dev->ns[i].bdev); + dev->ns[i].capacity = num_blocks * dev->ns[i].block_size; + } else { + memset(nsdata, 0, sizeof(*nsdata)); + } + } + return 0; +} + +static int 
+spdk_vhost_nvme_ctrlr_identify_update(struct spdk_vhost_nvme_dev *dev) +{ + struct spdk_nvme_ctrlr_data *cdata = &dev->cdata; + char sn[20]; + + /* Controller Capabilities */ + dev->cap.bits.cqr = 1; + dev->cap.bits.to = 1; + dev->cap.bits.dstrd = 0; + dev->cap.bits.css_nvm = 1; + dev->cap.bits.mpsmin = 0; + dev->cap.bits.mpsmax = 0; + /* MQES is 0 based value */ + dev->cap.bits.mqes = MAX_QUEUE_ENTRIES_SUPPORTED - 1; + + /* Controller Configuration */ + dev->cc.bits.en = 0; + + /* Controller Status */ + dev->csts.bits.rdy = 0; + + /* Identify Controller */ + spdk_strcpy_pad(cdata->fr, FW_VERSION, sizeof(cdata->fr), ' '); + cdata->vid = 0x8086; + cdata->ssvid = 0x8086; + spdk_strcpy_pad(cdata->mn, "SPDK Virtual NVMe Controller", sizeof(cdata->mn), ' '); + snprintf(sn, sizeof(sn), "NVMe_%s", dev->vdev.name); + spdk_strcpy_pad(cdata->sn, sn, sizeof(cdata->sn), ' '); + cdata->ieee[0] = 0xe4; + cdata->ieee[1] = 0xd2; + cdata->ieee[2] = 0x5c; + cdata->ver.bits.mjr = 1; + cdata->ver.bits.mnr = 0; + cdata->mdts = 5; /* 128 KiB */ + cdata->rab = 6; + cdata->sqes.min = 6; + cdata->sqes.max = 6; + cdata->cqes.min = 4; + cdata->cqes.max = 4; + cdata->oncs.dsm = 1; + /* Emulated NVMe controller */ + cdata->oacs.doorbell_buffer_config = 1; + + spdk_vhost_nvme_ns_identify_update(dev); + + return 0; +} + +int +spdk_vhost_nvme_dev_construct(const char *name, const char *cpumask, uint32_t num_io_queues) +{ + struct spdk_vhost_nvme_dev *dev = spdk_dma_zmalloc(sizeof(struct spdk_vhost_nvme_dev), + SPDK_CACHE_LINE_SIZE, NULL); + int rc; + + if (dev == NULL) { + return -ENOMEM; + } + + if (num_io_queues < 1 || num_io_queues > MAX_IO_QUEUES) { + return -EINVAL; + } + + spdk_vhost_lock(); + rc = spdk_vhost_dev_register(&dev->vdev, name, cpumask, + &spdk_vhost_nvme_device_backend); + + if (rc) { + spdk_dma_free(dev); + spdk_vhost_unlock(); + return rc; + } + + dev->num_io_queues = num_io_queues; + STAILQ_INIT(&dev->free_tasks); + TAILQ_INSERT_TAIL(&g_nvme_ctrlrs, dev, tailq); + + spdk_vhost_nvme_ctrlr_identify_update(dev); + + SPDK_NOTICELOG("Controller %s: Constructed\n", name); + spdk_vhost_unlock(); + return rc; +} + +int +spdk_vhost_nvme_dev_remove(struct spdk_vhost_dev *vdev) +{ + struct spdk_vhost_nvme_dev *nvme = to_nvme_dev(vdev); + struct spdk_vhost_nvme_dev *dev, *tmp; + struct spdk_vhost_nvme_ns *ns; + int rc; + uint32_t i; + + if (nvme == NULL) { + return -EINVAL; + } + + TAILQ_FOREACH_SAFE(dev, &g_nvme_ctrlrs, tailq, tmp) { + if (dev == nvme) { + TAILQ_REMOVE(&g_nvme_ctrlrs, dev, tailq); + for (i = 0; i < nvme->num_ns; i++) { + ns = &nvme->ns[i]; + if (ns->active_ns) { + spdk_vhost_nvme_deactive_ns(ns); + } + } + } + } + + rc = spdk_vhost_dev_unregister(vdev); + if (rc != 0) { + return rc; + } + + spdk_dma_free(nvme); + return 0; +} + +int +spdk_vhost_nvme_dev_add_ns(struct spdk_vhost_dev *vdev, const char *bdev_name) +{ + struct spdk_vhost_nvme_dev *nvme = to_nvme_dev(vdev); + struct spdk_vhost_nvme_ns *ns; + struct spdk_bdev *bdev; + int rc = -1; + + if (nvme->num_ns == MAX_NAMESPACE) { + SPDK_ERRLOG("Can't support %d Namespaces\n", nvme->num_ns); + return -1; + } + + bdev = spdk_bdev_get_by_name(bdev_name); + if (!bdev) { + SPDK_ERRLOG("could not find bdev %s\n", bdev_name); + return -1; + } + + ns = &nvme->ns[nvme->num_ns]; + rc = spdk_bdev_open(bdev, true, bdev_remove_cb, ns, &nvme->ns[nvme->num_ns].bdev_desc); + if (rc != 0) { + SPDK_ERRLOG("Could not open bdev '%s', error=%d\n", + bdev_name, rc); + return -1; + } + + nvme->ns[nvme->num_ns].bdev = bdev; + nvme->ns[nvme->num_ns].active_ns = 1; 
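+	/* NSIDs are 1-based: slot N of the ns[] array backs namespace ID N + 1 */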
+ nvme->ns[nvme->num_ns].nsid = nvme->num_ns + 1; + nvme->num_ns++; + + spdk_vhost_nvme_ns_identify_update(nvme); + + return rc; +} + +int +spdk_vhost_nvme_controller_construct(void) +{ + struct spdk_conf_section *sp; + const char *name; + const char *bdev_name; + const char *cpumask; + int rc, i = 0; + struct spdk_vhost_dev *vdev; + uint32_t ctrlr_num, io_queues; + + for (sp = spdk_conf_first_section(NULL); sp != NULL; sp = spdk_conf_next_section(sp)) { + if (!spdk_conf_section_match_prefix(sp, "VhostNvme")) { + continue; + } + + if (sscanf(spdk_conf_section_get_name(sp), "VhostNvme%u", &ctrlr_num) != 1) { + SPDK_ERRLOG("Section '%s' has non-numeric suffix.\n", + spdk_conf_section_get_name(sp)); + return -1; + } + + name = spdk_conf_section_get_val(sp, "Name"); + if (name == NULL) { + SPDK_ERRLOG("VhostNvme%u: missing Name\n", ctrlr_num); + return -1; + } + + cpumask = spdk_conf_section_get_val(sp, "Cpumask"); + rc = spdk_conf_section_get_intval(sp, "NumberOfQueues"); + if (rc > 0) { + io_queues = rc; + } else { + io_queues = 1; + } + + rc = spdk_vhost_nvme_dev_construct(name, cpumask, io_queues); + if (rc < 0) { + SPDK_ERRLOG("VhostNvme%u: Construct failed\n", ctrlr_num); + return -1; + } + + vdev = spdk_vhost_dev_find(name); + if (!vdev) { + return -1; + } + + for (i = 0; spdk_conf_section_get_nval(sp, "Namespace", i) != NULL; i++) { + bdev_name = spdk_conf_section_get_nmval(sp, "Namespace", i, 0); + if (!bdev_name) { + SPDK_ERRLOG("namespace configuration missing bdev name\n"); + break; + } + rc = spdk_vhost_nvme_dev_add_ns(vdev, bdev_name); + if (rc < 0) { + SPDK_WARNLOG("VhostNvme%u: Construct Namespace with %s failed\n", + ctrlr_num, bdev_name); + break; + } + } + } + + return 0; +} + +SPDK_LOG_REGISTER_COMPONENT("vhost_nvme", SPDK_LOG_VHOST_NVME) diff --git a/test/unit/lib/vhost/vhost.c/vhost_ut.c b/test/unit/lib/vhost/vhost.c/vhost_ut.c index 7912c7434a..98942216fa 100644 --- a/test/unit/lib/vhost/vhost.c/vhost_ut.c +++ b/test/unit/lib/vhost/vhost.c/vhost_ut.c @@ -108,6 +108,27 @@ DEFINE_STUB(spdk_env_get_current_core, uint32_t, (void), 0); static struct spdk_vhost_dev_backend g_vdev_backend; +int spdk_vhost_nvme_admin_passthrough(int vid, void *cmd, void *cqe, void *buf) +{ + return 0; +} + +int spdk_vhost_nvme_set_cq_call(int vid, uint16_t qid, int fd) +{ + return 0; +} + +int spdk_vhost_nvme_get_cap(int vid, uint64_t *cap) +{ + return 0; +} + +int +spdk_vhost_nvme_controller_construct(void) +{ + return 0; +} + static int test_setup(void) {