diff --git a/test/lib/nvme/Makefile b/test/lib/nvme/Makefile index 89dc843b2f..f7c274272e 100644 --- a/test/lib/nvme/Makefile +++ b/test/lib/nvme/Makefile @@ -34,7 +34,7 @@ SPDK_ROOT_DIR := $(abspath $(CURDIR)/../../..) include $(SPDK_ROOT_DIR)/mk/spdk.common.mk -DIRS-y = unit aer reset sgl e2edp +DIRS-y = unit aer reset sgl e2edp overhead .PHONY: all clean $(DIRS-y) diff --git a/test/lib/nvme/overhead/Makefile b/test/lib/nvme/overhead/Makefile new file mode 100644 index 0000000000..a7cf7136d1 --- /dev/null +++ b/test/lib/nvme/overhead/Makefile @@ -0,0 +1,62 @@ +# +# BSD LICENSE +# +# Copyright (c) Intel Corporation. +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in +# the documentation and/or other materials provided with the +# distribution. +# * Neither the name of Intel Corporation nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +# + +SPDK_ROOT_DIR := $(abspath $(CURDIR)/../../../..) +include $(SPDK_ROOT_DIR)/mk/spdk.common.mk + +APP = overhead + +C_SRCS := overhead.c + +CFLAGS += -I. $(DPDK_INC) + +SPDK_LIBS += $(SPDK_ROOT_DIR)/lib/nvme/libspdk_nvme.a \ + $(SPDK_ROOT_DIR)/lib/util/libspdk_util.a \ + $(SPDK_ROOT_DIR)/lib/memory/libspdk_memory.a + +LIBS += $(SPDK_LIBS) $(PCIACCESS_LIB) -lpthread $(DPDK_LIB) -lrt + +ifeq ($(OS),Linux) +LIBS += -laio +CFLAGS += -DHAVE_LIBAIO +endif + +all : $(APP) + +$(APP) : $(OBJS) $(SPDK_LIBS) + $(LINK_C) + +clean : + $(CLEAN_C) $(APP) + +include $(SPDK_ROOT_DIR)/mk/spdk.deps.mk diff --git a/test/lib/nvme/overhead/README b/test/lib/nvme/overhead/README new file mode 100644 index 0000000000..b88c421767 --- /dev/null +++ b/test/lib/nvme/overhead/README @@ -0,0 +1,24 @@ +This application measures the software overhead of I/O submission +and completion for both the SPDK NVMe driver and an AIO file handle. +It runs a random read, queue depth = 1 workload to a single device, +and captures TSC as follows: + +* Submission: capture TSC before and after the I/O submission + call (SPDK or AIO). +* Completion: capture TSC before and after the I/O completion + check. Only record the TSC delta if the I/O completion check + resulted in a completed I/O. Also use heuristics in the AIO + case to account for time spent in interrupt handling outside + of the actual I/O completion check. + +Usage: + +To test software overhead for a 4KB I/O over a 10 second period: + +SPDK: overhead -s 4096 -t 10 +AIO: overhead -s 4096 -t 10 /dev/nvme0n1 + +Note that for the SPDK case, it will only use the first namespace +on the first controller found by SPDK. If a different namespace is +desired, attach controllers individually to the kernel NVMe driver +to ensure they will not be enumerated by SPDK. diff --git a/test/lib/nvme/overhead/overhead.c b/test/lib/nvme/overhead/overhead.c new file mode 100644 index 0000000000..b78749dc36 --- /dev/null +++ b/test/lib/nvme/overhead/overhead.c @@ -0,0 +1,654 @@ +/*- + * BSD LICENSE + * + * Copyright (c) Intel Corporation. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include +#include +#include +#include + +#include +#include +#include +#include +#include + +#include "spdk/file.h" +#include "spdk/nvme.h" +#include "spdk/pci.h" +#include "spdk/string.h" +#include "spdk/nvme_intel.h" + +#if HAVE_LIBAIO +#include +#include +#include +#endif + +struct ctrlr_entry { + struct spdk_nvme_ctrlr *ctrlr; + struct ctrlr_entry *next; + char name[1024]; +}; + +enum entry_type { + ENTRY_TYPE_NVME_NS, + ENTRY_TYPE_AIO_FILE, +}; + +struct ns_entry { + enum entry_type type; + + union { + struct { + struct spdk_nvme_ctrlr *ctrlr; + struct spdk_nvme_ns *ns; + struct spdk_nvme_qpair *qpair; + } nvme; +#if HAVE_LIBAIO + struct { + int fd; + struct io_event *events; + io_context_t ctx; + } aio; +#endif + } u; + + uint32_t io_size_blocks; + uint64_t size_in_ios; + bool is_draining; + uint32_t current_queue_depth; + char name[1024]; +}; + +struct perf_task { + void *buf; + uint64_t submit_tsc; +#if HAVE_LIBAIO + struct iocb iocb; +#endif +}; + +struct rte_mempool *request_mempool; + +static struct ctrlr_entry *g_ctrlr = NULL; +static struct ns_entry *g_ns = NULL; + +static uint64_t g_tsc_rate; + +static uint32_t g_io_size_bytes; +static int g_time_in_sec; + +static int g_aio_optind; /* Index of first AIO filename in argv */ + +struct perf_task *g_task; +uint64_t g_tsc_submit = 0; +uint64_t g_tsc_complete = 0; +uint64_t g_io_completed = 0; + +static void +register_ns(struct spdk_nvme_ctrlr *ctrlr, struct spdk_nvme_ns *ns) +{ + struct ns_entry *entry; + const struct spdk_nvme_ctrlr_data *cdata; + + cdata = spdk_nvme_ctrlr_get_data(ctrlr); + + if (!spdk_nvme_ns_is_active(ns)) { + printf("Controller %-20.20s (%-20.20s): Skipping inactive NS %u\n", + cdata->mn, cdata->sn, + spdk_nvme_ns_get_id(ns)); + return; + } + + if (spdk_nvme_ns_get_size(ns) < g_io_size_bytes || + spdk_nvme_ns_get_sector_size(ns) > g_io_size_bytes) { + printf("WARNING: controller %-20.20s (%-20.20s) ns %u has invalid " + "ns size %" PRIu64 " / block size %u for I/O size %u\n", + cdata->mn, cdata->sn, spdk_nvme_ns_get_id(ns), + spdk_nvme_ns_get_size(ns), spdk_nvme_ns_get_sector_size(ns), g_io_size_bytes); + return; + } + + entry = calloc(1, sizeof(struct ns_entry)); + if (entry == NULL) { + perror("ns_entry malloc"); + exit(1); + } + + entry->type = ENTRY_TYPE_NVME_NS; + entry->u.nvme.ctrlr = ctrlr; + entry->u.nvme.ns = ns; + + entry->size_in_ios = spdk_nvme_ns_get_size(ns) / + g_io_size_bytes; + entry->io_size_blocks = g_io_size_bytes / spdk_nvme_ns_get_sector_size(ns); + + snprintf(entry->name, 44, "%-20.20s (%-20.20s)", cdata->mn, cdata->sn); + + g_ns = entry; +} + +static void +register_ctrlr(struct spdk_nvme_ctrlr *ctrlr) +{ + int num_ns; + struct ctrlr_entry *entry = malloc(sizeof(struct ctrlr_entry)); + const struct spdk_nvme_ctrlr_data *cdata = spdk_nvme_ctrlr_get_data(ctrlr); + + if (entry == NULL) { + perror("ctrlr_entry malloc"); + exit(1); + } + + snprintf(entry->name, sizeof(entry->name), "%-20.20s (%-20.20s)", cdata->mn, cdata->sn); + + entry->ctrlr = ctrlr; + g_ctrlr = entry; + + num_ns = spdk_nvme_ctrlr_get_num_ns(ctrlr); + /* Only register the first namespace. */ + if (num_ns < 1) { + fprintf(stderr, "controller found with no namespaces\n"); + exit(1); + } + + register_ns(ctrlr, spdk_nvme_ctrlr_get_ns(ctrlr, 1)); +} + +#if HAVE_LIBAIO +static int +register_aio_file(const char *path) +{ + struct ns_entry *entry; + + int fd; + uint64_t size; + uint32_t blklen; + + fd = open(path, O_RDWR | O_DIRECT); + if (fd < 0) { + fprintf(stderr, "Could not open AIO device %s: %s\n", path, strerror(errno)); + return -1; + } + + size = spdk_file_get_size(fd); + if (size == 0) { + fprintf(stderr, "Could not determine size of AIO device %s\n", path); + close(fd); + return -1; + } + + blklen = spdk_dev_get_blocklen(fd); + if (blklen == 0) { + fprintf(stderr, "Could not determine block size of AIO device %s\n", path); + close(fd); + return -1; + } + + entry = calloc(1, sizeof(struct ns_entry)); + if (entry == NULL) { + close(fd); + perror("aio ns_entry malloc"); + return -1; + } + + entry->type = ENTRY_TYPE_AIO_FILE; + entry->u.aio.fd = fd; + entry->size_in_ios = size / g_io_size_bytes; + entry->io_size_blocks = g_io_size_bytes / blklen; + + snprintf(entry->name, sizeof(entry->name), "%s", path); + + g_ns = entry; + + return 0; +} + +static int +aio_submit(io_context_t aio_ctx, struct iocb *iocb, int fd, enum io_iocb_cmd cmd, void *buf, + unsigned long nbytes, uint64_t offset, void *cb_ctx) +{ + iocb->aio_fildes = fd; + iocb->aio_reqprio = 0; + iocb->aio_lio_opcode = cmd; + iocb->u.c.buf = buf; + iocb->u.c.nbytes = nbytes; + iocb->u.c.offset = offset; + iocb->data = cb_ctx; + + if (io_submit(aio_ctx, 1, &iocb) < 0) { + printf("io_submit"); + return -1; + } + + return 0; +} + +static void +aio_check_io(void) +{ + int count, i; + struct timespec timeout; + + timeout.tv_sec = 0; + timeout.tv_nsec = 0; + + count = io_getevents(g_ns->u.aio.ctx, 1, 1, g_ns->u.aio.events, &timeout); + if (count < 0) { + fprintf(stderr, "io_getevents error\n"); + exit(1); + } + + for (i = 0; i < count; i++) { + g_ns->current_queue_depth--; + } +} +#endif /* HAVE_LIBAIO */ + +static void io_complete(void *ctx, const struct spdk_nvme_cpl *completion); + +static __thread unsigned int seed = 0; + +static void +submit_single_io(void) +{ + uint64_t offset_in_ios; + uint64_t start; + int rc; + struct ns_entry *entry = g_ns; + + offset_in_ios = rand_r(&seed) % entry->size_in_ios; + + start = rte_get_tsc_cycles(); + rte_mb(); +#if HAVE_LIBAIO + if (entry->type == ENTRY_TYPE_AIO_FILE) { + rc = aio_submit(g_ns->u.aio.ctx, &g_task->iocb, entry->u.aio.fd, IO_CMD_PREAD, g_task->buf, + g_io_size_bytes, offset_in_ios * g_io_size_bytes, g_task); + } else +#endif + { + rc = spdk_nvme_ns_cmd_read(entry->u.nvme.ns, g_ns->u.nvme.qpair, g_task->buf, + offset_in_ios * entry->io_size_blocks, + entry->io_size_blocks, io_complete, g_task, 0); + } + + rte_mb(); + g_tsc_submit += rte_get_tsc_cycles() - start; + + if (rc != 0) { + fprintf(stderr, "starting I/O failed\n"); + } + + g_ns->current_queue_depth++; +} + +static void +io_complete(void *ctx, const struct spdk_nvme_cpl *completion) +{ + g_ns->current_queue_depth--; +} + +uint64_t g_complete_tsc_start; + +static void +check_io(void) +{ + uint64_t end; + rte_mb(); +#if HAVE_LIBAIO + if (g_ns->type == ENTRY_TYPE_AIO_FILE) { + aio_check_io(); + } else +#endif + { + spdk_nvme_qpair_process_completions(g_ns->u.nvme.qpair, 0); + } + rte_mb(); + end = rte_get_tsc_cycles(); + if (g_ns->current_queue_depth == 1) { + /* + * Account for race condition in AIO case where interrupt occurs + * after checking for queue depth. If the timestamp capture + * is too big compared to the last capture, assume that an + * interrupt fired, and do not bump the start tsc forward. This + * will ensure this extra time is accounted for next time through + * when we see current_queue_depth drop to 0. + */ + if ((end - g_complete_tsc_start) < 500) { + g_complete_tsc_start = end; + } + } else { + g_tsc_complete += end - g_complete_tsc_start; + g_io_completed++; + if (!g_ns->is_draining) { + submit_single_io(); + } + g_complete_tsc_start = rte_get_tsc_cycles(); + } +} + +static void +drain_io(void) +{ + g_ns->is_draining = true; + while (g_ns->current_queue_depth > 0) { + check_io(); + } +} + +static int +init_ns_worker_ctx(void) +{ + if (g_ns->type == ENTRY_TYPE_AIO_FILE) { +#ifdef HAVE_LIBAIO + g_ns->u.aio.events = calloc(1, sizeof(struct io_event)); + if (!g_ns->u.aio.events) { + return -1; + } + g_ns->u.aio.ctx = 0; + if (io_setup(1, &g_ns->u.aio.ctx) < 0) { + free(g_ns->u.aio.events); + perror("io_setup"); + return -1; + } +#endif + } else { + /* + * TODO: If a controller has multiple namespaces, they could all use the same queue. + * For now, give each namespace/thread combination its own queue. + */ + g_ns->u.nvme.qpair = spdk_nvme_ctrlr_alloc_io_qpair(g_ns->u.nvme.ctrlr, 0); + if (!g_ns->u.nvme.qpair) { + printf("ERROR: spdk_nvme_ctrlr_alloc_io_qpair failed\n"); + return -1; + } + } + + return 0; +} + +static void +cleanup_ns_worker_ctx(void) +{ + if (g_ns->type == ENTRY_TYPE_AIO_FILE) { +#ifdef HAVE_LIBAIO + io_destroy(g_ns->u.aio.ctx); + free(g_ns->u.aio.events); +#endif + } else { + spdk_nvme_ctrlr_free_io_qpair(g_ns->u.nvme.qpair); + } +} + +static int +work_fn(void) +{ + uint64_t tsc_end; + + printf("Starting work_fn on core %u\n", rte_lcore_id()); + + /* Allocate a queue pair for each namespace. */ + if (init_ns_worker_ctx() != 0) { + printf("ERROR: init_ns_worker_ctx() failed\n"); + return 1; + } + + tsc_end = rte_get_tsc_cycles() + g_time_in_sec * g_tsc_rate; + + /* Submit initial I/O for each namespace. */ + submit_single_io(); + g_complete_tsc_start = rte_get_tsc_cycles(); + + while (1) { + /* + * Check for completed I/O for each controller. A new + * I/O will be submitted in the io_complete callback + * to replace each I/O that is completed. + */ + check_io(); + + if (rte_get_tsc_cycles() > tsc_end) { + break; + } + } + + drain_io(); + cleanup_ns_worker_ctx(); + + return 0; +} + +static void usage(char *program_name) +{ + printf("%s options", program_name); +#if HAVE_LIBAIO + printf(" [AIO device(s)]..."); +#endif + printf("\n"); + printf("\t[-s io size in bytes]\n"); + printf("\t[-t time in seconds]\n"); + printf("\t\t(default: 1)]\n"); +} + +static void +print_stats(void) +{ + printf("g_tsc_submit = %ju\n", g_tsc_submit); + printf("g_tsc_complete = %ju\n", g_tsc_complete); + printf("g_io_completed = %ju\n", g_io_completed); + + printf("avg submit = %8.1f\n", (float)g_tsc_submit / g_io_completed); + printf("avg complete = %8.1f\n", (float)g_tsc_complete / g_io_completed); +} + +static int +parse_args(int argc, char **argv) +{ + int op; + + /* default value*/ + g_io_size_bytes = 0; + g_time_in_sec = 0; + + while ((op = getopt(argc, argv, "s:t:")) != -1) { + switch (op) { + case 's': + g_io_size_bytes = atoi(optarg); + break; + case 't': + g_time_in_sec = atoi(optarg); + break; + default: + usage(argv[0]); + return 1; + } + } + + if (!g_io_size_bytes) { + usage(argv[0]); + return 1; + } + if (!g_time_in_sec) { + usage(argv[0]); + return 1; + } + + g_aio_optind = optind; + optind = 1; + return 0; +} + +static bool +probe_cb(void *cb_ctx, struct spdk_pci_device *dev, struct spdk_nvme_ctrlr_opts *opts) +{ + static uint32_t ctrlr_found = 0; + + if (spdk_pci_device_has_non_uio_driver(dev)) { + fprintf(stderr, "non-uio kernel driver attached to NVMe\n"); + fprintf(stderr, " controller at PCI address %04x:%02x:%02x.%02x\n", + spdk_pci_device_get_domain(dev), + spdk_pci_device_get_bus(dev), + spdk_pci_device_get_dev(dev), + spdk_pci_device_get_func(dev)); + fprintf(stderr, " skipping...\n"); + return false; + } + + if (ctrlr_found == 1) { + fprintf(stderr, "only attching to one controller, so skipping\n"); + fprintf(stderr, " controller at PCI address %04x:%02x:%02x.%02x\n", + spdk_pci_device_get_domain(dev), + spdk_pci_device_get_bus(dev), + spdk_pci_device_get_dev(dev), + spdk_pci_device_get_func(dev)); + return false; + } + ctrlr_found = 1; + + printf("Attaching to %04x:%02x:%02x.%02x\n", + spdk_pci_device_get_domain(dev), + spdk_pci_device_get_bus(dev), + spdk_pci_device_get_dev(dev), + spdk_pci_device_get_func(dev)); + + return true; +} + +static void +attach_cb(void *cb_ctx, struct spdk_pci_device *dev, struct spdk_nvme_ctrlr *ctrlr, + const struct spdk_nvme_ctrlr_opts *opts) +{ + printf("Attached to %04x:%02x:%02x.%02x\n", + spdk_pci_device_get_domain(dev), + spdk_pci_device_get_bus(dev), + spdk_pci_device_get_dev(dev), + spdk_pci_device_get_func(dev)); + + register_ctrlr(ctrlr); +} + +static int +register_controllers(void) +{ + printf("Initializing NVMe Controllers\n"); + + if (spdk_nvme_probe(NULL, probe_cb, attach_cb, NULL) != 0) { + fprintf(stderr, "spdk_nvme_probe() failed\n"); + return 1; + } + + return 0; +} + +static char *ealargs[] = { + "perf", + "-c 0x1", + "-n 4", +}; + +int main(int argc, char **argv) +{ + int rc; + + rc = parse_args(argc, argv); + if (rc != 0) { + return rc; + } + + rc = rte_eal_init(sizeof(ealargs) / sizeof(ealargs[0]), ealargs); + + if (rc < 0) { + fprintf(stderr, "could not initialize dpdk\n"); + return 1; + } + + request_mempool = rte_mempool_create("nvme_request", 8192, + spdk_nvme_request_size(), 128, 0, + NULL, NULL, NULL, NULL, + SOCKET_ID_ANY, 0); + + if (request_mempool == NULL) { + fprintf(stderr, "could not initialize request mempool\n"); + return 1; + } + + g_task = rte_zmalloc("task", sizeof(struct perf_task), 0); + if (g_task == NULL) { + fprintf(stderr, "g_task alloc failed\n"); + exit(1); + } + + g_task->buf = rte_malloc(NULL, g_io_size_bytes, 0x1000); + if (g_task->buf == NULL) { + fprintf(stderr, "g_task->buf rte_malloc failed\n"); + exit(1); + } + + g_tsc_rate = rte_get_tsc_hz(); + +#if HAVE_LIBAIO + if (g_aio_optind < argc) { + printf("Measuring overhead for AIO device %s.\n", argv[g_aio_optind]); + if (register_aio_file(argv[g_aio_optind]) != 0) { + rc = -1; + goto cleanup; + } + } else +#endif + { + if (register_controllers() != 0) { + rc = -1; + goto cleanup; + } + } + + printf("Initialization complete. Launching workers.\n"); + + rc = work_fn(); + + print_stats(); + +cleanup: + free(g_ns); + if (g_ctrlr) { + spdk_nvme_detach(g_ctrlr->ctrlr); + free(g_ctrlr); + } + + if (rc != 0) { + fprintf(stderr, "%s: errors occured\n", argv[0]); + } + + return rc; +}