Make the nvme(4) driver more NUMA-aware.

 - For each queue pair, precalculate the CPU and domain it is bound to.
If queue pairs are not per-CPU, use the domain of the device.
 - Allocate most of the queue pair memory from the domain it is bound to.
 - Bind callouts to the same CPUs as their queue pairs to avoid migrations.
 - Do not assign queue pairs to each SMT thread.  That only wastes
resources and increases lock contention.
 - Remove the fixed multiplier of CPUs per queue pair and spread CPUs
evenly instead.  This allows more queue pairs to be used on some
hardware configurations.
 - If a queue pair serves multiple CPUs, bind different NVMe devices to
different CPUs within it (see the sketch after this list).
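
The even spread and the per-device shuffle correspond to the QP() macro and
the qpair->cpu assignment added to nvme_ctrlr_construct_io_qpairs() below.
A minimal user-space sketch of that mapping, where mp_ncpus, num_io_queues
and the two device units are assumed example values rather than the kernel's
globals:

#include <stdio.h>

static int num_io_queues = 5;   /* assumed example value */
static int mp_ncpus = 12;       /* assumed example value */

/* Even spread: which I/O queue serves CPU c (mirrors the driver's QP()). */
#define QP(c)   ((c) * num_io_queues / mp_ncpus)

int
main(void)
{
    for (int unit = 0; unit < 2; unit++) {      /* two NVMe devices */
        printf("nvme%d:\n", unit);
        for (int i = 0, c = 0, n = 0; i < num_io_queues; i++, c += n) {
            /*
             * Count the CPUs served by queue i; the loop terminates
             * because QP(mp_ncpus) == num_io_queues, which never equals i.
             */
            for (n = 1; QP(c + n) == i; n++)
                ;
            /* Shuffle multiple devices across this queue's CPUs. */
            int cpu = c + (unit + n / 2) % n;
            printf("  ioq %d: CPUs %d-%d, interrupt bound to CPU %d\n",
                i + 1, c, c + n - 1, cpu);
        }
    }
    return (0);
}

With 12 CPUs and 5 queues this prints groups of two or three CPUs per queue,
with the nvme0 and nvme1 interrupts landing on different CPUs within each
group.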

MFC after:	1 month
Sponsored by:	iXsystems, Inc.
Alexander Motin 2019-09-23 17:53:47 +00:00
parent 9093dd9a66
commit 1eab19cbec
Notes: svn2git 2020-12-20 02:59:44 +00:00
svn path=/head/; revision=352630
7 changed files with 101 additions and 113 deletions
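
The nvme_ctrlr_setup_interrupts() rework in the hunks below boils down to a
queue-count negotiation: start from one I/O queue per CPU (trimmed to one per
physical core when SMT is present), cap the count by the available MSI-X
vectors while keeping one for the admin queue, round it down to a multiple of
the NUMA domain count, and shrink and retry if fewer vectors are granted than
requested.  A minimal user-space sketch of that loop under assumed example
values, with pci_alloc_msix() replaced by a hypothetical try_alloc_vectors()
stub:

#include <stdio.h>

static int min_int(int a, int b) { return (a < b ? a : b); }
static int max_int(int a, int b) { return (a > b ? a : b); }

/*
 * Hypothetical stand-in for pci_alloc_msix(): pretend the platform can
 * grant at most 12 vectors regardless of how many are requested.
 */
static int
try_alloc_vectors(int requested)
{
    return (min_int(requested, 12));
}

int
main(void)
{
    int mp_ncpus = 32, smp_threads_per_core = 2, vm_ndomains = 2;
    int msix_count = 33;        /* vectors advertised by the device */
    int num_io_queues, num_vectors_requested, num_vectors_allocated;

    /* One queue per CPU, but skip extra queues for SMT threads. */
    num_io_queues = mp_ncpus;
    int min_cpus_per_ioq = smp_threads_per_core;
    if (min_cpus_per_ioq > 1)
        num_io_queues = min_int(num_io_queues,
            max_int(1, mp_ncpus / min_cpus_per_ioq));

    /* Leave one vector for the admin queue. */
    num_io_queues = min_int(num_io_queues, msix_count - 1);

again:
    /* Keep the queue count a multiple of the NUMA domain count. */
    if (num_io_queues > vm_ndomains)
        num_io_queues -= num_io_queues % vm_ndomains;

    num_vectors_requested = num_io_queues + 1;
    num_vectors_allocated = try_alloc_vectors(num_vectors_requested);
    if (num_vectors_allocated != num_vectors_requested) {
        /*
         * Fewer vectors than hoped: shrink and renegotiate.  (The driver
         * also releases the vectors first and falls back to INTx when
         * fewer than two are granted; both are omitted here.)
         */
        num_io_queues = num_vectors_allocated - 1;
        goto again;
    }

    printf("using %d I/O queues with %d MSI-X vectors\n",
        num_io_queues, num_vectors_requested);
    return (0);
}

With the assumed 12-vector limit the sketch settles on 10 queues and 11
vectors after one retry.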


@ -96,7 +96,6 @@ nvme_ahci_attach(device_t dev)
ctrlr->msix_enabled = 0;
ctrlr->num_io_queues = 1;
ctrlr->num_cpus_per_ioq = mp_ncpus;
if (bus_setup_intr(dev, ctrlr->res,
INTR_TYPE_MISC | INTR_MPSAFE, NULL, nvme_ctrlr_intx_handler,
ctrlr, &ctrlr->tag) != 0) {


@ -41,6 +41,7 @@ __FBSDID("$FreeBSD$");
#include <sys/smp.h>
#include <sys/uio.h>
#include <sys/endian.h>
#include <vm/vm.h>
#include "nvme_private.h"
@ -57,6 +58,9 @@ nvme_ctrlr_construct_admin_qpair(struct nvme_controller *ctrlr)
int error;
qpair = &ctrlr->adminq;
qpair->id = 0;
qpair->cpu = CPU_FFS(&cpuset_domain[ctrlr->domain]) - 1;
qpair->domain = ctrlr->domain;
num_entries = NVME_ADMIN_ENTRIES;
TUNABLE_INT_FETCH("hw.nvme.admin_entries", &num_entries);
@ -75,22 +79,21 @@ nvme_ctrlr_construct_admin_qpair(struct nvme_controller *ctrlr)
* The admin queue's max xfer size is treated differently than the
* max I/O xfer size. 16KB is sufficient here - maybe even less?
*/
error = nvme_qpair_construct(qpair,
0, /* qpair ID */
0, /* vector */
num_entries,
NVME_ADMIN_TRACKERS,
ctrlr);
error = nvme_qpair_construct(qpair, num_entries, NVME_ADMIN_TRACKERS,
ctrlr);
return (error);
}
#define QP(ctrlr, c) ((c) * (ctrlr)->num_io_queues / mp_ncpus)
static int
nvme_ctrlr_construct_io_qpairs(struct nvme_controller *ctrlr)
{
struct nvme_qpair *qpair;
uint32_t cap_lo;
uint16_t mqes;
int i, error, num_entries, num_trackers, max_entries;
int c, error, i, n;
int num_entries, num_trackers, max_entries;
/*
* NVMe spec sets a hard limit of 64K max entries, but devices may
@ -130,32 +133,35 @@ nvme_ctrlr_construct_io_qpairs(struct nvme_controller *ctrlr)
*/
ctrlr->max_hw_pend_io = num_trackers * ctrlr->num_io_queues * 3 / 4;
/*
* This was calculated previously when setting up interrupts, but
* a controller could theoretically support fewer I/O queues than
* MSI-X vectors. So calculate again here just to be safe.
*/
ctrlr->num_cpus_per_ioq = howmany(mp_ncpus, ctrlr->num_io_queues);
ctrlr->ioq = malloc(ctrlr->num_io_queues * sizeof(struct nvme_qpair),
M_NVME, M_ZERO | M_WAITOK);
for (i = 0; i < ctrlr->num_io_queues; i++) {
for (i = c = n = 0; i < ctrlr->num_io_queues; i++, c += n) {
qpair = &ctrlr->ioq[i];
/*
* Admin queue has ID=0. IO queues start at ID=1 -
* hence the 'i+1' here.
*
*/
qpair->id = i + 1;
if (ctrlr->num_io_queues > 1) {
/* Find number of CPUs served by this queue. */
for (n = 1; QP(ctrlr, c + n) == i; n++)
;
/* Shuffle multiple NVMe devices between CPUs. */
qpair->cpu = c + (device_get_unit(ctrlr->dev)+n/2) % n;
qpair->domain = pcpu_find(qpair->cpu)->pc_domain;
} else {
qpair->cpu = CPU_FFS(&cpuset_domain[ctrlr->domain]) - 1;
qpair->domain = ctrlr->domain;
}
/*
* For I/O queues, use the controller-wide max_xfer_size
* calculated in nvme_attach().
*/
error = nvme_qpair_construct(qpair,
i+1, /* qpair ID */
ctrlr->msix_enabled ? i+1 : 0, /* vector */
num_entries,
num_trackers,
ctrlr);
error = nvme_qpair_construct(qpair, num_entries, num_trackers,
ctrlr);
if (error)
return (error);
@ -164,8 +170,7 @@ nvme_ctrlr_construct_io_qpairs(struct nvme_controller *ctrlr)
* interrupt thread for this controller.
*/
if (ctrlr->num_io_queues > 1)
bus_bind_intr(ctrlr->dev, qpair->res,
i * ctrlr->num_cpus_per_ioq);
bus_bind_intr(ctrlr->dev, qpair->res, qpair->cpu);
}
return (0);
@ -458,6 +463,8 @@ nvme_ctrlr_set_num_qpairs(struct nvme_controller *ctrlr)
*/
ctrlr->num_io_queues = min(ctrlr->num_io_queues, sq_allocated);
ctrlr->num_io_queues = min(ctrlr->num_io_queues, cq_allocated);
if (ctrlr->num_io_queues > vm_ndomains)
ctrlr->num_io_queues -= ctrlr->num_io_queues % vm_ndomains;
return (0);
}
@ -473,7 +480,7 @@ nvme_ctrlr_create_qpairs(struct nvme_controller *ctrlr)
qpair = &ctrlr->ioq[i];
status.done = 0;
nvme_ctrlr_cmd_create_io_cq(ctrlr, qpair, qpair->vector,
nvme_ctrlr_cmd_create_io_cq(ctrlr, qpair,
nvme_completion_poll_cb, &status);
nvme_completion_poll(&status);
if (nvme_completion_is_error(&status.cpl)) {
@ -1132,6 +1139,8 @@ nvme_ctrlr_construct(struct nvme_controller *ctrlr, device_t dev)
ctrlr->dev = dev;
mtx_init(&ctrlr->lock, "nvme ctrlr lock", NULL, MTX_DEF);
if (bus_get_domain(dev, &ctrlr->domain) != 0)
ctrlr->domain = 0;
cap_hi = nvme_mmio_read_4(ctrlr, cap_hi);
ctrlr->dstrd = NVME_CAP_HI_DSTRD(cap_hi) + 2;
@ -1296,7 +1305,7 @@ nvme_ctrlr_submit_io_request(struct nvme_controller *ctrlr,
{
struct nvme_qpair *qpair;
qpair = &ctrlr->ioq[curcpu / ctrlr->num_cpus_per_ioq];
qpair = &ctrlr->ioq[QP(ctrlr, curcpu)];
nvme_qpair_submit_request(qpair, req);
}


@ -76,8 +76,7 @@ nvme_ctrlr_cmd_identify_namespace(struct nvme_controller *ctrlr, uint32_t nsid,
void
nvme_ctrlr_cmd_create_io_cq(struct nvme_controller *ctrlr,
struct nvme_qpair *io_que, uint16_t vector, nvme_cb_fn_t cb_fn,
void *cb_arg)
struct nvme_qpair *io_que, nvme_cb_fn_t cb_fn, void *cb_arg)
{
struct nvme_request *req;
struct nvme_command *cmd;
@ -93,7 +92,7 @@ nvme_ctrlr_cmd_create_io_cq(struct nvme_controller *ctrlr,
*/
cmd->cdw10 = htole32(((io_que->num_entries-1) << 16) | io_que->id);
/* 0x3 = interrupts enabled | physically contiguous */
cmd->cdw11 = htole32((vector << 16) | 0x3);
cmd->cdw11 = htole32((io_que->vector << 16) | 0x3);
cmd->prp1 = htole64(io_que->cpl_bus_addr);
nvme_ctrlr_submit_admin_request(ctrlr, req);


@ -34,6 +34,7 @@ __FBSDID("$FreeBSD$");
#include <sys/conf.h>
#include <sys/proc.h>
#include <sys/smp.h>
#include <vm/vm.h>
#include <dev/pci/pcireg.h>
#include <dev/pci/pcivar.h>
@ -233,7 +234,6 @@ nvme_ctrlr_configure_intx(struct nvme_controller *ctrlr)
ctrlr->msix_enabled = 0;
ctrlr->num_io_queues = 1;
ctrlr->num_cpus_per_ioq = mp_ncpus;
ctrlr->rid = 0;
ctrlr->res = bus_alloc_resource_any(ctrlr->dev, SYS_RES_IRQ,
&ctrlr->rid, RF_SHAREABLE | RF_ACTIVE);
@ -259,82 +259,61 @@ static void
nvme_ctrlr_setup_interrupts(struct nvme_controller *ctrlr)
{
device_t dev;
int per_cpu_io_queues;
int force_intx, num_io_queues, per_cpu_io_queues;
int min_cpus_per_ioq;
int num_vectors_requested, num_vectors_allocated;
int num_vectors_available;
dev = ctrlr->dev;
min_cpus_per_ioq = 1;
TUNABLE_INT_FETCH("hw.nvme.min_cpus_per_ioq", &min_cpus_per_ioq);
if (min_cpus_per_ioq < 1) {
min_cpus_per_ioq = 1;
} else if (min_cpus_per_ioq > mp_ncpus) {
min_cpus_per_ioq = mp_ncpus;
}
per_cpu_io_queues = 1;
TUNABLE_INT_FETCH("hw.nvme.per_cpu_io_queues", &per_cpu_io_queues);
if (per_cpu_io_queues == 0) {
min_cpus_per_ioq = mp_ncpus;
}
ctrlr->force_intx = 0;
TUNABLE_INT_FETCH("hw.nvme.force_intx", &ctrlr->force_intx);
/*
* FreeBSD currently cannot allocate more than about 190 vectors at
* boot, meaning that systems with high core count and many devices
* requesting per-CPU interrupt vectors will not get their full
* allotment. So first, try to allocate as many as we may need to
* understand what is available, then immediately release them.
* Then figure out how many of those we will actually use, based on
* assigning an equal number of cores to each I/O queue.
*/
/* One vector for per core I/O queue, plus one vector for admin queue. */
num_vectors_available = min(pci_msix_count(dev), mp_ncpus + 1);
if (pci_alloc_msix(dev, &num_vectors_available) != 0) {
num_vectors_available = 0;
}
pci_release_msi(dev);
if (ctrlr->force_intx || num_vectors_available < 2) {
force_intx = 0;
TUNABLE_INT_FETCH("hw.nvme.force_intx", &force_intx);
if (force_intx || pci_msix_count(dev) < 2) {
nvme_ctrlr_configure_intx(ctrlr);
return;
}
/*
* Do not use all vectors for I/O queues - one must be saved for the
* admin queue.
*/
ctrlr->num_cpus_per_ioq = max(min_cpus_per_ioq,
howmany(mp_ncpus, num_vectors_available - 1));
num_io_queues = mp_ncpus;
TUNABLE_INT_FETCH("hw.nvme.num_io_queues", &num_io_queues);
if (num_io_queues < 1 || num_io_queues > mp_ncpus)
num_io_queues = mp_ncpus;
ctrlr->num_io_queues = howmany(mp_ncpus, ctrlr->num_cpus_per_ioq);
num_vectors_requested = ctrlr->num_io_queues + 1;
per_cpu_io_queues = 1;
TUNABLE_INT_FETCH("hw.nvme.per_cpu_io_queues", &per_cpu_io_queues);
if (per_cpu_io_queues == 0)
num_io_queues = 1;
min_cpus_per_ioq = smp_threads_per_core;
TUNABLE_INT_FETCH("hw.nvme.min_cpus_per_ioq", &min_cpus_per_ioq);
if (min_cpus_per_ioq > 1) {
num_io_queues = min(num_io_queues,
max(1, mp_ncpus / min_cpus_per_ioq));
}
num_io_queues = min(num_io_queues, pci_msix_count(dev) - 1);
again:
if (num_io_queues > vm_ndomains)
num_io_queues -= num_io_queues % vm_ndomains;
/* One vector for per core I/O queue, plus one vector for admin queue. */
num_vectors_requested = num_io_queues + 1;
num_vectors_allocated = num_vectors_requested;
/*
* Now just allocate the number of vectors we need. This should
* succeed, since we previously called pci_alloc_msix()
* successfully returning at least this many vectors, but just to
* be safe, if something goes wrong just revert to INTx.
*/
if (pci_alloc_msix(dev, &num_vectors_allocated) != 0) {
nvme_ctrlr_configure_intx(ctrlr);
return;
}
if (num_vectors_allocated < num_vectors_requested) {
if (num_vectors_allocated < 2) {
pci_release_msi(dev);
nvme_ctrlr_configure_intx(ctrlr);
return;
}
if (num_vectors_allocated != num_vectors_requested) {
pci_release_msi(dev);
num_io_queues = num_vectors_allocated - 1;
goto again;
}
ctrlr->msix_enabled = 1;
ctrlr->num_io_queues = num_io_queues;
}
static int


@ -175,7 +175,8 @@ struct nvme_qpair {
struct nvme_controller *ctrlr;
uint32_t id;
uint32_t phase;
int domain;
int cpu;
uint16_t vector;
int rid;
@ -187,6 +188,7 @@ struct nvme_qpair {
uint32_t sq_tdbl_off;
uint32_t cq_hdbl_off;
uint32_t phase;
uint32_t sq_head;
uint32_t sq_tail;
uint32_t cq_head;
@ -238,7 +240,7 @@ struct nvme_controller {
device_t dev;
struct mtx lock;
int domain;
uint32_t ready_timeout_in_ms;
uint32_t quirks;
#define QUIRK_DELAY_B4_CHK_RDY 1 /* Can't touch MMIO on disable */
@ -258,11 +260,9 @@ struct nvme_controller {
struct resource *bar4_resource;
uint32_t msix_enabled;
uint32_t force_intx;
uint32_t enable_aborts;
uint32_t num_io_queues;
uint32_t num_cpus_per_ioq;
uint32_t max_hw_pend_io;
/* Fields for tracking progress during controller initialization. */
@ -377,7 +377,7 @@ void nvme_ctrlr_cmd_get_firmware_page(struct nvme_controller *ctrlr,
nvme_cb_fn_t cb_fn,
void *cb_arg);
void nvme_ctrlr_cmd_create_io_cq(struct nvme_controller *ctrlr,
struct nvme_qpair *io_que, uint16_t vector,
struct nvme_qpair *io_que,
nvme_cb_fn_t cb_fn, void *cb_arg);
void nvme_ctrlr_cmd_create_io_sq(struct nvme_controller *ctrlr,
struct nvme_qpair *io_que,
@ -413,9 +413,8 @@ void nvme_ctrlr_submit_io_request(struct nvme_controller *ctrlr,
void nvme_ctrlr_post_failed_request(struct nvme_controller *ctrlr,
struct nvme_request *req);
int nvme_qpair_construct(struct nvme_qpair *qpair, uint32_t id,
uint16_t vector, uint32_t num_entries,
uint32_t num_trackers,
int nvme_qpair_construct(struct nvme_qpair *qpair,
uint32_t num_entries, uint32_t num_trackers,
struct nvme_controller *ctrlr);
void nvme_qpair_submit_tracker(struct nvme_qpair *qpair,
struct nvme_tracker *tr);


@ -32,6 +32,7 @@ __FBSDID("$FreeBSD$");
#include <sys/param.h>
#include <sys/bus.h>
#include <sys/conf.h>
#include <sys/domainset.h>
#include <sys/proc.h>
#include <dev/pci/pcivar.h>
@ -637,8 +638,8 @@ nvme_qpair_msix_handler(void *arg)
}
int
nvme_qpair_construct(struct nvme_qpair *qpair, uint32_t id,
uint16_t vector, uint32_t num_entries, uint32_t num_trackers,
nvme_qpair_construct(struct nvme_qpair *qpair,
uint32_t num_entries, uint32_t num_trackers,
struct nvme_controller *ctrlr)
{
struct nvme_tracker *tr;
@ -647,8 +648,7 @@ nvme_qpair_construct(struct nvme_qpair *qpair, uint32_t id,
uint8_t *queuemem, *prpmem, *prp_list;
int i, err;
qpair->id = id;
qpair->vector = vector;
qpair->vector = ctrlr->msix_enabled ? qpair->id : 0;
qpair->num_entries = num_entries;
qpair->num_trackers = num_trackers;
qpair->ctrlr = ctrlr;
@ -659,19 +659,19 @@ nvme_qpair_construct(struct nvme_qpair *qpair, uint32_t id,
* MSI-X vector resource IDs start at 1, so we add one to
* the queue's vector to get the corresponding rid to use.
*/
qpair->rid = vector + 1;
qpair->rid = qpair->vector + 1;
qpair->res = bus_alloc_resource_any(ctrlr->dev, SYS_RES_IRQ,
&qpair->rid, RF_ACTIVE);
bus_setup_intr(ctrlr->dev, qpair->res,
INTR_TYPE_MISC | INTR_MPSAFE, NULL,
nvme_qpair_msix_handler, qpair, &qpair->tag);
if (id == 0) {
if (qpair->id == 0) {
bus_describe_intr(ctrlr->dev, qpair->res, qpair->tag,
"admin");
} else {
bus_describe_intr(ctrlr->dev, qpair->res, qpair->tag,
"io%d", id - 1);
"io%d", qpair->id - 1);
}
}
@ -707,6 +707,7 @@ nvme_qpair_construct(struct nvme_qpair *qpair, uint32_t id,
nvme_printf(ctrlr, "tag create failed %d\n", err);
goto out;
}
bus_dma_tag_set_domain(qpair->dma_tag, qpair->domain);
if (bus_dmamem_alloc(qpair->dma_tag, (void **)&queuemem,
BUS_DMA_NOWAIT, &qpair->queuemem_map)) {
@ -737,9 +738,9 @@ nvme_qpair_construct(struct nvme_qpair *qpair, uint32_t id,
* it to various small values.
*/
qpair->sq_tdbl_off = nvme_mmio_offsetof(doorbell[0]) +
(id << (ctrlr->dstrd + 1));
(qpair->id << (ctrlr->dstrd + 1));
qpair->cq_hdbl_off = nvme_mmio_offsetof(doorbell[0]) +
(id << (ctrlr->dstrd + 1)) + (1 << ctrlr->dstrd);
(qpair->id << (ctrlr->dstrd + 1)) + (1 << ctrlr->dstrd);
TAILQ_INIT(&qpair->free_tr);
TAILQ_INIT(&qpair->outstanding_tr);
@ -765,7 +766,8 @@ nvme_qpair_construct(struct nvme_qpair *qpair, uint32_t id,
(uint8_t *)roundup2((uintptr_t)prp_list, PAGE_SIZE);
}
tr = malloc(sizeof(*tr), M_NVME, M_ZERO | M_WAITOK);
tr = malloc_domainset(sizeof(*tr), M_NVME,
DOMAINSET_PREF(qpair->domain), M_ZERO | M_WAITOK);
bus_dmamap_create(qpair->dma_tag_payload, 0,
&tr->payload_dma_map);
callout_init(&tr->timer, 1);
@ -783,8 +785,9 @@ nvme_qpair_construct(struct nvme_qpair *qpair, uint32_t id,
goto out;
}
qpair->act_tr = malloc(sizeof(struct nvme_tracker *) *
qpair->num_entries, M_NVME, M_ZERO | M_WAITOK);
qpair->act_tr = malloc_domainset(sizeof(struct nvme_tracker *) *
qpair->num_entries, M_NVME, DOMAINSET_PREF(qpair->domain),
M_ZERO | M_WAITOK);
return (0);
out:
@ -814,14 +817,14 @@ nvme_qpair_destroy(struct nvme_qpair *qpair)
}
if (qpair->act_tr)
free(qpair->act_tr, M_NVME);
free_domain(qpair->act_tr, M_NVME);
while (!TAILQ_EMPTY(&qpair->free_tr)) {
tr = TAILQ_FIRST(&qpair->free_tr);
TAILQ_REMOVE(&qpair->free_tr, tr, tailq);
bus_dmamap_destroy(qpair->dma_tag_payload,
tr->payload_dma_map);
free(tr, M_NVME);
free_domain(tr, M_NVME);
}
if (qpair->dma_tag)
@ -938,8 +941,8 @@ nvme_qpair_submit_tracker(struct nvme_qpair *qpair, struct nvme_tracker *tr)
ctrlr = qpair->ctrlr;
if (req->timeout)
callout_reset_curcpu(&tr->timer, ctrlr->timeout_period * hz,
nvme_timeout, tr);
callout_reset_on(&tr->timer, ctrlr->timeout_period * hz,
nvme_timeout, tr, qpair->cpu);
/* Copy the command from the tracker to the submission queue. */
memcpy(&qpair->cmd[qpair->sq_tail], &req->cmd, sizeof(req->cmd));


@ -306,9 +306,9 @@ nvme_sysctl_initialize_ctrlr(struct nvme_controller *ctrlr)
ctrlr_tree = device_get_sysctl_tree(ctrlr->dev);
ctrlr_list = SYSCTL_CHILDREN(ctrlr_tree);
SYSCTL_ADD_UINT(ctrlr_ctx, ctrlr_list, OID_AUTO, "num_cpus_per_ioq",
CTLFLAG_RD, &ctrlr->num_cpus_per_ioq, 0,
"Number of CPUs assigned per I/O queue pair");
SYSCTL_ADD_UINT(ctrlr_ctx, ctrlr_list, OID_AUTO, "num_io_queues",
CTLFLAG_RD, &ctrlr->num_io_queues, 0,
"Number of I/O queue pairs");
SYSCTL_ADD_PROC(ctrlr_ctx, ctrlr_list, OID_AUTO,
"int_coal_time", CTLTYPE_UINT | CTLFLAG_RW, ctrlr, 0,