nvmf/vfio-user: implement device quiesce APIs

libvfio-user calls the quiesce callback when the client
requests a memory region add/remove or a device state
change.  In the quiesce callback we pause the subsystem so
that it is safe to handle the request, and once the quiesce
has completed we resume the subsystem.  The quiesce
callback is also used during live migration: every device
state change quiesces the device first.

Change-Id: I3a6a0320ad76c6b2d1d65c754b9f79cce5c9c683
Signed-off-by: Changpeng Liu <changpeng.liu@intel.com>
Reviewed-on: https://review.spdk.io/gerrit/c/spdk/spdk/+/10620
Community-CI: Broadcom CI <spdk-ci.pdl@broadcom.com>
Community-CI: Mellanox Build Bot
Tested-by: SPDK CI Jenkins <sys_sgci@intel.com>
Reviewed-by: Ben Walker <benjamin.walker@intel.com>
Reviewed-by: Jim Harris <james.r.harris@intel.com>
Changpeng Liu 2021-12-06 20:49:14 +08:00 committed by Jim Harris
parent be0aae0658
commit b3cd421ffd
3 changed files with 133 additions and 6 deletions
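For readers following the diff below, here is a minimal, self-contained sketch (not part of the patch) of the pause/resume state machine the new quiesce callback implements. The enum, struct, and helper names are simplified stand-ins of my own; in the real code the pause and resume completions are the asynchronous spdk_nvmf_subsystem_pause()/spdk_nvmf_subsystem_resume() callbacks, and only the state transitions and the queued_quiesce handling are modeled here.

#include <assert.h>
#include <stdbool.h>
#include <stdio.h>

enum ctrlr_state { RUNNING, PAUSING, PAUSED, RESUMING };

struct ctrlr {
	enum ctrlr_state state;
	bool queued_quiesce;	/* quiesce requested while a resume was in flight */
};

/* Completion of the asynchronous subsystem pause: the device is now quiesced,
 * so acknowledge it (the real code calls vfu_device_quiesced() here) and
 * immediately start resuming the subsystem. */
static void quiesce_done(struct ctrlr *c)
{
	assert(c->state == PAUSING);
	c->state = PAUSED;
	c->queued_quiesce = false;
	c->state = RESUMING;
}

/* Completion of the asynchronous subsystem resume: if another quiesce request
 * arrived while the resume was in flight, start pausing again. */
static void resume_done(struct ctrlr *c)
{
	c->state = RUNNING;
	if (c->queued_quiesce) {
		c->state = PAUSING;
	}
}

/* Quiesce request from libvfio-user: return 0 if already quiesced, -1 if the
 * quiesce will complete asynchronously (the real callback also sets EBUSY). */
static int quiesce_cb(struct ctrlr *c)
{
	switch (c->state) {
	case PAUSED:
		return 0;
	case RUNNING:
		c->state = PAUSING;		/* kick off the asynchronous pause */
		return -1;
	case RESUMING:
		c->queued_quiesce = true;	/* handled later in resume_done() */
		return -1;
	default:
		assert(c->state != PAUSING);	/* a quiesce while pausing is a bug */
		return -1;
	}
}

int main(void)
{
	struct ctrlr c = { .state = RUNNING, .queued_quiesce = false };

	printf("quiesce_cb: %d\n", quiesce_cb(&c));	/* -1: pause started */
	quiesce_done(&c);				/* pause done, resume started */
	printf("quiesce_cb: %d\n", quiesce_cb(&c));	/* -1: resume in flight, queued */
	resume_done(&c);				/* resume done, re-pauses */
	printf("final state: %d\n", c.state);		/* 1 == PAUSING: queued quiesce now running */
	return 0;
}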

@@ -214,6 +214,8 @@ struct nvmf_vfio_user_ctrlr {
struct spdk_thread *thread;
struct spdk_poller *vfu_ctx_poller;
bool queued_quiesce;
bool reset_shn;
uint16_t cntlid;
@@ -1622,7 +1624,7 @@ memory_region_add_cb(vfu_ctx_t *vfu_ctx, vfu_dma_info_t *info)
pthread_mutex_unlock(&endpoint->lock);
}
static int
static void
memory_region_remove_cb(vfu_ctx_t *vfu_ctx, vfu_dma_info_t *info)
{
struct nvmf_vfio_user_endpoint *endpoint = vfu_get_private(vfu_ctx);
@@ -1632,7 +1634,7 @@ memory_region_remove_cb(vfu_ctx_t *vfu_ctx, vfu_dma_info_t *info)
int ret = 0;
if (!info->vaddr) {
return 0;
return;
}
if (((uintptr_t)info->mapping.iov_base & MASK_2MB) ||
@@ -1640,7 +1642,7 @@ memory_region_remove_cb(vfu_ctx_t *vfu_ctx, vfu_dma_info_t *info)
SPDK_DEBUGLOG(nvmf_vfio, "Invalid memory region vaddr %p, IOVA %#lx-%#lx\n", info->vaddr,
(uintptr_t)info->mapping.iov_base,
(uintptr_t)info->mapping.iov_base + info->mapping.iov_len);
return 0;
return;
}
assert(endpoint != NULL);
@@ -1678,8 +1680,6 @@ memory_region_remove_cb(vfu_ctx_t *vfu_ctx, vfu_dma_info_t *info)
ret);
}
}
return 0;
}
static int
@@ -1943,6 +1943,125 @@ init_pci_config_space(vfu_pci_config_space_t *p)
p->hdr.intr.ipin = 0x1;
}
static void
vfio_user_dev_quiesce_done(struct spdk_nvmf_subsystem *subsystem,
void *cb_arg, int status);
static void
vfio_user_dev_quiesce_resume_done(struct spdk_nvmf_subsystem *subsystem,
void *cb_arg, int status)
{
struct nvmf_vfio_user_ctrlr *vu_ctrlr = cb_arg;
struct nvmf_vfio_user_endpoint *endpoint = vu_ctrlr->endpoint;
int ret;
SPDK_DEBUGLOG(nvmf_vfio, "%s resumed done with status %d\n", ctrlr_id(vu_ctrlr), status);
vu_ctrlr->state = VFIO_USER_CTRLR_RUNNING;
/* Basically, once we call `vfu_device_quiesced` the device is unquiesced from
* libvfio-user's perspective, so from the moment `vfio_user_dev_quiesce_done` returns
* libvfio-user might quiesce the device again. However, because resuming the NVMf
* subsystem is an asynchronous operation, such a quiesce request might arrive _before_
* the NVMf subsystem has been resumed, so in the callback of `spdk_nvmf_subsystem_resume`
* we need to check whether a quiesce was requested.
*/
if (vu_ctrlr->queued_quiesce) {
SPDK_DEBUGLOG(nvmf_vfio, "%s has queued quiesce event, pause again\n", ctrlr_id(vu_ctrlr));
vu_ctrlr->state = VFIO_USER_CTRLR_PAUSING;
ret = spdk_nvmf_subsystem_pause((struct spdk_nvmf_subsystem *)endpoint->subsystem, 0,
vfio_user_dev_quiesce_done, vu_ctrlr);
if (ret < 0) {
vu_ctrlr->state = VFIO_USER_CTRLR_RUNNING;
SPDK_ERRLOG("%s: failed to pause, ret=%d\n", endpoint_id(endpoint), ret);
}
}
}
static void
vfio_user_dev_quiesce_done(struct spdk_nvmf_subsystem *subsystem,
void *cb_arg, int status)
{
struct nvmf_vfio_user_ctrlr *vu_ctrlr = cb_arg;
struct nvmf_vfio_user_endpoint *endpoint = vu_ctrlr->endpoint;
int ret;
SPDK_DEBUGLOG(nvmf_vfio, "%s paused done with status %d\n", ctrlr_id(vu_ctrlr), status);
assert(vu_ctrlr->state == VFIO_USER_CTRLR_PAUSING);
vu_ctrlr->state = VFIO_USER_CTRLR_PAUSED;
vfu_device_quiesced(endpoint->vfu_ctx, status);
vu_ctrlr->queued_quiesce = false;
SPDK_DEBUGLOG(nvmf_vfio, "%s start to resume\n", ctrlr_id(vu_ctrlr));
vu_ctrlr->state = VFIO_USER_CTRLR_RESUMING;
ret = spdk_nvmf_subsystem_resume((struct spdk_nvmf_subsystem *)endpoint->subsystem,
vfio_user_dev_quiesce_resume_done, vu_ctrlr);
if (ret < 0) {
vu_ctrlr->state = VFIO_USER_CTRLR_PAUSED;
SPDK_ERRLOG("%s: failed to resume, ret=%d\n", endpoint_id(endpoint), ret);
}
}
static int
vfio_user_dev_quiesce_cb(vfu_ctx_t *vfu_ctx)
{
struct nvmf_vfio_user_endpoint *endpoint = vfu_get_private(vfu_ctx);
struct nvmf_vfio_user_ctrlr *vu_ctrlr = endpoint->ctrlr;
int ret;
if (!vu_ctrlr) {
return 0;
}
/* The NVMf library will destruct the controller when there
* are no connected queue pairs left.
*/
if (!nvmf_subsystem_get_ctrlr((struct spdk_nvmf_subsystem *)endpoint->subsystem,
vu_ctrlr->cntlid)) {
return 0;
}
SPDK_DEBUGLOG(nvmf_vfio, "%s starts to quiesce\n", ctrlr_id(vu_ctrlr));
/* There is no race condition here, as the device quiesce callback
* and nvmf_prop_set_cc() run in the same thread context.
*/
if (!vu_ctrlr->ctrlr->vcprop.cc.bits.en) {
return 0;
} else if (!vu_ctrlr->ctrlr->vcprop.csts.bits.rdy) {
return 0;
} else if (vu_ctrlr->ctrlr->vcprop.csts.bits.shst == SPDK_NVME_SHST_COMPLETE) {
return 0;
}
switch (vu_ctrlr->state) {
case VFIO_USER_CTRLR_PAUSED:
return 0;
case VFIO_USER_CTRLR_RUNNING:
vu_ctrlr->state = VFIO_USER_CTRLR_PAUSING;
ret = spdk_nvmf_subsystem_pause((struct spdk_nvmf_subsystem *)endpoint->subsystem, 0,
vfio_user_dev_quiesce_done, vu_ctrlr);
if (ret < 0) {
vu_ctrlr->state = VFIO_USER_CTRLR_RUNNING;
SPDK_ERRLOG("%s: failed to pause, ret=%d\n", endpoint_id(endpoint), ret);
return 0;
}
break;
case VFIO_USER_CTRLR_RESUMING:
vu_ctrlr->queued_quiesce = true;
SPDK_DEBUGLOG(nvmf_vfio, "%s is busy to quiesce, current state %u\n", ctrlr_id(vu_ctrlr),
vu_ctrlr->state);
break;
default:
assert(vu_ctrlr->state != VFIO_USER_CTRLR_PAUSING);
break;
}
errno = EBUSY;
return -1;
}
static int
vfio_user_dev_info_fill(struct nvmf_vfio_user_transport *vu_transport,
struct nvmf_vfio_user_endpoint *endpoint)
@@ -2058,6 +2177,8 @@ vfio_user_dev_info_fill(struct nvmf_vfio_user_transport *vu_transport,
return ret;
}
vfu_setup_device_quiesce_cb(vfu_ctx, vfio_user_dev_quiesce_cb);
ret = vfu_realize_ctx(vfu_ctx);
if (ret < 0) {
SPDK_ERRLOG("vfu_ctx %p failed to realize\n", vfu_ctx);

@@ -1 +1 @@
Subproject commit 56842967566dcf4b89514949a6e88ba89eeaf268
Subproject commit 17769cf1af093dfb4b9bc3347ae39324029989ac

@@ -50,6 +50,10 @@ DEFINE_STUB(spdk_nvmf_qpair_disconnect, int, (struct spdk_nvmf_qpair *qpair,
DEFINE_STUB(spdk_nvmf_subsystem_get_nqn, const char *,
(const struct spdk_nvmf_subsystem *subsystem), NULL);
DEFINE_STUB(spdk_bdev_get_block_size, uint32_t, (const struct spdk_bdev *bdev), 512);
DEFINE_STUB(spdk_nvmf_subsystem_pause, int, (struct spdk_nvmf_subsystem *subsystem,
uint32_t nsid, spdk_nvmf_subsystem_state_change_done cb_fn, void *cb_arg), 0);
DEFINE_STUB(spdk_nvmf_subsystem_resume, int, (struct spdk_nvmf_subsystem *subsystem,
spdk_nvmf_subsystem_state_change_done cb_fn, void *cb_arg), 0);
DEFINE_STUB_V(nvmf_ctrlr_abort_aer, (struct spdk_nvmf_ctrlr *ctrlr));
DEFINE_STUB(nvmf_ctrlr_async_event_error_event, int, (struct spdk_nvmf_ctrlr *ctrlr,
union spdk_nvme_async_event_completion event), 0);
@@ -58,6 +62,8 @@ DEFINE_STUB(spdk_nvmf_qpair_get_listen_trid, int, (struct spdk_nvmf_qpair *qpair
struct spdk_nvme_transport_id *trid), 0);
DEFINE_STUB(spdk_nvme_transport_id_compare, int, (const struct spdk_nvme_transport_id *trid1,
const struct spdk_nvme_transport_id *trid2), 0);
DEFINE_STUB(nvmf_subsystem_get_ctrlr, struct spdk_nvmf_ctrlr *,
(struct spdk_nvmf_subsystem *subsystem, uint16_t cntlid), NULL);
static void *
gpa_to_vva(void *prv, uint64_t addr, uint64_t len, int prot)