bhyve: Snapshot impovements for 'blockif' backend

When pausing a block I/O device model as part of suspending a VM, wait
for all active block I/O requests to finish before saving snapshot
data.  This avoids having to save information about in-flight requests
both in the block_if layer and in storage device models.

For the AHCI device model, the queues are now guaranteed to be idle
when taking a snapshot, so remove the code to save queue state and
rely on the initial state in a resumed VM having all queues already
idle.

This will also simplify adding NVMe snapshot support in the future.

Reviewed by:	jhb
Sponsored by:	vStack
Differential Revision: https://reviews.freebsd.org/D26267
This commit is contained in:
Vitaliy Gusev 2022-06-23 11:46:06 -07:00 committed by John Baldwin
parent 3a4256dd86
commit cd9618bdb2
3 changed files with 15 additions and 236 deletions

View File

@ -109,11 +109,9 @@ struct blockif_ctxt {
int bc_psectoff;
int bc_closing;
int bc_paused;
int bc_work_count;
pthread_t bc_btid[BLOCKIF_NUMTHR];
pthread_mutex_t bc_mtx;
pthread_cond_t bc_cond;
pthread_cond_t bc_paused_cond;
pthread_cond_t bc_work_done_cond;
blockif_resize_cb *bc_resize_cb;
void *bc_resize_cb_arg;
@ -362,6 +360,12 @@ blockif_proc(struct blockif_ctxt *bc, struct blockif_elem *be, uint8_t *buf)
(*br->br_callback)(br, err);
}
static inline bool
blockif_empty(const struct blockif_ctxt *bc)
{
return (TAILQ_EMPTY(&bc->bc_pendq) && TAILQ_EMPTY(&bc->bc_busyq));
}
static void *
blockif_thr(void *arg)
{
@ -379,30 +383,21 @@ blockif_thr(void *arg)
pthread_mutex_lock(&bc->bc_mtx);
for (;;) {
bc->bc_work_count++;
/* We cannot process work if the interface is paused */
while (!bc->bc_paused && blockif_dequeue(bc, t, &be)) {
while (blockif_dequeue(bc, t, &be)) {
pthread_mutex_unlock(&bc->bc_mtx);
blockif_proc(bc, be, buf);
pthread_mutex_lock(&bc->bc_mtx);
blockif_complete(bc, be);
}
bc->bc_work_count--;
/* If none of the workers are busy, notify the main thread */
if (bc->bc_work_count == 0)
/* If none to work, notify the main thread */
if (blockif_empty(bc))
pthread_cond_broadcast(&bc->bc_work_done_cond);
/* Check ctxt status here to see if exit requested */
if (bc->bc_closing)
break;
/* Make all worker threads wait here if the device is paused */
while (bc->bc_paused)
pthread_cond_wait(&bc->bc_paused_cond, &bc->bc_mtx);
pthread_cond_wait(&bc->bc_cond, &bc->bc_mtx);
}
pthread_mutex_unlock(&bc->bc_mtx);
@ -638,8 +633,6 @@ blockif_open(nvlist_t *nvl, const char *ident)
pthread_mutex_init(&bc->bc_mtx, NULL);
pthread_cond_init(&bc->bc_cond, NULL);
bc->bc_paused = 0;
bc->bc_work_count = 0;
pthread_cond_init(&bc->bc_paused_cond, NULL);
pthread_cond_init(&bc->bc_work_done_cond, NULL);
TAILQ_INIT(&bc->bc_freeq);
TAILQ_INIT(&bc->bc_pendq);
@ -737,6 +730,7 @@ blockif_request(struct blockif_ctxt *bc, struct blockif_req *breq,
err = 0;
pthread_mutex_lock(&bc->bc_mtx);
assert(!bc->bc_paused);
if (!TAILQ_EMPTY(&bc->bc_freeq)) {
/*
* Enqueue and inform the block i/o thread
@ -761,7 +755,6 @@ blockif_request(struct blockif_ctxt *bc, struct blockif_req *breq,
int
blockif_read(struct blockif_ctxt *bc, struct blockif_req *breq)
{
assert(bc->bc_magic == BLOCKIF_SIG);
return (blockif_request(bc, breq, BOP_READ));
}
@ -769,7 +762,6 @@ blockif_read(struct blockif_ctxt *bc, struct blockif_req *breq)
int
blockif_write(struct blockif_ctxt *bc, struct blockif_req *breq)
{
assert(bc->bc_magic == BLOCKIF_SIG);
return (blockif_request(bc, breq, BOP_WRITE));
}
@ -777,7 +769,6 @@ blockif_write(struct blockif_ctxt *bc, struct blockif_req *breq)
int
blockif_flush(struct blockif_ctxt *bc, struct blockif_req *breq)
{
assert(bc->bc_magic == BLOCKIF_SIG);
return (blockif_request(bc, breq, BOP_FLUSH));
}
@ -785,7 +776,6 @@ blockif_flush(struct blockif_ctxt *bc, struct blockif_req *breq)
int
blockif_delete(struct blockif_ctxt *bc, struct blockif_req *breq)
{
assert(bc->bc_magic == BLOCKIF_SIG);
return (blockif_request(bc, breq, BOP_DELETE));
}
@ -955,7 +945,6 @@ blockif_chs(struct blockif_ctxt *bc, uint16_t *c, uint8_t *h, uint8_t *s)
off_t
blockif_size(struct blockif_ctxt *bc)
{
assert(bc->bc_magic == BLOCKIF_SIG);
return (bc->bc_size);
}
@ -963,7 +952,6 @@ blockif_size(struct blockif_ctxt *bc)
int
blockif_sectsz(struct blockif_ctxt *bc)
{
assert(bc->bc_magic == BLOCKIF_SIG);
return (bc->bc_sectsz);
}
@ -971,7 +959,6 @@ blockif_sectsz(struct blockif_ctxt *bc)
void
blockif_psectsz(struct blockif_ctxt *bc, int *size, int *off)
{
assert(bc->bc_magic == BLOCKIF_SIG);
*size = bc->bc_psectsz;
*off = bc->bc_psectoff;
@ -980,7 +967,6 @@ blockif_psectsz(struct blockif_ctxt *bc, int *size, int *off)
int
blockif_queuesz(struct blockif_ctxt *bc)
{
assert(bc->bc_magic == BLOCKIF_SIG);
return (BLOCKIF_MAXREQ - 1);
}
@ -988,7 +974,6 @@ blockif_queuesz(struct blockif_ctxt *bc)
int
blockif_is_ro(struct blockif_ctxt *bc)
{
assert(bc->bc_magic == BLOCKIF_SIG);
return (bc->bc_rdonly);
}
@ -996,7 +981,6 @@ blockif_is_ro(struct blockif_ctxt *bc)
int
blockif_candelete(struct blockif_ctxt *bc)
{
assert(bc->bc_magic == BLOCKIF_SIG);
return (bc->bc_candelete);
}
@ -1012,7 +996,7 @@ blockif_pause(struct blockif_ctxt *bc)
bc->bc_paused = 1;
/* The interface is paused. Wait for workers to finish their work */
while (bc->bc_work_count)
while (!blockif_empty(bc))
pthread_cond_wait(&bc->bc_work_done_cond, &bc->bc_mtx);
pthread_mutex_unlock(&bc->bc_mtx);
@ -1029,71 +1013,6 @@ blockif_resume(struct blockif_ctxt *bc)
pthread_mutex_lock(&bc->bc_mtx);
bc->bc_paused = 0;
/* resume the threads waiting for paused */
pthread_cond_broadcast(&bc->bc_paused_cond);
/* kick the threads after restore */
pthread_cond_broadcast(&bc->bc_cond);
pthread_mutex_unlock(&bc->bc_mtx);
}
int
blockif_snapshot_req(struct blockif_req *br, struct vm_snapshot_meta *meta)
{
int i;
struct iovec *iov;
int ret;
SNAPSHOT_VAR_OR_LEAVE(br->br_iovcnt, meta, ret, done);
SNAPSHOT_VAR_OR_LEAVE(br->br_offset, meta, ret, done);
SNAPSHOT_VAR_OR_LEAVE(br->br_resid, meta, ret, done);
/*
* XXX: The callback and parameter must be filled by the virtualized
* device that uses the interface, during its init; we're not touching
* them here.
*/
/* Snapshot the iovecs. */
for (i = 0; i < br->br_iovcnt; i++) {
iov = &br->br_iov[i];
SNAPSHOT_VAR_OR_LEAVE(iov->iov_len, meta, ret, done);
/* We assume the iov is a guest-mapped address. */
SNAPSHOT_GUEST2HOST_ADDR_OR_LEAVE(iov->iov_base, iov->iov_len,
false, meta, ret, done);
}
done:
return (ret);
}
int
blockif_snapshot(struct blockif_ctxt *bc, struct vm_snapshot_meta *meta)
{
int ret;
if (bc->bc_paused == 0) {
fprintf(stderr, "%s: Snapshot failed: "
"interface not paused.\r\n", __func__);
return (ENXIO);
}
pthread_mutex_lock(&bc->bc_mtx);
SNAPSHOT_VAR_OR_LEAVE(bc->bc_magic, meta, ret, done);
SNAPSHOT_VAR_OR_LEAVE(bc->bc_ischr, meta, ret, done);
SNAPSHOT_VAR_OR_LEAVE(bc->bc_isgeom, meta, ret, done);
SNAPSHOT_VAR_OR_LEAVE(bc->bc_candelete, meta, ret, done);
SNAPSHOT_VAR_OR_LEAVE(bc->bc_rdonly, meta, ret, done);
SNAPSHOT_VAR_OR_LEAVE(bc->bc_size, meta, ret, done);
SNAPSHOT_VAR_OR_LEAVE(bc->bc_sectsz, meta, ret, done);
SNAPSHOT_VAR_OR_LEAVE(bc->bc_psectsz, meta, ret, done);
SNAPSHOT_VAR_OR_LEAVE(bc->bc_psectoff, meta, ret, done);
SNAPSHOT_VAR_OR_LEAVE(bc->bc_closing, meta, ret, done);
done:
pthread_mutex_unlock(&bc->bc_mtx);
return (ret);
}
#endif
#endif /* BHYVE_SNAPSHOT */

View File

@ -87,10 +87,6 @@ int blockif_close(struct blockif_ctxt *bc);
#ifdef BHYVE_SNAPSHOT
void blockif_pause(struct blockif_ctxt *bc);
void blockif_resume(struct blockif_ctxt *bc);
int blockif_snapshot_req(struct blockif_req *br,
struct vm_snapshot_meta *meta);
int blockif_snapshot(struct blockif_ctxt *bc,
struct vm_snapshot_meta *meta);
#endif
#endif /* _BLOCK_IF_H_ */

View File

@ -2562,114 +2562,14 @@ pci_ahci_init(struct vmctx *ctx, struct pci_devinst *pi, nvlist_t *nvl)
}
#ifdef BHYVE_SNAPSHOT
static int
pci_ahci_snapshot_save_queues(struct ahci_port *port,
struct vm_snapshot_meta *meta)
{
int ret;
int idx;
struct ahci_ioreq *ioreq;
STAILQ_FOREACH(ioreq, &port->iofhd, io_flist) {
idx = ((void *) ioreq - (void *) port->ioreq) / sizeof(*ioreq);
SNAPSHOT_VAR_OR_LEAVE(idx, meta, ret, done);
}
idx = -1;
SNAPSHOT_VAR_OR_LEAVE(idx, meta, ret, done);
TAILQ_FOREACH(ioreq, &port->iobhd, io_blist) {
idx = ((void *) ioreq - (void *) port->ioreq) / sizeof(*ioreq);
SNAPSHOT_VAR_OR_LEAVE(idx, meta, ret, done);
/*
* Snapshot only the busy requests; other requests are
* not valid.
*/
ret = blockif_snapshot_req(&ioreq->io_req, meta);
if (ret != 0) {
fprintf(stderr, "%s: failed to snapshot req\r\n",
__func__);
goto done;
}
}
idx = -1;
SNAPSHOT_VAR_OR_LEAVE(idx, meta, ret, done);
done:
return (ret);
}
static int
pci_ahci_snapshot_restore_queues(struct ahci_port *port,
struct vm_snapshot_meta *meta)
{
int ret;
int idx;
struct ahci_ioreq *ioreq;
/* Empty the free queue before restoring. */
while (!STAILQ_EMPTY(&port->iofhd))
STAILQ_REMOVE_HEAD(&port->iofhd, io_flist);
/* Restore the free queue. */
while (1) {
SNAPSHOT_VAR_OR_LEAVE(idx, meta, ret, done);
if (idx == -1)
break;
STAILQ_INSERT_TAIL(&port->iofhd, &port->ioreq[idx], io_flist);
}
/* Restore the busy queue. */
while (1) {
SNAPSHOT_VAR_OR_LEAVE(idx, meta, ret, done);
if (idx == -1)
break;
ioreq = &port->ioreq[idx];
TAILQ_INSERT_TAIL(&port->iobhd, ioreq, io_blist);
/*
* Restore only the busy requests; other requests are
* not valid.
*/
ret = blockif_snapshot_req(&ioreq->io_req, meta);
if (ret != 0) {
fprintf(stderr, "%s: failed to restore request\r\n",
__func__);
goto done;
}
/* Re-enqueue the requests in the block interface. */
if (ioreq->readop)
ret = blockif_read(port->bctx, &ioreq->io_req);
else
ret = blockif_write(port->bctx, &ioreq->io_req);
if (ret != 0) {
fprintf(stderr,
"%s: failed to re-enqueue request\r\n",
__func__);
goto done;
}
}
done:
return (ret);
}
static int
pci_ahci_snapshot(struct vm_snapshot_meta *meta)
{
int i, j, ret;
int i, ret;
void *bctx;
struct pci_devinst *pi;
struct pci_ahci_softc *sc;
struct ahci_port *port;
struct ahci_cmd_hdr *hdr;
struct ahci_ioreq *ioreq;
pi = meta->dev_data;
sc = pi->pi_arg;
@ -2753,43 +2653,7 @@ pci_ahci_snapshot(struct vm_snapshot_meta *meta)
SNAPSHOT_VAR_OR_LEAVE(port->fbs, meta, ret, done);
SNAPSHOT_VAR_OR_LEAVE(port->ioqsz, meta, ret, done);
for (j = 0; j < port->ioqsz; j++) {
ioreq = &port->ioreq[j];
/* blockif_req snapshot done only for busy requests. */
hdr = (struct ahci_cmd_hdr *)(port->cmd_lst +
ioreq->slot * AHCI_CL_SIZE);
SNAPSHOT_GUEST2HOST_ADDR_OR_LEAVE(ioreq->cfis,
0x80 + hdr->prdtl * sizeof(struct ahci_prdt_entry),
false, meta, ret, done);
SNAPSHOT_VAR_OR_LEAVE(ioreq->len, meta, ret, done);
SNAPSHOT_VAR_OR_LEAVE(ioreq->done, meta, ret, done);
SNAPSHOT_VAR_OR_LEAVE(ioreq->slot, meta, ret, done);
SNAPSHOT_VAR_OR_LEAVE(ioreq->more, meta, ret, done);
SNAPSHOT_VAR_OR_LEAVE(ioreq->readop, meta, ret, done);
}
/* Perform save / restore specific operations. */
if (meta->op == VM_SNAPSHOT_SAVE) {
ret = pci_ahci_snapshot_save_queues(port, meta);
if (ret != 0)
goto done;
} else if (meta->op == VM_SNAPSHOT_RESTORE) {
ret = pci_ahci_snapshot_restore_queues(port, meta);
if (ret != 0)
goto done;
} else {
ret = EINVAL;
goto done;
}
ret = blockif_snapshot(port->bctx, meta);
if (ret != 0) {
fprintf(stderr, "%s: failed to restore blockif\r\n",
__func__);
goto done;
}
assert(TAILQ_EMPTY(&port->iobhd));
}
done:
@ -2835,7 +2699,7 @@ pci_ahci_resume(struct vmctx *ctx, struct pci_devinst *pi)
return (0);
}
#endif
#endif /* BHYVE_SNAPSHOT */
/*
* Use separate emulation names to distinguish drive and atapi devices