d3e4e51223
Save and restore (also known as suspend and resume) permits a snapshot to be taken of a guest's state that can later be resumed. In the current implementation, bhyve(8) creates a UNIX domain socket that is used by bhyvectl(8) to send a request to save a snapshot (and optionally exit after the snapshot has been taken). A snapshot currently consists of two files: the first holds a copy of guest RAM, and the second file holds other guest state such as vCPU register values and device model state. To resume a guest, bhyve(8) must be started with a matching pair of command line arguments to instantiate the same set of device models as well as a pointer to the saved snapshot. While the current implementation is useful for several use cases, it has a few limitations. The file format for saving the guest state is tied to the ABI of internal bhyve structures and is not self-describing (in that it does not communicate the set of device models present in the system). In addition, the state saved for some device models closely matches the internal data structures which might prove a challenge for compatibility of snapshot files across a range of bhyve versions. The file format also does not currently support versioning of individual chunks of state. As a result, the current file format is not a fixed binary format and future revisions to save and restore will break binary compatibility of snapshot files. The goal is to move to a more flexible format that adds versioning, etc. and at that point to commit to providing a reasonable level of compatibility. As a result, the current implementation is not enabled by default. It can be enabled via the WITH_BHYVE_SNAPSHOT=yes option for userland builds, and the kernel option BHYVE_SNAPSHOT.
Submitted by: Mihai Tiganus, Flavius Anton, Darius Mihai Submitted by: Elena Mihailescu, Mihai Carabas, Sergiu Weisz Relnotes: yes Sponsored by: University Politehnica of Bucharest Sponsored by: Matthew Grooms (student scholarships) Sponsored by: iXsystems Differential Revision: https://reviews.freebsd.org/D19495
561 lines
12 KiB
C
561 lines
12 KiB
C
/*-
|
|
* SPDX-License-Identifier: BSD-2-Clause-FreeBSD
|
|
*
|
|
* Copyright (c) 2013 Anish Gupta (akgupt3@gmail.com)
|
|
* All rights reserved.
|
|
*
|
|
* Redistribution and use in source and binary forms, with or without
|
|
* modification, are permitted provided that the following conditions
|
|
* are met:
|
|
* 1. Redistributions of source code must retain the above copyright
|
|
* notice unmodified, this list of conditions, and the following
|
|
* disclaimer.
|
|
* 2. Redistributions in binary form must reproduce the above copyright
|
|
* notice, this list of conditions and the following disclaimer in the
|
|
* documentation and/or other materials provided with the distribution.
|
|
*
|
|
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
|
|
* IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
|
|
* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
|
|
* IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
|
|
* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
|
|
* NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
|
* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
|
* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
|
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
|
|
* THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|
*/
|
|
|
|
#include <sys/cdefs.h>
|
|
__FBSDID("$FreeBSD$");
|
|
|
|
#include "opt_bhyve_snapshot.h"
|
|
|
|
#include <sys/param.h>
|
|
#include <sys/systm.h>
|
|
|
|
#include <machine/segments.h>
|
|
#include <machine/specialreg.h>
|
|
#include <machine/vmm.h>
|
|
#include <machine/vmm_snapshot.h>
|
|
|
|
#include "vmm_ktr.h"
|
|
|
|
#include "vmcb.h"
|
|
#include "svm.h"
|
|
#include "svm_softc.h"
|
|
|
|
/*
|
|
* The VMCB aka Virtual Machine Control Block is a 4KB aligned page
|
|
* in memory that describes the virtual machine.
|
|
*
|
|
* The VMCB contains:
|
|
* - instructions or events in the guest to intercept
|
|
* - control bits that modify execution environment of the guest
|
|
* - guest processor state (e.g. general purpose registers)
|
|
*/
|
|
|
|
/*
|
|
* Return VMCB segment area.
|
|
*/
|
|
static struct vmcb_segment *
|
|
vmcb_segptr(struct vmcb *vmcb, int type)
|
|
{
|
|
struct vmcb_state *state;
|
|
struct vmcb_segment *seg;
|
|
|
|
state = &vmcb->state;
|
|
|
|
switch (type) {
|
|
case VM_REG_GUEST_CS:
|
|
seg = &state->cs;
|
|
break;
|
|
|
|
case VM_REG_GUEST_DS:
|
|
seg = &state->ds;
|
|
break;
|
|
|
|
case VM_REG_GUEST_ES:
|
|
seg = &state->es;
|
|
break;
|
|
|
|
case VM_REG_GUEST_FS:
|
|
seg = &state->fs;
|
|
break;
|
|
|
|
case VM_REG_GUEST_GS:
|
|
seg = &state->gs;
|
|
break;
|
|
|
|
case VM_REG_GUEST_SS:
|
|
seg = &state->ss;
|
|
break;
|
|
|
|
case VM_REG_GUEST_GDTR:
|
|
seg = &state->gdt;
|
|
break;
|
|
|
|
case VM_REG_GUEST_IDTR:
|
|
seg = &state->idt;
|
|
break;
|
|
|
|
case VM_REG_GUEST_LDTR:
|
|
seg = &state->ldt;
|
|
break;
|
|
|
|
case VM_REG_GUEST_TR:
|
|
seg = &state->tr;
|
|
break;
|
|
|
|
default:
|
|
seg = NULL;
|
|
break;
|
|
}
|
|
|
|
return (seg);
|
|
}
|
|
|
|
static int
|
|
vmcb_access(struct svm_softc *softc, int vcpu, int write, int ident,
|
|
uint64_t *val)
|
|
{
|
|
struct vmcb *vmcb;
|
|
int off, bytes;
|
|
char *ptr;
|
|
|
|
vmcb = svm_get_vmcb(softc, vcpu);
|
|
off = VMCB_ACCESS_OFFSET(ident);
|
|
bytes = VMCB_ACCESS_BYTES(ident);
|
|
|
|
if ((off + bytes) >= sizeof (struct vmcb))
|
|
return (EINVAL);
|
|
|
|
ptr = (char *)vmcb;
|
|
|
|
if (!write)
|
|
*val = 0;
|
|
|
|
switch (bytes) {
|
|
case 8:
|
|
case 4:
|
|
case 2:
|
|
if (write)
|
|
memcpy(ptr + off, val, bytes);
|
|
else
|
|
memcpy(val, ptr + off, bytes);
|
|
break;
|
|
default:
|
|
VCPU_CTR1(softc->vm, vcpu,
|
|
"Invalid size %d for VMCB access: %d", bytes);
|
|
return (EINVAL);
|
|
}
|
|
|
|
/* Invalidate all VMCB state cached by h/w. */
|
|
if (write)
|
|
svm_set_dirty(softc, vcpu, 0xffffffff);
|
|
|
|
return (0);
|
|
}
|
|
|
|
/*
|
|
* Read from segment selector, control and general purpose register of VMCB.
|
|
*/
|
|
int
|
|
vmcb_read(struct svm_softc *sc, int vcpu, int ident, uint64_t *retval)
|
|
{
|
|
struct vmcb *vmcb;
|
|
struct vmcb_state *state;
|
|
struct vmcb_segment *seg;
|
|
int err;
|
|
|
|
vmcb = svm_get_vmcb(sc, vcpu);
|
|
state = &vmcb->state;
|
|
err = 0;
|
|
|
|
if (VMCB_ACCESS_OK(ident))
|
|
return (vmcb_access(sc, vcpu, 0, ident, retval));
|
|
|
|
switch (ident) {
|
|
case VM_REG_GUEST_CR0:
|
|
*retval = state->cr0;
|
|
break;
|
|
|
|
case VM_REG_GUEST_CR2:
|
|
*retval = state->cr2;
|
|
break;
|
|
|
|
case VM_REG_GUEST_CR3:
|
|
*retval = state->cr3;
|
|
break;
|
|
|
|
case VM_REG_GUEST_CR4:
|
|
*retval = state->cr4;
|
|
break;
|
|
|
|
case VM_REG_GUEST_DR6:
|
|
*retval = state->dr6;
|
|
break;
|
|
|
|
case VM_REG_GUEST_DR7:
|
|
*retval = state->dr7;
|
|
break;
|
|
|
|
case VM_REG_GUEST_EFER:
|
|
*retval = state->efer;
|
|
break;
|
|
|
|
case VM_REG_GUEST_RAX:
|
|
*retval = state->rax;
|
|
break;
|
|
|
|
case VM_REG_GUEST_RFLAGS:
|
|
*retval = state->rflags;
|
|
break;
|
|
|
|
case VM_REG_GUEST_RIP:
|
|
*retval = state->rip;
|
|
break;
|
|
|
|
case VM_REG_GUEST_RSP:
|
|
*retval = state->rsp;
|
|
break;
|
|
|
|
case VM_REG_GUEST_CS:
|
|
case VM_REG_GUEST_DS:
|
|
case VM_REG_GUEST_ES:
|
|
case VM_REG_GUEST_FS:
|
|
case VM_REG_GUEST_GS:
|
|
case VM_REG_GUEST_SS:
|
|
case VM_REG_GUEST_LDTR:
|
|
case VM_REG_GUEST_TR:
|
|
seg = vmcb_segptr(vmcb, ident);
|
|
KASSERT(seg != NULL, ("%s: unable to get segment %d from VMCB",
|
|
__func__, ident));
|
|
*retval = seg->selector;
|
|
break;
|
|
|
|
case VM_REG_GUEST_GDTR:
|
|
case VM_REG_GUEST_IDTR:
|
|
/* GDTR and IDTR don't have segment selectors */
|
|
err = EINVAL;
|
|
break;
|
|
default:
|
|
err = EINVAL;
|
|
break;
|
|
}
|
|
|
|
return (err);
|
|
}
|
|
|
|
/*
|
|
* Write to segment selector, control and general purpose register of VMCB.
|
|
*/
|
|
int
|
|
vmcb_write(struct svm_softc *sc, int vcpu, int ident, uint64_t val)
|
|
{
|
|
struct vmcb *vmcb;
|
|
struct vmcb_state *state;
|
|
struct vmcb_segment *seg;
|
|
int err, dirtyseg;
|
|
|
|
vmcb = svm_get_vmcb(sc, vcpu);
|
|
state = &vmcb->state;
|
|
dirtyseg = 0;
|
|
err = 0;
|
|
|
|
if (VMCB_ACCESS_OK(ident))
|
|
return (vmcb_access(sc, vcpu, 1, ident, &val));
|
|
|
|
switch (ident) {
|
|
case VM_REG_GUEST_CR0:
|
|
state->cr0 = val;
|
|
svm_set_dirty(sc, vcpu, VMCB_CACHE_CR);
|
|
break;
|
|
|
|
case VM_REG_GUEST_CR2:
|
|
state->cr2 = val;
|
|
svm_set_dirty(sc, vcpu, VMCB_CACHE_CR2);
|
|
break;
|
|
|
|
case VM_REG_GUEST_CR3:
|
|
state->cr3 = val;
|
|
svm_set_dirty(sc, vcpu, VMCB_CACHE_CR);
|
|
break;
|
|
|
|
case VM_REG_GUEST_CR4:
|
|
state->cr4 = val;
|
|
svm_set_dirty(sc, vcpu, VMCB_CACHE_CR);
|
|
break;
|
|
|
|
case VM_REG_GUEST_DR6:
|
|
state->dr6 = val;
|
|
svm_set_dirty(sc, vcpu, VMCB_CACHE_DR);
|
|
break;
|
|
|
|
case VM_REG_GUEST_DR7:
|
|
state->dr7 = val;
|
|
svm_set_dirty(sc, vcpu, VMCB_CACHE_DR);
|
|
break;
|
|
|
|
case VM_REG_GUEST_EFER:
|
|
/* EFER_SVM must always be set when the guest is executing */
|
|
state->efer = val | EFER_SVM;
|
|
svm_set_dirty(sc, vcpu, VMCB_CACHE_CR);
|
|
break;
|
|
|
|
case VM_REG_GUEST_RAX:
|
|
state->rax = val;
|
|
break;
|
|
|
|
case VM_REG_GUEST_RFLAGS:
|
|
state->rflags = val;
|
|
break;
|
|
|
|
case VM_REG_GUEST_RIP:
|
|
state->rip = val;
|
|
break;
|
|
|
|
case VM_REG_GUEST_RSP:
|
|
state->rsp = val;
|
|
break;
|
|
|
|
case VM_REG_GUEST_CS:
|
|
case VM_REG_GUEST_DS:
|
|
case VM_REG_GUEST_ES:
|
|
case VM_REG_GUEST_SS:
|
|
dirtyseg = 1; /* FALLTHROUGH */
|
|
case VM_REG_GUEST_FS:
|
|
case VM_REG_GUEST_GS:
|
|
case VM_REG_GUEST_LDTR:
|
|
case VM_REG_GUEST_TR:
|
|
seg = vmcb_segptr(vmcb, ident);
|
|
KASSERT(seg != NULL, ("%s: unable to get segment %d from VMCB",
|
|
__func__, ident));
|
|
seg->selector = val;
|
|
if (dirtyseg)
|
|
svm_set_dirty(sc, vcpu, VMCB_CACHE_SEG);
|
|
break;
|
|
|
|
case VM_REG_GUEST_GDTR:
|
|
case VM_REG_GUEST_IDTR:
|
|
/* GDTR and IDTR don't have segment selectors */
|
|
err = EINVAL;
|
|
break;
|
|
default:
|
|
err = EINVAL;
|
|
break;
|
|
}
|
|
|
|
return (err);
|
|
}
|
|
|
|
int
|
|
vmcb_seg(struct vmcb *vmcb, int ident, struct vmcb_segment *seg2)
|
|
{
|
|
struct vmcb_segment *seg;
|
|
|
|
seg = vmcb_segptr(vmcb, ident);
|
|
if (seg != NULL) {
|
|
bcopy(seg, seg2, sizeof(struct vmcb_segment));
|
|
return (0);
|
|
} else {
|
|
return (EINVAL);
|
|
}
|
|
}
|
|
|
|
int
|
|
vmcb_setdesc(void *arg, int vcpu, int reg, struct seg_desc *desc)
|
|
{
|
|
struct vmcb *vmcb;
|
|
struct svm_softc *sc;
|
|
struct vmcb_segment *seg;
|
|
uint16_t attrib;
|
|
|
|
sc = arg;
|
|
vmcb = svm_get_vmcb(sc, vcpu);
|
|
|
|
seg = vmcb_segptr(vmcb, reg);
|
|
KASSERT(seg != NULL, ("%s: invalid segment descriptor %d",
|
|
__func__, reg));
|
|
|
|
seg->base = desc->base;
|
|
seg->limit = desc->limit;
|
|
if (reg != VM_REG_GUEST_GDTR && reg != VM_REG_GUEST_IDTR) {
|
|
/*
|
|
* Map seg_desc access to VMCB attribute format.
|
|
*
|
|
* SVM uses the 'P' bit in the segment attributes to indicate a
|
|
* NULL segment so clear it if the segment is marked unusable.
|
|
*/
|
|
attrib = ((desc->access & 0xF000) >> 4) | (desc->access & 0xFF);
|
|
if (SEG_DESC_UNUSABLE(desc->access)) {
|
|
attrib &= ~0x80;
|
|
}
|
|
seg->attrib = attrib;
|
|
}
|
|
|
|
VCPU_CTR4(sc->vm, vcpu, "Setting desc %d: base (%#lx), limit (%#x), "
|
|
"attrib (%#x)", reg, seg->base, seg->limit, seg->attrib);
|
|
|
|
switch (reg) {
|
|
case VM_REG_GUEST_CS:
|
|
case VM_REG_GUEST_DS:
|
|
case VM_REG_GUEST_ES:
|
|
case VM_REG_GUEST_SS:
|
|
svm_set_dirty(sc, vcpu, VMCB_CACHE_SEG);
|
|
break;
|
|
case VM_REG_GUEST_GDTR:
|
|
case VM_REG_GUEST_IDTR:
|
|
svm_set_dirty(sc, vcpu, VMCB_CACHE_DT);
|
|
break;
|
|
default:
|
|
break;
|
|
}
|
|
|
|
return (0);
|
|
}
|
|
|
|
int
|
|
vmcb_getdesc(void *arg, int vcpu, int reg, struct seg_desc *desc)
|
|
{
|
|
struct vmcb *vmcb;
|
|
struct svm_softc *sc;
|
|
struct vmcb_segment *seg;
|
|
|
|
sc = arg;
|
|
vmcb = svm_get_vmcb(sc, vcpu);
|
|
seg = vmcb_segptr(vmcb, reg);
|
|
KASSERT(seg != NULL, ("%s: invalid segment descriptor %d",
|
|
__func__, reg));
|
|
|
|
desc->base = seg->base;
|
|
desc->limit = seg->limit;
|
|
desc->access = 0;
|
|
|
|
if (reg != VM_REG_GUEST_GDTR && reg != VM_REG_GUEST_IDTR) {
|
|
/* Map seg_desc access to VMCB attribute format */
|
|
desc->access = ((seg->attrib & 0xF00) << 4) |
|
|
(seg->attrib & 0xFF);
|
|
|
|
/*
|
|
* VT-x uses bit 16 to indicate a segment that has been loaded
|
|
* with a NULL selector (aka unusable). The 'desc->access'
|
|
* field is interpreted in the VT-x format by the
|
|
* processor-independent code.
|
|
*
|
|
* SVM uses the 'P' bit to convey the same information so
|
|
* convert it into the VT-x format. For more details refer to
|
|
* section "Segment State in the VMCB" in APMv2.
|
|
*/
|
|
if (reg != VM_REG_GUEST_CS && reg != VM_REG_GUEST_TR) {
|
|
if ((desc->access & 0x80) == 0)
|
|
desc->access |= 0x10000; /* Unusable segment */
|
|
}
|
|
}
|
|
|
|
return (0);
|
|
}
|
|
|
|
#ifdef BHYVE_SNAPSHOT
|
|
int
|
|
vmcb_getany(struct svm_softc *sc, int vcpu, int ident, uint64_t *val)
|
|
{
|
|
int error = 0;
|
|
|
|
if (vcpu < 0 || vcpu >= VM_MAXCPU) {
|
|
error = EINVAL;
|
|
goto err;
|
|
}
|
|
|
|
if (ident >= VM_REG_LAST) {
|
|
error = EINVAL;
|
|
goto err;
|
|
}
|
|
|
|
error = vm_get_register(sc->vm, vcpu, ident, val);
|
|
|
|
err:
|
|
return (error);
|
|
}
|
|
|
|
int
|
|
vmcb_setany(struct svm_softc *sc, int vcpu, int ident, uint64_t val)
|
|
{
|
|
int error = 0;
|
|
|
|
if (vcpu < 0 || vcpu >= VM_MAXCPU) {
|
|
error = EINVAL;
|
|
goto err;
|
|
}
|
|
|
|
if (ident >= VM_REG_LAST) {
|
|
error = EINVAL;
|
|
goto err;
|
|
}
|
|
|
|
error = vm_set_register(sc->vm, vcpu, ident, val);
|
|
|
|
err:
|
|
return (error);
|
|
}
|
|
|
|
int
|
|
vmcb_snapshot_desc(void *arg, int vcpu, int reg, struct vm_snapshot_meta *meta)
|
|
{
|
|
int ret;
|
|
struct seg_desc desc;
|
|
|
|
if (meta->op == VM_SNAPSHOT_SAVE) {
|
|
ret = vmcb_getdesc(arg, vcpu, reg, &desc);
|
|
if (ret != 0)
|
|
goto done;
|
|
|
|
SNAPSHOT_VAR_OR_LEAVE(desc.base, meta, ret, done);
|
|
SNAPSHOT_VAR_OR_LEAVE(desc.limit, meta, ret, done);
|
|
SNAPSHOT_VAR_OR_LEAVE(desc.access, meta, ret, done);
|
|
} else if (meta->op == VM_SNAPSHOT_RESTORE) {
|
|
SNAPSHOT_VAR_OR_LEAVE(desc.base, meta, ret, done);
|
|
SNAPSHOT_VAR_OR_LEAVE(desc.limit, meta, ret, done);
|
|
SNAPSHOT_VAR_OR_LEAVE(desc.access, meta, ret, done);
|
|
|
|
ret = vmcb_setdesc(arg, vcpu, reg, &desc);
|
|
if (ret != 0)
|
|
goto done;
|
|
} else {
|
|
ret = EINVAL;
|
|
goto done;
|
|
}
|
|
|
|
done:
|
|
return (ret);
|
|
}
|
|
|
|
int
|
|
vmcb_snapshot_any(struct svm_softc *sc, int vcpu, int ident,
|
|
struct vm_snapshot_meta *meta)
|
|
{
|
|
int ret;
|
|
uint64_t val;
|
|
|
|
if (meta->op == VM_SNAPSHOT_SAVE) {
|
|
ret = vmcb_getany(sc, vcpu, ident, &val);
|
|
if (ret != 0)
|
|
goto done;
|
|
|
|
SNAPSHOT_VAR_OR_LEAVE(val, meta, ret, done);
|
|
} else if (meta->op == VM_SNAPSHOT_RESTORE) {
|
|
SNAPSHOT_VAR_OR_LEAVE(val, meta, ret, done);
|
|
|
|
ret = vmcb_setany(sc, vcpu, ident, val);
|
|
if (ret != 0)
|
|
goto done;
|
|
} else {
|
|
ret = EINVAL;
|
|
goto done;
|
|
}
|
|
|
|
done:
|
|
return (ret);
|
|
}
|
|
#endif
|