2013-04-11 06:52:19 +00:00
|
|
|
/*-
|
2018-06-13 07:02:58 +00:00
|
|
|
* SPDX-License-Identifier: BSD-2-Clause-FreeBSD
|
|
|
|
*
|
2013-04-11 06:52:19 +00:00
|
|
|
* Copyright (c) 2013 Anish Gupta (akgupt3@gmail.com)
|
|
|
|
* All rights reserved.
|
|
|
|
*
|
|
|
|
* Redistribution and use in source and binary forms, with or without
|
|
|
|
* modification, are permitted provided that the following conditions
|
|
|
|
* are met:
|
|
|
|
* 1. Redistributions of source code must retain the above copyright
|
2013-04-27 04:49:51 +00:00
|
|
|
* notice unmodified, this list of conditions, and the following
|
|
|
|
* disclaimer.
|
2013-04-11 06:52:19 +00:00
|
|
|
* 2. Redistributions in binary form must reproduce the above copyright
|
|
|
|
* notice, this list of conditions and the following disclaimer in the
|
|
|
|
* documentation and/or other materials provided with the distribution.
|
|
|
|
*
|
2013-04-27 04:49:51 +00:00
|
|
|
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
|
|
|
|
* IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
|
|
|
|
* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
|
|
|
|
* IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
|
|
|
|
* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
|
|
|
|
* NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
|
|
|
* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
|
|
|
* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
|
|
|
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
|
|
|
|
* THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
2013-04-11 06:52:19 +00:00
|
|
|
*/
|
|
|
|
|
|
|
|
#include <sys/cdefs.h>
|
|
|
|
__FBSDID("$FreeBSD$");
|
|
|
|
|
Initial support for bhyve save and restore.
Save and restore (also known as suspend and resume) permits a snapshot
to be taken of a guest's state that can later be resumed. In the
current implementation, bhyve(8) creates a UNIX domain socket that is
used by bhyvectl(8) to send a request to save a snapshot (and
optionally exit after the snapshot has been taken). A snapshot
currently consists of two files: the first holds a copy of guest RAM,
and the second file holds other guest state such as vCPU register
values and device model state.
To resume a guest, bhyve(8) must be started with a matching pair of
command line arguments to instantiate the same set of device models as
well as a pointer to the saved snapshot.
While the current implementation is useful for several use cases, it
has a few limitations. The file format for saving the guest state is
tied to the ABI of internal bhyve structures and is not
self-describing (in that it does not communicate the set of device
models present in the system). In addition, the state saved for some
device models closely matches the internal data structures which might
prove a challenge for compatibility of snapshot files across a range
of bhyve versions. The file format also does not currently support
versioning of individual chunks of state. As a result, the current
file format is not a fixed binary format and future revisions to save
and restore will break binary compatibility of snapshot files. The
goal is to move to a more flexible format that adds versioning,
etc. and at that point to commit to providing a reasonable level of
compatibility. As a result, the current implementation is not enabled
by default. It can be enabled via the WITH_BHYVE_SNAPSHOT=yes option
for userland builds, and the kernel option BHYVE_SNAPSHOT.
Submitted by: Mihai Tiganus, Flavius Anton, Darius Mihai
Submitted by: Elena Mihailescu, Mihai Carabas, Sergiu Weisz
Relnotes: yes
Sponsored by: University Politehnica of Bucharest
Sponsored by: Matthew Grooms (student scholarships)
Sponsored by: iXsystems
Differential Revision: https://reviews.freebsd.org/D19495
2020-05-05 00:02:04 +00:00
|
|
|
#include "opt_bhyve_snapshot.h"
|
|
|
|
|
2013-04-11 06:52:19 +00:00
|
|
|
#include <sys/param.h>
|
|
|
|
#include <sys/systm.h>
|
|
|
|
|
|
|
|
#include <machine/segments.h>
|
|
|
|
#include <machine/specialreg.h>
|
|
|
|
#include <machine/vmm.h>
|
Initial support for bhyve save and restore.
Save and restore (also known as suspend and resume) permits a snapshot
to be taken of a guest's state that can later be resumed. In the
current implementation, bhyve(8) creates a UNIX domain socket that is
used by bhyvectl(8) to send a request to save a snapshot (and
optionally exit after the snapshot has been taken). A snapshot
currently consists of two files: the first holds a copy of guest RAM,
and the second file holds other guest state such as vCPU register
values and device model state.
To resume a guest, bhyve(8) must be started with a matching pair of
command line arguments to instantiate the same set of device models as
well as a pointer to the saved snapshot.
While the current implementation is useful for several use cases, it
has a few limitations. The file format for saving the guest state is
tied to the ABI of internal bhyve structures and is not
self-describing (in that it does not communicate the set of device
models present in the system). In addition, the state saved for some
device models closely matches the internal data structures which might
prove a challenge for compatibility of snapshot files across a range
of bhyve versions. The file format also does not currently support
versioning of individual chunks of state. As a result, the current
file format is not a fixed binary format and future revisions to save
and restore will break binary compatibility of snapshot files. The
goal is to move to a more flexible format that adds versioning,
etc. and at that point to commit to providing a reasonable level of
compatibility. As a result, the current implementation is not enabled
by default. It can be enabled via the WITH_BHYVE_SNAPSHOT=yes option
for userland builds, and the kernel option BHYVE_SNAPSHOT.
Submitted by: Mihai Tiganus, Flavius Anton, Darius Mihai
Submitted by: Elena Mihailescu, Mihai Carabas, Sergiu Weisz
Relnotes: yes
Sponsored by: University Politehnica of Bucharest
Sponsored by: Matthew Grooms (student scholarships)
Sponsored by: iXsystems
Differential Revision: https://reviews.freebsd.org/D19495
2020-05-05 00:02:04 +00:00
|
|
|
#include <machine/vmm_snapshot.h>
|
2013-04-11 06:52:19 +00:00
|
|
|
|
2014-09-21 23:42:54 +00:00
|
|
|
#include "vmm_ktr.h"
|
|
|
|
|
2013-04-11 06:52:19 +00:00
|
|
|
#include "vmcb.h"
|
2013-04-27 04:49:51 +00:00
|
|
|
#include "svm.h"
|
2014-09-21 23:42:54 +00:00
|
|
|
#include "svm_softc.h"
|
2013-04-11 06:52:19 +00:00
|
|
|
|
2013-04-15 04:16:12 +00:00
|
|
|
/*
|
|
|
|
* The VMCB aka Virtual Machine Control Block is a 4KB aligned page
|
|
|
|
* in memory that describes the virtual machine.
|
|
|
|
*
|
|
|
|
* The VMCB contains:
|
|
|
|
* - instructions or events in the guest to intercept
|
|
|
|
* - control bits that modify execution environment of the guest
|
|
|
|
* - guest processor state (e.g. general purpose registers)
|
|
|
|
*/
|
|
|
|
|
2014-09-21 23:42:54 +00:00
|
|
|
/*
|
|
|
|
* Return VMCB segment area.
|
|
|
|
*/
|
|
|
|
static struct vmcb_segment *
|
|
|
|
vmcb_segptr(struct vmcb *vmcb, int type)
|
|
|
|
{
|
|
|
|
struct vmcb_state *state;
|
|
|
|
struct vmcb_segment *seg;
|
|
|
|
|
|
|
|
state = &vmcb->state;
|
|
|
|
|
|
|
|
switch (type) {
|
|
|
|
case VM_REG_GUEST_CS:
|
|
|
|
seg = &state->cs;
|
|
|
|
break;
|
|
|
|
|
|
|
|
case VM_REG_GUEST_DS:
|
|
|
|
seg = &state->ds;
|
|
|
|
break;
|
|
|
|
|
|
|
|
case VM_REG_GUEST_ES:
|
|
|
|
seg = &state->es;
|
|
|
|
break;
|
|
|
|
|
|
|
|
case VM_REG_GUEST_FS:
|
|
|
|
seg = &state->fs;
|
|
|
|
break;
|
|
|
|
|
|
|
|
case VM_REG_GUEST_GS:
|
|
|
|
seg = &state->gs;
|
|
|
|
break;
|
|
|
|
|
|
|
|
case VM_REG_GUEST_SS:
|
|
|
|
seg = &state->ss;
|
|
|
|
break;
|
|
|
|
|
|
|
|
case VM_REG_GUEST_GDTR:
|
|
|
|
seg = &state->gdt;
|
|
|
|
break;
|
|
|
|
|
|
|
|
case VM_REG_GUEST_IDTR:
|
|
|
|
seg = &state->idt;
|
|
|
|
break;
|
|
|
|
|
|
|
|
case VM_REG_GUEST_LDTR:
|
|
|
|
seg = &state->ldt;
|
|
|
|
break;
|
|
|
|
|
|
|
|
case VM_REG_GUEST_TR:
|
|
|
|
seg = &state->tr;
|
|
|
|
break;
|
|
|
|
|
|
|
|
default:
|
|
|
|
seg = NULL;
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
|
|
|
|
return (seg);
|
|
|
|
}
|
|
|
|
|
2014-10-10 21:48:59 +00:00
|
|
|
static int
|
|
|
|
vmcb_access(struct svm_softc *softc, int vcpu, int write, int ident,
|
|
|
|
uint64_t *val)
|
|
|
|
{
|
|
|
|
struct vmcb *vmcb;
|
|
|
|
int off, bytes;
|
|
|
|
char *ptr;
|
|
|
|
|
|
|
|
vmcb = svm_get_vmcb(softc, vcpu);
|
|
|
|
off = VMCB_ACCESS_OFFSET(ident);
|
|
|
|
bytes = VMCB_ACCESS_BYTES(ident);
|
|
|
|
|
|
|
|
if ((off + bytes) >= sizeof (struct vmcb))
|
|
|
|
return (EINVAL);
|
|
|
|
|
|
|
|
ptr = (char *)vmcb;
|
|
|
|
|
|
|
|
if (!write)
|
|
|
|
*val = 0;
|
|
|
|
|
|
|
|
switch (bytes) {
|
|
|
|
case 8:
|
|
|
|
case 4:
|
|
|
|
case 2:
|
|
|
|
if (write)
|
|
|
|
memcpy(ptr + off, val, bytes);
|
|
|
|
else
|
|
|
|
memcpy(val, ptr + off, bytes);
|
|
|
|
break;
|
|
|
|
default:
|
|
|
|
VCPU_CTR1(softc->vm, vcpu,
|
|
|
|
"Invalid size %d for VMCB access: %d", bytes);
|
|
|
|
return (EINVAL);
|
|
|
|
}
|
|
|
|
|
|
|
|
/* Invalidate all VMCB state cached by h/w. */
|
|
|
|
if (write)
|
|
|
|
svm_set_dirty(softc, vcpu, 0xffffffff);
|
|
|
|
|
|
|
|
return (0);
|
|
|
|
}
|
|
|
|
|
2013-04-11 06:52:19 +00:00
|
|
|
/*
|
|
|
|
* Read from segment selector, control and general purpose register of VMCB.
|
|
|
|
*/
|
|
|
|
int
|
2014-09-21 23:42:54 +00:00
|
|
|
vmcb_read(struct svm_softc *sc, int vcpu, int ident, uint64_t *retval)
|
2013-04-11 06:52:19 +00:00
|
|
|
{
|
2014-09-21 23:42:54 +00:00
|
|
|
struct vmcb *vmcb;
|
2013-04-11 06:52:19 +00:00
|
|
|
struct vmcb_state *state;
|
|
|
|
struct vmcb_segment *seg;
|
|
|
|
int err;
|
|
|
|
|
2014-09-21 23:42:54 +00:00
|
|
|
vmcb = svm_get_vmcb(sc, vcpu);
|
2013-04-11 06:52:19 +00:00
|
|
|
state = &vmcb->state;
|
|
|
|
err = 0;
|
|
|
|
|
2014-10-10 21:48:59 +00:00
|
|
|
if (VMCB_ACCESS_OK(ident))
|
|
|
|
return (vmcb_access(sc, vcpu, 0, ident, retval));
|
|
|
|
|
2013-04-11 06:52:19 +00:00
|
|
|
switch (ident) {
|
|
|
|
case VM_REG_GUEST_CR0:
|
|
|
|
*retval = state->cr0;
|
|
|
|
break;
|
|
|
|
|
2014-06-05 06:29:18 +00:00
|
|
|
case VM_REG_GUEST_CR2:
|
|
|
|
*retval = state->cr2;
|
|
|
|
break;
|
|
|
|
|
2013-04-11 06:52:19 +00:00
|
|
|
case VM_REG_GUEST_CR3:
|
|
|
|
*retval = state->cr3;
|
|
|
|
break;
|
|
|
|
|
|
|
|
case VM_REG_GUEST_CR4:
|
|
|
|
*retval = state->cr4;
|
|
|
|
break;
|
|
|
|
|
Save and restore guest debug registers.
Currently most of the debug registers are not saved and restored
during VM transitions allowing guest and host debug register values to
leak into the opposite context. One result is that hardware
watchpoints do not work reliably within a guest under VT-x.
Due to differences in SVM and VT-x, slightly different approaches are
used.
For VT-x:
- Enable debug register save/restore for VM entry/exit in the VMCS for
DR7 and MSR_DEBUGCTL.
- Explicitly save DR0-3,6 of the guest.
- Explicitly save DR0-3,6-7, MSR_DEBUGCTL, and the trap flag from
%rflags for the host. Note that because DR6 is "software" managed
and not stored in the VMCS a kernel debugger which single steps
through VM entry could corrupt the guest DR6 (since a single step
trap taken after loading the guest DR6 could alter the DR6
register). To avoid this, explicitly disable single-stepping via
the trace flag before loading the guest DR6. A determined debugger
could still defeat this by setting a breakpoint after the guest DR6
was loaded and then single-stepping.
For SVM:
- Enable debug register caching in the VMCB for DR6/DR7.
- Explicitly save DR0-3 of the guest.
- Explicitly save DR0-3,6-7, and MSR_DEBUGCTL for the host. Since SVM
saves the guest DR6 in the VMCB, the race with single-stepping
described for VT-x does not exist.
For both platforms, expose all of the guest DRx values via --get-drX
and --set-drX flags to bhyvectl.
Discussed with: avg, grehan
Tested by: avg (SVM), myself (VT-x)
MFC after: 1 month
Differential Revision: https://reviews.freebsd.org/D13229
2018-01-17 23:11:25 +00:00
|
|
|
case VM_REG_GUEST_DR6:
|
|
|
|
*retval = state->dr6;
|
|
|
|
break;
|
|
|
|
|
2013-04-11 06:52:19 +00:00
|
|
|
case VM_REG_GUEST_DR7:
|
|
|
|
*retval = state->dr7;
|
|
|
|
break;
|
|
|
|
|
|
|
|
case VM_REG_GUEST_EFER:
|
|
|
|
*retval = state->efer;
|
|
|
|
break;
|
|
|
|
|
|
|
|
case VM_REG_GUEST_RAX:
|
|
|
|
*retval = state->rax;
|
|
|
|
break;
|
|
|
|
|
|
|
|
case VM_REG_GUEST_RFLAGS:
|
|
|
|
*retval = state->rflags;
|
|
|
|
break;
|
|
|
|
|
|
|
|
case VM_REG_GUEST_RIP:
|
|
|
|
*retval = state->rip;
|
|
|
|
break;
|
|
|
|
|
|
|
|
case VM_REG_GUEST_RSP:
|
|
|
|
*retval = state->rsp;
|
|
|
|
break;
|
|
|
|
|
|
|
|
case VM_REG_GUEST_CS:
|
|
|
|
case VM_REG_GUEST_DS:
|
|
|
|
case VM_REG_GUEST_ES:
|
|
|
|
case VM_REG_GUEST_FS:
|
|
|
|
case VM_REG_GUEST_GS:
|
|
|
|
case VM_REG_GUEST_SS:
|
|
|
|
case VM_REG_GUEST_LDTR:
|
|
|
|
case VM_REG_GUEST_TR:
|
2014-09-21 23:42:54 +00:00
|
|
|
seg = vmcb_segptr(vmcb, ident);
|
|
|
|
KASSERT(seg != NULL, ("%s: unable to get segment %d from VMCB",
|
|
|
|
__func__, ident));
|
2013-04-11 06:52:19 +00:00
|
|
|
*retval = seg->selector;
|
|
|
|
break;
|
|
|
|
|
2014-09-21 23:42:54 +00:00
|
|
|
case VM_REG_GUEST_GDTR:
|
|
|
|
case VM_REG_GUEST_IDTR:
|
|
|
|
/* GDTR and IDTR don't have segment selectors */
|
|
|
|
err = EINVAL;
|
|
|
|
break;
|
2013-04-11 06:52:19 +00:00
|
|
|
default:
|
|
|
|
err = EINVAL;
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
|
|
|
|
return (err);
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Write to segment selector, control and general purpose register of VMCB.
|
|
|
|
*/
|
|
|
|
int
|
2014-09-21 23:42:54 +00:00
|
|
|
vmcb_write(struct svm_softc *sc, int vcpu, int ident, uint64_t val)
|
2013-04-11 06:52:19 +00:00
|
|
|
{
|
2014-09-21 23:42:54 +00:00
|
|
|
struct vmcb *vmcb;
|
2013-04-11 06:52:19 +00:00
|
|
|
struct vmcb_state *state;
|
|
|
|
struct vmcb_segment *seg;
|
2014-09-21 23:42:54 +00:00
|
|
|
int err, dirtyseg;
|
2013-04-11 06:52:19 +00:00
|
|
|
|
2014-09-21 23:42:54 +00:00
|
|
|
vmcb = svm_get_vmcb(sc, vcpu);
|
2013-04-11 06:52:19 +00:00
|
|
|
state = &vmcb->state;
|
2014-09-21 23:42:54 +00:00
|
|
|
dirtyseg = 0;
|
2013-04-11 06:52:19 +00:00
|
|
|
err = 0;
|
|
|
|
|
2014-10-10 21:48:59 +00:00
|
|
|
if (VMCB_ACCESS_OK(ident))
|
|
|
|
return (vmcb_access(sc, vcpu, 1, ident, &val));
|
|
|
|
|
2013-04-11 06:52:19 +00:00
|
|
|
switch (ident) {
|
|
|
|
case VM_REG_GUEST_CR0:
|
|
|
|
state->cr0 = val;
|
2014-09-21 23:42:54 +00:00
|
|
|
svm_set_dirty(sc, vcpu, VMCB_CACHE_CR);
|
2013-04-11 06:52:19 +00:00
|
|
|
break;
|
|
|
|
|
2014-06-05 06:29:18 +00:00
|
|
|
case VM_REG_GUEST_CR2:
|
|
|
|
state->cr2 = val;
|
2014-09-21 23:42:54 +00:00
|
|
|
svm_set_dirty(sc, vcpu, VMCB_CACHE_CR2);
|
2014-06-05 06:29:18 +00:00
|
|
|
break;
|
|
|
|
|
2013-04-11 06:52:19 +00:00
|
|
|
case VM_REG_GUEST_CR3:
|
|
|
|
state->cr3 = val;
|
2014-09-21 23:42:54 +00:00
|
|
|
svm_set_dirty(sc, vcpu, VMCB_CACHE_CR);
|
2013-04-11 06:52:19 +00:00
|
|
|
break;
|
|
|
|
|
|
|
|
case VM_REG_GUEST_CR4:
|
|
|
|
state->cr4 = val;
|
2014-09-21 23:42:54 +00:00
|
|
|
svm_set_dirty(sc, vcpu, VMCB_CACHE_CR);
|
2013-04-11 06:52:19 +00:00
|
|
|
break;
|
|
|
|
|
Save and restore guest debug registers.
Currently most of the debug registers are not saved and restored
during VM transitions allowing guest and host debug register values to
leak into the opposite context. One result is that hardware
watchpoints do not work reliably within a guest under VT-x.
Due to differences in SVM and VT-x, slightly different approaches are
used.
For VT-x:
- Enable debug register save/restore for VM entry/exit in the VMCS for
DR7 and MSR_DEBUGCTL.
- Explicitly save DR0-3,6 of the guest.
- Explicitly save DR0-3,6-7, MSR_DEBUGCTL, and the trap flag from
%rflags for the host. Note that because DR6 is "software" managed
and not stored in the VMCS a kernel debugger which single steps
through VM entry could corrupt the guest DR6 (since a single step
trap taken after loading the guest DR6 could alter the DR6
register). To avoid this, explicitly disable single-stepping via
the trace flag before loading the guest DR6. A determined debugger
could still defeat this by setting a breakpoint after the guest DR6
was loaded and then single-stepping.
For SVM:
- Enable debug register caching in the VMCB for DR6/DR7.
- Explicitly save DR0-3 of the guest.
- Explicitly save DR0-3,6-7, and MSR_DEBUGCTL for the host. Since SVM
saves the guest DR6 in the VMCB, the race with single-stepping
described for VT-x does not exist.
For both platforms, expose all of the guest DRx values via --get-drX
and --set-drX flags to bhyvectl.
Discussed with: avg, grehan
Tested by: avg (SVM), myself (VT-x)
MFC after: 1 month
Differential Revision: https://reviews.freebsd.org/D13229
2018-01-17 23:11:25 +00:00
|
|
|
case VM_REG_GUEST_DR6:
|
|
|
|
state->dr6 = val;
|
|
|
|
svm_set_dirty(sc, vcpu, VMCB_CACHE_DR);
|
|
|
|
break;
|
|
|
|
|
2013-04-11 06:52:19 +00:00
|
|
|
case VM_REG_GUEST_DR7:
|
|
|
|
state->dr7 = val;
|
Save and restore guest debug registers.
Currently most of the debug registers are not saved and restored
during VM transitions allowing guest and host debug register values to
leak into the opposite context. One result is that hardware
watchpoints do not work reliably within a guest under VT-x.
Due to differences in SVM and VT-x, slightly different approaches are
used.
For VT-x:
- Enable debug register save/restore for VM entry/exit in the VMCS for
DR7 and MSR_DEBUGCTL.
- Explicitly save DR0-3,6 of the guest.
- Explicitly save DR0-3,6-7, MSR_DEBUGCTL, and the trap flag from
%rflags for the host. Note that because DR6 is "software" managed
and not stored in the VMCS a kernel debugger which single steps
through VM entry could corrupt the guest DR6 (since a single step
trap taken after loading the guest DR6 could alter the DR6
register). To avoid this, explicitly disable single-stepping via
the trace flag before loading the guest DR6. A determined debugger
could still defeat this by setting a breakpoint after the guest DR6
was loaded and then single-stepping.
For SVM:
- Enable debug register caching in the VMCB for DR6/DR7.
- Explicitly save DR0-3 of the guest.
- Explicitly save DR0-3,6-7, and MSR_DEBUGCTL for the host. Since SVM
saves the guest DR6 in the VMCB, the race with single-stepping
described for VT-x does not exist.
For both platforms, expose all of the guest DRx values via --get-drX
and --set-drX flags to bhyvectl.
Discussed with: avg, grehan
Tested by: avg (SVM), myself (VT-x)
MFC after: 1 month
Differential Revision: https://reviews.freebsd.org/D13229
2018-01-17 23:11:25 +00:00
|
|
|
svm_set_dirty(sc, vcpu, VMCB_CACHE_DR);
|
2013-04-11 06:52:19 +00:00
|
|
|
break;
|
|
|
|
|
|
|
|
case VM_REG_GUEST_EFER:
|
|
|
|
/* EFER_SVM must always be set when the guest is executing */
|
|
|
|
state->efer = val | EFER_SVM;
|
2014-09-21 23:42:54 +00:00
|
|
|
svm_set_dirty(sc, vcpu, VMCB_CACHE_CR);
|
2013-04-11 06:52:19 +00:00
|
|
|
break;
|
|
|
|
|
|
|
|
case VM_REG_GUEST_RAX:
|
|
|
|
state->rax = val;
|
|
|
|
break;
|
|
|
|
|
|
|
|
case VM_REG_GUEST_RFLAGS:
|
|
|
|
state->rflags = val;
|
|
|
|
break;
|
|
|
|
|
|
|
|
case VM_REG_GUEST_RIP:
|
|
|
|
state->rip = val;
|
|
|
|
break;
|
|
|
|
|
|
|
|
case VM_REG_GUEST_RSP:
|
|
|
|
state->rsp = val;
|
|
|
|
break;
|
|
|
|
|
|
|
|
case VM_REG_GUEST_CS:
|
|
|
|
case VM_REG_GUEST_DS:
|
|
|
|
case VM_REG_GUEST_ES:
|
2014-09-21 23:42:54 +00:00
|
|
|
case VM_REG_GUEST_SS:
|
|
|
|
dirtyseg = 1; /* FALLTHROUGH */
|
2013-04-11 06:52:19 +00:00
|
|
|
case VM_REG_GUEST_FS:
|
|
|
|
case VM_REG_GUEST_GS:
|
|
|
|
case VM_REG_GUEST_LDTR:
|
|
|
|
case VM_REG_GUEST_TR:
|
2014-09-21 23:42:54 +00:00
|
|
|
seg = vmcb_segptr(vmcb, ident);
|
|
|
|
KASSERT(seg != NULL, ("%s: unable to get segment %d from VMCB",
|
|
|
|
__func__, ident));
|
2013-04-11 06:52:19 +00:00
|
|
|
seg->selector = val;
|
2014-09-21 23:42:54 +00:00
|
|
|
if (dirtyseg)
|
|
|
|
svm_set_dirty(sc, vcpu, VMCB_CACHE_SEG);
|
2013-04-11 06:52:19 +00:00
|
|
|
break;
|
|
|
|
|
2014-09-21 23:42:54 +00:00
|
|
|
case VM_REG_GUEST_GDTR:
|
|
|
|
case VM_REG_GUEST_IDTR:
|
|
|
|
/* GDTR and IDTR don't have segment selectors */
|
|
|
|
err = EINVAL;
|
|
|
|
break;
|
2013-04-11 06:52:19 +00:00
|
|
|
default:
|
|
|
|
err = EINVAL;
|
2014-09-21 23:42:54 +00:00
|
|
|
break;
|
2013-04-11 06:52:19 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
return (err);
|
|
|
|
}
|
|
|
|
|
2014-09-21 23:42:54 +00:00
|
|
|
int
|
|
|
|
vmcb_seg(struct vmcb *vmcb, int ident, struct vmcb_segment *seg2)
|
2013-04-11 06:52:19 +00:00
|
|
|
{
|
|
|
|
struct vmcb_segment *seg;
|
|
|
|
|
2014-09-21 23:42:54 +00:00
|
|
|
seg = vmcb_segptr(vmcb, ident);
|
|
|
|
if (seg != NULL) {
|
|
|
|
bcopy(seg, seg2, sizeof(struct vmcb_segment));
|
|
|
|
return (0);
|
|
|
|
} else {
|
|
|
|
return (EINVAL);
|
|
|
|
}
|
|
|
|
}
|
2013-04-11 06:52:19 +00:00
|
|
|
|
2014-09-21 23:42:54 +00:00
|
|
|
int
|
|
|
|
vmcb_setdesc(void *arg, int vcpu, int reg, struct seg_desc *desc)
|
|
|
|
{
|
|
|
|
struct vmcb *vmcb;
|
|
|
|
struct svm_softc *sc;
|
|
|
|
struct vmcb_segment *seg;
|
|
|
|
uint16_t attrib;
|
|
|
|
|
|
|
|
sc = arg;
|
|
|
|
vmcb = svm_get_vmcb(sc, vcpu);
|
|
|
|
|
|
|
|
seg = vmcb_segptr(vmcb, reg);
|
|
|
|
KASSERT(seg != NULL, ("%s: invalid segment descriptor %d",
|
|
|
|
__func__, reg));
|
|
|
|
|
|
|
|
seg->base = desc->base;
|
|
|
|
seg->limit = desc->limit;
|
|
|
|
if (reg != VM_REG_GUEST_GDTR && reg != VM_REG_GUEST_IDTR) {
|
|
|
|
/*
|
|
|
|
* Map seg_desc access to VMCB attribute format.
|
|
|
|
*
|
|
|
|
* SVM uses the 'P' bit in the segment attributes to indicate a
|
|
|
|
* NULL segment so clear it if the segment is marked unusable.
|
|
|
|
*/
|
|
|
|
attrib = ((desc->access & 0xF000) >> 4) | (desc->access & 0xFF);
|
|
|
|
if (SEG_DESC_UNUSABLE(desc->access)) {
|
|
|
|
attrib &= ~0x80;
|
|
|
|
}
|
|
|
|
seg->attrib = attrib;
|
|
|
|
}
|
2013-04-11 06:52:19 +00:00
|
|
|
|
2014-09-21 23:42:54 +00:00
|
|
|
VCPU_CTR4(sc->vm, vcpu, "Setting desc %d: base (%#lx), limit (%#x), "
|
|
|
|
"attrib (%#x)", reg, seg->base, seg->limit, seg->attrib);
|
2013-04-11 06:52:19 +00:00
|
|
|
|
2014-09-21 23:42:54 +00:00
|
|
|
switch (reg) {
|
|
|
|
case VM_REG_GUEST_CS:
|
|
|
|
case VM_REG_GUEST_DS:
|
2013-04-11 06:52:19 +00:00
|
|
|
case VM_REG_GUEST_ES:
|
|
|
|
case VM_REG_GUEST_SS:
|
2014-09-21 23:42:54 +00:00
|
|
|
svm_set_dirty(sc, vcpu, VMCB_CACHE_SEG);
|
2014-10-28 07:19:02 +00:00
|
|
|
break;
|
2013-04-11 06:52:19 +00:00
|
|
|
case VM_REG_GUEST_GDTR:
|
|
|
|
case VM_REG_GUEST_IDTR:
|
2014-09-21 23:42:54 +00:00
|
|
|
svm_set_dirty(sc, vcpu, VMCB_CACHE_DT);
|
2013-04-11 06:52:19 +00:00
|
|
|
break;
|
2014-09-21 23:42:54 +00:00
|
|
|
default:
|
2013-04-11 06:52:19 +00:00
|
|
|
break;
|
2014-09-21 23:42:54 +00:00
|
|
|
}
|
2013-04-11 06:52:19 +00:00
|
|
|
|
2014-09-21 23:42:54 +00:00
|
|
|
return (0);
|
|
|
|
}
|
2013-04-11 06:52:19 +00:00
|
|
|
|
2014-09-21 23:42:54 +00:00
|
|
|
int
|
|
|
|
vmcb_getdesc(void *arg, int vcpu, int reg, struct seg_desc *desc)
|
|
|
|
{
|
|
|
|
struct vmcb *vmcb;
|
|
|
|
struct svm_softc *sc;
|
|
|
|
struct vmcb_segment *seg;
|
|
|
|
|
|
|
|
sc = arg;
|
|
|
|
vmcb = svm_get_vmcb(sc, vcpu);
|
|
|
|
seg = vmcb_segptr(vmcb, reg);
|
|
|
|
KASSERT(seg != NULL, ("%s: invalid segment descriptor %d",
|
|
|
|
__func__, reg));
|
|
|
|
|
|
|
|
desc->base = seg->base;
|
|
|
|
desc->limit = seg->limit;
|
|
|
|
desc->access = 0;
|
|
|
|
|
|
|
|
if (reg != VM_REG_GUEST_GDTR && reg != VM_REG_GUEST_IDTR) {
|
|
|
|
/* Map seg_desc access to VMCB attribute format */
|
|
|
|
desc->access = ((seg->attrib & 0xF00) << 4) |
|
|
|
|
(seg->attrib & 0xFF);
|
|
|
|
|
|
|
|
/*
|
|
|
|
* VT-x uses bit 16 to indicate a segment that has been loaded
|
|
|
|
* with a NULL selector (aka unusable). The 'desc->access'
|
|
|
|
* field is interpreted in the VT-x format by the
|
|
|
|
* processor-independent code.
|
|
|
|
*
|
|
|
|
* SVM uses the 'P' bit to convey the same information so
|
|
|
|
* convert it into the VT-x format. For more details refer to
|
|
|
|
* section "Segment State in the VMCB" in APMv2.
|
|
|
|
*/
|
|
|
|
if (reg != VM_REG_GUEST_CS && reg != VM_REG_GUEST_TR) {
|
|
|
|
if ((desc->access & 0x80) == 0)
|
|
|
|
desc->access |= 0x10000; /* Unusable segment */
|
|
|
|
}
|
2013-04-11 06:52:19 +00:00
|
|
|
}
|
|
|
|
|
2014-09-21 23:42:54 +00:00
|
|
|
return (0);
|
2013-04-11 06:52:19 +00:00
|
|
|
}
|
Initial support for bhyve save and restore.
Save and restore (also known as suspend and resume) permits a snapshot
to be taken of a guest's state that can later be resumed. In the
current implementation, bhyve(8) creates a UNIX domain socket that is
used by bhyvectl(8) to send a request to save a snapshot (and
optionally exit after the snapshot has been taken). A snapshot
currently consists of two files: the first holds a copy of guest RAM,
and the second file holds other guest state such as vCPU register
values and device model state.
To resume a guest, bhyve(8) must be started with a matching pair of
command line arguments to instantiate the same set of device models as
well as a pointer to the saved snapshot.
While the current implementation is useful for several use cases, it
has a few limitations. The file format for saving the guest state is
tied to the ABI of internal bhyve structures and is not
self-describing (in that it does not communicate the set of device
models present in the system). In addition, the state saved for some
device models closely matches the internal data structures which might
prove a challenge for compatibility of snapshot files across a range
of bhyve versions. The file format also does not currently support
versioning of individual chunks of state. As a result, the current
file format is not a fixed binary format and future revisions to save
and restore will break binary compatibility of snapshot files. The
goal is to move to a more flexible format that adds versioning,
etc. and at that point to commit to providing a reasonable level of
compatibility. As a result, the current implementation is not enabled
by default. It can be enabled via the WITH_BHYVE_SNAPSHOT=yes option
for userland builds, and the kernel option BHYVE_SNAPSHOT.
Submitted by: Mihai Tiganus, Flavius Anton, Darius Mihai
Submitted by: Elena Mihailescu, Mihai Carabas, Sergiu Weisz
Relnotes: yes
Sponsored by: University Politehnica of Bucharest
Sponsored by: Matthew Grooms (student scholarships)
Sponsored by: iXsystems
Differential Revision: https://reviews.freebsd.org/D19495
2020-05-05 00:02:04 +00:00
|
|
|
|
|
|
|
#ifdef BHYVE_SNAPSHOT
|
|
|
|
int
|
|
|
|
vmcb_getany(struct svm_softc *sc, int vcpu, int ident, uint64_t *val)
|
|
|
|
{
|
|
|
|
int error = 0;
|
|
|
|
|
|
|
|
if (vcpu < 0 || vcpu >= VM_MAXCPU) {
|
|
|
|
error = EINVAL;
|
|
|
|
goto err;
|
|
|
|
}
|
|
|
|
|
|
|
|
if (ident >= VM_REG_LAST) {
|
|
|
|
error = EINVAL;
|
|
|
|
goto err;
|
|
|
|
}
|
|
|
|
|
|
|
|
error = vm_get_register(sc->vm, vcpu, ident, val);
|
|
|
|
|
|
|
|
err:
|
|
|
|
return (error);
|
|
|
|
}
|
|
|
|
|
|
|
|
int
|
|
|
|
vmcb_setany(struct svm_softc *sc, int vcpu, int ident, uint64_t val)
|
|
|
|
{
|
|
|
|
int error = 0;
|
|
|
|
|
|
|
|
if (vcpu < 0 || vcpu >= VM_MAXCPU) {
|
|
|
|
error = EINVAL;
|
|
|
|
goto err;
|
|
|
|
}
|
|
|
|
|
|
|
|
if (ident >= VM_REG_LAST) {
|
|
|
|
error = EINVAL;
|
|
|
|
goto err;
|
|
|
|
}
|
|
|
|
|
|
|
|
error = vm_set_register(sc->vm, vcpu, ident, val);
|
|
|
|
|
|
|
|
err:
|
|
|
|
return (error);
|
|
|
|
}
|
|
|
|
|
|
|
|
int
|
|
|
|
vmcb_snapshot_desc(void *arg, int vcpu, int reg, struct vm_snapshot_meta *meta)
|
|
|
|
{
|
|
|
|
int ret;
|
|
|
|
struct seg_desc desc;
|
|
|
|
|
|
|
|
if (meta->op == VM_SNAPSHOT_SAVE) {
|
|
|
|
ret = vmcb_getdesc(arg, vcpu, reg, &desc);
|
|
|
|
if (ret != 0)
|
|
|
|
goto done;
|
|
|
|
|
|
|
|
SNAPSHOT_VAR_OR_LEAVE(desc.base, meta, ret, done);
|
|
|
|
SNAPSHOT_VAR_OR_LEAVE(desc.limit, meta, ret, done);
|
|
|
|
SNAPSHOT_VAR_OR_LEAVE(desc.access, meta, ret, done);
|
|
|
|
} else if (meta->op == VM_SNAPSHOT_RESTORE) {
|
|
|
|
SNAPSHOT_VAR_OR_LEAVE(desc.base, meta, ret, done);
|
|
|
|
SNAPSHOT_VAR_OR_LEAVE(desc.limit, meta, ret, done);
|
|
|
|
SNAPSHOT_VAR_OR_LEAVE(desc.access, meta, ret, done);
|
|
|
|
|
|
|
|
ret = vmcb_setdesc(arg, vcpu, reg, &desc);
|
|
|
|
if (ret != 0)
|
|
|
|
goto done;
|
|
|
|
} else {
|
|
|
|
ret = EINVAL;
|
|
|
|
goto done;
|
|
|
|
}
|
|
|
|
|
|
|
|
done:
|
|
|
|
return (ret);
|
|
|
|
}
|
|
|
|
|
|
|
|
int
|
|
|
|
vmcb_snapshot_any(struct svm_softc *sc, int vcpu, int ident,
|
|
|
|
struct vm_snapshot_meta *meta)
|
|
|
|
{
|
|
|
|
int ret;
|
|
|
|
uint64_t val;
|
|
|
|
|
|
|
|
if (meta->op == VM_SNAPSHOT_SAVE) {
|
|
|
|
ret = vmcb_getany(sc, vcpu, ident, &val);
|
|
|
|
if (ret != 0)
|
|
|
|
goto done;
|
|
|
|
|
|
|
|
SNAPSHOT_VAR_OR_LEAVE(val, meta, ret, done);
|
|
|
|
} else if (meta->op == VM_SNAPSHOT_RESTORE) {
|
|
|
|
SNAPSHOT_VAR_OR_LEAVE(val, meta, ret, done);
|
|
|
|
|
|
|
|
ret = vmcb_setany(sc, vcpu, ident, val);
|
|
|
|
if (ret != 0)
|
|
|
|
goto done;
|
|
|
|
} else {
|
|
|
|
ret = EINVAL;
|
|
|
|
goto done;
|
|
|
|
}
|
|
|
|
|
|
|
|
done:
|
|
|
|
return (ret);
|
|
|
|
}
|
|
|
|
#endif
|