freebsd-dev/usr.sbin/bhyve/bhyverun.c
John Baldwin cd377eb369 Initial debug server for bhyve.
This commit adds a new debug server to bhyve.  Unlike the existing -g
option which provides an efficient connection to a debug server
running in the guest OS, this debug server permits inspection and
control of the guest from within the hypervisor itself without
requiring any cooperation from the guest.  It is similar to the debug
server provided by qemu.

To avoid conflicting with the existing -g option, a new -G option has
been added that accepts a TCP port.  An IPv4 socket is bound to this
port and listens for connections from debuggers.  In addition, if the
port begins with the character 'w', the hypervisor will pause the
guest at the first instruction until a debugger attaches and
explicitly continues the guest.  Note that only a single debugger can
attach to a guest at a time.

Virtual CPUs are exposed to the remote debugger as threads.  General
purpose register values can be read for each virtual CPU.  Other
registers cannot currently be read, and no register values can be
changed by the debugger.

The remote debugger can read guest memory but not write to guest
memory.  To facilitate source-level debugging of the guest, memory
addresses from the debugger are treated as virtual addresses (rather
than physical addresses) and are resolved to a physical address using
the active virtual address translation of the current virtual CPU.
Memory reads should honor memory mapped I/O regions, though the debug
server does not attempt to honor any alignment or size constraints
when accessing MMIO.

The debug server provides limited support for controlling the guest.
The guest is suspended when a debugger is attached and resumes when a
debugger detaches.  A debugger can suspend a guest by sending a Ctrl-C
request (e.g. via Ctrl-C in GDB).  A debugger can also continue a
suspended guest while remaining attached.  Breakpoints are not yet
supported.  Single stepping is supported on Intel CPUs that support
MTRAP VM exits, but is not available on other systems.

While the current debug server has limited functionality, it should
at least be usable for basic debugging now.  It is also a useful
checkpoint to serve as a base for adding additional features.

Reviewed by:	grehan
Differential Revision:	https://reviews.freebsd.org/D15022
2018-05-01 15:17:46 +00:00

1129 lines
25 KiB
C

/*-
* SPDX-License-Identifier: BSD-2-Clause-FreeBSD
*
* Copyright (c) 2011 NetApp, Inc.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* $FreeBSD$
*/
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
#include <sys/types.h>
#ifndef WITHOUT_CAPSICUM
#include <sys/capsicum.h>
#endif
#include <sys/mman.h>
#include <sys/time.h>
#include <machine/atomic.h>
#include <machine/segments.h>
#ifndef WITHOUT_CAPSICUM
#include <capsicum_helpers.h>
#endif
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <err.h>
#include <errno.h>
#include <libgen.h>
#include <unistd.h>
#include <assert.h>
#include <errno.h>
#include <pthread.h>
#include <pthread_np.h>
#include <sysexits.h>
#include <stdbool.h>
#include <stdint.h>
#include <machine/vmm.h>
#ifndef WITHOUT_CAPSICUM
#include <machine/vmm_dev.h>
#endif
#include <vmmapi.h>
#include "bhyverun.h"
#include "acpi.h"
#include "atkbdc.h"
#include "inout.h"
#include "dbgport.h"
#include "fwctl.h"
#include "gdb.h"
#include "ioapic.h"
#include "mem.h"
#include "mevent.h"
#include "mptbl.h"
#include "pci_emul.h"
#include "pci_irq.h"
#include "pci_lpc.h"
#include "smbiostbl.h"
#include "xmsr.h"
#include "spinup_ap.h"
#include "rtc.h"
#define GUEST_NIO_PORT 0x488 /* guest upcalls via i/o port */
#define MB (1024UL * 1024)
#define GB (1024UL * MB)
typedef int (*vmexit_handler_t)(struct vmctx *, struct vm_exit *, int *vcpu);
extern int vmexit_task_switch(struct vmctx *, struct vm_exit *, int *vcpu);
char *vmname;
int guest_ncpus;
uint16_t cores, maxcpus, sockets, threads;
char *guest_uuid_str;
static int guest_vmexit_on_hlt, guest_vmexit_on_pause;
static int virtio_msix = 1;
static int x2apic_mode = 0; /* default is xAPIC */
static int strictio;
static int strictmsr = 1;
static int acpi;
static char *progname;
static const int BSP = 0;
static cpuset_t cpumask;
static void vm_loop(struct vmctx *ctx, int vcpu, uint64_t rip);
static struct vm_exit vmexit[VM_MAXCPU];
struct bhyvestats {
uint64_t vmexit_bogus;
uint64_t vmexit_reqidle;
uint64_t vmexit_hlt;
uint64_t vmexit_pause;
uint64_t vmexit_mtrap;
uint64_t vmexit_inst_emul;
uint64_t cpu_switch_rotate;
uint64_t cpu_switch_direct;
} stats;
struct mt_vmm_info {
pthread_t mt_thr;
struct vmctx *mt_ctx;
int mt_vcpu;
} mt_vmm_info[VM_MAXCPU];
static cpuset_t *vcpumap[VM_MAXCPU] = { NULL };
static void
usage(int code)
{
fprintf(stderr,
"Usage: %s [-abehuwxACHPSWY]\n"
" %*s [-c [[cpus=]numcpus][,sockets=n][,cores=n][,threads=n]]\n"
" %*s [-g <gdb port>] [-l <lpc>]\n"
" %*s [-m mem] [-p vcpu:hostcpu] [-s <pci>] [-U uuid] <vm>\n"
" -a: local apic is in xAPIC mode (deprecated)\n"
" -A: create ACPI tables\n"
" -c: number of cpus and/or topology specification"
" -C: include guest memory in core file\n"
" -e: exit on unhandled I/O access\n"
" -g: gdb port\n"
" -h: help\n"
" -H: vmexit from the guest on hlt\n"
" -l: LPC device configuration\n"
" -m: memory size in MB\n"
" -p: pin 'vcpu' to 'hostcpu'\n"
" -P: vmexit from the guest on pause\n"
" -s: <slot,driver,configinfo> PCI slot config\n"
" -S: guest memory cannot be swapped\n"
" -u: RTC keeps UTC time\n"
" -U: uuid\n"
" -w: ignore unimplemented MSRs\n"
" -W: force virtio to use single-vector MSI\n"
" -x: local apic is in x2APIC mode\n"
" -Y: disable MPtable generation\n",
progname, (int)strlen(progname), "", (int)strlen(progname), "",
(int)strlen(progname), "");
exit(code);
}
/*
* XXX This parser is known to have the following issues:
* 1. It accepts null key=value tokens ",,".
* 2. It accepts whitespace after = and before value.
* 3. Values out of range of INT are silently wrapped.
* 4. It doesn't check non-final values.
* 5. The apparently bogus limits of UINT16_MAX are for future expansion.
*
* The acceptance of a null specification ('-c ""') is by design to match the
* manual page syntax specification, this results in a topology of 1 vCPU.
*/
static int
topology_parse(const char *opt)
{
uint64_t ncpus;
int c, chk, n, s, t, tmp;
char *cp, *str;
bool ns, scts;
c = 1, n = 1, s = 1, t = 1;
ns = false, scts = false;
str = strdup(opt);
while ((cp = strsep(&str, ",")) != NULL) {
if (sscanf(cp, "%i%n", &tmp, &chk) == 1) {
n = tmp;
ns = true;
} else if (sscanf(cp, "cpus=%i%n", &tmp, &chk) == 1) {
n = tmp;
ns = true;
} else if (sscanf(cp, "sockets=%i%n", &tmp, &chk) == 1) {
s = tmp;
scts = true;
} else if (sscanf(cp, "cores=%i%n", &tmp, &chk) == 1) {
c = tmp;
scts = true;
} else if (sscanf(cp, "threads=%i%n", &tmp, &chk) == 1) {
t = tmp;
scts = true;
#ifdef notyet /* Do not expose this until vmm.ko implements it */
} else if (sscanf(cp, "maxcpus=%i%n", &tmp, &chk) == 1) {
m = tmp;
#endif
/* Skip the empty argument case from -c "" */
} else if (cp[0] == '\0')
continue;
else
return (-1);
/* Any trailing garbage causes an error */
if (cp[chk] != '\0')
return (-1);
}
/*
* Range check 1 <= n <= UINT16_MAX all values
*/
if (n < 1 || s < 1 || c < 1 || t < 1 ||
n > UINT16_MAX || s > UINT16_MAX || c > UINT16_MAX ||
t > UINT16_MAX)
return (-1);
/* If only the cpus was specified, use that as sockets */
if (!scts)
s = n;
/*
* Compute sockets * cores * threads avoiding overflow
* The range check above insures these are 16 bit values
* If n was specified check it against computed ncpus
*/
ncpus = (uint64_t)s * c * t;
if (ncpus > UINT16_MAX || (ns && n != ncpus))
return (-1);
guest_ncpus = ncpus;
sockets = s;
cores = c;
threads = t;
return(0);
}
static int
pincpu_parse(const char *opt)
{
int vcpu, pcpu;
if (sscanf(opt, "%d:%d", &vcpu, &pcpu) != 2) {
fprintf(stderr, "invalid format: %s\n", opt);
return (-1);
}
if (vcpu < 0 || vcpu >= VM_MAXCPU) {
fprintf(stderr, "vcpu '%d' outside valid range from 0 to %d\n",
vcpu, VM_MAXCPU - 1);
return (-1);
}
if (pcpu < 0 || pcpu >= CPU_SETSIZE) {
fprintf(stderr, "hostcpu '%d' outside valid range from "
"0 to %d\n", pcpu, CPU_SETSIZE - 1);
return (-1);
}
if (vcpumap[vcpu] == NULL) {
if ((vcpumap[vcpu] = malloc(sizeof(cpuset_t))) == NULL) {
perror("malloc");
return (-1);
}
CPU_ZERO(vcpumap[vcpu]);
}
CPU_SET(pcpu, vcpumap[vcpu]);
return (0);
}
void
vm_inject_fault(void *arg, int vcpu, int vector, int errcode_valid,
int errcode)
{
struct vmctx *ctx;
int error, restart_instruction;
ctx = arg;
restart_instruction = 1;
error = vm_inject_exception(ctx, vcpu, vector, errcode_valid, errcode,
restart_instruction);
assert(error == 0);
}
void *
paddr_guest2host(struct vmctx *ctx, uintptr_t gaddr, size_t len)
{
return (vm_map_gpa(ctx, gaddr, len));
}
int
fbsdrun_vmexit_on_pause(void)
{
return (guest_vmexit_on_pause);
}
int
fbsdrun_vmexit_on_hlt(void)
{
return (guest_vmexit_on_hlt);
}
int
fbsdrun_virtio_msix(void)
{
return (virtio_msix);
}
static void *
fbsdrun_start_thread(void *param)
{
char tname[MAXCOMLEN + 1];
struct mt_vmm_info *mtp;
int vcpu;
mtp = param;
vcpu = mtp->mt_vcpu;
snprintf(tname, sizeof(tname), "vcpu %d", vcpu);
pthread_set_name_np(mtp->mt_thr, tname);
gdb_cpu_add(vcpu);
vm_loop(mtp->mt_ctx, vcpu, vmexit[vcpu].rip);
/* not reached */
exit(1);
return (NULL);
}
void
fbsdrun_addcpu(struct vmctx *ctx, int fromcpu, int newcpu, uint64_t rip)
{
int error;
assert(fromcpu == BSP);
/*
* The 'newcpu' must be activated in the context of 'fromcpu'. If
* vm_activate_cpu() is delayed until newcpu's pthread starts running
* then vmm.ko is out-of-sync with bhyve and this can create a race
* with vm_suspend().
*/
error = vm_activate_cpu(ctx, newcpu);
if (error != 0)
err(EX_OSERR, "could not activate CPU %d", newcpu);
CPU_SET_ATOMIC(newcpu, &cpumask);
/*
* Set up the vmexit struct to allow execution to start
* at the given RIP
*/
vmexit[newcpu].rip = rip;
vmexit[newcpu].inst_length = 0;
mt_vmm_info[newcpu].mt_ctx = ctx;
mt_vmm_info[newcpu].mt_vcpu = newcpu;
error = pthread_create(&mt_vmm_info[newcpu].mt_thr, NULL,
fbsdrun_start_thread, &mt_vmm_info[newcpu]);
assert(error == 0);
}
static int
fbsdrun_deletecpu(struct vmctx *ctx, int vcpu)
{
if (!CPU_ISSET(vcpu, &cpumask)) {
fprintf(stderr, "Attempting to delete unknown cpu %d\n", vcpu);
exit(1);
}
CPU_CLR_ATOMIC(vcpu, &cpumask);
return (CPU_EMPTY(&cpumask));
}
static int
vmexit_handle_notify(struct vmctx *ctx, struct vm_exit *vme, int *pvcpu,
uint32_t eax)
{
#if BHYVE_DEBUG
/*
* put guest-driven debug here
*/
#endif
return (VMEXIT_CONTINUE);
}
static int
vmexit_inout(struct vmctx *ctx, struct vm_exit *vme, int *pvcpu)
{
int error;
int bytes, port, in, out;
int vcpu;
vcpu = *pvcpu;
port = vme->u.inout.port;
bytes = vme->u.inout.bytes;
in = vme->u.inout.in;
out = !in;
/* Extra-special case of host notifications */
if (out && port == GUEST_NIO_PORT) {
error = vmexit_handle_notify(ctx, vme, pvcpu, vme->u.inout.eax);
return (error);
}
error = emulate_inout(ctx, vcpu, vme, strictio);
if (error) {
fprintf(stderr, "Unhandled %s%c 0x%04x at 0x%lx\n",
in ? "in" : "out",
bytes == 1 ? 'b' : (bytes == 2 ? 'w' : 'l'),
port, vmexit->rip);
return (VMEXIT_ABORT);
} else {
return (VMEXIT_CONTINUE);
}
}
static int
vmexit_rdmsr(struct vmctx *ctx, struct vm_exit *vme, int *pvcpu)
{
uint64_t val;
uint32_t eax, edx;
int error;
val = 0;
error = emulate_rdmsr(ctx, *pvcpu, vme->u.msr.code, &val);
if (error != 0) {
fprintf(stderr, "rdmsr to register %#x on vcpu %d\n",
vme->u.msr.code, *pvcpu);
if (strictmsr) {
vm_inject_gp(ctx, *pvcpu);
return (VMEXIT_CONTINUE);
}
}
eax = val;
error = vm_set_register(ctx, *pvcpu, VM_REG_GUEST_RAX, eax);
assert(error == 0);
edx = val >> 32;
error = vm_set_register(ctx, *pvcpu, VM_REG_GUEST_RDX, edx);
assert(error == 0);
return (VMEXIT_CONTINUE);
}
static int
vmexit_wrmsr(struct vmctx *ctx, struct vm_exit *vme, int *pvcpu)
{
int error;
error = emulate_wrmsr(ctx, *pvcpu, vme->u.msr.code, vme->u.msr.wval);
if (error != 0) {
fprintf(stderr, "wrmsr to register %#x(%#lx) on vcpu %d\n",
vme->u.msr.code, vme->u.msr.wval, *pvcpu);
if (strictmsr) {
vm_inject_gp(ctx, *pvcpu);
return (VMEXIT_CONTINUE);
}
}
return (VMEXIT_CONTINUE);
}
static int
vmexit_spinup_ap(struct vmctx *ctx, struct vm_exit *vme, int *pvcpu)
{
(void)spinup_ap(ctx, *pvcpu,
vme->u.spinup_ap.vcpu, vme->u.spinup_ap.rip);
return (VMEXIT_CONTINUE);
}
#define DEBUG_EPT_MISCONFIG
#ifdef DEBUG_EPT_MISCONFIG
#define EXIT_REASON_EPT_MISCONFIG 49
#define VMCS_GUEST_PHYSICAL_ADDRESS 0x00002400
#define VMCS_IDENT(x) ((x) | 0x80000000)
static uint64_t ept_misconfig_gpa, ept_misconfig_pte[4];
static int ept_misconfig_ptenum;
#endif
static int
vmexit_vmx(struct vmctx *ctx, struct vm_exit *vmexit, int *pvcpu)
{
fprintf(stderr, "vm exit[%d]\n", *pvcpu);
fprintf(stderr, "\treason\t\tVMX\n");
fprintf(stderr, "\trip\t\t0x%016lx\n", vmexit->rip);
fprintf(stderr, "\tinst_length\t%d\n", vmexit->inst_length);
fprintf(stderr, "\tstatus\t\t%d\n", vmexit->u.vmx.status);
fprintf(stderr, "\texit_reason\t%u\n", vmexit->u.vmx.exit_reason);
fprintf(stderr, "\tqualification\t0x%016lx\n",
vmexit->u.vmx.exit_qualification);
fprintf(stderr, "\tinst_type\t\t%d\n", vmexit->u.vmx.inst_type);
fprintf(stderr, "\tinst_error\t\t%d\n", vmexit->u.vmx.inst_error);
#ifdef DEBUG_EPT_MISCONFIG
if (vmexit->u.vmx.exit_reason == EXIT_REASON_EPT_MISCONFIG) {
vm_get_register(ctx, *pvcpu,
VMCS_IDENT(VMCS_GUEST_PHYSICAL_ADDRESS),
&ept_misconfig_gpa);
vm_get_gpa_pmap(ctx, ept_misconfig_gpa, ept_misconfig_pte,
&ept_misconfig_ptenum);
fprintf(stderr, "\tEPT misconfiguration:\n");
fprintf(stderr, "\t\tGPA: %#lx\n", ept_misconfig_gpa);
fprintf(stderr, "\t\tPTE(%d): %#lx %#lx %#lx %#lx\n",
ept_misconfig_ptenum, ept_misconfig_pte[0],
ept_misconfig_pte[1], ept_misconfig_pte[2],
ept_misconfig_pte[3]);
}
#endif /* DEBUG_EPT_MISCONFIG */
return (VMEXIT_ABORT);
}
static int
vmexit_svm(struct vmctx *ctx, struct vm_exit *vmexit, int *pvcpu)
{
fprintf(stderr, "vm exit[%d]\n", *pvcpu);
fprintf(stderr, "\treason\t\tSVM\n");
fprintf(stderr, "\trip\t\t0x%016lx\n", vmexit->rip);
fprintf(stderr, "\tinst_length\t%d\n", vmexit->inst_length);
fprintf(stderr, "\texitcode\t%#lx\n", vmexit->u.svm.exitcode);
fprintf(stderr, "\texitinfo1\t%#lx\n", vmexit->u.svm.exitinfo1);
fprintf(stderr, "\texitinfo2\t%#lx\n", vmexit->u.svm.exitinfo2);
return (VMEXIT_ABORT);
}
static int
vmexit_bogus(struct vmctx *ctx, struct vm_exit *vmexit, int *pvcpu)
{
assert(vmexit->inst_length == 0);
stats.vmexit_bogus++;
return (VMEXIT_CONTINUE);
}
static int
vmexit_reqidle(struct vmctx *ctx, struct vm_exit *vmexit, int *pvcpu)
{
assert(vmexit->inst_length == 0);
stats.vmexit_reqidle++;
return (VMEXIT_CONTINUE);
}
static int
vmexit_hlt(struct vmctx *ctx, struct vm_exit *vmexit, int *pvcpu)
{
stats.vmexit_hlt++;
/*
* Just continue execution with the next instruction. We use
* the HLT VM exit as a way to be friendly with the host
* scheduler.
*/
return (VMEXIT_CONTINUE);
}
static int
vmexit_pause(struct vmctx *ctx, struct vm_exit *vmexit, int *pvcpu)
{
stats.vmexit_pause++;
return (VMEXIT_CONTINUE);
}
static int
vmexit_mtrap(struct vmctx *ctx, struct vm_exit *vmexit, int *pvcpu)
{
assert(vmexit->inst_length == 0);
stats.vmexit_mtrap++;
gdb_cpu_mtrap(*pvcpu);
return (VMEXIT_CONTINUE);
}
static int
vmexit_inst_emul(struct vmctx *ctx, struct vm_exit *vmexit, int *pvcpu)
{
int err, i;
struct vie *vie;
stats.vmexit_inst_emul++;
vie = &vmexit->u.inst_emul.vie;
err = emulate_mem(ctx, *pvcpu, vmexit->u.inst_emul.gpa,
vie, &vmexit->u.inst_emul.paging);
if (err) {
if (err == ESRCH) {
fprintf(stderr, "Unhandled memory access to 0x%lx\n",
vmexit->u.inst_emul.gpa);
}
fprintf(stderr, "Failed to emulate instruction [");
for (i = 0; i < vie->num_valid; i++) {
fprintf(stderr, "0x%02x%s", vie->inst[i],
i != (vie->num_valid - 1) ? " " : "");
}
fprintf(stderr, "] at 0x%lx\n", vmexit->rip);
return (VMEXIT_ABORT);
}
return (VMEXIT_CONTINUE);
}
static pthread_mutex_t resetcpu_mtx = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t resetcpu_cond = PTHREAD_COND_INITIALIZER;
static int
vmexit_suspend(struct vmctx *ctx, struct vm_exit *vmexit, int *pvcpu)
{
enum vm_suspend_how how;
how = vmexit->u.suspended.how;
fbsdrun_deletecpu(ctx, *pvcpu);
if (*pvcpu != BSP) {
pthread_mutex_lock(&resetcpu_mtx);
pthread_cond_signal(&resetcpu_cond);
pthread_mutex_unlock(&resetcpu_mtx);
pthread_exit(NULL);
}
pthread_mutex_lock(&resetcpu_mtx);
while (!CPU_EMPTY(&cpumask)) {
pthread_cond_wait(&resetcpu_cond, &resetcpu_mtx);
}
pthread_mutex_unlock(&resetcpu_mtx);
switch (how) {
case VM_SUSPEND_RESET:
exit(0);
case VM_SUSPEND_POWEROFF:
exit(1);
case VM_SUSPEND_HALT:
exit(2);
case VM_SUSPEND_TRIPLEFAULT:
exit(3);
default:
fprintf(stderr, "vmexit_suspend: invalid reason %d\n", how);
exit(100);
}
return (0); /* NOTREACHED */
}
static int
vmexit_debug(struct vmctx *ctx, struct vm_exit *vmexit, int *pvcpu)
{
gdb_cpu_suspend(*pvcpu);
return (VMEXIT_CONTINUE);
}
static vmexit_handler_t handler[VM_EXITCODE_MAX] = {
[VM_EXITCODE_INOUT] = vmexit_inout,
[VM_EXITCODE_INOUT_STR] = vmexit_inout,
[VM_EXITCODE_VMX] = vmexit_vmx,
[VM_EXITCODE_SVM] = vmexit_svm,
[VM_EXITCODE_BOGUS] = vmexit_bogus,
[VM_EXITCODE_REQIDLE] = vmexit_reqidle,
[VM_EXITCODE_RDMSR] = vmexit_rdmsr,
[VM_EXITCODE_WRMSR] = vmexit_wrmsr,
[VM_EXITCODE_MTRAP] = vmexit_mtrap,
[VM_EXITCODE_INST_EMUL] = vmexit_inst_emul,
[VM_EXITCODE_SPINUP_AP] = vmexit_spinup_ap,
[VM_EXITCODE_SUSPENDED] = vmexit_suspend,
[VM_EXITCODE_TASK_SWITCH] = vmexit_task_switch,
[VM_EXITCODE_DEBUG] = vmexit_debug,
};
static void
vm_loop(struct vmctx *ctx, int vcpu, uint64_t startrip)
{
int error, rc;
enum vm_exitcode exitcode;
cpuset_t active_cpus;
if (vcpumap[vcpu] != NULL) {
error = pthread_setaffinity_np(pthread_self(),
sizeof(cpuset_t), vcpumap[vcpu]);
assert(error == 0);
}
error = vm_active_cpus(ctx, &active_cpus);
assert(CPU_ISSET(vcpu, &active_cpus));
error = vm_set_register(ctx, vcpu, VM_REG_GUEST_RIP, startrip);
assert(error == 0);
while (1) {
error = vm_run(ctx, vcpu, &vmexit[vcpu]);
if (error != 0)
break;
exitcode = vmexit[vcpu].exitcode;
if (exitcode >= VM_EXITCODE_MAX || handler[exitcode] == NULL) {
fprintf(stderr, "vm_loop: unexpected exitcode 0x%x\n",
exitcode);
exit(1);
}
rc = (*handler[exitcode])(ctx, &vmexit[vcpu], &vcpu);
switch (rc) {
case VMEXIT_CONTINUE:
break;
case VMEXIT_ABORT:
abort();
default:
exit(1);
}
}
fprintf(stderr, "vm_run error %d, errno %d\n", error, errno);
}
static int
num_vcpus_allowed(struct vmctx *ctx)
{
int tmp, error;
error = vm_get_capability(ctx, BSP, VM_CAP_UNRESTRICTED_GUEST, &tmp);
/*
* The guest is allowed to spinup more than one processor only if the
* UNRESTRICTED_GUEST capability is available.
*/
if (error == 0)
return (VM_MAXCPU);
else
return (1);
}
void
fbsdrun_set_capabilities(struct vmctx *ctx, int cpu)
{
int err, tmp;
if (fbsdrun_vmexit_on_hlt()) {
err = vm_get_capability(ctx, cpu, VM_CAP_HALT_EXIT, &tmp);
if (err < 0) {
fprintf(stderr, "VM exit on HLT not supported\n");
exit(1);
}
vm_set_capability(ctx, cpu, VM_CAP_HALT_EXIT, 1);
if (cpu == BSP)
handler[VM_EXITCODE_HLT] = vmexit_hlt;
}
if (fbsdrun_vmexit_on_pause()) {
/*
* pause exit support required for this mode
*/
err = vm_get_capability(ctx, cpu, VM_CAP_PAUSE_EXIT, &tmp);
if (err < 0) {
fprintf(stderr,
"SMP mux requested, no pause support\n");
exit(1);
}
vm_set_capability(ctx, cpu, VM_CAP_PAUSE_EXIT, 1);
if (cpu == BSP)
handler[VM_EXITCODE_PAUSE] = vmexit_pause;
}
if (x2apic_mode)
err = vm_set_x2apic_state(ctx, cpu, X2APIC_ENABLED);
else
err = vm_set_x2apic_state(ctx, cpu, X2APIC_DISABLED);
if (err) {
fprintf(stderr, "Unable to set x2apic state (%d)\n", err);
exit(1);
}
vm_set_capability(ctx, cpu, VM_CAP_ENABLE_INVPCID, 1);
}
static struct vmctx *
do_open(const char *vmname)
{
struct vmctx *ctx;
int error;
bool reinit, romboot;
#ifndef WITHOUT_CAPSICUM
cap_rights_t rights;
const cap_ioctl_t *cmds;
size_t ncmds;
#endif
reinit = romboot = false;
if (lpc_bootrom())
romboot = true;
error = vm_create(vmname);
if (error) {
if (errno == EEXIST) {
if (romboot) {
reinit = true;
} else {
/*
* The virtual machine has been setup by the
* userspace bootloader.
*/
}
} else {
perror("vm_create");
exit(1);
}
} else {
if (!romboot) {
/*
* If the virtual machine was just created then a
* bootrom must be configured to boot it.
*/
fprintf(stderr, "virtual machine cannot be booted\n");
exit(1);
}
}
ctx = vm_open(vmname);
if (ctx == NULL) {
perror("vm_open");
exit(1);
}
#ifndef WITHOUT_CAPSICUM
cap_rights_init(&rights, CAP_IOCTL, CAP_MMAP_RW);
if (cap_rights_limit(vm_get_device_fd(ctx), &rights) == -1 &&
errno != ENOSYS)
errx(EX_OSERR, "Unable to apply rights for sandbox");
vm_get_ioctls(&ncmds);
cmds = vm_get_ioctls(NULL);
if (cmds == NULL)
errx(EX_OSERR, "out of memory");
if (cap_ioctls_limit(vm_get_device_fd(ctx), cmds, ncmds) == -1 &&
errno != ENOSYS)
errx(EX_OSERR, "Unable to apply rights for sandbox");
free((cap_ioctl_t *)cmds);
#endif
if (reinit) {
error = vm_reinit(ctx);
if (error) {
perror("vm_reinit");
exit(1);
}
}
error = vm_set_topology(ctx, sockets, cores, threads, maxcpus);
if (error)
errx(EX_OSERR, "vm_set_topology");
return (ctx);
}
int
main(int argc, char *argv[])
{
int c, error, dbg_port, gdb_port, err, bvmcons;
int max_vcpus, mptgen, memflags;
int rtc_localtime;
bool gdb_stop;
struct vmctx *ctx;
uint64_t rip;
size_t memsize;
char *optstr;
bvmcons = 0;
progname = basename(argv[0]);
dbg_port = 0;
gdb_port = 0;
gdb_stop = false;
guest_ncpus = 1;
sockets = cores = threads = 1;
maxcpus = 0;
memsize = 256 * MB;
mptgen = 1;
rtc_localtime = 1;
memflags = 0;
optstr = "abehuwxACHIPSWYp:g:G:c:s:m:l:U:";
while ((c = getopt(argc, argv, optstr)) != -1) {
switch (c) {
case 'a':
x2apic_mode = 0;
break;
case 'A':
acpi = 1;
break;
case 'b':
bvmcons = 1;
break;
case 'p':
if (pincpu_parse(optarg) != 0) {
errx(EX_USAGE, "invalid vcpu pinning "
"configuration '%s'", optarg);
}
break;
case 'c':
if (topology_parse(optarg) != 0) {
errx(EX_USAGE, "invalid cpu topology "
"'%s'", optarg);
}
break;
case 'C':
memflags |= VM_MEM_F_INCORE;
break;
case 'g':
dbg_port = atoi(optarg);
break;
case 'G':
if (optarg[0] == 'w') {
gdb_stop = true;
optarg++;
}
gdb_port = atoi(optarg);
break;
case 'l':
if (lpc_device_parse(optarg) != 0) {
errx(EX_USAGE, "invalid lpc device "
"configuration '%s'", optarg);
}
break;
case 's':
if (pci_parse_slot(optarg) != 0)
exit(1);
else
break;
case 'S':
memflags |= VM_MEM_F_WIRED;
break;
case 'm':
error = vm_parse_memsize(optarg, &memsize);
if (error)
errx(EX_USAGE, "invalid memsize '%s'", optarg);
break;
case 'H':
guest_vmexit_on_hlt = 1;
break;
case 'I':
/*
* The "-I" option was used to add an ioapic to the
* virtual machine.
*
* An ioapic is now provided unconditionally for each
* virtual machine and this option is now deprecated.
*/
break;
case 'P':
guest_vmexit_on_pause = 1;
break;
case 'e':
strictio = 1;
break;
case 'u':
rtc_localtime = 0;
break;
case 'U':
guest_uuid_str = optarg;
break;
case 'w':
strictmsr = 0;
break;
case 'W':
virtio_msix = 0;
break;
case 'x':
x2apic_mode = 1;
break;
case 'Y':
mptgen = 0;
break;
case 'h':
usage(0);
default:
usage(1);
}
}
argc -= optind;
argv += optind;
if (argc != 1)
usage(1);
vmname = argv[0];
ctx = do_open(vmname);
max_vcpus = num_vcpus_allowed(ctx);
if (guest_ncpus > max_vcpus) {
fprintf(stderr, "%d vCPUs requested but only %d available\n",
guest_ncpus, max_vcpus);
exit(1);
}
fbsdrun_set_capabilities(ctx, BSP);
vm_set_memflags(ctx, memflags);
err = vm_setup_memory(ctx, memsize, VM_MMAP_ALL);
if (err) {
fprintf(stderr, "Unable to setup memory (%d)\n", errno);
exit(1);
}
error = init_msr();
if (error) {
fprintf(stderr, "init_msr error %d", error);
exit(1);
}
init_mem();
init_inout();
atkbdc_init(ctx);
pci_irq_init(ctx);
ioapic_init(ctx);
rtc_init(ctx, rtc_localtime);
sci_init(ctx);
/*
* Exit if a device emulation finds an error in its initilization
*/
if (init_pci(ctx) != 0)
exit(1);
if (dbg_port != 0)
init_dbgport(dbg_port);
if (gdb_port != 0)
init_gdb(ctx, gdb_port, gdb_stop);
if (bvmcons)
init_bvmcons();
if (lpc_bootrom()) {
if (vm_set_capability(ctx, BSP, VM_CAP_UNRESTRICTED_GUEST, 1)) {
fprintf(stderr, "ROM boot failed: unrestricted guest "
"capability not available\n");
exit(1);
}
error = vcpu_reset(ctx, BSP);
assert(error == 0);
}
error = vm_get_register(ctx, BSP, VM_REG_GUEST_RIP, &rip);
assert(error == 0);
/*
* build the guest tables, MP etc.
*/
if (mptgen) {
error = mptable_build(ctx, guest_ncpus);
if (error)
exit(1);
}
error = smbios_build(ctx);
assert(error == 0);
if (acpi) {
error = acpi_build(ctx, guest_ncpus);
assert(error == 0);
}
if (lpc_bootrom())
fwctl_init();
#ifndef WITHOUT_CAPSICUM
caph_cache_catpages();
if (caph_limit_stdout() == -1 || caph_limit_stderr() == -1)
errx(EX_OSERR, "Unable to apply rights for sandbox");
if (cap_enter() == -1 && errno != ENOSYS)
errx(EX_OSERR, "cap_enter() failed");
#endif
/*
* Change the proc title to include the VM name.
*/
setproctitle("%s", vmname);
/*
* Add CPU 0
*/
fbsdrun_addcpu(ctx, BSP, BSP, rip);
/*
* Head off to the main event dispatch loop
*/
mevent_dispatch();
exit(1);
}