621b509048
Replace the existing ad-hoc configuration via various global variables with a small database of key-value pairs. The database supports heirarchical keys using a MIB-like syntax to name the path to a given key. Values are always stored as strings. The API used to manage configuation values does include wrappers to handling boolean values. Other values use non-string types require parsing by consumers. The configuration values are stored in a tree using nvlists. Leaf nodes hold string values. Configuration values are permitted to reference other configuration values using '%(name)'. This permits constructing template configurations. All existing command line arguments now set configuration values. For devices, the "-s" option parses its option argument to generate a list of key-value pairs for the given device. A new '-o' command line option permits setting an individual configuration variable. The key name is always given as a full path of dot-separated components. A new '-k' command line option parses a simple configuration file. This configuration file holds a flat list of 'key=value' lines where the 'key' is the full path of a configuration variable. Lines starting with a '#' are comments. In general, bhyve starts by parsing command line options in sequence and applying those settings to configuration values. Once this is complete, bhyve then begins initializing its state based on the configuration values. This means that subsequent configuration options or files may override or supplement previously given settings. A special 'config.dump' configuration value can be set to true to help debug configuration issues. When this value is set, bhyve will print out the configuration variables as a flat list of 'key=value' lines. Most command line argments map to a single configuration variable, e.g. '-w' sets the 'x86.strictmsr' value to false. A few command line arguments have less obvious effects: - Multiple '-p' options append their values (as a comma-seperated list) to "vcpu.N.cpuset" values (where N is a decimal vcpu number). - For '-s' options, a pci.<bus>.<slot>.<function> node is created. The first argument to '-s' (the device type) is used as the value of a "device" variable. Additional comma-separated arguments are then parsed into 'key=value' pairs and used to set additional variables under the device node. A PCI device emulation driver can provide its own hook to override the parsing of the additonal '-s' arguments after the device type. After the configuration phase as completed, the init_pci hook then walks the "pci.<bus>.<slot>.<func>" nodes. It uses the "device" value to find the device model to use. The device model's init routine is passed a reference to its nvlist node in the configuration tree which it can query for specific variables. The result is that a lot of the string parsing is removed from the device models and centralized. In addition, adding a new variable just requires teaching the model to look for the new variable. - For '-l' options, a similar model is used where the string is parsed into values that are later read during initialization. One key note here is that the serial ports use the commonly used lowercase names from existing documentation and examples (e.g. "lpc.com1") instead of the uppercase names previously used internally in bhyve. Reviewed by: grehan MFC after: 3 months Differential Revision: https://reviews.freebsd.org/D26035
1580 lines
37 KiB
C
1580 lines
37 KiB
C
/*-
|
|
* SPDX-License-Identifier: BSD-2-Clause-FreeBSD
|
|
*
|
|
* Copyright (c) 2011 NetApp, Inc.
|
|
* All rights reserved.
|
|
*
|
|
* Redistribution and use in source and binary forms, with or without
|
|
* modification, are permitted provided that the following conditions
|
|
* are met:
|
|
* 1. Redistributions of source code must retain the above copyright
|
|
* notice, this list of conditions and the following disclaimer.
|
|
* 2. Redistributions in binary form must reproduce the above copyright
|
|
* notice, this list of conditions and the following disclaimer in the
|
|
* documentation and/or other materials provided with the distribution.
|
|
*
|
|
* THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
|
|
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
|
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
|
* ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
|
|
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
|
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
|
|
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
|
|
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
|
|
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
|
|
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
|
|
* SUCH DAMAGE.
|
|
*
|
|
* $FreeBSD$
|
|
*/
|
|
|
|
#include <sys/cdefs.h>
|
|
__FBSDID("$FreeBSD$");
|
|
|
|
#include <sys/types.h>
|
|
#ifndef WITHOUT_CAPSICUM
|
|
#include <sys/capsicum.h>
|
|
#endif
|
|
#include <sys/mman.h>
|
|
#ifdef BHYVE_SNAPSHOT
|
|
#include <sys/socket.h>
|
|
#include <sys/stat.h>
|
|
#endif
|
|
#include <sys/time.h>
|
|
#ifdef BHYVE_SNAPSHOT
|
|
#include <sys/un.h>
|
|
#endif
|
|
|
|
#include <amd64/vmm/intel/vmcs.h>
|
|
|
|
#include <machine/atomic.h>
|
|
#include <machine/segments.h>
|
|
|
|
#ifndef WITHOUT_CAPSICUM
|
|
#include <capsicum_helpers.h>
|
|
#endif
|
|
#include <stdio.h>
|
|
#include <stdlib.h>
|
|
#include <string.h>
|
|
#include <err.h>
|
|
#include <errno.h>
|
|
#ifdef BHYVE_SNAPSHOT
|
|
#include <fcntl.h>
|
|
#endif
|
|
#include <libgen.h>
|
|
#include <unistd.h>
|
|
#include <assert.h>
|
|
#include <pthread.h>
|
|
#include <pthread_np.h>
|
|
#include <sysexits.h>
|
|
#include <stdbool.h>
|
|
#include <stdint.h>
|
|
#ifdef BHYVE_SNAPSHOT
|
|
#include <ucl.h>
|
|
#include <unistd.h>
|
|
|
|
#include <libxo/xo.h>
|
|
#endif
|
|
|
|
#include <machine/vmm.h>
|
|
#ifndef WITHOUT_CAPSICUM
|
|
#include <machine/vmm_dev.h>
|
|
#endif
|
|
#include <machine/vmm_instruction_emul.h>
|
|
#include <vmmapi.h>
|
|
|
|
#include "bhyverun.h"
|
|
#include "acpi.h"
|
|
#include "atkbdc.h"
|
|
#include "bootrom.h"
|
|
#include "config.h"
|
|
#include "inout.h"
|
|
#include "debug.h"
|
|
#include "fwctl.h"
|
|
#include "gdb.h"
|
|
#include "ioapic.h"
|
|
#include "kernemu_dev.h"
|
|
#include "mem.h"
|
|
#include "mevent.h"
|
|
#include "mptbl.h"
|
|
#include "pci_emul.h"
|
|
#include "pci_irq.h"
|
|
#include "pci_lpc.h"
|
|
#include "smbiostbl.h"
|
|
#ifdef BHYVE_SNAPSHOT
|
|
#include "snapshot.h"
|
|
#endif
|
|
#include "xmsr.h"
|
|
#include "spinup_ap.h"
|
|
#include "rtc.h"
|
|
#include "vmgenc.h"
|
|
|
|
#define GUEST_NIO_PORT 0x488 /* guest upcalls via i/o port */
|
|
|
|
#define MB (1024UL * 1024)
|
|
#define GB (1024UL * MB)
|
|
|
|
static const char * const vmx_exit_reason_desc[] = {
|
|
[EXIT_REASON_EXCEPTION] = "Exception or non-maskable interrupt (NMI)",
|
|
[EXIT_REASON_EXT_INTR] = "External interrupt",
|
|
[EXIT_REASON_TRIPLE_FAULT] = "Triple fault",
|
|
[EXIT_REASON_INIT] = "INIT signal",
|
|
[EXIT_REASON_SIPI] = "Start-up IPI (SIPI)",
|
|
[EXIT_REASON_IO_SMI] = "I/O system-management interrupt (SMI)",
|
|
[EXIT_REASON_SMI] = "Other SMI",
|
|
[EXIT_REASON_INTR_WINDOW] = "Interrupt window",
|
|
[EXIT_REASON_NMI_WINDOW] = "NMI window",
|
|
[EXIT_REASON_TASK_SWITCH] = "Task switch",
|
|
[EXIT_REASON_CPUID] = "CPUID",
|
|
[EXIT_REASON_GETSEC] = "GETSEC",
|
|
[EXIT_REASON_HLT] = "HLT",
|
|
[EXIT_REASON_INVD] = "INVD",
|
|
[EXIT_REASON_INVLPG] = "INVLPG",
|
|
[EXIT_REASON_RDPMC] = "RDPMC",
|
|
[EXIT_REASON_RDTSC] = "RDTSC",
|
|
[EXIT_REASON_RSM] = "RSM",
|
|
[EXIT_REASON_VMCALL] = "VMCALL",
|
|
[EXIT_REASON_VMCLEAR] = "VMCLEAR",
|
|
[EXIT_REASON_VMLAUNCH] = "VMLAUNCH",
|
|
[EXIT_REASON_VMPTRLD] = "VMPTRLD",
|
|
[EXIT_REASON_VMPTRST] = "VMPTRST",
|
|
[EXIT_REASON_VMREAD] = "VMREAD",
|
|
[EXIT_REASON_VMRESUME] = "VMRESUME",
|
|
[EXIT_REASON_VMWRITE] = "VMWRITE",
|
|
[EXIT_REASON_VMXOFF] = "VMXOFF",
|
|
[EXIT_REASON_VMXON] = "VMXON",
|
|
[EXIT_REASON_CR_ACCESS] = "Control-register accesses",
|
|
[EXIT_REASON_DR_ACCESS] = "MOV DR",
|
|
[EXIT_REASON_INOUT] = "I/O instruction",
|
|
[EXIT_REASON_RDMSR] = "RDMSR",
|
|
[EXIT_REASON_WRMSR] = "WRMSR",
|
|
[EXIT_REASON_INVAL_VMCS] =
|
|
"VM-entry failure due to invalid guest state",
|
|
[EXIT_REASON_INVAL_MSR] = "VM-entry failure due to MSR loading",
|
|
[EXIT_REASON_MWAIT] = "MWAIT",
|
|
[EXIT_REASON_MTF] = "Monitor trap flag",
|
|
[EXIT_REASON_MONITOR] = "MONITOR",
|
|
[EXIT_REASON_PAUSE] = "PAUSE",
|
|
[EXIT_REASON_MCE_DURING_ENTRY] =
|
|
"VM-entry failure due to machine-check event",
|
|
[EXIT_REASON_TPR] = "TPR below threshold",
|
|
[EXIT_REASON_APIC_ACCESS] = "APIC access",
|
|
[EXIT_REASON_VIRTUALIZED_EOI] = "Virtualized EOI",
|
|
[EXIT_REASON_GDTR_IDTR] = "Access to GDTR or IDTR",
|
|
[EXIT_REASON_LDTR_TR] = "Access to LDTR or TR",
|
|
[EXIT_REASON_EPT_FAULT] = "EPT violation",
|
|
[EXIT_REASON_EPT_MISCONFIG] = "EPT misconfiguration",
|
|
[EXIT_REASON_INVEPT] = "INVEPT",
|
|
[EXIT_REASON_RDTSCP] = "RDTSCP",
|
|
[EXIT_REASON_VMX_PREEMPT] = "VMX-preemption timer expired",
|
|
[EXIT_REASON_INVVPID] = "INVVPID",
|
|
[EXIT_REASON_WBINVD] = "WBINVD",
|
|
[EXIT_REASON_XSETBV] = "XSETBV",
|
|
[EXIT_REASON_APIC_WRITE] = "APIC write",
|
|
[EXIT_REASON_RDRAND] = "RDRAND",
|
|
[EXIT_REASON_INVPCID] = "INVPCID",
|
|
[EXIT_REASON_VMFUNC] = "VMFUNC",
|
|
[EXIT_REASON_ENCLS] = "ENCLS",
|
|
[EXIT_REASON_RDSEED] = "RDSEED",
|
|
[EXIT_REASON_PM_LOG_FULL] = "Page-modification log full",
|
|
[EXIT_REASON_XSAVES] = "XSAVES",
|
|
[EXIT_REASON_XRSTORS] = "XRSTORS"
|
|
};
|
|
|
|
typedef int (*vmexit_handler_t)(struct vmctx *, struct vm_exit *, int *vcpu);
|
|
extern int vmexit_task_switch(struct vmctx *, struct vm_exit *, int *vcpu);
|
|
|
|
int guest_ncpus;
|
|
uint16_t cores, maxcpus, sockets, threads;
|
|
|
|
int raw_stdio = 0;
|
|
|
|
static char *progname;
|
|
static const int BSP = 0;
|
|
|
|
static cpuset_t cpumask;
|
|
|
|
static void vm_loop(struct vmctx *ctx, int vcpu, uint64_t rip);
|
|
|
|
static struct vm_exit vmexit[VM_MAXCPU];
|
|
|
|
struct bhyvestats {
|
|
uint64_t vmexit_bogus;
|
|
uint64_t vmexit_reqidle;
|
|
uint64_t vmexit_hlt;
|
|
uint64_t vmexit_pause;
|
|
uint64_t vmexit_mtrap;
|
|
uint64_t vmexit_inst_emul;
|
|
uint64_t cpu_switch_rotate;
|
|
uint64_t cpu_switch_direct;
|
|
} stats;
|
|
|
|
struct mt_vmm_info {
|
|
pthread_t mt_thr;
|
|
struct vmctx *mt_ctx;
|
|
int mt_vcpu;
|
|
} mt_vmm_info[VM_MAXCPU];
|
|
|
|
static cpuset_t *vcpumap[VM_MAXCPU] = { NULL };
|
|
|
|
static void
|
|
usage(int code)
|
|
{
|
|
|
|
fprintf(stderr,
|
|
"Usage: %s [-aehuwxACDHPSWY]\n"
|
|
" %*s [-c [[cpus=]numcpus][,sockets=n][,cores=n][,threads=n]]\n"
|
|
" %*s [-k <file>] [-l <lpc>] [-m mem] [-o <var>=<value>]\n"
|
|
" %*s [-p vcpu:hostcpu] [-s <pci>] [-U uuid] [<vm>]\n"
|
|
" -a: local apic is in xAPIC mode (deprecated)\n"
|
|
" -A: create ACPI tables\n"
|
|
" -c: number of cpus and/or topology specification\n"
|
|
" -C: include guest memory in core file\n"
|
|
" -D: destroy on power-off\n"
|
|
" -e: exit on unhandled I/O access\n"
|
|
" -h: help\n"
|
|
" -H: vmexit from the guest on hlt\n"
|
|
" -k: key=value flat config file\n"
|
|
" -l: LPC device configuration\n"
|
|
" -m: memory size in MB\n"
|
|
" -o: set config 'var' to 'value'\n"
|
|
" -p: pin 'vcpu' to 'hostcpu'\n"
|
|
" -P: vmexit from the guest on pause\n"
|
|
#ifdef BHYVE_SNAPSHOT
|
|
" -r: path to checkpoint file\n"
|
|
#endif
|
|
" -s: <slot,driver,configinfo> PCI slot config\n"
|
|
" -S: guest memory cannot be swapped\n"
|
|
" -u: RTC keeps UTC time\n"
|
|
" -U: uuid\n"
|
|
" -w: ignore unimplemented MSRs\n"
|
|
" -W: force virtio to use single-vector MSI\n"
|
|
" -x: local apic is in x2APIC mode\n"
|
|
" -Y: disable MPtable generation\n",
|
|
progname, (int)strlen(progname), "", (int)strlen(progname), "",
|
|
(int)strlen(progname), "");
|
|
|
|
exit(code);
|
|
}
|
|
|
|
/*
|
|
* XXX This parser is known to have the following issues:
|
|
* 1. It accepts null key=value tokens ",," as setting "cpus" to an
|
|
* empty string.
|
|
*
|
|
* The acceptance of a null specification ('-c ""') is by design to match the
|
|
* manual page syntax specification, this results in a topology of 1 vCPU.
|
|
*/
|
|
static int
|
|
topology_parse(const char *opt)
|
|
{
|
|
char *cp, *str;
|
|
|
|
if (*opt == '\0') {
|
|
set_config_value("sockets", "1");
|
|
set_config_value("cores", "1");
|
|
set_config_value("threads", "1");
|
|
set_config_value("cpus", "1");
|
|
return (0);
|
|
}
|
|
|
|
str = strdup(opt);
|
|
if (str == NULL)
|
|
errx(4, "Failed to allocate memory");
|
|
|
|
while ((cp = strsep(&str, ",")) != NULL) {
|
|
if (strncmp(cp, "cpus=", strlen("cpus=")) == 0)
|
|
set_config_value("cpus", cp + strlen("cpus="));
|
|
else if (strncmp(cp, "sockets=", strlen("sockets=")) == 0)
|
|
set_config_value("sockets", cp + strlen("sockets="));
|
|
else if (strncmp(cp, "cores=", strlen("cores=")) == 0)
|
|
set_config_value("cores", cp + strlen("cores="));
|
|
else if (strncmp(cp, "threads=", strlen("threads=")) == 0)
|
|
set_config_value("threads", cp + strlen("threads="));
|
|
#ifdef notyet /* Do not expose this until vmm.ko implements it */
|
|
else if (strncmp(cp, "maxcpus=", strlen("maxcpus=")) == 0)
|
|
set_config_value("maxcpus", cp + strlen("maxcpus="));
|
|
#endif
|
|
else if (strchr(cp, '=') != NULL)
|
|
goto out;
|
|
else
|
|
set_config_value("cpus", cp);
|
|
}
|
|
free(str);
|
|
return (0);
|
|
|
|
out:
|
|
free(str);
|
|
return (-1);
|
|
}
|
|
|
|
static int
|
|
parse_int_value(const char *key, const char *value, int minval, int maxval)
|
|
{
|
|
char *cp;
|
|
long lval;
|
|
|
|
errno = 0;
|
|
lval = strtol(value, &cp, 0);
|
|
if (errno != 0 || *cp != '\0' || cp == value || lval < minval ||
|
|
lval > maxval)
|
|
errx(4, "Invalid value for %s: '%s'", key, value);
|
|
return (lval);
|
|
}
|
|
|
|
/*
|
|
* Set the sockets, cores, threads, and guest_cpus variables based on
|
|
* the configured topology.
|
|
*
|
|
* The limits of UINT16_MAX are due to the types passed to
|
|
* vm_set_topology(). vmm.ko may enforce tighter limits.
|
|
*/
|
|
static void
|
|
calc_topolopgy(void)
|
|
{
|
|
const char *value;
|
|
bool explicit_cpus;
|
|
uint64_t ncpus;
|
|
|
|
value = get_config_value("cpus");
|
|
if (value != NULL) {
|
|
guest_ncpus = parse_int_value("cpus", value, 1, UINT16_MAX);
|
|
explicit_cpus = true;
|
|
} else {
|
|
guest_ncpus = 1;
|
|
explicit_cpus = false;
|
|
}
|
|
value = get_config_value("cores");
|
|
if (value != NULL)
|
|
cores = parse_int_value("cores", value, 1, UINT16_MAX);
|
|
else
|
|
cores = 1;
|
|
value = get_config_value("threads");
|
|
if (value != NULL)
|
|
threads = parse_int_value("threads", value, 1, UINT16_MAX);
|
|
else
|
|
threads = 1;
|
|
value = get_config_value("sockets");
|
|
if (value != NULL)
|
|
sockets = parse_int_value("sockets", value, 1, UINT16_MAX);
|
|
else
|
|
sockets = guest_ncpus;
|
|
|
|
/*
|
|
* Compute sockets * cores * threads avoiding overflow. The
|
|
* range check above insures these are 16 bit values.
|
|
*/
|
|
ncpus = (uint64_t)sockets * cores * threads;
|
|
if (ncpus > UINT16_MAX)
|
|
errx(4, "Computed number of vCPUs too high: %ju",
|
|
(uintmax_t)ncpus);
|
|
|
|
if (explicit_cpus) {
|
|
if (guest_ncpus != ncpus)
|
|
errx(4, "Topology (%d sockets, %d cores, %d threads) "
|
|
"does not match %d vCPUs", sockets, cores, threads,
|
|
guest_ncpus);
|
|
} else
|
|
guest_ncpus = ncpus;
|
|
}
|
|
|
|
static int
|
|
pincpu_parse(const char *opt)
|
|
{
|
|
const char *value;
|
|
char *newval;
|
|
char key[16];
|
|
int vcpu, pcpu;
|
|
|
|
if (sscanf(opt, "%d:%d", &vcpu, &pcpu) != 2) {
|
|
fprintf(stderr, "invalid format: %s\n", opt);
|
|
return (-1);
|
|
}
|
|
|
|
if (vcpu < 0 || vcpu >= VM_MAXCPU) {
|
|
fprintf(stderr, "vcpu '%d' outside valid range from 0 to %d\n",
|
|
vcpu, VM_MAXCPU - 1);
|
|
return (-1);
|
|
}
|
|
|
|
if (pcpu < 0 || pcpu >= CPU_SETSIZE) {
|
|
fprintf(stderr, "hostcpu '%d' outside valid range from "
|
|
"0 to %d\n", pcpu, CPU_SETSIZE - 1);
|
|
return (-1);
|
|
}
|
|
|
|
snprintf(key, sizeof(key), "vcpu.%d.cpuset", vcpu);
|
|
value = get_config_value(key);
|
|
|
|
if (asprintf(&newval, "%s%s%d", value != NULL ? value : "",
|
|
value != NULL ? "," : "", pcpu) == -1) {
|
|
perror("failed to build new cpuset string");
|
|
return (-1);
|
|
}
|
|
|
|
set_config_value(key, newval);
|
|
free(newval);
|
|
return (0);
|
|
}
|
|
|
|
static void
|
|
parse_cpuset(int vcpu, const char *list, cpuset_t *set)
|
|
{
|
|
char *cp, *token;
|
|
int pcpu, start;
|
|
|
|
CPU_ZERO(set);
|
|
start = -1;
|
|
token = __DECONST(char *, list);
|
|
for (;;) {
|
|
pcpu = strtoul(token, &cp, 0);
|
|
if (cp == token)
|
|
errx(4, "invalid cpuset for vcpu %d: '%s'", vcpu, list);
|
|
if (pcpu < 0 || pcpu >= CPU_SETSIZE)
|
|
errx(4, "hostcpu '%d' outside valid range from 0 to %d",
|
|
pcpu, CPU_SETSIZE - 1);
|
|
switch (*cp) {
|
|
case ',':
|
|
case '\0':
|
|
if (start >= 0) {
|
|
if (start > pcpu)
|
|
errx(4, "Invalid hostcpu range %d-%d",
|
|
start, pcpu);
|
|
while (start < pcpu) {
|
|
CPU_SET(start, vcpumap[vcpu]);
|
|
start++;
|
|
}
|
|
start = -1;
|
|
}
|
|
CPU_SET(pcpu, vcpumap[vcpu]);
|
|
break;
|
|
case '-':
|
|
if (start >= 0)
|
|
errx(4, "invalid cpuset for vcpu %d: '%s'",
|
|
vcpu, list);
|
|
start = pcpu;
|
|
break;
|
|
default:
|
|
errx(4, "invalid cpuset for vcpu %d: '%s'", vcpu, list);
|
|
}
|
|
if (*cp == '\0')
|
|
break;
|
|
token = cp + 1;
|
|
}
|
|
}
|
|
|
|
static void
|
|
build_vcpumaps(void)
|
|
{
|
|
char key[16];
|
|
const char *value;
|
|
int vcpu;
|
|
|
|
for (vcpu = 0; vcpu < guest_ncpus; vcpu++) {
|
|
snprintf(key, sizeof(key), "vcpu.%d.cpuset", vcpu);
|
|
value = get_config_value(key);
|
|
if (value == NULL)
|
|
continue;
|
|
vcpumap[vcpu] = malloc(sizeof(cpuset_t));
|
|
if (vcpumap[vcpu] == NULL)
|
|
err(4, "Failed to allocate cpuset for vcpu %d", vcpu);
|
|
parse_cpuset(vcpu, value, vcpumap[vcpu]);
|
|
}
|
|
}
|
|
|
|
void
|
|
vm_inject_fault(void *arg, int vcpu, int vector, int errcode_valid,
|
|
int errcode)
|
|
{
|
|
struct vmctx *ctx;
|
|
int error, restart_instruction;
|
|
|
|
ctx = arg;
|
|
restart_instruction = 1;
|
|
|
|
error = vm_inject_exception(ctx, vcpu, vector, errcode_valid, errcode,
|
|
restart_instruction);
|
|
assert(error == 0);
|
|
}
|
|
|
|
void *
|
|
paddr_guest2host(struct vmctx *ctx, uintptr_t gaddr, size_t len)
|
|
{
|
|
|
|
return (vm_map_gpa(ctx, gaddr, len));
|
|
}
|
|
|
|
#ifdef BHYVE_SNAPSHOT
|
|
uintptr_t
|
|
paddr_host2guest(struct vmctx *ctx, void *addr)
|
|
{
|
|
return (vm_rev_map_gpa(ctx, addr));
|
|
}
|
|
#endif
|
|
|
|
int
|
|
fbsdrun_virtio_msix(void)
|
|
{
|
|
|
|
return (get_config_bool_default("virtio_msix", true));
|
|
}
|
|
|
|
static void *
|
|
fbsdrun_start_thread(void *param)
|
|
{
|
|
char tname[MAXCOMLEN + 1];
|
|
struct mt_vmm_info *mtp;
|
|
int vcpu;
|
|
|
|
mtp = param;
|
|
vcpu = mtp->mt_vcpu;
|
|
|
|
snprintf(tname, sizeof(tname), "vcpu %d", vcpu);
|
|
pthread_set_name_np(mtp->mt_thr, tname);
|
|
|
|
#ifdef BHYVE_SNAPSHOT
|
|
checkpoint_cpu_add(vcpu);
|
|
#endif
|
|
gdb_cpu_add(vcpu);
|
|
|
|
vm_loop(mtp->mt_ctx, vcpu, vmexit[vcpu].rip);
|
|
|
|
/* not reached */
|
|
exit(1);
|
|
return (NULL);
|
|
}
|
|
|
|
void
|
|
fbsdrun_addcpu(struct vmctx *ctx, int fromcpu, int newcpu, uint64_t rip)
|
|
{
|
|
int error;
|
|
|
|
assert(fromcpu == BSP);
|
|
|
|
/*
|
|
* The 'newcpu' must be activated in the context of 'fromcpu'. If
|
|
* vm_activate_cpu() is delayed until newcpu's pthread starts running
|
|
* then vmm.ko is out-of-sync with bhyve and this can create a race
|
|
* with vm_suspend().
|
|
*/
|
|
error = vm_activate_cpu(ctx, newcpu);
|
|
if (error != 0)
|
|
err(EX_OSERR, "could not activate CPU %d", newcpu);
|
|
|
|
CPU_SET_ATOMIC(newcpu, &cpumask);
|
|
|
|
/*
|
|
* Set up the vmexit struct to allow execution to start
|
|
* at the given RIP
|
|
*/
|
|
vmexit[newcpu].rip = rip;
|
|
vmexit[newcpu].inst_length = 0;
|
|
|
|
mt_vmm_info[newcpu].mt_ctx = ctx;
|
|
mt_vmm_info[newcpu].mt_vcpu = newcpu;
|
|
|
|
error = pthread_create(&mt_vmm_info[newcpu].mt_thr, NULL,
|
|
fbsdrun_start_thread, &mt_vmm_info[newcpu]);
|
|
assert(error == 0);
|
|
}
|
|
|
|
static int
|
|
fbsdrun_deletecpu(struct vmctx *ctx, int vcpu)
|
|
{
|
|
|
|
if (!CPU_ISSET(vcpu, &cpumask)) {
|
|
fprintf(stderr, "Attempting to delete unknown cpu %d\n", vcpu);
|
|
exit(4);
|
|
}
|
|
|
|
CPU_CLR_ATOMIC(vcpu, &cpumask);
|
|
return (CPU_EMPTY(&cpumask));
|
|
}
|
|
|
|
static int
|
|
vmexit_handle_notify(struct vmctx *ctx, struct vm_exit *vme, int *pvcpu,
|
|
uint32_t eax)
|
|
{
|
|
#if BHYVE_DEBUG
|
|
/*
|
|
* put guest-driven debug here
|
|
*/
|
|
#endif
|
|
return (VMEXIT_CONTINUE);
|
|
}
|
|
|
|
static int
|
|
vmexit_inout(struct vmctx *ctx, struct vm_exit *vme, int *pvcpu)
|
|
{
|
|
int error;
|
|
int bytes, port, in, out;
|
|
int vcpu;
|
|
|
|
vcpu = *pvcpu;
|
|
|
|
port = vme->u.inout.port;
|
|
bytes = vme->u.inout.bytes;
|
|
in = vme->u.inout.in;
|
|
out = !in;
|
|
|
|
/* Extra-special case of host notifications */
|
|
if (out && port == GUEST_NIO_PORT) {
|
|
error = vmexit_handle_notify(ctx, vme, pvcpu, vme->u.inout.eax);
|
|
return (error);
|
|
}
|
|
|
|
error = emulate_inout(ctx, vcpu, vme);
|
|
if (error) {
|
|
fprintf(stderr, "Unhandled %s%c 0x%04x at 0x%lx\n",
|
|
in ? "in" : "out",
|
|
bytes == 1 ? 'b' : (bytes == 2 ? 'w' : 'l'),
|
|
port, vmexit->rip);
|
|
return (VMEXIT_ABORT);
|
|
} else {
|
|
return (VMEXIT_CONTINUE);
|
|
}
|
|
}
|
|
|
|
static int
|
|
vmexit_rdmsr(struct vmctx *ctx, struct vm_exit *vme, int *pvcpu)
|
|
{
|
|
uint64_t val;
|
|
uint32_t eax, edx;
|
|
int error;
|
|
|
|
val = 0;
|
|
error = emulate_rdmsr(ctx, *pvcpu, vme->u.msr.code, &val);
|
|
if (error != 0) {
|
|
fprintf(stderr, "rdmsr to register %#x on vcpu %d\n",
|
|
vme->u.msr.code, *pvcpu);
|
|
if (get_config_bool("x86.strictmsr")) {
|
|
vm_inject_gp(ctx, *pvcpu);
|
|
return (VMEXIT_CONTINUE);
|
|
}
|
|
}
|
|
|
|
eax = val;
|
|
error = vm_set_register(ctx, *pvcpu, VM_REG_GUEST_RAX, eax);
|
|
assert(error == 0);
|
|
|
|
edx = val >> 32;
|
|
error = vm_set_register(ctx, *pvcpu, VM_REG_GUEST_RDX, edx);
|
|
assert(error == 0);
|
|
|
|
return (VMEXIT_CONTINUE);
|
|
}
|
|
|
|
static int
|
|
vmexit_wrmsr(struct vmctx *ctx, struct vm_exit *vme, int *pvcpu)
|
|
{
|
|
int error;
|
|
|
|
error = emulate_wrmsr(ctx, *pvcpu, vme->u.msr.code, vme->u.msr.wval);
|
|
if (error != 0) {
|
|
fprintf(stderr, "wrmsr to register %#x(%#lx) on vcpu %d\n",
|
|
vme->u.msr.code, vme->u.msr.wval, *pvcpu);
|
|
if (get_config_bool("x86.strictmsr")) {
|
|
vm_inject_gp(ctx, *pvcpu);
|
|
return (VMEXIT_CONTINUE);
|
|
}
|
|
}
|
|
return (VMEXIT_CONTINUE);
|
|
}
|
|
|
|
static int
|
|
vmexit_spinup_ap(struct vmctx *ctx, struct vm_exit *vme, int *pvcpu)
|
|
{
|
|
|
|
(void)spinup_ap(ctx, *pvcpu,
|
|
vme->u.spinup_ap.vcpu, vme->u.spinup_ap.rip);
|
|
|
|
return (VMEXIT_CONTINUE);
|
|
}
|
|
|
|
#define DEBUG_EPT_MISCONFIG
|
|
#ifdef DEBUG_EPT_MISCONFIG
|
|
#define VMCS_GUEST_PHYSICAL_ADDRESS 0x00002400
|
|
|
|
static uint64_t ept_misconfig_gpa, ept_misconfig_pte[4];
|
|
static int ept_misconfig_ptenum;
|
|
#endif
|
|
|
|
static const char *
|
|
vmexit_vmx_desc(uint32_t exit_reason)
|
|
{
|
|
|
|
if (exit_reason >= nitems(vmx_exit_reason_desc) ||
|
|
vmx_exit_reason_desc[exit_reason] == NULL)
|
|
return ("Unknown");
|
|
return (vmx_exit_reason_desc[exit_reason]);
|
|
}
|
|
|
|
static int
|
|
vmexit_vmx(struct vmctx *ctx, struct vm_exit *vmexit, int *pvcpu)
|
|
{
|
|
|
|
fprintf(stderr, "vm exit[%d]\n", *pvcpu);
|
|
fprintf(stderr, "\treason\t\tVMX\n");
|
|
fprintf(stderr, "\trip\t\t0x%016lx\n", vmexit->rip);
|
|
fprintf(stderr, "\tinst_length\t%d\n", vmexit->inst_length);
|
|
fprintf(stderr, "\tstatus\t\t%d\n", vmexit->u.vmx.status);
|
|
fprintf(stderr, "\texit_reason\t%u (%s)\n", vmexit->u.vmx.exit_reason,
|
|
vmexit_vmx_desc(vmexit->u.vmx.exit_reason));
|
|
fprintf(stderr, "\tqualification\t0x%016lx\n",
|
|
vmexit->u.vmx.exit_qualification);
|
|
fprintf(stderr, "\tinst_type\t\t%d\n", vmexit->u.vmx.inst_type);
|
|
fprintf(stderr, "\tinst_error\t\t%d\n", vmexit->u.vmx.inst_error);
|
|
#ifdef DEBUG_EPT_MISCONFIG
|
|
if (vmexit->u.vmx.exit_reason == EXIT_REASON_EPT_MISCONFIG) {
|
|
vm_get_register(ctx, *pvcpu,
|
|
VMCS_IDENT(VMCS_GUEST_PHYSICAL_ADDRESS),
|
|
&ept_misconfig_gpa);
|
|
vm_get_gpa_pmap(ctx, ept_misconfig_gpa, ept_misconfig_pte,
|
|
&ept_misconfig_ptenum);
|
|
fprintf(stderr, "\tEPT misconfiguration:\n");
|
|
fprintf(stderr, "\t\tGPA: %#lx\n", ept_misconfig_gpa);
|
|
fprintf(stderr, "\t\tPTE(%d): %#lx %#lx %#lx %#lx\n",
|
|
ept_misconfig_ptenum, ept_misconfig_pte[0],
|
|
ept_misconfig_pte[1], ept_misconfig_pte[2],
|
|
ept_misconfig_pte[3]);
|
|
}
|
|
#endif /* DEBUG_EPT_MISCONFIG */
|
|
return (VMEXIT_ABORT);
|
|
}
|
|
|
|
static int
|
|
vmexit_svm(struct vmctx *ctx, struct vm_exit *vmexit, int *pvcpu)
|
|
{
|
|
|
|
fprintf(stderr, "vm exit[%d]\n", *pvcpu);
|
|
fprintf(stderr, "\treason\t\tSVM\n");
|
|
fprintf(stderr, "\trip\t\t0x%016lx\n", vmexit->rip);
|
|
fprintf(stderr, "\tinst_length\t%d\n", vmexit->inst_length);
|
|
fprintf(stderr, "\texitcode\t%#lx\n", vmexit->u.svm.exitcode);
|
|
fprintf(stderr, "\texitinfo1\t%#lx\n", vmexit->u.svm.exitinfo1);
|
|
fprintf(stderr, "\texitinfo2\t%#lx\n", vmexit->u.svm.exitinfo2);
|
|
return (VMEXIT_ABORT);
|
|
}
|
|
|
|
static int
|
|
vmexit_bogus(struct vmctx *ctx, struct vm_exit *vmexit, int *pvcpu)
|
|
{
|
|
|
|
assert(vmexit->inst_length == 0);
|
|
|
|
stats.vmexit_bogus++;
|
|
|
|
return (VMEXIT_CONTINUE);
|
|
}
|
|
|
|
static int
|
|
vmexit_reqidle(struct vmctx *ctx, struct vm_exit *vmexit, int *pvcpu)
|
|
{
|
|
|
|
assert(vmexit->inst_length == 0);
|
|
|
|
stats.vmexit_reqidle++;
|
|
|
|
return (VMEXIT_CONTINUE);
|
|
}
|
|
|
|
static int
|
|
vmexit_hlt(struct vmctx *ctx, struct vm_exit *vmexit, int *pvcpu)
|
|
{
|
|
|
|
stats.vmexit_hlt++;
|
|
|
|
/*
|
|
* Just continue execution with the next instruction. We use
|
|
* the HLT VM exit as a way to be friendly with the host
|
|
* scheduler.
|
|
*/
|
|
return (VMEXIT_CONTINUE);
|
|
}
|
|
|
|
static int
|
|
vmexit_pause(struct vmctx *ctx, struct vm_exit *vmexit, int *pvcpu)
|
|
{
|
|
|
|
stats.vmexit_pause++;
|
|
|
|
return (VMEXIT_CONTINUE);
|
|
}
|
|
|
|
static int
|
|
vmexit_mtrap(struct vmctx *ctx, struct vm_exit *vmexit, int *pvcpu)
|
|
{
|
|
|
|
assert(vmexit->inst_length == 0);
|
|
|
|
stats.vmexit_mtrap++;
|
|
|
|
#ifdef BHYVE_SNAPSHOT
|
|
checkpoint_cpu_suspend(*pvcpu);
|
|
#endif
|
|
gdb_cpu_mtrap(*pvcpu);
|
|
#ifdef BHYVE_SNAPSHOT
|
|
checkpoint_cpu_resume(*pvcpu);
|
|
#endif
|
|
|
|
return (VMEXIT_CONTINUE);
|
|
}
|
|
|
|
static int
|
|
vmexit_inst_emul(struct vmctx *ctx, struct vm_exit *vmexit, int *pvcpu)
|
|
{
|
|
int err, i, cs_d;
|
|
struct vie *vie;
|
|
enum vm_cpu_mode mode;
|
|
|
|
stats.vmexit_inst_emul++;
|
|
|
|
vie = &vmexit->u.inst_emul.vie;
|
|
if (!vie->decoded) {
|
|
/*
|
|
* Attempt to decode in userspace as a fallback. This allows
|
|
* updating instruction decode in bhyve without rebooting the
|
|
* kernel (rapid prototyping), albeit with much slower
|
|
* emulation.
|
|
*/
|
|
vie_restart(vie);
|
|
mode = vmexit->u.inst_emul.paging.cpu_mode;
|
|
cs_d = vmexit->u.inst_emul.cs_d;
|
|
if (vmm_decode_instruction(mode, cs_d, vie) != 0)
|
|
goto fail;
|
|
if (vm_set_register(ctx, *pvcpu, VM_REG_GUEST_RIP,
|
|
vmexit->rip + vie->num_processed) != 0)
|
|
goto fail;
|
|
}
|
|
|
|
err = emulate_mem(ctx, *pvcpu, vmexit->u.inst_emul.gpa,
|
|
vie, &vmexit->u.inst_emul.paging);
|
|
|
|
if (err) {
|
|
if (err == ESRCH) {
|
|
EPRINTLN("Unhandled memory access to 0x%lx\n",
|
|
vmexit->u.inst_emul.gpa);
|
|
}
|
|
goto fail;
|
|
}
|
|
|
|
return (VMEXIT_CONTINUE);
|
|
|
|
fail:
|
|
fprintf(stderr, "Failed to emulate instruction sequence [ ");
|
|
for (i = 0; i < vie->num_valid; i++)
|
|
fprintf(stderr, "%02x", vie->inst[i]);
|
|
FPRINTLN(stderr, " ] at 0x%lx", vmexit->rip);
|
|
return (VMEXIT_ABORT);
|
|
}
|
|
|
|
static pthread_mutex_t resetcpu_mtx = PTHREAD_MUTEX_INITIALIZER;
|
|
static pthread_cond_t resetcpu_cond = PTHREAD_COND_INITIALIZER;
|
|
|
|
static int
|
|
vmexit_suspend(struct vmctx *ctx, struct vm_exit *vmexit, int *pvcpu)
|
|
{
|
|
enum vm_suspend_how how;
|
|
|
|
how = vmexit->u.suspended.how;
|
|
|
|
fbsdrun_deletecpu(ctx, *pvcpu);
|
|
|
|
if (*pvcpu != BSP) {
|
|
pthread_mutex_lock(&resetcpu_mtx);
|
|
pthread_cond_signal(&resetcpu_cond);
|
|
pthread_mutex_unlock(&resetcpu_mtx);
|
|
pthread_exit(NULL);
|
|
}
|
|
|
|
pthread_mutex_lock(&resetcpu_mtx);
|
|
while (!CPU_EMPTY(&cpumask)) {
|
|
pthread_cond_wait(&resetcpu_cond, &resetcpu_mtx);
|
|
}
|
|
pthread_mutex_unlock(&resetcpu_mtx);
|
|
|
|
switch (how) {
|
|
case VM_SUSPEND_RESET:
|
|
exit(0);
|
|
case VM_SUSPEND_POWEROFF:
|
|
if (get_config_bool_default("destroy_on_poweroff", false))
|
|
vm_destroy(ctx);
|
|
exit(1);
|
|
case VM_SUSPEND_HALT:
|
|
exit(2);
|
|
case VM_SUSPEND_TRIPLEFAULT:
|
|
exit(3);
|
|
default:
|
|
fprintf(stderr, "vmexit_suspend: invalid reason %d\n", how);
|
|
exit(100);
|
|
}
|
|
return (0); /* NOTREACHED */
|
|
}
|
|
|
|
static int
|
|
vmexit_debug(struct vmctx *ctx, struct vm_exit *vmexit, int *pvcpu)
|
|
{
|
|
|
|
#ifdef BHYVE_SNAPSHOT
|
|
checkpoint_cpu_suspend(*pvcpu);
|
|
#endif
|
|
gdb_cpu_suspend(*pvcpu);
|
|
#ifdef BHYVE_SNAPSHOT
|
|
checkpoint_cpu_resume(*pvcpu);
|
|
#endif
|
|
return (VMEXIT_CONTINUE);
|
|
}
|
|
|
|
static int
|
|
vmexit_breakpoint(struct vmctx *ctx, struct vm_exit *vmexit, int *pvcpu)
|
|
{
|
|
|
|
gdb_cpu_breakpoint(*pvcpu, vmexit);
|
|
return (VMEXIT_CONTINUE);
|
|
}
|
|
|
|
static vmexit_handler_t handler[VM_EXITCODE_MAX] = {
|
|
[VM_EXITCODE_INOUT] = vmexit_inout,
|
|
[VM_EXITCODE_INOUT_STR] = vmexit_inout,
|
|
[VM_EXITCODE_VMX] = vmexit_vmx,
|
|
[VM_EXITCODE_SVM] = vmexit_svm,
|
|
[VM_EXITCODE_BOGUS] = vmexit_bogus,
|
|
[VM_EXITCODE_REQIDLE] = vmexit_reqidle,
|
|
[VM_EXITCODE_RDMSR] = vmexit_rdmsr,
|
|
[VM_EXITCODE_WRMSR] = vmexit_wrmsr,
|
|
[VM_EXITCODE_MTRAP] = vmexit_mtrap,
|
|
[VM_EXITCODE_INST_EMUL] = vmexit_inst_emul,
|
|
[VM_EXITCODE_SPINUP_AP] = vmexit_spinup_ap,
|
|
[VM_EXITCODE_SUSPENDED] = vmexit_suspend,
|
|
[VM_EXITCODE_TASK_SWITCH] = vmexit_task_switch,
|
|
[VM_EXITCODE_DEBUG] = vmexit_debug,
|
|
[VM_EXITCODE_BPT] = vmexit_breakpoint,
|
|
};
|
|
|
|
static void
|
|
vm_loop(struct vmctx *ctx, int vcpu, uint64_t startrip)
|
|
{
|
|
int error, rc;
|
|
enum vm_exitcode exitcode;
|
|
cpuset_t active_cpus;
|
|
|
|
if (vcpumap[vcpu] != NULL) {
|
|
error = pthread_setaffinity_np(pthread_self(),
|
|
sizeof(cpuset_t), vcpumap[vcpu]);
|
|
assert(error == 0);
|
|
}
|
|
|
|
error = vm_active_cpus(ctx, &active_cpus);
|
|
assert(CPU_ISSET(vcpu, &active_cpus));
|
|
|
|
error = vm_set_register(ctx, vcpu, VM_REG_GUEST_RIP, startrip);
|
|
assert(error == 0);
|
|
|
|
while (1) {
|
|
error = vm_run(ctx, vcpu, &vmexit[vcpu]);
|
|
if (error != 0)
|
|
break;
|
|
|
|
exitcode = vmexit[vcpu].exitcode;
|
|
if (exitcode >= VM_EXITCODE_MAX || handler[exitcode] == NULL) {
|
|
fprintf(stderr, "vm_loop: unexpected exitcode 0x%x\n",
|
|
exitcode);
|
|
exit(4);
|
|
}
|
|
|
|
rc = (*handler[exitcode])(ctx, &vmexit[vcpu], &vcpu);
|
|
|
|
switch (rc) {
|
|
case VMEXIT_CONTINUE:
|
|
break;
|
|
case VMEXIT_ABORT:
|
|
abort();
|
|
default:
|
|
exit(4);
|
|
}
|
|
}
|
|
fprintf(stderr, "vm_run error %d, errno %d\n", error, errno);
|
|
}
|
|
|
|
static int
|
|
num_vcpus_allowed(struct vmctx *ctx)
|
|
{
|
|
int tmp, error;
|
|
|
|
error = vm_get_capability(ctx, BSP, VM_CAP_UNRESTRICTED_GUEST, &tmp);
|
|
|
|
/*
|
|
* The guest is allowed to spinup more than one processor only if the
|
|
* UNRESTRICTED_GUEST capability is available.
|
|
*/
|
|
if (error == 0)
|
|
return (VM_MAXCPU);
|
|
else
|
|
return (1);
|
|
}
|
|
|
|
void
|
|
fbsdrun_set_capabilities(struct vmctx *ctx, int cpu)
|
|
{
|
|
int err, tmp;
|
|
|
|
if (get_config_bool_default("x86.vmexit_on_hlt", false)) {
|
|
err = vm_get_capability(ctx, cpu, VM_CAP_HALT_EXIT, &tmp);
|
|
if (err < 0) {
|
|
fprintf(stderr, "VM exit on HLT not supported\n");
|
|
exit(4);
|
|
}
|
|
vm_set_capability(ctx, cpu, VM_CAP_HALT_EXIT, 1);
|
|
if (cpu == BSP)
|
|
handler[VM_EXITCODE_HLT] = vmexit_hlt;
|
|
}
|
|
|
|
if (get_config_bool_default("x86.vmexit_on_pause", false)) {
|
|
/*
|
|
* pause exit support required for this mode
|
|
*/
|
|
err = vm_get_capability(ctx, cpu, VM_CAP_PAUSE_EXIT, &tmp);
|
|
if (err < 0) {
|
|
fprintf(stderr,
|
|
"SMP mux requested, no pause support\n");
|
|
exit(4);
|
|
}
|
|
vm_set_capability(ctx, cpu, VM_CAP_PAUSE_EXIT, 1);
|
|
if (cpu == BSP)
|
|
handler[VM_EXITCODE_PAUSE] = vmexit_pause;
|
|
}
|
|
|
|
if (get_config_bool_default("x86.x2apic", false))
|
|
err = vm_set_x2apic_state(ctx, cpu, X2APIC_ENABLED);
|
|
else
|
|
err = vm_set_x2apic_state(ctx, cpu, X2APIC_DISABLED);
|
|
|
|
if (err) {
|
|
fprintf(stderr, "Unable to set x2apic state (%d)\n", err);
|
|
exit(4);
|
|
}
|
|
|
|
vm_set_capability(ctx, cpu, VM_CAP_ENABLE_INVPCID, 1);
|
|
}
|
|
|
|
static struct vmctx *
|
|
do_open(const char *vmname)
|
|
{
|
|
struct vmctx *ctx;
|
|
int error;
|
|
bool reinit, romboot;
|
|
#ifndef WITHOUT_CAPSICUM
|
|
cap_rights_t rights;
|
|
const cap_ioctl_t *cmds;
|
|
size_t ncmds;
|
|
#endif
|
|
|
|
reinit = romboot = false;
|
|
|
|
if (lpc_bootrom())
|
|
romboot = true;
|
|
|
|
error = vm_create(vmname);
|
|
if (error) {
|
|
if (errno == EEXIST) {
|
|
if (romboot) {
|
|
reinit = true;
|
|
} else {
|
|
/*
|
|
* The virtual machine has been setup by the
|
|
* userspace bootloader.
|
|
*/
|
|
}
|
|
} else {
|
|
perror("vm_create");
|
|
exit(4);
|
|
}
|
|
} else {
|
|
if (!romboot) {
|
|
/*
|
|
* If the virtual machine was just created then a
|
|
* bootrom must be configured to boot it.
|
|
*/
|
|
fprintf(stderr, "virtual machine cannot be booted\n");
|
|
exit(4);
|
|
}
|
|
}
|
|
|
|
ctx = vm_open(vmname);
|
|
if (ctx == NULL) {
|
|
perror("vm_open");
|
|
exit(4);
|
|
}
|
|
|
|
#ifndef WITHOUT_CAPSICUM
|
|
cap_rights_init(&rights, CAP_IOCTL, CAP_MMAP_RW);
|
|
if (caph_rights_limit(vm_get_device_fd(ctx), &rights) == -1)
|
|
errx(EX_OSERR, "Unable to apply rights for sandbox");
|
|
vm_get_ioctls(&ncmds);
|
|
cmds = vm_get_ioctls(NULL);
|
|
if (cmds == NULL)
|
|
errx(EX_OSERR, "out of memory");
|
|
if (caph_ioctls_limit(vm_get_device_fd(ctx), cmds, ncmds) == -1)
|
|
errx(EX_OSERR, "Unable to apply rights for sandbox");
|
|
free((cap_ioctl_t *)cmds);
|
|
#endif
|
|
|
|
if (reinit) {
|
|
error = vm_reinit(ctx);
|
|
if (error) {
|
|
perror("vm_reinit");
|
|
exit(4);
|
|
}
|
|
}
|
|
error = vm_set_topology(ctx, sockets, cores, threads, maxcpus);
|
|
if (error)
|
|
errx(EX_OSERR, "vm_set_topology");
|
|
return (ctx);
|
|
}
|
|
|
|
void
|
|
spinup_vcpu(struct vmctx *ctx, int vcpu)
|
|
{
|
|
int error;
|
|
uint64_t rip;
|
|
|
|
error = vm_get_register(ctx, vcpu, VM_REG_GUEST_RIP, &rip);
|
|
assert(error == 0);
|
|
|
|
fbsdrun_set_capabilities(ctx, vcpu);
|
|
error = vm_set_capability(ctx, vcpu, VM_CAP_UNRESTRICTED_GUEST, 1);
|
|
assert(error == 0);
|
|
|
|
fbsdrun_addcpu(ctx, BSP, vcpu, rip);
|
|
}
|
|
|
|
static bool
|
|
parse_config_option(const char *option)
|
|
{
|
|
const char *value;
|
|
char *path;
|
|
|
|
value = strchr(option, '=');
|
|
if (value == NULL || value[1] == '\0')
|
|
return (false);
|
|
path = strndup(option, value - option);
|
|
if (path == NULL)
|
|
err(4, "Failed to allocate memory");
|
|
set_config_value(path, value + 1);
|
|
return (true);
|
|
}
|
|
|
|
static void
|
|
parse_simple_config_file(const char *path)
|
|
{
|
|
FILE *fp;
|
|
char *line, *cp;
|
|
size_t linecap;
|
|
unsigned int lineno;
|
|
|
|
fp = fopen(path, "r");
|
|
if (fp == NULL)
|
|
err(4, "Failed to open configuration file %s", path);
|
|
line = NULL;
|
|
linecap = 0;
|
|
lineno = 1;
|
|
for (lineno = 1; getline(&line, &linecap, fp) > 0; lineno++) {
|
|
if (*line == '#' || *line == '\n')
|
|
continue;
|
|
cp = strchr(line, '\n');
|
|
if (cp != NULL)
|
|
*cp = '\0';
|
|
if (!parse_config_option(line))
|
|
errx(4, "%s line %u: invalid config option '%s'", path,
|
|
lineno, line);
|
|
}
|
|
free(line);
|
|
fclose(fp);
|
|
}
|
|
|
|
static void
|
|
set_defaults(void)
|
|
{
|
|
|
|
set_config_bool("acpi_tables", false);
|
|
set_config_value("memory.size", "256M");
|
|
set_config_bool("x86.strictmsr", true);
|
|
}
|
|
|
|
int
|
|
main(int argc, char *argv[])
|
|
{
|
|
int c, error, err;
|
|
int max_vcpus, memflags;
|
|
struct vmctx *ctx;
|
|
uint64_t rip;
|
|
size_t memsize;
|
|
const char *value, *vmname;
|
|
char *optstr;
|
|
#ifdef BHYVE_SNAPSHOT
|
|
char *restore_file;
|
|
struct restore_state rstate;
|
|
int vcpu;
|
|
|
|
restore_file = NULL;
|
|
#endif
|
|
|
|
init_config();
|
|
set_defaults();
|
|
progname = basename(argv[0]);
|
|
|
|
#ifdef BHYVE_SNAPSHOT
|
|
optstr = "aehuwxACDHIPSWYk:o:p:G:c:s:m:l:U:r:";
|
|
#else
|
|
optstr = "aehuwxACDHIPSWYk:o:p:G:c:s:m:l:U:";
|
|
#endif
|
|
while ((c = getopt(argc, argv, optstr)) != -1) {
|
|
switch (c) {
|
|
case 'a':
|
|
set_config_bool("x86.x2apic", false);
|
|
break;
|
|
case 'A':
|
|
set_config_bool("acpi_tables", true);
|
|
break;
|
|
case 'D':
|
|
set_config_bool("destroy_on_poweroff", true);
|
|
break;
|
|
case 'p':
|
|
if (pincpu_parse(optarg) != 0) {
|
|
errx(EX_USAGE, "invalid vcpu pinning "
|
|
"configuration '%s'", optarg);
|
|
}
|
|
break;
|
|
case 'c':
|
|
if (topology_parse(optarg) != 0) {
|
|
errx(EX_USAGE, "invalid cpu topology "
|
|
"'%s'", optarg);
|
|
}
|
|
break;
|
|
case 'C':
|
|
set_config_bool("memory.guest_in_core", true);
|
|
break;
|
|
case 'G':
|
|
if (optarg[0] == 'w') {
|
|
set_config_bool("gdb.wait", true);
|
|
optarg++;
|
|
}
|
|
set_config_value("gdb.port", optarg);
|
|
break;
|
|
case 'k':
|
|
parse_simple_config_file(optarg);
|
|
break;
|
|
case 'l':
|
|
if (strncmp(optarg, "help", strlen(optarg)) == 0) {
|
|
lpc_print_supported_devices();
|
|
exit(0);
|
|
} else if (lpc_device_parse(optarg) != 0) {
|
|
errx(EX_USAGE, "invalid lpc device "
|
|
"configuration '%s'", optarg);
|
|
}
|
|
break;
|
|
#ifdef BHYVE_SNAPSHOT
|
|
case 'r':
|
|
restore_file = optarg;
|
|
break;
|
|
#endif
|
|
case 's':
|
|
if (strncmp(optarg, "help", strlen(optarg)) == 0) {
|
|
pci_print_supported_devices();
|
|
exit(0);
|
|
} else if (pci_parse_slot(optarg) != 0)
|
|
exit(4);
|
|
else
|
|
break;
|
|
case 'S':
|
|
set_config_bool("memory.wired", true);
|
|
break;
|
|
case 'm':
|
|
set_config_value("memory.size", optarg);
|
|
break;
|
|
case 'o':
|
|
if (!parse_config_option(optarg))
|
|
errx(EX_USAGE, "invalid configuration option '%s'", optarg);
|
|
break;
|
|
case 'H':
|
|
set_config_bool("x86.vmexit_on_hlt", true);
|
|
break;
|
|
case 'I':
|
|
/*
|
|
* The "-I" option was used to add an ioapic to the
|
|
* virtual machine.
|
|
*
|
|
* An ioapic is now provided unconditionally for each
|
|
* virtual machine and this option is now deprecated.
|
|
*/
|
|
break;
|
|
case 'P':
|
|
set_config_bool("x86.vmexit_on_pause", true);
|
|
break;
|
|
case 'e':
|
|
set_config_bool("x86.strictio", true);
|
|
break;
|
|
case 'u':
|
|
set_config_bool("rtc.use_localtime", false);
|
|
break;
|
|
case 'U':
|
|
set_config_value("uuid", optarg);
|
|
break;
|
|
case 'w':
|
|
set_config_bool("x86.strictmsr", false);
|
|
break;
|
|
case 'W':
|
|
set_config_bool("virtio_msix", false);
|
|
break;
|
|
case 'x':
|
|
set_config_bool("x86.x2apic", true);
|
|
break;
|
|
case 'Y':
|
|
set_config_bool("x86.mptable", false);
|
|
break;
|
|
case 'h':
|
|
usage(0);
|
|
default:
|
|
usage(1);
|
|
}
|
|
}
|
|
argc -= optind;
|
|
argv += optind;
|
|
|
|
if (argc > 1)
|
|
usage(1);
|
|
|
|
#ifdef BHYVE_SNAPSHOT
|
|
if (restore_file != NULL) {
|
|
error = load_restore_file(restore_file, &rstate);
|
|
if (error) {
|
|
fprintf(stderr, "Failed to read checkpoint info from "
|
|
"file: '%s'.\n", restore_file);
|
|
exit(1);
|
|
}
|
|
vmname = lookup_vmname(&rstate);
|
|
if (vmname != NULL)
|
|
set_config_value("name", vmname);
|
|
}
|
|
#endif
|
|
|
|
if (argc == 1)
|
|
set_config_value("name", argv[0]);
|
|
|
|
vmname = get_config_value("name");
|
|
if (vmname == NULL)
|
|
usage(1);
|
|
|
|
if (get_config_bool_default("config.dump", false)) {
|
|
dump_config();
|
|
exit(1);
|
|
}
|
|
|
|
calc_topolopgy();
|
|
build_vcpumaps();
|
|
|
|
value = get_config_value("memory.size");
|
|
error = vm_parse_memsize(value, &memsize);
|
|
if (error)
|
|
errx(EX_USAGE, "invalid memsize '%s'", value);
|
|
|
|
ctx = do_open(vmname);
|
|
|
|
#ifdef BHYVE_SNAPSHOT
|
|
if (restore_file != NULL) {
|
|
guest_ncpus = lookup_guest_ncpus(&rstate);
|
|
memflags = lookup_memflags(&rstate);
|
|
memsize = lookup_memsize(&rstate);
|
|
}
|
|
|
|
if (guest_ncpus < 1) {
|
|
fprintf(stderr, "Invalid guest vCPUs (%d)\n", guest_ncpus);
|
|
exit(1);
|
|
}
|
|
#endif
|
|
|
|
max_vcpus = num_vcpus_allowed(ctx);
|
|
if (guest_ncpus > max_vcpus) {
|
|
fprintf(stderr, "%d vCPUs requested but only %d available\n",
|
|
guest_ncpus, max_vcpus);
|
|
exit(4);
|
|
}
|
|
|
|
fbsdrun_set_capabilities(ctx, BSP);
|
|
|
|
memflags = 0;
|
|
if (get_config_bool_default("memory.wired", false))
|
|
memflags |= VM_MEM_F_WIRED;
|
|
if (get_config_bool_default("memory.guest_in_core", false))
|
|
memflags |= VM_MEM_F_INCORE;
|
|
vm_set_memflags(ctx, memflags);
|
|
err = vm_setup_memory(ctx, memsize, VM_MMAP_ALL);
|
|
if (err) {
|
|
fprintf(stderr, "Unable to setup memory (%d)\n", errno);
|
|
exit(4);
|
|
}
|
|
|
|
error = init_msr();
|
|
if (error) {
|
|
fprintf(stderr, "init_msr error %d", error);
|
|
exit(4);
|
|
}
|
|
|
|
init_mem();
|
|
init_inout();
|
|
kernemu_dev_init();
|
|
init_bootrom(ctx);
|
|
atkbdc_init(ctx);
|
|
pci_irq_init(ctx);
|
|
ioapic_init(ctx);
|
|
|
|
rtc_init(ctx);
|
|
sci_init(ctx);
|
|
|
|
/*
|
|
* Exit if a device emulation finds an error in its initilization
|
|
*/
|
|
if (init_pci(ctx) != 0) {
|
|
perror("device emulation initialization error");
|
|
exit(4);
|
|
}
|
|
|
|
/*
|
|
* Initialize after PCI, to allow a bootrom file to reserve the high
|
|
* region.
|
|
*/
|
|
if (get_config_bool("acpi_tables"))
|
|
vmgenc_init(ctx);
|
|
|
|
value = get_config_value("gdb.port");
|
|
if (value != NULL)
|
|
init_gdb(ctx, atoi(value), get_config_bool_default("gdb.wait",
|
|
false));
|
|
|
|
if (lpc_bootrom()) {
|
|
if (vm_set_capability(ctx, BSP, VM_CAP_UNRESTRICTED_GUEST, 1)) {
|
|
fprintf(stderr, "ROM boot failed: unrestricted guest "
|
|
"capability not available\n");
|
|
exit(4);
|
|
}
|
|
error = vcpu_reset(ctx, BSP);
|
|
assert(error == 0);
|
|
}
|
|
|
|
#ifdef BHYVE_SNAPSHOT
|
|
if (restore_file != NULL) {
|
|
fprintf(stdout, "Pausing pci devs...\r\n");
|
|
if (vm_pause_user_devs(ctx) != 0) {
|
|
fprintf(stderr, "Failed to pause PCI device state.\n");
|
|
exit(1);
|
|
}
|
|
|
|
fprintf(stdout, "Restoring vm mem...\r\n");
|
|
if (restore_vm_mem(ctx, &rstate) != 0) {
|
|
fprintf(stderr, "Failed to restore VM memory.\n");
|
|
exit(1);
|
|
}
|
|
|
|
fprintf(stdout, "Restoring pci devs...\r\n");
|
|
if (vm_restore_user_devs(ctx, &rstate) != 0) {
|
|
fprintf(stderr, "Failed to restore PCI device state.\n");
|
|
exit(1);
|
|
}
|
|
|
|
fprintf(stdout, "Restoring kernel structs...\r\n");
|
|
if (vm_restore_kern_structs(ctx, &rstate) != 0) {
|
|
fprintf(stderr, "Failed to restore kernel structs.\n");
|
|
exit(1);
|
|
}
|
|
|
|
fprintf(stdout, "Resuming pci devs...\r\n");
|
|
if (vm_resume_user_devs(ctx) != 0) {
|
|
fprintf(stderr, "Failed to resume PCI device state.\n");
|
|
exit(1);
|
|
}
|
|
}
|
|
#endif
|
|
|
|
error = vm_get_register(ctx, BSP, VM_REG_GUEST_RIP, &rip);
|
|
assert(error == 0);
|
|
|
|
/*
|
|
* build the guest tables, MP etc.
|
|
*/
|
|
if (get_config_bool_default("x86.mptable", true)) {
|
|
error = mptable_build(ctx, guest_ncpus);
|
|
if (error) {
|
|
perror("error to build the guest tables");
|
|
exit(4);
|
|
}
|
|
}
|
|
|
|
error = smbios_build(ctx);
|
|
assert(error == 0);
|
|
|
|
if (get_config_bool("acpi_tables")) {
|
|
error = acpi_build(ctx, guest_ncpus);
|
|
assert(error == 0);
|
|
}
|
|
|
|
if (lpc_bootrom())
|
|
fwctl_init();
|
|
|
|
/*
|
|
* Change the proc title to include the VM name.
|
|
*/
|
|
setproctitle("%s", vmname);
|
|
|
|
#ifndef WITHOUT_CAPSICUM
|
|
caph_cache_catpages();
|
|
|
|
if (caph_limit_stdout() == -1 || caph_limit_stderr() == -1)
|
|
errx(EX_OSERR, "Unable to apply rights for sandbox");
|
|
|
|
if (caph_enter() == -1)
|
|
errx(EX_OSERR, "cap_enter() failed");
|
|
#endif
|
|
|
|
#ifdef BHYVE_SNAPSHOT
|
|
if (restore_file != NULL)
|
|
destroy_restore_state(&rstate);
|
|
|
|
/*
|
|
* checkpointing thread for communication with bhyvectl
|
|
*/
|
|
if (init_checkpoint_thread(ctx) < 0)
|
|
printf("Failed to start checkpoint thread!\r\n");
|
|
|
|
if (restore_file != NULL)
|
|
vm_restore_time(ctx);
|
|
#endif
|
|
|
|
/*
|
|
* Add CPU 0
|
|
*/
|
|
fbsdrun_addcpu(ctx, BSP, BSP, rip);
|
|
|
|
#ifdef BHYVE_SNAPSHOT
|
|
/*
|
|
* If we restore a VM, start all vCPUs now (including APs), otherwise,
|
|
* let the guest OS to spin them up later via vmexits.
|
|
*/
|
|
if (restore_file != NULL) {
|
|
for (vcpu = 0; vcpu < guest_ncpus; vcpu++) {
|
|
if (vcpu == BSP)
|
|
continue;
|
|
|
|
fprintf(stdout, "spinning up vcpu no %d...\r\n", vcpu);
|
|
spinup_vcpu(ctx, vcpu);
|
|
}
|
|
}
|
|
#endif
|
|
|
|
/*
|
|
* Head off to the main event dispatch loop
|
|
*/
|
|
mevent_dispatch();
|
|
|
|
exit(4);
|
|
}
|