neel 79b96fdbcb MFC r282209:
Emulate the 'bit test' instruction.

MFC r282259:
Re-implement RTC current time calculation to eliminate the possibility of
losing time.

MFC r282281:
Advertise the MTRR feature via CPUID and emulate the minimal set of MTRR MSRs.

MFC r282284:
When an instruction cannot be decoded just return to userspace so bhyve(8)
can dump the instruction bytes.

MFC r282287:
Don't require <sys/cpuset.h> to be always included before <machine/vmm.h>.

MFC r282296:
Emulate MSR_SYSCFG which is accessed by Linux on AMD cpus when MTRRs are
enabled.

MFC r282301:
Relax limits when transitioning a vector from the IRR to the ISR and also
when extinguishing it from the ISR in response to an EOI.

MFC r282335:
Advertise an additional memory BAR in the "dummy" device emulation.

MFC r282336:
Emulate machine check related MSRs to allow guest OSes like Windows to boot.

MFC r282351:
Don't advertise the Intel SMX capability to the guest.

MFC r282407:
Emulate the 'CMP r/m8, imm8' instruction.

MFC r282519:
Add macros for AMD-specific bits in MSR_EFER: LMSLE, FFXSR and TCE.

MFC r282520:
Emulate guest writes to EFER_MSR properly.

MFC r282558:
Deprecate the 3-way return values from vm_gla2gpa() and vm_copy_setup().

MFC r282571:
Check 'td_owepreempt' and yield the vcpu thread if it is set.

MFC r282595:
Allow byte reads of AHCI registers.

MFC r282784:
Handling indirect descriptors is a capability of the host and not one that
needs to be negotiated. Use the host capabilities field and not the negotiated
field when verifying that indirect descriptors are supported.

MFC r282788:
Allow configuration of the sector size advertised to the guest.

MFC r282865:
Set the subvendor field in config space to the vendor ID. This is required
by the Windows virtio drivers to correctly match a device.

MFC r282922:
Bump the size of the blockif scatter-gather list to 67.

MFC r283075:
Fix off-by-one in array index bounds check. bhyveload would allow you to
create 33 entries in an array that only has 32 slots.

MFC r283168:
Temporarily revert r282922 which bumped the max descriptors.

MFC r283255:
Emulate the "CMP r/m, reg" instruction (opcode 39H).

MFC r283256:
Add an option "--get-vmcs-exit-inst-length" to display the instruction length
of the instruction that caused the VM-exit.

MFC r283264:
Change the header type of the emulated host-bridge from type 1 to type 0.

MFC r283293:
Don't rely on the 'VM-exit instruction length' field in the VMCS to always
have an accurate length on an EPT violation.

MFC r283299:
Remove bogus verification of instruction length after instruction decode.

MFC r283308:
Exceptions don't deliver an error code in real mode.

MFC r283657:
Fix non-deterministic delays when accessing a vcpu that was in "running" or
"sleeping" state.

MFC r283973:
Use tunable 'hw.vmm.svm.features' to disable specific SVM features even
though they might be available in hardware. Use tunable 'hw.vmm.svm.num_asids'
to limit the number of ASIDs used by the hypervisor.

MFC r284046:
Fix regression in 'verify_gla()' with the RIP-relative addressing mode.

MFC r284174:
Support guest writes to the TSC by enabling the "use TSC offsetting"
execution control.
2015-06-28 03:22:26 +00:00

747 lines
14 KiB
C

/*-
* Copyright (c) 2011 NetApp, Inc.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* $FreeBSD$
*/
/*-
* Copyright (c) 2011 Google, Inc.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* $FreeBSD$
*/
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
#include <sys/ioctl.h>
#include <sys/stat.h>
#include <sys/disk.h>
#include <sys/queue.h>
#include <machine/specialreg.h>
#include <machine/vmm.h>
#include <dirent.h>
#include <dlfcn.h>
#include <errno.h>
#include <err.h>
#include <fcntl.h>
#include <getopt.h>
#include <libgen.h>
#include <limits.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sysexits.h>
#include <termios.h>
#include <unistd.h>
#include <vmmapi.h>
#include "userboot.h"
/* Byte-size multipliers. */
#define MB (1024 * 1024UL)
#define GB (1024 * 1024 * 1024UL)
/*
 * vcpu argument handed to the vm_*() calls in this file (presumably the
 * bootstrap processor).
 */
#define BSP 0
/* Capacity of the disk_fd[] table. */
#define NDISKS 32
/* Host directory that cb_open() prefixes to loader paths (-h); NULL if unset. */
static char *host_base;
/* 'term' holds the raw-mode console settings; 'oldterm' the saved originals. */
static struct termios term, oldterm;
/* Descriptors of the disk images opened by disk_open(), indexed by unit. */
static int disk_fd[NDISKS];
/* Number of valid entries in disk_fd[]. */
static int ndisks;
/* Console descriptors; main() defaults them to stdin/stdout. */
static int consin_fd, consout_fd;
/* Name of the target VM and basename of argv[0]. */
static char *vmname, *progname;
/* Handle returned by vm_open() for the target VM. */
static struct vmctx *ctx;
/* Values recorded by cb_setgdt(), cb_setcr() and cb_setreg() for cb_exec(). */
static uint64_t gdtbase, cr3, rsp;
/* Forward declaration: callbacks defined earlier call cb_exit() on failure. */
static void cb_exit(void *arg, int v);
/*
* Console i/o callbacks
*/
/*
 * Write a single character to the console output descriptor.
 * The result of write(2) is deliberately discarded.
 */
static void
cb_putc(void *arg, int ch)
{
	char byte;

	byte = ch;
	(void) write(consout_fd, &byte, sizeof(byte));
}
/*
 * Read one character from the console input descriptor.
 * Returns the character, or -1 when read(2) did not deliver exactly one byte.
 */
static int
cb_getc(void *arg)
{
	char byte;

	if (read(consin_fd, &byte, sizeof(byte)) != 1)
		return (-1);
	return (byte);
}
/*
 * Report whether console input is waiting.
 * Returns 1 when FIONREAD reports at least one pending byte, 0 otherwise
 * (including when the ioctl itself fails).
 */
static int
cb_poll(void *arg)
{
	int pending;

	if (ioctl(consin_fd, FIONREAD, &pending) < 0)
		return (0);
	return (pending > 0);
}
/*
* Host filesystem i/o callbacks
*/
/*
 * Handle given to the loader for a host file or directory opened by
 * cb_open() and released by cb_close().
 */
struct cb_file {
/* Non-zero when the handle refers to a directory; selects the cf_u member. */
int cf_isdir;
/* Size field written by cb_open(); cb_stat() reports cf_stat.st_size. */
size_t cf_size;
/* stat(2) result captured when the path was opened. */
struct stat cf_stat;
union {
/* Descriptor of a regular file (cf_isdir == 0). */
int fd;
/* Directory stream (cf_isdir != 0). */
DIR *dir;
} cf_u;
};
/*
 * Open a host file or directory on behalf of the loader.
 *
 * 'filename' is appended to host_base (with one trailing '/' removed from
 * host_base, if present) to form the host path.  On success a newly
 * allocated struct cb_file is stored in *hp; cb_close() releases it.
 *
 * Returns 0 on success, or:
 *   ENOENT        no host base directory was configured
 *   ENAMETOOLONG  the combined path does not fit in PATH_MAX
 *   ENOMEM        the handle could not be allocated
 *   errno         from stat(2) when the path cannot be examined
 *   EINVAL        the path could not be opened, or is neither a regular
 *                 file nor a directory
 */
static int
cb_open(void *arg, const char *filename, void **hp)
{
	struct cb_file *cf;
	char path[PATH_MAX];
	size_t len;
	int error;

	if (!host_base)
		return (ENOENT);

	if (strlcpy(path, host_base, sizeof(path)) >= sizeof(path))
		return (ENAMETOOLONG);

	/* An empty host_base has no last character to inspect. */
	len = strlen(path);
	if (len > 0 && path[len - 1] == '/')
		path[len - 1] = 0;

	/* Refuse a truncated path rather than opening the wrong file. */
	if (strlcat(path, filename, sizeof(path)) >= sizeof(path))
		return (ENAMETOOLONG);

	cf = malloc(sizeof(*cf));
	if (cf == NULL)
		return (ENOMEM);

	if (stat(path, &cf->cf_stat) < 0) {
		/* Capture errno before free() has a chance to disturb it. */
		error = errno;
		free(cf);
		return (error);
	}

	/* Take the size from the stat result that was actually filled in. */
	cf->cf_size = cf->cf_stat.st_size;

	if (S_ISDIR(cf->cf_stat.st_mode)) {
		cf->cf_isdir = 1;
		cf->cf_u.dir = opendir(path);
		if (!cf->cf_u.dir)
			goto out;
		*hp = cf;
		return (0);
	}
	if (S_ISREG(cf->cf_stat.st_mode)) {
		cf->cf_isdir = 0;
		cf->cf_u.fd = open(path, O_RDONLY);
		if (cf->cf_u.fd < 0)
			goto out;
		*hp = cf;
		return (0);
	}

out:
	free(cf);
	return (EINVAL);
}
/*
 * Release a handle produced by cb_open(): close the underlying descriptor
 * or directory stream and free the handle.  Always returns 0.
 */
static int
cb_close(void *arg, void *h)
{
	struct cb_file *file = h;

	if (!file->cf_isdir)
		close(file->cf_u.fd);
	else
		closedir(file->cf_u.dir);

	free(file);
	return (0);
}
static int
cb_isdir(void *arg, void *h)
{
struct cb_file *cf = h;
return (cf->cf_isdir);
}
/*
 * Read up to 'size' bytes from a regular-file handle into 'buf'.
 * On success stores the number of bytes NOT transferred in *resid and
 * returns 0.  Returns EINVAL for a directory handle or when read(2) fails.
 */
static int
cb_read(void *arg, void *h, void *buf, size_t size, size_t *resid)
{
	struct cb_file *file = h;
	ssize_t nread;

	if (file->cf_isdir)
		return (EINVAL);

	nread = read(file->cf_u.fd, buf, size);
	if (nread < 0)
		return (EINVAL);

	*resid = size - nread;
	return (0);
}
/*
 * Fetch the next entry from a directory handle.
 * On success fills in the file number, type and name length, copies the
 * NUL-terminated name into 'name' and returns 0.  Returns EINVAL for a
 * non-directory handle and ENOENT when readdir(3) yields no entry.
 */
static int
cb_readdir(void *arg, void *h, uint32_t *fileno_return, uint8_t *type_return,
    size_t *namelen_return, char *name)
{
	struct cb_file *file = h;
	struct dirent *entry;

	if (!file->cf_isdir)
		return (EINVAL);

	entry = readdir(file->cf_u.dir);
	if (entry == NULL)
		return (ENOENT);

	/*
	 * d_namlen never exceeds 255, which is below PATH_MAX, so the
	 * name is copied without a separate length check.
	 */
	*fileno_return = entry->d_fileno;
	*type_return = entry->d_type;
	*namelen_return = entry->d_namlen;
	memcpy(name, entry->d_name, entry->d_namlen);
	name[entry->d_namlen] = '\0';

	return (0);
}
/*
 * Reposition the offset of a regular-file handle.
 * Returns 0 on success, EINVAL for a directory handle, or the errno left
 * by a failing lseek(2).
 */
static int
cb_seek(void *arg, void *h, uint64_t offset, int whence)
{
	struct cb_file *file = h;

	if (file->cf_isdir)
		return (EINVAL);

	return (lseek(file->cf_u.fd, offset, whence) < 0 ? errno : 0);
}
static int
cb_stat(void *arg, void *h, int *mode, int *uid, int *gid, uint64_t *size)
{
struct cb_file *cf = h;
*mode = cf->cf_stat.st_mode;
*uid = cf->cf_stat.st_uid;
*gid = cf->cf_stat.st_gid;
*size = cf->cf_stat.st_size;
return (0);
}
/*
* Disk image i/o callbacks
*/
/*
 * Read 'size' bytes at byte offset 'from' of disk image 'unit' into 'to'.
 * On success stores the number of bytes NOT transferred in *resid and
 * returns 0.  Returns EIO for an unknown unit, or the errno left by a
 * failing pread(2).
 */
static int
cb_diskread(void *arg, int unit, uint64_t from, void *to, size_t size,
    size_t *resid)
{
	ssize_t nread;

	if (!(unit >= 0 && unit < ndisks))
		return (EIO);

	nread = pread(disk_fd[unit], to, size, from);
	if (nread < 0)
		return (errno);

	*resid = size - nread;
	return (0);
}
/*
 * Answer the disk ioctls the loader issues against disk image 'unit'.
 *   DIOCGSECTORSIZE  stores a fixed 512 in *(u_int *)data
 *   DIOCGMEDIASIZE   stores the fstat(2) size in *(off_t *)data
 * Returns 0 on success, EBADF for an unknown unit, and ENOTTY for any
 * other command or when fstat(2) fails.
 */
static int
cb_diskioctl(void *arg, int unit, u_long cmd, void *data)
{
	struct stat sb;

	if (!(unit >= 0 && unit < ndisks))
		return (EBADF);

	if (cmd == DIOCGSECTORSIZE) {
		*(u_int *)data = 512;
		return (0);
	}

	if (cmd == DIOCGMEDIASIZE) {
		if (fstat(disk_fd[unit], &sb) != 0)
			return (ENOTTY);
		*(off_t *)data = sb.st_size;
		return (0);
	}

	return (ENOTTY);
}
/*
* Guest virtual machine i/o callbacks
*/
/*
 * Copy 'size' bytes from host buffer 'from' into guest memory at 'to'.
 * The guest address is masked to its low 31 bits before being mapped.
 * Returns 0 on success or EFAULT when vm_map_gpa() cannot map the range.
 */
static int
cb_copyin(void *arg, const void *from, uint64_t to, size_t size)
{
	char *dst;

	dst = vm_map_gpa(ctx, to & 0x7fffffff, size);
	if (dst == NULL)
		return (EFAULT);

	memcpy(dst, from, size);
	return (0);
}
/*
 * Copy 'size' bytes from guest memory at 'from' into host buffer 'to'.
 * The guest address is masked to its low 31 bits before being mapped.
 * Returns 0 on success or EFAULT when vm_map_gpa() cannot map the range.
 */
static int
cb_copyout(void *arg, uint64_t from, void *to, size_t size)
{
	char *src;

	src = vm_map_gpa(ctx, from & 0x7fffffff, size);
	if (src == NULL)
		return (EFAULT);

	memcpy(to, src, size);
	return (0);
}
/*
 * Set a guest general-purpose register.  Only register number 4 is
 * handled: it is mapped to VM_REG_GUEST_RSP and the value is also saved
 * in 'rsp'.  Any other register number, or a failure from
 * vm_set_register(), prints a diagnostic and leaves through cb_exit().
 */
static void
cb_setreg(void *arg, int r, uint64_t v)
{
	enum vm_reg_name vmreg;

	if (r == 4) {
		vmreg = VM_REG_GUEST_RSP;
		rsp = v;
	} else {
		vmreg = VM_REG_LAST;
	}

	if (vmreg == VM_REG_LAST) {
		printf("test_setreg(%d): not implemented\n", r);
		cb_exit(NULL, USERBOOT_EXIT_QUIT);
	}

	if (vm_set_register(ctx, BSP, vmreg, v) != 0) {
		perror("vm_set_register");
		cb_exit(NULL, USERBOOT_EXIT_QUIT);
	}
}
/*
 * Set a guest MSR.  Only MSR_EFER is handled (mapped to
 * VM_REG_GUEST_EFER).  Any other MSR, or a failure from
 * vm_set_register(), prints a diagnostic and leaves through cb_exit().
 */
static void
cb_setmsr(void *arg, int r, uint64_t v)
{
	enum vm_reg_name vmreg;

	if (r == MSR_EFER)
		vmreg = VM_REG_GUEST_EFER;
	else
		vmreg = VM_REG_LAST;

	if (vmreg == VM_REG_LAST) {
		printf("test_setmsr(%d): not implemented\n", r);
		cb_exit(NULL, USERBOOT_EXIT_QUIT);
	}

	if (vm_set_register(ctx, BSP, vmreg, v) != 0) {
		perror("vm_set_msr");
		cb_exit(NULL, USERBOOT_EXIT_QUIT);
	}
}
/*
 * Set guest control register CR0, CR3 or CR4.  A CR3 value is also saved
 * in 'cr3' for cb_exec().  Any other register number, or a failure from
 * vm_set_register(), prints a diagnostic and leaves through cb_exit().
 */
static void
cb_setcr(void *arg, int r, uint64_t v)
{
	enum vm_reg_name vmreg;

	if (r == 0) {
		vmreg = VM_REG_GUEST_CR0;
	} else if (r == 3) {
		vmreg = VM_REG_GUEST_CR3;
		cr3 = v;
	} else if (r == 4) {
		vmreg = VM_REG_GUEST_CR4;
	} else {
		vmreg = VM_REG_LAST;
	}

	if (vmreg == VM_REG_LAST) {
		printf("test_setcr(%d): not implemented\n", r);
		cb_exit(NULL, USERBOOT_EXIT_QUIT);
	}

	if (vm_set_register(ctx, BSP, vmreg, v) != 0) {
		perror("vm_set_cr");
		cb_exit(NULL, USERBOOT_EXIT_QUIT);
	}
}
/*
 * Load the guest GDTR with 'base' and a limit of 'size' - 1, then save
 * the base in 'gdtbase' for cb_exec().  A failure from vm_set_desc()
 * prints a diagnostic and leaves through cb_exit().
 */
static void
cb_setgdt(void *arg, uint64_t base, size_t size)
{
	if (vm_set_desc(ctx, BSP, VM_REG_GUEST_GDTR, base, size - 1, 0) != 0) {
		perror("vm_set_desc(gdt)");
		cb_exit(NULL, USERBOOT_EXIT_QUIT);
	}

	gdtbase = base;
}
static void
cb_exec(void *arg, uint64_t rip)
{
int error;
if (cr3 == 0)
error = vm_setup_freebsd_registers_i386(ctx, BSP, rip, gdtbase,
rsp);
else
error = vm_setup_freebsd_registers(ctx, BSP, rip, cr3, gdtbase,
rsp);
if (error) {
perror("vm_setup_freebsd_registers");
cb_exit(NULL, USERBOOT_EXIT_QUIT);
}
cb_exit(NULL, 0);
}
/*
* Misc
*/
/*
 * Suspend the calling thread for 'usec' microseconds via usleep(3).
 */
static void
cb_delay(void *arg, int usec)
{
	(void) usleep(usec);
}
/*
 * Put the saved terminal settings back on the console output descriptor
 * and terminate the process with exit status 'v'.  Does not return.
 */
static void
cb_exit(void *arg, int v)
{
	(void) tcsetattr(consout_fd, TCSAFLUSH, &oldterm);
	exit(v);
}
/*
 * Report the guest's low and high memory sizes as returned by
 * vm_get_lowmem_size() and vm_get_highmem_size().
 */
static void
cb_getmem(void *arg, uint64_t *ret_lowmem, uint64_t *ret_highmem)
{
	(void) arg;

	*ret_lowmem = vm_get_lowmem_size(ctx);
	*ret_highmem = vm_get_highmem_size(ctx);
}
/*
 * One loader environment setting.  Entries are linked into 'envhead' by
 * addenv() and handed to the loader by cb_getenv().
 */
struct env {
const char *str; /* name=value */
SLIST_ENTRY(env) next;
};
/* Head of the singly-linked list of environment settings. */
static SLIST_HEAD(envhead, env) envhead;
/*
 * Prepend a "name=value" setting to the loader environment list.
 *
 * The string is referenced, not copied, so it must outlive the list.
 * Terminates the process with EX_OSERR if the list node cannot be
 * allocated, instead of dereferencing a NULL pointer.
 */
static void
addenv(const char *str)
{
	struct env *env;

	env = malloc(sizeof(*env));
	if (env == NULL)
		err(EX_OSERR, "malloc");

	env->str = str;
	SLIST_INSERT_HEAD(&envhead, env, next);
}
/*
 * Return the environment string at position 'num' in the list, or NULL
 * when the list has no such position.  Position 0 is the list head, i.e.
 * the entry most recently inserted by addenv().
 */
static const char *
cb_getenv(void *arg, int num)
{
	struct env *entry;
	int index = 0;

	SLIST_FOREACH(entry, &envhead, next) {
		if (index++ == num)
			return (entry->str);
	}

	return (NULL);
}
/*
 * Callback table passed to the loader's entry point by main(); every slot
 * points at the matching cb_*() function defined in this file.
 */
static struct loader_callbacks cb = {
/* Console i/o */
.getc = cb_getc,
.putc = cb_putc,
.poll = cb_poll,
/* Host filesystem i/o */
.open = cb_open,
.close = cb_close,
.isdir = cb_isdir,
.read = cb_read,
.readdir = cb_readdir,
.seek = cb_seek,
.stat = cb_stat,
/* Disk image i/o */
.diskread = cb_diskread,
.diskioctl = cb_diskioctl,
/* Guest virtual machine i/o */
.copyin = cb_copyin,
.copyout = cb_copyout,
.setreg = cb_setreg,
.setmsr = cb_setmsr,
.setcr = cb_setcr,
.setgdt = cb_setgdt,
.exec = cb_exec,
/* Misc */
.delay = cb_delay,
.exit = cb_exit,
.getmem = cb_getmem,
.getenv = cb_getenv,
};
/*
 * Redirect the console to the character device named by 'path'.
 *
 * The literal "stdio" is accepted and leaves the console untouched, so
 * the same string can serve as the bhyveload console and the bhyve
 * com-port parameter.
 *
 * Returns 0 on success.  On failure returns the (non-zero) result of
 * stat(2) when the path cannot be examined, ENOTSUP when it is not a
 * character device, or the errno left by a failing open(2).
 */
static int
altcons_open(char *path)
{
	struct stat sb;
	int fd, rc;

	if (strcmp(path, "stdio") == 0)
		return (0);

	rc = stat(path, &sb);
	if (rc != 0)
		return (rc);

	if (!S_ISCHR(sb.st_mode))
		return (ENOTSUP);

	fd = open(path, O_RDWR | O_NONBLOCK);
	if (fd < 0)
		return (errno);

	consin_fd = consout_fd = fd;
	return (0);
}
/*
 * Open the disk image at 'path' read-only and register it as the next
 * disk unit.
 *
 * Returns 0 on success, ERANGE when all NDISKS slots are in use, or the
 * errno left by a failing open(2).
 */
static int
disk_open(char *path)
{
	int fd;

	if (ndisks >= NDISKS)
		return (ERANGE);

	/*
	 * Only a negative result is a failure: 0 is a valid descriptor
	 * (e.g. when stdin was closed) and must not be reported as an
	 * error with a stale errno.
	 */
	fd = open(path, O_RDONLY);
	if (fd < 0)
		return (errno);

	disk_fd[ndisks] = fd;
	ndisks++;

	return (0);
}
/*
 * Print the command-line synopsis to stderr and exit with status 1.
 * The second line is padded by the length of the program name so that
 * it lines up under the first.
 */
static void
usage(void)
{
	int pad;

	pad = (int)strlen(progname);
	fprintf(stderr,
	    "usage: %s [-c <console-device>] [-d <disk-path>] [-e <name=value>]\n"
	    " %*s [-h <host-path>] [-m mem-size] <vmname>\n",
	    progname, pad, "");
	exit(1);
}
/*
 * Entry point.
 *
 * Parses the options, creates the named VM (re-initializing it if it
 * already exists), sets up guest memory, switches the console to raw
 * mode and hands control to loader_main() from /boot/userboot.so.
 *
 * Options:
 *   -c <console-device>  character device (or "stdio") used as console
 *   -d <disk-path>       disk image to expose to the loader (repeatable)
 *   -e <name=value>      loader environment setting (repeatable)
 *   -h <host-path>       host directory exposed through the file callbacks
 *   -m mem-size          guest memory size (default 256MB)
 *
 * The saved terminal settings are restored before returning when the
 * loader library cannot be opened or lacks its entry point, so a failed
 * start does not leave the console in raw mode.
 */
int
main(int argc, char** argv)
{
	void *h;
	void (*func)(struct loader_callbacks *, void *, int, int);
	uint64_t mem_size;
	int opt, error, need_reinit;

	progname = basename(argv[0]);

	mem_size = 256 * MB;

	/* The console is stdin/stdout unless -c selects a device. */
	consin_fd = STDIN_FILENO;
	consout_fd = STDOUT_FILENO;

	while ((opt = getopt(argc, argv, "c:d:e:h:m:")) != -1) {
		switch (opt) {
		case 'c':
			error = altcons_open(optarg);
			if (error != 0)
				errx(EX_USAGE, "Could not open '%s'", optarg);
			break;
		case 'd':
			error = disk_open(optarg);
			if (error != 0)
				errx(EX_USAGE, "Could not open '%s'", optarg);
			break;
		case 'e':
			addenv(optarg);
			break;
		case 'h':
			host_base = optarg;
			break;
		case 'm':
			error = vm_parse_memsize(optarg, &mem_size);
			if (error != 0)
				errx(EX_USAGE, "Invalid memsize '%s'", optarg);
			break;
		case '?':
			usage();
		}
	}

	argc -= optind;
	argv += optind;

	if (argc != 1)
		usage();

	vmname = argv[0];

	/* An already existing VM is reused, but must be reset first. */
	need_reinit = 0;
	error = vm_create(vmname);
	if (error) {
		if (errno != EEXIST) {
			perror("vm_create");
			exit(1);
		}
		need_reinit = 1;
	}

	ctx = vm_open(vmname);
	if (ctx == NULL) {
		perror("vm_open");
		exit(1);
	}

	if (need_reinit) {
		error = vm_reinit(ctx);
		if (error) {
			perror("vm_reinit");
			exit(1);
		}
	}

	error = vm_setup_memory(ctx, mem_size, VM_MMAP_ALL);
	if (error) {
		perror("vm_setup_memory");
		exit(1);
	}

	/* Save the current settings for cb_exit(), then go raw. */
	tcgetattr(consout_fd, &term);
	oldterm = term;
	cfmakeraw(&term);
	term.c_cflag |= CLOCAL;

	tcsetattr(consout_fd, TCSAFLUSH, &term);

	h = dlopen("/boot/userboot.so", RTLD_LOCAL);
	if (!h) {
		/* Undo raw mode before reporting and bailing out. */
		tcsetattr(consout_fd, TCSAFLUSH, &oldterm);
		printf("%s\n", dlerror());
		return (1);
	}
	func = dlsym(h, "loader_main");
	if (!func) {
		/* Undo raw mode before reporting and bailing out. */
		tcsetattr(consout_fd, TCSAFLUSH, &oldterm);
		printf("%s\n", dlerror());
		return (1);
	}

	addenv("smbios.bios.vendor=BHYVE");
	addenv("boot_serial=1");

	func(&cb, NULL, USERBOOT_VERSION_3, ndisks);

	return (0);
}