Merge projects/bhyve to head.

'bhyve' was developed by grehan@ and myself at NetApp (thanks!).

Special thanks to Peter Snyder, Joe Caradonna and Michael Dexter for their
support and encouragement.

Obtained from:	NetApp
This commit is contained in:
Neel Natu 2013-01-19 04:18:52 +00:00
commit c458fc1ed4
Notes: svn2git 2020-12-20 02:59:44 +00:00
svn path=/head/; revision=245652
105 changed files with 25476 additions and 0 deletions

View File

@ -115,6 +115,7 @@ SUBDIR= ${SUBDIR_ORDERED} \
${_libusbhid} \
${_libusb} \
${_libvgl} \
${_libvmmapi} \
libwrap \
liby \
libz \
@ -198,6 +199,10 @@ _libproc= libproc
_librtld_db= librtld_db
.endif
.if ${MACHINE_CPUARCH} == "amd64"
_libvmmapi= libvmmapi
.endif
.if ${MACHINE_CPUARCH} == "ia64"
_libefi= libefi
.endif

11
lib/libvmmapi/Makefile Normal file
View File

@ -0,0 +1,11 @@
# $FreeBSD$
LIB= vmmapi
SRCS= vmmapi.c vmmapi_freebsd.c
INCS= vmmapi.h
WARNS?= 2
CFLAGS+= -I${.CURDIR}
.include <bsd.lib.mk>

723
lib/libvmmapi/vmmapi.c Normal file
View File

@ -0,0 +1,723 @@
/*-
* Copyright (c) 2011 NetApp, Inc.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* $FreeBSD$
*/
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
#include <sys/types.h>
#include <sys/sysctl.h>
#include <sys/ioctl.h>
#include <sys/mman.h>
#include <machine/specialreg.h>
#include <stdio.h>
#include <stdlib.h>
#include <assert.h>
#include <string.h>
#include <fcntl.h>
#include <unistd.h>
#include <machine/vmm.h>
#include <machine/vmm_dev.h>
#include "vmmapi.h"
struct vmctx {
int fd;
char *name;
};
#define CREATE(x) sysctlbyname("hw.vmm.create", NULL, NULL, (x), strlen((x)))
#define DESTROY(x) sysctlbyname("hw.vmm.destroy", NULL, NULL, (x), strlen((x)))
static int
vm_device_open(const char *name)
{
int fd, len;
char *vmfile;
len = strlen("/dev/vmm/") + strlen(name) + 1;
vmfile = malloc(len);
assert(vmfile != NULL);
snprintf(vmfile, len, "/dev/vmm/%s", name);
/* Open the device file */
fd = open(vmfile, O_RDWR, 0);
free(vmfile);
return (fd);
}
int
vm_create(const char *name)
{
return (CREATE((char *)name));
}
struct vmctx *
vm_open(const char *name)
{
struct vmctx *vm;
vm = malloc(sizeof(struct vmctx) + strlen(name) + 1);
assert(vm != NULL);
vm->fd = -1;
vm->name = (char *)(vm + 1);
strcpy(vm->name, name);
if ((vm->fd = vm_device_open(vm->name)) < 0)
goto err;
return (vm);
err:
vm_destroy(vm);
return (NULL);
}
void
vm_destroy(struct vmctx *vm)
{
assert(vm != NULL);
if (vm->fd >= 0)
close(vm->fd);
DESTROY(vm->name);
free(vm);
}
size_t
vmm_get_mem_total(void)
{
size_t mem_total = 0;
size_t oldlen = sizeof(mem_total);
int error;
error = sysctlbyname("hw.vmm.mem_total", &mem_total, &oldlen, NULL, 0);
if (error)
return -1;
return mem_total;
}
size_t
vmm_get_mem_free(void)
{
size_t mem_free = 0;
size_t oldlen = sizeof(mem_free);
int error;
error = sysctlbyname("hw.vmm.mem_free", &mem_free, &oldlen, NULL, 0);
if (error)
return -1;
return mem_free;
}
int
vm_get_memory_seg(struct vmctx *ctx, vm_paddr_t gpa, size_t *ret_len)
{
int error;
struct vm_memory_segment seg;
bzero(&seg, sizeof(seg));
seg.gpa = gpa;
error = ioctl(ctx->fd, VM_GET_MEMORY_SEG, &seg);
*ret_len = seg.len;
return (error);
}
int
vm_setup_memory(struct vmctx *ctx, vm_paddr_t gpa, size_t len, char **mapaddr)
{
int error;
struct vm_memory_segment seg;
/*
* Create and optionally map 'len' bytes of memory at guest
* physical address 'gpa'
*/
bzero(&seg, sizeof(seg));
seg.gpa = gpa;
seg.len = len;
error = ioctl(ctx->fd, VM_MAP_MEMORY, &seg);
if (error == 0 && mapaddr != NULL) {
*mapaddr = mmap(NULL, len, PROT_READ | PROT_WRITE, MAP_SHARED,
ctx->fd, gpa);
}
return (error);
}
char *
vm_map_memory(struct vmctx *ctx, vm_paddr_t gpa, size_t len)
{
/* Map 'len' bytes of memory at guest physical address 'gpa' */
return ((char *)mmap(NULL, len, PROT_READ | PROT_WRITE, MAP_SHARED,
ctx->fd, gpa));
}
int
vm_set_desc(struct vmctx *ctx, int vcpu, int reg,
uint64_t base, uint32_t limit, uint32_t access)
{
int error;
struct vm_seg_desc vmsegdesc;
bzero(&vmsegdesc, sizeof(vmsegdesc));
vmsegdesc.cpuid = vcpu;
vmsegdesc.regnum = reg;
vmsegdesc.desc.base = base;
vmsegdesc.desc.limit = limit;
vmsegdesc.desc.access = access;
error = ioctl(ctx->fd, VM_SET_SEGMENT_DESCRIPTOR, &vmsegdesc);
return (error);
}
int
vm_get_desc(struct vmctx *ctx, int vcpu, int reg,
uint64_t *base, uint32_t *limit, uint32_t *access)
{
int error;
struct vm_seg_desc vmsegdesc;
bzero(&vmsegdesc, sizeof(vmsegdesc));
vmsegdesc.cpuid = vcpu;
vmsegdesc.regnum = reg;
error = ioctl(ctx->fd, VM_GET_SEGMENT_DESCRIPTOR, &vmsegdesc);
if (error == 0) {
*base = vmsegdesc.desc.base;
*limit = vmsegdesc.desc.limit;
*access = vmsegdesc.desc.access;
}
return (error);
}
int
vm_set_register(struct vmctx *ctx, int vcpu, int reg, uint64_t val)
{
int error;
struct vm_register vmreg;
bzero(&vmreg, sizeof(vmreg));
vmreg.cpuid = vcpu;
vmreg.regnum = reg;
vmreg.regval = val;
error = ioctl(ctx->fd, VM_SET_REGISTER, &vmreg);
return (error);
}
int
vm_get_register(struct vmctx *ctx, int vcpu, int reg, uint64_t *ret_val)
{
int error;
struct vm_register vmreg;
bzero(&vmreg, sizeof(vmreg));
vmreg.cpuid = vcpu;
vmreg.regnum = reg;
error = ioctl(ctx->fd, VM_GET_REGISTER, &vmreg);
*ret_val = vmreg.regval;
return (error);
}
int
vm_get_pinning(struct vmctx *ctx, int vcpu, int *host_cpuid)
{
int error;
struct vm_pin vmpin;
bzero(&vmpin, sizeof(vmpin));
vmpin.vm_cpuid = vcpu;
error = ioctl(ctx->fd, VM_GET_PINNING, &vmpin);
*host_cpuid = vmpin.host_cpuid;
return (error);
}
int
vm_set_pinning(struct vmctx *ctx, int vcpu, int host_cpuid)
{
int error;
struct vm_pin vmpin;
bzero(&vmpin, sizeof(vmpin));
vmpin.vm_cpuid = vcpu;
vmpin.host_cpuid = host_cpuid;
error = ioctl(ctx->fd, VM_SET_PINNING, &vmpin);
return (error);
}
int
vm_run(struct vmctx *ctx, int vcpu, uint64_t rip, struct vm_exit *vmexit)
{
int error;
struct vm_run vmrun;
bzero(&vmrun, sizeof(vmrun));
vmrun.cpuid = vcpu;
vmrun.rip = rip;
error = ioctl(ctx->fd, VM_RUN, &vmrun);
bcopy(&vmrun.vm_exit, vmexit, sizeof(struct vm_exit));
return (error);
}
static int
vm_inject_event_real(struct vmctx *ctx, int vcpu, enum vm_event_type type,
int vector, int error_code, int error_code_valid)
{
struct vm_event ev;
bzero(&ev, sizeof(ev));
ev.cpuid = vcpu;
ev.type = type;
ev.vector = vector;
ev.error_code = error_code;
ev.error_code_valid = error_code_valid;
return (ioctl(ctx->fd, VM_INJECT_EVENT, &ev));
}
int
vm_inject_event(struct vmctx *ctx, int vcpu, enum vm_event_type type,
int vector)
{
return (vm_inject_event_real(ctx, vcpu, type, vector, 0, 0));
}
int
vm_inject_event2(struct vmctx *ctx, int vcpu, enum vm_event_type type,
int vector, int error_code)
{
return (vm_inject_event_real(ctx, vcpu, type, vector, error_code, 1));
}
int
vm_apicid2vcpu(struct vmctx *ctx, int apicid)
{
/*
* The apic id associated with the 'vcpu' has the same numerical value
* as the 'vcpu' itself.
*/
return (apicid);
}
int
vm_lapic_irq(struct vmctx *ctx, int vcpu, int vector)
{
struct vm_lapic_irq vmirq;
bzero(&vmirq, sizeof(vmirq));
vmirq.cpuid = vcpu;
vmirq.vector = vector;
return (ioctl(ctx->fd, VM_LAPIC_IRQ, &vmirq));
}
int
vm_inject_nmi(struct vmctx *ctx, int vcpu)
{
struct vm_nmi vmnmi;
bzero(&vmnmi, sizeof(vmnmi));
vmnmi.cpuid = vcpu;
return (ioctl(ctx->fd, VM_INJECT_NMI, &vmnmi));
}
static struct {
const char *name;
int type;
} capstrmap[] = {
{ "hlt_exit", VM_CAP_HALT_EXIT },
{ "mtrap_exit", VM_CAP_MTRAP_EXIT },
{ "pause_exit", VM_CAP_PAUSE_EXIT },
{ "unrestricted_guest", VM_CAP_UNRESTRICTED_GUEST },
{ 0 }
};
int
vm_capability_name2type(const char *capname)
{
int i;
for (i = 0; capstrmap[i].name != NULL && capname != NULL; i++) {
if (strcmp(capstrmap[i].name, capname) == 0)
return (capstrmap[i].type);
}
return (-1);
}
const char *
vm_capability_type2name(int type)
{
int i;
for (i = 0; capstrmap[i].name != NULL; i++) {
if (capstrmap[i].type == type)
return (capstrmap[i].name);
}
return (NULL);
}
int
vm_get_capability(struct vmctx *ctx, int vcpu, enum vm_cap_type cap,
int *retval)
{
int error;
struct vm_capability vmcap;
bzero(&vmcap, sizeof(vmcap));
vmcap.cpuid = vcpu;
vmcap.captype = cap;
error = ioctl(ctx->fd, VM_GET_CAPABILITY, &vmcap);
*retval = vmcap.capval;
return (error);
}
int
vm_set_capability(struct vmctx *ctx, int vcpu, enum vm_cap_type cap, int val)
{
struct vm_capability vmcap;
bzero(&vmcap, sizeof(vmcap));
vmcap.cpuid = vcpu;
vmcap.captype = cap;
vmcap.capval = val;
return (ioctl(ctx->fd, VM_SET_CAPABILITY, &vmcap));
}
int
vm_assign_pptdev(struct vmctx *ctx, int bus, int slot, int func)
{
struct vm_pptdev pptdev;
bzero(&pptdev, sizeof(pptdev));
pptdev.bus = bus;
pptdev.slot = slot;
pptdev.func = func;
return (ioctl(ctx->fd, VM_BIND_PPTDEV, &pptdev));
}
int
vm_unassign_pptdev(struct vmctx *ctx, int bus, int slot, int func)
{
struct vm_pptdev pptdev;
bzero(&pptdev, sizeof(pptdev));
pptdev.bus = bus;
pptdev.slot = slot;
pptdev.func = func;
return (ioctl(ctx->fd, VM_UNBIND_PPTDEV, &pptdev));
}
int
vm_map_pptdev_mmio(struct vmctx *ctx, int bus, int slot, int func,
vm_paddr_t gpa, size_t len, vm_paddr_t hpa)
{
struct vm_pptdev_mmio pptmmio;
bzero(&pptmmio, sizeof(pptmmio));
pptmmio.bus = bus;
pptmmio.slot = slot;
pptmmio.func = func;
pptmmio.gpa = gpa;
pptmmio.len = len;
pptmmio.hpa = hpa;
return (ioctl(ctx->fd, VM_MAP_PPTDEV_MMIO, &pptmmio));
}
int
vm_setup_msi(struct vmctx *ctx, int vcpu, int bus, int slot, int func,
int destcpu, int vector, int numvec)
{
struct vm_pptdev_msi pptmsi;
bzero(&pptmsi, sizeof(pptmsi));
pptmsi.vcpu = vcpu;
pptmsi.bus = bus;
pptmsi.slot = slot;
pptmsi.func = func;
pptmsi.destcpu = destcpu;
pptmsi.vector = vector;
pptmsi.numvec = numvec;
return (ioctl(ctx->fd, VM_PPTDEV_MSI, &pptmsi));
}
int
vm_setup_msix(struct vmctx *ctx, int vcpu, int bus, int slot, int func,
int idx, uint32_t msg, uint32_t vector_control, uint64_t addr)
{
struct vm_pptdev_msix pptmsix;
bzero(&pptmsix, sizeof(pptmsix));
pptmsix.vcpu = vcpu;
pptmsix.bus = bus;
pptmsix.slot = slot;
pptmsix.func = func;
pptmsix.idx = idx;
pptmsix.msg = msg;
pptmsix.addr = addr;
pptmsix.vector_control = vector_control;
return ioctl(ctx->fd, VM_PPTDEV_MSIX, &pptmsix);
}
uint64_t *
vm_get_stats(struct vmctx *ctx, int vcpu, struct timeval *ret_tv,
int *ret_entries)
{
int error;
static struct vm_stats vmstats;
vmstats.cpuid = vcpu;
error = ioctl(ctx->fd, VM_STATS, &vmstats);
if (error == 0) {
if (ret_entries)
*ret_entries = vmstats.num_entries;
if (ret_tv)
*ret_tv = vmstats.tv;
return (vmstats.statbuf);
} else
return (NULL);
}
const char *
vm_get_stat_desc(struct vmctx *ctx, int index)
{
static struct vm_stat_desc statdesc;
statdesc.index = index;
if (ioctl(ctx->fd, VM_STAT_DESC, &statdesc) == 0)
return (statdesc.desc);
else
return (NULL);
}
int
vm_get_x2apic_state(struct vmctx *ctx, int vcpu, enum x2apic_state *state)
{
int error;
struct vm_x2apic x2apic;
bzero(&x2apic, sizeof(x2apic));
x2apic.cpuid = vcpu;
error = ioctl(ctx->fd, VM_GET_X2APIC_STATE, &x2apic);
*state = x2apic.state;
return (error);
}
int
vm_set_x2apic_state(struct vmctx *ctx, int vcpu, enum x2apic_state state)
{
int error;
struct vm_x2apic x2apic;
bzero(&x2apic, sizeof(x2apic));
x2apic.cpuid = vcpu;
x2apic.state = state;
error = ioctl(ctx->fd, VM_SET_X2APIC_STATE, &x2apic);
return (error);
}
/*
* From Intel Vol 3a:
* Table 9-1. IA-32 Processor States Following Power-up, Reset or INIT
*/
int
vcpu_reset(struct vmctx *vmctx, int vcpu)
{
int error;
uint64_t rflags, rip, cr0, cr4, zero, desc_base, rdx;
uint32_t desc_access, desc_limit;
uint16_t sel;
zero = 0;
rflags = 0x2;
error = vm_set_register(vmctx, vcpu, VM_REG_GUEST_RFLAGS, rflags);
if (error)
goto done;
rip = 0xfff0;
if ((error = vm_set_register(vmctx, vcpu, VM_REG_GUEST_RIP, rip)) != 0)
goto done;
cr0 = CR0_NE;
if ((error = vm_set_register(vmctx, vcpu, VM_REG_GUEST_CR0, cr0)) != 0)
goto done;
if ((error = vm_set_register(vmctx, vcpu, VM_REG_GUEST_CR3, zero)) != 0)
goto done;
cr4 = 0;
if ((error = vm_set_register(vmctx, vcpu, VM_REG_GUEST_CR4, cr4)) != 0)
goto done;
/*
* CS: present, r/w, accessed, 16-bit, byte granularity, usable
*/
desc_base = 0xffff0000;
desc_limit = 0xffff;
desc_access = 0x0093;
error = vm_set_desc(vmctx, vcpu, VM_REG_GUEST_CS,
desc_base, desc_limit, desc_access);
if (error)
goto done;
sel = 0xf000;
if ((error = vm_set_register(vmctx, vcpu, VM_REG_GUEST_CS, sel)) != 0)
goto done;
/*
* SS,DS,ES,FS,GS: present, r/w, accessed, 16-bit, byte granularity
*/
desc_base = 0;
desc_limit = 0xffff;
desc_access = 0x0093;
error = vm_set_desc(vmctx, vcpu, VM_REG_GUEST_SS,
desc_base, desc_limit, desc_access);
if (error)
goto done;
error = vm_set_desc(vmctx, vcpu, VM_REG_GUEST_DS,
desc_base, desc_limit, desc_access);
if (error)
goto done;
error = vm_set_desc(vmctx, vcpu, VM_REG_GUEST_ES,
desc_base, desc_limit, desc_access);
if (error)
goto done;
error = vm_set_desc(vmctx, vcpu, VM_REG_GUEST_FS,
desc_base, desc_limit, desc_access);
if (error)
goto done;
error = vm_set_desc(vmctx, vcpu, VM_REG_GUEST_GS,
desc_base, desc_limit, desc_access);
if (error)
goto done;
sel = 0;
if ((error = vm_set_register(vmctx, vcpu, VM_REG_GUEST_SS, sel)) != 0)
goto done;
if ((error = vm_set_register(vmctx, vcpu, VM_REG_GUEST_DS, sel)) != 0)
goto done;
if ((error = vm_set_register(vmctx, vcpu, VM_REG_GUEST_ES, sel)) != 0)
goto done;
if ((error = vm_set_register(vmctx, vcpu, VM_REG_GUEST_FS, sel)) != 0)
goto done;
if ((error = vm_set_register(vmctx, vcpu, VM_REG_GUEST_GS, sel)) != 0)
goto done;
/* General purpose registers */
rdx = 0xf00;
if ((error = vm_set_register(vmctx, vcpu, VM_REG_GUEST_RAX, zero)) != 0)
goto done;
if ((error = vm_set_register(vmctx, vcpu, VM_REG_GUEST_RBX, zero)) != 0)
goto done;
if ((error = vm_set_register(vmctx, vcpu, VM_REG_GUEST_RCX, zero)) != 0)
goto done;
if ((error = vm_set_register(vmctx, vcpu, VM_REG_GUEST_RDX, rdx)) != 0)
goto done;
if ((error = vm_set_register(vmctx, vcpu, VM_REG_GUEST_RSI, zero)) != 0)
goto done;
if ((error = vm_set_register(vmctx, vcpu, VM_REG_GUEST_RDI, zero)) != 0)
goto done;
if ((error = vm_set_register(vmctx, vcpu, VM_REG_GUEST_RBP, zero)) != 0)
goto done;
if ((error = vm_set_register(vmctx, vcpu, VM_REG_GUEST_RSP, zero)) != 0)
goto done;
/* GDTR, IDTR */
desc_base = 0;
desc_limit = 0xffff;
desc_access = 0;
error = vm_set_desc(vmctx, vcpu, VM_REG_GUEST_GDTR,
desc_base, desc_limit, desc_access);
if (error != 0)
goto done;
error = vm_set_desc(vmctx, vcpu, VM_REG_GUEST_IDTR,
desc_base, desc_limit, desc_access);
if (error != 0)
goto done;
/* TR */
desc_base = 0;
desc_limit = 0xffff;
desc_access = 0x0000008b;
error = vm_set_desc(vmctx, vcpu, VM_REG_GUEST_TR, 0, 0, desc_access);
if (error)
goto done;
sel = 0;
if ((error = vm_set_register(vmctx, vcpu, VM_REG_GUEST_TR, sel)) != 0)
goto done;
/* LDTR */
desc_base = 0;
desc_limit = 0xffff;
desc_access = 0x00000082;
error = vm_set_desc(vmctx, vcpu, VM_REG_GUEST_LDTR, desc_base,
desc_limit, desc_access);
if (error)
goto done;
sel = 0;
if ((error = vm_set_register(vmctx, vcpu, VM_REG_GUEST_LDTR, 0)) != 0)
goto done;
/* XXX cr2, debug registers */
error = 0;
done:
return (error);
}

105
lib/libvmmapi/vmmapi.h Normal file
View File

@ -0,0 +1,105 @@
/*-
* Copyright (c) 2011 NetApp, Inc.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* $FreeBSD$
*/
#ifndef _VMMAPI_H_
#define _VMMAPI_H_
struct vmctx;
enum x2apic_state;
int vm_create(const char *name);
struct vmctx *vm_open(const char *name);
void vm_destroy(struct vmctx *ctx);
size_t vmm_get_mem_total(void);
size_t vmm_get_mem_free(void);
int vm_get_memory_seg(struct vmctx *ctx, vm_paddr_t gpa, size_t *ret_len);
/*
* Create a memory segment of 'len' bytes in the guest physical address space
* at offset 'gpa'.
*
* If 'mapaddr' is not NULL then this region is mmap'ed into the address
* space of the calling process. If there is an mmap error then *mapaddr
* will be set to MAP_FAILED.
*/
int vm_setup_memory(struct vmctx *ctx, vm_paddr_t gpa, size_t len,
char **mapaddr);
char * vm_map_memory(struct vmctx *ctx, vm_paddr_t gpa, size_t len);
int vm_set_desc(struct vmctx *ctx, int vcpu, int reg,
uint64_t base, uint32_t limit, uint32_t access);
int vm_get_desc(struct vmctx *ctx, int vcpu, int reg,
uint64_t *base, uint32_t *limit, uint32_t *access);
int vm_set_register(struct vmctx *ctx, int vcpu, int reg, uint64_t val);
int vm_get_register(struct vmctx *ctx, int vcpu, int reg, uint64_t *retval);
int vm_get_pinning(struct vmctx *ctx, int vcpu, int *host_cpuid);
int vm_set_pinning(struct vmctx *ctx, int vcpu, int host_cpuid);
int vm_run(struct vmctx *ctx, int vcpu, uint64_t rip,
struct vm_exit *ret_vmexit);
int vm_apicid2vcpu(struct vmctx *ctx, int apicid);
int vm_inject_event(struct vmctx *ctx, int vcpu, enum vm_event_type type,
int vector);
int vm_inject_event2(struct vmctx *ctx, int vcpu, enum vm_event_type type,
int vector, int error_code);
int vm_lapic_irq(struct vmctx *ctx, int vcpu, int vector);
int vm_inject_nmi(struct vmctx *ctx, int vcpu);
int vm_capability_name2type(const char *capname);
const char *vm_capability_type2name(int type);
int vm_get_capability(struct vmctx *ctx, int vcpu, enum vm_cap_type cap,
int *retval);
int vm_set_capability(struct vmctx *ctx, int vcpu, enum vm_cap_type cap,
int val);
int vm_assign_pptdev(struct vmctx *ctx, int bus, int slot, int func);
int vm_unassign_pptdev(struct vmctx *ctx, int bus, int slot, int func);
int vm_map_pptdev_mmio(struct vmctx *ctx, int bus, int slot, int func,
vm_paddr_t gpa, size_t len, vm_paddr_t hpa);
int vm_setup_msi(struct vmctx *ctx, int vcpu, int bus, int slot, int func,
int dest, int vector, int numvec);
int vm_setup_msix(struct vmctx *ctx, int vcpu, int bus, int slot, int func,
int idx, uint32_t msg, uint32_t vector_control, uint64_t addr);
/*
* Return a pointer to the statistics buffer. Note that this is not MT-safe.
*/
uint64_t *vm_get_stats(struct vmctx *ctx, int vcpu, struct timeval *ret_tv,
int *ret_entries);
const char *vm_get_stat_desc(struct vmctx *ctx, int index);
int vm_get_x2apic_state(struct vmctx *ctx, int vcpu, enum x2apic_state *s);
int vm_set_x2apic_state(struct vmctx *ctx, int vcpu, enum x2apic_state s);
/* Reset vcpu register state */
int vcpu_reset(struct vmctx *ctx, int vcpu);
/*
* FreeBSD specific APIs
*/
int vm_setup_freebsd_registers(struct vmctx *ctx, int vcpu,
uint64_t rip, uint64_t cr3, uint64_t gdtbase,
uint64_t rsp);
void vm_setup_freebsd_gdt(uint64_t *gdtr);
#endif /* _VMMAPI_H_ */

View File

@ -0,0 +1,183 @@
/*-
* Copyright (c) 2011 NetApp, Inc.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* $FreeBSD$
*/
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
#include <sys/types.h>
#include <machine/specialreg.h>
#include <machine/segments.h>
#include <machine/vmm.h>
#include "vmmapi.h"
#define DESC_UNUSABLE 0x00010000
#define GUEST_NULL_SEL 0
#define GUEST_CODE_SEL 1
#define GUEST_DATA_SEL 2
#define GUEST_GDTR_LIMIT (3 * 8 - 1)
void
vm_setup_freebsd_gdt(uint64_t *gdtr)
{
gdtr[GUEST_NULL_SEL] = 0;
gdtr[GUEST_CODE_SEL] = 0x0020980000000000;
gdtr[GUEST_DATA_SEL] = 0x0000900000000000;
}
/*
* Setup the 'vcpu' register set such that it will begin execution at
* 'rip' in long mode.
*/
int
vm_setup_freebsd_registers(struct vmctx *vmctx, int vcpu,
uint64_t rip, uint64_t cr3, uint64_t gdtbase,
uint64_t rsp)
{
int error;
uint64_t cr0, cr4, efer, rflags, desc_base;
uint32_t desc_access, desc_limit;
uint16_t gsel;
cr0 = CR0_PE | CR0_PG | CR0_NE;
if ((error = vm_set_register(vmctx, vcpu, VM_REG_GUEST_CR0, cr0)) != 0)
goto done;
cr4 = CR4_PAE;
if ((error = vm_set_register(vmctx, vcpu, VM_REG_GUEST_CR4, cr4)) != 0)
goto done;
efer = EFER_LME | EFER_LMA;
if ((error = vm_set_register(vmctx, vcpu, VM_REG_GUEST_EFER, efer)))
goto done;
rflags = 0x2;
error = vm_set_register(vmctx, vcpu, VM_REG_GUEST_RFLAGS, rflags);
if (error)
goto done;
desc_base = 0;
desc_limit = 0;
desc_access = 0x0000209B;
error = vm_set_desc(vmctx, vcpu, VM_REG_GUEST_CS,
desc_base, desc_limit, desc_access);
if (error)
goto done;
desc_access = 0x00000093;
error = vm_set_desc(vmctx, vcpu, VM_REG_GUEST_DS,
desc_base, desc_limit, desc_access);
if (error)
goto done;
error = vm_set_desc(vmctx, vcpu, VM_REG_GUEST_ES,
desc_base, desc_limit, desc_access);
if (error)
goto done;
error = vm_set_desc(vmctx, vcpu, VM_REG_GUEST_FS,
desc_base, desc_limit, desc_access);
if (error)
goto done;
error = vm_set_desc(vmctx, vcpu, VM_REG_GUEST_GS,
desc_base, desc_limit, desc_access);
if (error)
goto done;
error = vm_set_desc(vmctx, vcpu, VM_REG_GUEST_SS,
desc_base, desc_limit, desc_access);
if (error)
goto done;
/*
* XXX TR is pointing to null selector even though we set the
* TSS segment to be usable with a base address and limit of 0.
*/
desc_access = 0x0000008b;
error = vm_set_desc(vmctx, vcpu, VM_REG_GUEST_TR, 0, 0, desc_access);
if (error)
goto done;
error = vm_set_desc(vmctx, vcpu, VM_REG_GUEST_LDTR, 0, 0,
DESC_UNUSABLE);
if (error)
goto done;
gsel = GSEL(GUEST_CODE_SEL, SEL_KPL);
if ((error = vm_set_register(vmctx, vcpu, VM_REG_GUEST_CS, gsel)) != 0)
goto done;
gsel = GSEL(GUEST_DATA_SEL, SEL_KPL);
if ((error = vm_set_register(vmctx, vcpu, VM_REG_GUEST_DS, gsel)) != 0)
goto done;
if ((error = vm_set_register(vmctx, vcpu, VM_REG_GUEST_ES, gsel)) != 0)
goto done;
if ((error = vm_set_register(vmctx, vcpu, VM_REG_GUEST_FS, gsel)) != 0)
goto done;
if ((error = vm_set_register(vmctx, vcpu, VM_REG_GUEST_GS, gsel)) != 0)
goto done;
if ((error = vm_set_register(vmctx, vcpu, VM_REG_GUEST_SS, gsel)) != 0)
goto done;
/* XXX TR is pointing to the null selector */
if ((error = vm_set_register(vmctx, vcpu, VM_REG_GUEST_TR, 0)) != 0)
goto done;
/* LDTR is pointing to the null selector */
if ((error = vm_set_register(vmctx, vcpu, VM_REG_GUEST_LDTR, 0)) != 0)
goto done;
/* entry point */
if ((error = vm_set_register(vmctx, vcpu, VM_REG_GUEST_RIP, rip)) != 0)
goto done;
/* page table base */
if ((error = vm_set_register(vmctx, vcpu, VM_REG_GUEST_CR3, cr3)) != 0)
goto done;
desc_base = gdtbase;
desc_limit = GUEST_GDTR_LIMIT;
error = vm_set_desc(vmctx, vcpu, VM_REG_GUEST_GDTR,
desc_base, desc_limit, 0);
if (error != 0)
goto done;
if ((error = vm_set_register(vmctx, vcpu, VM_REG_GUEST_RSP, rsp)) != 0)
goto done;
error = 0;
done:
return (error);
}

68
share/man/man4/bhyve.4 Normal file
View File

@ -0,0 +1,68 @@
.\"
.\" Copyright (c) 2012 NetApp Inc
.\" All rights reserved.
.\"
.\" Redistribution and use in source and binary forms, with or without
.\" modification, are permitted provided that the following conditions
.\" are met:
.\" 1. Redistributions of source code must retain the above copyright
.\" notice, this list of conditions and the following disclaimer.
.\" 2. Redistributions in binary form must reproduce the above copyright
.\" notice, this list of conditions and the following disclaimer in the
.\" documentation and/or other materials provided with the distribution.
.\"
.\" THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
.\" ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
.\" IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
.\" ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
.\" FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
.\" DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
.\" OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
.\" HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
.\" LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
.\" OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
.\" SUCH DAMAGE.
.\"
.\" $FreeBSD$
.\"
.Dd January 5, 2013
.Dt BHYVE 4
.Os
.Sh NAME
.Nm bhyve
.Nd virtual machine monitor
.Sh SYNOPSIS
.Cd "/usr/sbin/bhyve"
.Cd "/usr/sbin/bhyveload"
.Cd "/usr/sbin/bhyvectl"
.Cd "/boot/kernel/vmm.ko"
.Sh DESCRIPTION
.Nm
is a virtual machine monitor that is hosted by FreeBSD. It is used to host
unmodified guest operating systems on top of FreeBSD.
.Pp
.Nm
relies heavily on hardware assist provided by the CPU and chipset to virtualize
processor and memory resources.
.Sh SEE ALSO
.Xr bhyve 8 ,
.Xr bhyveload 8 ,
.Xr bhyvectl 8 ,
.Xr vmm 4
.Sh HISTORY
.Nm
first appeared in
.Fx 10.0 ,
and was developed at NetApp Inc.
.Sh AUTHORS
.Nm
was developed by
.An -nosplit
.An "Peter Grehan" Aq grehan@FreeBSD.org
and
.An "Neel Natu" Aq neel@FreeBSD.org
at NetApp Inc.
.Sh BUGS
.Nm
is considered experimental in
.Fx .

View File

@ -162,6 +162,7 @@ LIBULOG?= ${DESTDIR}${LIBDIR}/libulog.a
LIBUTIL?= ${DESTDIR}${LIBDIR}/libutil.a
LIBUUTIL?= ${DESTDIR}${LIBDIR}/libuutil.a
LIBVGL?= ${DESTDIR}${LIBDIR}/libvgl.a
LIBVMMAPI?= ${DESTDIR}${LIBDIR}/libvmmapi.a
LIBWIND?= ${DESTDIR}${LIBDIR}/libwind.a
LIBWRAP?= ${DESTDIR}${LIBDIR}/libwrap.a
LIBXPG4?= ${DESTDIR}${LIBDIR}/libxpg4.a

293
sys/amd64/include/vmm.h Normal file
View File

@ -0,0 +1,293 @@
/*-
* Copyright (c) 2011 NetApp, Inc.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* $FreeBSD: vmm.h 482 2011-05-09 21:22:43Z grehan $
*/
#ifndef _VMM_H_
#define _VMM_H_
#ifdef _KERNEL
#define VM_MAX_NAMELEN 32
struct vm;
struct vm_memory_segment;
struct seg_desc;
struct vm_exit;
struct vm_run;
struct vlapic;
enum x2apic_state;
typedef int (*vmm_init_func_t)(void);
typedef int (*vmm_cleanup_func_t)(void);
typedef void * (*vmi_init_func_t)(struct vm *vm); /* instance specific apis */
typedef int (*vmi_run_func_t)(void *vmi, int vcpu, register_t rip);
typedef void (*vmi_cleanup_func_t)(void *vmi);
typedef int (*vmi_mmap_set_func_t)(void *vmi, vm_paddr_t gpa,
vm_paddr_t hpa, size_t length,
vm_memattr_t attr, int prot,
boolean_t superpages_ok);
typedef vm_paddr_t (*vmi_mmap_get_func_t)(void *vmi, vm_paddr_t gpa);
typedef int (*vmi_get_register_t)(void *vmi, int vcpu, int num,
uint64_t *retval);
typedef int (*vmi_set_register_t)(void *vmi, int vcpu, int num,
uint64_t val);
typedef int (*vmi_get_desc_t)(void *vmi, int vcpu, int num,
struct seg_desc *desc);
typedef int (*vmi_set_desc_t)(void *vmi, int vcpu, int num,
struct seg_desc *desc);
typedef int (*vmi_inject_event_t)(void *vmi, int vcpu,
int type, int vector,
uint32_t code, int code_valid);
typedef int (*vmi_get_cap_t)(void *vmi, int vcpu, int num, int *retval);
typedef int (*vmi_set_cap_t)(void *vmi, int vcpu, int num, int val);
struct vmm_ops {
vmm_init_func_t init; /* module wide initialization */
vmm_cleanup_func_t cleanup;
vmi_init_func_t vminit; /* vm-specific initialization */
vmi_run_func_t vmrun;
vmi_cleanup_func_t vmcleanup;
vmi_mmap_set_func_t vmmmap_set;
vmi_mmap_get_func_t vmmmap_get;
vmi_get_register_t vmgetreg;
vmi_set_register_t vmsetreg;
vmi_get_desc_t vmgetdesc;
vmi_set_desc_t vmsetdesc;
vmi_inject_event_t vminject;
vmi_get_cap_t vmgetcap;
vmi_set_cap_t vmsetcap;
};
extern struct vmm_ops vmm_ops_intel;
extern struct vmm_ops vmm_ops_amd;
struct vm *vm_create(const char *name);
void vm_destroy(struct vm *vm);
const char *vm_name(struct vm *vm);
int vm_malloc(struct vm *vm, vm_paddr_t gpa, size_t len);
int vm_map_mmio(struct vm *vm, vm_paddr_t gpa, size_t len, vm_paddr_t hpa);
int vm_unmap_mmio(struct vm *vm, vm_paddr_t gpa, size_t len);
vm_paddr_t vm_gpa2hpa(struct vm *vm, vm_paddr_t gpa, size_t size);
int vm_gpabase2memseg(struct vm *vm, vm_paddr_t gpabase,
struct vm_memory_segment *seg);
int vm_get_register(struct vm *vm, int vcpu, int reg, uint64_t *retval);
int vm_set_register(struct vm *vm, int vcpu, int reg, uint64_t val);
int vm_get_seg_desc(struct vm *vm, int vcpu, int reg,
struct seg_desc *ret_desc);
int vm_set_seg_desc(struct vm *vm, int vcpu, int reg,
struct seg_desc *desc);
int vm_get_pinning(struct vm *vm, int vcpu, int *cpuid);
int vm_set_pinning(struct vm *vm, int vcpu, int cpuid);
int vm_run(struct vm *vm, struct vm_run *vmrun);
int vm_inject_event(struct vm *vm, int vcpu, int type,
int vector, uint32_t error_code, int error_code_valid);
int vm_inject_nmi(struct vm *vm, int vcpu);
int vm_nmi_pending(struct vm *vm, int vcpuid);
void vm_nmi_clear(struct vm *vm, int vcpuid);
uint64_t *vm_guest_msrs(struct vm *vm, int cpu);
struct vlapic *vm_lapic(struct vm *vm, int cpu);
int vm_get_capability(struct vm *vm, int vcpu, int type, int *val);
int vm_set_capability(struct vm *vm, int vcpu, int type, int val);
int vm_get_x2apic_state(struct vm *vm, int vcpu, enum x2apic_state *state);
int vm_set_x2apic_state(struct vm *vm, int vcpu, enum x2apic_state state);
void vm_activate_cpu(struct vm *vm, int vcpu);
cpuset_t vm_active_cpus(struct vm *vm);
struct vm_exit *vm_exitinfo(struct vm *vm, int vcpuid);
/*
* Return 1 if device indicated by bus/slot/func is supposed to be a
* pci passthrough device.
*
* Return 0 otherwise.
*/
int vmm_is_pptdev(int bus, int slot, int func);
void *vm_iommu_domain(struct vm *vm);
enum vcpu_state {
VCPU_IDLE,
VCPU_RUNNING,
VCPU_CANNOT_RUN,
};
int vcpu_set_state(struct vm *vm, int vcpu, enum vcpu_state state);
enum vcpu_state vcpu_get_state(struct vm *vm, int vcpu);
static int __inline
vcpu_is_running(struct vm *vm, int vcpu)
{
return (vcpu_get_state(vm, vcpu) == VCPU_RUNNING);
}
void *vcpu_stats(struct vm *vm, int vcpu);
void vm_interrupt_hostcpu(struct vm *vm, int vcpu);
#endif /* KERNEL */
#include <machine/vmm_instruction_emul.h>
#define VM_MAXCPU 8 /* maximum virtual cpus */
/*
* Identifiers for events that can be injected into the VM
*/
enum vm_event_type {
VM_EVENT_NONE,
VM_HW_INTR,
VM_NMI,
VM_HW_EXCEPTION,
VM_SW_INTR,
VM_PRIV_SW_EXCEPTION,
VM_SW_EXCEPTION,
VM_EVENT_MAX
};
/*
* Identifiers for architecturally defined registers.
*/
enum vm_reg_name {
VM_REG_GUEST_RAX,
VM_REG_GUEST_RBX,
VM_REG_GUEST_RCX,
VM_REG_GUEST_RDX,
VM_REG_GUEST_RSI,
VM_REG_GUEST_RDI,
VM_REG_GUEST_RBP,
VM_REG_GUEST_R8,
VM_REG_GUEST_R9,
VM_REG_GUEST_R10,
VM_REG_GUEST_R11,
VM_REG_GUEST_R12,
VM_REG_GUEST_R13,
VM_REG_GUEST_R14,
VM_REG_GUEST_R15,
VM_REG_GUEST_CR0,
VM_REG_GUEST_CR3,
VM_REG_GUEST_CR4,
VM_REG_GUEST_DR7,
VM_REG_GUEST_RSP,
VM_REG_GUEST_RIP,
VM_REG_GUEST_RFLAGS,
VM_REG_GUEST_ES,
VM_REG_GUEST_CS,
VM_REG_GUEST_SS,
VM_REG_GUEST_DS,
VM_REG_GUEST_FS,
VM_REG_GUEST_GS,
VM_REG_GUEST_LDTR,
VM_REG_GUEST_TR,
VM_REG_GUEST_IDTR,
VM_REG_GUEST_GDTR,
VM_REG_GUEST_EFER,
VM_REG_LAST
};
/*
* Identifiers for optional vmm capabilities
*/
enum vm_cap_type {
VM_CAP_HALT_EXIT,
VM_CAP_MTRAP_EXIT,
VM_CAP_PAUSE_EXIT,
VM_CAP_UNRESTRICTED_GUEST,
VM_CAP_MAX
};
enum x2apic_state {
X2APIC_ENABLED,
X2APIC_AVAILABLE,
X2APIC_DISABLED,
X2APIC_STATE_LAST
};
/*
* The 'access' field has the format specified in Table 21-2 of the Intel
* Architecture Manual vol 3b.
*
* XXX The contents of the 'access' field are architecturally defined except
* bit 16 - Segment Unusable.
*/
struct seg_desc {
uint64_t base;
uint32_t limit;
uint32_t access;
};
enum vm_exitcode {
VM_EXITCODE_INOUT,
VM_EXITCODE_VMX,
VM_EXITCODE_BOGUS,
VM_EXITCODE_RDMSR,
VM_EXITCODE_WRMSR,
VM_EXITCODE_HLT,
VM_EXITCODE_MTRAP,
VM_EXITCODE_PAUSE,
VM_EXITCODE_PAGING,
VM_EXITCODE_SPINUP_AP,
VM_EXITCODE_MAX
};
struct vm_exit {
enum vm_exitcode exitcode;
int inst_length; /* 0 means unknown */
uint64_t rip;
union {
struct {
uint16_t bytes:3; /* 1 or 2 or 4 */
uint16_t in:1; /* out is 0, in is 1 */
uint16_t string:1;
uint16_t rep:1;
uint16_t port;
uint32_t eax; /* valid for out */
} inout;
struct {
uint64_t gpa;
struct vie vie;
} paging;
/*
* VMX specific payload. Used when there is no "better"
* exitcode to represent the VM-exit.
*/
struct {
int error; /* vmx inst error */
uint32_t exit_reason;
uint64_t exit_qualification;
} vmx;
struct {
uint32_t code; /* ecx value */
uint64_t wval;
} msr;
struct {
int vcpu;
uint64_t rip;
} spinup_ap;
} u;
};
#endif /* _VMM_H_ */

215
sys/amd64/include/vmm_dev.h Normal file
View File

@ -0,0 +1,215 @@
/*-
* Copyright (c) 2011 NetApp, Inc.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* $FreeBSD: vmm_dev.h 482 2011-05-09 21:22:43Z grehan $
*/
#ifndef _VMM_DEV_H_
#define _VMM_DEV_H_
#ifdef _KERNEL
void vmmdev_init(void);
int vmmdev_cleanup(void);
#endif
struct vm_memory_segment {
vm_paddr_t gpa; /* in */
size_t len; /* in */
};
struct vm_register {
int cpuid;
int regnum; /* enum vm_reg_name */
uint64_t regval;
};
struct vm_seg_desc { /* data or code segment */
int cpuid;
int regnum; /* enum vm_reg_name */
struct seg_desc desc;
};
struct vm_pin {
int vm_cpuid;
int host_cpuid; /* -1 to unpin */
};
struct vm_run {
int cpuid;
uint64_t rip; /* start running here */
struct vm_exit vm_exit;
};
struct vm_event {
int cpuid;
enum vm_event_type type;
int vector;
uint32_t error_code;
int error_code_valid;
};
struct vm_lapic_irq {
int cpuid;
int vector;
};
struct vm_capability {
int cpuid;
enum vm_cap_type captype;
int capval;
int allcpus;
};
struct vm_pptdev {
int bus;
int slot;
int func;
};
struct vm_pptdev_mmio {
int bus;
int slot;
int func;
vm_paddr_t gpa;
vm_paddr_t hpa;
size_t len;
};
struct vm_pptdev_msi {
int vcpu;
int bus;
int slot;
int func;
int numvec; /* 0 means disabled */
int vector;
int destcpu;
};
struct vm_pptdev_msix {
int vcpu;
int bus;
int slot;
int func;
int idx;
uint32_t msg;
uint32_t vector_control;
uint64_t addr;
};
struct vm_nmi {
int cpuid;
};
#define MAX_VM_STATS 64
struct vm_stats {
int cpuid; /* in */
int num_entries; /* out */
struct timeval tv;
uint64_t statbuf[MAX_VM_STATS];
};
struct vm_stat_desc {
int index; /* in */
char desc[128]; /* out */
};
struct vm_x2apic {
int cpuid;
enum x2apic_state state;
};
enum {
IOCNUM_RUN,
IOCNUM_SET_PINNING,
IOCNUM_GET_PINNING,
IOCNUM_MAP_MEMORY,
IOCNUM_GET_MEMORY_SEG,
IOCNUM_SET_REGISTER,
IOCNUM_GET_REGISTER,
IOCNUM_SET_SEGMENT_DESCRIPTOR,
IOCNUM_GET_SEGMENT_DESCRIPTOR,
IOCNUM_INJECT_EVENT,
IOCNUM_LAPIC_IRQ,
IOCNUM_SET_CAPABILITY,
IOCNUM_GET_CAPABILITY,
IOCNUM_BIND_PPTDEV,
IOCNUM_UNBIND_PPTDEV,
IOCNUM_MAP_PPTDEV_MMIO,
IOCNUM_PPTDEV_MSI,
IOCNUM_PPTDEV_MSIX,
IOCNUM_INJECT_NMI,
IOCNUM_VM_STATS,
IOCNUM_VM_STAT_DESC,
IOCNUM_SET_X2APIC_STATE,
IOCNUM_GET_X2APIC_STATE,
};
#define VM_RUN \
_IOWR('v', IOCNUM_RUN, struct vm_run)
#define VM_SET_PINNING \
_IOW('v', IOCNUM_SET_PINNING, struct vm_pin)
#define VM_GET_PINNING \
_IOWR('v', IOCNUM_GET_PINNING, struct vm_pin)
#define VM_MAP_MEMORY \
_IOWR('v', IOCNUM_MAP_MEMORY, struct vm_memory_segment)
#define VM_GET_MEMORY_SEG \
_IOWR('v', IOCNUM_GET_MEMORY_SEG, struct vm_memory_segment)
#define VM_SET_REGISTER \
_IOW('v', IOCNUM_SET_REGISTER, struct vm_register)
#define VM_GET_REGISTER \
_IOWR('v', IOCNUM_GET_REGISTER, struct vm_register)
#define VM_SET_SEGMENT_DESCRIPTOR \
_IOW('v', IOCNUM_SET_SEGMENT_DESCRIPTOR, struct vm_seg_desc)
#define VM_GET_SEGMENT_DESCRIPTOR \
_IOWR('v', IOCNUM_GET_SEGMENT_DESCRIPTOR, struct vm_seg_desc)
#define VM_INJECT_EVENT \
_IOW('v', IOCNUM_INJECT_EVENT, struct vm_event)
#define VM_LAPIC_IRQ \
_IOW('v', IOCNUM_LAPIC_IRQ, struct vm_lapic_irq)
#define VM_SET_CAPABILITY \
_IOW('v', IOCNUM_SET_CAPABILITY, struct vm_capability)
#define VM_GET_CAPABILITY \
_IOWR('v', IOCNUM_GET_CAPABILITY, struct vm_capability)
#define VM_BIND_PPTDEV \
_IOW('v', IOCNUM_BIND_PPTDEV, struct vm_pptdev)
#define VM_UNBIND_PPTDEV \
_IOW('v', IOCNUM_UNBIND_PPTDEV, struct vm_pptdev)
#define VM_MAP_PPTDEV_MMIO \
_IOW('v', IOCNUM_MAP_PPTDEV_MMIO, struct vm_pptdev_mmio)
#define VM_PPTDEV_MSI \
_IOW('v', IOCNUM_PPTDEV_MSI, struct vm_pptdev_msi)
#define VM_PPTDEV_MSIX \
_IOW('v', IOCNUM_PPTDEV_MSIX, struct vm_pptdev_msix)
#define VM_INJECT_NMI \
_IOW('v', IOCNUM_INJECT_NMI, struct vm_nmi)
#define VM_STATS \
_IOWR('v', IOCNUM_VM_STATS, struct vm_stats)
#define VM_STAT_DESC \
_IOWR('v', IOCNUM_VM_STAT_DESC, struct vm_stat_desc)
#define VM_SET_X2APIC_STATE \
_IOW('v', IOCNUM_SET_X2APIC_STATE, struct vm_x2apic)
#define VM_GET_X2APIC_STATE \
_IOWR('v', IOCNUM_GET_X2APIC_STATE, struct vm_x2apic)
#endif

View File

@ -0,0 +1,113 @@
/*-
* Copyright (c) 2012 NetApp, Inc.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* $FreeBSD$
*/
#ifndef _VMM_INSTRUCTION_EMUL_H_
#define _VMM_INSTRUCTION_EMUL_H_
/*
* The data structures 'vie' and 'vie_op' are meant to be opaque to the
* consumers of instruction decoding. The only reason why their contents
* need to be exposed is because they are part of the 'vm_exit' structure.
*/
struct vie_op {
uint8_t op_byte; /* actual opcode byte */
uint8_t op_type; /* type of operation (e.g. MOV) */
uint16_t op_flags;
};
#define VIE_INST_SIZE 15
struct vie {
uint8_t inst[VIE_INST_SIZE]; /* instruction bytes */
uint8_t num_valid; /* size of the instruction */
uint8_t num_processed;
uint8_t rex_w:1, /* REX prefix */
rex_r:1,
rex_x:1,
rex_b:1;
uint8_t mod:2, /* ModRM byte */
reg:4,
rm:4;
uint8_t ss:2, /* SIB byte */
index:4,
base:4;
uint8_t disp_bytes;
uint8_t imm_bytes;
uint8_t scale;
int base_register; /* VM_REG_GUEST_xyz */
int index_register; /* VM_REG_GUEST_xyz */
int64_t displacement; /* optional addr displacement */
int64_t immediate; /* optional immediate operand */
uint8_t decoded; /* set to 1 if successfully decoded */
struct vie_op op; /* opcode description */
};
/*
* Callback functions to read and write memory regions.
*/
typedef int (*mem_region_read_t)(void *vm, int cpuid, uint64_t gpa,
uint64_t *rval, int rsize, void *arg);
typedef int (*mem_region_write_t)(void *vm, int cpuid, uint64_t gpa,
uint64_t wval, int wsize, void *arg);
/*
* Emulate the decoded 'vie' instruction.
*
* The callbacks 'mrr' and 'mrw' emulate reads and writes to the memory region
* containing 'gpa'. 'mrarg' is an opaque argument that is passed into the
* callback functions.
*
* 'void *vm' should be 'struct vm *' when called from kernel context and
* 'struct vmctx *' when called from user context.
* s
*/
int vmm_emulate_instruction(void *vm, int cpuid, uint64_t gpa, struct vie *vie,
mem_region_read_t mrr, mem_region_write_t mrw,
void *mrarg);
#ifdef _KERNEL
/*
* APIs to fetch and decode the instruction from nested page fault handler.
*/
int vmm_fetch_instruction(struct vm *vm, int cpuid,
uint64_t rip, int inst_length, uint64_t cr3,
struct vie *vie);
int vmm_decode_instruction(struct vm *vm, int cpuid,
uint64_t gla, struct vie *vie);
#endif /* _KERNEL */
#endif /* _VMM_INSTRUCTION_EMUL_H_ */

265
sys/amd64/vmm/amd/amdv.c Normal file
View File

@ -0,0 +1,265 @@
/*-
* Copyright (c) 2011 NetApp, Inc.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* $FreeBSD$
*/
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/errno.h>
#include <sys/smp.h>
#include <machine/vmm.h>
#include "io/iommu.h"
static int
amdv_init(void)
{
printf("amdv_init: not implemented\n");
return (ENXIO);
}
static int
amdv_cleanup(void)
{
printf("amdv_cleanup: not implemented\n");
return (ENXIO);
}
static void *
amdv_vminit(struct vm *vm)
{
printf("amdv_vminit: not implemented\n");
return (NULL);
}
static int
amdv_vmrun(void *arg, int vcpu, register_t rip)
{
printf("amdv_vmrun: not implemented\n");
return (ENXIO);
}
static void
amdv_vmcleanup(void *arg)
{
printf("amdv_vmcleanup: not implemented\n");
return;
}
static int
amdv_vmmmap_set(void *arg, vm_paddr_t gpa, vm_paddr_t hpa, size_t length,
vm_memattr_t attr, int prot, boolean_t spok)
{
printf("amdv_vmmmap_set: not implemented\n");
return (EINVAL);
}
static vm_paddr_t
amdv_vmmmap_get(void *arg, vm_paddr_t gpa)
{
printf("amdv_vmmmap_get: not implemented\n");
return (EINVAL);
}
static int
amdv_getreg(void *arg, int vcpu, int regnum, uint64_t *retval)
{
printf("amdv_getreg: not implemented\n");
return (EINVAL);
}
static int
amdv_setreg(void *arg, int vcpu, int regnum, uint64_t val)
{
printf("amdv_setreg: not implemented\n");
return (EINVAL);
}
static int
amdv_getdesc(void *vmi, int vcpu, int num, struct seg_desc *desc)
{
printf("amdv_get_desc: not implemented\n");
return (EINVAL);
}
static int
amdv_setdesc(void *vmi, int vcpu, int num, struct seg_desc *desc)
{
printf("amdv_get_desc: not implemented\n");
return (EINVAL);
}
static int
amdv_inject_event(void *vmi, int vcpu, int type, int vector,
uint32_t error_code, int error_code_valid)
{
printf("amdv_inject_event: not implemented\n");
return (EINVAL);
}
static int
amdv_getcap(void *arg, int vcpu, int type, int *retval)
{
printf("amdv_getcap: not implemented\n");
return (EINVAL);
}
static int
amdv_setcap(void *arg, int vcpu, int type, int val)
{
printf("amdv_setcap: not implemented\n");
return (EINVAL);
}
struct vmm_ops vmm_ops_amd = {
amdv_init,
amdv_cleanup,
amdv_vminit,
amdv_vmrun,
amdv_vmcleanup,
amdv_vmmmap_set,
amdv_vmmmap_get,
amdv_getreg,
amdv_setreg,
amdv_getdesc,
amdv_setdesc,
amdv_inject_event,
amdv_getcap,
amdv_setcap
};
static int
amd_iommu_init(void)
{
printf("amd_iommu_init: not implemented\n");
return (ENXIO);
}
static void
amd_iommu_cleanup(void)
{
printf("amd_iommu_cleanup: not implemented\n");
}
static void
amd_iommu_enable(void)
{
printf("amd_iommu_enable: not implemented\n");
}
static void
amd_iommu_disable(void)
{
printf("amd_iommu_disable: not implemented\n");
}
static void *
amd_iommu_create_domain(vm_paddr_t maxaddr)
{
printf("amd_iommu_create_domain: not implemented\n");
return (NULL);
}
static void
amd_iommu_destroy_domain(void *domain)
{
printf("amd_iommu_destroy_domain: not implemented\n");
}
static uint64_t
amd_iommu_create_mapping(void *domain, vm_paddr_t gpa, vm_paddr_t hpa,
uint64_t len)
{
printf("amd_iommu_create_mapping: not implemented\n");
return (0);
}
static uint64_t
amd_iommu_remove_mapping(void *domain, vm_paddr_t gpa, uint64_t len)
{
printf("amd_iommu_remove_mapping: not implemented\n");
return (0);
}
static void
amd_iommu_add_device(void *domain, int bus, int slot, int func)
{
printf("amd_iommu_add_device: not implemented\n");
}
static void
amd_iommu_remove_device(void *domain, int bus, int slot, int func)
{
printf("amd_iommu_remove_device: not implemented\n");
}
static void
amd_iommu_invalidate_tlb(void *domain)
{
printf("amd_iommu_invalidate_tlb: not implemented\n");
}
struct iommu_ops iommu_ops_amd = {
amd_iommu_init,
amd_iommu_cleanup,
amd_iommu_enable,
amd_iommu_disable,
amd_iommu_create_domain,
amd_iommu_destroy_domain,
amd_iommu_create_mapping,
amd_iommu_remove_mapping,
amd_iommu_add_device,
amd_iommu_remove_device,
amd_iommu_invalidate_tlb,
};

392
sys/amd64/vmm/intel/ept.c Normal file
View File

@ -0,0 +1,392 @@
/*-
* Copyright (c) 2011 NetApp, Inc.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* $FreeBSD$
*/
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
#include <sys/types.h>
#include <sys/errno.h>
#include <sys/systm.h>
#include <sys/malloc.h>
#include <sys/smp.h>
#include <vm/vm.h>
#include <vm/pmap.h>
#include <machine/param.h>
#include <machine/cpufunc.h>
#include <machine/pmap.h>
#include <machine/vmparam.h>
#include <machine/vmm.h>
#include "vmx_cpufunc.h"
#include "vmx_msr.h"
#include "vmx.h"
#include "ept.h"
#define EPT_PWL4(cap) ((cap) & (1UL << 6))
#define EPT_MEMORY_TYPE_WB(cap) ((cap) & (1UL << 14))
#define EPT_PDE_SUPERPAGE(cap) ((cap) & (1UL << 16)) /* 2MB pages */
#define EPT_PDPTE_SUPERPAGE(cap) ((cap) & (1UL << 17)) /* 1GB pages */
#define INVVPID_SUPPORTED(cap) ((cap) & (1UL << 32))
#define INVEPT_SUPPORTED(cap) ((cap) & (1UL << 20))
#define INVVPID_ALL_TYPES_MASK 0xF0000000000UL
#define INVVPID_ALL_TYPES_SUPPORTED(cap) \
(((cap) & INVVPID_ALL_TYPES_MASK) == INVVPID_ALL_TYPES_MASK)
#define INVEPT_ALL_TYPES_MASK 0x6000000UL
#define INVEPT_ALL_TYPES_SUPPORTED(cap) \
(((cap) & INVEPT_ALL_TYPES_MASK) == INVEPT_ALL_TYPES_MASK)
#define EPT_PG_RD (1 << 0)
#define EPT_PG_WR (1 << 1)
#define EPT_PG_EX (1 << 2)
#define EPT_PG_MEMORY_TYPE(x) ((x) << 3)
#define EPT_PG_IGNORE_PAT (1 << 6)
#define EPT_PG_SUPERPAGE (1 << 7)
#define EPT_ADDR_MASK ((uint64_t)-1 << 12)
MALLOC_DECLARE(M_VMX);
static uint64_t page_sizes_mask;
int
ept_init(void)
{
int page_shift;
uint64_t cap;
cap = rdmsr(MSR_VMX_EPT_VPID_CAP);
/*
* Verify that:
* - page walk length is 4 steps
* - extended page tables can be laid out in write-back memory
* - invvpid instruction with all possible types is supported
* - invept instruction with all possible types is supported
*/
if (!EPT_PWL4(cap) ||
!EPT_MEMORY_TYPE_WB(cap) ||
!INVVPID_SUPPORTED(cap) ||
!INVVPID_ALL_TYPES_SUPPORTED(cap) ||
!INVEPT_SUPPORTED(cap) ||
!INVEPT_ALL_TYPES_SUPPORTED(cap))
return (EINVAL);
/* Set bits in 'page_sizes_mask' for each valid page size */
page_shift = PAGE_SHIFT;
page_sizes_mask = 1UL << page_shift; /* 4KB page */
page_shift += 9;
if (EPT_PDE_SUPERPAGE(cap))
page_sizes_mask |= 1UL << page_shift; /* 2MB superpage */
page_shift += 9;
if (EPT_PDPTE_SUPERPAGE(cap))
page_sizes_mask |= 1UL << page_shift; /* 1GB superpage */
return (0);
}
#if 0
static void
ept_dump(uint64_t *ptp, int nlevels)
{
int i, t, tabs;
uint64_t *ptpnext, ptpval;
if (--nlevels < 0)
return;
tabs = 3 - nlevels;
for (t = 0; t < tabs; t++)
printf("\t");
printf("PTP = %p\n", ptp);
for (i = 0; i < 512; i++) {
ptpval = ptp[i];
if (ptpval == 0)
continue;
for (t = 0; t < tabs; t++)
printf("\t");
printf("%3d 0x%016lx\n", i, ptpval);
if (nlevels != 0 && (ptpval & EPT_PG_SUPERPAGE) == 0) {
ptpnext = (uint64_t *)
PHYS_TO_DMAP(ptpval & EPT_ADDR_MASK);
ept_dump(ptpnext, nlevels);
}
}
}
#endif
static size_t
ept_create_mapping(uint64_t *ptp, vm_paddr_t gpa, vm_paddr_t hpa, size_t length,
vm_memattr_t attr, vm_prot_t prot, boolean_t spok)
{
int spshift, ptpshift, ptpindex, nlevels;
/*
* Compute the size of the mapping that we can accomodate.
*
* This is based on three factors:
* - super page sizes supported by the processor
* - alignment of the region starting at 'gpa' and 'hpa'
* - length of the region 'len'
*/
spshift = PAGE_SHIFT;
if (spok)
spshift += (EPT_PWLEVELS - 1) * 9;
while (spshift >= PAGE_SHIFT) {
uint64_t spsize = 1UL << spshift;
if ((page_sizes_mask & spsize) != 0 &&
(gpa & (spsize - 1)) == 0 &&
(hpa & (spsize - 1)) == 0 &&
length >= spsize) {
break;
}
spshift -= 9;
}
if (spshift < PAGE_SHIFT) {
panic("Invalid spshift for gpa 0x%016lx, hpa 0x%016lx, "
"length 0x%016lx, page_sizes_mask 0x%016lx",
gpa, hpa, length, page_sizes_mask);
}
nlevels = EPT_PWLEVELS;
while (--nlevels >= 0) {
ptpshift = PAGE_SHIFT + nlevels * 9;
ptpindex = (gpa >> ptpshift) & 0x1FF;
/* We have reached the leaf mapping */
if (spshift >= ptpshift)
break;
/*
* We are working on a non-leaf page table page.
*
* Create the next level page table page if necessary and point
* to it from the current page table.
*/
if (ptp[ptpindex] == 0) {
void *nlp = malloc(PAGE_SIZE, M_VMX, M_WAITOK | M_ZERO);
ptp[ptpindex] = vtophys(nlp);
ptp[ptpindex] |= EPT_PG_RD | EPT_PG_WR | EPT_PG_EX;
}
/* Work our way down to the next level page table page */
ptp = (uint64_t *)PHYS_TO_DMAP(ptp[ptpindex] & EPT_ADDR_MASK);
}
if ((gpa & ((1UL << ptpshift) - 1)) != 0) {
panic("ept_create_mapping: gpa 0x%016lx and ptpshift %d "
"mismatch\n", gpa, ptpshift);
}
if (prot != VM_PROT_NONE) {
/* Do the mapping */
ptp[ptpindex] = hpa;
/* Apply the access controls */
if (prot & VM_PROT_READ)
ptp[ptpindex] |= EPT_PG_RD;
if (prot & VM_PROT_WRITE)
ptp[ptpindex] |= EPT_PG_WR;
if (prot & VM_PROT_EXECUTE)
ptp[ptpindex] |= EPT_PG_EX;
/*
* XXX should we enforce this memory type by setting the
* ignore PAT bit to 1.
*/
ptp[ptpindex] |= EPT_PG_MEMORY_TYPE(attr);
if (nlevels > 0)
ptp[ptpindex] |= EPT_PG_SUPERPAGE;
} else {
/* Remove the mapping */
ptp[ptpindex] = 0;
}
return (1UL << ptpshift);
}
static vm_paddr_t
ept_lookup_mapping(uint64_t *ptp, vm_paddr_t gpa)
{
int nlevels, ptpshift, ptpindex;
uint64_t ptpval, hpabase, pgmask;
nlevels = EPT_PWLEVELS;
while (--nlevels >= 0) {
ptpshift = PAGE_SHIFT + nlevels * 9;
ptpindex = (gpa >> ptpshift) & 0x1FF;
ptpval = ptp[ptpindex];
/* Cannot make progress beyond this point */
if ((ptpval & (EPT_PG_RD | EPT_PG_WR | EPT_PG_EX)) == 0)
break;
if (nlevels == 0 || (ptpval & EPT_PG_SUPERPAGE)) {
pgmask = (1UL << ptpshift) - 1;
hpabase = ptpval & ~pgmask;
return (hpabase | (gpa & pgmask));
}
/* Work our way down to the next level page table page */
ptp = (uint64_t *)PHYS_TO_DMAP(ptpval & EPT_ADDR_MASK);
}
return ((vm_paddr_t)-1);
}
static void
ept_free_pt_entry(pt_entry_t pte)
{
if (pte == 0)
return;
/* sanity check */
if ((pte & EPT_PG_SUPERPAGE) != 0)
panic("ept_free_pt_entry: pte cannot have superpage bit");
return;
}
static void
ept_free_pd_entry(pd_entry_t pde)
{
pt_entry_t *pt;
int i;
if (pde == 0)
return;
if ((pde & EPT_PG_SUPERPAGE) == 0) {
pt = (pt_entry_t *)PHYS_TO_DMAP(pde & EPT_ADDR_MASK);
for (i = 0; i < NPTEPG; i++)
ept_free_pt_entry(pt[i]);
free(pt, M_VMX); /* free the page table page */
}
}
static void
ept_free_pdp_entry(pdp_entry_t pdpe)
{
pd_entry_t *pd;
int i;
if (pdpe == 0)
return;
if ((pdpe & EPT_PG_SUPERPAGE) == 0) {
pd = (pd_entry_t *)PHYS_TO_DMAP(pdpe & EPT_ADDR_MASK);
for (i = 0; i < NPDEPG; i++)
ept_free_pd_entry(pd[i]);
free(pd, M_VMX); /* free the page directory page */
}
}
static void
ept_free_pml4_entry(pml4_entry_t pml4e)
{
pdp_entry_t *pdp;
int i;
if (pml4e == 0)
return;
if ((pml4e & EPT_PG_SUPERPAGE) == 0) {
pdp = (pdp_entry_t *)PHYS_TO_DMAP(pml4e & EPT_ADDR_MASK);
for (i = 0; i < NPDPEPG; i++)
ept_free_pdp_entry(pdp[i]);
free(pdp, M_VMX); /* free the page directory ptr page */
}
}
void
ept_vmcleanup(struct vmx *vmx)
{
int i;
for (i = 0; i < NPML4EPG; i++)
ept_free_pml4_entry(vmx->pml4ept[i]);
}
int
ept_vmmmap_set(void *arg, vm_paddr_t gpa, vm_paddr_t hpa, size_t len,
vm_memattr_t attr, int prot, boolean_t spok)
{
size_t n;
struct vmx *vmx = arg;
while (len > 0) {
n = ept_create_mapping(vmx->pml4ept, gpa, hpa, len, attr,
prot, spok);
len -= n;
gpa += n;
hpa += n;
}
return (0);
}
vm_paddr_t
ept_vmmmap_get(void *arg, vm_paddr_t gpa)
{
vm_paddr_t hpa;
struct vmx *vmx;
vmx = arg;
hpa = ept_lookup_mapping(vmx->pml4ept, gpa);
return (hpa);
}
static void
invept_single_context(void *arg)
{
struct invept_desc desc = *(struct invept_desc *)arg;
invept(INVEPT_TYPE_SINGLE_CONTEXT, desc);
}
void
ept_invalidate_mappings(u_long pml4ept)
{
struct invept_desc invept_desc = { 0 };
invept_desc.eptp = EPTP(pml4ept);
smp_rendezvous(NULL, invept_single_context, NULL, &invept_desc);
}

43
sys/amd64/vmm/intel/ept.h Normal file
View File

@ -0,0 +1,43 @@
/*-
* Copyright (c) 2011 NetApp, Inc.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* $FreeBSD$
*/
#ifndef _EPT_H_
#define _EPT_H_
struct vmx;
#define EPT_PWLEVELS 4 /* page walk levels */
#define EPTP(pml4) ((pml4) | (EPT_PWLEVELS - 1) << 3 | PAT_WRITE_BACK)
int ept_init(void);
int ept_vmmmap_set(void *arg, vm_paddr_t gpa, vm_paddr_t hpa, size_t length,
vm_memattr_t attr, int prot, boolean_t allow_superpage_mappings);
vm_paddr_t ept_vmmmap_get(void *arg, vm_paddr_t gpa);
void ept_invalidate_mappings(u_long ept_pml4);
void ept_vmcleanup(struct vmx *vmx);
#endif

551
sys/amd64/vmm/intel/vmcs.c Normal file
View File

@ -0,0 +1,551 @@
/*-
* Copyright (c) 2011 NetApp, Inc.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* $FreeBSD$
*/
#include "opt_ddb.h"
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/pcpu.h>
#include <vm/vm.h>
#include <vm/pmap.h>
#include <machine/segments.h>
#include <machine/pmap.h>
#include <machine/vmm.h>
#include "vmm_host.h"
#include "vmcs.h"
#include "vmx_cpufunc.h"
#include "ept.h"
#include "vmx.h"
#ifdef DDB
#include <ddb/ddb.h>
#endif
static uint64_t
vmcs_fix_regval(uint32_t encoding, uint64_t val)
{
switch (encoding) {
case VMCS_GUEST_CR0:
val = vmx_fix_cr0(val);
break;
case VMCS_GUEST_CR4:
val = vmx_fix_cr4(val);
break;
default:
break;
}
return (val);
}
static uint32_t
vmcs_field_encoding(int ident)
{
switch (ident) {
case VM_REG_GUEST_CR0:
return (VMCS_GUEST_CR0);
case VM_REG_GUEST_CR3:
return (VMCS_GUEST_CR3);
case VM_REG_GUEST_CR4:
return (VMCS_GUEST_CR4);
case VM_REG_GUEST_DR7:
return (VMCS_GUEST_DR7);
case VM_REG_GUEST_RSP:
return (VMCS_GUEST_RSP);
case VM_REG_GUEST_RIP:
return (VMCS_GUEST_RIP);
case VM_REG_GUEST_RFLAGS:
return (VMCS_GUEST_RFLAGS);
case VM_REG_GUEST_ES:
return (VMCS_GUEST_ES_SELECTOR);
case VM_REG_GUEST_CS:
return (VMCS_GUEST_CS_SELECTOR);
case VM_REG_GUEST_SS:
return (VMCS_GUEST_SS_SELECTOR);
case VM_REG_GUEST_DS:
return (VMCS_GUEST_DS_SELECTOR);
case VM_REG_GUEST_FS:
return (VMCS_GUEST_FS_SELECTOR);
case VM_REG_GUEST_GS:
return (VMCS_GUEST_GS_SELECTOR);
case VM_REG_GUEST_TR:
return (VMCS_GUEST_TR_SELECTOR);
case VM_REG_GUEST_LDTR:
return (VMCS_GUEST_LDTR_SELECTOR);
case VM_REG_GUEST_EFER:
return (VMCS_GUEST_IA32_EFER);
default:
return (-1);
}
}
static int
vmcs_seg_desc_encoding(int seg, uint32_t *base, uint32_t *lim, uint32_t *acc)
{
switch (seg) {
case VM_REG_GUEST_ES:
*base = VMCS_GUEST_ES_BASE;
*lim = VMCS_GUEST_ES_LIMIT;
*acc = VMCS_GUEST_ES_ACCESS_RIGHTS;
break;
case VM_REG_GUEST_CS:
*base = VMCS_GUEST_CS_BASE;
*lim = VMCS_GUEST_CS_LIMIT;
*acc = VMCS_GUEST_CS_ACCESS_RIGHTS;
break;
case VM_REG_GUEST_SS:
*base = VMCS_GUEST_SS_BASE;
*lim = VMCS_GUEST_SS_LIMIT;
*acc = VMCS_GUEST_SS_ACCESS_RIGHTS;
break;
case VM_REG_GUEST_DS:
*base = VMCS_GUEST_DS_BASE;
*lim = VMCS_GUEST_DS_LIMIT;
*acc = VMCS_GUEST_DS_ACCESS_RIGHTS;
break;
case VM_REG_GUEST_FS:
*base = VMCS_GUEST_FS_BASE;
*lim = VMCS_GUEST_FS_LIMIT;
*acc = VMCS_GUEST_FS_ACCESS_RIGHTS;
break;
case VM_REG_GUEST_GS:
*base = VMCS_GUEST_GS_BASE;
*lim = VMCS_GUEST_GS_LIMIT;
*acc = VMCS_GUEST_GS_ACCESS_RIGHTS;
break;
case VM_REG_GUEST_TR:
*base = VMCS_GUEST_TR_BASE;
*lim = VMCS_GUEST_TR_LIMIT;
*acc = VMCS_GUEST_TR_ACCESS_RIGHTS;
break;
case VM_REG_GUEST_LDTR:
*base = VMCS_GUEST_LDTR_BASE;
*lim = VMCS_GUEST_LDTR_LIMIT;
*acc = VMCS_GUEST_LDTR_ACCESS_RIGHTS;
break;
case VM_REG_GUEST_IDTR:
*base = VMCS_GUEST_IDTR_BASE;
*lim = VMCS_GUEST_IDTR_LIMIT;
*acc = VMCS_INVALID_ENCODING;
break;
case VM_REG_GUEST_GDTR:
*base = VMCS_GUEST_GDTR_BASE;
*lim = VMCS_GUEST_GDTR_LIMIT;
*acc = VMCS_INVALID_ENCODING;
break;
default:
return (EINVAL);
}
return (0);
}
int
vmcs_getreg(struct vmcs *vmcs, int ident, uint64_t *retval)
{
int error;
uint32_t encoding;
/*
* If we need to get at vmx-specific state in the VMCS we can bypass
* the translation of 'ident' to 'encoding' by simply setting the
* sign bit. As it so happens the upper 16 bits are reserved (i.e
* set to 0) in the encodings for the VMCS so we are free to use the
* sign bit.
*/
if (ident < 0)
encoding = ident & 0x7fffffff;
else
encoding = vmcs_field_encoding(ident);
if (encoding == (uint32_t)-1)
return (EINVAL);
VMPTRLD(vmcs);
error = vmread(encoding, retval);
VMCLEAR(vmcs);
return (error);
}
int
vmcs_setreg(struct vmcs *vmcs, int ident, uint64_t val)
{
int error;
uint32_t encoding;
if (ident < 0)
encoding = ident & 0x7fffffff;
else
encoding = vmcs_field_encoding(ident);
if (encoding == (uint32_t)-1)
return (EINVAL);
val = vmcs_fix_regval(encoding, val);
VMPTRLD(vmcs);
error = vmwrite(encoding, val);
VMCLEAR(vmcs);
return (error);
}
int
vmcs_setdesc(struct vmcs *vmcs, int seg, struct seg_desc *desc)
{
int error;
uint32_t base, limit, access;
error = vmcs_seg_desc_encoding(seg, &base, &limit, &access);
if (error != 0)
panic("vmcs_setdesc: invalid segment register %d", seg);
VMPTRLD(vmcs);
if ((error = vmwrite(base, desc->base)) != 0)
goto done;
if ((error = vmwrite(limit, desc->limit)) != 0)
goto done;
if (access != VMCS_INVALID_ENCODING) {
if ((error = vmwrite(access, desc->access)) != 0)
goto done;
}
done:
VMCLEAR(vmcs);
return (error);
}
int
vmcs_getdesc(struct vmcs *vmcs, int seg, struct seg_desc *desc)
{
int error;
uint32_t base, limit, access;
uint64_t u64;
error = vmcs_seg_desc_encoding(seg, &base, &limit, &access);
if (error != 0)
panic("vmcs_getdesc: invalid segment register %d", seg);
VMPTRLD(vmcs);
if ((error = vmread(base, &u64)) != 0)
goto done;
desc->base = u64;
if ((error = vmread(limit, &u64)) != 0)
goto done;
desc->limit = u64;
if (access != VMCS_INVALID_ENCODING) {
if ((error = vmread(access, &u64)) != 0)
goto done;
desc->access = u64;
}
done:
VMCLEAR(vmcs);
return (error);
}
int
vmcs_set_msr_save(struct vmcs *vmcs, u_long g_area, u_int g_count)
{
int error;
VMPTRLD(vmcs);
/*
* Guest MSRs are saved in the VM-exit MSR-store area.
* Guest MSRs are loaded from the VM-entry MSR-load area.
* Both areas point to the same location in memory.
*/
if ((error = vmwrite(VMCS_EXIT_MSR_STORE, g_area)) != 0)
goto done;
if ((error = vmwrite(VMCS_EXIT_MSR_STORE_COUNT, g_count)) != 0)
goto done;
if ((error = vmwrite(VMCS_ENTRY_MSR_LOAD, g_area)) != 0)
goto done;
if ((error = vmwrite(VMCS_ENTRY_MSR_LOAD_COUNT, g_count)) != 0)
goto done;
error = 0;
done:
VMCLEAR(vmcs);
return (error);
}
int
vmcs_set_defaults(struct vmcs *vmcs,
u_long host_rip, u_long host_rsp, u_long ept_pml4,
uint32_t pinbased_ctls, uint32_t procbased_ctls,
uint32_t procbased_ctls2, uint32_t exit_ctls,
uint32_t entry_ctls, u_long msr_bitmap, uint16_t vpid)
{
int error, codesel, datasel, tsssel;
u_long cr0, cr4, efer;
uint64_t eptp, pat, fsbase, idtrbase;
uint32_t exc_bitmap;
codesel = vmm_get_host_codesel();
datasel = vmm_get_host_datasel();
tsssel = vmm_get_host_tsssel();
/*
* Make sure we have a "current" VMCS to work with.
*/
VMPTRLD(vmcs);
/*
* Load the VMX controls
*/
if ((error = vmwrite(VMCS_PIN_BASED_CTLS, pinbased_ctls)) != 0)
goto done;
if ((error = vmwrite(VMCS_PRI_PROC_BASED_CTLS, procbased_ctls)) != 0)
goto done;
if ((error = vmwrite(VMCS_SEC_PROC_BASED_CTLS, procbased_ctls2)) != 0)
goto done;
if ((error = vmwrite(VMCS_EXIT_CTLS, exit_ctls)) != 0)
goto done;
if ((error = vmwrite(VMCS_ENTRY_CTLS, entry_ctls)) != 0)
goto done;
/* Guest state */
/* Initialize guest IA32_PAT MSR with the default value */
pat = PAT_VALUE(0, PAT_WRITE_BACK) |
PAT_VALUE(1, PAT_WRITE_THROUGH) |
PAT_VALUE(2, PAT_UNCACHED) |
PAT_VALUE(3, PAT_UNCACHEABLE) |
PAT_VALUE(4, PAT_WRITE_BACK) |
PAT_VALUE(5, PAT_WRITE_THROUGH) |
PAT_VALUE(6, PAT_UNCACHED) |
PAT_VALUE(7, PAT_UNCACHEABLE);
if ((error = vmwrite(VMCS_GUEST_IA32_PAT, pat)) != 0)
goto done;
/* Host state */
/* Initialize host IA32_PAT MSR */
pat = vmm_get_host_pat();
if ((error = vmwrite(VMCS_HOST_IA32_PAT, pat)) != 0)
goto done;
/* Load the IA32_EFER MSR */
efer = vmm_get_host_efer();
if ((error = vmwrite(VMCS_HOST_IA32_EFER, efer)) != 0)
goto done;
/* Load the control registers */
cr0 = vmm_get_host_cr0();
if ((error = vmwrite(VMCS_HOST_CR0, cr0)) != 0)
goto done;
cr4 = vmm_get_host_cr4() | CR4_VMXE;
if ((error = vmwrite(VMCS_HOST_CR4, cr4)) != 0)
goto done;
/* Load the segment selectors */
if ((error = vmwrite(VMCS_HOST_ES_SELECTOR, datasel)) != 0)
goto done;
if ((error = vmwrite(VMCS_HOST_CS_SELECTOR, codesel)) != 0)
goto done;
if ((error = vmwrite(VMCS_HOST_SS_SELECTOR, datasel)) != 0)
goto done;
if ((error = vmwrite(VMCS_HOST_DS_SELECTOR, datasel)) != 0)
goto done;
if ((error = vmwrite(VMCS_HOST_FS_SELECTOR, datasel)) != 0)
goto done;
if ((error = vmwrite(VMCS_HOST_GS_SELECTOR, datasel)) != 0)
goto done;
if ((error = vmwrite(VMCS_HOST_TR_SELECTOR, tsssel)) != 0)
goto done;
/*
* Load the Base-Address for %fs and idtr.
*
* Note that we exclude %gs, tss and gdtr here because their base
* address is pcpu specific.
*/
fsbase = vmm_get_host_fsbase();
if ((error = vmwrite(VMCS_HOST_FS_BASE, fsbase)) != 0)
goto done;
idtrbase = vmm_get_host_idtrbase();
if ((error = vmwrite(VMCS_HOST_IDTR_BASE, idtrbase)) != 0)
goto done;
/* instruction pointer */
if ((error = vmwrite(VMCS_HOST_RIP, host_rip)) != 0)
goto done;
/* stack pointer */
if ((error = vmwrite(VMCS_HOST_RSP, host_rsp)) != 0)
goto done;
/* eptp */
eptp = EPTP(ept_pml4);
if ((error = vmwrite(VMCS_EPTP, eptp)) != 0)
goto done;
/* vpid */
if ((error = vmwrite(VMCS_VPID, vpid)) != 0)
goto done;
/* msr bitmap */
if ((error = vmwrite(VMCS_MSR_BITMAP, msr_bitmap)) != 0)
goto done;
/* exception bitmap */
exc_bitmap = 1 << IDT_MC;
if ((error = vmwrite(VMCS_EXCEPTION_BITMAP, exc_bitmap)) != 0)
goto done;
/* link pointer */
if ((error = vmwrite(VMCS_LINK_POINTER, ~0)) != 0)
goto done;
done:
VMCLEAR(vmcs);
return (error);
}
uint64_t
vmcs_read(uint32_t encoding)
{
int error;
uint64_t val;
error = vmread(encoding, &val);
if (error != 0)
panic("vmcs_read(%u) error %d", encoding, error);
return (val);
}
#ifdef DDB
extern int vmxon_enabled[];
DB_SHOW_COMMAND(vmcs, db_show_vmcs)
{
uint64_t cur_vmcs, val;
uint32_t exit;
if (!vmxon_enabled[curcpu]) {
db_printf("VMX not enabled\n");
return;
}
if (have_addr) {
db_printf("Only current VMCS supported\n");
return;
}
vmptrst(&cur_vmcs);
if (cur_vmcs == VMCS_INITIAL) {
db_printf("No current VM context\n");
return;
}
db_printf("VMCS: %jx\n", cur_vmcs);
db_printf("VPID: %lu\n", vmcs_read(VMCS_VPID));
db_printf("Activity: ");
val = vmcs_read(VMCS_GUEST_ACTIVITY);
switch (val) {
case 0:
db_printf("Active");
break;
case 1:
db_printf("HLT");
break;
case 2:
db_printf("Shutdown");
break;
case 3:
db_printf("Wait for SIPI");
break;
default:
db_printf("Unknown: %#lx", val);
}
db_printf("\n");
exit = vmcs_read(VMCS_EXIT_REASON);
if (exit & 0x80000000)
db_printf("Entry Failure Reason: %u\n", exit & 0xffff);
else
db_printf("Exit Reason: %u\n", exit & 0xffff);
db_printf("Qualification: %#lx\n", vmcs_exit_qualification());
db_printf("Guest Linear Address: %#lx\n",
vmcs_read(VMCS_GUEST_LINEAR_ADDRESS));
switch (exit & 0x8000ffff) {
case EXIT_REASON_EXCEPTION:
case EXIT_REASON_EXT_INTR:
val = vmcs_read(VMCS_EXIT_INTERRUPTION_INFO);
db_printf("Interrupt Type: ");
switch (val >> 8 & 0x7) {
case 0:
db_printf("external");
break;
case 2:
db_printf("NMI");
break;
case 3:
db_printf("HW exception");
break;
case 4:
db_printf("SW exception");
break;
default:
db_printf("?? %lu", val >> 8 & 0x7);
break;
}
db_printf(" Vector: %lu", val & 0xff);
if (val & 0x800)
db_printf(" Error Code: %lx",
vmcs_read(VMCS_EXIT_INTERRUPTION_ERROR));
db_printf("\n");
break;
case EXIT_REASON_EPT_FAULT:
case EXIT_REASON_EPT_MISCONFIG:
db_printf("Guest Physical Address: %#lx\n",
vmcs_read(VMCS_GUEST_PHYSICAL_ADDRESS));
break;
}
db_printf("VM-instruction error: %#lx\n", vmcs_instruction_error());
}
#endif

338
sys/amd64/vmm/intel/vmcs.h Normal file
View File

@ -0,0 +1,338 @@
/*-
* Copyright (c) 2011 NetApp, Inc.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* $FreeBSD$
*/
#ifndef _VMCS_H_
#define _VMCS_H_
#ifdef _KERNEL
struct vmcs {
uint32_t identifier;
uint32_t abort_code;
char _impl_specific[PAGE_SIZE - sizeof(uint32_t) * 2];
};
CTASSERT(sizeof(struct vmcs) == PAGE_SIZE);
/* MSR save region is composed of an array of 'struct msr_entry' */
struct msr_entry {
uint32_t index;
uint32_t reserved;
uint64_t val;
};
int vmcs_set_msr_save(struct vmcs *vmcs, u_long g_area, u_int g_count);
int vmcs_set_defaults(struct vmcs *vmcs, u_long host_rip, u_long host_rsp,
u_long ept_pml4,
uint32_t pinbased_ctls, uint32_t procbased_ctls,
uint32_t procbased_ctls2, uint32_t exit_ctls,
uint32_t entry_ctls, u_long msr_bitmap,
uint16_t vpid);
int vmcs_getreg(struct vmcs *vmcs, int ident, uint64_t *retval);
int vmcs_setreg(struct vmcs *vmcs, int ident, uint64_t val);
int vmcs_getdesc(struct vmcs *vmcs, int ident,
struct seg_desc *desc);
int vmcs_setdesc(struct vmcs *vmcs, int ident,
struct seg_desc *desc);
uint64_t vmcs_read(uint32_t encoding);
#define vmexit_instruction_length() vmcs_read(VMCS_EXIT_INSTRUCTION_LENGTH)
#define vmcs_guest_rip() vmcs_read(VMCS_GUEST_RIP)
#define vmcs_instruction_error() vmcs_read(VMCS_INSTRUCTION_ERROR)
#define vmcs_exit_reason() (vmcs_read(VMCS_EXIT_REASON) & 0xffff)
#define vmcs_exit_qualification() vmcs_read(VMCS_EXIT_QUALIFICATION)
#define vmcs_guest_cr3() vmcs_read(VMCS_GUEST_CR3)
#define vmcs_gpa() vmcs_read(VMCS_GUEST_PHYSICAL_ADDRESS)
#define vmcs_gla() vmcs_read(VMCS_GUEST_LINEAR_ADDRESS)
#endif /* _KERNEL */
#define VMCS_INITIAL 0xffffffffffffffff
#define VMCS_IDENT(encoding) ((encoding) | 0x80000000)
/*
* VMCS field encodings from Appendix H, Intel Architecture Manual Vol3B.
*/
#define VMCS_INVALID_ENCODING 0xffffffff
/* 16-bit control fields */
#define VMCS_VPID 0x00000000
/* 16-bit guest-state fields */
#define VMCS_GUEST_ES_SELECTOR 0x00000800
#define VMCS_GUEST_CS_SELECTOR 0x00000802
#define VMCS_GUEST_SS_SELECTOR 0x00000804
#define VMCS_GUEST_DS_SELECTOR 0x00000806
#define VMCS_GUEST_FS_SELECTOR 0x00000808
#define VMCS_GUEST_GS_SELECTOR 0x0000080A
#define VMCS_GUEST_LDTR_SELECTOR 0x0000080C
#define VMCS_GUEST_TR_SELECTOR 0x0000080E
/* 16-bit host-state fields */
#define VMCS_HOST_ES_SELECTOR 0x00000C00
#define VMCS_HOST_CS_SELECTOR 0x00000C02
#define VMCS_HOST_SS_SELECTOR 0x00000C04
#define VMCS_HOST_DS_SELECTOR 0x00000C06
#define VMCS_HOST_FS_SELECTOR 0x00000C08
#define VMCS_HOST_GS_SELECTOR 0x00000C0A
#define VMCS_HOST_TR_SELECTOR 0x00000C0C
/* 64-bit control fields */
#define VMCS_IO_BITMAP_A 0x00002000
#define VMCS_IO_BITMAP_B 0x00002002
#define VMCS_MSR_BITMAP 0x00002004
#define VMCS_EXIT_MSR_STORE 0x00002006
#define VMCS_EXIT_MSR_LOAD 0x00002008
#define VMCS_ENTRY_MSR_LOAD 0x0000200A
#define VMCS_EXECUTIVE_VMCS 0x0000200C
#define VMCS_TSC_OFFSET 0x00002010
#define VMCS_VIRTUAL_APIC 0x00002012
#define VMCS_APIC_ACCESS 0x00002014
#define VMCS_EPTP 0x0000201A
/* 64-bit read-only fields */
#define VMCS_GUEST_PHYSICAL_ADDRESS 0x00002400
/* 64-bit guest-state fields */
#define VMCS_LINK_POINTER 0x00002800
#define VMCS_GUEST_IA32_DEBUGCTL 0x00002802
#define VMCS_GUEST_IA32_PAT 0x00002804
#define VMCS_GUEST_IA32_EFER 0x00002806
#define VMCS_GUEST_IA32_PERF_GLOBAL_CTRL 0x00002808
#define VMCS_GUEST_PDPTE0 0x0000280A
#define VMCS_GUEST_PDPTE1 0x0000280C
#define VMCS_GUEST_PDPTE2 0x0000280E
#define VMCS_GUEST_PDPTE3 0x00002810
/* 64-bit host-state fields */
#define VMCS_HOST_IA32_PAT 0x00002C00
#define VMCS_HOST_IA32_EFER 0x00002C02
#define VMCS_HOST_IA32_PERF_GLOBAL_CTRL 0x00002C04
/* 32-bit control fields */
#define VMCS_PIN_BASED_CTLS 0x00004000
#define VMCS_PRI_PROC_BASED_CTLS 0x00004002
#define VMCS_EXCEPTION_BITMAP 0x00004004
#define VMCS_PF_ERROR_MASK 0x00004006
#define VMCS_PF_ERROR_MATCH 0x00004008
#define VMCS_CR3_TARGET_COUNT 0x0000400A
#define VMCS_EXIT_CTLS 0x0000400C
#define VMCS_EXIT_MSR_STORE_COUNT 0x0000400E
#define VMCS_EXIT_MSR_LOAD_COUNT 0x00004010
#define VMCS_ENTRY_CTLS 0x00004012
#define VMCS_ENTRY_MSR_LOAD_COUNT 0x00004014
#define VMCS_ENTRY_INTR_INFO 0x00004016
#define VMCS_ENTRY_EXCEPTION_ERROR 0x00004018
#define VMCS_ENTRY_INST_LENGTH 0x0000401A
#define VMCS_TPR_THRESHOLD 0x0000401C
#define VMCS_SEC_PROC_BASED_CTLS 0x0000401E
#define VMCS_PLE_GAP 0x00004020
#define VMCS_PLE_WINDOW 0x00004022
/* 32-bit read-only data fields */
#define VMCS_INSTRUCTION_ERROR 0x00004400
#define VMCS_EXIT_REASON 0x00004402
#define VMCS_EXIT_INTERRUPTION_INFO 0x00004404
#define VMCS_EXIT_INTERRUPTION_ERROR 0x00004406
#define VMCS_IDT_VECTORING_INFO 0x00004408
#define VMCS_IDT_VECTORING_ERROR 0x0000440A
#define VMCS_EXIT_INSTRUCTION_LENGTH 0x0000440C
#define VMCS_EXIT_INSTRUCTION_INFO 0x0000440E
/* 32-bit guest-state fields */
#define VMCS_GUEST_ES_LIMIT 0x00004800
#define VMCS_GUEST_CS_LIMIT 0x00004802
#define VMCS_GUEST_SS_LIMIT 0x00004804
#define VMCS_GUEST_DS_LIMIT 0x00004806
#define VMCS_GUEST_FS_LIMIT 0x00004808
#define VMCS_GUEST_GS_LIMIT 0x0000480A
#define VMCS_GUEST_LDTR_LIMIT 0x0000480C
#define VMCS_GUEST_TR_LIMIT 0x0000480E
#define VMCS_GUEST_GDTR_LIMIT 0x00004810
#define VMCS_GUEST_IDTR_LIMIT 0x00004812
#define VMCS_GUEST_ES_ACCESS_RIGHTS 0x00004814
#define VMCS_GUEST_CS_ACCESS_RIGHTS 0x00004816
#define VMCS_GUEST_SS_ACCESS_RIGHTS 0x00004818
#define VMCS_GUEST_DS_ACCESS_RIGHTS 0x0000481A
#define VMCS_GUEST_FS_ACCESS_RIGHTS 0x0000481C
#define VMCS_GUEST_GS_ACCESS_RIGHTS 0x0000481E
#define VMCS_GUEST_LDTR_ACCESS_RIGHTS 0x00004820
#define VMCS_GUEST_TR_ACCESS_RIGHTS 0x00004822
#define VMCS_GUEST_INTERRUPTIBILITY 0x00004824
#define VMCS_GUEST_ACTIVITY 0x00004826
#define VMCS_GUEST_SMBASE 0x00004828
#define VMCS_GUEST_IA32_SYSENTER_CS 0x0000482A
#define VMCS_PREEMPTION_TIMER_VALUE 0x0000482E
/* 32-bit host state fields */
#define VMCS_HOST_IA32_SYSENTER_CS 0x00004C00
/* Natural Width control fields */
#define VMCS_CR0_MASK 0x00006000
#define VMCS_CR4_MASK 0x00006002
#define VMCS_CR0_SHADOW 0x00006004
#define VMCS_CR4_SHADOW 0x00006006
#define VMCS_CR3_TARGET0 0x00006008
#define VMCS_CR3_TARGET1 0x0000600A
#define VMCS_CR3_TARGET2 0x0000600C
#define VMCS_CR3_TARGET3 0x0000600E
/* Natural Width read-only fields */
#define VMCS_EXIT_QUALIFICATION 0x00006400
#define VMCS_IO_RCX 0x00006402
#define VMCS_IO_RSI 0x00006404
#define VMCS_IO_RDI 0x00006406
#define VMCS_IO_RIP 0x00006408
#define VMCS_GUEST_LINEAR_ADDRESS 0x0000640A
/* Natural Width guest-state fields */
#define VMCS_GUEST_CR0 0x00006800
#define VMCS_GUEST_CR3 0x00006802
#define VMCS_GUEST_CR4 0x00006804
#define VMCS_GUEST_ES_BASE 0x00006806
#define VMCS_GUEST_CS_BASE 0x00006808
#define VMCS_GUEST_SS_BASE 0x0000680A
#define VMCS_GUEST_DS_BASE 0x0000680C
#define VMCS_GUEST_FS_BASE 0x0000680E
#define VMCS_GUEST_GS_BASE 0x00006810
#define VMCS_GUEST_LDTR_BASE 0x00006812
#define VMCS_GUEST_TR_BASE 0x00006814
#define VMCS_GUEST_GDTR_BASE 0x00006816
#define VMCS_GUEST_IDTR_BASE 0x00006818
#define VMCS_GUEST_DR7 0x0000681A
#define VMCS_GUEST_RSP 0x0000681C
#define VMCS_GUEST_RIP 0x0000681E
#define VMCS_GUEST_RFLAGS 0x00006820
#define VMCS_GUEST_PENDING_DBG_EXCEPTIONS 0x00006822
#define VMCS_GUEST_IA32_SYSENTER_ESP 0x00006824
#define VMCS_GUEST_IA32_SYSENTER_EIP 0x00006826
/* Natural Width host-state fields */
#define VMCS_HOST_CR0 0x00006C00
#define VMCS_HOST_CR3 0x00006C02
#define VMCS_HOST_CR4 0x00006C04
#define VMCS_HOST_FS_BASE 0x00006C06
#define VMCS_HOST_GS_BASE 0x00006C08
#define VMCS_HOST_TR_BASE 0x00006C0A
#define VMCS_HOST_GDTR_BASE 0x00006C0C
#define VMCS_HOST_IDTR_BASE 0x00006C0E
#define VMCS_HOST_IA32_SYSENTER_ESP 0x00006C10
#define VMCS_HOST_IA32_SYSENTER_EIP 0x00006C12
#define VMCS_HOST_RSP 0x00006C14
#define VMCS_HOST_RIP 0x00006c16
/*
* VM instruction error numbers
*/
#define VMRESUME_WITH_NON_LAUNCHED_VMCS 5
/*
* VMCS exit reasons
*/
#define EXIT_REASON_EXCEPTION 0
#define EXIT_REASON_EXT_INTR 1
#define EXIT_REASON_TRIPLE_FAULT 2
#define EXIT_REASON_INIT 3
#define EXIT_REASON_SIPI 4
#define EXIT_REASON_IO_SMI 5
#define EXIT_REASON_SMI 6
#define EXIT_REASON_INTR_WINDOW 7
#define EXIT_REASON_NMI_WINDOW 8
#define EXIT_REASON_TASK_SWITCH 9
#define EXIT_REASON_CPUID 10
#define EXIT_REASON_GETSEC 11
#define EXIT_REASON_HLT 12
#define EXIT_REASON_INVD 13
#define EXIT_REASON_INVLPG 14
#define EXIT_REASON_RDPMC 15
#define EXIT_REASON_RDTSC 16
#define EXIT_REASON_RSM 17
#define EXIT_REASON_VMCALL 18
#define EXIT_REASON_VMCLEAR 19
#define EXIT_REASON_VMLAUNCH 20
#define EXIT_REASON_VMPTRLD 21
#define EXIT_REASON_VMPTRST 22
#define EXIT_REASON_VMREAD 23
#define EXIT_REASON_VMRESUME 24
#define EXIT_REASON_VMWRITE 25
#define EXIT_REASON_VMXOFF 26
#define EXIT_REASON_VMXON 27
#define EXIT_REASON_CR_ACCESS 28
#define EXIT_REASON_DR_ACCESS 29
#define EXIT_REASON_INOUT 30
#define EXIT_REASON_RDMSR 31
#define EXIT_REASON_WRMSR 32
#define EXIT_REASON_INVAL_VMCS 33
#define EXIT_REASON_INVAL_MSR 34
#define EXIT_REASON_MWAIT 36
#define EXIT_REASON_MTF 37
#define EXIT_REASON_MONITOR 39
#define EXIT_REASON_PAUSE 40
#define EXIT_REASON_MCE 41
#define EXIT_REASON_TPR 43
#define EXIT_REASON_APIC 44
#define EXIT_REASON_GDTR_IDTR 46
#define EXIT_REASON_LDTR_TR 47
#define EXIT_REASON_EPT_FAULT 48
#define EXIT_REASON_EPT_MISCONFIG 49
#define EXIT_REASON_INVEPT 50
#define EXIT_REASON_RDTSCP 51
#define EXIT_REASON_VMX_PREEMPT 52
#define EXIT_REASON_INVVPID 53
#define EXIT_REASON_WBINVD 54
#define EXIT_REASON_XSETBV 55
/*
* VMCS interrupt information fields
*/
#define VMCS_INTERRUPTION_INFO_VALID (1U << 31)
#define VMCS_INTERRUPTION_INFO_HW_INTR (0 << 8)
#define VMCS_INTERRUPTION_INFO_NMI (2 << 8)
/*
* VMCS Guest interruptibility field
*/
#define VMCS_INTERRUPTIBILITY_STI_BLOCKING (1 << 0)
#define VMCS_INTERRUPTIBILITY_MOVSS_BLOCKING (1 << 1)
#define VMCS_INTERRUPTIBILITY_SMI_BLOCKING (1 << 2)
#define VMCS_INTERRUPTIBILITY_NMI_BLOCKING (1 << 3)
/*
* Exit qualification for EXIT_REASON_INVAL_VMCS
*/
#define EXIT_QUAL_NMI_WHILE_STI_BLOCKING 3
/*
* Exit qualification for EPT violation
*/
#define EPT_VIOLATION_DATA_READ (1UL << 0)
#define EPT_VIOLATION_DATA_WRITE (1UL << 1)
#define EPT_VIOLATION_INST_FETCH (1UL << 2)
#define EPT_VIOLATION_GLA_VALID (1UL << 7)
#define EPT_VIOLATION_XLAT_VALID (1UL << 8)
#endif

1845
sys/amd64/vmm/intel/vmx.c Normal file

File diff suppressed because it is too large Load Diff

120
sys/amd64/vmm/intel/vmx.h Normal file
View File

@ -0,0 +1,120 @@
/*-
* Copyright (c) 2011 NetApp, Inc.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* $FreeBSD$
*/
#ifndef _VMX_H_
#define _VMX_H_
#include "vmcs.h"
#define GUEST_MSR_MAX_ENTRIES 64 /* arbitrary */
struct vmxctx {
register_t tmpstk[32]; /* vmx_return() stack */
register_t tmpstktop;
register_t guest_rdi; /* Guest state */
register_t guest_rsi;
register_t guest_rdx;
register_t guest_rcx;
register_t guest_r8;
register_t guest_r9;
register_t guest_rax;
register_t guest_rbx;
register_t guest_rbp;
register_t guest_r10;
register_t guest_r11;
register_t guest_r12;
register_t guest_r13;
register_t guest_r14;
register_t guest_r15;
register_t guest_cr2;
register_t host_r15; /* Host state */
register_t host_r14;
register_t host_r13;
register_t host_r12;
register_t host_rbp;
register_t host_rsp;
register_t host_rbx;
register_t host_rip;
/*
* XXX todo debug registers and fpu state
*/
int launched; /* vmcs launch state */
int launch_error;
};
struct vmxcap {
int set;
uint32_t proc_ctls;
};
struct vmxstate {
int lastcpu; /* host cpu that this 'vcpu' last ran on */
uint16_t vpid;
};
/* virtual machine softc */
struct vmx {
pml4_entry_t pml4ept[NPML4EPG];
struct vmcs vmcs[VM_MAXCPU]; /* one vmcs per virtual cpu */
char msr_bitmap[PAGE_SIZE];
struct msr_entry guest_msrs[VM_MAXCPU][GUEST_MSR_MAX_ENTRIES];
struct vmxctx ctx[VM_MAXCPU];
struct vmxcap cap[VM_MAXCPU];
struct vmxstate state[VM_MAXCPU];
struct vm *vm;
};
CTASSERT((offsetof(struct vmx, pml4ept) & PAGE_MASK) == 0);
CTASSERT((offsetof(struct vmx, vmcs) & PAGE_MASK) == 0);
CTASSERT((offsetof(struct vmx, msr_bitmap) & PAGE_MASK) == 0);
CTASSERT((offsetof(struct vmx, guest_msrs) & 15) == 0);
#define VMX_RETURN_DIRECT 0
#define VMX_RETURN_LONGJMP 1
#define VMX_RETURN_VMRESUME 2
#define VMX_RETURN_VMLAUNCH 3
#define VMX_RETURN_AST 4
/*
* vmx_setjmp() returns:
* - 0 when it returns directly
* - 1 when it returns from vmx_longjmp
* - 2 when it returns from vmx_resume (which would only be in the error case)
* - 3 when it returns from vmx_launch (which would only be in the error case)
* - 4 when it returns from vmx_resume or vmx_launch because of AST pending
*/
int vmx_setjmp(struct vmxctx *ctx);
void vmx_longjmp(void); /* returns via vmx_setjmp */
void vmx_launch(struct vmxctx *ctx) __dead2; /* may return via vmx_setjmp */
void vmx_resume(struct vmxctx *ctx) __dead2; /* may return via vmx_setjmp */
u_long vmx_fix_cr0(u_long cr0);
u_long vmx_fix_cr4(u_long cr4);
#endif

View File

@ -0,0 +1,92 @@
/*-
* Copyright (c) 2011 NetApp, Inc.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* $FreeBSD$
*/
#ifndef _VMX_CONTROLS_H_
#define _VMX_CONTROLS_H_
/* Pin-Based VM-Execution Controls */
#define PINBASED_EXTINT_EXITING (1 << 0)
#define PINBASED_NMI_EXITING (1 << 3)
#define PINBASED_VIRTUAL_NMI (1 << 5)
#define PINBASED_PREMPTION_TIMER (1 << 6)
/* Primary Processor-Based VM-Execution Controls */
#define PROCBASED_INT_WINDOW_EXITING (1 << 2)
#define PROCBASED_TSC_OFFSET (1 << 3)
#define PROCBASED_HLT_EXITING (1 << 7)
#define PROCBASED_INVLPG_EXITING (1 << 9)
#define PROCBASED_MWAIT_EXITING (1 << 10)
#define PROCBASED_RDPMC_EXITING (1 << 11)
#define PROCBASED_RDTSC_EXITING (1 << 12)
#define PROCBASED_CR3_LOAD_EXITING (1 << 15)
#define PROCBASED_CR3_STORE_EXITING (1 << 16)
#define PROCBASED_CR8_LOAD_EXITING (1 << 19)
#define PROCBASED_CR8_STORE_EXITING (1 << 20)
#define PROCBASED_USE_TPR_SHADOW (1 << 21)
#define PROCBASED_NMI_WINDOW_EXITING (1 << 22)
#define PROCBASED_MOV_DR_EXITING (1 << 23)
#define PROCBASED_IO_EXITING (1 << 24)
#define PROCBASED_IO_BITMAPS (1 << 25)
#define PROCBASED_MTF (1 << 27)
#define PROCBASED_MSR_BITMAPS (1 << 28)
#define PROCBASED_MONITOR_EXITING (1 << 29)
#define PROCBASED_PAUSE_EXITING (1 << 30)
#define PROCBASED_SECONDARY_CONTROLS (1 << 31)
/* Secondary Processor-Based VM-Execution Controls */
#define PROCBASED2_VIRTUALIZE_APIC (1 << 0)
#define PROCBASED2_ENABLE_EPT (1 << 1)
#define PROCBASED2_DESC_TABLE_EXITING (1 << 2)
#define PROCBASED2_ENABLE_RDTSCP (1 << 3)
#define PROCBASED2_VIRTUALIZE_X2APIC (1 << 4)
#define PROCBASED2_ENABLE_VPID (1 << 5)
#define PROCBASED2_WBINVD_EXITING (1 << 6)
#define PROCBASED2_UNRESTRICTED_GUEST (1 << 7)
#define PROCBASED2_PAUSE_LOOP_EXITING (1 << 10)
/* VM Exit Controls */
#define VM_EXIT_SAVE_DEBUG_CONTROLS (1 << 2)
#define VM_EXIT_HOST_LMA (1 << 9)
#define VM_EXIT_LOAD_PERF_GLOBAL_CTRL (1 << 12)
#define VM_EXIT_ACKNOWLEDGE_INTERRUPT (1 << 15)
#define VM_EXIT_SAVE_PAT (1 << 18)
#define VM_EXIT_LOAD_PAT (1 << 19)
#define VM_EXIT_SAVE_EFER (1 << 20)
#define VM_EXIT_LOAD_EFER (1 << 21)
#define VM_EXIT_SAVE_PREEMPTION_TIMER (1 << 22)
/* VM Entry Controls */
#define VM_ENTRY_LOAD_DEBUG_CONTROLS (1 << 2)
#define VM_ENTRY_GUEST_LMA (1 << 9)
#define VM_ENTRY_INTO_SMM (1 << 10)
#define VM_ENTRY_DEACTIVATE_DUAL_MONITOR (1 << 11)
#define VM_ENTRY_LOAD_PERF_GLOBAL_CTRL (1 << 13)
#define VM_ENTRY_LOAD_PAT (1 << 14)
#define VM_ENTRY_LOAD_EFER (1 << 15)
#endif

View File

@ -0,0 +1,218 @@
/*-
* Copyright (c) 2011 NetApp, Inc.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* $FreeBSD$
*/
#ifndef _VMX_CPUFUNC_H_
#define _VMX_CPUFUNC_H_
struct vmcs;
/*
* Section 5.2 "Conventions" from Intel Architecture Manual 2B.
*
* error
* VMsucceed 0
* VMFailInvalid 1
* VMFailValid 2 see also VMCS VM-Instruction Error Field
*/
#define VM_SUCCESS 0
#define VM_FAIL_INVALID 1
#define VM_FAIL_VALID 2
#define VMX_SET_ERROR_CODE \
" jnc 1f;" \
" mov $1, %[error];" /* CF: error = 1 */ \
" jmp 3f;" \
"1: jnz 2f;" \
" mov $2, %[error];" /* ZF: error = 2 */ \
" jmp 3f;" \
"2: mov $0, %[error];" \
"3:"
/* returns 0 on success and non-zero on failure */
static __inline int
vmxon(char *region)
{
int error;
uint64_t addr;
addr = vtophys(region);
__asm __volatile("vmxon %[addr];"
VMX_SET_ERROR_CODE
: [error] "=r" (error)
: [addr] "m" (*(uint64_t *)&addr)
: "memory");
return (error);
}
/* returns 0 on success and non-zero on failure */
static __inline int
vmclear(struct vmcs *vmcs)
{
int error;
uint64_t addr;
addr = vtophys(vmcs);
__asm __volatile("vmclear %[addr];"
VMX_SET_ERROR_CODE
: [error] "=r" (error)
: [addr] "m" (*(uint64_t *)&addr)
: "memory");
return (error);
}
static __inline void
vmxoff(void)
{
__asm __volatile("vmxoff");
}
static __inline void
vmptrst(uint64_t *addr)
{
__asm __volatile("vmptrst %[addr]" :: [addr]"m" (*addr) : "memory");
}
static __inline int
vmptrld(struct vmcs *vmcs)
{
int error;
uint64_t addr;
addr = vtophys(vmcs);
__asm __volatile("vmptrld %[addr];"
VMX_SET_ERROR_CODE
: [error] "=r" (error)
: [addr] "m" (*(uint64_t *)&addr)
: "memory");
return (error);
}
static __inline int
vmwrite(uint64_t reg, uint64_t val)
{
int error;
__asm __volatile("vmwrite %[val], %[reg];"
VMX_SET_ERROR_CODE
: [error] "=r" (error)
: [val] "r" (val), [reg] "r" (reg)
: "memory");
return (error);
}
static __inline int
vmread(uint64_t r, uint64_t *addr)
{
int error;
__asm __volatile("vmread %[r], %[addr];"
VMX_SET_ERROR_CODE
: [error] "=r" (error)
: [r] "r" (r), [addr] "m" (*addr)
: "memory");
return (error);
}
static void __inline
VMCLEAR(struct vmcs *vmcs)
{
int err;
err = vmclear(vmcs);
if (err != 0)
panic("%s: vmclear(%p) error %d", __func__, vmcs, err);
critical_exit();
}
static void __inline
VMPTRLD(struct vmcs *vmcs)
{
int err;
critical_enter();
err = vmptrld(vmcs);
if (err != 0)
panic("%s: vmptrld(%p) error %d", __func__, vmcs, err);
}
#define INVVPID_TYPE_ADDRESS 0UL
#define INVVPID_TYPE_SINGLE_CONTEXT 1UL
#define INVVPID_TYPE_ALL_CONTEXTS 2UL
struct invvpid_desc {
uint16_t vpid;
uint16_t _res1;
uint32_t _res2;
uint64_t linear_addr;
};
CTASSERT(sizeof(struct invvpid_desc) == 16);
static void __inline
invvpid(uint64_t type, struct invvpid_desc desc)
{
int error;
__asm __volatile("invvpid %[desc], %[type];"
VMX_SET_ERROR_CODE
: [error] "=r" (error)
: [desc] "m" (desc), [type] "r" (type)
: "memory");
if (error)
panic("invvpid error %d", error);
}
#define INVEPT_TYPE_SINGLE_CONTEXT 1UL
#define INVEPT_TYPE_ALL_CONTEXTS 2UL
struct invept_desc {
uint64_t eptp;
uint64_t _res;
};
CTASSERT(sizeof(struct invept_desc) == 16);
static void __inline
invept(uint64_t type, struct invept_desc desc)
{
int error;
__asm __volatile("invept %[desc], %[type];"
VMX_SET_ERROR_CODE
: [error] "=r" (error)
: [desc] "m" (desc), [type] "r" (type)
: "memory");
if (error)
panic("invept error %d", error);
}
#endif

View File

@ -0,0 +1,89 @@
/*-
* Copyright (c) 2011 NetApp, Inc.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* $FreeBSD$
*/
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/malloc.h>
#include <sys/proc.h>
#include <sys/assym.h>
#include <vm/vm.h>
#include <vm/pmap.h>
#include <machine/pmap.h>
#include <machine/vmm.h>
#include "vmx.h"
#include "vmx_cpufunc.h"
ASSYM(VMXCTX_TMPSTKTOP, offsetof(struct vmxctx, tmpstktop));
ASSYM(VMXCTX_GUEST_RDI, offsetof(struct vmxctx, guest_rdi));
ASSYM(VMXCTX_GUEST_RSI, offsetof(struct vmxctx, guest_rsi));
ASSYM(VMXCTX_GUEST_RDX, offsetof(struct vmxctx, guest_rdx));
ASSYM(VMXCTX_GUEST_RCX, offsetof(struct vmxctx, guest_rcx));
ASSYM(VMXCTX_GUEST_R8, offsetof(struct vmxctx, guest_r8));
ASSYM(VMXCTX_GUEST_R9, offsetof(struct vmxctx, guest_r9));
ASSYM(VMXCTX_GUEST_RAX, offsetof(struct vmxctx, guest_rax));
ASSYM(VMXCTX_GUEST_RBX, offsetof(struct vmxctx, guest_rbx));
ASSYM(VMXCTX_GUEST_RBP, offsetof(struct vmxctx, guest_rbp));
ASSYM(VMXCTX_GUEST_R10, offsetof(struct vmxctx, guest_r10));
ASSYM(VMXCTX_GUEST_R11, offsetof(struct vmxctx, guest_r11));
ASSYM(VMXCTX_GUEST_R12, offsetof(struct vmxctx, guest_r12));
ASSYM(VMXCTX_GUEST_R13, offsetof(struct vmxctx, guest_r13));
ASSYM(VMXCTX_GUEST_R14, offsetof(struct vmxctx, guest_r14));
ASSYM(VMXCTX_GUEST_R15, offsetof(struct vmxctx, guest_r15));
ASSYM(VMXCTX_GUEST_CR2, offsetof(struct vmxctx, guest_cr2));
ASSYM(VMXCTX_HOST_R15, offsetof(struct vmxctx, host_r15));
ASSYM(VMXCTX_HOST_R14, offsetof(struct vmxctx, host_r14));
ASSYM(VMXCTX_HOST_R13, offsetof(struct vmxctx, host_r13));
ASSYM(VMXCTX_HOST_R12, offsetof(struct vmxctx, host_r12));
ASSYM(VMXCTX_HOST_RBP, offsetof(struct vmxctx, host_rbp));
ASSYM(VMXCTX_HOST_RSP, offsetof(struct vmxctx, host_rsp));
ASSYM(VMXCTX_HOST_RBX, offsetof(struct vmxctx, host_rbx));
ASSYM(VMXCTX_HOST_RIP, offsetof(struct vmxctx, host_rip));
ASSYM(VMXCTX_LAUNCH_ERROR, offsetof(struct vmxctx, launch_error));
ASSYM(VM_SUCCESS, VM_SUCCESS);
ASSYM(VM_FAIL_INVALID, VM_FAIL_INVALID);
ASSYM(VM_FAIL_VALID, VM_FAIL_VALID);
ASSYM(VMX_RETURN_DIRECT, VMX_RETURN_DIRECT);
ASSYM(VMX_RETURN_LONGJMP, VMX_RETURN_LONGJMP);
ASSYM(VMX_RETURN_VMRESUME, VMX_RETURN_VMRESUME);
ASSYM(VMX_RETURN_VMLAUNCH, VMX_RETURN_VMLAUNCH);
ASSYM(VMX_RETURN_AST, VMX_RETURN_AST);
ASSYM(TDF_ASTPENDING, TDF_ASTPENDING);
ASSYM(TDF_NEEDRESCHED, TDF_NEEDRESCHED);
ASSYM(TD_FLAGS, offsetof(struct thread, td_flags));
ASSYM(PC_CURTHREAD, offsetof(struct pcpu, pc_curthread));

View File

@ -0,0 +1,172 @@
/*-
* Copyright (c) 2011 NetApp, Inc.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* $FreeBSD$
*/
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
#include <sys/param.h>
#include <sys/systm.h>
#include <machine/cpufunc.h>
#include "vmx_msr.h"
static boolean_t
vmx_ctl_allows_one_setting(uint64_t msr_val, int bitpos)
{
if (msr_val & (1UL << (bitpos + 32)))
return (TRUE);
else
return (FALSE);
}
static boolean_t
vmx_ctl_allows_zero_setting(uint64_t msr_val, int bitpos)
{
if ((msr_val & (1UL << bitpos)) == 0)
return (TRUE);
else
return (FALSE);
}
uint32_t
vmx_revision(void)
{
return (rdmsr(MSR_VMX_BASIC) & 0xffffffff);
}
/*
* Generate a bitmask to be used for the VMCS execution control fields.
*
* The caller specifies what bits should be set to one in 'ones_mask'
* and what bits should be set to zero in 'zeros_mask'. The don't-care
* bits are set to the default value. The default values are obtained
* based on "Algorithm 3" in Section 27.5.1 "Algorithms for Determining
* VMX Capabilities".
*
* Returns zero on success and non-zero on error.
*/
int
vmx_set_ctlreg(int ctl_reg, int true_ctl_reg, uint32_t ones_mask,
uint32_t zeros_mask, uint32_t *retval)
{
int i;
uint64_t val, trueval;
boolean_t true_ctls_avail, one_allowed, zero_allowed;
/* We cannot ask the same bit to be set to both '1' and '0' */
if ((ones_mask ^ zeros_mask) != (ones_mask | zeros_mask))
return (EINVAL);
if (rdmsr(MSR_VMX_BASIC) & (1UL << 55))
true_ctls_avail = TRUE;
else
true_ctls_avail = FALSE;
val = rdmsr(ctl_reg);
if (true_ctls_avail)
trueval = rdmsr(true_ctl_reg); /* step c */
else
trueval = val; /* step a */
for (i = 0; i < 32; i++) {
one_allowed = vmx_ctl_allows_one_setting(trueval, i);
zero_allowed = vmx_ctl_allows_zero_setting(trueval, i);
KASSERT(one_allowed || zero_allowed,
("invalid zero/one setting for bit %d of ctl 0x%0x, "
"truectl 0x%0x\n", i, ctl_reg, true_ctl_reg));
if (zero_allowed && !one_allowed) { /* b(i),c(i) */
if (ones_mask & (1 << i))
return (EINVAL);
*retval &= ~(1 << i);
} else if (one_allowed && !zero_allowed) { /* b(i),c(i) */
if (zeros_mask & (1 << i))
return (EINVAL);
*retval |= 1 << i;
} else {
if (zeros_mask & (1 << i)) /* b(ii),c(ii) */
*retval &= ~(1 << i);
else if (ones_mask & (1 << i)) /* b(ii), c(ii) */
*retval |= 1 << i;
else if (!true_ctls_avail)
*retval &= ~(1 << i); /* b(iii) */
else if (vmx_ctl_allows_zero_setting(val, i))/* c(iii)*/
*retval &= ~(1 << i);
else if (vmx_ctl_allows_one_setting(val, i)) /* c(iv) */
*retval |= 1 << i;
else {
panic("vmx_set_ctlreg: unable to determine "
"correct value of ctl bit %d for msr "
"0x%0x and true msr 0x%0x", i, ctl_reg,
true_ctl_reg);
}
}
}
return (0);
}
void
msr_bitmap_initialize(char *bitmap)
{
memset(bitmap, 0xff, PAGE_SIZE);
}
int
msr_bitmap_change_access(char *bitmap, u_int msr, int access)
{
int byte, bit;
if (msr <= 0x00001FFF)
byte = msr / 8;
else if (msr >= 0xC0000000 && msr <= 0xC0001FFF)
byte = 1024 + (msr - 0xC0000000) / 8;
else
return (EINVAL);
bit = msr & 0x7;
if (access & MSR_BITMAP_ACCESS_READ)
bitmap[byte] &= ~(1 << bit);
else
bitmap[byte] |= 1 << bit;
byte += 2048;
if (access & MSR_BITMAP_ACCESS_WRITE)
bitmap[byte] &= ~(1 << bit);
else
bitmap[byte] |= 1 << bit;
return (0);
}

View File

@ -0,0 +1,78 @@
/*-
* Copyright (c) 2011 NetApp, Inc.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* $FreeBSD$
*/
#ifndef _VMX_MSR_H_
#define _VMX_MSR_H_
#define MSR_VMX_BASIC 0x480
#define MSR_VMX_EPT_VPID_CAP 0x48C
#define MSR_VMX_PROCBASED_CTLS 0x482
#define MSR_VMX_TRUE_PROCBASED_CTLS 0x48E
#define MSR_VMX_PINBASED_CTLS 0x481
#define MSR_VMX_TRUE_PINBASED_CTLS 0x48D
#define MSR_VMX_PROCBASED_CTLS2 0x48B
#define MSR_VMX_EXIT_CTLS 0x483
#define MSR_VMX_TRUE_EXIT_CTLS 0x48f
#define MSR_VMX_ENTRY_CTLS 0x484
#define MSR_VMX_TRUE_ENTRY_CTLS 0x490
#define MSR_VMX_CR0_FIXED0 0x486
#define MSR_VMX_CR0_FIXED1 0x487
#define MSR_VMX_CR4_FIXED0 0x488
#define MSR_VMX_CR4_FIXED1 0x489
uint32_t vmx_revision(void);
int vmx_set_ctlreg(int ctl_reg, int true_ctl_reg, uint32_t ones_mask,
uint32_t zeros_mask, uint32_t *retval);
/*
* According to Section 21.10.4 "Software Access to Related Structures",
* changes to data structures pointed to by the VMCS must be made only when
* there is no logical processor with a current VMCS that points to the
* data structure.
*
* This pretty much limits us to configuring the MSR bitmap before VMCS
* initialization for SMP VMs. Unless of course we do it the hard way - which
* would involve some form of synchronization between the vcpus to vmclear
* all VMCSs' that point to the bitmap.
*/
#define MSR_BITMAP_ACCESS_NONE 0x0
#define MSR_BITMAP_ACCESS_READ 0x1
#define MSR_BITMAP_ACCESS_WRITE 0x2
#define MSR_BITMAP_ACCESS_RW (MSR_BITMAP_ACCESS_READ|MSR_BITMAP_ACCESS_WRITE)
void msr_bitmap_initialize(char *bitmap);
int msr_bitmap_change_access(char *bitmap, u_int msr, int access);
#endif

View File

@ -0,0 +1,246 @@
/*-
* Copyright (c) 2011 NetApp, Inc.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* $FreeBSD$
*/
#include <machine/asmacros.h>
#include "vmx_assym.s"
/*
* Disable interrupts before updating %rsp in VMX_CHECK_AST or
* VMX_GUEST_RESTORE.
*
* The location that %rsp points to is a 'vmxctx' and not a
* real stack so we don't want an interrupt handler to trash it
*/
#define VMX_DISABLE_INTERRUPTS cli
/*
* If the thread hosting the vcpu has an ast pending then take care of it
* by returning from vmx_setjmp() with a return value of VMX_RETURN_AST.
*
* Assumes that %rdi holds a pointer to the 'vmxctx' and that interrupts
* are disabled.
*/
#define VMX_CHECK_AST \
movq PCPU(CURTHREAD),%rax; \
testl $TDF_ASTPENDING | TDF_NEEDRESCHED,TD_FLAGS(%rax); \
je 9f; \
movq $VMX_RETURN_AST,%rsi; \
movq %rdi,%rsp; \
addq $VMXCTX_TMPSTKTOP,%rsp; \
callq vmx_return; \
9:
/*
* Assumes that %rdi holds a pointer to the 'vmxctx'.
*
* On "return" all registers are updated to reflect guest state. The two
* exceptions are %rip and %rsp. These registers are atomically switched
* by hardware from the guest area of the vmcs.
*
* We modify %rsp to point to the 'vmxctx' so we can use it to restore
* host context in case of an error with 'vmlaunch' or 'vmresume'.
*/
#define VMX_GUEST_RESTORE \
movq %rdi,%rsp; \
movq VMXCTX_GUEST_CR2(%rdi),%rsi; \
movq %rsi,%cr2; \
movq VMXCTX_GUEST_RSI(%rdi),%rsi; \
movq VMXCTX_GUEST_RDX(%rdi),%rdx; \
movq VMXCTX_GUEST_RCX(%rdi),%rcx; \
movq VMXCTX_GUEST_R8(%rdi),%r8; \
movq VMXCTX_GUEST_R9(%rdi),%r9; \
movq VMXCTX_GUEST_RAX(%rdi),%rax; \
movq VMXCTX_GUEST_RBX(%rdi),%rbx; \
movq VMXCTX_GUEST_RBP(%rdi),%rbp; \
movq VMXCTX_GUEST_R10(%rdi),%r10; \
movq VMXCTX_GUEST_R11(%rdi),%r11; \
movq VMXCTX_GUEST_R12(%rdi),%r12; \
movq VMXCTX_GUEST_R13(%rdi),%r13; \
movq VMXCTX_GUEST_R14(%rdi),%r14; \
movq VMXCTX_GUEST_R15(%rdi),%r15; \
movq VMXCTX_GUEST_RDI(%rdi),%rdi; /* restore rdi the last */
#define VM_INSTRUCTION_ERROR(reg) \
jnc 1f; \
movl $VM_FAIL_INVALID,reg; /* CF is set */ \
jmp 3f; \
1: jnz 2f; \
movl $VM_FAIL_VALID,reg; /* ZF is set */ \
jmp 3f; \
2: movl $VM_SUCCESS,reg; \
3: movl reg,VMXCTX_LAUNCH_ERROR(%rsp)
.text
/*
* int vmx_setjmp(ctxp)
* %rdi = ctxp
*
* Return value is '0' when it returns directly from here.
* Return value is '1' when it returns after a vm exit through vmx_longjmp.
*/
ENTRY(vmx_setjmp)
movq (%rsp),%rax /* return address */
movq %r15,VMXCTX_HOST_R15(%rdi)
movq %r14,VMXCTX_HOST_R14(%rdi)
movq %r13,VMXCTX_HOST_R13(%rdi)
movq %r12,VMXCTX_HOST_R12(%rdi)
movq %rbp,VMXCTX_HOST_RBP(%rdi)
movq %rsp,VMXCTX_HOST_RSP(%rdi)
movq %rbx,VMXCTX_HOST_RBX(%rdi)
movq %rax,VMXCTX_HOST_RIP(%rdi)
/*
* XXX save host debug registers
*/
movl $VMX_RETURN_DIRECT,%eax
ret
END(vmx_setjmp)
/*
* void vmx_return(struct vmxctx *ctxp, int retval)
* %rdi = ctxp
* %rsi = retval
* Return to vmm context through vmx_setjmp() with a value of 'retval'.
*/
ENTRY(vmx_return)
/* Restore host context. */
movq VMXCTX_HOST_R15(%rdi),%r15
movq VMXCTX_HOST_R14(%rdi),%r14
movq VMXCTX_HOST_R13(%rdi),%r13
movq VMXCTX_HOST_R12(%rdi),%r12
movq VMXCTX_HOST_RBP(%rdi),%rbp
movq VMXCTX_HOST_RSP(%rdi),%rsp
movq VMXCTX_HOST_RBX(%rdi),%rbx
movq VMXCTX_HOST_RIP(%rdi),%rax
movq %rax,(%rsp) /* return address */
/*
* XXX restore host debug registers
*/
movl %esi,%eax
ret
END(vmx_return)
/*
* void vmx_longjmp(void)
* %rsp points to the struct vmxctx
*/
ENTRY(vmx_longjmp)
/*
* Save guest state that is not automatically saved in the vmcs.
*/
movq %rdi,VMXCTX_GUEST_RDI(%rsp)
movq %rsi,VMXCTX_GUEST_RSI(%rsp)
movq %rdx,VMXCTX_GUEST_RDX(%rsp)
movq %rcx,VMXCTX_GUEST_RCX(%rsp)
movq %r8,VMXCTX_GUEST_R8(%rsp)
movq %r9,VMXCTX_GUEST_R9(%rsp)
movq %rax,VMXCTX_GUEST_RAX(%rsp)
movq %rbx,VMXCTX_GUEST_RBX(%rsp)
movq %rbp,VMXCTX_GUEST_RBP(%rsp)
movq %r10,VMXCTX_GUEST_R10(%rsp)
movq %r11,VMXCTX_GUEST_R11(%rsp)
movq %r12,VMXCTX_GUEST_R12(%rsp)
movq %r13,VMXCTX_GUEST_R13(%rsp)
movq %r14,VMXCTX_GUEST_R14(%rsp)
movq %r15,VMXCTX_GUEST_R15(%rsp)
movq %cr2,%rdi
movq %rdi,VMXCTX_GUEST_CR2(%rsp)
movq %rsp,%rdi
movq $VMX_RETURN_LONGJMP,%rsi
addq $VMXCTX_TMPSTKTOP,%rsp
callq vmx_return
END(vmx_longjmp)
/*
* void vmx_resume(struct vmxctx *ctxp)
* %rdi = ctxp
*
* Although the return type is a 'void' this function may return indirectly
* through vmx_setjmp() with a return value of 2.
*/
ENTRY(vmx_resume)
VMX_DISABLE_INTERRUPTS
VMX_CHECK_AST
/*
* Restore guest state that is not automatically loaded from the vmcs.
*/
VMX_GUEST_RESTORE
vmresume
/*
* Capture the reason why vmresume failed.
*/
VM_INSTRUCTION_ERROR(%eax)
/* Return via vmx_setjmp with return value of VMX_RETURN_VMRESUME */
movq %rsp,%rdi
movq $VMX_RETURN_VMRESUME,%rsi
addq $VMXCTX_TMPSTKTOP,%rsp
callq vmx_return
END(vmx_resume)
/*
* void vmx_launch(struct vmxctx *ctxp)
* %rdi = ctxp
*
* Although the return type is a 'void' this function may return indirectly
* through vmx_setjmp() with a return value of 3.
*/
ENTRY(vmx_launch)
VMX_DISABLE_INTERRUPTS
VMX_CHECK_AST
/*
* Restore guest state that is not automatically loaded from the vmcs.
*/
VMX_GUEST_RESTORE
vmlaunch
/*
* Capture the reason why vmlaunch failed.
*/
VM_INSTRUCTION_ERROR(%eax)
/* Return via vmx_setjmp with return value of VMX_RETURN_VMLAUNCH */
movq %rsp,%rdi
movq $VMX_RETURN_VMLAUNCH,%rsi
addq $VMXCTX_TMPSTKTOP,%rsp
callq vmx_return
END(vmx_launch)

677
sys/amd64/vmm/intel/vtd.c Normal file
View File

@ -0,0 +1,677 @@
/*-
* Copyright (c) 2011 NetApp, Inc.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* $FreeBSD$
*/
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
#include <sys/param.h>
#include <sys/kernel.h>
#include <sys/systm.h>
#include <sys/malloc.h>
#include <vm/vm.h>
#include <vm/pmap.h>
#include <dev/pci/pcireg.h>
#include <machine/pmap.h>
#include <machine/vmparam.h>
#include <machine/pci_cfgreg.h>
#include "io/iommu.h"
/*
* Documented in the "Intel Virtualization Technology for Directed I/O",
* Architecture Spec, September 2008.
*/
/* Section 10.4 "Register Descriptions" */
struct vtdmap {
volatile uint32_t version;
volatile uint32_t res0;
volatile uint64_t cap;
volatile uint64_t ext_cap;
volatile uint32_t gcr;
volatile uint32_t gsr;
volatile uint64_t rta;
volatile uint64_t ccr;
};
#define VTD_CAP_SAGAW(cap) (((cap) >> 8) & 0x1F)
#define VTD_CAP_ND(cap) ((cap) & 0x7)
#define VTD_CAP_CM(cap) (((cap) >> 7) & 0x1)
#define VTD_CAP_SPS(cap) (((cap) >> 34) & 0xF)
#define VTD_CAP_RWBF(cap) (((cap) >> 4) & 0x1)
#define VTD_ECAP_DI(ecap) (((ecap) >> 2) & 0x1)
#define VTD_ECAP_COHERENCY(ecap) ((ecap) & 0x1)
#define VTD_ECAP_IRO(ecap) (((ecap) >> 8) & 0x3FF)
#define VTD_GCR_WBF (1 << 27)
#define VTD_GCR_SRTP (1 << 30)
#define VTD_GCR_TE (1 << 31)
#define VTD_GSR_WBFS (1 << 27)
#define VTD_GSR_RTPS (1 << 30)
#define VTD_GSR_TES (1 << 31)
#define VTD_CCR_ICC (1UL << 63) /* invalidate context cache */
#define VTD_CCR_CIRG_GLOBAL (1UL << 61) /* global invalidation */
#define VTD_IIR_IVT (1UL << 63) /* invalidation IOTLB */
#define VTD_IIR_IIRG_GLOBAL (1ULL << 60) /* global IOTLB invalidation */
#define VTD_IIR_IIRG_DOMAIN (2ULL << 60) /* domain IOTLB invalidation */
#define VTD_IIR_IIRG_PAGE (3ULL << 60) /* page IOTLB invalidation */
#define VTD_IIR_DRAIN_READS (1ULL << 49) /* drain pending DMA reads */
#define VTD_IIR_DRAIN_WRITES (1ULL << 48) /* drain pending DMA writes */
#define VTD_IIR_DOMAIN_P 32
#define VTD_ROOT_PRESENT 0x1
#define VTD_CTX_PRESENT 0x1
#define VTD_CTX_TT_ALL (1UL << 2)
#define VTD_PTE_RD (1UL << 0)
#define VTD_PTE_WR (1UL << 1)
#define VTD_PTE_SUPERPAGE (1UL << 7)
#define VTD_PTE_ADDR_M (0x000FFFFFFFFFF000UL)
struct domain {
uint64_t *ptp; /* first level page table page */
int pt_levels; /* number of page table levels */
int addrwidth; /* 'AW' field in context entry */
int spsmask; /* supported super page sizes */
u_int id; /* domain id */
vm_paddr_t maxaddr; /* highest address to be mapped */
SLIST_ENTRY(domain) next;
};
static SLIST_HEAD(, domain) domhead;
#define DRHD_MAX_UNITS 8
static int drhd_num;
static struct vtdmap *vtdmaps[DRHD_MAX_UNITS];
static int max_domains;
typedef int (*drhd_ident_func_t)(void);
static uint64_t root_table[PAGE_SIZE / sizeof(uint64_t)] __aligned(4096);
static uint64_t ctx_tables[256][PAGE_SIZE / sizeof(uint64_t)] __aligned(4096);
static MALLOC_DEFINE(M_VTD, "vtd", "vtd");
/*
* Config space register definitions from the "Intel 5520 and 5500" datasheet.
*/
static int
tylersburg_vtd_ident(void)
{
int units, nlbus;
uint16_t did, vid;
uint32_t miscsts, vtbar;
const int bus = 0;
const int slot = 20;
const int func = 0;
units = 0;
vid = pci_cfgregread(bus, slot, func, PCIR_VENDOR, 2);
did = pci_cfgregread(bus, slot, func, PCIR_DEVICE, 2);
if (vid != 0x8086 || did != 0x342E)
goto done;
/*
* Check if this is a dual IOH configuration.
*/
miscsts = pci_cfgregread(bus, slot, func, 0x9C, 4);
if (miscsts & (1 << 25))
nlbus = pci_cfgregread(bus, slot, func, 0x160, 1);
else
nlbus = -1;
vtbar = pci_cfgregread(bus, slot, func, 0x180, 4);
if (vtbar & 0x1) {
vtdmaps[units++] = (struct vtdmap *)
PHYS_TO_DMAP(vtbar & 0xffffe000);
} else if (bootverbose)
printf("VT-d unit in legacy IOH is disabled!\n");
if (nlbus != -1) {
vtbar = pci_cfgregread(nlbus, slot, func, 0x180, 4);
if (vtbar & 0x1) {
vtdmaps[units++] = (struct vtdmap *)
PHYS_TO_DMAP(vtbar & 0xffffe000);
} else if (bootverbose)
printf("VT-d unit in non-legacy IOH is disabled!\n");
}
done:
return (units);
}
static drhd_ident_func_t drhd_ident_funcs[] = {
tylersburg_vtd_ident,
NULL
};
static int
vtd_max_domains(struct vtdmap *vtdmap)
{
int nd;
nd = VTD_CAP_ND(vtdmap->cap);
switch (nd) {
case 0:
return (16);
case 1:
return (64);
case 2:
return (256);
case 3:
return (1024);
case 4:
return (4 * 1024);
case 5:
return (16 * 1024);
case 6:
return (64 * 1024);
default:
panic("vtd_max_domains: invalid value of nd (0x%0x)", nd);
}
}
static u_int
domain_id(void)
{
u_int id;
struct domain *dom;
/* Skip domain id 0 - it is reserved when Caching Mode field is set */
for (id = 1; id < max_domains; id++) {
SLIST_FOREACH(dom, &domhead, next) {
if (dom->id == id)
break;
}
if (dom == NULL)
break; /* found it */
}
if (id >= max_domains)
panic("domain ids exhausted");
return (id);
}
static void
vtd_wbflush(struct vtdmap *vtdmap)
{
if (VTD_ECAP_COHERENCY(vtdmap->ext_cap) == 0)
pmap_invalidate_cache();
if (VTD_CAP_RWBF(vtdmap->cap)) {
vtdmap->gcr = VTD_GCR_WBF;
while ((vtdmap->gsr & VTD_GSR_WBFS) != 0)
;
}
}
static void
vtd_ctx_global_invalidate(struct vtdmap *vtdmap)
{
vtdmap->ccr = VTD_CCR_ICC | VTD_CCR_CIRG_GLOBAL;
while ((vtdmap->ccr & VTD_CCR_ICC) != 0)
;
}
static void
vtd_iotlb_global_invalidate(struct vtdmap *vtdmap)
{
int offset;
volatile uint64_t *iotlb_reg, val;
vtd_wbflush(vtdmap);
offset = VTD_ECAP_IRO(vtdmap->ext_cap) * 16;
iotlb_reg = (volatile uint64_t *)((caddr_t)vtdmap + offset + 8);
*iotlb_reg = VTD_IIR_IVT | VTD_IIR_IIRG_GLOBAL |
VTD_IIR_DRAIN_READS | VTD_IIR_DRAIN_WRITES;
while (1) {
val = *iotlb_reg;
if ((val & VTD_IIR_IVT) == 0)
break;
}
}
static void
vtd_translation_enable(struct vtdmap *vtdmap)
{
vtdmap->gcr = VTD_GCR_TE;
while ((vtdmap->gsr & VTD_GSR_TES) == 0)
;
}
static void
vtd_translation_disable(struct vtdmap *vtdmap)
{
vtdmap->gcr = 0;
while ((vtdmap->gsr & VTD_GSR_TES) != 0)
;
}
static int
vtd_init(void)
{
int i, units;
struct vtdmap *vtdmap;
vm_paddr_t ctx_paddr;
for (i = 0; drhd_ident_funcs[i] != NULL; i++) {
units = (*drhd_ident_funcs[i])();
if (units > 0)
break;
}
if (units <= 0)
return (ENXIO);
drhd_num = units;
vtdmap = vtdmaps[0];
if (VTD_CAP_CM(vtdmap->cap) != 0)
panic("vtd_init: invalid caching mode");
max_domains = vtd_max_domains(vtdmap);
/*
* Set up the root-table to point to the context-entry tables
*/
for (i = 0; i < 256; i++) {
ctx_paddr = vtophys(ctx_tables[i]);
if (ctx_paddr & PAGE_MASK)
panic("ctx table (0x%0lx) not page aligned", ctx_paddr);
root_table[i * 2] = ctx_paddr | VTD_ROOT_PRESENT;
}
return (0);
}
static void
vtd_cleanup(void)
{
}
static void
vtd_enable(void)
{
int i;
struct vtdmap *vtdmap;
for (i = 0; i < drhd_num; i++) {
vtdmap = vtdmaps[i];
vtd_wbflush(vtdmap);
/* Update the root table address */
vtdmap->rta = vtophys(root_table);
vtdmap->gcr = VTD_GCR_SRTP;
while ((vtdmap->gsr & VTD_GSR_RTPS) == 0)
;
vtd_ctx_global_invalidate(vtdmap);
vtd_iotlb_global_invalidate(vtdmap);
vtd_translation_enable(vtdmap);
}
}
static void
vtd_disable(void)
{
int i;
struct vtdmap *vtdmap;
for (i = 0; i < drhd_num; i++) {
vtdmap = vtdmaps[i];
vtd_translation_disable(vtdmap);
}
}
static void
vtd_add_device(void *arg, int bus, int slot, int func)
{
int idx;
uint64_t *ctxp;
struct domain *dom = arg;
vm_paddr_t pt_paddr;
struct vtdmap *vtdmap;
if (bus < 0 || bus > PCI_BUSMAX ||
slot < 0 || slot > PCI_SLOTMAX ||
func < 0 || func > PCI_FUNCMAX)
panic("vtd_add_device: invalid bsf %d/%d/%d", bus, slot, func);
vtdmap = vtdmaps[0];
ctxp = ctx_tables[bus];
pt_paddr = vtophys(dom->ptp);
idx = (slot << 3 | func) * 2;
if (ctxp[idx] & VTD_CTX_PRESENT) {
panic("vtd_add_device: device %d/%d/%d is already owned by "
"domain %d", bus, slot, func,
(uint16_t)(ctxp[idx + 1] >> 8));
}
/*
* Order is important. The 'present' bit is set only after all fields
* of the context pointer are initialized.
*/
ctxp[idx + 1] = dom->addrwidth | (dom->id << 8);
if (VTD_ECAP_DI(vtdmap->ext_cap))
ctxp[idx] = VTD_CTX_TT_ALL;
else
ctxp[idx] = 0;
ctxp[idx] |= pt_paddr | VTD_CTX_PRESENT;
/*
* 'Not Present' entries are not cached in either the Context Cache
* or in the IOTLB, so there is no need to invalidate either of them.
*/
}
static void
vtd_remove_device(void *arg, int bus, int slot, int func)
{
int i, idx;
uint64_t *ctxp;
struct vtdmap *vtdmap;
if (bus < 0 || bus > PCI_BUSMAX ||
slot < 0 || slot > PCI_SLOTMAX ||
func < 0 || func > PCI_FUNCMAX)
panic("vtd_add_device: invalid bsf %d/%d/%d", bus, slot, func);
ctxp = ctx_tables[bus];
idx = (slot << 3 | func) * 2;
/*
* Order is important. The 'present' bit is must be cleared first.
*/
ctxp[idx] = 0;
ctxp[idx + 1] = 0;
/*
* Invalidate the Context Cache and the IOTLB.
*
* XXX use device-selective invalidation for Context Cache
* XXX use domain-selective invalidation for IOTLB
*/
for (i = 0; i < drhd_num; i++) {
vtdmap = vtdmaps[i];
vtd_ctx_global_invalidate(vtdmap);
vtd_iotlb_global_invalidate(vtdmap);
}
}
#define CREATE_MAPPING 0
#define REMOVE_MAPPING 1
static uint64_t
vtd_update_mapping(void *arg, vm_paddr_t gpa, vm_paddr_t hpa, uint64_t len,
int remove)
{
struct domain *dom;
int i, spshift, ptpshift, ptpindex, nlevels;
uint64_t spsize, *ptp;
dom = arg;
ptpindex = 0;
ptpshift = 0;
if (gpa & PAGE_MASK)
panic("vtd_create_mapping: unaligned gpa 0x%0lx", gpa);
if (hpa & PAGE_MASK)
panic("vtd_create_mapping: unaligned hpa 0x%0lx", hpa);
if (len & PAGE_MASK)
panic("vtd_create_mapping: unaligned len 0x%0lx", len);
/*
* Compute the size of the mapping that we can accomodate.
*
* This is based on three factors:
* - supported super page size
* - alignment of the region starting at 'gpa' and 'hpa'
* - length of the region 'len'
*/
spshift = 48;
for (i = 3; i >= 0; i--) {
spsize = 1UL << spshift;
if ((dom->spsmask & (1 << i)) != 0 &&
(gpa & (spsize - 1)) == 0 &&
(hpa & (spsize - 1)) == 0 &&
(len >= spsize)) {
break;
}
spshift -= 9;
}
ptp = dom->ptp;
nlevels = dom->pt_levels;
while (--nlevels >= 0) {
ptpshift = 12 + nlevels * 9;
ptpindex = (gpa >> ptpshift) & 0x1FF;
/* We have reached the leaf mapping */
if (spshift >= ptpshift) {
break;
}
/*
* We are working on a non-leaf page table page.
*
* Create a downstream page table page if necessary and point
* to it from the current page table.
*/
if (ptp[ptpindex] == 0) {
void *nlp = malloc(PAGE_SIZE, M_VTD, M_WAITOK | M_ZERO);
ptp[ptpindex] = vtophys(nlp)| VTD_PTE_RD | VTD_PTE_WR;
}
ptp = (uint64_t *)PHYS_TO_DMAP(ptp[ptpindex] & VTD_PTE_ADDR_M);
}
if ((gpa & ((1UL << ptpshift) - 1)) != 0)
panic("gpa 0x%lx and ptpshift %d mismatch", gpa, ptpshift);
/*
* Update the 'gpa' -> 'hpa' mapping
*/
if (remove) {
ptp[ptpindex] = 0;
} else {
ptp[ptpindex] = hpa | VTD_PTE_RD | VTD_PTE_WR;
if (nlevels > 0)
ptp[ptpindex] |= VTD_PTE_SUPERPAGE;
}
return (1UL << ptpshift);
}
static uint64_t
vtd_create_mapping(void *arg, vm_paddr_t gpa, vm_paddr_t hpa, uint64_t len)
{
return (vtd_update_mapping(arg, gpa, hpa, len, CREATE_MAPPING));
}
static uint64_t
vtd_remove_mapping(void *arg, vm_paddr_t gpa, uint64_t len)
{
return (vtd_update_mapping(arg, gpa, 0, len, REMOVE_MAPPING));
}
static void
vtd_invalidate_tlb(void *dom)
{
int i;
struct vtdmap *vtdmap;
/*
* Invalidate the IOTLB.
* XXX use domain-selective invalidation for IOTLB
*/
for (i = 0; i < drhd_num; i++) {
vtdmap = vtdmaps[i];
vtd_iotlb_global_invalidate(vtdmap);
}
}
static void *
vtd_create_domain(vm_paddr_t maxaddr)
{
struct domain *dom;
vm_paddr_t addr;
int tmp, i, gaw, agaw, sagaw, res, pt_levels, addrwidth;
struct vtdmap *vtdmap;
if (drhd_num <= 0)
panic("vtd_create_domain: no dma remapping hardware available");
vtdmap = vtdmaps[0];
/*
* Calculate AGAW.
* Section 3.4.2 "Adjusted Guest Address Width", Architecture Spec.
*/
addr = 0;
for (gaw = 0; addr < maxaddr; gaw++)
addr = 1ULL << gaw;
res = (gaw - 12) % 9;
if (res == 0)
agaw = gaw;
else
agaw = gaw + 9 - res;
if (agaw > 64)
agaw = 64;
/*
* Select the smallest Supported AGAW and the corresponding number
* of page table levels.
*/
pt_levels = 2;
sagaw = 30;
addrwidth = 0;
tmp = VTD_CAP_SAGAW(vtdmap->cap);
for (i = 0; i < 5; i++) {
if ((tmp & (1 << i)) != 0 && sagaw >= agaw)
break;
pt_levels++;
addrwidth++;
sagaw += 9;
if (sagaw > 64)
sagaw = 64;
}
if (i >= 5) {
panic("vtd_create_domain: SAGAW 0x%lx does not support AGAW %d",
VTD_CAP_SAGAW(vtdmap->cap), agaw);
}
dom = malloc(sizeof(struct domain), M_VTD, M_ZERO | M_WAITOK);
dom->pt_levels = pt_levels;
dom->addrwidth = addrwidth;
dom->spsmask = VTD_CAP_SPS(vtdmap->cap);
dom->id = domain_id();
dom->maxaddr = maxaddr;
dom->ptp = malloc(PAGE_SIZE, M_VTD, M_ZERO | M_WAITOK);
if ((uintptr_t)dom->ptp & PAGE_MASK)
panic("vtd_create_domain: ptp (%p) not page aligned", dom->ptp);
SLIST_INSERT_HEAD(&domhead, dom, next);
return (dom);
}
static void
vtd_free_ptp(uint64_t *ptp, int level)
{
int i;
uint64_t *nlp;
if (level > 1) {
for (i = 0; i < 512; i++) {
if ((ptp[i] & (VTD_PTE_RD | VTD_PTE_WR)) == 0)
continue;
if ((ptp[i] & VTD_PTE_SUPERPAGE) != 0)
continue;
nlp = (uint64_t *)PHYS_TO_DMAP(ptp[i] & VTD_PTE_ADDR_M);
vtd_free_ptp(nlp, level - 1);
}
}
bzero(ptp, PAGE_SIZE);
free(ptp, M_VTD);
}
static void
vtd_destroy_domain(void *arg)
{
struct domain *dom;
dom = arg;
SLIST_REMOVE(&domhead, dom, domain, next);
vtd_free_ptp(dom->ptp, dom->pt_levels);
free(dom, M_VTD);
}
struct iommu_ops iommu_ops_intel = {
vtd_init,
vtd_cleanup,
vtd_enable,
vtd_disable,
vtd_create_domain,
vtd_destroy_domain,
vtd_create_mapping,
vtd_remove_mapping,
vtd_add_device,
vtd_remove_device,
vtd_invalidate_tlb,
};

277
sys/amd64/vmm/io/iommu.c Normal file
View File

@ -0,0 +1,277 @@
/*-
* Copyright (c) 2011 NetApp, Inc.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* $FreeBSD$
*/
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
#include <sys/param.h>
#include <sys/types.h>
#include <sys/systm.h>
#include <sys/bus.h>
#include <dev/pci/pcivar.h>
#include <dev/pci/pcireg.h>
#include <machine/md_var.h>
#include "vmm_util.h"
#include "vmm_mem.h"
#include "iommu.h"
static boolean_t iommu_avail;
static struct iommu_ops *ops;
static void *host_domain;
static __inline int
IOMMU_INIT(void)
{
if (ops != NULL)
return ((*ops->init)());
else
return (ENXIO);
}
static __inline void
IOMMU_CLEANUP(void)
{
if (ops != NULL && iommu_avail)
(*ops->cleanup)();
}
static __inline void *
IOMMU_CREATE_DOMAIN(vm_paddr_t maxaddr)
{
if (ops != NULL && iommu_avail)
return ((*ops->create_domain)(maxaddr));
else
return (NULL);
}
static __inline void
IOMMU_DESTROY_DOMAIN(void *dom)
{
if (ops != NULL && iommu_avail)
(*ops->destroy_domain)(dom);
}
static __inline uint64_t
IOMMU_CREATE_MAPPING(void *domain, vm_paddr_t gpa, vm_paddr_t hpa, uint64_t len)
{
if (ops != NULL && iommu_avail)
return ((*ops->create_mapping)(domain, gpa, hpa, len));
else
return (len); /* XXX */
}
static __inline uint64_t
IOMMU_REMOVE_MAPPING(void *domain, vm_paddr_t gpa, uint64_t len)
{
if (ops != NULL && iommu_avail)
return ((*ops->remove_mapping)(domain, gpa, len));
else
return (len); /* XXX */
}
static __inline void
IOMMU_ADD_DEVICE(void *domain, int bus, int slot, int func)
{
if (ops != NULL && iommu_avail)
(*ops->add_device)(domain, bus, slot, func);
}
static __inline void
IOMMU_REMOVE_DEVICE(void *domain, int bus, int slot, int func)
{
if (ops != NULL && iommu_avail)
(*ops->remove_device)(domain, bus, slot, func);
}
static __inline void
IOMMU_INVALIDATE_TLB(void *domain)
{
if (ops != NULL && iommu_avail)
(*ops->invalidate_tlb)(domain);
}
static __inline void
IOMMU_ENABLE(void)
{
if (ops != NULL && iommu_avail)
(*ops->enable)();
}
static __inline void
IOMMU_DISABLE(void)
{
if (ops != NULL && iommu_avail)
(*ops->disable)();
}
void
iommu_init(void)
{
int error, bus, slot, func;
vm_paddr_t maxaddr;
const char *name;
device_t dev;
if (vmm_is_intel())
ops = &iommu_ops_intel;
else if (vmm_is_amd())
ops = &iommu_ops_amd;
else
ops = NULL;
error = IOMMU_INIT();
if (error)
return;
iommu_avail = TRUE;
/*
* Create a domain for the devices owned by the host
*/
maxaddr = vmm_mem_maxaddr();
host_domain = IOMMU_CREATE_DOMAIN(maxaddr);
if (host_domain == NULL)
panic("iommu_init: unable to create a host domain");
/*
* Create 1:1 mappings from '0' to 'maxaddr' for devices assigned to
* the host
*/
iommu_create_mapping(host_domain, 0, 0, maxaddr);
for (bus = 0; bus <= PCI_BUSMAX; bus++) {
for (slot = 0; slot <= PCI_SLOTMAX; slot++) {
for (func = 0; func <= PCI_FUNCMAX; func++) {
dev = pci_find_dbsf(0, bus, slot, func);
if (dev == NULL)
continue;
/* skip passthrough devices */
name = device_get_name(dev);
if (name != NULL && strcmp(name, "ppt") == 0)
continue;
/* everything else belongs to the host domain */
iommu_add_device(host_domain, bus, slot, func);
}
}
}
IOMMU_ENABLE();
}
void
iommu_cleanup(void)
{
IOMMU_DISABLE();
IOMMU_DESTROY_DOMAIN(host_domain);
IOMMU_CLEANUP();
}
void *
iommu_create_domain(vm_paddr_t maxaddr)
{
return (IOMMU_CREATE_DOMAIN(maxaddr));
}
void
iommu_destroy_domain(void *dom)
{
IOMMU_DESTROY_DOMAIN(dom);
}
void
iommu_create_mapping(void *dom, vm_paddr_t gpa, vm_paddr_t hpa, size_t len)
{
uint64_t mapped, remaining;
remaining = len;
while (remaining > 0) {
mapped = IOMMU_CREATE_MAPPING(dom, gpa, hpa, remaining);
gpa += mapped;
hpa += mapped;
remaining -= mapped;
}
}
void
iommu_remove_mapping(void *dom, vm_paddr_t gpa, size_t len)
{
uint64_t unmapped, remaining;
remaining = len;
while (remaining > 0) {
unmapped = IOMMU_REMOVE_MAPPING(dom, gpa, remaining);
gpa += unmapped;
remaining -= unmapped;
}
}
void *
iommu_host_domain(void)
{
return (host_domain);
}
void
iommu_add_device(void *dom, int bus, int slot, int func)
{
IOMMU_ADD_DEVICE(dom, bus, slot, func);
}
void
iommu_remove_device(void *dom, int bus, int slot, int func)
{
IOMMU_REMOVE_DEVICE(dom, bus, slot, func);
}
void
iommu_invalidate_tlb(void *domain)
{
IOMMU_INVALIDATE_TLB(domain);
}

75
sys/amd64/vmm/io/iommu.h Normal file
View File

@ -0,0 +1,75 @@
/*-
* Copyright (c) 2011 NetApp, Inc.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* $FreeBSD$
*/
#ifndef _IO_IOMMU_H_
#define _IO_IOMMU_H_
typedef int (*iommu_init_func_t)(void);
typedef void (*iommu_cleanup_func_t)(void);
typedef void (*iommu_enable_func_t)(void);
typedef void (*iommu_disable_func_t)(void);
typedef void *(*iommu_create_domain_t)(vm_paddr_t maxaddr);
typedef void (*iommu_destroy_domain_t)(void *domain);
typedef uint64_t (*iommu_create_mapping_t)(void *domain, vm_paddr_t gpa,
vm_paddr_t hpa, uint64_t len);
typedef uint64_t (*iommu_remove_mapping_t)(void *domain, vm_paddr_t gpa,
uint64_t len);
typedef void (*iommu_add_device_t)(void *domain, int bus, int slot, int func);
typedef void (*iommu_remove_device_t)(void *dom, int bus, int slot, int func);
typedef void (*iommu_invalidate_tlb_t)(void *dom);
struct iommu_ops {
iommu_init_func_t init; /* module wide */
iommu_cleanup_func_t cleanup;
iommu_enable_func_t enable;
iommu_disable_func_t disable;
iommu_create_domain_t create_domain; /* domain-specific */
iommu_destroy_domain_t destroy_domain;
iommu_create_mapping_t create_mapping;
iommu_remove_mapping_t remove_mapping;
iommu_add_device_t add_device;
iommu_remove_device_t remove_device;
iommu_invalidate_tlb_t invalidate_tlb;
};
extern struct iommu_ops iommu_ops_intel;
extern struct iommu_ops iommu_ops_amd;
void iommu_init(void);
void iommu_cleanup(void);
void *iommu_host_domain(void);
void *iommu_create_domain(vm_paddr_t maxaddr);
void iommu_destroy_domain(void *dom);
void iommu_create_mapping(void *dom, vm_paddr_t gpa, vm_paddr_t hpa,
size_t len);
void iommu_remove_mapping(void *dom, vm_paddr_t gpa, size_t len);
void iommu_add_device(void *dom, int bus, int slot, int func);
void iommu_remove_device(void *dom, int bus, int slot, int func);
void iommu_invalidate_tlb(void *domain);
#endif

610
sys/amd64/vmm/io/ppt.c Normal file
View File

@ -0,0 +1,610 @@
/*-
* Copyright (c) 2011 NetApp, Inc.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* $FreeBSD$
*/
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/malloc.h>
#include <sys/module.h>
#include <sys/bus.h>
#include <sys/pciio.h>
#include <sys/rman.h>
#include <sys/smp.h>
#include <dev/pci/pcivar.h>
#include <dev/pci/pcireg.h>
#include <machine/resource.h>
#include <machine/vmm.h>
#include <machine/vmm_dev.h>
#include "vmm_lapic.h"
#include "vmm_ktr.h"
#include "iommu.h"
#include "ppt.h"
/* XXX locking */
#define MAX_PPTDEVS (sizeof(pptdevs) / sizeof(pptdevs[0]))
#define MAX_MMIOSEGS (PCIR_MAX_BAR_0 + 1)
#define MAX_MSIMSGS 32
MALLOC_DEFINE(M_PPTMSIX, "pptmsix", "Passthru MSI-X resources");
struct pptintr_arg { /* pptintr(pptintr_arg) */
struct pptdev *pptdev;
int vec;
int vcpu;
};
static struct pptdev {
device_t dev;
struct vm *vm; /* owner of this device */
struct vm_memory_segment mmio[MAX_MMIOSEGS];
struct {
int num_msgs; /* guest state */
int startrid; /* host state */
struct resource *res[MAX_MSIMSGS];
void *cookie[MAX_MSIMSGS];
struct pptintr_arg arg[MAX_MSIMSGS];
} msi;
struct {
int num_msgs;
int startrid;
int msix_table_rid;
struct resource *msix_table_res;
struct resource **res;
void **cookie;
struct pptintr_arg *arg;
} msix;
} pptdevs[32];
static int num_pptdevs;
static int
ppt_probe(device_t dev)
{
int bus, slot, func;
struct pci_devinfo *dinfo;
dinfo = (struct pci_devinfo *)device_get_ivars(dev);
bus = pci_get_bus(dev);
slot = pci_get_slot(dev);
func = pci_get_function(dev);
/*
* To qualify as a pci passthrough device a device must:
* - be allowed by administrator to be used in this role
* - be an endpoint device
*/
if (vmm_is_pptdev(bus, slot, func) &&
(dinfo->cfg.hdrtype & PCIM_HDRTYPE) == PCIM_HDRTYPE_NORMAL)
return (0);
else
return (ENXIO);
}
static int
ppt_attach(device_t dev)
{
int n;
if (num_pptdevs >= MAX_PPTDEVS) {
printf("ppt_attach: maximum number of pci passthrough devices "
"exceeded\n");
return (ENXIO);
}
n = num_pptdevs++;
pptdevs[n].dev = dev;
if (bootverbose)
device_printf(dev, "attached\n");
return (0);
}
static int
ppt_detach(device_t dev)
{
/*
* XXX check whether there are any pci passthrough devices assigned
* to guests before we allow this driver to detach.
*/
return (0);
}
static device_method_t ppt_methods[] = {
/* Device interface */
DEVMETHOD(device_probe, ppt_probe),
DEVMETHOD(device_attach, ppt_attach),
DEVMETHOD(device_detach, ppt_detach),
{0, 0}
};
static devclass_t ppt_devclass;
DEFINE_CLASS_0(ppt, ppt_driver, ppt_methods, 0);
DRIVER_MODULE(ppt, pci, ppt_driver, ppt_devclass, NULL, NULL);
static struct pptdev *
ppt_find(int bus, int slot, int func)
{
device_t dev;
int i, b, s, f;
for (i = 0; i < num_pptdevs; i++) {
dev = pptdevs[i].dev;
b = pci_get_bus(dev);
s = pci_get_slot(dev);
f = pci_get_function(dev);
if (bus == b && slot == s && func == f)
return (&pptdevs[i]);
}
return (NULL);
}
static void
ppt_unmap_mmio(struct vm *vm, struct pptdev *ppt)
{
int i;
struct vm_memory_segment *seg;
for (i = 0; i < MAX_MMIOSEGS; i++) {
seg = &ppt->mmio[i];
if (seg->len == 0)
continue;
(void)vm_unmap_mmio(vm, seg->gpa, seg->len);
bzero(seg, sizeof(struct vm_memory_segment));
}
}
static void
ppt_teardown_msi(struct pptdev *ppt)
{
int i, rid;
void *cookie;
struct resource *res;
if (ppt->msi.num_msgs == 0)
return;
for (i = 0; i < ppt->msi.num_msgs; i++) {
rid = ppt->msi.startrid + i;
res = ppt->msi.res[i];
cookie = ppt->msi.cookie[i];
if (cookie != NULL)
bus_teardown_intr(ppt->dev, res, cookie);
if (res != NULL)
bus_release_resource(ppt->dev, SYS_RES_IRQ, rid, res);
ppt->msi.res[i] = NULL;
ppt->msi.cookie[i] = NULL;
}
if (ppt->msi.startrid == 1)
pci_release_msi(ppt->dev);
ppt->msi.num_msgs = 0;
}
static void
ppt_teardown_msix_intr(struct pptdev *ppt, int idx)
{
int rid;
struct resource *res;
void *cookie;
rid = ppt->msix.startrid + idx;
res = ppt->msix.res[idx];
cookie = ppt->msix.cookie[idx];
if (cookie != NULL)
bus_teardown_intr(ppt->dev, res, cookie);
if (res != NULL)
bus_release_resource(ppt->dev, SYS_RES_IRQ, rid, res);
ppt->msix.res[idx] = NULL;
ppt->msix.cookie[idx] = NULL;
}
static void
ppt_teardown_msix(struct pptdev *ppt)
{
int i;
if (ppt->msix.num_msgs == 0)
return;
for (i = 0; i < ppt->msix.num_msgs; i++)
ppt_teardown_msix_intr(ppt, i);
if (ppt->msix.msix_table_res) {
bus_release_resource(ppt->dev, SYS_RES_MEMORY,
ppt->msix.msix_table_rid,
ppt->msix.msix_table_res);
ppt->msix.msix_table_res = NULL;
ppt->msix.msix_table_rid = 0;
}
free(ppt->msix.res, M_PPTMSIX);
free(ppt->msix.cookie, M_PPTMSIX);
free(ppt->msix.arg, M_PPTMSIX);
pci_release_msi(ppt->dev);
ppt->msix.num_msgs = 0;
}
int
ppt_assign_device(struct vm *vm, int bus, int slot, int func)
{
struct pptdev *ppt;
ppt = ppt_find(bus, slot, func);
if (ppt != NULL) {
/*
* If this device is owned by a different VM then we
* cannot change its owner.
*/
if (ppt->vm != NULL && ppt->vm != vm)
return (EBUSY);
ppt->vm = vm;
iommu_add_device(vm_iommu_domain(vm), bus, slot, func);
return (0);
}
return (ENOENT);
}
int
ppt_unassign_device(struct vm *vm, int bus, int slot, int func)
{
struct pptdev *ppt;
ppt = ppt_find(bus, slot, func);
if (ppt != NULL) {
/*
* If this device is not owned by this 'vm' then bail out.
*/
if (ppt->vm != vm)
return (EBUSY);
ppt_unmap_mmio(vm, ppt);
ppt_teardown_msi(ppt);
ppt_teardown_msix(ppt);
iommu_remove_device(vm_iommu_domain(vm), bus, slot, func);
ppt->vm = NULL;
return (0);
}
return (ENOENT);
}
int
ppt_unassign_all(struct vm *vm)
{
int i, bus, slot, func;
device_t dev;
for (i = 0; i < num_pptdevs; i++) {
if (pptdevs[i].vm == vm) {
dev = pptdevs[i].dev;
bus = pci_get_bus(dev);
slot = pci_get_slot(dev);
func = pci_get_function(dev);
ppt_unassign_device(vm, bus, slot, func);
}
}
return (0);
}
int
ppt_map_mmio(struct vm *vm, int bus, int slot, int func,
vm_paddr_t gpa, size_t len, vm_paddr_t hpa)
{
int i, error;
struct vm_memory_segment *seg;
struct pptdev *ppt;
ppt = ppt_find(bus, slot, func);
if (ppt != NULL) {
if (ppt->vm != vm)
return (EBUSY);
for (i = 0; i < MAX_MMIOSEGS; i++) {
seg = &ppt->mmio[i];
if (seg->len == 0) {
error = vm_map_mmio(vm, gpa, len, hpa);
if (error == 0) {
seg->gpa = gpa;
seg->len = len;
}
return (error);
}
}
return (ENOSPC);
}
return (ENOENT);
}
static int
pptintr(void *arg)
{
int vec;
struct pptdev *ppt;
struct pptintr_arg *pptarg;
pptarg = arg;
ppt = pptarg->pptdev;
vec = pptarg->vec;
if (ppt->vm != NULL)
(void) lapic_set_intr(ppt->vm, pptarg->vcpu, vec);
else {
/*
* XXX
* This is not expected to happen - panic?
*/
}
/*
* For legacy interrupts give other filters a chance in case
* the interrupt was not generated by the passthrough device.
*/
if (ppt->msi.startrid == 0)
return (FILTER_STRAY);
else
return (FILTER_HANDLED);
}
/*
* XXX
* When we try to free the MSI resource the kernel will bind the thread to
* the host cpu was originally handling the MSI. The function freeing the
* MSI vector (apic_free_vector()) will panic the kernel if the thread
* is already bound to a cpu.
*
* So, we temporarily unbind the vcpu thread before freeing the MSI resource.
*/
static void
PPT_TEARDOWN_MSI(struct vm *vm, int vcpu, struct pptdev *ppt)
{
int pincpu = -1;
vm_get_pinning(vm, vcpu, &pincpu);
if (pincpu >= 0)
vm_set_pinning(vm, vcpu, -1);
ppt_teardown_msi(ppt);
if (pincpu >= 0)
vm_set_pinning(vm, vcpu, pincpu);
}
int
ppt_setup_msi(struct vm *vm, int vcpu, int bus, int slot, int func,
int destcpu, int vector, int numvec)
{
int i, rid, flags;
int msi_count, startrid, error, tmp;
struct pptdev *ppt;
if ((destcpu >= VM_MAXCPU || destcpu < 0) ||
(vector < 0 || vector > 255) ||
(numvec < 0 || numvec > MAX_MSIMSGS))
return (EINVAL);
ppt = ppt_find(bus, slot, func);
if (ppt == NULL)
return (ENOENT);
if (ppt->vm != vm) /* Make sure we own this device */
return (EBUSY);
/* Free any allocated resources */
PPT_TEARDOWN_MSI(vm, vcpu, ppt);
if (numvec == 0) /* nothing more to do */
return (0);
flags = RF_ACTIVE;
msi_count = pci_msi_count(ppt->dev);
if (msi_count == 0) {
startrid = 0; /* legacy interrupt */
msi_count = 1;
flags |= RF_SHAREABLE;
} else
startrid = 1; /* MSI */
/*
* The device must be capable of supporting the number of vectors
* the guest wants to allocate.
*/
if (numvec > msi_count)
return (EINVAL);
/*
* Make sure that we can allocate all the MSI vectors that are needed
* by the guest.
*/
if (startrid == 1) {
tmp = numvec;
error = pci_alloc_msi(ppt->dev, &tmp);
if (error)
return (error);
else if (tmp != numvec) {
pci_release_msi(ppt->dev);
return (ENOSPC);
} else {
/* success */
}
}
ppt->msi.startrid = startrid;
/*
* Allocate the irq resource and attach it to the interrupt handler.
*/
for (i = 0; i < numvec; i++) {
ppt->msi.num_msgs = i + 1;
ppt->msi.cookie[i] = NULL;
rid = startrid + i;
ppt->msi.res[i] = bus_alloc_resource_any(ppt->dev, SYS_RES_IRQ,
&rid, flags);
if (ppt->msi.res[i] == NULL)
break;
ppt->msi.arg[i].pptdev = ppt;
ppt->msi.arg[i].vec = vector + i;
ppt->msi.arg[i].vcpu = destcpu;
error = bus_setup_intr(ppt->dev, ppt->msi.res[i],
INTR_TYPE_NET | INTR_MPSAFE,
pptintr, NULL, &ppt->msi.arg[i],
&ppt->msi.cookie[i]);
if (error != 0)
break;
}
if (i < numvec) {
PPT_TEARDOWN_MSI(vm, vcpu, ppt);
return (ENXIO);
}
return (0);
}
int
ppt_setup_msix(struct vm *vm, int vcpu, int bus, int slot, int func,
int idx, uint32_t msg, uint32_t vector_control, uint64_t addr)
{
struct pptdev *ppt;
struct pci_devinfo *dinfo;
int numvec, alloced, rid, error;
size_t res_size, cookie_size, arg_size;
ppt = ppt_find(bus, slot, func);
if (ppt == NULL)
return (ENOENT);
if (ppt->vm != vm) /* Make sure we own this device */
return (EBUSY);
dinfo = device_get_ivars(ppt->dev);
if (!dinfo)
return (ENXIO);
/*
* First-time configuration:
* Allocate the MSI-X table
* Allocate the IRQ resources
* Set up some variables in ppt->msix
*/
if (ppt->msix.num_msgs == 0) {
numvec = pci_msix_count(ppt->dev);
if (numvec <= 0)
return (EINVAL);
ppt->msix.startrid = 1;
ppt->msix.num_msgs = numvec;
res_size = numvec * sizeof(ppt->msix.res[0]);
cookie_size = numvec * sizeof(ppt->msix.cookie[0]);
arg_size = numvec * sizeof(ppt->msix.arg[0]);
ppt->msix.res = malloc(res_size, M_PPTMSIX, M_WAITOK | M_ZERO);
ppt->msix.cookie = malloc(cookie_size, M_PPTMSIX,
M_WAITOK | M_ZERO);
ppt->msix.arg = malloc(arg_size, M_PPTMSIX, M_WAITOK | M_ZERO);
rid = dinfo->cfg.msix.msix_table_bar;
ppt->msix.msix_table_res = bus_alloc_resource_any(ppt->dev,
SYS_RES_MEMORY, &rid, RF_ACTIVE);
if (ppt->msix.msix_table_res == NULL) {
ppt_teardown_msix(ppt);
return (ENOSPC);
}
ppt->msix.msix_table_rid = rid;
alloced = numvec;
error = pci_alloc_msix(ppt->dev, &alloced);
if (error || alloced != numvec) {
ppt_teardown_msix(ppt);
return (error == 0 ? ENOSPC: error);
}
}
if ((vector_control & PCIM_MSIX_VCTRL_MASK) == 0) {
/* Tear down the IRQ if it's already set up */
ppt_teardown_msix_intr(ppt, idx);
/* Allocate the IRQ resource */
ppt->msix.cookie[idx] = NULL;
rid = ppt->msix.startrid + idx;
ppt->msix.res[idx] = bus_alloc_resource_any(ppt->dev, SYS_RES_IRQ,
&rid, RF_ACTIVE);
if (ppt->msix.res[idx] == NULL)
return (ENXIO);
ppt->msix.arg[idx].pptdev = ppt;
ppt->msix.arg[idx].vec = msg;
ppt->msix.arg[idx].vcpu = (addr >> 12) & 0xFF;
/* Setup the MSI-X interrupt */
error = bus_setup_intr(ppt->dev, ppt->msix.res[idx],
INTR_TYPE_NET | INTR_MPSAFE,
pptintr, NULL, &ppt->msix.arg[idx],
&ppt->msix.cookie[idx]);
if (error != 0) {
bus_teardown_intr(ppt->dev, ppt->msix.res[idx], ppt->msix.cookie[idx]);
bus_release_resource(ppt->dev, SYS_RES_IRQ, rid, ppt->msix.res[idx]);
ppt->msix.cookie[idx] = NULL;
ppt->msix.res[idx] = NULL;
return (ENXIO);
}
} else {
/* Masked, tear it down if it's already been set up */
ppt_teardown_msix_intr(ppt, idx);
}
return (0);
}

41
sys/amd64/vmm/io/ppt.h Normal file
View File

@ -0,0 +1,41 @@
/*-
* Copyright (c) 2011 NetApp, Inc.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* $FreeBSD$
*/
#ifndef _IO_PPT_H_
#define _IO_PPT_H_
int ppt_assign_device(struct vm *vm, int bus, int slot, int func);
int ppt_unassign_device(struct vm *vm, int bus, int slot, int func);
int ppt_unassign_all(struct vm *vm);
int ppt_map_mmio(struct vm *vm, int bus, int slot, int func,
vm_paddr_t gpa, size_t len, vm_paddr_t hpa);
int ppt_setup_msi(struct vm *vm, int vcpu, int bus, int slot, int func,
int destcpu, int vector, int numvec);
int ppt_setup_msix(struct vm *vm, int vcpu, int bus, int slot, int func,
int idx, uint32_t msg, uint32_t vector_control, uint64_t addr);
#endif

270
sys/amd64/vmm/io/vdev.c Normal file
View File

@ -0,0 +1,270 @@
/*-
* Copyright (c) 2011 NetApp, Inc.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* $FreeBSD$
*/
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
#include <sys/param.h>
#include <sys/kernel.h>
#include <sys/systm.h>
#include <sys/malloc.h>
#include "vdev.h"
struct vdev {
SLIST_ENTRY(vdev) entry;
struct vdev_ops *ops;
void *dev;
};
static SLIST_HEAD(, vdev) vdev_head;
static int vdev_count;
struct vdev_region {
SLIST_ENTRY(vdev_region) entry;
struct vdev_ops *ops;
void *dev;
struct io_region *io;
};
static SLIST_HEAD(, vdev_region) region_head;
static int region_count;
static MALLOC_DEFINE(M_VDEV, "vdev", "vdev");
#define VDEV_INIT (0)
#define VDEV_RESET (1)
#define VDEV_HALT (2)
// static const char* vdev_event_str[] = {"VDEV_INIT", "VDEV_RESET", "VDEV_HALT"};
static int
vdev_system_event(int event)
{
struct vdev *vd;
int rc;
// TODO: locking
SLIST_FOREACH(vd, &vdev_head, entry) {
// printf("%s : %s Device %s\n", __func__, vdev_event_str[event], vd->ops->name);
switch (event) {
case VDEV_INIT:
rc = vd->ops->init(vd->dev);
break;
case VDEV_RESET:
rc = vd->ops->reset(vd->dev);
break;
case VDEV_HALT:
rc = vd->ops->halt(vd->dev);
break;
default:
break;
}
if (rc) {
printf("vdev %s init failed rc=%d\n",
vd->ops->name, rc);
return rc;
}
}
return 0;
}
int
vdev_init(void)
{
return vdev_system_event(VDEV_INIT);
}
int
vdev_reset(void)
{
return vdev_system_event(VDEV_RESET);
}
int
vdev_halt(void)
{
return vdev_system_event(VDEV_HALT);
}
void
vdev_vm_init(void)
{
SLIST_INIT(&vdev_head);
vdev_count = 0;
SLIST_INIT(&region_head);
region_count = 0;
}
void
vdev_vm_cleanup(void)
{
struct vdev *vd;
// TODO: locking
while (!SLIST_EMPTY(&vdev_head)) {
vd = SLIST_FIRST(&vdev_head);
SLIST_REMOVE_HEAD(&vdev_head, entry);
free(vd, M_VDEV);
vdev_count--;
}
}
int
vdev_register(struct vdev_ops *ops, void *dev)
{
struct vdev *vd;
vd = malloc(sizeof(*vd), M_VDEV, M_WAITOK | M_ZERO);
vd->ops = ops;
vd->dev = dev;
// TODO: locking
SLIST_INSERT_HEAD(&vdev_head, vd, entry);
vdev_count++;
return 0;
}
void
vdev_unregister(void *dev)
{
struct vdev *vd, *found;
found = NULL;
// TODO: locking
SLIST_FOREACH(vd, &vdev_head, entry) {
if (vd->dev == dev) {
found = vd;
}
}
if (found) {
SLIST_REMOVE(&vdev_head, found, vdev, entry);
free(found, M_VDEV);
}
}
#define IN_RANGE(val, start, end) \
(((val) >= (start)) && ((val) < (end)))
static struct vdev_region*
vdev_find_region(struct io_region *io, void *dev)
{
struct vdev_region *region, *found;
uint64_t region_base;
uint64_t region_end;
found = NULL;
// TODO: locking
// FIXME: we should verify we are in the context the current
// vcpu here as well.
SLIST_FOREACH(region, &region_head, entry) {
region_base = region->io->base;
region_end = region_base + region->io->len;
if (IN_RANGE(io->base, region_base, region_end) &&
IN_RANGE(io->base+io->len, region_base, region_end+1) &&
(dev && dev == region->dev)) {
found = region;
break;
}
}
return found;
}
int
vdev_register_region(struct vdev_ops *ops, void *dev, struct io_region *io)
{
struct vdev_region *region;
region = vdev_find_region(io, dev);
if (region) {
return -EEXIST;
}
region = malloc(sizeof(*region), M_VDEV, M_WAITOK | M_ZERO);
region->io = io;
region->ops = ops;
region->dev = dev;
// TODO: locking
SLIST_INSERT_HEAD(&region_head, region, entry);
region_count++;
return 0;
}
void
vdev_unregister_region(void *dev, struct io_region *io)
{
struct vdev_region *region;
region = vdev_find_region(io, dev);
if (region) {
SLIST_REMOVE(&region_head, region, vdev_region, entry);
free(region, M_VDEV);
region_count--;
}
}
static int
vdev_memrw(uint64_t gpa, opsize_t size, uint64_t *data, int read)
{
struct vdev_region *region;
struct io_region io;
region_attr_t attr;
int rc;
io.base = gpa;
io.len = size;
region = vdev_find_region(&io, NULL);
if (!region)
return -EINVAL;
attr = (read) ? MMIO_READ : MMIO_WRITE;
if (!(region->io->attr & attr))
return -EPERM;
if (read)
rc = region->ops->memread(region->dev, gpa, size, data);
else
rc = region->ops->memwrite(region->dev, gpa, size, *data);
return rc;
}
int
vdev_memread(uint64_t gpa, opsize_t size, uint64_t *data)
{
return vdev_memrw(gpa, size, data, 1);
}
int
vdev_memwrite(uint64_t gpa, opsize_t size, uint64_t data)
{
return vdev_memrw(gpa, size, &data, 0);
}

84
sys/amd64/vmm/io/vdev.h Normal file
View File

@ -0,0 +1,84 @@
/*-
* Copyright (c) 2011 NetApp, Inc.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* $FreeBSD$
*/
#ifndef _VDEV_H_
#define _VDEV_H_
typedef enum {
BYTE = 1,
WORD = 2,
DWORD = 4,
QWORD = 8,
} opsize_t;
typedef enum {
MMIO_READ = 1,
MMIO_WRITE = 2,
} region_attr_t;
struct io_region {
uint64_t base;
uint64_t len;
region_attr_t attr;
int vcpu;
};
typedef int (*vdev_init_t)(void* dev);
typedef int (*vdev_reset_t)(void* dev);
typedef int (*vdev_halt_t)(void* dev);
typedef int (*vdev_memread_t)(void* dev, uint64_t gpa, opsize_t size, uint64_t *data);
typedef int (*vdev_memwrite_t)(void* dev, uint64_t gpa, opsize_t size, uint64_t data);
struct vdev_ops {
const char *name;
vdev_init_t init;
vdev_reset_t reset;
vdev_halt_t halt;
vdev_memread_t memread;
vdev_memwrite_t memwrite;
};
void vdev_vm_init(void);
void vdev_vm_cleanup(void);
int vdev_register(struct vdev_ops *ops, void *dev);
void vdev_unregister(void *dev);
int vdev_register_region(struct vdev_ops *ops, void *dev, struct io_region *io);
void vdev_unregister_region(void *dev, struct io_region *io);
int vdev_init(void);
int vdev_reset(void);
int vdev_halt(void);
int vdev_memread(uint64_t gpa, opsize_t size, uint64_t *data);
int vdev_memwrite(uint64_t gpa, opsize_t size, uint64_t data);
#endif /* _VDEV_H_ */

901
sys/amd64/vmm/io/vlapic.c Normal file
View File

@ -0,0 +1,901 @@
/*-
* Copyright (c) 2011 NetApp, Inc.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* $FreeBSD$
*/
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
#include <sys/param.h>
#include <sys/kernel.h>
#include <sys/malloc.h>
#include <sys/systm.h>
#include <sys/smp.h>
#include <machine/clock.h>
#include <x86/specialreg.h>
#include <x86/apicreg.h>
#include <machine/vmm.h>
#include "vmm_lapic.h"
#include "vmm_ktr.h"
#include "vdev.h"
#include "vlapic.h"
#define VLAPIC_CTR0(vlapic, format) \
VMM_CTR0((vlapic)->vm, (vlapic)->vcpuid, format)
#define VLAPIC_CTR1(vlapic, format, p1) \
VMM_CTR1((vlapic)->vm, (vlapic)->vcpuid, format, p1)
#define VLAPIC_CTR_IRR(vlapic, msg) \
do { \
uint32_t *irrptr = &(vlapic)->apic.irr0; \
irrptr[0] = irrptr[0]; /* silence compiler */ \
VLAPIC_CTR1((vlapic), msg " irr0 0x%08x", irrptr[0 << 2]); \
VLAPIC_CTR1((vlapic), msg " irr1 0x%08x", irrptr[1 << 2]); \
VLAPIC_CTR1((vlapic), msg " irr2 0x%08x", irrptr[2 << 2]); \
VLAPIC_CTR1((vlapic), msg " irr3 0x%08x", irrptr[3 << 2]); \
VLAPIC_CTR1((vlapic), msg " irr4 0x%08x", irrptr[4 << 2]); \
VLAPIC_CTR1((vlapic), msg " irr5 0x%08x", irrptr[5 << 2]); \
VLAPIC_CTR1((vlapic), msg " irr6 0x%08x", irrptr[6 << 2]); \
VLAPIC_CTR1((vlapic), msg " irr7 0x%08x", irrptr[7 << 2]); \
} while (0)
#define VLAPIC_CTR_ISR(vlapic, msg) \
do { \
uint32_t *isrptr = &(vlapic)->apic.isr0; \
isrptr[0] = isrptr[0]; /* silence compiler */ \
VLAPIC_CTR1((vlapic), msg " isr0 0x%08x", isrptr[0 << 2]); \
VLAPIC_CTR1((vlapic), msg " isr1 0x%08x", isrptr[1 << 2]); \
VLAPIC_CTR1((vlapic), msg " isr2 0x%08x", isrptr[2 << 2]); \
VLAPIC_CTR1((vlapic), msg " isr3 0x%08x", isrptr[3 << 2]); \
VLAPIC_CTR1((vlapic), msg " isr4 0x%08x", isrptr[4 << 2]); \
VLAPIC_CTR1((vlapic), msg " isr5 0x%08x", isrptr[5 << 2]); \
VLAPIC_CTR1((vlapic), msg " isr6 0x%08x", isrptr[6 << 2]); \
VLAPIC_CTR1((vlapic), msg " isr7 0x%08x", isrptr[7 << 2]); \
} while (0)
static MALLOC_DEFINE(M_VLAPIC, "vlapic", "vlapic");
#define PRIO(x) ((x) >> 4)
#define VLAPIC_VERSION (16)
#define VLAPIC_MAXLVT_ENTRIES (5)
#define x2apic(vlapic) (((vlapic)->msr_apicbase & APICBASE_X2APIC) ? 1 : 0)
enum boot_state {
BS_INIT,
BS_SIPI,
BS_RUNNING
};
struct vlapic {
struct vm *vm;
int vcpuid;
struct io_region *mmio;
struct vdev_ops *ops;
struct LAPIC apic;
int esr_update;
int divisor;
int ccr_ticks;
/*
* The 'isrvec_stk' is a stack of vectors injected by the local apic.
* A vector is popped from the stack when the processor does an EOI.
* The vector on the top of the stack is used to compute the
* Processor Priority in conjunction with the TPR.
*/
uint8_t isrvec_stk[ISRVEC_STK_SIZE];
int isrvec_stk_top;
uint64_t msr_apicbase;
enum boot_state boot_state;
};
#define VLAPIC_BUS_FREQ tsc_freq
static int
vlapic_timer_divisor(uint32_t dcr)
{
switch (dcr & 0xB) {
case APIC_TDCR_2:
return (2);
case APIC_TDCR_4:
return (4);
case APIC_TDCR_8:
return (8);
case APIC_TDCR_16:
return (16);
case APIC_TDCR_32:
return (32);
case APIC_TDCR_64:
return (64);
case APIC_TDCR_128:
return (128);
default:
panic("vlapic_timer_divisor: invalid dcr 0x%08x", dcr);
}
}
static void
vlapic_mask_lvts(uint32_t *lvts, int num_lvt)
{
int i;
for (i = 0; i < num_lvt; i++) {
*lvts |= APIC_LVT_M;
lvts += 4;
}
}
#if 0
static inline void
vlapic_dump_lvt(uint32_t offset, uint32_t *lvt)
{
printf("Offset %x: lvt %08x (V:%02x DS:%x M:%x)\n", offset,
*lvt, *lvt & APIC_LVTT_VECTOR, *lvt & APIC_LVTT_DS,
*lvt & APIC_LVTT_M);
}
#endif
static uint64_t
vlapic_get_ccr(struct vlapic *vlapic)
{
struct LAPIC *lapic = &vlapic->apic;
return lapic->ccr_timer;
}
static void
vlapic_update_errors(struct vlapic *vlapic)
{
struct LAPIC *lapic = &vlapic->apic;
lapic->esr = 0; // XXX
}
static void
vlapic_init_ipi(struct vlapic *vlapic)
{
struct LAPIC *lapic = &vlapic->apic;
lapic->version = VLAPIC_VERSION;
lapic->version |= (VLAPIC_MAXLVT_ENTRIES < MAXLVTSHIFT);
lapic->dfr = 0xffffffff;
lapic->svr = APIC_SVR_VECTOR;
vlapic_mask_lvts(&lapic->lvt_timer, VLAPIC_MAXLVT_ENTRIES+1);
}
static int
vlapic_op_reset(void* dev)
{
struct vlapic *vlapic = (struct vlapic*)dev;
struct LAPIC *lapic = &vlapic->apic;
memset(lapic, 0, sizeof(*lapic));
lapic->apr = vlapic->vcpuid;
vlapic_init_ipi(vlapic);
vlapic->divisor = vlapic_timer_divisor(lapic->dcr_timer);
if (vlapic->vcpuid == 0)
vlapic->boot_state = BS_RUNNING; /* BSP */
else
vlapic->boot_state = BS_INIT; /* AP */
return 0;
}
static int
vlapic_op_init(void* dev)
{
struct vlapic *vlapic = (struct vlapic*)dev;
vdev_register_region(vlapic->ops, vlapic, vlapic->mmio);
return vlapic_op_reset(dev);
}
static int
vlapic_op_halt(void* dev)
{
struct vlapic *vlapic = (struct vlapic*)dev;
vdev_unregister_region(vlapic, vlapic->mmio);
return 0;
}
void
vlapic_set_intr_ready(struct vlapic *vlapic, int vector)
{
struct LAPIC *lapic = &vlapic->apic;
uint32_t *irrptr;
int idx;
if (vector < 0 || vector >= 256)
panic("vlapic_set_intr_ready: invalid vector %d\n", vector);
idx = (vector / 32) * 4;
irrptr = &lapic->irr0;
atomic_set_int(&irrptr[idx], 1 << (vector % 32));
VLAPIC_CTR_IRR(vlapic, "vlapic_set_intr_ready");
}
static void
vlapic_start_timer(struct vlapic *vlapic, uint32_t elapsed)
{
uint32_t icr_timer;
icr_timer = vlapic->apic.icr_timer;
vlapic->ccr_ticks = ticks;
if (elapsed < icr_timer)
vlapic->apic.ccr_timer = icr_timer - elapsed;
else {
/*
* This can happen when the guest is trying to run its local
* apic timer higher that the setting of 'hz' in the host.
*
* We deal with this by running the guest local apic timer
* at the rate of the host's 'hz' setting.
*/
vlapic->apic.ccr_timer = 0;
}
}
static __inline uint32_t *
vlapic_get_lvt(struct vlapic *vlapic, uint32_t offset)
{
struct LAPIC *lapic = &vlapic->apic;
int i;
if (offset < APIC_OFFSET_TIMER_LVT || offset > APIC_OFFSET_ERROR_LVT) {
panic("vlapic_get_lvt: invalid LVT\n");
}
i = (offset - APIC_OFFSET_TIMER_LVT) >> 2;
return ((&lapic->lvt_timer) + i);;
}
#if 1
static void
dump_isrvec_stk(struct vlapic *vlapic)
{
int i;
uint32_t *isrptr;
isrptr = &vlapic->apic.isr0;
for (i = 0; i < 8; i++)
printf("ISR%d 0x%08x\n", i, isrptr[i * 4]);
for (i = 0; i <= vlapic->isrvec_stk_top; i++)
printf("isrvec_stk[%d] = %d\n", i, vlapic->isrvec_stk[i]);
}
#endif
/*
* Algorithm adopted from section "Interrupt, Task and Processor Priority"
* in Intel Architecture Manual Vol 3a.
*/
static void
vlapic_update_ppr(struct vlapic *vlapic)
{
int isrvec, tpr, ppr;
/*
* Note that the value on the stack at index 0 is always 0.
*
* This is a placeholder for the value of ISRV when none of the
* bits is set in the ISRx registers.
*/
isrvec = vlapic->isrvec_stk[vlapic->isrvec_stk_top];
tpr = vlapic->apic.tpr;
#if 1
{
int i, lastprio, curprio, vector, idx;
uint32_t *isrptr;
if (vlapic->isrvec_stk_top == 0 && isrvec != 0)
panic("isrvec_stk is corrupted: %d", isrvec);
/*
* Make sure that the priority of the nested interrupts is
* always increasing.
*/
lastprio = -1;
for (i = 1; i <= vlapic->isrvec_stk_top; i++) {
curprio = PRIO(vlapic->isrvec_stk[i]);
if (curprio <= lastprio) {
dump_isrvec_stk(vlapic);
panic("isrvec_stk does not satisfy invariant");
}
lastprio = curprio;
}
/*
* Make sure that each bit set in the ISRx registers has a
* corresponding entry on the isrvec stack.
*/
i = 1;
isrptr = &vlapic->apic.isr0;
for (vector = 0; vector < 256; vector++) {
idx = (vector / 32) * 4;
if (isrptr[idx] & (1 << (vector % 32))) {
if (i > vlapic->isrvec_stk_top ||
vlapic->isrvec_stk[i] != vector) {
dump_isrvec_stk(vlapic);
panic("ISR and isrvec_stk out of sync");
}
i++;
}
}
}
#endif
if (PRIO(tpr) >= PRIO(isrvec))
ppr = tpr;
else
ppr = isrvec & 0xf0;
vlapic->apic.ppr = ppr;
VLAPIC_CTR1(vlapic, "vlapic_update_ppr 0x%02x", ppr);
}
static void
vlapic_process_eoi(struct vlapic *vlapic)
{
struct LAPIC *lapic = &vlapic->apic;
uint32_t *isrptr;
int i, idx, bitpos;
isrptr = &lapic->isr0;
/*
* The x86 architecture reserves the the first 32 vectors for use
* by the processor.
*/
for (i = 7; i > 0; i--) {
idx = i * 4;
bitpos = fls(isrptr[idx]);
if (bitpos != 0) {
if (vlapic->isrvec_stk_top <= 0) {
panic("invalid vlapic isrvec_stk_top %d",
vlapic->isrvec_stk_top);
}
isrptr[idx] &= ~(1 << (bitpos - 1));
VLAPIC_CTR_ISR(vlapic, "vlapic_process_eoi");
vlapic->isrvec_stk_top--;
vlapic_update_ppr(vlapic);
return;
}
}
}
static __inline int
vlapic_get_lvt_field(uint32_t *lvt, uint32_t mask)
{
return (*lvt & mask);
}
static __inline int
vlapic_periodic_timer(struct vlapic *vlapic)
{
uint32_t *lvt;
lvt = vlapic_get_lvt(vlapic, APIC_OFFSET_TIMER_LVT);
return (vlapic_get_lvt_field(lvt, APIC_LVTT_TM_PERIODIC));
}
static void
vlapic_fire_timer(struct vlapic *vlapic)
{
int vector;
uint32_t *lvt;
lvt = vlapic_get_lvt(vlapic, APIC_OFFSET_TIMER_LVT);
if (!vlapic_get_lvt_field(lvt, APIC_LVTT_M)) {
vector = vlapic_get_lvt_field(lvt,APIC_LVTT_VECTOR);
vlapic_set_intr_ready(vlapic, vector);
}
}
static int
lapic_process_icr(struct vlapic *vlapic, uint64_t icrval)
{
int i;
cpuset_t dmask;
uint32_t dest, vec, mode;
struct vlapic *vlapic2;
struct vm_exit *vmexit;
if (x2apic(vlapic))
dest = icrval >> 32;
else
dest = icrval >> (32 + 24);
vec = icrval & APIC_VECTOR_MASK;
mode = icrval & APIC_DELMODE_MASK;
if (mode == APIC_DELMODE_FIXED || mode == APIC_DELMODE_NMI) {
switch (icrval & APIC_DEST_MASK) {
case APIC_DEST_DESTFLD:
CPU_SETOF(dest, &dmask);
break;
case APIC_DEST_SELF:
CPU_SETOF(vlapic->vcpuid, &dmask);
break;
case APIC_DEST_ALLISELF:
dmask = vm_active_cpus(vlapic->vm);
break;
case APIC_DEST_ALLESELF:
dmask = vm_active_cpus(vlapic->vm);
CPU_CLR(vlapic->vcpuid, &dmask);
break;
}
while ((i = cpusetobj_ffs(&dmask)) != 0) {
i--;
CPU_CLR(i, &dmask);
if (mode == APIC_DELMODE_FIXED)
lapic_set_intr(vlapic->vm, i, vec);
else
vm_inject_nmi(vlapic->vm, i);
}
return (0); /* handled completely in the kernel */
}
if (mode == APIC_DELMODE_INIT) {
if ((icrval & APIC_LEVEL_MASK) == APIC_LEVEL_DEASSERT)
return (0);
if (vlapic->vcpuid == 0 && dest != 0 && dest < VM_MAXCPU) {
vlapic2 = vm_lapic(vlapic->vm, dest);
/* move from INIT to waiting-for-SIPI state */
if (vlapic2->boot_state == BS_INIT) {
vlapic2->boot_state = BS_SIPI;
}
return (0);
}
}
if (mode == APIC_DELMODE_STARTUP) {
if (vlapic->vcpuid == 0 && dest != 0 && dest < VM_MAXCPU) {
vlapic2 = vm_lapic(vlapic->vm, dest);
/*
* Ignore SIPIs in any state other than wait-for-SIPI
*/
if (vlapic2->boot_state != BS_SIPI)
return (0);
vmexit = vm_exitinfo(vlapic->vm, vlapic->vcpuid);
vmexit->exitcode = VM_EXITCODE_SPINUP_AP;
vmexit->u.spinup_ap.vcpu = dest;
vmexit->u.spinup_ap.rip = vec << PAGE_SHIFT;
/*
* XXX this assumes that the startup IPI always succeeds
*/
vlapic2->boot_state = BS_RUNNING;
vm_activate_cpu(vlapic2->vm, dest);
return (0);
}
}
/*
* This will cause a return to userland.
*/
return (1);
}
int
vlapic_pending_intr(struct vlapic *vlapic)
{
struct LAPIC *lapic = &vlapic->apic;
int idx, i, bitpos, vector;
uint32_t *irrptr, val;
irrptr = &lapic->irr0;
/*
* The x86 architecture reserves the the first 32 vectors for use
* by the processor.
*/
for (i = 7; i > 0; i--) {
idx = i * 4;
val = atomic_load_acq_int(&irrptr[idx]);
bitpos = fls(val);
if (bitpos != 0) {
vector = i * 32 + (bitpos - 1);
if (PRIO(vector) > PRIO(lapic->ppr)) {
VLAPIC_CTR1(vlapic, "pending intr %d", vector);
return (vector);
} else
break;
}
}
VLAPIC_CTR0(vlapic, "no pending intr");
return (-1);
}
void
vlapic_intr_accepted(struct vlapic *vlapic, int vector)
{
struct LAPIC *lapic = &vlapic->apic;
uint32_t *irrptr, *isrptr;
int idx, stk_top;
/*
* clear the ready bit for vector being accepted in irr
* and set the vector as in service in isr.
*/
idx = (vector / 32) * 4;
irrptr = &lapic->irr0;
atomic_clear_int(&irrptr[idx], 1 << (vector % 32));
VLAPIC_CTR_IRR(vlapic, "vlapic_intr_accepted");
isrptr = &lapic->isr0;
isrptr[idx] |= 1 << (vector % 32);
VLAPIC_CTR_ISR(vlapic, "vlapic_intr_accepted");
/*
* Update the PPR
*/
vlapic->isrvec_stk_top++;
stk_top = vlapic->isrvec_stk_top;
if (stk_top >= ISRVEC_STK_SIZE)
panic("isrvec_stk_top overflow %d", stk_top);
vlapic->isrvec_stk[stk_top] = vector;
vlapic_update_ppr(vlapic);
}
int
vlapic_op_mem_read(void* dev, uint64_t gpa, opsize_t size, uint64_t *data)
{
struct vlapic *vlapic = (struct vlapic*)dev;
struct LAPIC *lapic = &vlapic->apic;
uint64_t offset = gpa & ~(PAGE_SIZE);
uint32_t *reg;
int i;
if (offset > sizeof(*lapic)) {
*data = 0;
return 0;
}
offset &= ~3;
switch(offset)
{
case APIC_OFFSET_ID:
if (x2apic(vlapic))
*data = vlapic->vcpuid;
else
*data = vlapic->vcpuid << 24;
break;
case APIC_OFFSET_VER:
*data = lapic->version;
break;
case APIC_OFFSET_TPR:
*data = lapic->tpr;
break;
case APIC_OFFSET_APR:
*data = lapic->apr;
break;
case APIC_OFFSET_PPR:
*data = lapic->ppr;
break;
case APIC_OFFSET_EOI:
*data = lapic->eoi;
break;
case APIC_OFFSET_LDR:
*data = lapic->ldr;
break;
case APIC_OFFSET_DFR:
*data = lapic->dfr;
break;
case APIC_OFFSET_SVR:
*data = lapic->svr;
break;
case APIC_OFFSET_ISR0 ... APIC_OFFSET_ISR7:
i = (offset - APIC_OFFSET_ISR0) >> 2;
reg = &lapic->isr0;
*data = *(reg + i);
break;
case APIC_OFFSET_TMR0 ... APIC_OFFSET_TMR7:
i = (offset - APIC_OFFSET_TMR0) >> 2;
reg = &lapic->tmr0;
*data = *(reg + i);
break;
case APIC_OFFSET_IRR0 ... APIC_OFFSET_IRR7:
i = (offset - APIC_OFFSET_IRR0) >> 2;
reg = &lapic->irr0;
*data = atomic_load_acq_int(reg + i);
break;
case APIC_OFFSET_ESR:
*data = lapic->esr;
break;
case APIC_OFFSET_ICR_LOW:
*data = lapic->icr_lo;
break;
case APIC_OFFSET_ICR_HI:
*data = lapic->icr_hi;
break;
case APIC_OFFSET_TIMER_LVT ... APIC_OFFSET_ERROR_LVT:
reg = vlapic_get_lvt(vlapic, offset);
*data = *(reg);
break;
case APIC_OFFSET_ICR:
*data = lapic->icr_timer;
break;
case APIC_OFFSET_CCR:
*data = vlapic_get_ccr(vlapic);
break;
case APIC_OFFSET_DCR:
*data = lapic->dcr_timer;
break;
case APIC_OFFSET_RRR:
default:
*data = 0;
break;
}
return 0;
}
int
vlapic_op_mem_write(void* dev, uint64_t gpa, opsize_t size, uint64_t data)
{
struct vlapic *vlapic = (struct vlapic*)dev;
struct LAPIC *lapic = &vlapic->apic;
uint64_t offset = gpa & ~(PAGE_SIZE);
uint32_t *reg;
int retval;
if (offset > sizeof(*lapic)) {
return 0;
}
retval = 0;
offset &= ~3;
switch(offset)
{
case APIC_OFFSET_ID:
break;
case APIC_OFFSET_TPR:
lapic->tpr = data & 0xff;
vlapic_update_ppr(vlapic);
break;
case APIC_OFFSET_EOI:
vlapic_process_eoi(vlapic);
break;
case APIC_OFFSET_LDR:
break;
case APIC_OFFSET_DFR:
break;
case APIC_OFFSET_SVR:
lapic->svr = data;
break;
case APIC_OFFSET_ICR_LOW:
if (!x2apic(vlapic)) {
data &= 0xffffffff;
data |= (uint64_t)lapic->icr_hi << 32;
}
retval = lapic_process_icr(vlapic, data);
break;
case APIC_OFFSET_ICR_HI:
if (!x2apic(vlapic)) {
retval = 0;
lapic->icr_hi = data;
}
break;
case APIC_OFFSET_TIMER_LVT ... APIC_OFFSET_ERROR_LVT:
reg = vlapic_get_lvt(vlapic, offset);
if (!(lapic->svr & APIC_SVR_ENABLE)) {
data |= APIC_LVT_M;
}
*reg = data;
// vlapic_dump_lvt(offset, reg);
break;
case APIC_OFFSET_ICR:
lapic->icr_timer = data;
vlapic_start_timer(vlapic, 0);
break;
case APIC_OFFSET_DCR:
lapic->dcr_timer = data;
vlapic->divisor = vlapic_timer_divisor(data);
break;
case APIC_OFFSET_ESR:
vlapic_update_errors(vlapic);
break;
case APIC_OFFSET_VER:
case APIC_OFFSET_APR:
case APIC_OFFSET_PPR:
case APIC_OFFSET_RRR:
case APIC_OFFSET_ISR0 ... APIC_OFFSET_ISR7:
case APIC_OFFSET_TMR0 ... APIC_OFFSET_TMR7:
case APIC_OFFSET_IRR0 ... APIC_OFFSET_IRR7:
case APIC_OFFSET_CCR:
default:
// Read only.
break;
}
return (retval);
}
int
vlapic_timer_tick(struct vlapic *vlapic)
{
int curticks, delta, periodic, fired;
uint32_t ccr;
uint32_t decrement, leftover;
restart:
curticks = ticks;
delta = curticks - vlapic->ccr_ticks;
/* Local APIC timer is disabled */
if (vlapic->apic.icr_timer == 0)
return (-1);
/* One-shot mode and timer has already counted down to zero */
periodic = vlapic_periodic_timer(vlapic);
if (!periodic && vlapic->apic.ccr_timer == 0)
return (-1);
/*
* The 'curticks' and 'ccr_ticks' are out of sync by more than
* 2^31 ticks. We deal with this by restarting the timer.
*/
if (delta < 0) {
vlapic_start_timer(vlapic, 0);
goto restart;
}
fired = 0;
decrement = (VLAPIC_BUS_FREQ / vlapic->divisor) / hz;
vlapic->ccr_ticks = curticks;
ccr = vlapic->apic.ccr_timer;
while (delta-- > 0) {
if (ccr > decrement) {
ccr -= decrement;
continue;
}
/* Trigger the local apic timer interrupt */
vlapic_fire_timer(vlapic);
if (periodic) {
leftover = decrement - ccr;
vlapic_start_timer(vlapic, leftover);
ccr = vlapic->apic.ccr_timer;
} else {
/*
* One-shot timer has counted down to zero.
*/
ccr = 0;
}
fired = 1;
break;
}
vlapic->apic.ccr_timer = ccr;
if (!fired)
return ((ccr / decrement) + 1);
else
return (0);
}
struct vdev_ops vlapic_dev_ops = {
.name = "vlapic",
.init = vlapic_op_init,
.reset = vlapic_op_reset,
.halt = vlapic_op_halt,
.memread = vlapic_op_mem_read,
.memwrite = vlapic_op_mem_write,
};
static struct io_region vlapic_mmio[VM_MAXCPU];
struct vlapic *
vlapic_init(struct vm *vm, int vcpuid)
{
struct vlapic *vlapic;
vlapic = malloc(sizeof(struct vlapic), M_VLAPIC, M_WAITOK | M_ZERO);
vlapic->vm = vm;
vlapic->vcpuid = vcpuid;
vlapic->msr_apicbase = DEFAULT_APIC_BASE | APICBASE_ENABLED;
if (vcpuid == 0)
vlapic->msr_apicbase |= APICBASE_BSP;
vlapic->ops = &vlapic_dev_ops;
vlapic->mmio = vlapic_mmio + vcpuid;
vlapic->mmio->base = DEFAULT_APIC_BASE;
vlapic->mmio->len = PAGE_SIZE;
vlapic->mmio->attr = MMIO_READ|MMIO_WRITE;
vlapic->mmio->vcpu = vcpuid;
vdev_register(&vlapic_dev_ops, vlapic);
vlapic_op_init(vlapic);
return (vlapic);
}
void
vlapic_cleanup(struct vlapic *vlapic)
{
vlapic_op_halt(vlapic);
vdev_unregister(vlapic);
free(vlapic, M_VLAPIC);
}
uint64_t
vlapic_get_apicbase(struct vlapic *vlapic)
{
return (vlapic->msr_apicbase);
}
void
vlapic_set_apicbase(struct vlapic *vlapic, uint64_t val)
{
int err;
enum x2apic_state state;
err = vm_get_x2apic_state(vlapic->vm, vlapic->vcpuid, &state);
if (err)
panic("vlapic_set_apicbase: err %d fetching x2apic state", err);
if (state == X2APIC_DISABLED)
val &= ~APICBASE_X2APIC;
vlapic->msr_apicbase = val;
}
void
vlapic_set_x2apic_state(struct vm *vm, int vcpuid, enum x2apic_state state)
{
struct vlapic *vlapic;
vlapic = vm_lapic(vm, vcpuid);
if (state == X2APIC_DISABLED)
vlapic->msr_apicbase &= ~APICBASE_X2APIC;
}

111
sys/amd64/vmm/io/vlapic.h Normal file
View File

@ -0,0 +1,111 @@
/*-
* Copyright (c) 2011 NetApp, Inc.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* $FreeBSD$
*/
#ifndef _VLAPIC_H_
#define _VLAPIC_H_
#include "vdev.h"
struct vm;
/*
* Map of APIC Registers: Offset Description Access
*/
#define APIC_OFFSET_ID 0x20 // Local APIC ID R/W
#define APIC_OFFSET_VER 0x30 // Local APIC Version R
#define APIC_OFFSET_TPR 0x80 // Task Priority Register R/W
#define APIC_OFFSET_APR 0x90 // Arbitration Priority Register R
#define APIC_OFFSET_PPR 0xA0 // Processor Priority Register R
#define APIC_OFFSET_EOI 0xB0 // EOI Register W
#define APIC_OFFSET_RRR 0xC0 // Remote read R
#define APIC_OFFSET_LDR 0xD0 // Logical Destination R/W
#define APIC_OFFSET_DFR 0xE0 // Destination Format Register 0..27 R; 28..31 R/W
#define APIC_OFFSET_SVR 0xF0 // Spurious Interrupt Vector Reg. 0..3 R; 4..9 R/W
#define APIC_OFFSET_ISR0 0x100 // ISR 000-031 R
#define APIC_OFFSET_ISR1 0x110 // ISR 032-063 R
#define APIC_OFFSET_ISR2 0x120 // ISR 064-095 R
#define APIC_OFFSET_ISR3 0x130 // ISR 095-128 R
#define APIC_OFFSET_ISR4 0x140 // ISR 128-159 R
#define APIC_OFFSET_ISR5 0x150 // ISR 160-191 R
#define APIC_OFFSET_ISR6 0x160 // ISR 192-223 R
#define APIC_OFFSET_ISR7 0x170 // ISR 224-255 R
#define APIC_OFFSET_TMR0 0x180 // TMR 000-031 R
#define APIC_OFFSET_TMR1 0x190 // TMR 032-063 R
#define APIC_OFFSET_TMR2 0x1A0 // TMR 064-095 R
#define APIC_OFFSET_TMR3 0x1B0 // TMR 095-128 R
#define APIC_OFFSET_TMR4 0x1C0 // TMR 128-159 R
#define APIC_OFFSET_TMR5 0x1D0 // TMR 160-191 R
#define APIC_OFFSET_TMR6 0x1E0 // TMR 192-223 R
#define APIC_OFFSET_TMR7 0x1F0 // TMR 224-255 R
#define APIC_OFFSET_IRR0 0x200 // IRR 000-031 R
#define APIC_OFFSET_IRR1 0x210 // IRR 032-063 R
#define APIC_OFFSET_IRR2 0x220 // IRR 064-095 R
#define APIC_OFFSET_IRR3 0x230 // IRR 095-128 R
#define APIC_OFFSET_IRR4 0x240 // IRR 128-159 R
#define APIC_OFFSET_IRR5 0x250 // IRR 160-191 R
#define APIC_OFFSET_IRR6 0x260 // IRR 192-223 R
#define APIC_OFFSET_IRR7 0x270 // IRR 224-255 R
#define APIC_OFFSET_ESR 0x280 // Error Status Register R
#define APIC_OFFSET_ICR_LOW 0x300 // Interrupt Command Reg. (0-31) R/W
#define APIC_OFFSET_ICR_HI 0x310 // Interrupt Command Reg. (32-63) R/W
#define APIC_OFFSET_TIMER_LVT 0x320 // Local Vector Table (Timer) R/W
#define APIC_OFFSET_THERM_LVT 0x330 // Local Vector Table (Thermal) R/W (PIV+)
#define APIC_OFFSET_PERF_LVT 0x340 // Local Vector Table (Performance) R/W (P6+)
#define APIC_OFFSET_LINT0_LVT 0x350 // Local Vector Table (LINT0) R/W
#define APIC_OFFSET_LINT1_LVT 0x360 // Local Vector Table (LINT1) R/W
#define APIC_OFFSET_ERROR_LVT 0x370 // Local Vector Table (ERROR) R/W
#define APIC_OFFSET_ICR 0x380 // Initial Count Reg. for Timer R/W
#define APIC_OFFSET_CCR 0x390 // Current Count of Timer R
#define APIC_OFFSET_DCR 0x3E0 // Timer Divide Configuration Reg. R/W
/*
* 16 priority levels with at most one vector injected per level.
*/
#define ISRVEC_STK_SIZE (16 + 1)
enum x2apic_state;
struct vlapic *vlapic_init(struct vm *vm, int vcpuid);
void vlapic_cleanup(struct vlapic *vlapic);
int vlapic_op_mem_write(void* dev, uint64_t gpa,
opsize_t size, uint64_t data);
int vlapic_op_mem_read(void* dev, uint64_t gpa,
opsize_t size, uint64_t *data);
int vlapic_pending_intr(struct vlapic *vlapic);
void vlapic_intr_accepted(struct vlapic *vlapic, int vector);
void vlapic_set_intr_ready(struct vlapic *vlapic, int vector);
int vlapic_timer_tick(struct vlapic *vlapic);
uint64_t vlapic_get_apicbase(struct vlapic *vlapic);
void vlapic_set_apicbase(struct vlapic *vlapic, uint64_t val);
void vlapic_set_x2apic_state(struct vm *vm, int vcpuid, enum x2apic_state s);
#endif /* _VLAPIC_H_ */

1022
sys/amd64/vmm/vmm.c Normal file

File diff suppressed because it is too large Load Diff

538
sys/amd64/vmm/vmm_dev.c Normal file
View File

@ -0,0 +1,538 @@
/*-
* Copyright (c) 2011 NetApp, Inc.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* $FreeBSD$
*/
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
#include <sys/param.h>
#include <sys/kernel.h>
#include <sys/queue.h>
#include <sys/lock.h>
#include <sys/mutex.h>
#include <sys/malloc.h>
#include <sys/conf.h>
#include <sys/sysctl.h>
#include <sys/libkern.h>
#include <sys/ioccom.h>
#include <sys/mman.h>
#include <sys/uio.h>
#include <vm/vm.h>
#include <vm/pmap.h>
#include <machine/pmap.h>
#include <machine/vmparam.h>
#include <machine/vmm.h>
#include "vmm_lapic.h"
#include "vmm_stat.h"
#include "vmm_mem.h"
#include "io/ppt.h"
#include <machine/vmm_dev.h>
struct vmmdev_softc {
struct vm *vm; /* vm instance cookie */
struct cdev *cdev;
SLIST_ENTRY(vmmdev_softc) link;
};
static SLIST_HEAD(, vmmdev_softc) head;
static struct mtx vmmdev_mtx;
static MALLOC_DEFINE(M_VMMDEV, "vmmdev", "vmmdev");
SYSCTL_DECL(_hw_vmm);
static struct vmmdev_softc *
vmmdev_lookup(const char *name)
{
struct vmmdev_softc *sc;
#ifdef notyet /* XXX kernel is not compiled with invariants */
mtx_assert(&vmmdev_mtx, MA_OWNED);
#endif
SLIST_FOREACH(sc, &head, link) {
if (strcmp(name, vm_name(sc->vm)) == 0)
break;
}
return (sc);
}
static struct vmmdev_softc *
vmmdev_lookup2(struct cdev *cdev)
{
return (cdev->si_drv1);
}
static int
vmmdev_rw(struct cdev *cdev, struct uio *uio, int flags)
{
int error, off, c;
vm_paddr_t hpa, gpa;
struct vmmdev_softc *sc;
static char zerobuf[PAGE_SIZE];
error = 0;
mtx_lock(&vmmdev_mtx);
sc = vmmdev_lookup2(cdev);
if (sc == NULL)
error = ENXIO;
while (uio->uio_resid > 0 && error == 0) {
gpa = uio->uio_offset;
off = gpa & PAGE_MASK;
c = min(uio->uio_resid, PAGE_SIZE - off);
/*
* The VM has a hole in its physical memory map. If we want to
* use 'dd' to inspect memory beyond the hole we need to
* provide bogus data for memory that lies in the hole.
*
* Since this device does not support lseek(2), dd(1) will
* read(2) blocks of data to simulate the lseek(2).
*/
hpa = vm_gpa2hpa(sc->vm, gpa, c);
if (hpa == (vm_paddr_t)-1) {
if (uio->uio_rw == UIO_READ)
error = uiomove(zerobuf, c, uio);
else
error = EFAULT;
} else
error = uiomove((void *)PHYS_TO_DMAP(hpa), c, uio);
}
mtx_unlock(&vmmdev_mtx);
return (error);
}
static int
vmmdev_ioctl(struct cdev *cdev, u_long cmd, caddr_t data, int fflag,
struct thread *td)
{
int error, vcpu, state_changed;
enum vcpu_state new_state;
struct vmmdev_softc *sc;
struct vm_memory_segment *seg;
struct vm_register *vmreg;
struct vm_seg_desc* vmsegdesc;
struct vm_pin *vmpin;
struct vm_run *vmrun;
struct vm_event *vmevent;
struct vm_lapic_irq *vmirq;
struct vm_capability *vmcap;
struct vm_pptdev *pptdev;
struct vm_pptdev_mmio *pptmmio;
struct vm_pptdev_msi *pptmsi;
struct vm_pptdev_msix *pptmsix;
struct vm_nmi *vmnmi;
struct vm_stats *vmstats;
struct vm_stat_desc *statdesc;
struct vm_x2apic *x2apic;
sc = vmmdev_lookup2(cdev);
if (sc == NULL)
return (ENXIO);
vcpu = -1;
state_changed = 0;
/*
* Some VMM ioctls can operate only on vcpus that are not running.
*/
switch (cmd) {
case VM_RUN:
case VM_SET_PINNING:
case VM_GET_REGISTER:
case VM_SET_REGISTER:
case VM_GET_SEGMENT_DESCRIPTOR:
case VM_SET_SEGMENT_DESCRIPTOR:
case VM_INJECT_EVENT:
case VM_GET_CAPABILITY:
case VM_SET_CAPABILITY:
case VM_PPTDEV_MSI:
case VM_PPTDEV_MSIX:
case VM_SET_X2APIC_STATE:
/*
* XXX fragile, handle with care
* Assumes that the first field of the ioctl data is the vcpu.
*/
vcpu = *(int *)data;
if (vcpu < 0 || vcpu >= VM_MAXCPU) {
error = EINVAL;
goto done;
}
if (cmd == VM_RUN)
new_state = VCPU_RUNNING;
else
new_state = VCPU_CANNOT_RUN;
error = vcpu_set_state(sc->vm, vcpu, new_state);
if (error)
goto done;
state_changed = 1;
break;
case VM_MAP_PPTDEV_MMIO:
case VM_BIND_PPTDEV:
case VM_UNBIND_PPTDEV:
case VM_MAP_MEMORY:
/*
* ioctls that operate on the entire virtual machine must
* prevent all vcpus from running.
*/
error = 0;
for (vcpu = 0; vcpu < VM_MAXCPU; vcpu++) {
error = vcpu_set_state(sc->vm, vcpu, VCPU_CANNOT_RUN);
if (error)
break;
}
if (error) {
while (--vcpu >= 0)
vcpu_set_state(sc->vm, vcpu, VCPU_IDLE);
goto done;
}
state_changed = 2;
break;
default:
break;
}
switch(cmd) {
case VM_RUN:
vmrun = (struct vm_run *)data;
error = vm_run(sc->vm, vmrun);
break;
case VM_STAT_DESC: {
const char *desc;
statdesc = (struct vm_stat_desc *)data;
desc = vmm_stat_desc(statdesc->index);
if (desc != NULL) {
error = 0;
strlcpy(statdesc->desc, desc, sizeof(statdesc->desc));
} else
error = EINVAL;
break;
}
case VM_STATS: {
CTASSERT(MAX_VM_STATS >= MAX_VMM_STAT_TYPES);
vmstats = (struct vm_stats *)data;
getmicrotime(&vmstats->tv);
error = vmm_stat_copy(sc->vm, vmstats->cpuid,
&vmstats->num_entries, vmstats->statbuf);
break;
}
case VM_PPTDEV_MSI:
pptmsi = (struct vm_pptdev_msi *)data;
error = ppt_setup_msi(sc->vm, pptmsi->vcpu,
pptmsi->bus, pptmsi->slot, pptmsi->func,
pptmsi->destcpu, pptmsi->vector,
pptmsi->numvec);
break;
case VM_PPTDEV_MSIX:
pptmsix = (struct vm_pptdev_msix *)data;
error = ppt_setup_msix(sc->vm, pptmsix->vcpu,
pptmsix->bus, pptmsix->slot,
pptmsix->func, pptmsix->idx,
pptmsix->msg, pptmsix->vector_control,
pptmsix->addr);
break;
case VM_MAP_PPTDEV_MMIO:
pptmmio = (struct vm_pptdev_mmio *)data;
error = ppt_map_mmio(sc->vm, pptmmio->bus, pptmmio->slot,
pptmmio->func, pptmmio->gpa, pptmmio->len,
pptmmio->hpa);
break;
case VM_BIND_PPTDEV:
pptdev = (struct vm_pptdev *)data;
error = ppt_assign_device(sc->vm, pptdev->bus, pptdev->slot,
pptdev->func);
break;
case VM_UNBIND_PPTDEV:
pptdev = (struct vm_pptdev *)data;
error = ppt_unassign_device(sc->vm, pptdev->bus, pptdev->slot,
pptdev->func);
break;
case VM_INJECT_EVENT:
vmevent = (struct vm_event *)data;
error = vm_inject_event(sc->vm, vmevent->cpuid, vmevent->type,
vmevent->vector,
vmevent->error_code,
vmevent->error_code_valid);
break;
case VM_INJECT_NMI:
vmnmi = (struct vm_nmi *)data;
error = vm_inject_nmi(sc->vm, vmnmi->cpuid);
break;
case VM_LAPIC_IRQ:
vmirq = (struct vm_lapic_irq *)data;
error = lapic_set_intr(sc->vm, vmirq->cpuid, vmirq->vector);
break;
case VM_SET_PINNING:
vmpin = (struct vm_pin *)data;
error = vm_set_pinning(sc->vm, vmpin->vm_cpuid,
vmpin->host_cpuid);
break;
case VM_GET_PINNING:
vmpin = (struct vm_pin *)data;
error = vm_get_pinning(sc->vm, vmpin->vm_cpuid,
&vmpin->host_cpuid);
break;
case VM_MAP_MEMORY:
seg = (struct vm_memory_segment *)data;
error = vm_malloc(sc->vm, seg->gpa, seg->len);
break;
case VM_GET_MEMORY_SEG:
seg = (struct vm_memory_segment *)data;
seg->len = 0;
(void)vm_gpabase2memseg(sc->vm, seg->gpa, seg);
error = 0;
break;
case VM_GET_REGISTER:
vmreg = (struct vm_register *)data;
error = vm_get_register(sc->vm, vmreg->cpuid, vmreg->regnum,
&vmreg->regval);
break;
case VM_SET_REGISTER:
vmreg = (struct vm_register *)data;
error = vm_set_register(sc->vm, vmreg->cpuid, vmreg->regnum,
vmreg->regval);
break;
case VM_SET_SEGMENT_DESCRIPTOR:
vmsegdesc = (struct vm_seg_desc *)data;
error = vm_set_seg_desc(sc->vm, vmsegdesc->cpuid,
vmsegdesc->regnum,
&vmsegdesc->desc);
break;
case VM_GET_SEGMENT_DESCRIPTOR:
vmsegdesc = (struct vm_seg_desc *)data;
error = vm_get_seg_desc(sc->vm, vmsegdesc->cpuid,
vmsegdesc->regnum,
&vmsegdesc->desc);
break;
case VM_GET_CAPABILITY:
vmcap = (struct vm_capability *)data;
error = vm_get_capability(sc->vm, vmcap->cpuid,
vmcap->captype,
&vmcap->capval);
break;
case VM_SET_CAPABILITY:
vmcap = (struct vm_capability *)data;
error = vm_set_capability(sc->vm, vmcap->cpuid,
vmcap->captype,
vmcap->capval);
break;
case VM_SET_X2APIC_STATE:
x2apic = (struct vm_x2apic *)data;
error = vm_set_x2apic_state(sc->vm,
x2apic->cpuid, x2apic->state);
break;
case VM_GET_X2APIC_STATE:
x2apic = (struct vm_x2apic *)data;
error = vm_get_x2apic_state(sc->vm,
x2apic->cpuid, &x2apic->state);
break;
default:
error = ENOTTY;
break;
}
if (state_changed == 1) {
vcpu_set_state(sc->vm, vcpu, VCPU_IDLE);
} else if (state_changed == 2) {
for (vcpu = 0; vcpu < VM_MAXCPU; vcpu++)
vcpu_set_state(sc->vm, vcpu, VCPU_IDLE);
}
done:
return (error);
}
static int
vmmdev_mmap(struct cdev *cdev, vm_ooffset_t offset, vm_paddr_t *paddr,
int nprot, vm_memattr_t *memattr)
{
int error;
struct vmmdev_softc *sc;
error = -1;
mtx_lock(&vmmdev_mtx);
sc = vmmdev_lookup2(cdev);
if (sc != NULL && (nprot & PROT_EXEC) == 0) {
*paddr = vm_gpa2hpa(sc->vm, (vm_paddr_t)offset, PAGE_SIZE);
if (*paddr != (vm_paddr_t)-1)
error = 0;
}
mtx_unlock(&vmmdev_mtx);
return (error);
}
static void
vmmdev_destroy(struct vmmdev_softc *sc, boolean_t unlink)
{
/*
* XXX must stop virtual machine instances that may be still
* running and cleanup their state.
*/
if (sc->cdev)
destroy_dev(sc->cdev);
if (sc->vm)
vm_destroy(sc->vm);
if (unlink) {
mtx_lock(&vmmdev_mtx);
SLIST_REMOVE(&head, sc, vmmdev_softc, link);
mtx_unlock(&vmmdev_mtx);
}
free(sc, M_VMMDEV);
}
static int
sysctl_vmm_destroy(SYSCTL_HANDLER_ARGS)
{
int error;
char buf[VM_MAX_NAMELEN];
struct vmmdev_softc *sc;
strlcpy(buf, "beavis", sizeof(buf));
error = sysctl_handle_string(oidp, buf, sizeof(buf), req);
if (error != 0 || req->newptr == NULL)
return (error);
/*
* XXX TODO if any process has this device open then fail
*/
mtx_lock(&vmmdev_mtx);
sc = vmmdev_lookup(buf);
if (sc == NULL) {
mtx_unlock(&vmmdev_mtx);
return (EINVAL);
}
sc->cdev->si_drv1 = NULL;
mtx_unlock(&vmmdev_mtx);
vmmdev_destroy(sc, TRUE);
return (0);
}
SYSCTL_PROC(_hw_vmm, OID_AUTO, destroy, CTLTYPE_STRING | CTLFLAG_RW,
NULL, 0, sysctl_vmm_destroy, "A", NULL);
static struct cdevsw vmmdevsw = {
.d_name = "vmmdev",
.d_version = D_VERSION,
.d_ioctl = vmmdev_ioctl,
.d_mmap = vmmdev_mmap,
.d_read = vmmdev_rw,
.d_write = vmmdev_rw,
};
static int
sysctl_vmm_create(SYSCTL_HANDLER_ARGS)
{
int error;
struct vm *vm;
struct vmmdev_softc *sc, *sc2;
char buf[VM_MAX_NAMELEN];
strlcpy(buf, "beavis", sizeof(buf));
error = sysctl_handle_string(oidp, buf, sizeof(buf), req);
if (error != 0 || req->newptr == NULL)
return (error);
mtx_lock(&vmmdev_mtx);
sc = vmmdev_lookup(buf);
mtx_unlock(&vmmdev_mtx);
if (sc != NULL)
return (EEXIST);
vm = vm_create(buf);
if (vm == NULL)
return (EINVAL);
sc = malloc(sizeof(struct vmmdev_softc), M_VMMDEV, M_WAITOK | M_ZERO);
sc->vm = vm;
/*
* Lookup the name again just in case somebody sneaked in when we
* dropped the lock.
*/
mtx_lock(&vmmdev_mtx);
sc2 = vmmdev_lookup(buf);
if (sc2 == NULL)
SLIST_INSERT_HEAD(&head, sc, link);
mtx_unlock(&vmmdev_mtx);
if (sc2 != NULL) {
vmmdev_destroy(sc, FALSE);
return (EEXIST);
}
sc->cdev = make_dev(&vmmdevsw, 0, UID_ROOT, GID_WHEEL, 0600,
"vmm/%s", buf);
sc->cdev->si_drv1 = sc;
return (0);
}
SYSCTL_PROC(_hw_vmm, OID_AUTO, create, CTLTYPE_STRING | CTLFLAG_RW,
NULL, 0, sysctl_vmm_create, "A", NULL);
void
vmmdev_init(void)
{
mtx_init(&vmmdev_mtx, "vmm device mutex", NULL, MTX_DEF);
}
int
vmmdev_cleanup(void)
{
int error;
if (SLIST_EMPTY(&head))
error = 0;
else
error = EBUSY;
return (error);
}

124
sys/amd64/vmm/vmm_host.c Normal file
View File

@ -0,0 +1,124 @@
/*-
* Copyright (c) 2012 NetApp, Inc.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* $FreeBSD$
*/
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
#include <sys/param.h>
#include <sys/pcpu.h>
#include <machine/cpufunc.h>
#include <machine/segments.h>
#include <machine/specialreg.h>
#include "vmm_host.h"
static uint64_t vmm_host_efer, vmm_host_pat, vmm_host_cr0, vmm_host_cr4;
void
vmm_host_state_init(void)
{
vmm_host_efer = rdmsr(MSR_EFER);
vmm_host_pat = rdmsr(MSR_PAT);
/*
* We always want CR0.TS to be set when the processor does a VM exit.
*
* With emulation turned on unconditionally after a VM exit, we are
* able to trap inadvertent use of the FPU until the guest FPU state
* has been safely squirreled away.
*/
vmm_host_cr0 = rcr0() | CR0_TS;
vmm_host_cr4 = rcr4();
}
uint64_t
vmm_get_host_pat(void)
{
return (vmm_host_pat);
}
uint64_t
vmm_get_host_efer(void)
{
return (vmm_host_efer);
}
uint64_t
vmm_get_host_cr0(void)
{
return (vmm_host_cr0);
}
uint64_t
vmm_get_host_cr4(void)
{
return (vmm_host_cr4);
}
uint64_t
vmm_get_host_datasel(void)
{
return (GSEL(GDATA_SEL, SEL_KPL));
}
uint64_t
vmm_get_host_codesel(void)
{
return (GSEL(GCODE_SEL, SEL_KPL));
}
uint64_t
vmm_get_host_tsssel(void)
{
return (GSEL(GPROC0_SEL, SEL_KPL));
}
uint64_t
vmm_get_host_fsbase(void)
{
return (0);
}
uint64_t
vmm_get_host_idtrbase(void)
{
return (r_idt.rd_base);
}

75
sys/amd64/vmm/vmm_host.h Normal file
View File

@ -0,0 +1,75 @@
/*-
* Copyright (c) 2012 NetApp, Inc.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* $FreeBSD$
*/
#ifndef _VMM_HOST_H_
#define _VMM_HOST_H_
#ifndef _KERNEL
#error "no user-servicable parts inside"
#endif
void vmm_host_state_init(void);
uint64_t vmm_get_host_pat(void);
uint64_t vmm_get_host_efer(void);
uint64_t vmm_get_host_cr0(void);
uint64_t vmm_get_host_cr4(void);
uint64_t vmm_get_host_datasel(void);
uint64_t vmm_get_host_codesel(void);
uint64_t vmm_get_host_tsssel(void);
uint64_t vmm_get_host_fsbase(void);
uint64_t vmm_get_host_idtrbase(void);
/*
* Inline access to host state that is used on every VM entry
*/
static __inline uint64_t
vmm_get_host_trbase(void)
{
return ((uint64_t)PCPU_GET(tssp));
}
static __inline uint64_t
vmm_get_host_gdtrbase(void)
{
return ((uint64_t)&gdt[NGDT * curcpu]);
}
struct pcpu;
extern struct pcpu __pcpu[];
static __inline uint64_t
vmm_get_host_gsbase(void)
{
return ((uint64_t)&__pcpu[curcpu]);
}
#endif

View File

@ -0,0 +1,810 @@
/*-
* Copyright (c) 2012 Sandvine, Inc.
* Copyright (c) 2012 NetApp, Inc.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* $FreeBSD$
*/
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
#ifdef _KERNEL
#include <sys/param.h>
#include <sys/pcpu.h>
#include <sys/systm.h>
#include <vm/vm.h>
#include <vm/pmap.h>
#include <machine/pmap.h>
#include <machine/vmparam.h>
#include <machine/vmm.h>
#else /* !_KERNEL */
#include <sys/types.h>
#include <sys/errno.h>
#include <machine/vmm.h>
#include <vmmapi.h>
#endif /* _KERNEL */
/* struct vie_op.op_type */
enum {
VIE_OP_TYPE_NONE = 0,
VIE_OP_TYPE_MOV,
VIE_OP_TYPE_AND,
VIE_OP_TYPE_LAST
};
/* struct vie_op.op_flags */
#define VIE_OP_F_IMM (1 << 0) /* immediate operand present */
#define VIE_OP_F_IMM8 (1 << 1) /* 8-bit immediate operand */
static const struct vie_op one_byte_opcodes[256] = {
[0x89] = {
.op_byte = 0x89,
.op_type = VIE_OP_TYPE_MOV,
},
[0x8B] = {
.op_byte = 0x8B,
.op_type = VIE_OP_TYPE_MOV,
},
[0xC7] = {
.op_byte = 0xC7,
.op_type = VIE_OP_TYPE_MOV,
.op_flags = VIE_OP_F_IMM,
},
[0x23] = {
.op_byte = 0x23,
.op_type = VIE_OP_TYPE_AND,
},
[0x81] = {
/* XXX Group 1 extended opcode - not just AND */
.op_byte = 0x81,
.op_type = VIE_OP_TYPE_AND,
.op_flags = VIE_OP_F_IMM,
}
};
/* struct vie.mod */
#define VIE_MOD_INDIRECT 0
#define VIE_MOD_INDIRECT_DISP8 1
#define VIE_MOD_INDIRECT_DISP32 2
#define VIE_MOD_DIRECT 3
/* struct vie.rm */
#define VIE_RM_SIB 4
#define VIE_RM_DISP32 5
#define GB (1024 * 1024 * 1024)
static enum vm_reg_name gpr_map[16] = {
VM_REG_GUEST_RAX,
VM_REG_GUEST_RCX,
VM_REG_GUEST_RDX,
VM_REG_GUEST_RBX,
VM_REG_GUEST_RSP,
VM_REG_GUEST_RBP,
VM_REG_GUEST_RSI,
VM_REG_GUEST_RDI,
VM_REG_GUEST_R8,
VM_REG_GUEST_R9,
VM_REG_GUEST_R10,
VM_REG_GUEST_R11,
VM_REG_GUEST_R12,
VM_REG_GUEST_R13,
VM_REG_GUEST_R14,
VM_REG_GUEST_R15
};
static uint64_t size2mask[] = {
[1] = 0xff,
[2] = 0xffff,
[4] = 0xffffffff,
[8] = 0xffffffffffffffff,
};
static int
vie_valid_register(enum vm_reg_name reg)
{
#ifdef _KERNEL
/*
* XXX
* The operand register in which we store the result of the
* read must be a GPR that we can modify even if the vcpu
* is "running". All the GPRs qualify except for %rsp.
*
* This is a limitation of the vm_set_register() API
* and can be fixed if necessary.
*/
if (reg == VM_REG_GUEST_RSP)
return (0);
#endif
return (1);
}
static int
vie_read_register(void *vm, int vcpuid, enum vm_reg_name reg, uint64_t *rval)
{
int error;
if (!vie_valid_register(reg))
return (EINVAL);
error = vm_get_register(vm, vcpuid, reg, rval);
return (error);
}
static int
vie_update_register(void *vm, int vcpuid, enum vm_reg_name reg,
uint64_t val, int size)
{
int error;
uint64_t origval;
if (!vie_valid_register(reg))
return (EINVAL);
switch (size) {
case 1:
case 2:
error = vie_read_register(vm, vcpuid, reg, &origval);
if (error)
return (error);
val &= size2mask[size];
val |= origval & ~size2mask[size];
break;
case 4:
val &= 0xffffffffUL;
break;
case 8:
break;
default:
return (EINVAL);
}
error = vm_set_register(vm, vcpuid, reg, val);
return (error);
}
/*
* The following simplifying assumptions are made during emulation:
*
* - guest is in 64-bit mode
* - default address size is 64-bits
* - default operand size is 32-bits
*
* - operand size override is not supported
*
* - address size override is not supported
*/
static int
emulate_mov(void *vm, int vcpuid, uint64_t gpa, struct vie *vie,
mem_region_read_t memread, mem_region_write_t memwrite, void *arg)
{
int error, size;
enum vm_reg_name reg;
uint64_t val;
size = 4;
error = EINVAL;
switch (vie->op.op_byte) {
case 0x89:
/*
* MOV from reg (ModRM:reg) to mem (ModRM:r/m)
* 89/r: mov r/m32, r32
* REX.W + 89/r mov r/m64, r64
*/
if (vie->rex_w)
size = 8;
reg = gpr_map[vie->reg];
error = vie_read_register(vm, vcpuid, reg, &val);
if (error == 0) {
val &= size2mask[size];
error = memwrite(vm, vcpuid, gpa, val, size, arg);
}
break;
case 0x8B:
/*
* MOV from mem (ModRM:r/m) to reg (ModRM:reg)
* 8B/r: mov r32, r/m32
* REX.W 8B/r: mov r64, r/m64
*/
if (vie->rex_w)
size = 8;
error = memread(vm, vcpuid, gpa, &val, size, arg);
if (error == 0) {
reg = gpr_map[vie->reg];
error = vie_update_register(vm, vcpuid, reg, val, size);
}
break;
case 0xC7:
/*
* MOV from imm32 to mem (ModRM:r/m)
* C7/0 mov r/m32, imm32
* REX.W + C7/0 mov r/m64, imm32 (sign-extended to 64-bits)
*/
val = vie->immediate; /* already sign-extended */
if (vie->rex_w)
size = 8;
if (size != 8)
val &= size2mask[size];
error = memwrite(vm, vcpuid, gpa, val, size, arg);
break;
default:
break;
}
return (error);
}
static int
emulate_and(void *vm, int vcpuid, uint64_t gpa, struct vie *vie,
mem_region_read_t memread, mem_region_write_t memwrite, void *arg)
{
int error, size;
enum vm_reg_name reg;
uint64_t val1, val2;
size = 4;
error = EINVAL;
switch (vie->op.op_byte) {
case 0x23:
/*
* AND reg (ModRM:reg) and mem (ModRM:r/m) and store the
* result in reg.
*
* 23/r and r32, r/m32
* REX.W + 23/r and r64, r/m64
*/
if (vie->rex_w)
size = 8;
/* get the first operand */
reg = gpr_map[vie->reg];
error = vie_read_register(vm, vcpuid, reg, &val1);
if (error)
break;
/* get the second operand */
error = memread(vm, vcpuid, gpa, &val2, size, arg);
if (error)
break;
/* perform the operation and write the result */
val1 &= val2;
error = vie_update_register(vm, vcpuid, reg, val1, size);
break;
case 0x81:
/*
* AND reg (ModRM:reg) with immediate and store the
* result in reg
*
* 81/ and r/m32, imm32
* REX.W + 81/ and r/m64, imm32 sign-extended to 64
*
* Currently, only the AND operation of the 0x81 opcode
* is implemented (ModRM:reg = b100).
*/
if ((vie->reg & 7) != 4)
break;
if (vie->rex_w)
size = 8;
/* get the first operand */
error = memread(vm, vcpuid, gpa, &val1, size, arg);
if (error)
break;
/*
* perform the operation with the pre-fetched immediate
* operand and write the result
*/
val1 &= vie->immediate;
error = memwrite(vm, vcpuid, gpa, val1, size, arg);
break;
default:
break;
}
return (error);
}
int
vmm_emulate_instruction(void *vm, int vcpuid, uint64_t gpa, struct vie *vie,
mem_region_read_t memread, mem_region_write_t memwrite,
void *memarg)
{
int error;
if (!vie->decoded)
return (EINVAL);
switch (vie->op.op_type) {
case VIE_OP_TYPE_MOV:
error = emulate_mov(vm, vcpuid, gpa, vie,
memread, memwrite, memarg);
break;
case VIE_OP_TYPE_AND:
error = emulate_and(vm, vcpuid, gpa, vie,
memread, memwrite, memarg);
break;
default:
error = EINVAL;
break;
}
return (error);
}
#ifdef _KERNEL
static void
vie_init(struct vie *vie)
{
bzero(vie, sizeof(struct vie));
vie->base_register = VM_REG_LAST;
vie->index_register = VM_REG_LAST;
}
static int
gla2gpa(struct vm *vm, uint64_t gla, uint64_t ptpphys,
uint64_t *gpa, uint64_t *gpaend)
{
vm_paddr_t hpa;
int nlevels, ptpshift, ptpindex;
uint64_t *ptpbase, pte, pgsize;
/*
* XXX assumes 64-bit guest with 4 page walk levels
*/
nlevels = 4;
while (--nlevels >= 0) {
/* Zero out the lower 12 bits and the upper 12 bits */
ptpphys >>= 12; ptpphys <<= 24; ptpphys >>= 12;
hpa = vm_gpa2hpa(vm, ptpphys, PAGE_SIZE);
if (hpa == -1)
goto error;
ptpbase = (uint64_t *)PHYS_TO_DMAP(hpa);
ptpshift = PAGE_SHIFT + nlevels * 9;
ptpindex = (gla >> ptpshift) & 0x1FF;
pgsize = 1UL << ptpshift;
pte = ptpbase[ptpindex];
if ((pte & PG_V) == 0)
goto error;
if (pte & PG_PS) {
if (pgsize > 1 * GB)
goto error;
else
break;
}
ptpphys = pte;
}
/* Zero out the lower 'ptpshift' bits and the upper 12 bits */
pte >>= ptpshift; pte <<= (ptpshift + 12); pte >>= 12;
*gpa = pte | (gla & (pgsize - 1));
*gpaend = pte + pgsize;
return (0);
error:
return (-1);
}
int
vmm_fetch_instruction(struct vm *vm, int cpuid, uint64_t rip, int inst_length,
uint64_t cr3, struct vie *vie)
{
int n, err;
uint64_t hpa, gpa, gpaend, off;
/*
* XXX cache previously fetched instructions using 'rip' as the tag
*/
if (inst_length > VIE_INST_SIZE)
panic("vmm_fetch_instruction: invalid length %d", inst_length);
vie_init(vie);
/* Copy the instruction into 'vie' */
while (vie->num_valid < inst_length) {
err = gla2gpa(vm, rip, cr3, &gpa, &gpaend);
if (err)
break;
off = gpa & PAGE_MASK;
n = min(inst_length - vie->num_valid, PAGE_SIZE - off);
hpa = vm_gpa2hpa(vm, gpa, n);
if (hpa == -1)
break;
bcopy((void *)PHYS_TO_DMAP(hpa), &vie->inst[vie->num_valid], n);
rip += n;
vie->num_valid += n;
}
if (vie->num_valid == inst_length)
return (0);
else
return (-1);
}
static int
vie_peek(struct vie *vie, uint8_t *x)
{
if (vie->num_processed < vie->num_valid) {
*x = vie->inst[vie->num_processed];
return (0);
} else
return (-1);
}
static void
vie_advance(struct vie *vie)
{
vie->num_processed++;
}
static int
decode_rex(struct vie *vie)
{
uint8_t x;
if (vie_peek(vie, &x))
return (-1);
if (x >= 0x40 && x <= 0x4F) {
vie->rex_w = x & 0x8 ? 1 : 0;
vie->rex_r = x & 0x4 ? 1 : 0;
vie->rex_x = x & 0x2 ? 1 : 0;
vie->rex_b = x & 0x1 ? 1 : 0;
vie_advance(vie);
}
return (0);
}
static int
decode_opcode(struct vie *vie)
{
uint8_t x;
if (vie_peek(vie, &x))
return (-1);
vie->op = one_byte_opcodes[x];
if (vie->op.op_type == VIE_OP_TYPE_NONE)
return (-1);
vie_advance(vie);
return (0);
}
/*
* XXX assuming 32-bit or 64-bit guest
*/
static int
decode_modrm(struct vie *vie)
{
uint8_t x;
if (vie_peek(vie, &x))
return (-1);
vie->mod = (x >> 6) & 0x3;
vie->rm = (x >> 0) & 0x7;
vie->reg = (x >> 3) & 0x7;
/*
* A direct addressing mode makes no sense in the context of an EPT
* fault. There has to be a memory access involved to cause the
* EPT fault.
*/
if (vie->mod == VIE_MOD_DIRECT)
return (-1);
if ((vie->mod == VIE_MOD_INDIRECT && vie->rm == VIE_RM_DISP32) ||
(vie->mod != VIE_MOD_DIRECT && vie->rm == VIE_RM_SIB)) {
/*
* Table 2-5: Special Cases of REX Encodings
*
* mod=0, r/m=5 is used in the compatibility mode to
* indicate a disp32 without a base register.
*
* mod!=3, r/m=4 is used in the compatibility mode to
* indicate that the SIB byte is present.
*
* The 'b' bit in the REX prefix is don't care in
* this case.
*/
} else {
vie->rm |= (vie->rex_b << 3);
}
vie->reg |= (vie->rex_r << 3);
/* SIB */
if (vie->mod != VIE_MOD_DIRECT && vie->rm == VIE_RM_SIB)
goto done;
vie->base_register = gpr_map[vie->rm];
switch (vie->mod) {
case VIE_MOD_INDIRECT_DISP8:
vie->disp_bytes = 1;
break;
case VIE_MOD_INDIRECT_DISP32:
vie->disp_bytes = 4;
break;
case VIE_MOD_INDIRECT:
if (vie->rm == VIE_RM_DISP32) {
vie->disp_bytes = 4;
vie->base_register = VM_REG_LAST; /* no base */
}
break;
}
/* Figure out immediate operand size (if any) */
if (vie->op.op_flags & VIE_OP_F_IMM)
vie->imm_bytes = 4;
else if (vie->op.op_flags & VIE_OP_F_IMM8)
vie->imm_bytes = 1;
done:
vie_advance(vie);
return (0);
}
static int
decode_sib(struct vie *vie)
{
uint8_t x;
/* Proceed only if SIB byte is present */
if (vie->mod == VIE_MOD_DIRECT || vie->rm != VIE_RM_SIB)
return (0);
if (vie_peek(vie, &x))
return (-1);
/* De-construct the SIB byte */
vie->ss = (x >> 6) & 0x3;
vie->index = (x >> 3) & 0x7;
vie->base = (x >> 0) & 0x7;
/* Apply the REX prefix modifiers */
vie->index |= vie->rex_x << 3;
vie->base |= vie->rex_b << 3;
switch (vie->mod) {
case VIE_MOD_INDIRECT_DISP8:
vie->disp_bytes = 1;
break;
case VIE_MOD_INDIRECT_DISP32:
vie->disp_bytes = 4;
break;
}
if (vie->mod == VIE_MOD_INDIRECT &&
(vie->base == 5 || vie->base == 13)) {
/*
* Special case when base register is unused if mod = 0
* and base = %rbp or %r13.
*
* Documented in:
* Table 2-3: 32-bit Addressing Forms with the SIB Byte
* Table 2-5: Special Cases of REX Encodings
*/
vie->disp_bytes = 4;
} else {
vie->base_register = gpr_map[vie->base];
}
/*
* All encodings of 'index' are valid except for %rsp (4).
*
* Documented in:
* Table 2-3: 32-bit Addressing Forms with the SIB Byte
* Table 2-5: Special Cases of REX Encodings
*/
if (vie->index != 4)
vie->index_register = gpr_map[vie->index];
/* 'scale' makes sense only in the context of an index register */
if (vie->index_register < VM_REG_LAST)
vie->scale = 1 << vie->ss;
vie_advance(vie);
return (0);
}
static int
decode_displacement(struct vie *vie)
{
int n, i;
uint8_t x;
union {
char buf[4];
int8_t signed8;
int32_t signed32;
} u;
if ((n = vie->disp_bytes) == 0)
return (0);
if (n != 1 && n != 4)
panic("decode_displacement: invalid disp_bytes %d", n);
for (i = 0; i < n; i++) {
if (vie_peek(vie, &x))
return (-1);
u.buf[i] = x;
vie_advance(vie);
}
if (n == 1)
vie->displacement = u.signed8; /* sign-extended */
else
vie->displacement = u.signed32; /* sign-extended */
return (0);
}
static int
decode_immediate(struct vie *vie)
{
int i, n;
uint8_t x;
union {
char buf[4];
int8_t signed8;
int32_t signed32;
} u;
if ((n = vie->imm_bytes) == 0)
return (0);
if (n != 1 && n != 4)
panic("decode_immediate: invalid imm_bytes %d", n);
for (i = 0; i < n; i++) {
if (vie_peek(vie, &x))
return (-1);
u.buf[i] = x;
vie_advance(vie);
}
if (n == 1)
vie->immediate = u.signed8; /* sign-extended */
else
vie->immediate = u.signed32; /* sign-extended */
return (0);
}
#define VERIFY_GLA
/*
* Verify that the 'guest linear address' provided as collateral of the nested
* page table fault matches with our instruction decoding.
*/
#ifdef VERIFY_GLA
static int
verify_gla(struct vm *vm, int cpuid, uint64_t gla, struct vie *vie)
{
int error;
uint64_t base, idx;
base = 0;
if (vie->base_register != VM_REG_LAST) {
error = vm_get_register(vm, cpuid, vie->base_register, &base);
if (error) {
printf("verify_gla: error %d getting base reg %d\n",
error, vie->base_register);
return (-1);
}
}
idx = 0;
if (vie->index_register != VM_REG_LAST) {
error = vm_get_register(vm, cpuid, vie->index_register, &idx);
if (error) {
printf("verify_gla: error %d getting index reg %d\n",
error, vie->index_register);
return (-1);
}
}
if (base + vie->scale * idx + vie->displacement != gla) {
printf("verify_gla mismatch: "
"base(0x%0lx), scale(%d), index(0x%0lx), "
"disp(0x%0lx), gla(0x%0lx)\n",
base, vie->scale, idx, vie->displacement, gla);
return (-1);
}
return (0);
}
#endif /* VERIFY_GLA */
int
vmm_decode_instruction(struct vm *vm, int cpuid, uint64_t gla, struct vie *vie)
{
if (decode_rex(vie))
return (-1);
if (decode_opcode(vie))
return (-1);
if (decode_modrm(vie))
return (-1);
if (decode_sib(vie))
return (-1);
if (decode_displacement(vie))
return (-1);
if (decode_immediate(vie))
return (-1);
#ifdef VERIFY_GLA
if (verify_gla(vm, cpuid, gla, vie))
return (-1);
#endif
vie->decoded = 1; /* success */
return (0);
}
#endif /* _KERNEL */

93
sys/amd64/vmm/vmm_ipi.c Normal file
View File

@ -0,0 +1,93 @@
/*-
* Copyright (c) 2011 NetApp, Inc.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* $FreeBSD$
*/
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/proc.h>
#include <sys/bus.h>
#include <machine/intr_machdep.h>
#include <machine/apicvar.h>
#include <machine/segments.h>
#include <machine/md_var.h>
#include <machine/vmm.h>
#include "vmm_ipi.h"
extern inthand_t IDTVEC(rsvd), IDTVEC(justreturn);
/*
* The default is to use the IPI_AST to interrupt a vcpu.
*/
int vmm_ipinum = IPI_AST;
CTASSERT(APIC_SPURIOUS_INT == 255);
void
vmm_ipi_init(void)
{
int idx;
uintptr_t func;
struct gate_descriptor *ip;
/*
* Search backwards from the highest IDT vector available for use
* as our IPI vector. We install the 'justreturn' handler at that
* vector and use it to interrupt the vcpus.
*
* We do this because the IPI_AST is heavyweight and saves all
* registers in the trapframe. This is overkill for our use case
* which is simply to EOI the interrupt and return.
*/
idx = APIC_SPURIOUS_INT;
while (--idx >= APIC_IPI_INTS) {
ip = &idt[idx];
func = ((long)ip->gd_hioffset << 16 | ip->gd_looffset);
if (func == (uintptr_t)&IDTVEC(rsvd)) {
vmm_ipinum = idx;
setidt(vmm_ipinum, IDTVEC(justreturn), SDT_SYSIGT,
SEL_KPL, 0);
break;
}
}
if (vmm_ipinum != IPI_AST && bootverbose) {
printf("vmm_ipi_init: installing ipi handler to interrupt "
"vcpus at vector %d\n", vmm_ipinum);
}
}
void
vmm_ipi_cleanup(void)
{
if (vmm_ipinum != IPI_AST)
setidt(vmm_ipinum, IDTVEC(rsvd), SDT_SYSIGT, SEL_KPL, 0);
}

39
sys/amd64/vmm/vmm_ipi.h Normal file
View File

@ -0,0 +1,39 @@
/*-
* Copyright (c) 2011 NetApp, Inc.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* $FreeBSD$
*/
#ifndef _VMM_IPI_H_
#define _VMM_IPI_H_
struct vm;
extern int vmm_ipinum;
void vmm_ipi_init(void);
void vmm_ipi_cleanup(void);
#endif

51
sys/amd64/vmm/vmm_ktr.h Normal file
View File

@ -0,0 +1,51 @@
/*-
* Copyright (c) 2011 NetApp, Inc.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* $FreeBSD$
*/
#ifndef _VMM_KTR_H_
#define _VMM_KTR_H_
#include <sys/ktr.h>
#include <sys/pcpu.h>
#define KTR_VMM KTR_GEN
#define VMM_CTR0(vm, vcpuid, format) \
CTR3(KTR_VMM, "vm %s-%d(%d): " format, vm_name((vm)), (vcpuid), curcpu)
#define VMM_CTR1(vm, vcpuid, format, p1) \
CTR4(KTR_VMM, "vm %s-%d(%d): " format, vm_name((vm)), (vcpuid), curcpu, \
(p1))
#define VMM_CTR2(vm, vcpuid, format, p1, p2) \
CTR5(KTR_VMM, "vm %s-%d(%d): " format, vm_name((vm)), (vcpuid), curcpu, \
(p1), (p2))
#define VMM_CTR3(vm, vcpuid, format, p1, p2, p3) \
CTR6(KTR_VMM, "vm %s-%d(%d): " format, vm_name((vm)), (vcpuid), curcpu, \
(p1), (p2), (p3))
#endif

201
sys/amd64/vmm/vmm_lapic.c Normal file
View File

@ -0,0 +1,201 @@
/*-
* Copyright (c) 2011 NetApp, Inc.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* $FreeBSD$
*/
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/smp.h>
#include <x86/specialreg.h>
#include <x86/apicreg.h>
#include <machine/vmm.h>
#include "vmm_ipi.h"
#include "vmm_lapic.h"
#include "vlapic.h"
int
lapic_pending_intr(struct vm *vm, int cpu)
{
struct vlapic *vlapic;
vlapic = vm_lapic(vm, cpu);
return (vlapic_pending_intr(vlapic));
}
void
lapic_intr_accepted(struct vm *vm, int cpu, int vector)
{
struct vlapic *vlapic;
vlapic = vm_lapic(vm, cpu);
vlapic_intr_accepted(vlapic, vector);
}
int
lapic_set_intr(struct vm *vm, int cpu, int vector)
{
struct vlapic *vlapic;
if (cpu < 0 || cpu >= VM_MAXCPU)
return (EINVAL);
if (vector < 32 || vector > 255)
return (EINVAL);
vlapic = vm_lapic(vm, cpu);
vlapic_set_intr_ready(vlapic, vector);
vm_interrupt_hostcpu(vm, cpu);
return (0);
}
int
lapic_timer_tick(struct vm *vm, int cpu)
{
struct vlapic *vlapic;
vlapic = vm_lapic(vm, cpu);
return (vlapic_timer_tick(vlapic));
}
static boolean_t
x2apic_msr(u_int msr)
{
if (msr >= 0x800 && msr <= 0xBFF)
return (TRUE);
else
return (FALSE);
}
static u_int
x2apic_msr_to_regoff(u_int msr)
{
return ((msr - 0x800) << 4);
}
boolean_t
lapic_msr(u_int msr)
{
if (x2apic_msr(msr) || (msr == MSR_APICBASE))
return (TRUE);
else
return (FALSE);
}
int
lapic_rdmsr(struct vm *vm, int cpu, u_int msr, uint64_t *rval)
{
int error;
u_int offset;
struct vlapic *vlapic;
vlapic = vm_lapic(vm, cpu);
if (msr == MSR_APICBASE) {
*rval = vlapic_get_apicbase(vlapic);
error = 0;
} else {
offset = x2apic_msr_to_regoff(msr);
error = vlapic_op_mem_read(vlapic, offset, DWORD, rval);
}
return (error);
}
int
lapic_wrmsr(struct vm *vm, int cpu, u_int msr, uint64_t val)
{
int error;
u_int offset;
struct vlapic *vlapic;
vlapic = vm_lapic(vm, cpu);
if (msr == MSR_APICBASE) {
vlapic_set_apicbase(vlapic, val);
error = 0;
} else {
offset = x2apic_msr_to_regoff(msr);
error = vlapic_op_mem_write(vlapic, offset, DWORD, val);
}
return (error);
}
int
lapic_mmio_write(void *vm, int cpu, uint64_t gpa, uint64_t wval, int size,
void *arg)
{
int error;
uint64_t off;
struct vlapic *vlapic;
off = gpa - DEFAULT_APIC_BASE;
/*
* Memory mapped local apic accesses must be 4 bytes wide and
* aligned on a 16-byte boundary.
*/
if (size != 4 || off & 0xf)
return (EINVAL);
vlapic = vm_lapic(vm, cpu);
error = vlapic_op_mem_write(vlapic, off, DWORD, wval);
return (error);
}
int
lapic_mmio_read(void *vm, int cpu, uint64_t gpa, uint64_t *rval, int size,
void *arg)
{
int error;
uint64_t off;
struct vlapic *vlapic;
off = gpa - DEFAULT_APIC_BASE;
/*
* Memory mapped local apic accesses must be 4 bytes wide and
* aligned on a 16-byte boundary.
*/
if (size != 4 || off & 0xf)
return (EINVAL);
vlapic = vm_lapic(vm, cpu);
error = vlapic_op_mem_read(vlapic, off, DWORD, rval);
return (error);
}

71
sys/amd64/vmm/vmm_lapic.h Normal file
View File

@ -0,0 +1,71 @@
/*-
* Copyright (c) 2011 NetApp, Inc.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* $FreeBSD$
*/
#ifndef _VMM_LAPIC_H_
#define _VMM_LAPIC_H_
struct vm;
boolean_t lapic_msr(u_int num);
int lapic_rdmsr(struct vm *vm, int cpu, u_int msr, uint64_t *rval);
int lapic_wrmsr(struct vm *vm, int cpu, u_int msr, uint64_t wval);
int lapic_mmio_read(void *vm, int cpu, uint64_t gpa,
uint64_t *rval, int size, void *arg);
int lapic_mmio_write(void *vm, int cpu, uint64_t gpa,
uint64_t wval, int size, void *arg);
int lapic_timer_tick(struct vm *vm, int cpu);
/*
* Returns a vector between 32 and 255 if an interrupt is pending in the
* IRR that can be delivered based on the current state of ISR and TPR.
*
* Note that the vector does not automatically transition to the ISR as a
* result of calling this function.
*
* Returns -1 if there is no eligible vector that can be delivered to the
* guest at this time.
*/
int lapic_pending_intr(struct vm *vm, int cpu);
/*
* Transition 'vector' from IRR to ISR. This function is called with the
* vector returned by 'lapic_pending_intr()' when the guest is able to
* accept this interrupt (i.e. RFLAGS.IF = 1 and no conditions exist that
* block interrupt delivery).
*/
void lapic_intr_accepted(struct vm *vm, int cpu, int vector);
/*
* Signals to the LAPIC that an interrupt at 'vector' needs to be generated
* to the 'cpu', the state is recorded in IRR.
*/
int lapic_set_intr(struct vm *vm, int cpu, int vector);
#endif

135
sys/amd64/vmm/vmm_mem.c Normal file
View File

@ -0,0 +1,135 @@
/*-
* Copyright (c) 2011 NetApp, Inc.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* $FreeBSD$
*/
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
#include <sys/param.h>
#include <sys/lock.h>
#include <sys/mutex.h>
#include <sys/linker.h>
#include <sys/systm.h>
#include <sys/malloc.h>
#include <sys/kernel.h>
#include <sys/sysctl.h>
#include <vm/vm.h>
#include <vm/pmap.h>
#include <vm/vm_page.h>
#include <vm/vm_pageout.h>
#include <machine/md_var.h>
#include <machine/metadata.h>
#include <machine/pc/bios.h>
#include <machine/vmparam.h>
#include <machine/pmap.h>
#include "vmm_util.h"
#include "vmm_mem.h"
SYSCTL_DECL(_hw_vmm);
static u_long pages_allocated;
SYSCTL_ULONG(_hw_vmm, OID_AUTO, pages_allocated, CTLFLAG_RD,
&pages_allocated, 0, "4KB pages allocated");
static void
update_pages_allocated(int howmany)
{
pages_allocated += howmany; /* XXX locking? */
}
int
vmm_mem_init(void)
{
return (0);
}
vm_paddr_t
vmm_mem_alloc(size_t size)
{
int flags;
vm_page_t m;
vm_paddr_t pa;
if (size != PAGE_SIZE)
panic("vmm_mem_alloc: invalid allocation size %lu", size);
flags = VM_ALLOC_NORMAL | VM_ALLOC_NOOBJ | VM_ALLOC_WIRED |
VM_ALLOC_ZERO;
while (1) {
/*
* XXX need policy to determine when to back off the allocation
*/
m = vm_page_alloc(NULL, 0, flags);
if (m == NULL)
VM_WAIT;
else
break;
}
pa = VM_PAGE_TO_PHYS(m);
if ((m->flags & PG_ZERO) == 0)
pagezero((void *)PHYS_TO_DMAP(pa));
m->valid = VM_PAGE_BITS_ALL;
update_pages_allocated(1);
return (pa);
}
void
vmm_mem_free(vm_paddr_t base, size_t length)
{
vm_page_t m;
if (base & PAGE_MASK) {
panic("vmm_mem_free: base 0x%0lx must be aligned on a "
"0x%0x boundary\n", base, PAGE_SIZE);
}
if (length != PAGE_SIZE)
panic("vmm_mem_free: invalid length %lu", length);
m = PHYS_TO_VM_PAGE(base);
m->wire_count--;
vm_page_free(m);
atomic_subtract_int(&cnt.v_wire_count, 1);
update_pages_allocated(-1);
}
vm_paddr_t
vmm_mem_maxaddr(void)
{
return (ptoa(Maxmem));
}

37
sys/amd64/vmm/vmm_mem.h Normal file
View File

@ -0,0 +1,37 @@
/*-
* Copyright (c) 2011 NetApp, Inc.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* $FreeBSD$
*/
#ifndef _VMM_MEM_H_
#define _VMM_MEM_H_
int vmm_mem_init(void);
vm_paddr_t vmm_mem_alloc(size_t size);
void vmm_mem_free(vm_paddr_t start, size_t size);
vm_paddr_t vmm_mem_maxaddr(void);
#endif

254
sys/amd64/vmm/vmm_msr.c Normal file
View File

@ -0,0 +1,254 @@
/*-
* Copyright (c) 2011 NetApp, Inc.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* $FreeBSD$
*/
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/smp.h>
#include <machine/specialreg.h>
#include <machine/vmm.h>
#include "vmm_lapic.h"
#include "vmm_msr.h"
#define VMM_MSR_F_EMULATE 0x01
#define VMM_MSR_F_READONLY 0x02
#define VMM_MSR_F_INVALID 0x04 /* guest_msr_valid() can override this */
struct vmm_msr {
int num;
int flags;
uint64_t hostval;
};
static struct vmm_msr vmm_msr[] = {
{ MSR_LSTAR, 0 },
{ MSR_CSTAR, 0 },
{ MSR_STAR, 0 },
{ MSR_SF_MASK, 0 },
{ MSR_PAT, VMM_MSR_F_EMULATE | VMM_MSR_F_INVALID },
{ MSR_BIOS_SIGN,VMM_MSR_F_EMULATE },
{ MSR_MCG_CAP, VMM_MSR_F_EMULATE | VMM_MSR_F_READONLY },
};
#define vmm_msr_num (sizeof(vmm_msr) / sizeof(vmm_msr[0]))
CTASSERT(VMM_MSR_NUM >= vmm_msr_num);
#define readonly_msr(idx) \
((vmm_msr[(idx)].flags & VMM_MSR_F_READONLY) != 0)
#define emulated_msr(idx) \
((vmm_msr[(idx)].flags & VMM_MSR_F_EMULATE) != 0)
#define invalid_msr(idx) \
((vmm_msr[(idx)].flags & VMM_MSR_F_INVALID) != 0)
void
vmm_msr_init(void)
{
int i;
for (i = 0; i < vmm_msr_num; i++) {
if (emulated_msr(i))
continue;
/*
* XXX this assumes that the value of the host msr does not
* change after we have cached it.
*/
vmm_msr[i].hostval = rdmsr(vmm_msr[i].num);
}
}
void
guest_msrs_init(struct vm *vm, int cpu)
{
int i;
uint64_t *guest_msrs;
guest_msrs = vm_guest_msrs(vm, cpu);
for (i = 0; i < vmm_msr_num; i++) {
switch (vmm_msr[i].num) {
case MSR_LSTAR:
case MSR_CSTAR:
case MSR_STAR:
case MSR_SF_MASK:
case MSR_BIOS_SIGN:
case MSR_MCG_CAP:
guest_msrs[i] = 0;
break;
case MSR_PAT:
guest_msrs[i] = PAT_VALUE(0, PAT_WRITE_BACK) |
PAT_VALUE(1, PAT_WRITE_THROUGH) |
PAT_VALUE(2, PAT_UNCACHED) |
PAT_VALUE(3, PAT_UNCACHEABLE) |
PAT_VALUE(4, PAT_WRITE_BACK) |
PAT_VALUE(5, PAT_WRITE_THROUGH) |
PAT_VALUE(6, PAT_UNCACHED) |
PAT_VALUE(7, PAT_UNCACHEABLE);
break;
default:
panic("guest_msrs_init: missing initialization for msr "
"0x%0x", vmm_msr[i].num);
}
}
}
static int
msr_num_to_idx(u_int num)
{
int i;
for (i = 0; i < vmm_msr_num; i++)
if (vmm_msr[i].num == num)
return (i);
return (-1);
}
int
emulate_wrmsr(struct vm *vm, int cpu, u_int num, uint64_t val)
{
int idx;
uint64_t *guest_msrs;
if (lapic_msr(num))
return (lapic_wrmsr(vm, cpu, num, val));
idx = msr_num_to_idx(num);
if (idx < 0 || invalid_msr(idx))
return (EINVAL);
if (!readonly_msr(idx)) {
guest_msrs = vm_guest_msrs(vm, cpu);
/* Stash the value */
guest_msrs[idx] = val;
/* Update processor state for non-emulated MSRs */
if (!emulated_msr(idx))
wrmsr(vmm_msr[idx].num, val);
}
return (0);
}
int
emulate_rdmsr(struct vm *vm, int cpu, u_int num)
{
int error, idx;
uint32_t eax, edx;
uint64_t result, *guest_msrs;
if (lapic_msr(num)) {
error = lapic_rdmsr(vm, cpu, num, &result);
goto done;
}
idx = msr_num_to_idx(num);
if (idx < 0 || invalid_msr(idx)) {
error = EINVAL;
goto done;
}
guest_msrs = vm_guest_msrs(vm, cpu);
result = guest_msrs[idx];
/*
* If this is not an emulated msr register make sure that the processor
* state matches our cached state.
*/
if (!emulated_msr(idx) && (rdmsr(num) != result)) {
panic("emulate_rdmsr: msr 0x%0x has inconsistent cached "
"(0x%016lx) and actual (0x%016lx) values", num,
result, rdmsr(num));
}
error = 0;
done:
if (error == 0) {
eax = result;
edx = result >> 32;
error = vm_set_register(vm, cpu, VM_REG_GUEST_RAX, eax);
if (error)
panic("vm_set_register(rax) error %d", error);
error = vm_set_register(vm, cpu, VM_REG_GUEST_RDX, edx);
if (error)
panic("vm_set_register(rdx) error %d", error);
}
return (error);
}
void
restore_guest_msrs(struct vm *vm, int cpu)
{
int i;
uint64_t *guest_msrs;
guest_msrs = vm_guest_msrs(vm, cpu);
for (i = 0; i < vmm_msr_num; i++) {
if (emulated_msr(i))
continue;
else
wrmsr(vmm_msr[i].num, guest_msrs[i]);
}
}
void
restore_host_msrs(struct vm *vm, int cpu)
{
int i;
for (i = 0; i < vmm_msr_num; i++) {
if (emulated_msr(i))
continue;
else
wrmsr(vmm_msr[i].num, vmm_msr[i].hostval);
}
}
/*
* Must be called by the CPU-specific code before any guests are
* created
*/
void
guest_msr_valid(int msr)
{
int i;
for (i = 0; i < vmm_msr_num; i++) {
if (vmm_msr[i].num == msr && invalid_msr(i)) {
vmm_msr[i].flags &= ~VMM_MSR_F_INVALID;
}
}
}

43
sys/amd64/vmm/vmm_msr.h Normal file
View File

@ -0,0 +1,43 @@
/*-
* Copyright (c) 2011 NetApp, Inc.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* $FreeBSD$
*/
#ifndef _VMM_MSR_H_
#define _VMM_MSR_H_
#define VMM_MSR_NUM 16
struct vm;
void vmm_msr_init(void);
int emulate_wrmsr(struct vm *vm, int vcpu, u_int msr, uint64_t val);
int emulate_rdmsr(struct vm *vm, int vcpu, u_int msr);
void guest_msrs_init(struct vm *vm, int cpu);
void guest_msr_valid(int msr);
void restore_host_msrs(struct vm *vm, int cpu);
void restore_guest_msrs(struct vm *vm, int cpu);
#endif

104
sys/amd64/vmm/vmm_stat.c Normal file
View File

@ -0,0 +1,104 @@
/*-
* Copyright (c) 2011 NetApp, Inc.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* $FreeBSD$
*/
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
#include <sys/param.h>
#include <sys/kernel.h>
#include <sys/systm.h>
#include <sys/malloc.h>
#include <sys/smp.h>
#include <machine/vmm.h>
#include "vmm_stat.h"
static int vstnum;
static struct vmm_stat_type *vsttab[MAX_VMM_STAT_TYPES];
static MALLOC_DEFINE(M_VMM_STAT, "vmm stat", "vmm stat");
void
vmm_stat_init(void *arg)
{
struct vmm_stat_type *vst = arg;
/* We require all stats to identify themselves with a description */
if (vst->desc == NULL)
return;
if (vstnum >= MAX_VMM_STAT_TYPES) {
printf("Cannot accomodate vmm stat type \"%s\"!\n", vst->desc);
return;
}
vst->index = vstnum;
vsttab[vstnum++] = vst;
}
int
vmm_stat_copy(struct vm *vm, int vcpu, int *num_stats, uint64_t *buf)
{
int i;
uint64_t *stats;
if (vcpu < 0 || vcpu >= VM_MAXCPU)
return (EINVAL);
stats = vcpu_stats(vm, vcpu);
for (i = 0; i < vstnum; i++)
buf[i] = stats[i];
*num_stats = vstnum;
return (0);
}
void *
vmm_stat_alloc(void)
{
u_long size;
size = vstnum * sizeof(uint64_t);
return (malloc(size, M_VMM_STAT, M_ZERO | M_WAITOK));
}
void
vmm_stat_free(void *vp)
{
free(vp, M_VMM_STAT);
}
const char *
vmm_stat_desc(int index)
{
if (index >= 0 && index < vstnum)
return (vsttab[index]->desc);
else
return (NULL);
}

71
sys/amd64/vmm/vmm_stat.h Normal file
View File

@ -0,0 +1,71 @@
/*-
* Copyright (c) 2011 NetApp, Inc.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 4. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* $FreeBSD$
*/
#ifndef _VMM_STAT_H_
#define _VMM_STAT_H_
struct vm;
#define MAX_VMM_STAT_TYPES 64 /* arbitrary */
struct vmm_stat_type {
const char *desc; /* description of statistic */
int index; /* position in the stats buffer */
};
void vmm_stat_init(void *arg);
#define VMM_STAT_DEFINE(type, desc) \
struct vmm_stat_type type[1] = { \
{ desc, -1 } \
}; \
SYSINIT(type##_stat, SI_SUB_KLD, SI_ORDER_ANY, vmm_stat_init, type)
void *vmm_stat_alloc(void);
void vmm_stat_free(void *vp);
/*
* 'buf' should be at least fit 'MAX_VMM_STAT_TYPES' entries
*/
int vmm_stat_copy(struct vm *vm, int vcpu, int *num_stats, uint64_t *buf);
const char *vmm_stat_desc(int index);
static void __inline
vmm_stat_incr(struct vm *vm, int vcpu, struct vmm_stat_type *vst, uint64_t x)
{
#ifdef VMM_KEEP_STATS
uint64_t *stats = vcpu_stats(vm, vcpu);
if (vst->index >= 0)
stats[vst->index] += x;
#endif
}
#endif

View File

@ -0,0 +1,42 @@
/*-
* Copyright (c) 2011 NetApp, Inc.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* $FreeBSD$
*/
#define LOCORE
#include <machine/asmacros.h>
#define LA_EOI 0xB0
.text
SUPERALIGN_TEXT
IDTVEC(justreturn)
pushq %rax
movq lapic, %rax
movl $0, LA_EOI(%rax)
popq %rax
iretq

111
sys/amd64/vmm/vmm_util.c Normal file
View File

@ -0,0 +1,111 @@
/*-
* Copyright (c) 2011 NetApp, Inc.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* $FreeBSD$
*/
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
#include <sys/param.h>
#include <sys/libkern.h>
#include <machine/md_var.h>
#include "vmm_util.h"
boolean_t
vmm_is_intel(void)
{
if (strcmp(cpu_vendor, "GenuineIntel") == 0)
return (TRUE);
else
return (FALSE);
}
boolean_t
vmm_is_amd(void)
{
if (strcmp(cpu_vendor, "AuthenticAMD") == 0)
return (TRUE);
else
return (FALSE);
}
boolean_t
vmm_supports_1G_pages(void)
{
unsigned int regs[4];
/*
* CPUID.80000001:EDX[bit 26] = 1 indicates support for 1GB pages
*
* Both Intel and AMD support this bit.
*/
if (cpu_exthigh >= 0x80000001) {
do_cpuid(0x80000001, regs);
if (regs[3] & (1 << 26))
return (TRUE);
}
return (FALSE);
}
#include <sys/proc.h>
#include <machine/frame.h>
#define DUMP_REG(x) printf(#x "\t\t0x%016lx\n", (long)(tf->tf_ ## x))
#define DUMP_SEG(x) printf(#x "\t\t0x%04x\n", (unsigned)(tf->tf_ ## x))
void
dump_trapframe(struct trapframe *tf)
{
DUMP_REG(rdi);
DUMP_REG(rsi);
DUMP_REG(rdx);
DUMP_REG(rcx);
DUMP_REG(r8);
DUMP_REG(r9);
DUMP_REG(rax);
DUMP_REG(rbx);
DUMP_REG(rbp);
DUMP_REG(r10);
DUMP_REG(r11);
DUMP_REG(r12);
DUMP_REG(r13);
DUMP_REG(r14);
DUMP_REG(r15);
DUMP_REG(trapno);
DUMP_REG(addr);
DUMP_REG(flags);
DUMP_REG(err);
DUMP_REG(rip);
DUMP_REG(rflags);
DUMP_REG(rsp);
DUMP_SEG(cs);
DUMP_SEG(ss);
DUMP_SEG(fs);
DUMP_SEG(gs);
DUMP_SEG(es);
DUMP_SEG(ds);
}

40
sys/amd64/vmm/vmm_util.h Normal file
View File

@ -0,0 +1,40 @@
/*-
* Copyright (c) 2011 NetApp, Inc.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* $FreeBSD$
*/
#ifndef _VMM_UTIL_H_
#define _VMM_UTIL_H_
struct trapframe;
boolean_t vmm_is_intel(void);
boolean_t vmm_is_amd(void);
boolean_t vmm_supports_1G_pages(void);
void dump_trapframe(struct trapframe *tf);
#endif

202
sys/amd64/vmm/x86.c Normal file
View File

@ -0,0 +1,202 @@
/*-
* Copyright (c) 2011 NetApp, Inc.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* $FreeBSD$
*/
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
#include <sys/param.h>
#include <sys/types.h>
#include <sys/systm.h>
#include <sys/cpuset.h>
#include <machine/cpufunc.h>
#include <machine/md_var.h>
#include <machine/specialreg.h>
#include <machine/vmm.h>
#include "x86.h"
#define CPUID_VM_HIGH 0x40000000
static const char bhyve_id[12] = "BHyVE BHyVE ";
int
x86_emulate_cpuid(struct vm *vm, int vcpu_id,
uint32_t *eax, uint32_t *ebx, uint32_t *ecx, uint32_t *edx)
{
int error;
unsigned int func, regs[4];
enum x2apic_state x2apic_state;
func = *eax;
/*
* Requests for invalid CPUID levels should map to the highest
* available level instead.
*/
if (cpu_exthigh != 0 && *eax >= 0x80000000) {
if (*eax > cpu_exthigh)
*eax = cpu_exthigh;
} else if (*eax >= 0x40000000) {
if (*eax > CPUID_VM_HIGH)
*eax = CPUID_VM_HIGH;
} else if (*eax > cpu_high) {
*eax = cpu_high;
}
/*
* In general the approach used for CPU topology is to
* advertise a flat topology where all CPUs are packages with
* no multi-core or SMT.
*/
switch (func) {
case CPUID_0000_0000:
case CPUID_0000_0002:
case CPUID_0000_0003:
case CPUID_0000_000A:
cpuid_count(*eax, *ecx, regs);
break;
case CPUID_8000_0000:
case CPUID_8000_0001:
case CPUID_8000_0002:
case CPUID_8000_0003:
case CPUID_8000_0004:
case CPUID_8000_0006:
case CPUID_8000_0007:
case CPUID_8000_0008:
cpuid_count(*eax, *ecx, regs);
break;
case CPUID_0000_0001:
do_cpuid(1, regs);
error = vm_get_x2apic_state(vm, vcpu_id, &x2apic_state);
if (error) {
panic("x86_emulate_cpuid: error %d "
"fetching x2apic state", error);
}
/*
* Override the APIC ID only in ebx
*/
regs[1] &= ~(CPUID_LOCAL_APIC_ID);
regs[1] |= (vcpu_id << CPUID_0000_0001_APICID_SHIFT);
/*
* Don't expose VMX, SpeedStep or TME capability.
* Advertise x2APIC capability and Hypervisor guest.
*/
regs[2] &= ~(CPUID2_VMX | CPUID2_EST | CPUID2_TM2);
regs[2] |= CPUID2_HV;
if (x2apic_state != X2APIC_DISABLED)
regs[2] |= CPUID2_X2APIC;
/*
* Hide xsave/osxsave/avx until the FPU save/restore
* issues are resolved
*/
regs[2] &= ~(CPUID2_XSAVE | CPUID2_OSXSAVE |
CPUID2_AVX);
/*
* Hide monitor/mwait until we know how to deal with
* these instructions.
*/
regs[2] &= ~CPUID2_MON;
/*
* Hide thermal monitoring
*/
regs[3] &= ~(CPUID_ACPI | CPUID_TM);
/*
* Machine check handling is done in the host.
* Hide MTRR capability.
*/
regs[3] &= ~(CPUID_MCA | CPUID_MCE | CPUID_MTRR);
/*
* Disable multi-core.
*/
regs[1] &= ~CPUID_HTT_CORES;
regs[3] &= ~CPUID_HTT;
break;
case CPUID_0000_0004:
do_cpuid(4, regs);
/*
* Do not expose topology.
*/
regs[0] &= 0xffff8000;
regs[0] |= 0x04008000;
break;
case CPUID_0000_0006:
case CPUID_0000_0007:
/*
* Handle the access, but report 0 for
* all options
*/
regs[0] = 0;
regs[1] = 0;
regs[2] = 0;
regs[3] = 0;
break;
case CPUID_0000_000B:
/*
* Processor topology enumeration
*/
regs[0] = 0;
regs[1] = 0;
regs[2] = *ecx & 0xff;
regs[3] = vcpu_id;
break;
case 0x40000000:
regs[0] = CPUID_VM_HIGH;
bcopy(bhyve_id, &regs[1], 4);
bcopy(bhyve_id, &regs[2], 4);
bcopy(bhyve_id, &regs[3], 4);
break;
default:
/* XXX: Leaf 5? */
return (0);
}
*eax = regs[0];
*ebx = regs[1];
*ecx = regs[2];
*edx = regs[3];
return (1);
}

64
sys/amd64/vmm/x86.h Normal file
View File

@ -0,0 +1,64 @@
/*-
* Copyright (c) 2011 NetApp, Inc.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* $FreeBSD$
*/
#ifndef _X86_H_
#define _X86_H_
#define CPUID_0000_0000 (0x0)
#define CPUID_0000_0001 (0x1)
#define CPUID_0000_0002 (0x2)
#define CPUID_0000_0003 (0x3)
#define CPUID_0000_0004 (0x4)
#define CPUID_0000_0006 (0x6)
#define CPUID_0000_0007 (0x7)
#define CPUID_0000_000A (0xA)
#define CPUID_0000_000B (0xB)
#define CPUID_8000_0000 (0x80000000)
#define CPUID_8000_0001 (0x80000001)
#define CPUID_8000_0002 (0x80000002)
#define CPUID_8000_0003 (0x80000003)
#define CPUID_8000_0004 (0x80000004)
#define CPUID_8000_0006 (0x80000006)
#define CPUID_8000_0007 (0x80000007)
#define CPUID_8000_0008 (0x80000008)
/*
* CPUID instruction Fn0000_0001:
*/
#define CPUID_0000_0001_APICID_MASK (0xff<<24)
#define CPUID_0000_0001_APICID_SHIFT 24
/*
* CPUID instruction Fn0000_0001 ECX
*/
#define CPUID_0000_0001_FEAT0_VMX (1<<5)
int x86_emulate_cpuid(struct vm *vm, int vcpu_id, uint32_t *eax, uint32_t *ebx,
uint32_t *ecx, uint32_t *edx);
#endif

View File

@ -464,6 +464,11 @@ libkern/memset.c standard
compat/x86bios/x86bios.c optional x86bios | atkbd | dpms | vesa
contrib/x86emu/x86emu.c optional x86bios | atkbd | dpms | vesa
#
# bvm console
#
dev/bvm/bvm_console.c optional bvmconsole
dev/bvm/bvm_dbg.c optional bvmdebug
#
# x86 shared code between IA32, AMD64 and PC98 architectures
#
x86/acpica/OsdEnvironment.c optional acpi

View File

@ -0,0 +1,129 @@
/*-
* Copyright (c) 2011 NetApp, Inc.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* $FreeBSD$
*/
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
#include <sys/param.h>
#include <sys/kernel.h>
#include <sys/module.h>
#include <sys/bus.h>
#include <sys/linker.h>
#include <sys/libkern.h>
#include <dev/pci/pcivar.h>
static int
linker_file_iterator(linker_file_t lf, void *arg)
{
const char *file = arg;
if (strcmp(lf->filename, file) == 0)
return (1);
else
return (0);
}
static boolean_t
pptdev(int bus, int slot, int func)
{
int found, b, s, f, n;
char *val, *cp, *cp2;
/*
* setenv pptdevs "1/2/3 4/5/6 7/8/9 10/11/12"
*/
found = 0;
cp = val = getenv("pptdevs");
while (cp != NULL && *cp != '\0') {
if ((cp2 = strchr(cp, ' ')) != NULL)
*cp2 = '\0';
n = sscanf(cp, "%d/%d/%d", &b, &s, &f);
if (n == 3 && bus == b && slot == s && func == f) {
found = 1;
break;
}
if (cp2 != NULL)
*cp2++ = ' ';
cp = cp2;
}
freeenv(val);
return (found);
}
static int
pci_blackhole_probe(device_t dev)
{
int bus, slot, func;
/*
* If 'vmm.ko' has also been loaded the don't try to claim
* any pci devices.
*/
if (linker_file_foreach(linker_file_iterator, "vmm.ko"))
return (ENXIO);
bus = pci_get_bus(dev);
slot = pci_get_slot(dev);
func = pci_get_function(dev);
if (pptdev(bus, slot, func))
return (0);
else
return (ENXIO);
}
static int
pci_blackhole_attach(device_t dev)
{
/*
* We never really want to claim the devices but just want to prevent
* other drivers from getting to them.
*/
return (ENXIO);
}
static device_method_t pci_blackhole_methods[] = {
/* Device interface */
DEVMETHOD(device_probe, pci_blackhole_probe),
DEVMETHOD(device_attach, pci_blackhole_attach),
{ 0, 0 }
};
static driver_t pci_blackhole_driver = {
"blackhole",
pci_blackhole_methods,
};
devclass_t blackhole_devclass;
DRIVER_MODULE(blackhole, pci, pci_blackhole_driver, blackhole_devclass, 0, 0);
MODULE_DEPEND(blackhole, pci, 1, 1, 1);

240
sys/dev/bvm/bvm_console.c Normal file
View File

@ -0,0 +1,240 @@
/*-
* Copyright (c) 2011 NetApp, Inc.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* $FreeBSD$
*/
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
#include <sys/param.h>
#include <sys/kernel.h>
#include <sys/systm.h>
#include <sys/types.h>
#include <sys/cons.h>
#include <sys/tty.h>
#include <sys/reboot.h>
#include <sys/bus.h>
#include <sys/kdb.h>
#include <ddb/ddb.h>
#ifndef BVMCONS_POLL_HZ
#define BVMCONS_POLL_HZ 4
#endif
#define BVMBURSTLEN 16 /* max number of bytes to write in one chunk */
static tsw_open_t bvm_tty_open;
static tsw_close_t bvm_tty_close;
static tsw_outwakeup_t bvm_tty_outwakeup;
static struct ttydevsw bvm_ttydevsw = {
.tsw_flags = TF_NOPREFIX,
.tsw_open = bvm_tty_open,
.tsw_close = bvm_tty_close,
.tsw_outwakeup = bvm_tty_outwakeup,
};
static int polltime;
static struct callout_handle bvm_timeouthandle
= CALLOUT_HANDLE_INITIALIZER(&bvm_timeouthandle);
#if defined(KDB)
static int alt_break_state;
#endif
#define BVM_CONS_PORT 0x220
static int bvm_cons_port = BVM_CONS_PORT;
#define BVM_CONS_SIG ('b' << 8 | 'v')
static void bvm_timeout(void *);
static cn_probe_t bvm_cnprobe;
static cn_init_t bvm_cninit;
static cn_term_t bvm_cnterm;
static cn_getc_t bvm_cngetc;
static cn_putc_t bvm_cnputc;
static cn_grab_t bvm_cngrab;
static cn_ungrab_t bvm_cnungrab;
CONSOLE_DRIVER(bvm);
static int
bvm_rcons(u_char *ch)
{
int c;
c = inl(bvm_cons_port);
if (c != -1) {
*ch = (u_char)c;
return (0);
} else
return (-1);
}
static void
bvm_wcons(u_char ch)
{
outl(bvm_cons_port, ch);
}
static void
cn_drvinit(void *unused)
{
struct tty *tp;
if (bvm_consdev.cn_pri != CN_DEAD &&
bvm_consdev.cn_name[0] != '\0') {
tp = tty_alloc(&bvm_ttydevsw, NULL);
tty_makedev(tp, NULL, "bvmcons");
}
}
static int
bvm_tty_open(struct tty *tp)
{
polltime = hz / BVMCONS_POLL_HZ;
if (polltime < 1)
polltime = 1;
bvm_timeouthandle = timeout(bvm_timeout, tp, polltime);
return (0);
}
static void
bvm_tty_close(struct tty *tp)
{
/* XXX Should be replaced with callout_stop(9) */
untimeout(bvm_timeout, tp, bvm_timeouthandle);
}
static void
bvm_tty_outwakeup(struct tty *tp)
{
int len, written;
u_char buf[BVMBURSTLEN];
for (;;) {
len = ttydisc_getc(tp, buf, sizeof(buf));
if (len == 0)
break;
written = 0;
while (written < len)
bvm_wcons(buf[written++]);
}
}
static void
bvm_timeout(void *v)
{
struct tty *tp;
int c;
tp = (struct tty *)v;
tty_lock(tp);
while ((c = bvm_cngetc(NULL)) != -1)
ttydisc_rint(tp, c, 0);
ttydisc_rint_done(tp);
tty_unlock(tp);
bvm_timeouthandle = timeout(bvm_timeout, tp, polltime);
}
static void
bvm_cnprobe(struct consdev *cp)
{
int disabled, port;
disabled = 0;
cp->cn_pri = CN_DEAD;
resource_int_value("bvmconsole", 0, "disabled", &disabled);
if (!disabled) {
if (resource_int_value("bvmconsole", 0, "port", &port) == 0)
bvm_cons_port = port;
if (inw(bvm_cons_port) == BVM_CONS_SIG)
cp->cn_pri = CN_REMOTE;
}
}
static void
bvm_cninit(struct consdev *cp)
{
int i;
const char *bootmsg = "Using bvm console.\n";
if (boothowto & RB_VERBOSE) {
for (i = 0; i < strlen(bootmsg); i++)
bvm_cnputc(cp, bootmsg[i]);
}
strcpy(cp->cn_name, "bvmcons");
}
static void
bvm_cnterm(struct consdev *cp)
{
}
static int
bvm_cngetc(struct consdev *cp)
{
unsigned char ch;
if (bvm_rcons(&ch) == 0) {
#if defined(KDB)
kdb_alt_break(ch, &alt_break_state);
#endif
return (ch);
}
return (-1);
}
static void
bvm_cnputc(struct consdev *cp, int c)
{
bvm_wcons(c);
}
static void
bvm_cngrab(struct consdev *cp)
{
}
static void
bvm_cnungrab(struct consdev *cp)
{
}
SYSINIT(cndev, SI_SUB_CONFIGURE, SI_ORDER_MIDDLE, cn_drvinit, NULL);

100
sys/dev/bvm/bvm_dbg.c Normal file
View File

@ -0,0 +1,100 @@
/*-
* Copyright (c) 2011 NetApp, Inc.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* $FreeBSD$
*/
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
#include <sys/param.h>
#include <sys/kernel.h>
#include <sys/bus.h>
#include <gdb/gdb.h>
#include <machine/cpufunc.h>
static gdb_probe_f bvm_dbg_probe;
static gdb_init_f bvm_dbg_init;
static gdb_term_f bvm_dbg_term;
static gdb_getc_f bvm_dbg_getc;
static gdb_putc_f bvm_dbg_putc;
GDB_DBGPORT(bvm, bvm_dbg_probe, bvm_dbg_init, bvm_dbg_term,
bvm_dbg_getc, bvm_dbg_putc);
#define BVM_DBG_PORT 0x224
static int bvm_dbg_port = BVM_DBG_PORT;
#define BVM_DBG_SIG ('B' << 8 | 'V')
static int
bvm_dbg_probe(void)
{
int disabled, port;
disabled = 0;
resource_int_value("bvmdbg", 0, "disabled", &disabled);
if (!disabled) {
if (resource_int_value("bvmdbg", 0, "port", &port) == 0)
bvm_dbg_port = port;
if (inw(bvm_dbg_port) == BVM_DBG_SIG) {
/*
* Return a higher priority than 0 to override other
* gdb dbgport providers that may be present (e.g. uart)
*/
return (1);
}
}
return (-1);
}
static void
bvm_dbg_init(void)
{
}
static void
bvm_dbg_term(void)
{
}
static void
bvm_dbg_putc(int c)
{
outl(bvm_dbg_port, c);
}
static int
bvm_dbg_getc(void)
{
return (inl(bvm_dbg_port));
}

View File

@ -48,6 +48,7 @@ SUBDIR= \
${_bxe} \
${_bios} \
${_bktr} \
${_blackhole} \
${_bm} \
bridgestp \
bwi \
@ -335,6 +336,7 @@ SUBDIR= \
vge \
${_viawd} \
vkbd \
${_vmm} \
${_vpo} \
vr \
vte \
@ -624,6 +626,7 @@ _amdtemp= amdtemp
_arcmsr= arcmsr
_asmc= asmc
_bktr= bktr
_blackhole= blackhole
_bxe= bxe
_cardbus= cardbus
_cbb= cbb
@ -720,6 +723,7 @@ _twa= twa
_vesa= vesa
_viawd= viawd
_virtio= virtio
_vmm= vmm
_vxge= vxge
_x86bios= x86bios
_wbwd= wbwd

View File

@ -0,0 +1,9 @@
# $FreeBSD$
.PATH: ${.CURDIR}/../../dev/blackhole
KMOD= blackhole
SRCS= blackhole.c
SRCS+= bus_if.h device_if.h pci_if.h
.include <bsd.kmod.mk>

62
sys/modules/vmm/Makefile Normal file
View File

@ -0,0 +1,62 @@
# $FreeBSD$
KMOD= vmm
SRCS= opt_ddb.h device_if.h bus_if.h pci_if.h
CFLAGS+= -DVMM_KEEP_STATS -DSMP
CFLAGS+= -I${.CURDIR}/../../amd64/vmm
CFLAGS+= -I${.CURDIR}/../../amd64/vmm/io
CFLAGS+= -I${.CURDIR}/../../amd64/vmm/intel
# generic vmm support
.PATH: ${.CURDIR}/../../amd64/vmm
SRCS+= vmm.c \
vmm_dev.c \
vmm_host.c \
vmm_instruction_emul.c \
vmm_ipi.c \
vmm_lapic.c \
vmm_mem.c \
vmm_msr.c \
vmm_stat.c \
vmm_util.c \
x86.c \
vmm_support.S
.PATH: ${.CURDIR}/../../amd64/vmm/io
SRCS+= iommu.c \
ppt.c \
vdev.c \
vlapic.c
# intel-specific files
.PATH: ${.CURDIR}/../../amd64/vmm/intel
SRCS+= ept.c \
vmcs.c \
vmx_msr.c \
vmx.c \
vtd.c
# amd-specific files
.PATH: ${.CURDIR}/../../amd64/vmm/amd
SRCS+= amdv.c
OBJS= vmx_support.o
CLEANFILES= vmx_assym.s vmx_genassym.o
vmx_assym.s: vmx_genassym.o
.if exists(@)
vmx_assym.s: @/kern/genassym.sh
.endif
sh @/kern/genassym.sh vmx_genassym.o > ${.TARGET}
vmx_support.o: vmx_support.S vmx_assym.s
${CC} -c -x assembler-with-cpp -DLOCORE ${CFLAGS} \
${.IMPSRC} -o ${.TARGET}
vmx_genassym.o: vmx_genassym.c @ machine x86
${CC} -c ${CFLAGS:N-fno-common} ${.IMPSRC}
.include <bsd.kmod.mk>

View File

@ -10,6 +10,9 @@ SUBDIR+= acpi
SUBDIR+= apm
.endif
SUBDIR+= asf
SUBDIR+= bhyve
SUBDIR+= bhyvectl
SUBDIR+= bhyveload
SUBDIR+= boot0cfg
.if ${MK_TOOLCHAIN} != "no"
SUBDIR+= btxld

27
usr.sbin/bhyve/Makefile Normal file
View File

@ -0,0 +1,27 @@
#
# $FreeBSD$
#
PROG= bhyve
DEBUG_FLAGS= -g -O0
SRCS= acpi.c atpic.c bhyverun.c consport.c dbgport.c elcr.c inout.c
SRCS+= ioapic.c mem.c mevent.c mptbl.c
SRCS+= pci_emul.c pci_hostbridge.c pci_passthru.c pci_virtio_block.c
SRCS+= pci_virtio_net.c pci_uart.c pit_8254.c pmtmr.c post.c rtc.c uart.c
SRCS+= xmsr.c spinup_ap.c
.PATH: ${.CURDIR}/../../sys/amd64/vmm
SRCS+= vmm_instruction_emul.c
NO_MAN=
DPADD= ${LIBVMMAPI} ${LIBMD} ${LIBPTHREAD}
LDADD= -lvmmapi -lmd -lpthread
WARNS?= 2
CFLAGS+= -I${.CURDIR}/../../sys
.include <bsd.prog.mk>

844
usr.sbin/bhyve/acpi.c Normal file
View File

@ -0,0 +1,844 @@
/*-
* Copyright (c) 2012 NetApp, Inc.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* $FreeBSD$
*/
/*
* bhyve ACPI table generator.
*
* Create the minimal set of ACPI tables required to boot FreeBSD (and
* hopefully other o/s's) by writing out ASL template files for each of
* the tables and the compiling them to AML with the Intel iasl compiler.
* The AML files are then read into guest memory.
*
* The tables are placed in the guest's ROM area just below 1MB physical,
* above the MPTable.
*
* Layout
* ------
* RSDP -> 0xf0400 (36 bytes fixed)
* RSDT -> 0xf0440 (36 bytes + 4*N table addrs, 2 used)
* XSDT -> 0xf0480 (36 bytes + 8*N table addrs, 2 used)
* MADT -> 0xf0500 (depends on #CPUs)
* FADT -> 0xf0600 (268 bytes)
* FACS -> 0xf0780 (64 bytes)
* DSDT -> 0xf0800 (variable - can go up to 0x100000)
*/
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
#include <sys/param.h>
#include <sys/errno.h>
#include <sys/stat.h>
#include <paths.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>
#include "bhyverun.h"
#include "acpi.h"
/*
* Define the base address of the ACPI tables, and the offsets to
* the individual tables
*/
#define BHYVE_ACPI_BASE 0xf0400
#define RSDT_OFFSET 0x040
#define XSDT_OFFSET 0x080
#define MADT_OFFSET 0x100
#define FADT_OFFSET 0x200
#define FACS_OFFSET 0x380
#define DSDT_OFFSET 0x400
#define BHYVE_ASL_TEMPLATE "bhyve.XXXXXXX"
#define BHYVE_ASL_SUFFIX ".aml"
#define BHYVE_ASL_COMPILER "/usr/sbin/iasl"
#define BHYVE_PM_TIMER_ADDR 0x408
static int basl_keep_temps;
static int basl_verbose_iasl;
static int basl_ncpu;
static uint32_t basl_acpi_base = BHYVE_ACPI_BASE;
/*
* Contains the full pathname of the template to be passed
* to mkstemp/mktemps(3)
*/
static char basl_template[MAXPATHLEN];
static char basl_stemplate[MAXPATHLEN];
struct basl_fio {
int fd;
FILE *fp;
char f_name[MAXPATHLEN];
};
#define EFPRINTF(...) \
err = fprintf(__VA_ARGS__); if (err < 0) goto err_exit;
#define EFFLUSH(x) \
err = fflush(x); if (err != 0) goto err_exit;
static int
basl_fwrite_rsdp(FILE *fp)
{
int err;
err = 0;
EFPRINTF(fp, "/*\n");
EFPRINTF(fp, " * bhyve RSDP template\n");
EFPRINTF(fp, " */\n");
EFPRINTF(fp, "[0008]\t\tSignature : \"RSD PTR \"\n");
EFPRINTF(fp, "[0001]\t\tChecksum : 43\n");
EFPRINTF(fp, "[0006]\t\tOem ID : \"BHYVE \"\n");
EFPRINTF(fp, "[0001]\t\tRevision : 02\n");
EFPRINTF(fp, "[0004]\t\tRSDT Address : %08X\n",
basl_acpi_base + RSDT_OFFSET);
EFPRINTF(fp, "[0004]\t\tLength : 00000024\n");
EFPRINTF(fp, "[0008]\t\tXSDT Address : 00000000%08X\n",
basl_acpi_base + XSDT_OFFSET);
EFPRINTF(fp, "[0001]\t\tExtended Checksum : 00\n");
EFPRINTF(fp, "[0003]\t\tReserved : 000000\n");
EFFLUSH(fp);
return (0);
err_exit:
return (errno);
}
static int
basl_fwrite_rsdt(FILE *fp)
{
int err;
err = 0;
EFPRINTF(fp, "/*\n");
EFPRINTF(fp, " * bhyve RSDT template\n");
EFPRINTF(fp, " */\n");
EFPRINTF(fp, "[0004]\t\tSignature : \"RSDT\"\n");
EFPRINTF(fp, "[0004]\t\tTable Length : 00000000\n");
EFPRINTF(fp, "[0001]\t\tRevision : 01\n");
EFPRINTF(fp, "[0001]\t\tChecksum : 00\n");
EFPRINTF(fp, "[0006]\t\tOem ID : \"BHYVE \"\n");
EFPRINTF(fp, "[0008]\t\tOem Table ID : \"BVRSDT \"\n");
EFPRINTF(fp, "[0004]\t\tOem Revision : 00000001\n");
/* iasl will fill in the compiler ID/revision fields */
EFPRINTF(fp, "[0004]\t\tAsl Compiler ID : \"xxxx\"\n");
EFPRINTF(fp, "[0004]\t\tAsl Compiler Revision : 00000000\n");
EFPRINTF(fp, "\n");
/* Add in pointers to the MADT and FADT */
EFPRINTF(fp, "[0004]\t\tACPI Table Address 0 : %08X\n",
basl_acpi_base + MADT_OFFSET);
EFPRINTF(fp, "[0004]\t\tACPI Table Address 1 : %08X\n",
basl_acpi_base + FADT_OFFSET);
EFFLUSH(fp);
return (0);
err_exit:
return (errno);
}
static int
basl_fwrite_xsdt(FILE *fp)
{
int err;
err = 0;
EFPRINTF(fp, "/*\n");
EFPRINTF(fp, " * bhyve XSDT template\n");
EFPRINTF(fp, " */\n");
EFPRINTF(fp, "[0004]\t\tSignature : \"XSDT\"\n");
EFPRINTF(fp, "[0004]\t\tTable Length : 00000000\n");
EFPRINTF(fp, "[0001]\t\tRevision : 01\n");
EFPRINTF(fp, "[0001]\t\tChecksum : 00\n");
EFPRINTF(fp, "[0006]\t\tOem ID : \"BHYVE \"\n");
EFPRINTF(fp, "[0008]\t\tOem Table ID : \"BVXSDT \"\n");
EFPRINTF(fp, "[0004]\t\tOem Revision : 00000001\n");
/* iasl will fill in the compiler ID/revision fields */
EFPRINTF(fp, "[0004]\t\tAsl Compiler ID : \"xxxx\"\n");
EFPRINTF(fp, "[0004]\t\tAsl Compiler Revision : 00000000\n");
EFPRINTF(fp, "\n");
/* Add in pointers to the MADT and FADT */
EFPRINTF(fp, "[0004]\t\tACPI Table Address 0 : 00000000%08X\n",
basl_acpi_base + MADT_OFFSET);
EFPRINTF(fp, "[0004]\t\tACPI Table Address 1 : 00000000%08X\n",
basl_acpi_base + FADT_OFFSET);
EFFLUSH(fp);
return (0);
err_exit:
return (errno);
}
static int
basl_fwrite_madt(FILE *fp)
{
int err;
int i;
err = 0;
EFPRINTF(fp, "/*\n");
EFPRINTF(fp, " * bhyve MADT template\n");
EFPRINTF(fp, " */\n");
EFPRINTF(fp, "[0004]\t\tSignature : \"APIC\"\n");
EFPRINTF(fp, "[0004]\t\tTable Length : 00000000\n");
EFPRINTF(fp, "[0001]\t\tRevision : 01\n");
EFPRINTF(fp, "[0001]\t\tChecksum : 00\n");
EFPRINTF(fp, "[0006]\t\tOem ID : \"BHYVE \"\n");
EFPRINTF(fp, "[0008]\t\tOem Table ID : \"BVMADT \"\n");
EFPRINTF(fp, "[0004]\t\tOem Revision : 00000001\n");
/* iasl will fill in the compiler ID/revision fields */
EFPRINTF(fp, "[0004]\t\tAsl Compiler ID : \"xxxx\"\n");
EFPRINTF(fp, "[0004]\t\tAsl Compiler Revision : 00000000\n");
EFPRINTF(fp, "\n");
EFPRINTF(fp, "[0004]\t\tLocal Apic Address : FEE00000\n");
EFPRINTF(fp, "[0004]\t\tFlags (decoded below) : 00000001\n");
EFPRINTF(fp, "\t\t\tPC-AT Compatibility : 1\n");
EFPRINTF(fp, "\n");
/* Add a Processor Local APIC entry for each CPU */
for (i = 0; i < basl_ncpu; i++) {
EFPRINTF(fp, "[0001]\t\tSubtable Type : 00\n");
EFPRINTF(fp, "[0001]\t\tLength : 08\n");
EFPRINTF(fp, "[0001]\t\tProcessor ID : %02d\n", i);
EFPRINTF(fp, "[0001]\t\tLocal Apic ID : %02d\n", i);
EFPRINTF(fp, "[0004]\t\tFlags (decoded below) : 00000001\n");
EFPRINTF(fp, "\t\t\tProcessor Enabled : 1\n");
EFPRINTF(fp, "\n");
}
/* Always a single IOAPIC entry, with ID ncpu+1 */
EFPRINTF(fp, "[0001]\t\tSubtable Type : 01\n");
EFPRINTF(fp, "[0001]\t\tLength : 0C\n");
EFPRINTF(fp, "[0001]\t\tI/O Apic ID : %02d\n", basl_ncpu);
EFPRINTF(fp, "[0001]\t\tReserved : 00\n");
EFPRINTF(fp, "[0004]\t\tAddress : fec00000\n");
EFPRINTF(fp, "[0004]\t\tInterrupt : 00000000\n");
EFPRINTF(fp, "\n");
/* Override the 8259 chained vector. XXX maybe not needed */
EFPRINTF(fp, "[0001]\t\tSubtable Type : 02\n");
EFPRINTF(fp, "[0001]\t\tLength : 0A\n");
EFPRINTF(fp, "[0001]\t\tBus : 00\n");
EFPRINTF(fp, "[0001]\t\tSource : 09\n");
EFPRINTF(fp, "[0004]\t\tInterrupt : 00000009\n");
EFPRINTF(fp, "[0002]\t\tFlags (decoded below) : 0000\n");
EFPRINTF(fp, "\t\t\tPolarity : 0\n");
EFPRINTF(fp, "\t\t\tTrigger Mode : 0\n");
EFPRINTF(fp, "\n");
EFFLUSH(fp);
return (0);
err_exit:
return (errno);
}
static int
basl_fwrite_fadt(FILE *fp)
{
int err;
err = 0;
EFPRINTF(fp, "/*\n");
EFPRINTF(fp, " * bhyve FADT template\n");
EFPRINTF(fp, " */\n");
EFPRINTF(fp, "[0004]\t\tSignature : \"FACP\"\n");
EFPRINTF(fp, "[0004]\t\tTable Length : 0000010C\n");
EFPRINTF(fp, "[0001]\t\tRevision : 05\n");
EFPRINTF(fp, "[0001]\t\tChecksum : 00\n");
EFPRINTF(fp, "[0006]\t\tOem ID : \"BHYVE \"\n");
EFPRINTF(fp, "[0008]\t\tOem Table ID : \"BVFACP \"\n");
EFPRINTF(fp, "[0004]\t\tOem Revision : 00000001\n");
/* iasl will fill in the compiler ID/revision fields */
EFPRINTF(fp, "[0004]\t\tAsl Compiler ID : \"xxxx\"\n");
EFPRINTF(fp, "[0004]\t\tAsl Compiler Revision : 00000000\n");
EFPRINTF(fp, "\n");
EFPRINTF(fp, "[0004]\t\tFACS Address : %08X\n",
basl_acpi_base + FACS_OFFSET);
EFPRINTF(fp, "[0004]\t\tDSDT Address : %08X\n",
basl_acpi_base + DSDT_OFFSET);
EFPRINTF(fp, "[0001]\t\tModel : 00\n");
EFPRINTF(fp, "[0001]\t\tPM Profile : 00 [Unspecified]\n");
EFPRINTF(fp, "[0002]\t\tSCI Interrupt : 0009\n");
EFPRINTF(fp, "[0004]\t\tSMI Command Port : 00000000\n");
EFPRINTF(fp, "[0001]\t\tACPI Enable Value : 00\n");
EFPRINTF(fp, "[0001]\t\tACPI Disable Value : 00\n");
EFPRINTF(fp, "[0001]\t\tS4BIOS Command : 00\n");
EFPRINTF(fp, "[0001]\t\tP-State Control : 00\n");
EFPRINTF(fp, "[0004]\t\tPM1A Event Block Address : 00000000\n");
EFPRINTF(fp, "[0004]\t\tPM1B Event Block Address : 00000000\n");
EFPRINTF(fp, "[0004]\t\tPM1A Control Block Address : 00000000\n");
EFPRINTF(fp, "[0004]\t\tPM1B Control Block Address : 00000000\n");
EFPRINTF(fp, "[0004]\t\tPM2 Control Block Address : 00000000\n");
EFPRINTF(fp, "[0004]\t\tPM Timer Block Address : %08X\n",
BHYVE_PM_TIMER_ADDR);
EFPRINTF(fp, "[0004]\t\tGPE0 Block Address : 00000000\n");
EFPRINTF(fp, "[0004]\t\tGPE1 Block Address : 00000000\n");
EFPRINTF(fp, "[0001]\t\tPM1 Event Block Length : 04\n");
EFPRINTF(fp, "[0001]\t\tPM1 Control Block Length : 02\n");
EFPRINTF(fp, "[0001]\t\tPM2 Control Block Length : 00\n");
EFPRINTF(fp, "[0001]\t\tPM Timer Block Length : 04\n");
EFPRINTF(fp, "[0001]\t\tGPE0 Block Length : 00\n");
EFPRINTF(fp, "[0001]\t\tGPE1 Block Length : 00\n");
EFPRINTF(fp, "[0001]\t\tGPE1 Base Offset : 00\n");
EFPRINTF(fp, "[0001]\t\t_CST Support : 00\n");
EFPRINTF(fp, "[0002]\t\tC2 Latency : 0000\n");
EFPRINTF(fp, "[0002]\t\tC3 Latency : 0000\n");
EFPRINTF(fp, "[0002]\t\tCPU Cache Size : 0000\n");
EFPRINTF(fp, "[0002]\t\tCache Flush Stride : 0000\n");
EFPRINTF(fp, "[0001]\t\tDuty Cycle Offset : 00\n");
EFPRINTF(fp, "[0001]\t\tDuty Cycle Width : 00\n");
EFPRINTF(fp, "[0001]\t\tRTC Day Alarm Index : 00\n");
EFPRINTF(fp, "[0001]\t\tRTC Month Alarm Index : 00\n");
EFPRINTF(fp, "[0001]\t\tRTC Century Index : 00\n");
EFPRINTF(fp, "[0002]\t\tBoot Flags (decoded below) : 0000\n");
EFPRINTF(fp, "\t\t\tLegacy Devices Supported (V2) : 0\n");
EFPRINTF(fp, "\t\t\t8042 Present on ports 60/64 (V2) : 0\n");
EFPRINTF(fp, "\t\t\tVGA Not Present (V4) : 1\n");
EFPRINTF(fp, "\t\t\tMSI Not Supported (V4) : 0\n");
EFPRINTF(fp, "\t\t\tPCIe ASPM Not Supported (V4) : 1\n");
EFPRINTF(fp, "\t\t\tCMOS RTC Not Present (V5) : 0\n");
EFPRINTF(fp, "[0001]\t\tReserved : 00\n");
EFPRINTF(fp, "[0004]\t\tFlags (decoded below) : 00000000\n");
EFPRINTF(fp, "\t\t\tWBINVD instruction is operational (V1) : 1\n");
EFPRINTF(fp, "\t\t\tWBINVD flushes all caches (V1) : 0\n");
EFPRINTF(fp, "\t\t\tAll CPUs support C1 (V1) : 0\n");
EFPRINTF(fp, "\t\t\tC2 works on MP system (V1) : 0\n");
EFPRINTF(fp, "\t\t\tControl Method Power Button (V1) : 1\n");
EFPRINTF(fp, "\t\t\tControl Method Sleep Button (V1) : 1\n");
EFPRINTF(fp, "\t\t\tRTC wake not in fixed reg space (V1) : 0\n");
EFPRINTF(fp, "\t\t\tRTC can wake system from S4 (V1) : 0\n");
EFPRINTF(fp, "\t\t\t32-bit PM Timer (V1) : 1\n");
EFPRINTF(fp, "\t\t\tDocking Supported (V1) : 0\n");
EFPRINTF(fp, "\t\t\tReset Register Supported (V2) : 0\n");
EFPRINTF(fp, "\t\t\tSealed Case (V3) : 0\n");
EFPRINTF(fp, "\t\t\tHeadless - No Video (V3) : 1\n");
EFPRINTF(fp, "\t\t\tUse native instr after SLP_TYPx (V3) : 0\n");
EFPRINTF(fp, "\t\t\tPCIEXP_WAK Bits Supported (V4) : 0\n");
EFPRINTF(fp, "\t\t\tUse Platform Timer (V4) : 0\n");
EFPRINTF(fp, "\t\t\tRTC_STS valid on S4 wake (V4) : 0\n");
EFPRINTF(fp, "\t\t\tRemote Power-on capable (V4) : 0\n");
EFPRINTF(fp, "\t\t\tUse APIC Cluster Model (V4) : 0\n");
EFPRINTF(fp, "\t\t\tUse APIC Physical Destination Mode (V4) : 1\n");
EFPRINTF(fp, "\t\t\tHardware Reduced (V5) : 0\n");
EFPRINTF(fp, "\t\t\tLow Power S0 Idle (V5) : 0\n");
EFPRINTF(fp, "\n");
EFPRINTF(fp,
"[0012]\t\tReset Register : [Generic Address Structure]\n");
EFPRINTF(fp, "[0001]\t\tSpace ID : 01 [SystemIO]\n");
EFPRINTF(fp, "[0001]\t\tBit Width : 08\n");
EFPRINTF(fp, "[0001]\t\tBit Offset : 00\n");
EFPRINTF(fp, "[0001]\t\tEncoded Access Width : 01 [Byte Access:8]\n");
EFPRINTF(fp, "[0008]\t\tAddress : 0000000000000001\n");
EFPRINTF(fp, "\n");
EFPRINTF(fp, "[0001]\t\tValue to cause reset : 00\n");
EFPRINTF(fp, "[0003]\t\tReserved : 000000\n");
EFPRINTF(fp, "[0008]\t\tFACS Address : 00000000%08X\n",
basl_acpi_base + FACS_OFFSET);
EFPRINTF(fp, "[0008]\t\tDSDT Address : 00000000%08X\n",
basl_acpi_base + DSDT_OFFSET);
EFPRINTF(fp,
"[0012]\t\tPM1A Event Block : [Generic Address Structure]\n");
EFPRINTF(fp, "[0001]\t\tSpace ID : 01 [SystemIO]\n");
EFPRINTF(fp, "[0001]\t\tBit Width : 20\n");
EFPRINTF(fp, "[0001]\t\tBit Offset : 00\n");
EFPRINTF(fp, "[0001]\t\tEncoded Access Width : 02 [Word Access:16]\n");
EFPRINTF(fp, "[0008]\t\tAddress : 0000000000000001\n");
EFPRINTF(fp, "\n");
EFPRINTF(fp,
"[0012]\t\tPM1B Event Block : [Generic Address Structure]\n");
EFPRINTF(fp, "[0001]\t\tSpace ID : 01 [SystemIO]\n");
EFPRINTF(fp, "[0001]\t\tBit Width : 00\n");
EFPRINTF(fp, "[0001]\t\tBit Offset : 00\n");
EFPRINTF(fp,
"[0001]\t\tEncoded Access Width : 00 [Undefined/Legacy]\n");
EFPRINTF(fp, "[0008]\t\tAddress : 0000000000000000\n");
EFPRINTF(fp, "\n");
EFPRINTF(fp,
"[0012]\t\tPM1A Control Block : [Generic Address Structure]\n");
EFPRINTF(fp, "[0001]\t\tSpace ID : 01 [SystemIO]\n");
EFPRINTF(fp, "[0001]\t\tBit Width : 10\n");
EFPRINTF(fp, "[0001]\t\tBit Offset : 00\n");
EFPRINTF(fp, "[0001]\t\tEncoded Access Width : 02 [Word Access:16]\n");
EFPRINTF(fp, "[0008]\t\tAddress : 0000000000000001\n");
EFPRINTF(fp, "\n");
EFPRINTF(fp,
"[0012]\t\tPM1B Control Block : [Generic Address Structure]\n");
EFPRINTF(fp, "[0001]\t\tSpace ID : 01 [SystemIO]\n");
EFPRINTF(fp, "[0001]\t\tBit Width : 00\n");
EFPRINTF(fp, "[0001]\t\tBit Offset : 00\n");
EFPRINTF(fp,
"[0001]\t\tEncoded Access Width : 00 [Undefined/Legacy]\n");
EFPRINTF(fp, "[0008]\t\tAddress : 0000000000000000\n");
EFPRINTF(fp, "\n");
EFPRINTF(fp,
"[0012]\t\tPM2 Control Block : [Generic Address Structure]\n");
EFPRINTF(fp, "[0001]\t\tSpace ID : 01 [SystemIO]\n");
EFPRINTF(fp, "[0001]\t\tBit Width : 08\n");
EFPRINTF(fp, "[0001]\t\tBit Offset : 00\n");
EFPRINTF(fp,
"[0001]\t\tEncoded Access Width : 00 [Undefined/Legacy]\n");
EFPRINTF(fp, "[0008]\t\tAddress : 0000000000000000\n");
EFPRINTF(fp, "\n");
/* Valid for bhyve */
EFPRINTF(fp,
"[0012]\t\tPM Timer Block : [Generic Address Structure]\n");
EFPRINTF(fp, "[0001]\t\tSpace ID : 01 [SystemIO]\n");
EFPRINTF(fp, "[0001]\t\tBit Width : 32\n");
EFPRINTF(fp, "[0001]\t\tBit Offset : 00\n");
EFPRINTF(fp,
"[0001]\t\tEncoded Access Width : 03 [DWord Access:32]\n");
EFPRINTF(fp, "[0008]\t\tAddress : 00000000%08X\n",
BHYVE_PM_TIMER_ADDR);
EFPRINTF(fp, "\n");
EFPRINTF(fp, "[0012]\t\tGPE0 Block : [Generic Address Structure]\n");
EFPRINTF(fp, "[0001]\t\tSpace ID : 01 [SystemIO]\n");
EFPRINTF(fp, "[0001]\t\tBit Width : 80\n");
EFPRINTF(fp, "[0001]\t\tBit Offset : 00\n");
EFPRINTF(fp, "[0001]\t\tEncoded Access Width : 01 [Byte Access:8]\n");
EFPRINTF(fp, "[0008]\t\tAddress : 0000000000000000\n");
EFPRINTF(fp, "\n");
EFPRINTF(fp, "[0012]\t\tGPE1 Block : [Generic Address Structure]\n");
EFPRINTF(fp, "[0001]\t\tSpace ID : 01 [SystemIO]\n");
EFPRINTF(fp, "[0001]\t\tBit Width : 00\n");
EFPRINTF(fp, "[0001]\t\tBit Offset : 00\n");
EFPRINTF(fp,
"[0001]\t\tEncoded Access Width : 00 [Undefined/Legacy]\n");
EFPRINTF(fp, "[0008]\t\tAddress : 0000000000000000\n");
EFPRINTF(fp, "\n");
EFPRINTF(fp,
"[0012]\t\tSleep Control Register : [Generic Address Structure]\n");
EFPRINTF(fp, "[0001]\t\tSpace ID : 01 [SystemIO]\n");
EFPRINTF(fp, "[0001]\t\tBit Width : 08\n");
EFPRINTF(fp, "[0001]\t\tBit Offset : 00\n");
EFPRINTF(fp, "[0001]\t\tEncoded Access Width : 01 [Byte Access:8]\n");
EFPRINTF(fp, "[0008]\t\tAddress : 0000000000000000\n");
EFPRINTF(fp, "\n");
EFPRINTF(fp,
"[0012]\t\tSleep Status Register : [Generic Address Structure]\n");
EFPRINTF(fp, "[0001]\t\tSpace ID : 01 [SystemIO]\n");
EFPRINTF(fp, "[0001]\t\tBit Width : 08\n");
EFPRINTF(fp, "[0001]\t\tBit Offset : 00\n");
EFPRINTF(fp, "[0001]\t\tEncoded Access Width : 01 [Byte Access:8]\n");
EFPRINTF(fp, "[0008]\t\tAddress : 0000000000000000\n");
EFFLUSH(fp);
return (0);
err_exit:
return (errno);
}
static int
basl_fwrite_facs(FILE *fp)
{
int err;
err = 0;
EFPRINTF(fp, "/*\n");
EFPRINTF(fp, " * bhyve FACS template\n");
EFPRINTF(fp, " */\n");
EFPRINTF(fp, "[0004]\t\tSignature : \"FACS\"\n");
EFPRINTF(fp, "[0004]\t\tLength : 00000040\n");
EFPRINTF(fp, "[0004]\t\tHardware Signature : 00000000\n");
EFPRINTF(fp, "[0004]\t\t32 Firmware Waking Vector : 00000000\n");
EFPRINTF(fp, "[0004]\t\tGlobal Lock : 00000000\n");
EFPRINTF(fp, "[0004]\t\tFlags (decoded below) : 00000000\n");
EFPRINTF(fp, "\t\t\tS4BIOS Support Present : 0\n");
EFPRINTF(fp, "\t\t\t64-bit Wake Supported (V2) : 0\n");
EFPRINTF(fp,
"[0008]\t\t64 Firmware Waking Vector : 0000000000000000\n");
EFPRINTF(fp, "[0001]\t\tVersion : 02\n");
EFPRINTF(fp, "[0003]\t\tReserved : 000000\n");
EFPRINTF(fp, "[0004]\t\tOspmFlags (decoded below) : 00000000\n");
EFPRINTF(fp, "\t\t\t64-bit Wake Env Required (V2) : 0\n");
EFFLUSH(fp);
return (0);
err_exit:
return (errno);
}
static int
basl_fwrite_dsdt(FILE *fp)
{
int err;
err = 0;
EFPRINTF(fp, "/*\n");
EFPRINTF(fp, " * bhyve DSDT template\n");
EFPRINTF(fp, " */\n");
EFPRINTF(fp, "DefinitionBlock (\"bhyve_dsdt.aml\", \"DSDT\", 2,"
"\"BHYVE \", \"BVDSDT \", 0x00000001)\n");
EFPRINTF(fp, "{\n");
EFPRINTF(fp, " Scope (_SB)\n");
EFPRINTF(fp, " {\n");
EFPRINTF(fp, " Device (PCI0)\n");
EFPRINTF(fp, " {\n");
EFPRINTF(fp, " Name (_HID, EisaId (\"PNP0A03\"))\n");
EFPRINTF(fp, " Name (_ADR, Zero)\n");
EFPRINTF(fp, " Name (_UID, One)\n");
EFPRINTF(fp, " Name (_CRS, ResourceTemplate ()\n");
EFPRINTF(fp, " {\n");
EFPRINTF(fp, " WordBusNumber (ResourceProducer, MinFixed,"
"MaxFixed, PosDecode,\n");
EFPRINTF(fp, " 0x0000, // Granularity\n");
EFPRINTF(fp, " 0x0000, // Range Minimum\n");
EFPRINTF(fp, " 0x00FF, // Range Maximum\n");
EFPRINTF(fp, " 0x0000, // Transl Offset\n");
EFPRINTF(fp, " 0x0100, // Length\n");
EFPRINTF(fp, " ,, )\n");
EFPRINTF(fp, " IO (Decode16,\n");
EFPRINTF(fp, " 0x0CF8, // Range Minimum\n");
EFPRINTF(fp, " 0x0CF8, // Range Maximum\n");
EFPRINTF(fp, " 0x01, // Alignment\n");
EFPRINTF(fp, " 0x08, // Length\n");
EFPRINTF(fp, " )\n");
EFPRINTF(fp, " WordIO (ResourceProducer, MinFixed, MaxFixed,"
"PosDecode, EntireRange,\n");
EFPRINTF(fp, " 0x0000, // Granularity\n");
EFPRINTF(fp, " 0x0000, // Range Minimum\n");
EFPRINTF(fp, " 0x0CF7, // Range Maximum\n");
EFPRINTF(fp, " 0x0000, // Transl Offset\n");
EFPRINTF(fp, " 0x0CF8, // Length\n");
EFPRINTF(fp, " ,, , TypeStatic)\n");
EFPRINTF(fp, " WordIO (ResourceProducer, MinFixed, MaxFixed,"
"PosDecode, EntireRange,\n");
EFPRINTF(fp, " 0x0000, // Granularity\n");
EFPRINTF(fp, " 0x0D00, // Range Minimum\n");
EFPRINTF(fp, " 0xFFFF, // Range Maximum\n");
EFPRINTF(fp, " 0x0000, // Transl Offset\n");
EFPRINTF(fp, " 0xF300, // Length\n");
EFPRINTF(fp, " ,, , TypeStatic)\n");
EFPRINTF(fp, " })\n");
EFPRINTF(fp, " }\n");
EFPRINTF(fp, " }\n");
EFPRINTF(fp, "\n");
EFPRINTF(fp, " Scope (_SB.PCI0)\n");
EFPRINTF(fp, " {\n");
EFPRINTF(fp, " Device (ISA)\n");
EFPRINTF(fp, " {\n");
EFPRINTF(fp, " Name (_ADR, 0x00010000)\n");
EFPRINTF(fp, " OperationRegion (P40C, PCI_Config, 0x60, 0x04)\n");
EFPRINTF(fp, " }\n");
EFPRINTF(fp, " }\n");
EFPRINTF(fp, "\n");
EFPRINTF(fp, " Scope (_SB.PCI0.ISA)\n");
EFPRINTF(fp, " {\n");
EFPRINTF(fp, " Device (RTC)\n");
EFPRINTF(fp, " {\n");
EFPRINTF(fp, " Name (_HID, EisaId (\"PNP0B00\"))\n");
EFPRINTF(fp, " Name (_CRS, ResourceTemplate ()\n");
EFPRINTF(fp, " {\n");
EFPRINTF(fp, " IO (Decode16,\n");
EFPRINTF(fp, " 0x0070, // Range Minimum\n");
EFPRINTF(fp, " 0x0070, // Range Maximum\n");
EFPRINTF(fp, " 0x10, // Alignment\n");
EFPRINTF(fp, " 0x02, // Length\n");
EFPRINTF(fp, " )\n");
EFPRINTF(fp, " IRQNoFlags ()\n");
EFPRINTF(fp, " {8}\n");
EFPRINTF(fp, " IO (Decode16,\n");
EFPRINTF(fp, " 0x0072, // Range Minimum\n");
EFPRINTF(fp, " 0x0072, // Range Maximum\n");
EFPRINTF(fp, " 0x02, // Alignment\n");
EFPRINTF(fp, " 0x06, // Length\n");
EFPRINTF(fp, " )\n");
EFPRINTF(fp, " })\n");
EFPRINTF(fp, " }\n");
EFPRINTF(fp, " }\n");
EFPRINTF(fp, "}\n");
EFFLUSH(fp);
return (0);
err_exit:
return (errno);
}
static int
basl_open(struct basl_fio *bf, int suffix)
{
int err;
err = 0;
if (suffix) {
strncpy(bf->f_name, basl_stemplate, MAXPATHLEN);
bf->fd = mkstemps(bf->f_name, strlen(BHYVE_ASL_SUFFIX));
} else {
strncpy(bf->f_name, basl_template, MAXPATHLEN);
bf->fd = mkstemp(bf->f_name);
}
if (bf->fd > 0) {
bf->fp = fdopen(bf->fd, "w+");
if (bf->fp == NULL) {
unlink(bf->f_name);
close(bf->fd);
}
} else {
err = 1;
}
return (err);
}
static void
basl_close(struct basl_fio *bf)
{
if (!basl_keep_temps)
unlink(bf->f_name);
fclose(bf->fp);
}
static int
basl_start(struct basl_fio *in, struct basl_fio *out)
{
int err;
err = basl_open(in, 0);
if (!err) {
err = basl_open(out, 1);
if (err) {
basl_close(in);
}
}
return (err);
}
static void
basl_end(struct basl_fio *in, struct basl_fio *out)
{
basl_close(in);
basl_close(out);
}
static int
basl_load(int fd, uint64_t off)
{
struct stat sb;
int err;
err = 0;
if (fstat(fd, &sb) < 0 ||
read(fd, paddr_guest2host(basl_acpi_base + off), sb.st_size) < 0)
err = errno;
return (err);
}
static int
basl_compile(int (*fwrite_section)(FILE *fp), uint64_t offset)
{
struct basl_fio io[2];
static char iaslbuf[3*MAXPATHLEN + 10];
char *fmt;
int err;
err = basl_start(&io[0], &io[1]);
if (!err) {
err = (*fwrite_section)(io[0].fp);
if (!err) {
/*
* iasl sends the results of the compilation to
* stdout. Shut this down by using the shell to
* redirect stdout to /dev/null, unless the user
* has requested verbose output for debugging
* purposes
*/
fmt = basl_verbose_iasl ?
"%s -p %s %s" :
"/bin/sh -c \"%s -p %s %s\" 1> /dev/null";
snprintf(iaslbuf, sizeof(iaslbuf),
fmt,
BHYVE_ASL_COMPILER,
io[1].f_name, io[0].f_name);
err = system(iaslbuf);
if (!err) {
/*
* Copy the aml output file into guest
* memory at the specified location
*/
err = basl_load(io[1].fd, offset);
}
}
basl_end(&io[0], &io[1]);
}
return (err);
}
static int
basl_make_templates(void)
{
const char *tmpdir;
int err;
int len;
err = 0;
/*
*
*/
if ((tmpdir = getenv("BHYVE_TMPDIR")) == NULL || *tmpdir == '\0' ||
(tmpdir = getenv("TMPDIR")) == NULL || *tmpdir == '\0') {
tmpdir = _PATH_TMP;
}
len = strlen(tmpdir);
if ((len + sizeof(BHYVE_ASL_TEMPLATE) + 1) < MAXPATHLEN) {
strcpy(basl_template, tmpdir);
while (len > 0 && basl_template[len - 1] == '/')
len--;
basl_template[len] = '/';
strcpy(&basl_template[len + 1], BHYVE_ASL_TEMPLATE);
} else
err = E2BIG;
if (!err) {
/*
* len has been intialized (and maybe adjusted) above
*/
if ((len + sizeof(BHYVE_ASL_TEMPLATE) + 1 +
sizeof(BHYVE_ASL_SUFFIX)) < MAXPATHLEN) {
strcpy(basl_stemplate, tmpdir);
basl_stemplate[len] = '/';
strcpy(&basl_stemplate[len + 1], BHYVE_ASL_TEMPLATE);
len = strlen(basl_stemplate);
strcpy(&basl_stemplate[len], BHYVE_ASL_SUFFIX);
} else
err = E2BIG;
}
return (err);
}
static struct {
int (*wsect)(FILE *fp);
uint64_t offset;
} basl_ftables[] =
{
{ basl_fwrite_rsdp, 0},
{ basl_fwrite_rsdt, RSDT_OFFSET },
{ basl_fwrite_xsdt, XSDT_OFFSET },
{ basl_fwrite_madt, MADT_OFFSET },
{ basl_fwrite_fadt, FADT_OFFSET },
{ basl_fwrite_facs, FACS_OFFSET },
{ basl_fwrite_dsdt, DSDT_OFFSET },
{ NULL }
};
int
acpi_build(struct vmctx *ctx, int ncpu, int ioapic)
{
int err;
int i;
err = 0;
basl_ncpu = ncpu;
if (!ioapic) {
fprintf(stderr, "ACPI tables require an ioapic\n");
return (EINVAL);
}
/*
* For debug, allow the user to have iasl compiler output sent
* to stdout rather than /dev/null
*/
if (getenv("BHYVE_ACPI_VERBOSE_IASL"))
basl_verbose_iasl = 1;
/*
* Allow the user to keep the generated ASL files for debugging
* instead of deleting them following use
*/
if (getenv("BHYVE_ACPI_KEEPTMPS"))
basl_keep_temps = 1;
i = 0;
err = basl_make_templates();
/*
* Run through all the ASL files, compiling them and
* copying them into guest memory
*/
while (!err && basl_ftables[i].wsect != NULL) {
err = basl_compile(basl_ftables[i].wsect,
basl_ftables[i].offset);
i++;
}
return (err);
}

34
usr.sbin/bhyve/acpi.h Normal file
View File

@ -0,0 +1,34 @@
/*-
* Copyright (c) 2012 NetApp, Inc.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* $FreeBSD$
*/
#ifndef _ACPI_H_
#define _ACPI_H_
int acpi_build(struct vmctx *ctx, int ncpu, int ioapic);
#endif /* _ACPI_H_ */

68
usr.sbin/bhyve/atpic.c Normal file
View File

@ -0,0 +1,68 @@
/*-
* Copyright (c) 2011 NetApp, Inc.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* $FreeBSD$
*/
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
#include <sys/param.h>
#include <stdio.h>
#include <string.h>
#include <assert.h>
#include "inout.h"
/*
* FreeBSD only writes to the 8259 interrupt controllers to put them in a
* shutdown state.
*
* So, we just ignore the writes.
*/
#define IO_ICU1 0x20
#define IO_ICU2 0xA0
#define ICU_IMR_OFFSET 1
static int
atpic_handler(struct vmctx *ctx, int vcpu, int in, int port, int bytes,
uint32_t *eax, void *arg)
{
if (bytes != 1)
return (-1);
if (in)
return (-1);
/* Pretend all writes to the 8259 are alright */
return (0);
}
INOUT_PORT(atpic, IO_ICU1, IOPORT_F_INOUT, atpic_handler);
INOUT_PORT(atpic, IO_ICU1 + ICU_IMR_OFFSET, IOPORT_F_INOUT, atpic_handler);
INOUT_PORT(atpic, IO_ICU2, IOPORT_F_INOUT, atpic_handler);
INOUT_PORT(atpic, IO_ICU2 + ICU_IMR_OFFSET, IOPORT_F_INOUT, atpic_handler);

788
usr.sbin/bhyve/bhyverun.c Normal file
View File

@ -0,0 +1,788 @@
/*-
* Copyright (c) 2011 NetApp, Inc.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* $FreeBSD$
*/
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
#include <sys/types.h>
#include <sys/mman.h>
#include <sys/time.h>
#include <machine/segments.h>
#include <stdio.h>
#include <stdlib.h>
#include <libgen.h>
#include <unistd.h>
#include <assert.h>
#include <errno.h>
#include <signal.h>
#include <pthread.h>
#include <pthread_np.h>
#include <machine/vmm.h>
#include <vmmapi.h>
#include "bhyverun.h"
#include "acpi.h"
#include "inout.h"
#include "dbgport.h"
#include "mem.h"
#include "mevent.h"
#include "mptbl.h"
#include "pci_emul.h"
#include "xmsr.h"
#include "ioapic.h"
#include "spinup_ap.h"
#define DEFAULT_GUEST_HZ 100
#define DEFAULT_GUEST_TSLICE 200
#define GUEST_NIO_PORT 0x488 /* guest upcalls via i/o port */
#define VMEXIT_SWITCH 0 /* force vcpu switch in mux mode */
#define VMEXIT_CONTINUE 1 /* continue from next instruction */
#define VMEXIT_RESTART 2 /* restart current instruction */
#define VMEXIT_ABORT 3 /* abort the vm run loop */
#define VMEXIT_RESET 4 /* guest machine has reset */
#define MB (1024UL * 1024)
#define GB (1024UL * MB)
typedef int (*vmexit_handler_t)(struct vmctx *, struct vm_exit *, int *vcpu);
int guest_tslice = DEFAULT_GUEST_TSLICE;
int guest_hz = DEFAULT_GUEST_HZ;
char *vmname;
u_long lomem_sz;
u_long himem_sz;
int guest_ncpus;
static int pincpu = -1;
static int guest_vcpu_mux;
static int guest_vmexit_on_hlt, guest_vmexit_on_pause, disable_x2apic;
static int foundcpus;
static int strictio;
static int acpi;
static char *lomem_addr;
static char *himem_addr;
static char *progname;
static const int BSP = 0;
static int cpumask;
static void vm_loop(struct vmctx *ctx, int vcpu, uint64_t rip);
struct vm_exit vmexit[VM_MAXCPU];
struct fbsdstats {
uint64_t vmexit_bogus;
uint64_t vmexit_bogus_switch;
uint64_t vmexit_hlt;
uint64_t vmexit_pause;
uint64_t vmexit_mtrap;
uint64_t vmexit_paging;
uint64_t cpu_switch_rotate;
uint64_t cpu_switch_direct;
int io_reset;
} stats;
struct mt_vmm_info {
pthread_t mt_thr;
struct vmctx *mt_ctx;
int mt_vcpu;
} mt_vmm_info[VM_MAXCPU];
static void
usage(int code)
{
fprintf(stderr,
"Usage: %s [-aehABHIP][-g <gdb port>][-z <hz>][-s <pci>]"
"[-S <pci>][-p pincpu][-n <pci>][-m lowmem][-M highmem] <vm>\n"
" -a: local apic is in XAPIC mode (default is X2APIC)\n"
" -A: create an ACPI table\n"
" -g: gdb port (default is %d and 0 means don't open)\n"
" -c: # cpus (default 1)\n"
" -p: pin vcpu 'n' to host cpu 'pincpu + n'\n"
" -B: inject breakpoint exception on vm entry\n"
" -H: vmexit from the guest on hlt\n"
" -I: present an ioapic to the guest\n"
" -P: vmexit from the guest on pause\n"
" -e: exit on unhandled i/o access\n"
" -h: help\n"
" -z: guest hz (default is %d)\n"
" -s: <slot,driver,configinfo> PCI slot config\n"
" -S: <slot,driver,configinfo> legacy PCI slot config\n"
" -m: lowmem in MB\n"
" -M: highmem in MB\n"
" -x: mux vcpus to 1 hcpu\n"
" -t: mux vcpu timeslice hz (default %d)\n",
progname, DEFAULT_GDB_PORT, DEFAULT_GUEST_HZ,
DEFAULT_GUEST_TSLICE);
exit(code);
}
void *
paddr_guest2host(uintptr_t gaddr)
{
if (lomem_sz == 0)
return (NULL);
if (gaddr < lomem_sz) {
return ((void *)(lomem_addr + gaddr));
} else if (gaddr >= 4*GB && gaddr < (4*GB + himem_sz)) {
return ((void *)(himem_addr + gaddr - 4*GB));
} else
return (NULL);
}
int
fbsdrun_disable_x2apic(void)
{
return (disable_x2apic);
}
int
fbsdrun_vmexit_on_pause(void)
{
return (guest_vmexit_on_pause);
}
int
fbsdrun_vmexit_on_hlt(void)
{
return (guest_vmexit_on_hlt);
}
int
fbsdrun_muxed(void)
{
return (guest_vcpu_mux);
}
static void *
fbsdrun_start_thread(void *param)
{
char tname[MAXCOMLEN + 1];
struct mt_vmm_info *mtp;
int vcpu;
mtp = param;
vcpu = mtp->mt_vcpu;
snprintf(tname, sizeof(tname), "%s vcpu %d", vmname, vcpu);
pthread_set_name_np(mtp->mt_thr, tname);
vm_loop(mtp->mt_ctx, vcpu, vmexit[vcpu].rip);
/* not reached */
exit(1);
return (NULL);
}
void
fbsdrun_addcpu(struct vmctx *ctx, int vcpu, uint64_t rip)
{
int error;
if (cpumask & (1 << vcpu)) {
fprintf(stderr, "addcpu: attempting to add existing cpu %d\n",
vcpu);
exit(1);
}
cpumask |= 1 << vcpu;
foundcpus++;
/*
* Set up the vmexit struct to allow execution to start
* at the given RIP
*/
vmexit[vcpu].rip = rip;
vmexit[vcpu].inst_length = 0;
if (vcpu == BSP || !guest_vcpu_mux){
mt_vmm_info[vcpu].mt_ctx = ctx;
mt_vmm_info[vcpu].mt_vcpu = vcpu;
error = pthread_create(&mt_vmm_info[vcpu].mt_thr, NULL,
fbsdrun_start_thread, &mt_vmm_info[vcpu]);
assert(error == 0);
}
}
static int
fbsdrun_get_next_cpu(int curcpu)
{
/*
* Get the next available CPU. Assumes they arrive
* in ascending order with no gaps.
*/
return ((curcpu + 1) % foundcpus);
}
static int
vmexit_catch_reset(void)
{
stats.io_reset++;
return (VMEXIT_RESET);
}
static int
vmexit_catch_inout(void)
{
return (VMEXIT_ABORT);
}
static int
vmexit_handle_notify(struct vmctx *ctx, struct vm_exit *vme, int *pvcpu,
uint32_t eax)
{
#if PG_DEBUG /* put all types of debug here */
if (eax == 0) {
pause_noswitch = 1;
} else if (eax == 1) {
pause_noswitch = 0;
} else {
pause_noswitch = 0;
if (eax == 5) {
vm_set_capability(ctx, *pvcpu, VM_CAP_MTRAP_EXIT, 1);
}
}
#endif
return (VMEXIT_CONTINUE);
}
static int
vmexit_inout(struct vmctx *ctx, struct vm_exit *vme, int *pvcpu)
{
int error;
int bytes, port, in, out;
uint32_t eax;
int vcpu;
vcpu = *pvcpu;
port = vme->u.inout.port;
bytes = vme->u.inout.bytes;
eax = vme->u.inout.eax;
in = vme->u.inout.in;
out = !in;
/* We don't deal with these */
if (vme->u.inout.string || vme->u.inout.rep)
return (VMEXIT_ABORT);
/* Special case of guest reset */
if (out && port == 0x64 && (uint8_t)eax == 0xFE)
return (vmexit_catch_reset());
/* Extra-special case of host notifications */
if (out && port == GUEST_NIO_PORT)
return (vmexit_handle_notify(ctx, vme, pvcpu, eax));
error = emulate_inout(ctx, vcpu, in, port, bytes, &eax, strictio);
if (error == 0 && in)
error = vm_set_register(ctx, vcpu, VM_REG_GUEST_RAX, eax);
if (error == 0)
return (VMEXIT_CONTINUE);
else {
fprintf(stderr, "Unhandled %s%c 0x%04x\n",
in ? "in" : "out",
bytes == 1 ? 'b' : (bytes == 2 ? 'w' : 'l'), port);
return (vmexit_catch_inout());
}
}
static int
vmexit_rdmsr(struct vmctx *ctx, struct vm_exit *vme, int *pvcpu)
{
fprintf(stderr, "vm exit rdmsr 0x%x, cpu %d\n", vme->u.msr.code,
*pvcpu);
return (VMEXIT_ABORT);
}
static int
vmexit_wrmsr(struct vmctx *ctx, struct vm_exit *vme, int *pvcpu)
{
int newcpu;
int retval = VMEXIT_CONTINUE;
newcpu = emulate_wrmsr(ctx, *pvcpu, vme->u.msr.code,vme->u.msr.wval);
if (guest_vcpu_mux && *pvcpu != newcpu) {
retval = VMEXIT_SWITCH;
*pvcpu = newcpu;
}
return (retval);
}
static int
vmexit_spinup_ap(struct vmctx *ctx, struct vm_exit *vme, int *pvcpu)
{
int newcpu;
int retval = VMEXIT_CONTINUE;
newcpu = spinup_ap(ctx, *pvcpu,
vme->u.spinup_ap.vcpu, vme->u.spinup_ap.rip);
if (guest_vcpu_mux && *pvcpu != newcpu) {
retval = VMEXIT_SWITCH;
*pvcpu = newcpu;
}
return (retval);
}
static int
vmexit_vmx(struct vmctx *ctx, struct vm_exit *vmexit, int *pvcpu)
{
fprintf(stderr, "vm exit[%d]\n", *pvcpu);
fprintf(stderr, "\treason\t\tVMX\n");
fprintf(stderr, "\trip\t\t0x%016lx\n", vmexit->rip);
fprintf(stderr, "\tinst_length\t%d\n", vmexit->inst_length);
fprintf(stderr, "\terror\t\t%d\n", vmexit->u.vmx.error);
fprintf(stderr, "\texit_reason\t%u\n", vmexit->u.vmx.exit_reason);
fprintf(stderr, "\tqualification\t0x%016lx\n",
vmexit->u.vmx.exit_qualification);
return (VMEXIT_ABORT);
}
static int bogus_noswitch = 1;
static int
vmexit_bogus(struct vmctx *ctx, struct vm_exit *vmexit, int *pvcpu)
{
stats.vmexit_bogus++;
if (!guest_vcpu_mux || guest_ncpus == 1 || bogus_noswitch) {
return (VMEXIT_RESTART);
} else {
stats.vmexit_bogus_switch++;
vmexit->inst_length = 0;
*pvcpu = -1;
return (VMEXIT_SWITCH);
}
}
static int
vmexit_hlt(struct vmctx *ctx, struct vm_exit *vmexit, int *pvcpu)
{
stats.vmexit_hlt++;
if (fbsdrun_muxed()) {
*pvcpu = -1;
return (VMEXIT_SWITCH);
} else {
/*
* Just continue execution with the next instruction. We use
* the HLT VM exit as a way to be friendly with the host
* scheduler.
*/
return (VMEXIT_CONTINUE);
}
}
static int pause_noswitch;
static int
vmexit_pause(struct vmctx *ctx, struct vm_exit *vmexit, int *pvcpu)
{
stats.vmexit_pause++;
if (fbsdrun_muxed() && !pause_noswitch) {
*pvcpu = -1;
return (VMEXIT_SWITCH);
} else {
return (VMEXIT_CONTINUE);
}
}
static int
vmexit_mtrap(struct vmctx *ctx, struct vm_exit *vmexit, int *pvcpu)
{
stats.vmexit_mtrap++;
return (VMEXIT_RESTART);
}
static int
vmexit_paging(struct vmctx *ctx, struct vm_exit *vmexit, int *pvcpu)
{
int err;
stats.vmexit_paging++;
err = emulate_mem(ctx, *pvcpu, vmexit->u.paging.gpa,
&vmexit->u.paging.vie);
if (err) {
if (err == EINVAL) {
fprintf(stderr,
"Failed to emulate instruction at 0x%lx\n",
vmexit->rip);
} else if (err == ESRCH) {
fprintf(stderr, "Unhandled memory access to 0x%lx\n",
vmexit->u.paging.gpa);
}
return (VMEXIT_ABORT);
}
return (VMEXIT_CONTINUE);
}
static void
sigalrm(int sig)
{
return;
}
static void
setup_timeslice(void)
{
struct sigaction sa;
struct itimerval itv;
int error;
/*
* Setup a realtime timer to generate a SIGALRM at a
* frequency of 'guest_tslice' ticks per second.
*/
sigemptyset(&sa.sa_mask);
sa.sa_flags = 0;
sa.sa_handler = sigalrm;
error = sigaction(SIGALRM, &sa, NULL);
assert(error == 0);
itv.it_interval.tv_sec = 0;
itv.it_interval.tv_usec = 1000000 / guest_tslice;
itv.it_value.tv_sec = 0;
itv.it_value.tv_usec = 1000000 / guest_tslice;
error = setitimer(ITIMER_REAL, &itv, NULL);
assert(error == 0);
}
static vmexit_handler_t handler[VM_EXITCODE_MAX] = {
[VM_EXITCODE_INOUT] = vmexit_inout,
[VM_EXITCODE_VMX] = vmexit_vmx,
[VM_EXITCODE_BOGUS] = vmexit_bogus,
[VM_EXITCODE_RDMSR] = vmexit_rdmsr,
[VM_EXITCODE_WRMSR] = vmexit_wrmsr,
[VM_EXITCODE_MTRAP] = vmexit_mtrap,
[VM_EXITCODE_PAGING] = vmexit_paging,
[VM_EXITCODE_SPINUP_AP] = vmexit_spinup_ap,
};
static void
vm_loop(struct vmctx *ctx, int vcpu, uint64_t rip)
{
int error, rc, prevcpu;
if (guest_vcpu_mux)
setup_timeslice();
if (pincpu >= 0) {
error = vm_set_pinning(ctx, vcpu, pincpu + vcpu);
assert(error == 0);
}
while (1) {
error = vm_run(ctx, vcpu, rip, &vmexit[vcpu]);
if (error != 0) {
/*
* It is possible that 'vmmctl' or some other process
* has transitioned the vcpu to CANNOT_RUN state right
* before we tried to transition it to RUNNING.
*
* This is expected to be temporary so just retry.
*/
if (errno == EBUSY)
continue;
else
break;
}
prevcpu = vcpu;
rc = (*handler[vmexit[vcpu].exitcode])(ctx, &vmexit[vcpu],
&vcpu);
switch (rc) {
case VMEXIT_SWITCH:
assert(guest_vcpu_mux);
if (vcpu == -1) {
stats.cpu_switch_rotate++;
vcpu = fbsdrun_get_next_cpu(prevcpu);
} else {
stats.cpu_switch_direct++;
}
/* fall through */
case VMEXIT_CONTINUE:
rip = vmexit[vcpu].rip + vmexit[vcpu].inst_length;
break;
case VMEXIT_RESTART:
rip = vmexit[vcpu].rip;
break;
case VMEXIT_RESET:
exit(0);
default:
exit(1);
}
}
fprintf(stderr, "vm_run error %d, errno %d\n", error, errno);
}
static int
num_vcpus_allowed(struct vmctx *ctx)
{
int tmp, error;
error = vm_get_capability(ctx, BSP, VM_CAP_UNRESTRICTED_GUEST, &tmp);
/*
* The guest is allowed to spinup more than one processor only if the
* UNRESTRICTED_GUEST capability is available.
*/
if (error == 0)
return (VM_MAXCPU);
else
return (1);
}
int
main(int argc, char *argv[])
{
int c, error, gdb_port, inject_bkpt, tmp, err, ioapic, bvmcons;
int max_vcpus;
struct vmctx *ctx;
uint64_t rip;
bvmcons = 0;
inject_bkpt = 0;
progname = basename(argv[0]);
gdb_port = DEFAULT_GDB_PORT;
guest_ncpus = 1;
ioapic = 0;
while ((c = getopt(argc, argv, "abehABHIPxp:g:c:z:s:S:n:m:M:")) != -1) {
switch (c) {
case 'a':
disable_x2apic = 1;
break;
case 'A':
acpi = 1;
break;
case 'b':
bvmcons = 1;
break;
case 'B':
inject_bkpt = 1;
break;
case 'x':
guest_vcpu_mux = 1;
break;
case 'p':
pincpu = atoi(optarg);
break;
case 'c':
guest_ncpus = atoi(optarg);
break;
case 'g':
gdb_port = atoi(optarg);
break;
case 'z':
guest_hz = atoi(optarg);
break;
case 't':
guest_tslice = atoi(optarg);
break;
case 's':
pci_parse_slot(optarg, 0);
break;
case 'S':
pci_parse_slot(optarg, 1);
break;
case 'm':
lomem_sz = strtoul(optarg, NULL, 0) * MB;
break;
case 'M':
himem_sz = strtoul(optarg, NULL, 0) * MB;
break;
case 'H':
guest_vmexit_on_hlt = 1;
break;
case 'I':
ioapic = 1;
break;
case 'P':
guest_vmexit_on_pause = 1;
break;
case 'e':
strictio = 1;
break;
case 'h':
usage(0);
default:
usage(1);
}
}
argc -= optind;
argv += optind;
if (argc != 1)
usage(1);
/* No need to mux if guest is uni-processor */
if (guest_ncpus <= 1)
guest_vcpu_mux = 0;
/* vmexit on hlt if guest is muxed */
if (guest_vcpu_mux) {
guest_vmexit_on_hlt = 1;
guest_vmexit_on_pause = 1;
}
vmname = argv[0];
ctx = vm_open(vmname);
if (ctx == NULL) {
perror("vm_open");
exit(1);
}
max_vcpus = num_vcpus_allowed(ctx);
if (guest_ncpus > max_vcpus) {
fprintf(stderr, "%d vCPUs requested but only %d available\n",
guest_ncpus, max_vcpus);
exit(1);
}
if (fbsdrun_vmexit_on_hlt()) {
err = vm_get_capability(ctx, BSP, VM_CAP_HALT_EXIT, &tmp);
if (err < 0) {
fprintf(stderr, "VM exit on HLT not supported\n");
exit(1);
}
vm_set_capability(ctx, BSP, VM_CAP_HALT_EXIT, 1);
handler[VM_EXITCODE_HLT] = vmexit_hlt;
}
if (fbsdrun_vmexit_on_pause()) {
/*
* pause exit support required for this mode
*/
err = vm_get_capability(ctx, BSP, VM_CAP_PAUSE_EXIT, &tmp);
if (err < 0) {
fprintf(stderr,
"SMP mux requested, no pause support\n");
exit(1);
}
vm_set_capability(ctx, BSP, VM_CAP_PAUSE_EXIT, 1);
handler[VM_EXITCODE_PAUSE] = vmexit_pause;
}
if (fbsdrun_disable_x2apic())
err = vm_set_x2apic_state(ctx, BSP, X2APIC_DISABLED);
else
err = vm_set_x2apic_state(ctx, BSP, X2APIC_ENABLED);
if (err) {
fprintf(stderr, "Unable to set x2apic state (%d)\n", err);
exit(1);
}
if (lomem_sz != 0) {
lomem_addr = vm_map_memory(ctx, 0, lomem_sz);
if (lomem_addr == (char *) MAP_FAILED) {
lomem_sz = 0;
} else if (himem_sz != 0) {
himem_addr = vm_map_memory(ctx, 4*GB, himem_sz);
if (himem_addr == (char *) MAP_FAILED) {
lomem_sz = 0;
himem_sz = 0;
}
}
}
init_inout();
init_pci(ctx);
if (ioapic)
ioapic_init(0);
if (gdb_port != 0)
init_dbgport(gdb_port);
if (bvmcons)
init_bvmcons();
error = vm_get_register(ctx, BSP, VM_REG_GUEST_RIP, &rip);
assert(error == 0);
if (inject_bkpt) {
error = vm_inject_event(ctx, BSP, VM_HW_EXCEPTION, IDT_BP);
assert(error == 0);
}
/*
* build the guest tables, MP etc.
*/
mptable_build(ctx, guest_ncpus, ioapic);
if (acpi) {
error = acpi_build(ctx, guest_ncpus, ioapic);
assert(error == 0);
}
/*
* Add CPU 0
*/
fbsdrun_addcpu(ctx, BSP, rip);
/*
* Head off to the main event dispatch loop
*/
mevent_dispatch();
exit(1);
}

53
usr.sbin/bhyve/bhyverun.h Normal file
View File

@ -0,0 +1,53 @@
/*-
* Copyright (c) 2011 NetApp, Inc.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* $FreeBSD$
*/
#ifndef _FBSDRUN_H_
#define _FBSDRUN_H_
#ifndef CTASSERT /* Allow lint to override */
#define CTASSERT(x) _CTASSERT(x, __LINE__)
#define _CTASSERT(x, y) __CTASSERT(x, y)
#define __CTASSERT(x, y) typedef char __assert ## y[(x) ? 1 : -1]
#endif
struct vmctx;
extern int guest_hz;
extern int guest_tslice;
extern int guest_ncpus;
extern char *vmname;
extern u_long lomem_sz, himem_sz;
void *paddr_guest2host(uintptr_t);
void fbsdrun_addcpu(struct vmctx *ctx, int cpu, uint64_t rip);
int fbsdrun_muxed(void);
int fbsdrun_vmexit_on_hlt(void);
int fbsdrun_vmexit_on_pause(void);
int fbsdrun_disable_x2apic(void);
#endif

140
usr.sbin/bhyve/consport.c Normal file
View File

@ -0,0 +1,140 @@
/*-
* Copyright (c) 2011 NetApp, Inc.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* $FreeBSD$
*/
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
#include <sys/types.h>
#include <sys/select.h>
#include <stdio.h>
#include <stdlib.h>
#include <termios.h>
#include <unistd.h>
#include <stdbool.h>
#include "inout.h"
#define BVM_CONSOLE_PORT 0x220
#define BVM_CONS_SIG ('b' << 8 | 'v')
static struct termios tio_orig, tio_new;
static void
ttyclose(void)
{
tcsetattr(STDIN_FILENO, TCSANOW, &tio_orig);
}
static void
ttyopen(void)
{
tcgetattr(STDIN_FILENO, &tio_orig);
cfmakeraw(&tio_new);
tcsetattr(STDIN_FILENO, TCSANOW, &tio_new);
atexit(ttyclose);
}
static bool
tty_char_available(void)
{
fd_set rfds;
struct timeval tv;
FD_ZERO(&rfds);
FD_SET(STDIN_FILENO, &rfds);
tv.tv_sec = 0;
tv.tv_usec = 0;
if (select(STDIN_FILENO + 1, &rfds, NULL, NULL, &tv) > 0) {
return (true);
} else {
return (false);
}
}
static int
ttyread(void)
{
char rb;
if (tty_char_available()) {
read(STDIN_FILENO, &rb, 1);
return (rb & 0xff);
} else {
return (-1);
}
}
static void
ttywrite(unsigned char wb)
{
(void) write(STDOUT_FILENO, &wb, 1);
}
static int
console_handler(struct vmctx *ctx, int vcpu, int in, int port, int bytes,
uint32_t *eax, void *arg)
{
static int opened;
if (bytes == 2 && in) {
*eax = BVM_CONS_SIG;
return (0);
}
if (bytes != 4)
return (-1);
if (!opened) {
ttyopen();
opened = 1;
}
if (in)
*eax = ttyread();
else
ttywrite(*eax);
return (0);
}
static struct inout_port consport = {
"bvmcons",
BVM_CONSOLE_PORT,
IOPORT_F_INOUT,
console_handler
};
void
init_bvmcons(void)
{
register_inout(&consport);
}

138
usr.sbin/bhyve/dbgport.c Normal file
View File

@ -0,0 +1,138 @@
/*-
* Copyright (c) 2011 NetApp, Inc.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* $FreeBSD$
*/
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
#include <sys/types.h>
#include <sys/socket.h>
#include <netinet/in.h>
#include <sys/uio.h>
#include <stdio.h>
#include <stdlib.h>
#include <fcntl.h>
#include <unistd.h>
#include <errno.h>
#include "inout.h"
#include "dbgport.h"
#define BVM_DBG_PORT 0x224
#define BVM_DBG_SIG ('B' << 8 | 'V')
static int listen_fd, conn_fd;
static struct sockaddr_in sin;
static int
dbg_handler(struct vmctx *ctx, int vcpu, int in, int port, int bytes,
uint32_t *eax, void *arg)
{
char ch;
int nwritten, nread, printonce;
if (bytes == 2 && in) {
*eax = BVM_DBG_SIG;
return (0);
}
if (bytes != 4)
return (-1);
again:
printonce = 0;
while (conn_fd < 0) {
if (!printonce) {
printf("Waiting for connection from gdb\r\n");
printonce = 1;
}
conn_fd = accept(listen_fd, NULL, NULL);
if (conn_fd >= 0)
fcntl(conn_fd, F_SETFL, O_NONBLOCK);
else if (errno != EINTR)
perror("accept");
}
if (in) {
nread = read(conn_fd, &ch, 1);
if (nread == -1 && errno == EAGAIN)
*eax = -1;
else if (nread == 1)
*eax = ch;
else {
close(conn_fd);
conn_fd = -1;
goto again;
}
} else {
ch = *eax;
nwritten = write(conn_fd, &ch, 1);
if (nwritten != 1) {
close(conn_fd);
conn_fd = -1;
goto again;
}
}
return (0);
}
static struct inout_port dbgport = {
"bvmdbg",
BVM_DBG_PORT,
IOPORT_F_INOUT,
dbg_handler
};
void
init_dbgport(int sport)
{
conn_fd = -1;
if ((listen_fd = socket(AF_INET, SOCK_STREAM, 0)) < 0) {
perror("socket");
exit(1);
}
sin.sin_len = sizeof(sin);
sin.sin_family = AF_INET;
sin.sin_addr.s_addr = htonl(INADDR_ANY);
sin.sin_port = htons(sport);
if (bind(listen_fd, (struct sockaddr *)&sin, sizeof(sin)) < 0) {
perror("bind");
exit(1);
}
if (listen(listen_fd, 1) < 0) {
perror("listen");
exit(1);
}
register_inout(&dbgport);
}

36
usr.sbin/bhyve/dbgport.h Normal file
View File

@ -0,0 +1,36 @@
/*-
* Copyright (c) 2011 NetApp, Inc.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* $FreeBSD$
*/
#ifndef _DBGPORT_H_
#define _DBGPORT_H_
#define DEFAULT_GDB_PORT 6466
void init_dbgport(int port);
#endif

65
usr.sbin/bhyve/elcr.c Normal file
View File

@ -0,0 +1,65 @@
/*-
* Copyright (c) 2011 NetApp, Inc.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* $FreeBSD$
*/
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
#include <sys/types.h>
#include "inout.h"
/*
* EISA interrupt Level Control Register.
*
* This is a 16-bit register with one bit for each of the IRQ0 through IRQ15.
* A level triggered irq is indicated by setting the corresponding bit to '1'.
*/
#define ELCR_PORT 0x4d0
static uint8_t elcr[2] = { 0x00, 0x00 };
static int
elcr_handler(struct vmctx *ctx, int vcpu, int in, int port, int bytes,
uint32_t *eax, void *arg)
{
int idx;
if (bytes != 1)
return (-1);
idx = port - ELCR_PORT;
if (in)
*eax = elcr[idx];
else
elcr[idx] = *eax;
return (0);
}
INOUT_PORT(elcr, ELCR_PORT + 0, IOPORT_F_INOUT, elcr_handler);
INOUT_PORT(elcr, ELCR_PORT + 1, IOPORT_F_INOUT, elcr_handler);

151
usr.sbin/bhyve/inout.c Normal file
View File

@ -0,0 +1,151 @@
/*-
* Copyright (c) 2011 NetApp, Inc.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* $FreeBSD$
*/
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
#include <sys/param.h>
#include <sys/linker_set.h>
#include <stdio.h>
#include <assert.h>
#include "inout.h"
SET_DECLARE(inout_port_set, struct inout_port);
#define MAX_IOPORTS (1 << 16)
static struct {
const char *name;
int flags;
inout_func_t handler;
void *arg;
} inout_handlers[MAX_IOPORTS];
static int
default_inout(struct vmctx *ctx, int vcpu, int in, int port, int bytes,
uint32_t *eax, void *arg)
{
if (in) {
switch (bytes) {
case 4:
*eax = 0xffffffff;
break;
case 2:
*eax = 0xffff;
break;
case 1:
*eax = 0xff;
break;
}
}
return (0);
}
int
emulate_inout(struct vmctx *ctx, int vcpu, int in, int port, int bytes,
uint32_t *eax, int strict)
{
int flags;
uint32_t mask;
inout_func_t handler;
void *arg;
assert(port < MAX_IOPORTS);
handler = inout_handlers[port].handler;
if (strict && handler == default_inout)
return (-1);
if (!in) {
switch (bytes) {
case 1:
mask = 0xff;
break;
case 2:
mask = 0xffff;
break;
default:
mask = 0xffffffff;
break;
}
*eax = *eax & mask;
}
flags = inout_handlers[port].flags;
arg = inout_handlers[port].arg;
if ((in && (flags & IOPORT_F_IN)) || (!in && (flags & IOPORT_F_OUT)))
return ((*handler)(ctx, vcpu, in, port, bytes, eax, arg));
else
return (-1);
}
void
init_inout(void)
{
struct inout_port **iopp, *iop;
int i;
/*
* Set up the default handler for all ports
*/
for (i = 0; i < MAX_IOPORTS; i++) {
inout_handlers[i].name = "default";
inout_handlers[i].flags = IOPORT_F_IN | IOPORT_F_OUT;
inout_handlers[i].handler = default_inout;
inout_handlers[i].arg = NULL;
}
/*
* Overwrite with specified handlers
*/
SET_FOREACH(iopp, inout_port_set) {
iop = *iopp;
assert(iop->port < MAX_IOPORTS);
inout_handlers[iop->port].name = iop->name;
inout_handlers[iop->port].flags = iop->flags;
inout_handlers[iop->port].handler = iop->handler;
inout_handlers[iop->port].arg = NULL;
}
}
int
register_inout(struct inout_port *iop)
{
assert(iop->port < MAX_IOPORTS);
inout_handlers[iop->port].name = iop->name;
inout_handlers[iop->port].flags = iop->flags;
inout_handlers[iop->port].handler = iop->handler;
inout_handlers[iop->port].arg = iop->arg;
return (0);
}

67
usr.sbin/bhyve/inout.h Normal file
View File

@ -0,0 +1,67 @@
/*-
* Copyright (c) 2011 NetApp, Inc.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* $FreeBSD$
*/
#ifndef _INOUT_H_
#define _INOUT_H_
#include <sys/linker_set.h>
struct vmctx;
typedef int (*inout_func_t)(struct vmctx *ctx, int vcpu, int in, int port,
int bytes, uint32_t *eax, void *arg);
struct inout_port {
const char *name;
int port;
int flags;
inout_func_t handler;
void *arg;
};
#define IOPORT_F_IN 0x1
#define IOPORT_F_OUT 0x2
#define IOPORT_F_INOUT 0x3
#define INOUT_PORT(name, port, flags, handler) \
static struct inout_port __CONCAT(__inout_port, __LINE__) = { \
#name, \
(port), \
(flags), \
(handler), \
0 \
}; \
DATA_SET(inout_port_set, __CONCAT(__inout_port, __LINE__))
void init_inout(void);
int emulate_inout(struct vmctx *, int vcpu, int in, int port, int bytes,
uint32_t *eax, int strict);
int register_inout(struct inout_port *iop);
void init_bvmcons(void);
#endif /* _INOUT_H_ */

324
usr.sbin/bhyve/ioapic.c Normal file
View File

@ -0,0 +1,324 @@
/*-
* Copyright (c) 2012 NetApp, Inc.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* $FreeBSD$
*/
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
#include <sys/types.h>
#include <x86/apicreg.h>
#include <machine/vmm.h>
#include <string.h>
#include <assert.h>
#include <stdbool.h>
#include <vmmapi.h>
#include "inout.h"
#include "mem.h"
#include "bhyverun.h"
#include <stdio.h>
#define IOAPIC_PADDR 0xFEC00000
#define IOREGSEL 0x00
#define IOWIN 0x10
#define REDIR_ENTRIES 16
#define INTR_ASSERTED(ioapic, pin) ((ioapic)->pinstate[(pin)] == true)
struct ioapic {
int inited;
uint32_t id;
uint64_t redtbl[REDIR_ENTRIES];
bool pinstate[REDIR_ENTRIES];
uintptr_t paddr; /* gpa where the ioapic is mapped */
uint32_t ioregsel;
struct memory_region *region;
};
static struct ioapic ioapics[1]; /* only a single ioapic for now */
static int ioapic_region_read(struct ioapic *ioapic, uintptr_t paddr,
int size, uint64_t *data);
static int ioapic_region_write(struct ioapic *ioapic, uintptr_t paddr,
int size, uint64_t data);
static int ioapic_region_handler(struct vmctx *vm, int vcpu, int dir,
uintptr_t paddr, int size, uint64_t *val,
void *arg1, long arg2);
static void
ioapic_set_pinstate(struct vmctx *ctx, int pin, bool newstate)
{
int vector, apicid, vcpu;
uint32_t low, high;
struct ioapic *ioapic;
ioapic = &ioapics[0]; /* assume a single ioapic */
if (pin < 0 || pin >= REDIR_ENTRIES)
return;
/* Nothing to do if interrupt pin has not changed state */
if (ioapic->pinstate[pin] == newstate)
return;
ioapic->pinstate[pin] = newstate; /* record it */
/* Nothing to do if interrupt pin is deasserted */
if (!INTR_ASSERTED(ioapic, pin))
return;
/*
* XXX
* We only deal with:
* - edge triggered interrupts
* - physical destination mode
* - fixed delivery mode
*/
low = ioapic->redtbl[pin];
high = ioapic->redtbl[pin] >> 32;
if ((low & IOART_INTMASK) == IOART_INTMCLR &&
(low & IOART_TRGRMOD) == IOART_TRGREDG &&
(low & IOART_DESTMOD) == IOART_DESTPHY &&
(low & IOART_DELMOD) == IOART_DELFIXED) {
vector = low & IOART_INTVEC;
apicid = high >> APIC_ID_SHIFT;
if (apicid != 0xff) {
/* unicast */
vcpu = vm_apicid2vcpu(ctx, apicid);
vm_lapic_irq(ctx, vcpu, vector);
} else {
/* broadcast */
vcpu = 0;
while (vcpu < guest_ncpus) {
vm_lapic_irq(ctx, vcpu, vector);
vcpu++;
}
}
}
}
void
ioapic_deassert_pin(struct vmctx *ctx, int pin)
{
ioapic_set_pinstate(ctx, pin, false);
}
void
ioapic_assert_pin(struct vmctx *ctx, int pin)
{
ioapic_set_pinstate(ctx, pin, true);
}
void
ioapic_init(int which)
{
struct mem_range memp;
struct ioapic *ioapic;
int error;
int i;
assert(which == 0);
ioapic = &ioapics[which];
assert(ioapic->inited == 0);
bzero(ioapic, sizeof(struct ioapic));
/* Initialize all redirection entries to mask all interrupts */
for (i = 0; i < REDIR_ENTRIES; i++)
ioapic->redtbl[i] = 0x0001000000010000UL;
ioapic->paddr = IOAPIC_PADDR;
/* Register emulated memory region */
memp.name = "ioapic";
memp.flags = MEM_F_RW;
memp.handler = ioapic_region_handler;
memp.arg1 = ioapic;
memp.arg2 = which;
memp.base = ioapic->paddr;
memp.size = sizeof(struct IOAPIC);
error = register_mem(&memp);
assert (error == 0);
ioapic->inited = 1;
}
static uint32_t
ioapic_read(struct ioapic *ioapic, uint32_t addr)
{
int regnum, pin, rshift;
assert(ioapic->inited);
regnum = addr & 0xff;
switch (regnum) {
case IOAPIC_ID:
return (ioapic->id);
break;
case IOAPIC_VER:
return ((REDIR_ENTRIES << MAXREDIRSHIFT) | 0x11);
break;
case IOAPIC_ARB:
return (ioapic->id);
break;
default:
break;
}
/* redirection table entries */
if (regnum >= IOAPIC_REDTBL &&
regnum < IOAPIC_REDTBL + REDIR_ENTRIES * 2) {
pin = (regnum - IOAPIC_REDTBL) / 2;
if ((regnum - IOAPIC_REDTBL) % 2)
rshift = 32;
else
rshift = 0;
return (ioapic->redtbl[pin] >> rshift);
}
return (0);
}
static void
ioapic_write(struct ioapic *ioapic, uint32_t addr, uint32_t data)
{
int regnum, pin, lshift;
assert(ioapic->inited);
regnum = addr & 0xff;
switch (regnum) {
case IOAPIC_ID:
ioapic->id = data & APIC_ID_MASK;
break;
case IOAPIC_VER:
case IOAPIC_ARB:
/* readonly */
break;
default:
break;
}
/* redirection table entries */
if (regnum >= IOAPIC_REDTBL &&
regnum < IOAPIC_REDTBL + REDIR_ENTRIES * 2) {
pin = (regnum - IOAPIC_REDTBL) / 2;
if ((regnum - IOAPIC_REDTBL) % 2)
lshift = 32;
else
lshift = 0;
ioapic->redtbl[pin] &= ~((uint64_t)0xffffffff << lshift);
ioapic->redtbl[pin] |= ((uint64_t)data << lshift);
}
}
static int
ioapic_region_read(struct ioapic *ioapic, uintptr_t paddr, int size,
uint64_t *data)
{
int offset;
offset = paddr - ioapic->paddr;
/*
* The IOAPIC specification allows 32-bit wide accesses to the
* IOREGSEL (offset 0) and IOWIN (offset 16) registers.
*/
if (size != 4 || (offset != IOREGSEL && offset != IOWIN)) {
#if 1
printf("invalid access to ioapic%d: size %d, offset %d\n",
(int)(ioapic - ioapics), size, offset);
#endif
*data = 0;
return (0);
}
if (offset == IOREGSEL)
*data = ioapic->ioregsel;
else
*data = ioapic_read(ioapic, ioapic->ioregsel);
return (0);
}
static int
ioapic_region_write(struct ioapic *ioapic, uintptr_t paddr, int size,
uint64_t data)
{
int offset;
offset = paddr - ioapic->paddr;
/*
* The ioapic specification allows 32-bit wide accesses to the
* IOREGSEL (offset 0) and IOWIN (offset 16) registers.
*/
if (size != 4 || (offset != IOREGSEL && offset != IOWIN)) {
#if 1
printf("invalid access to ioapic%d: size %d, offset %d\n",
(int)(ioapic - ioapics), size, offset);
#endif
return (0);
}
if (offset == IOREGSEL)
ioapic->ioregsel = data;
else
ioapic_write(ioapic, ioapic->ioregsel, data);
return (0);
}
static int
ioapic_region_handler(struct vmctx *vm, int vcpu, int dir, uintptr_t paddr,
int size, uint64_t *val, void *arg1, long arg2)
{
struct ioapic *ioapic;
int which;
ioapic = arg1;
which = arg2;
assert(ioapic == &ioapics[which]);
if (dir == MEM_F_READ)
ioapic_region_read(ioapic, paddr, size, val);
else
ioapic_region_write(ioapic, paddr, size, *val);
return (0);
}

38
usr.sbin/bhyve/ioapic.h Normal file
View File

@ -0,0 +1,38 @@
/*-
* Copyright (c) 2012 NetApp, Inc.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* $FreeBSD$
*/
#ifndef _IOAPIC_H_
#define _IOAPIC_H_
struct vmctx;
void ioapic_init(int num);
void ioapic_deassert_pin(struct vmctx *ctx, int pin);
void ioapic_assert_pin(struct vmctx *ctx, int pin);
#endif

218
usr.sbin/bhyve/mem.c Normal file
View File

@ -0,0 +1,218 @@
/*-
* Copyright (c) 2012 NetApp, Inc.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* $FreeBSD$
*/
/*
* Memory ranges are represented with an RB tree. On insertion, the range
* is checked for overlaps. On lookup, the key has the same base and limit
* so it can be searched within the range.
*
* It is assumed that all setup of ranges takes place in single-threaded
* mode before vCPUs have been started. As such, no locks are used on the
* RB tree. If this is no longer the case, then a r/w lock could be used,
* with readers on the lookup and a writer if the tree needs to be changed
* (and per vCPU caches flushed)
*/
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
#include <sys/types.h>
#include <sys/tree.h>
#include <sys/errno.h>
#include <machine/vmm.h>
#include <stdio.h>
#include <stdlib.h>
#include <assert.h>
#include "mem.h"
struct mmio_rb_range {
RB_ENTRY(mmio_rb_range) mr_link; /* RB tree links */
struct mem_range mr_param;
uint64_t mr_base;
uint64_t mr_end;
};
struct mmio_rb_tree;
RB_PROTOTYPE(mmio_rb_tree, mmio_rb_range, mr_link, mmio_rb_range_compare);
RB_HEAD(mmio_rb_tree, mmio_rb_range) mmio_rbroot;
/*
* Per-vCPU cache. Since most accesses from a vCPU will be to
* consecutive addresses in a range, it makes sense to cache the
* result of a lookup.
*/
static struct mmio_rb_range *mmio_hint[VM_MAXCPU];
static int
mmio_rb_range_compare(struct mmio_rb_range *a, struct mmio_rb_range *b)
{
if (a->mr_end < b->mr_base)
return (-1);
else if (a->mr_base > b->mr_end)
return (1);
return (0);
}
static int
mmio_rb_lookup(uint64_t addr, struct mmio_rb_range **entry)
{
struct mmio_rb_range find, *res;
find.mr_base = find.mr_end = addr;
res = RB_FIND(mmio_rb_tree, &mmio_rbroot, &find);
if (res != NULL) {
*entry = res;
return (0);
}
return (ENOENT);
}
static int
mmio_rb_add(struct mmio_rb_range *new)
{
struct mmio_rb_range *overlap;
overlap = RB_INSERT(mmio_rb_tree, &mmio_rbroot, new);
if (overlap != NULL) {
#ifdef RB_DEBUG
printf("overlap detected: new %lx:%lx, tree %lx:%lx\n",
new->mr_base, new->mr_end,
overlap->mr_base, overlap->mr_end);
#endif
return (EEXIST);
}
return (0);
}
#if 0
static void
mmio_rb_dump(void)
{
struct mmio_rb_range *np;
RB_FOREACH(np, mmio_rb_tree, &mmio_rbroot) {
printf(" %lx:%lx, %s\n", np->mr_base, np->mr_end,
np->mr_param.name);
}
}
#endif
RB_GENERATE(mmio_rb_tree, mmio_rb_range, mr_link, mmio_rb_range_compare);
static int
mem_read(void *ctx, int vcpu, uint64_t gpa, uint64_t *rval, int size, void *arg)
{
int error;
struct mem_range *mr = arg;
error = (*mr->handler)(ctx, vcpu, MEM_F_READ, gpa, size,
rval, mr->arg1, mr->arg2);
return (error);
}
static int
mem_write(void *ctx, int vcpu, uint64_t gpa, uint64_t wval, int size, void *arg)
{
int error;
struct mem_range *mr = arg;
error = (*mr->handler)(ctx, vcpu, MEM_F_WRITE, gpa, size,
&wval, mr->arg1, mr->arg2);
return (error);
}
int
emulate_mem(struct vmctx *ctx, int vcpu, uint64_t paddr, struct vie *vie)
{
struct mmio_rb_range *entry;
int err;
/*
* First check the per-vCPU cache
*/
if (mmio_hint[vcpu] &&
paddr >= mmio_hint[vcpu]->mr_base &&
paddr <= mmio_hint[vcpu]->mr_end) {
entry = mmio_hint[vcpu];
} else
entry = NULL;
if (entry == NULL) {
if (mmio_rb_lookup(paddr, &entry))
return (ESRCH);
/* Update the per-vCPU cache */
mmio_hint[vcpu] = entry;
}
assert(entry != NULL && entry == mmio_hint[vcpu]);
err = vmm_emulate_instruction(ctx, vcpu, paddr, vie,
mem_read, mem_write, &entry->mr_param);
return (err);
}
int
register_mem(struct mem_range *memp)
{
struct mmio_rb_range *mrp;
int err;
err = 0;
mrp = malloc(sizeof(struct mmio_rb_range));
if (mrp != NULL) {
mrp->mr_param = *memp;
mrp->mr_base = memp->base;
mrp->mr_end = memp->base + memp->size - 1;
err = mmio_rb_add(mrp);
if (err)
free(mrp);
} else
err = ENOMEM;
return (err);
}
void
init_mem(void)
{
RB_INIT(&mmio_rbroot);
}

57
usr.sbin/bhyve/mem.h Normal file
View File

@ -0,0 +1,57 @@
/*-
* Copyright (c) 2012 NetApp, Inc.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* $FreeBSD$
*/
#ifndef _MEM_H_
#define _MEM_H_
#include <sys/linker_set.h>
struct vmctx;
typedef int (*mem_func_t)(struct vmctx *ctx, int vcpu, int dir, uint64_t addr,
int size, uint64_t *val, void *arg1, long arg2);
struct mem_range {
const char *name;
int flags;
mem_func_t handler;
void *arg1;
long arg2;
uint64_t base;
uint64_t size;
};
#define MEM_F_READ 0x1
#define MEM_F_WRITE 0x2
#define MEM_F_RW 0x3
void init_mem(void);
int emulate_mem(struct vmctx *, int vcpu, uint64_t paddr, struct vie *vie);
int register_mem(struct mem_range *memp);
#endif /* _MEM_H_ */

432
usr.sbin/bhyve/mevent.c Normal file
View File

@ -0,0 +1,432 @@
/*-
* Copyright (c) 2011 NetApp, Inc.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* $FreeBSD$
*/
/*
* Micro event library for FreeBSD, designed for a single i/o thread
* using kqueue, and having events be persistent by default.
*/
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
#include <assert.h>
#include <errno.h>
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include <sys/types.h>
#include <sys/event.h>
#include <sys/time.h>
#include <pthread.h>
#include <pthread_np.h>
#include "mevent.h"
#define MEVENT_MAX 64
#define MEV_ENABLE 1
#define MEV_DISABLE 2
#define MEV_DEL_PENDING 3
extern char *vmname;
static pthread_t mevent_tid;
static int mevent_pipefd[2];
static pthread_mutex_t mevent_lmutex = PTHREAD_MUTEX_INITIALIZER;
struct mevent {
void (*me_func)(int, enum ev_type, void *);
int me_fd;
enum ev_type me_type;
void *me_param;
int me_cq;
int me_state;
int me_closefd;
LIST_ENTRY(mevent) me_list;
};
static LIST_HEAD(listhead, mevent) global_head, change_head;
static void
mevent_qlock(void)
{
pthread_mutex_lock(&mevent_lmutex);
}
static void
mevent_qunlock(void)
{
pthread_mutex_unlock(&mevent_lmutex);
}
static void
mevent_pipe_read(int fd, enum ev_type type, void *param)
{
char buf[MEVENT_MAX];
int status;
/*
* Drain the pipe read side. The fd is non-blocking so this is
* safe to do.
*/
do {
status = read(fd, buf, sizeof(buf));
} while (status == MEVENT_MAX);
}
static void
mevent_notify(void)
{
char c;
/*
* If calling from outside the i/o thread, write a byte on the
* pipe to force the i/o thread to exit the blocking kevent call.
*/
if (mevent_pipefd[1] != 0 && pthread_self() != mevent_tid) {
write(mevent_pipefd[1], &c, 1);
}
}
static int
mevent_kq_filter(struct mevent *mevp)
{
int retval;
retval = 0;
if (mevp->me_type == EVF_READ)
retval = EVFILT_READ;
if (mevp->me_type == EVF_WRITE)
retval = EVFILT_WRITE;
return (retval);
}
static int
mevent_kq_flags(struct mevent *mevp)
{
int ret;
switch (mevp->me_state) {
case MEV_ENABLE:
ret = EV_ADD;
break;
case MEV_DISABLE:
ret = EV_DISABLE;
break;
case MEV_DEL_PENDING:
ret = EV_DELETE;
break;
}
return (ret);
}
static int
mevent_kq_fflags(struct mevent *mevp)
{
/* XXX nothing yet, perhaps EV_EOF for reads ? */
return (0);
}
static int
mevent_build(int mfd, struct kevent *kev)
{
struct mevent *mevp, *tmpp;
int i;
i = 0;
mevent_qlock();
LIST_FOREACH_SAFE(mevp, &change_head, me_list, tmpp) {
if (mevp->me_closefd) {
/*
* A close of the file descriptor will remove the
* event
*/
close(mevp->me_fd);
} else {
kev[i].ident = mevp->me_fd;
kev[i].filter = mevent_kq_filter(mevp);
kev[i].flags = mevent_kq_flags(mevp);
kev[i].fflags = mevent_kq_fflags(mevp);
kev[i].data = 0;
kev[i].udata = mevp;
i++;
}
mevp->me_cq = 0;
LIST_REMOVE(mevp, me_list);
if (mevp->me_state == MEV_DEL_PENDING) {
free(mevp);
} else {
LIST_INSERT_HEAD(&global_head, mevp, me_list);
}
assert(i < MEVENT_MAX);
}
mevent_qunlock();
return (i);
}
static void
mevent_handle(struct kevent *kev, int numev)
{
struct mevent *mevp;
int i;
for (i = 0; i < numev; i++) {
mevp = kev[i].udata;
/* XXX check for EV_ERROR ? */
(*mevp->me_func)(mevp->me_fd, mevp->me_type, mevp->me_param);
}
}
struct mevent *
mevent_add(int fd, enum ev_type type,
void (*func)(int, enum ev_type, void *), void *param)
{
struct mevent *lp, *mevp;
if (fd < 0 || func == NULL) {
return (NULL);
}
mevp = NULL;
mevent_qlock();
/*
* Verify that the fd/type tuple is not present in any list
*/
LIST_FOREACH(lp, &global_head, me_list) {
if (lp->me_fd == fd && lp->me_type == type) {
goto exit;
}
}
LIST_FOREACH(lp, &change_head, me_list) {
if (lp->me_fd == fd && lp->me_type == type) {
goto exit;
}
}
/*
* Allocate an entry, populate it, and add it to the change list.
*/
mevp = malloc(sizeof(struct mevent));
if (mevp == NULL) {
goto exit;
}
memset(mevp, 0, sizeof(struct mevent));
mevp->me_fd = fd;
mevp->me_type = type;
mevp->me_func = func;
mevp->me_param = param;
LIST_INSERT_HEAD(&change_head, mevp, me_list);
mevp->me_cq = 1;
mevp->me_state = MEV_ENABLE;
mevent_notify();
exit:
mevent_qunlock();
return (mevp);
}
static int
mevent_update(struct mevent *evp, int newstate)
{
/*
* It's not possible to enable/disable a deleted event
*/
if (evp->me_state == MEV_DEL_PENDING)
return (EINVAL);
/*
* No update needed if state isn't changing
*/
if (evp->me_state == newstate)
return (0);
mevent_qlock();
evp->me_state = newstate;
/*
* Place the entry onto the changed list if not already there.
*/
if (evp->me_cq == 0) {
evp->me_cq = 1;
LIST_REMOVE(evp, me_list);
LIST_INSERT_HEAD(&change_head, evp, me_list);
mevent_notify();
}
mevent_qunlock();
return (0);
}
int
mevent_enable(struct mevent *evp)
{
return (mevent_update(evp, MEV_ENABLE));
}
int
mevent_disable(struct mevent *evp)
{
return (mevent_update(evp, MEV_DISABLE));
}
static int
mevent_delete_event(struct mevent *evp, int closefd)
{
mevent_qlock();
/*
* Place the entry onto the changed list if not already there, and
* mark as to be deleted.
*/
if (evp->me_cq == 0) {
evp->me_cq = 1;
LIST_REMOVE(evp, me_list);
LIST_INSERT_HEAD(&change_head, evp, me_list);
mevent_notify();
}
evp->me_state = MEV_DEL_PENDING;
if (closefd)
evp->me_closefd = 1;
mevent_qunlock();
return (0);
}
int
mevent_delete(struct mevent *evp)
{
return (mevent_delete_event(evp, 0));
}
int
mevent_delete_close(struct mevent *evp)
{
return (mevent_delete_event(evp, 1));
}
static void
mevent_set_name(void)
{
char tname[MAXCOMLEN + 1];
snprintf(tname, sizeof(tname), "%s mevent", vmname);
pthread_set_name_np(mevent_tid, tname);
}
void
mevent_dispatch(void)
{
struct kevent changelist[MEVENT_MAX];
struct kevent eventlist[MEVENT_MAX];
struct mevent *pipev;
int mfd;
int numev;
int ret;
mevent_tid = pthread_self();
mevent_set_name();
mfd = kqueue();
assert(mfd > 0);
/*
* Open the pipe that will be used for other threads to force
* the blocking kqueue call to exit by writing to it. Set the
* descriptor to non-blocking.
*/
ret = pipe(mevent_pipefd);
if (ret < 0) {
perror("pipe");
exit(0);
}
/*
* Add internal event handler for the pipe write fd
*/
pipev = mevent_add(mevent_pipefd[0], EVF_READ, mevent_pipe_read, NULL);
assert(pipev != NULL);
for (;;) {
/*
* Build changelist if required.
* XXX the changelist can be put into the blocking call
* to eliminate the extra syscall. Currently better for
* debug.
*/
numev = mevent_build(mfd, changelist);
if (numev) {
ret = kevent(mfd, changelist, numev, NULL, 0, NULL);
if (ret == -1) {
perror("Error return from kevent change");
}
}
/*
* Block awaiting events
*/
ret = kevent(mfd, NULL, 0, eventlist, MEVENT_MAX, NULL);
if (ret == -1) {
perror("Error return from kevent monitor");
}
/*
* Handle reported events
*/
mevent_handle(eventlist, ret);
}
}

49
usr.sbin/bhyve/mevent.h Normal file
View File

@ -0,0 +1,49 @@
/*-
* Copyright (c) 2011 NetApp, Inc.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* $FreeBSD$
*/
#ifndef _MEVENT_H_
#define _MEVENT_H_
enum ev_type {
EVF_READ,
EVF_WRITE
};
struct mevent;
struct mevent *mevent_add(int fd, enum ev_type type,
void (*func)(int, enum ev_type, void *),
void *param);
int mevent_enable(struct mevent *evp);
int mevent_disable(struct mevent *evp);
int mevent_delete(struct mevent *evp);
int mevent_delete_close(struct mevent *evp);
void mevent_dispatch(void);
#endif /* _MEVENT_H_ */

View File

@ -0,0 +1,180 @@
/*-
* Copyright (c) 2011 NetApp, Inc.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* $FreeBSD$
*/
/*
* Test program for the micro event library. Set up a simple TCP echo
* service.
*
* cc mevent_test.c mevent.c -lpthread
*/
#include <sys/types.h>
#include <sys/socket.h>
#include <netinet/in.h>
#include <stdio.h>
#include <stdlib.h>
#include <pthread.h>
#include "mevent.h"
#define TEST_PORT 4321
static pthread_mutex_t accept_mutex = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t accept_condvar = PTHREAD_COND_INITIALIZER;
#define MEVENT_ECHO
#ifdef MEVENT_ECHO
struct esync {
pthread_mutex_t e_mt;
pthread_cond_t e_cond;
};
static void
echoer_callback(int fd, enum ev_type type, void *param)
{
struct esync *sync = param;
pthread_mutex_lock(&sync->e_mt);
pthread_cond_signal(&sync->e_cond);
pthread_mutex_unlock(&sync->e_mt);
}
static void *
echoer(void *param)
{
struct esync sync;
struct mevent *mev;
char buf[128];
int fd = (int)(uintptr_t) param;
int len;
pthread_mutex_init(&sync.e_mt, NULL);
pthread_cond_init(&sync.e_cond, NULL);
pthread_mutex_lock(&sync.e_mt);
mev = mevent_add(fd, EVF_READ, echoer_callback, &sync);
if (mev == NULL) {
printf("Could not allocate echoer event\n");
exit(1);
}
while (!pthread_cond_wait(&sync.e_cond, &sync.e_mt)) {
len = read(fd, buf, sizeof(buf));
if (len > 0) {
write(fd, buf, len);
write(0, buf, len);
} else {
break;
}
}
mevent_delete_close(mev);
pthread_mutex_unlock(&sync.e_mt);
pthread_mutex_destroy(&sync.e_mt);
pthread_cond_destroy(&sync.e_cond);
}
#else
static void *
echoer(void *param)
{
char buf[128];
int fd = (int)(uintptr_t) param;
int len;
while ((len = read(fd, buf, sizeof(buf))) > 0) {
write(1, buf, len);
}
}
#endif /* MEVENT_ECHO */
static void
acceptor_callback(int fd, enum ev_type type, void *param)
{
pthread_mutex_lock(&accept_mutex);
pthread_cond_signal(&accept_condvar);
pthread_mutex_unlock(&accept_mutex);
}
static void *
acceptor(void *param)
{
struct sockaddr_in sin;
pthread_t tid;
int news;
int s;
if ((s = socket(AF_INET, SOCK_STREAM, 0)) < 0) {
perror("socket");
exit(1);
}
sin.sin_len = sizeof(sin);
sin.sin_family = AF_INET;
sin.sin_addr.s_addr = htonl(INADDR_ANY);
sin.sin_port = htons(TEST_PORT);
if (bind(s, (struct sockaddr *)&sin, sizeof(sin)) < 0) {
perror("bind");
exit(1);
}
if (listen(s, 1) < 0) {
perror("listen");
exit(1);
}
(void) mevent_add(s, EVF_READ, acceptor_callback, NULL);
pthread_mutex_lock(&accept_mutex);
while (!pthread_cond_wait(&accept_condvar, &accept_mutex)) {
news = accept(s, NULL, NULL);
if (news < 0) {
perror("accept error");
} else {
printf("incoming connection, spawning thread\n");
pthread_create(&tid, NULL, echoer,
(void *)(uintptr_t)news);
}
}
}
main()
{
pthread_t tid;
pthread_create(&tid, NULL, acceptor, NULL);
mevent_dispatch();
}

398
usr.sbin/bhyve/mptbl.c Normal file
View File

@ -0,0 +1,398 @@
/*-
* Copyright (c) 2012 NetApp, Inc.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* $FreeBSD$
*/
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
#include <sys/types.h>
#include <sys/errno.h>
#include <x86/mptable.h>
#include <stdio.h>
#include <string.h>
#include "bhyverun.h"
#include "mptbl.h"
#define MPTABLE_BASE 0xF0000
#define LAPIC_PADDR 0xFEE00000
#define LAPIC_VERSION 16
#define IOAPIC_PADDR 0xFEC00000
#define IOAPIC_VERSION 0x11
#define MP_SPECREV 4
#define MPFP_SIG "_MP_"
/* Configuration header defines */
#define MPCH_SIG "PCMP"
#define MPCH_OEMID "BHyVe "
#define MPCH_OEMID_LEN 8
#define MPCH_PRODID "Hypervisor "
#define MPCH_PRODID_LEN 12
/* Processor entry defines */
#define MPEP_SIG_FAMILY 6 /* XXX bhyve should supply this */
#define MPEP_SIG_MODEL 26
#define MPEP_SIG_STEPPING 5
#define MPEP_SIG \
((MPEP_SIG_FAMILY << 8) | \
(MPEP_SIG_MODEL << 4) | \
(MPEP_SIG_STEPPING))
#define MPEP_FEATURES (0xBFEBFBFF) /* XXX Intel i7 */
/* Define processor entry struct since <x86/mptable.h> gets it wrong */
typedef struct BPROCENTRY {
u_char type;
u_char apic_id;
u_char apic_version;
u_char cpu_flags;
uint32_t cpu_signature;
uint32_t feature_flags;
uint32_t reserved1;
uint32_t reserved2;
} *bproc_entry_ptr;
CTASSERT(sizeof(struct BPROCENTRY) == 20);
/* Bus entry defines */
#define MPE_NUM_BUSES 2
#define MPE_BUSNAME_LEN 6
#define MPE_BUSNAME_ISA "ISA "
#define MPE_BUSNAME_PCI "PCI "
static void *oem_tbl_start;
static int oem_tbl_size;
static uint8_t
mpt_compute_checksum(void *base, size_t len)
{
uint8_t *bytes;
uint8_t sum;
for(bytes = base, sum = 0; len > 0; len--) {
sum += *bytes++;
}
return (256 - sum);
}
static void
mpt_build_mpfp(mpfps_t mpfp, vm_paddr_t gpa)
{
memset(mpfp, 0, sizeof(*mpfp));
memcpy(mpfp->signature, MPFP_SIG, 4);
mpfp->pap = gpa + sizeof(*mpfp);
mpfp->length = 1;
mpfp->spec_rev = MP_SPECREV;
mpfp->checksum = mpt_compute_checksum(mpfp, sizeof(*mpfp));
}
static void
mpt_build_mpch(mpcth_t mpch)
{
memset(mpch, 0, sizeof(*mpch));
memcpy(mpch->signature, MPCH_SIG, 4);
mpch->spec_rev = MP_SPECREV;
memcpy(mpch->oem_id, MPCH_OEMID, MPCH_OEMID_LEN);
memcpy(mpch->product_id, MPCH_PRODID, MPCH_PRODID_LEN);
mpch->apic_address = LAPIC_PADDR;
}
static void
mpt_build_proc_entries(bproc_entry_ptr mpep, int ncpu)
{
int i;
for (i = 0; i < ncpu; i++) {
memset(mpep, 0, sizeof(*mpep));
mpep->type = MPCT_ENTRY_PROCESSOR;
mpep->apic_id = i; // XXX
mpep->apic_version = LAPIC_VERSION;
mpep->cpu_flags = PROCENTRY_FLAG_EN;
if (i == 0)
mpep->cpu_flags |= PROCENTRY_FLAG_BP;
mpep->cpu_signature = MPEP_SIG;
mpep->feature_flags = MPEP_FEATURES;
mpep++;
}
}
static void
mpt_build_bus_entries(bus_entry_ptr mpeb)
{
memset(mpeb, 0, sizeof(*mpeb));
mpeb->type = MPCT_ENTRY_BUS;
mpeb->bus_id = ISA;
memcpy(mpeb->bus_type, MPE_BUSNAME_ISA, MPE_BUSNAME_LEN);
mpeb++;
memset(mpeb, 0, sizeof(*mpeb));
mpeb->type = MPCT_ENTRY_BUS;
mpeb->bus_id = PCI;
memcpy(mpeb->bus_type, MPE_BUSNAME_PCI, MPE_BUSNAME_LEN);
}
static void
mpt_build_ioapic_entries(io_apic_entry_ptr mpei, int id)
{
memset(mpei, 0, sizeof(*mpei));
mpei->type = MPCT_ENTRY_IOAPIC;
mpei->apic_id = id;
mpei->apic_version = IOAPIC_VERSION;
mpei->apic_flags = IOAPICENTRY_FLAG_EN;
mpei->apic_address = IOAPIC_PADDR;
}
#ifdef notyet
static void
mpt_build_ioint_entries(struct mpe_ioint *mpeii, int num_pins, int id)
{
int pin;
/*
* The following config is taken from kernel mptable.c
* mptable_parse_default_config_ints(...), for now
* just use the default config, tweek later if needed.
*/
/* Run through all 16 pins. */
for (pin = 0; pin < num_pins; pin++) {
memset(mpeii, 0, sizeof(*mpeii));
mpeii->entry_type = MP_ENTRY_IOINT;
mpeii->src_bus_id = MPE_BUSID_ISA;
mpeii->dst_apic_id = id;
/*
* All default configs route IRQs from bus 0 to the first 16
* pins of the first I/O APIC with an APIC ID of 2.
*/
mpeii->dst_apic_intin = pin;
switch (pin) {
case 0:
/* Pin 0 is an ExtINT pin. */
mpeii->intr_type = MPEII_INTR_EXTINT;
break;
case 2:
/* IRQ 0 is routed to pin 2. */
mpeii->intr_type = MPEII_INTR_INT;
mpeii->src_bus_irq = 0;
break;
case 5:
case 10:
case 11:
/*
* PCI Irqs set to level triggered.
*/
mpeii->intr_flags = MPEII_FLAGS_TRIGMODE_LEVEL;
mpeii->src_bus_id = MPE_BUSID_PCI;
default:
/* All other pins are identity mapped. */
mpeii->intr_type = MPEII_INTR_INT;
mpeii->src_bus_irq = pin;
break;
}
mpeii++;
}
}
#define COPYSTR(dest, src, bytes) \
memcpy(dest, src, bytes); \
str[bytes] = 0;
static void
mptable_dump(struct mp_floating_pointer *mpfp, struct mp_config_hdr *mpch)
{
static char str[16];
int i;
char *cur;
union mpe {
struct mpe_proc *proc;
struct mpe_bus *bus;
struct mpe_ioapic *ioapic;
struct mpe_ioint *ioint;
struct mpe_lint *lnit;
char *p;
};
union mpe mpe;
printf(" MP Floating Pointer :\n");
COPYSTR(str, mpfp->signature, 4);
printf("\tsignature:\t%s\n", str);
printf("\tmpch paddr:\t%x\n", mpfp->mptable_paddr);
printf("\tlength:\t%x\n", mpfp->length);
printf("\tspecrec:\t%x\n", mpfp->specrev);
printf("\tchecksum:\t%x\n", mpfp->checksum);
printf("\tfeature1:\t%x\n", mpfp->feature1);
printf("\tfeature2:\t%x\n", mpfp->feature2);
printf("\tfeature3:\t%x\n", mpfp->feature3);
printf("\tfeature4:\t%x\n", mpfp->feature4);
printf(" MP Configuration Header :\n");
COPYSTR(str, mpch->signature, 4);
printf(" signature: %s\n", str);
printf(" length: %x\n", mpch->length);
printf(" specrec: %x\n", mpch->specrev);
printf(" checksum: %x\n", mpch->checksum);
COPYSTR(str, mpch->oemid, MPCH_OEMID_LEN);
printf(" oemid: %s\n", str);
COPYSTR(str, mpch->prodid, MPCH_PRODID_LEN);
printf(" prodid: %s\n", str);
printf(" oem_ptr: %x\n", mpch->oem_ptr);
printf(" oem_sz: %x\n", mpch->oem_sz);
printf(" nr_entries: %x\n", mpch->nr_entries);
printf(" apic paddr: %x\n", mpch->lapic_paddr);
printf(" ext_length: %x\n", mpch->ext_length);
printf(" ext_checksum: %x\n", mpch->ext_checksum);
cur = (char *)mpch + sizeof(*mpch);
for (i = 0; i < mpch->nr_entries; i++) {
mpe.p = cur;
switch(*mpe.p) {
case MP_ENTRY_PROC:
printf(" MP Processor Entry :\n");
printf(" lapic_id: %x\n", mpe.proc->lapic_id);
printf(" lapic_version: %x\n", mpe.proc->lapic_version);
printf(" proc_flags: %x\n", mpe.proc->proc_flags);
printf(" proc_signature: %x\n", mpe.proc->proc_signature);
printf(" feature_flags: %x\n", mpe.proc->feature_flags);
cur += sizeof(struct mpe_proc);
break;
case MP_ENTRY_BUS:
printf(" MP Bus Entry :\n");
printf(" busid: %x\n", mpe.bus->busid);
COPYSTR(str, mpe.bus->busname, MPE_BUSNAME_LEN);
printf(" busname: %s\n", str);
cur += sizeof(struct mpe_bus);
break;
case MP_ENTRY_IOAPIC:
printf(" MP IOAPIC Entry :\n");
printf(" ioapi_id: %x\n", mpe.ioapic->ioapic_id);
printf(" ioapi_version: %x\n", mpe.ioapic->ioapic_version);
printf(" ioapi_flags: %x\n", mpe.ioapic->ioapic_flags);
printf(" ioapi_paddr: %x\n", mpe.ioapic->ioapic_paddr);
cur += sizeof(struct mpe_ioapic);
break;
case MP_ENTRY_IOINT:
printf(" MP IO Interrupt Entry :\n");
printf(" intr_type: %x\n", mpe.ioint->intr_type);
printf(" intr_flags: %x\n", mpe.ioint->intr_flags);
printf(" src_bus_id: %x\n", mpe.ioint->src_bus_id);
printf(" src_bus_irq: %x\n", mpe.ioint->src_bus_irq);
printf(" dst_apic_id: %x\n", mpe.ioint->dst_apic_id);
printf(" dst_apic_intin: %x\n", mpe.ioint->dst_apic_intin);
cur += sizeof(struct mpe_ioint);
break;
case MP_ENTRY_LINT:
printf(" MP Local Interrupt Entry :\n");
cur += sizeof(struct mpe_lint);
break;
}
}
}
#endif
void
mptable_add_oemtbl(void *tbl, int tblsz)
{
oem_tbl_start = tbl;
oem_tbl_size = tblsz;
}
int
mptable_build(struct vmctx *ctx, int ncpu, int ioapic)
{
mpcth_t mpch;
bus_entry_ptr mpeb;
io_apic_entry_ptr mpei;
bproc_entry_ptr mpep;
mpfps_t mpfp;
char *curraddr;
char *startaddr;
if (paddr_guest2host(0) == NULL) {
printf("mptable requires mapped mem\n");
return (ENOMEM);
}
startaddr = curraddr = paddr_guest2host(MPTABLE_BASE);
mpfp = (mpfps_t)curraddr;
mpt_build_mpfp(mpfp, MPTABLE_BASE);
curraddr += sizeof(*mpfp);
mpch = (mpcth_t)curraddr;
mpt_build_mpch(mpch);
curraddr += sizeof(*mpch);
mpep = (bproc_entry_ptr)curraddr;
mpt_build_proc_entries(mpep, ncpu);
curraddr += sizeof(*mpep) * ncpu;
mpch->entry_count += ncpu;
mpeb = (bus_entry_ptr) curraddr;
mpt_build_bus_entries(mpeb);
curraddr += sizeof(*mpeb) * MPE_NUM_BUSES;
mpch->entry_count += MPE_NUM_BUSES;
if (ioapic) {
mpei = (io_apic_entry_ptr)curraddr;
mpt_build_ioapic_entries(mpei, ncpu + 1);
curraddr += sizeof(*mpei);
mpch->entry_count++;
}
#ifdef notyet
mpt_build_ioint_entries((struct mpe_ioint*)curraddr, MPEII_MAX_IRQ,
ncpu + 1);
curraddr += sizeof(struct mpe_ioint) * MPEII_MAX_IRQ;
mpch->entry_count += MPEII_MAX_IRQ;
#endif
if (oem_tbl_start) {
mpch->oem_table_pointer = curraddr - startaddr + MPTABLE_BASE;
mpch->oem_table_size = oem_tbl_size;
memcpy(curraddr, oem_tbl_start, oem_tbl_size);
}
mpch->base_table_length = curraddr - (char *)mpch;
mpch->checksum = mpt_compute_checksum(mpch, sizeof(*mpch));
return (0);
}

35
usr.sbin/bhyve/mptbl.h Normal file
View File

@ -0,0 +1,35 @@
/*-
* Copyright (c) 2012 NetApp, Inc.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* $FreeBSD$
*/
#ifndef _MPTBL_H_
#define _MPTBL_H_
int mptable_build(struct vmctx *ctx, int ncpu, int ioapic);
void mptable_add_oemtbl(void *tbl, int tblsz);
#endif /* _MPTBL_H_ */

1117
usr.sbin/bhyve/pci_emul.c Normal file

File diff suppressed because it is too large Load Diff

216
usr.sbin/bhyve/pci_emul.h Normal file
View File

@ -0,0 +1,216 @@
/*-
* Copyright (c) 2011 NetApp, Inc.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* $FreeBSD$
*/
#ifndef _PCI_EMUL_H_
#define _PCI_EMUL_H_
#include <sys/types.h>
#include <sys/queue.h>
#include <sys/kernel.h>
#include <dev/pci/pcireg.h>
#include <assert.h>
#define PCI_BARMAX PCIR_MAX_BAR_0 /* BAR registers in a Type 0 header */
#define PCIY_RESERVED 0x00
struct vmctx;
struct pci_devinst;
struct memory_region;
struct pci_devemu {
char *pe_emu; /* Name of device emulation */
/* instance creation */
int (*pe_init)(struct vmctx *, struct pci_devinst *,
char *opts);
/* config space read/write callbacks */
int (*pe_cfgwrite)(struct vmctx *ctx, int vcpu,
struct pci_devinst *pi, int offset,
int bytes, uint32_t val);
int (*pe_cfgread)(struct vmctx *ctx, int vcpu,
struct pci_devinst *pi, int offset,
int bytes, uint32_t *retval);
/* BAR read/write callbacks */
void (*pe_barwrite)(struct vmctx *ctx, int vcpu,
struct pci_devinst *pi, int baridx,
uint64_t offset, int size, uint64_t value);
uint64_t (*pe_barread)(struct vmctx *ctx, int vcpu,
struct pci_devinst *pi, int baridx,
uint64_t offset, int size);
};
#define PCI_EMUL_SET(x) DATA_SET(pci_devemu_set, x);
enum pcibar_type {
PCIBAR_NONE,
PCIBAR_IO,
PCIBAR_MEM32,
PCIBAR_MEM64,
PCIBAR_MEMHI64
};
struct pcibar {
enum pcibar_type type; /* io or memory */
uint64_t size;
uint64_t addr;
};
#define PI_NAMESZ 40
struct msix_table_entry {
uint64_t addr;
uint32_t msg_data;
uint32_t vector_control;
} __packed;
/*
* In case the structure is modified to hold extra information, use a define
* for the size that should be emulated.
*/
#define MSIX_TABLE_ENTRY_SIZE 16
#define MAX_MSIX_TABLE_SIZE 2048
struct pci_devinst {
struct pci_devemu *pi_d;
struct vmctx *pi_vmctx;
uint8_t pi_bus, pi_slot, pi_func;
uint8_t pi_lintr_pin;
char pi_name[PI_NAMESZ];
uint16_t pi_iobase;
int pi_bar_getsize;
struct {
int enabled;
int cpu;
int vector;
int msgnum;
} pi_msi;
struct {
int enabled;
int table_bar;
int pba_bar;
size_t table_offset;
size_t table_size;
int table_count;
size_t pba_offset;
struct msix_table_entry table[MAX_MSIX_TABLE_SIZE];
} pi_msix;
void *pi_arg; /* devemu-private data */
u_char pi_cfgdata[PCI_REGMAX + 1];
struct pcibar pi_bar[PCI_BARMAX + 1];
};
struct msicap {
uint8_t capid;
uint8_t nextptr;
uint16_t msgctrl;
uint32_t addrlo;
uint32_t addrhi;
uint16_t msgdata;
} __packed;
struct msixcap {
uint8_t capid;
uint8_t nextptr;
uint16_t msgctrl;
uint32_t table_offset;
uint32_t pba_offset;
} __packed;
void init_pci(struct vmctx *ctx);
void msicap_cfgwrite(struct pci_devinst *pi, int capoff, int offset,
int bytes, uint32_t val);
void msixcap_cfgwrite(struct pci_devinst *pi, int capoff, int offset,
int bytes, uint32_t val);
void pci_callback(void);
int pci_emul_alloc_bar(struct pci_devinst *pdi, int idx,
enum pcibar_type type, uint64_t size);
int pci_emul_alloc_pbar(struct pci_devinst *pdi, int idx,
uint64_t hostbase, enum pcibar_type type, uint64_t size);
int pci_emul_add_msicap(struct pci_devinst *pi, int msgnum);
int pci_is_legacy(struct pci_devinst *pi);
void pci_generate_msi(struct pci_devinst *pi, int msgnum);
void pci_generate_msix(struct pci_devinst *pi, int msgnum);
void pci_lintr_assert(struct pci_devinst *pi);
void pci_lintr_deassert(struct pci_devinst *pi);
int pci_lintr_request(struct pci_devinst *pi, int ivec);
int pci_msi_enabled(struct pci_devinst *pi);
int pci_msix_enabled(struct pci_devinst *pi);
int pci_msi_msgnum(struct pci_devinst *pi);
void pci_parse_slot(char *opt, int legacy);
void pci_populate_msicap(struct msicap *cap, int msgs, int nextptr);
static __inline void
pci_set_cfgdata8(struct pci_devinst *pi, int offset, uint8_t val)
{
assert(offset <= PCI_REGMAX);
*(uint8_t *)(pi->pi_cfgdata + offset) = val;
}
static __inline void
pci_set_cfgdata16(struct pci_devinst *pi, int offset, uint16_t val)
{
assert(offset <= (PCI_REGMAX - 1) && (offset & 1) == 0);
*(uint16_t *)(pi->pi_cfgdata + offset) = val;
}
static __inline void
pci_set_cfgdata32(struct pci_devinst *pi, int offset, uint32_t val)
{
assert(offset <= (PCI_REGMAX - 3) && (offset & 3) == 0);
*(uint32_t *)(pi->pi_cfgdata + offset) = val;
}
static __inline uint8_t
pci_get_cfgdata8(struct pci_devinst *pi, int offset)
{
assert(offset <= PCI_REGMAX);
return (*(uint8_t *)(pi->pi_cfgdata + offset));
}
static __inline uint16_t
pci_get_cfgdata16(struct pci_devinst *pi, int offset)
{
assert(offset <= (PCI_REGMAX - 1) && (offset & 1) == 0);
return (*(uint16_t *)(pi->pi_cfgdata + offset));
}
static __inline uint32_t
pci_get_cfgdata32(struct pci_devinst *pi, int offset)
{
assert(offset <= (PCI_REGMAX - 3) && (offset & 3) == 0);
return (*(uint32_t *)(pi->pi_cfgdata + offset));
}
#endif /* _PCI_EMUL_H_ */

View File

@ -0,0 +1,52 @@
/*-
* Copyright (c) 2011 NetApp, Inc.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* $FreeBSD$
*/
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
#include "pci_emul.h"
static int
pci_hostbridge_init(struct vmctx *ctx, struct pci_devinst *pi, char *opts)
{
/* config space */
pci_set_cfgdata16(pi, PCIR_VENDOR, 0x1275); /* NetApp */
pci_set_cfgdata16(pi, PCIR_DEVICE, 0x1275); /* NetApp */
pci_set_cfgdata8(pi, PCIR_HDRTYPE, PCIM_HDRTYPE_BRIDGE);
pci_set_cfgdata8(pi, PCIR_CLASS, PCIC_BRIDGE);
pci_set_cfgdata8(pi, PCIR_SUBCLASS, PCIS_BRIDGE_HOST);
return (0);
}
struct pci_devemu pci_de_hostbridge = {
.pe_emu = "hostbridge",
.pe_init = pci_hostbridge_init,
};
PCI_EMUL_SET(pci_de_hostbridge);

View File

@ -0,0 +1,724 @@
/*-
* Copyright (c) 2011 NetApp, Inc.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* $FreeBSD$
*/
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
#include <sys/param.h>
#include <sys/types.h>
#include <sys/pciio.h>
#include <sys/ioctl.h>
#include <dev/io/iodev.h>
#include <machine/iodev.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <errno.h>
#include <fcntl.h>
#include <unistd.h>
#include <machine/vmm.h>
#include <vmmapi.h>
#include "pci_emul.h"
#include "mem.h"
#ifndef _PATH_DEVPCI
#define _PATH_DEVPCI "/dev/pci"
#endif
#ifndef _PATH_DEVIO
#define _PATH_DEVIO "/dev/io"
#endif
#define LEGACY_SUPPORT 1
#define MSIX_TABLE_BIR_MASK 7
#define MSIX_TABLE_OFFSET_MASK (~MSIX_TABLE_BIR_MASK);
#define MSIX_TABLE_COUNT(x) (((x) & 0x7FF) + 1)
#define MSIX_CAPLEN 12
static int pcifd = -1;
static int iofd = -1;
struct passthru_softc {
struct pci_devinst *psc_pi;
struct pcibar psc_bar[PCI_BARMAX + 1];
struct {
int capoff;
int msgctrl;
int emulated;
} psc_msi;
struct {
int capoff;
} psc_msix;
struct pcisel psc_sel;
};
static int
msi_caplen(int msgctrl)
{
int len;
len = 10; /* minimum length of msi capability */
if (msgctrl & PCIM_MSICTRL_64BIT)
len += 4;
#if 0
/*
* Ignore the 'mask' and 'pending' bits in the MSI capability.
* We'll let the guest manipulate them directly.
*/
if (msgctrl & PCIM_MSICTRL_VECTOR)
len += 10;
#endif
return (len);
}
static uint32_t
read_config(const struct pcisel *sel, long reg, int width)
{
struct pci_io pi;
bzero(&pi, sizeof(pi));
pi.pi_sel = *sel;
pi.pi_reg = reg;
pi.pi_width = width;
if (ioctl(pcifd, PCIOCREAD, &pi) < 0)
return (0); /* XXX */
else
return (pi.pi_data);
}
static void
write_config(const struct pcisel *sel, long reg, int width, uint32_t data)
{
struct pci_io pi;
bzero(&pi, sizeof(pi));
pi.pi_sel = *sel;
pi.pi_reg = reg;
pi.pi_width = width;
pi.pi_data = data;
(void)ioctl(pcifd, PCIOCWRITE, &pi); /* XXX */
}
#ifdef LEGACY_SUPPORT
static int
passthru_add_msicap(struct pci_devinst *pi, int msgnum, int nextptr)
{
int capoff, i;
struct msicap msicap;
u_char *capdata;
pci_populate_msicap(&msicap, msgnum, nextptr);
/*
* XXX
* Copy the msi capability structure in the last 16 bytes of the
* config space. This is wrong because it could shadow something
* useful to the device.
*/
capoff = 256 - roundup(sizeof(msicap), 4);
capdata = (u_char *)&msicap;
for (i = 0; i < sizeof(msicap); i++)
pci_set_cfgdata8(pi, capoff + i, capdata[i]);
return (capoff);
}
#endif /* LEGACY_SUPPORT */
static int
cfginitmsi(struct passthru_softc *sc)
{
int ptr, capptr, cap, sts, caplen;
uint32_t u32;
struct pcisel sel;
struct pci_devinst *pi;
struct msixcap msixcap;
uint32_t *msixcap_ptr;
pi = sc->psc_pi;
sel = sc->psc_sel;
/*
* Parse the capabilities and cache the location of the MSI
* and MSI-X capabilities.
*/
sts = read_config(&sel, PCIR_STATUS, 2);
if (sts & PCIM_STATUS_CAPPRESENT) {
ptr = read_config(&sel, PCIR_CAP_PTR, 1);
while (ptr != 0 && ptr != 0xff) {
cap = read_config(&sel, ptr + PCICAP_ID, 1);
if (cap == PCIY_MSI) {
/*
* Copy the MSI capability into the config
* space of the emulated pci device
*/
sc->psc_msi.capoff = ptr;
sc->psc_msi.msgctrl = read_config(&sel,
ptr + 2, 2);
sc->psc_msi.emulated = 0;
caplen = msi_caplen(sc->psc_msi.msgctrl);
capptr = ptr;
while (caplen > 0) {
u32 = read_config(&sel, capptr, 4);
pci_set_cfgdata32(pi, capptr, u32);
caplen -= 4;
capptr += 4;
}
} else if (cap == PCIY_MSIX) {
/*
* Copy the MSI-X capability
*/
sc->psc_msix.capoff = ptr;
caplen = 12;
msixcap_ptr = (uint32_t*) &msixcap;
capptr = ptr;
while (caplen > 0) {
u32 = read_config(&sel, capptr, 4);
*msixcap_ptr = u32;
pci_set_cfgdata32(pi, capptr, u32);
caplen -= 4;
capptr += 4;
msixcap_ptr++;
}
}
ptr = read_config(&sel, ptr + PCICAP_NEXTPTR, 1);
}
}
if (sc->psc_msix.capoff != 0) {
pi->pi_msix.pba_bar =
msixcap.pba_offset & MSIX_TABLE_BIR_MASK;
pi->pi_msix.pba_offset =
msixcap.pba_offset & MSIX_TABLE_OFFSET_MASK;
pi->pi_msix.table_bar =
msixcap.table_offset & MSIX_TABLE_BIR_MASK;
pi->pi_msix.table_offset =
msixcap.table_offset & MSIX_TABLE_OFFSET_MASK;
pi->pi_msix.table_count = MSIX_TABLE_COUNT(msixcap.msgctrl);
}
#ifdef LEGACY_SUPPORT
/*
* If the passthrough device does not support MSI then craft a
* MSI capability for it. We link the new MSI capability at the
* head of the list of capabilities.
*/
if ((sts & PCIM_STATUS_CAPPRESENT) != 0 && sc->psc_msi.capoff == 0) {
int origptr, msiptr;
origptr = read_config(&sel, PCIR_CAP_PTR, 1);
msiptr = passthru_add_msicap(pi, 1, origptr);
sc->psc_msi.capoff = msiptr;
sc->psc_msi.msgctrl = pci_get_cfgdata16(pi, msiptr + 2);
sc->psc_msi.emulated = 1;
pci_set_cfgdata8(pi, PCIR_CAP_PTR, msiptr);
}
#endif
/* Make sure one of the capabilities is present */
if (sc->psc_msi.capoff == 0 && sc->psc_msix.capoff == 0)
return (-1);
else
return (0);
}
static uint64_t
msix_table_read(struct passthru_softc *sc, uint64_t offset, int size)
{
struct pci_devinst *pi;
struct msix_table_entry *entry;
uint8_t *src8;
uint16_t *src16;
uint32_t *src32;
uint64_t *src64;
uint64_t data;
size_t entry_offset;
int index;
pi = sc->psc_pi;
entry_offset = offset % MSIX_TABLE_ENTRY_SIZE;
index = offset / MSIX_TABLE_ENTRY_SIZE;
entry = &pi->pi_msix.table[index];
switch(size) {
case 1:
src8 = (uint8_t *)((void *)entry + entry_offset);
data = *src8;
break;
case 2:
src16 = (uint16_t *)((void *)entry + entry_offset);
data = *src16;
break;
case 4:
src32 = (uint32_t *)((void *)entry + entry_offset);
data = *src32;
break;
case 8:
src64 = (uint64_t *)((void *)entry + entry_offset);
data = *src64;
break;
default:
return (-1);
}
return (data);
}
static void
msix_table_write(struct vmctx *ctx, int vcpu, struct passthru_softc *sc,
uint64_t offset, int size, uint64_t data)
{
struct pci_devinst *pi;
struct msix_table_entry *entry;
uint32_t *dest;
size_t entry_offset;
uint32_t vector_control;
int error, index;
pi = sc->psc_pi;
entry_offset = offset % MSIX_TABLE_ENTRY_SIZE;
index = offset / MSIX_TABLE_ENTRY_SIZE;
entry = &pi->pi_msix.table[index];
/* Only 4 byte naturally-aligned writes are supported */
assert(size == 4);
assert(entry_offset % 4 == 0);
vector_control = entry->vector_control;
dest = (uint32_t *)((void *)entry + entry_offset);
*dest = data;
/* If MSI-X hasn't been enabled, do nothing */
if (pi->pi_msix.enabled) {
/* If the entry is masked, don't set it up */
if ((entry->vector_control & PCIM_MSIX_VCTRL_MASK) == 0 ||
(vector_control & PCIM_MSIX_VCTRL_MASK) == 0) {
error = vm_setup_msix(ctx, vcpu, sc->psc_sel.pc_bus,
sc->psc_sel.pc_dev,
sc->psc_sel.pc_func,
index, entry->msg_data,
entry->vector_control,
entry->addr);
}
}
}
static int
init_msix_table(struct vmctx *ctx, struct passthru_softc *sc, uint64_t base)
{
int idx;
size_t table_size;
vm_paddr_t start;
size_t len;
struct pci_devinst *pi = sc->psc_pi;
/*
* If the MSI-X table BAR maps memory intended for
* other uses, it is at least assured that the table
* either resides in its own page within the region,
* or it resides in a page shared with only the PBA.
*/
if (pi->pi_msix.pba_bar == pi->pi_msix.table_bar &&
((pi->pi_msix.pba_offset - pi->pi_msix.table_offset) < 4096)) {
/* Need to also emulate the PBA, not supported yet */
printf("Unsupported MSI-X table and PBA in same page\n");
return (-1);
}
/*
* May need to split the BAR into 3 regions:
* Before the MSI-X table, the MSI-X table, and after it
* XXX for now, assume that the table is not in the middle
*/
table_size = pi->pi_msix.table_count * MSIX_TABLE_ENTRY_SIZE;
pi->pi_msix.table_size = table_size;
idx = pi->pi_msix.table_bar;
/* Round up to page size */
table_size = (table_size + 0x1000) & ~0xFFF;
if (pi->pi_msix.table_offset == 0) {
/* Map everything after the MSI-X table */
start = pi->pi_bar[idx].addr + table_size;
len = pi->pi_bar[idx].size - table_size;
} else {
/* Map everything before the MSI-X table */
start = pi->pi_bar[idx].addr;
len = pi->pi_msix.table_offset;
}
return (vm_map_pptdev_mmio(ctx, sc->psc_sel.pc_bus,
sc->psc_sel.pc_dev, sc->psc_sel.pc_func,
start, len, base + table_size));
}
static int
cfginitbar(struct vmctx *ctx, struct passthru_softc *sc)
{
int i, error;
struct pci_devinst *pi;
struct pci_bar_io bar;
enum pcibar_type bartype;
uint64_t base;
pi = sc->psc_pi;
/*
* Initialize BAR registers
*/
for (i = 0; i <= PCI_BARMAX; i++) {
bzero(&bar, sizeof(bar));
bar.pbi_sel = sc->psc_sel;
bar.pbi_reg = PCIR_BAR(i);
if (ioctl(pcifd, PCIOCGETBAR, &bar) < 0)
continue;
if (PCI_BAR_IO(bar.pbi_base)) {
bartype = PCIBAR_IO;
base = bar.pbi_base & PCIM_BAR_IO_BASE;
} else {
switch (bar.pbi_base & PCIM_BAR_MEM_TYPE) {
case PCIM_BAR_MEM_64:
bartype = PCIBAR_MEM64;
break;
default:
bartype = PCIBAR_MEM32;
break;
}
base = bar.pbi_base & PCIM_BAR_MEM_BASE;
}
/* Cache information about the "real" BAR */
sc->psc_bar[i].type = bartype;
sc->psc_bar[i].size = bar.pbi_length;
sc->psc_bar[i].addr = base;
/* Allocate the BAR in the guest I/O or MMIO space */
error = pci_emul_alloc_pbar(pi, i, base, bartype,
bar.pbi_length);
if (error)
return (-1);
/* The MSI-X table needs special handling */
if (i == pi->pi_msix.table_bar) {
error = init_msix_table(ctx, sc, base);
if (error)
return (-1);
} else if (bartype != PCIBAR_IO) {
/* Map the physical MMIO space in the guest MMIO space */
error = vm_map_pptdev_mmio(ctx, sc->psc_sel.pc_bus,
sc->psc_sel.pc_dev, sc->psc_sel.pc_func,
pi->pi_bar[i].addr, pi->pi_bar[i].size, base);
if (error)
return (-1);
}
/*
* 64-bit BAR takes up two slots so skip the next one.
*/
if (bartype == PCIBAR_MEM64) {
i++;
assert(i <= PCI_BARMAX);
sc->psc_bar[i].type = PCIBAR_MEMHI64;
}
}
return (0);
}
static int
cfginit(struct vmctx *ctx, struct pci_devinst *pi, int bus, int slot, int func)
{
int error;
struct passthru_softc *sc;
error = 1;
sc = pi->pi_arg;
bzero(&sc->psc_sel, sizeof(struct pcisel));
sc->psc_sel.pc_bus = bus;
sc->psc_sel.pc_dev = slot;
sc->psc_sel.pc_func = func;
if (cfginitmsi(sc) != 0)
goto done;
if (cfginitbar(ctx, sc) != 0)
goto done;
error = 0; /* success */
done:
return (error);
}
static int
passthru_init(struct vmctx *ctx, struct pci_devinst *pi, char *opts)
{
int bus, slot, func, error;
struct passthru_softc *sc;
sc = NULL;
error = 1;
if (pcifd < 0) {
pcifd = open(_PATH_DEVPCI, O_RDWR, 0);
if (pcifd < 0)
goto done;
}
if (iofd < 0) {
iofd = open(_PATH_DEVIO, O_RDWR, 0);
if (iofd < 0)
goto done;
}
if (opts == NULL ||
sscanf(opts, "%d/%d/%d", &bus, &slot, &func) != 3)
goto done;
if (vm_assign_pptdev(ctx, bus, slot, func) != 0)
goto done;
sc = malloc(sizeof(struct passthru_softc));
memset(sc, 0, sizeof(struct passthru_softc));
pi->pi_arg = sc;
sc->psc_pi = pi;
/* initialize config space */
if ((error = cfginit(ctx, pi, bus, slot, func)) != 0)
goto done;
error = 0; /* success */
done:
if (error) {
free(sc);
vm_unassign_pptdev(ctx, bus, slot, func);
}
return (error);
}
static int
bar_access(int coff)
{
if (coff >= PCIR_BAR(0) && coff < PCIR_BAR(PCI_BARMAX + 1))
return (1);
else
return (0);
}
static int
msicap_access(struct passthru_softc *sc, int coff)
{
int caplen;
if (sc->psc_msi.capoff == 0)
return (0);
caplen = msi_caplen(sc->psc_msi.msgctrl);
if (coff >= sc->psc_msi.capoff && coff < sc->psc_msi.capoff + caplen)
return (1);
else
return (0);
}
static int
msixcap_access(struct passthru_softc *sc, int coff)
{
if (sc->psc_msix.capoff == 0)
return (0);
return (coff >= sc->psc_msix.capoff &&
coff < sc->psc_msix.capoff + MSIX_CAPLEN);
}
static int
passthru_cfgread(struct vmctx *ctx, int vcpu, struct pci_devinst *pi,
int coff, int bytes, uint32_t *rv)
{
struct passthru_softc *sc;
sc = pi->pi_arg;
/*
* PCI BARs and MSI capability is emulated.
*/
if (bar_access(coff) || msicap_access(sc, coff))
return (-1);
#ifdef LEGACY_SUPPORT
/*
* Emulate PCIR_CAP_PTR if this device does not support MSI capability
* natively.
*/
if (sc->psc_msi.emulated) {
if (coff >= PCIR_CAP_PTR && coff < PCIR_CAP_PTR + 4)
return (-1);
}
#endif
/* Everything else just read from the device's config space */
*rv = read_config(&sc->psc_sel, coff, bytes);
return (0);
}
static int
passthru_cfgwrite(struct vmctx *ctx, int vcpu, struct pci_devinst *pi,
int coff, int bytes, uint32_t val)
{
int error, msix_table_entries, i;
struct passthru_softc *sc;
sc = pi->pi_arg;
/*
* PCI BARs are emulated
*/
if (bar_access(coff))
return (-1);
/*
* MSI capability is emulated
*/
if (msicap_access(sc, coff)) {
msicap_cfgwrite(pi, sc->psc_msi.capoff, coff, bytes, val);
error = vm_setup_msi(ctx, vcpu, sc->psc_sel.pc_bus,
sc->psc_sel.pc_dev, sc->psc_sel.pc_func, pi->pi_msi.cpu,
pi->pi_msi.vector, pi->pi_msi.msgnum);
if (error != 0) {
printf("vm_setup_msi returned error %d\r\n", errno);
exit(1);
}
return (0);
}
if (msixcap_access(sc, coff)) {
msixcap_cfgwrite(pi, sc->psc_msix.capoff, coff, bytes, val);
if (pi->pi_msix.enabled) {
msix_table_entries = pi->pi_msix.table_count;
for (i = 0; i < msix_table_entries; i++) {
error = vm_setup_msix(ctx, vcpu, sc->psc_sel.pc_bus,
sc->psc_sel.pc_dev,
sc->psc_sel.pc_func, i,
pi->pi_msix.table[i].msg_data,
pi->pi_msix.table[i].vector_control,
pi->pi_msix.table[i].addr);
if (error) {
printf("vm_setup_msix returned error %d\r\n", errno);
exit(1);
}
}
}
return (0);
}
#ifdef LEGACY_SUPPORT
/*
* If this device does not support MSI natively then we cannot let
* the guest disable legacy interrupts from the device. It is the
* legacy interrupt that is triggering the virtual MSI to the guest.
*/
if (sc->psc_msi.emulated && pci_msi_enabled(pi)) {
if (coff == PCIR_COMMAND && bytes == 2)
val &= ~PCIM_CMD_INTxDIS;
}
#endif
write_config(&sc->psc_sel, coff, bytes, val);
return (0);
}
static void
passthru_write(struct vmctx *ctx, int vcpu, struct pci_devinst *pi, int baridx,
uint64_t offset, int size, uint64_t value)
{
struct passthru_softc *sc;
struct iodev_pio_req pio;
sc = pi->pi_arg;
if (pi->pi_msix.table_bar == baridx) {
msix_table_write(ctx, vcpu, sc, offset, size, value);
} else {
assert(pi->pi_bar[baridx].type == PCIBAR_IO);
bzero(&pio, sizeof(struct iodev_pio_req));
pio.access = IODEV_PIO_WRITE;
pio.port = sc->psc_bar[baridx].addr + offset;
pio.width = size;
pio.val = value;
(void)ioctl(iofd, IODEV_PIO, &pio);
}
}
static uint64_t
passthru_read(struct vmctx *ctx, int vcpu, struct pci_devinst *pi, int baridx,
uint64_t offset, int size)
{
struct passthru_softc *sc;
struct iodev_pio_req pio;
uint64_t val;
sc = pi->pi_arg;
if (pi->pi_msix.table_bar == baridx) {
val = msix_table_read(sc, offset, size);
} else {
assert(pi->pi_bar[baridx].type == PCIBAR_IO);
bzero(&pio, sizeof(struct iodev_pio_req));
pio.access = IODEV_PIO_READ;
pio.port = sc->psc_bar[baridx].addr + offset;
pio.width = size;
pio.val = 0;
(void)ioctl(iofd, IODEV_PIO, &pio);
val = pio.val;
}
return (val);
}
struct pci_devemu passthru = {
.pe_emu = "passthru",
.pe_init = passthru_init,
.pe_cfgwrite = passthru_cfgwrite,
.pe_cfgread = passthru_cfgread,
.pe_barwrite = passthru_write,
.pe_barread = passthru_read,
};
PCI_EMUL_SET(passthru);

626
usr.sbin/bhyve/pci_uart.c Normal file
View File

@ -0,0 +1,626 @@
/*-
* Copyright (c) 2012 NetApp, Inc.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* $FreeBSD$
*/
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
#include <sys/types.h>
#include <sys/select.h>
#include <dev/ic/ns16550.h>
#include <stdio.h>
#include <stdlib.h>
#include <assert.h>
#include <termios.h>
#include <unistd.h>
#include <stdbool.h>
#include <string.h>
#include <pthread.h>
#include "bhyverun.h"
#include "pci_emul.h"
#include "mevent.h"
#define COM1_BASE 0x3F8
#define COM1_IRQ 4
#define COM2_BASE 0x2F8
#define COM2_IRQ 3
#define DEFAULT_RCLK 1843200
#define DEFAULT_BAUD 9600
#define FCR_RX_MASK 0xC0
#define MCR_OUT1 0x04
#define MCR_OUT2 0x08
#define MSR_DELTA_MASK 0x0f
#ifndef REG_SCR
#define REG_SCR com_scr
#endif
#define FIFOSZ 16
/*
* Pick a PCI vid/did of a chip with a single uart at
* BAR0, that most versions of FreeBSD can understand:
* Siig CyberSerial 1-port.
*/
#define COM_VENDOR 0x131f
#define COM_DEV 0x2000
static int pci_uart_stdio; /* stdio in use for i/o */
static int pci_uart_nldevs; /* number of legacy devices - 2 max */
static struct {
uint64_t baddr;
int vector;
} pci_uart_lres[] = {
{ COM1_BASE, COM1_IRQ},
{ COM2_BASE, COM2_IRQ},
{ 0, 0 }
};
struct fifo {
uint8_t buf[FIFOSZ];
int rindex; /* index to read from */
int windex; /* index to write to */
int num; /* number of characters in the fifo */
int size; /* size of the fifo */
};
struct pci_uart_softc {
struct pci_devinst *pi;
pthread_mutex_t mtx; /* protects all softc elements */
uint8_t data; /* Data register (R/W) */
uint8_t ier; /* Interrupt enable register (R/W) */
uint8_t lcr; /* Line control register (R/W) */
uint8_t mcr; /* Modem control register (R/W) */
uint8_t lsr; /* Line status register (R/W) */
uint8_t msr; /* Modem status register (R/W) */
uint8_t fcr; /* FIFO control register (W) */
uint8_t scr; /* Scratch register (R/W) */
uint8_t dll; /* Baudrate divisor latch LSB */
uint8_t dlh; /* Baudrate divisor latch MSB */
struct fifo rxfifo;
int opened;
int stdio;
bool thre_int_pending; /* THRE interrupt pending */
};
static void pci_uart_drain(int fd, enum ev_type ev, void *arg);
static struct termios tio_orig, tio_new; /* I/O Terminals */
static void
ttyclose(void)
{
tcsetattr(STDIN_FILENO, TCSANOW, &tio_orig);
}
static void
ttyopen(void)
{
tcgetattr(STDIN_FILENO, &tio_orig);
cfmakeraw(&tio_new);
tcsetattr(STDIN_FILENO, TCSANOW, &tio_new);
atexit(ttyclose);
}
static bool
tty_char_available(void)
{
fd_set rfds;
struct timeval tv;
FD_ZERO(&rfds);
FD_SET(STDIN_FILENO, &rfds);
tv.tv_sec = 0;
tv.tv_usec = 0;
if (select(STDIN_FILENO + 1, &rfds, NULL, NULL, &tv) > 0 ) {
return (true);
} else {
return (false);
}
}
static int
ttyread(void)
{
char rb;
if (tty_char_available()) {
read(STDIN_FILENO, &rb, 1);
return (rb & 0xff);
} else {
return (-1);
}
}
static void
ttywrite(unsigned char wb)
{
(void) write(STDIN_FILENO, &wb, 1);
}
static void
fifo_reset(struct fifo *fifo, int size)
{
bzero(fifo, sizeof(struct fifo));
fifo->size = size;
}
static int
fifo_putchar(struct fifo *fifo, uint8_t ch)
{
if (fifo->num < fifo->size) {
fifo->buf[fifo->windex] = ch;
fifo->windex = (fifo->windex + 1) % fifo->size;
fifo->num++;
return (0);
} else
return (-1);
}
static int
fifo_getchar(struct fifo *fifo)
{
int c;
if (fifo->num > 0) {
c = fifo->buf[fifo->rindex];
fifo->rindex = (fifo->rindex + 1) % fifo->size;
fifo->num--;
return (c);
} else
return (-1);
}
static int
fifo_numchars(struct fifo *fifo)
{
return (fifo->num);
}
static int
fifo_available(struct fifo *fifo)
{
return (fifo->num < fifo->size);
}
static void
pci_uart_opentty(struct pci_uart_softc *sc)
{
struct mevent *mev;
assert(sc->opened == 0);
assert(sc->stdio);
ttyopen();
mev = mevent_add(STDIN_FILENO, EVF_READ, pci_uart_drain, sc);
assert(mev);
}
static void
pci_uart_legacy_res(uint64_t *bar, int *ivec)
{
if (pci_uart_lres[pci_uart_nldevs].baddr != 0) {
*bar = pci_uart_lres[pci_uart_nldevs].baddr;
*ivec = pci_uart_lres[pci_uart_nldevs].vector;
pci_uart_nldevs++;
} else {
/* TODO: print warning ? */
*bar = 0;
*ivec= -1;
}
}
/*
* The IIR returns a prioritized interrupt reason:
* - receive data available
* - transmit holding register empty
* - modem status change
*
* Return an interrupt reason if one is available.
*/
static int
pci_uart_intr_reason(struct pci_uart_softc *sc)
{
if ((sc->lsr & LSR_OE) != 0 && (sc->ier & IER_ERLS) != 0)
return (IIR_RLS);
else if (fifo_numchars(&sc->rxfifo) > 0 && (sc->ier & IER_ERXRDY) != 0)
return (IIR_RXTOUT);
else if (sc->thre_int_pending && (sc->ier & IER_ETXRDY) != 0)
return (IIR_TXRDY);
else if ((sc->msr & MSR_DELTA_MASK) != 0 && (sc->ier & IER_EMSC) != 0)
return (IIR_MLSC);
else
return (IIR_NOPEND);
}
static void
pci_uart_reset(struct pci_uart_softc *sc)
{
uint16_t divisor;
divisor = DEFAULT_RCLK / DEFAULT_BAUD / 16;
sc->dll = divisor;
sc->dlh = divisor >> 16;
fifo_reset(&sc->rxfifo, 1); /* no fifo until enabled by software */
}
/*
* Toggle the COM port's intr pin depending on whether or not we have an
* interrupt condition to report to the processor.
*/
static void
pci_uart_toggle_intr(struct pci_uart_softc *sc)
{
uint8_t intr_reason;
intr_reason = pci_uart_intr_reason(sc);
if (intr_reason == IIR_NOPEND)
pci_lintr_deassert(sc->pi);
else
pci_lintr_assert(sc->pi);
}
static void
pci_uart_drain(int fd, enum ev_type ev, void *arg)
{
struct pci_uart_softc *sc;
int ch;
sc = arg;
assert(fd == STDIN_FILENO);
assert(ev == EVF_READ);
/*
* This routine is called in the context of the mevent thread
* to take out the softc lock to protect against concurrent
* access from a vCPU i/o exit
*/
pthread_mutex_lock(&sc->mtx);
if ((sc->mcr & MCR_LOOPBACK) != 0) {
(void) ttyread();
} else {
while (fifo_available(&sc->rxfifo) &&
((ch = ttyread()) != -1)) {
fifo_putchar(&sc->rxfifo, ch);
}
pci_uart_toggle_intr(sc);
}
pthread_mutex_unlock(&sc->mtx);
}
static void
pci_uart_write(struct vmctx *ctx, int vcpu, struct pci_devinst *pi,
int baridx, uint64_t offset, int size, uint64_t value)
{
struct pci_uart_softc *sc;
int fifosz;
uint8_t msr;
sc = pi->pi_arg;
assert(baridx == 0);
assert(size == 1);
/* Open terminal */
if (!sc->opened && sc->stdio) {
pci_uart_opentty(sc);
sc->opened = 1;
}
pthread_mutex_lock(&sc->mtx);
/*
* Take care of the special case DLAB accesses first
*/
if ((sc->lcr & LCR_DLAB) != 0) {
if (offset == REG_DLL) {
sc->dll = value;
goto done;
}
if (offset == REG_DLH) {
sc->dlh = value;
goto done;
}
}
switch (offset) {
case REG_DATA:
if (sc->mcr & MCR_LOOPBACK) {
if (fifo_putchar(&sc->rxfifo, value) != 0)
sc->lsr |= LSR_OE;
} else if (sc->stdio) {
ttywrite(value);
} /* else drop on floor */
sc->thre_int_pending = true;
break;
case REG_IER:
/*
* Apply mask so that bits 4-7 are 0
* Also enables bits 0-3 only if they're 1
*/
sc->ier = value & 0x0F;
break;
case REG_FCR:
/*
* When moving from FIFO and 16450 mode and vice versa,
* the FIFO contents are reset.
*/
if ((sc->fcr & FCR_ENABLE) ^ (value & FCR_ENABLE)) {
fifosz = (value & FCR_ENABLE) ? FIFOSZ : 1;
fifo_reset(&sc->rxfifo, fifosz);
}
/*
* The FCR_ENABLE bit must be '1' for the programming
* of other FCR bits to be effective.
*/
if ((value & FCR_ENABLE) == 0) {
sc->fcr = 0;
} else {
if ((value & FCR_RCV_RST) != 0)
fifo_reset(&sc->rxfifo, FIFOSZ);
sc->fcr = value &
(FCR_ENABLE | FCR_DMA | FCR_RX_MASK);
}
break;
case REG_LCR:
sc->lcr = value;
break;
case REG_MCR:
/* Apply mask so that bits 5-7 are 0 */
sc->mcr = value & 0x1F;
msr = 0;
if (sc->mcr & MCR_LOOPBACK) {
/*
* In the loopback mode certain bits from the
* MCR are reflected back into MSR
*/
if (sc->mcr & MCR_RTS)
msr |= MSR_CTS;
if (sc->mcr & MCR_DTR)
msr |= MSR_DSR;
if (sc->mcr & MCR_OUT1)
msr |= MSR_RI;
if (sc->mcr & MCR_OUT2)
msr |= MSR_DCD;
}
/*
* Detect if there has been any change between the
* previous and the new value of MSR. If there is
* then assert the appropriate MSR delta bit.
*/
if ((msr & MSR_CTS) ^ (sc->msr & MSR_CTS))
sc->msr |= MSR_DCTS;
if ((msr & MSR_DSR) ^ (sc->msr & MSR_DSR))
sc->msr |= MSR_DDSR;
if ((msr & MSR_DCD) ^ (sc->msr & MSR_DCD))
sc->msr |= MSR_DDCD;
if ((sc->msr & MSR_RI) != 0 && (msr & MSR_RI) == 0)
sc->msr |= MSR_TERI;
/*
* Update the value of MSR while retaining the delta
* bits.
*/
sc->msr &= MSR_DELTA_MASK;
sc->msr |= msr;
break;
case REG_LSR:
/*
* Line status register is not meant to be written to
* during normal operation.
*/
break;
case REG_MSR:
/*
* As far as I can tell MSR is a read-only register.
*/
break;
case REG_SCR:
sc->scr = value;
break;
default:
break;
}
done:
pci_uart_toggle_intr(sc);
pthread_mutex_unlock(&sc->mtx);
}
uint64_t
pci_uart_read(struct vmctx *ctx, int vcpu, struct pci_devinst *pi,
int baridx, uint64_t offset, int size)
{
struct pci_uart_softc *sc;
uint8_t iir, intr_reason;
uint64_t reg;
sc = pi->pi_arg;
assert(baridx == 0);
assert(size == 1);
/* Open terminal */
if (!sc->opened && sc->stdio) {
pci_uart_opentty(sc);
sc->opened = 1;
}
pthread_mutex_lock(&sc->mtx);
/*
* Take care of the special case DLAB accesses first
*/
if ((sc->lcr & LCR_DLAB) != 0) {
if (offset == REG_DLL) {
reg = sc->dll;
goto done;
}
if (offset == REG_DLH) {
reg = sc->dlh;
goto done;
}
}
switch (offset) {
case REG_DATA:
reg = fifo_getchar(&sc->rxfifo);
break;
case REG_IER:
reg = sc->ier;
break;
case REG_IIR:
iir = (sc->fcr & FCR_ENABLE) ? IIR_FIFO_MASK : 0;
intr_reason = pci_uart_intr_reason(sc);
/*
* Deal with side effects of reading the IIR register
*/
if (intr_reason == IIR_TXRDY)
sc->thre_int_pending = false;
iir |= intr_reason;
reg = iir;
break;
case REG_LCR:
reg = sc->lcr;
break;
case REG_MCR:
reg = sc->mcr;
break;
case REG_LSR:
/* Transmitter is always ready for more data */
sc->lsr |= LSR_TEMT | LSR_THRE;
/* Check for new receive data */
if (fifo_numchars(&sc->rxfifo) > 0)
sc->lsr |= LSR_RXRDY;
else
sc->lsr &= ~LSR_RXRDY;
reg = sc->lsr;
/* The LSR_OE bit is cleared on LSR read */
sc->lsr &= ~LSR_OE;
break;
case REG_MSR:
/*
* MSR delta bits are cleared on read
*/
reg = sc->msr;
sc->msr &= ~MSR_DELTA_MASK;
break;
case REG_SCR:
reg = sc->scr;
break;
default:
reg = 0xFF;
break;
}
done:
pci_uart_toggle_intr(sc);
pthread_mutex_unlock(&sc->mtx);
return (reg);
}
static int
pci_uart_init(struct vmctx *ctx, struct pci_devinst *pi, char *opts)
{
struct pci_uart_softc *sc;
uint64_t bar;
int ivec;
sc = malloc(sizeof(struct pci_uart_softc));
memset(sc, 0, sizeof(struct pci_uart_softc));
pi->pi_arg = sc;
sc->pi = pi;
pthread_mutex_init(&sc->mtx, NULL);
/* initialize config space */
pci_set_cfgdata16(pi, PCIR_DEVICE, COM_DEV);
pci_set_cfgdata16(pi, PCIR_VENDOR, COM_VENDOR);
pci_set_cfgdata8(pi, PCIR_CLASS, PCIC_SIMPLECOMM);
if (pci_is_legacy(pi)) {
pci_uart_legacy_res(&bar, &ivec);
pci_emul_alloc_pbar(pi, 0, bar, PCIBAR_IO, 8);
} else {
ivec = -1;
pci_emul_alloc_bar(pi, 0, PCIBAR_IO, 8);
}
pci_lintr_request(pi, ivec);
if (opts != NULL && !strcmp("stdio", opts) && !pci_uart_stdio) {
pci_uart_stdio = 1;
sc->stdio = 1;
}
pci_uart_reset(sc);
return (0);
}
struct pci_devemu pci_de_com = {
.pe_emu = "uart",
.pe_init = pci_uart_init,
.pe_barwrite = pci_uart_write,
.pe_barread = pci_uart_read
};
PCI_EMUL_SET(pci_de_com);

View File

@ -0,0 +1,534 @@
/*-
* Copyright (c) 2011 NetApp, Inc.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* $FreeBSD$
*/
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
#include <sys/param.h>
#include <sys/linker_set.h>
#include <sys/stat.h>
#include <sys/uio.h>
#include <sys/ioctl.h>
#include <sys/disk.h>
#include <errno.h>
#include <fcntl.h>
#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>
#include <string.h>
#include <strings.h>
#include <unistd.h>
#include <assert.h>
#include <pthread.h>
#include "bhyverun.h"
#include "pci_emul.h"
#include "virtio.h"
#define VTBLK_RINGSZ 64
#define VTBLK_CFGSZ 28
#define VTBLK_R_CFG VTCFG_R_CFG0
#define VTBLK_R_CFG_END VTBLK_R_CFG + VTBLK_CFGSZ -1
#define VTBLK_R_MAX VTBLK_R_CFG_END
#define VTBLK_REGSZ VTBLK_R_MAX+1
#define VTBLK_MAXSEGS 32
#define VTBLK_S_OK 0
#define VTBLK_S_IOERR 1
/*
* Host capabilities
*/
#define VTBLK_S_HOSTCAPS \
( 0x00000004 | /* host maximum request segments */ \
0x10000000 ) /* supports indirect descriptors */
struct vring_hqueue {
/* Internal state */
uint16_t hq_size;
uint16_t hq_cur_aidx; /* trails behind 'avail_idx' */
/* Host-context pointers to the queue */
struct virtio_desc *hq_dtable;
uint16_t *hq_avail_flags;
uint16_t *hq_avail_idx; /* monotonically increasing */
uint16_t *hq_avail_ring;
uint16_t *hq_used_flags;
uint16_t *hq_used_idx; /* monotonically increasing */
struct virtio_used *hq_used_ring;
};
/*
* Config space
*/
struct vtblk_config {
uint64_t vbc_capacity;
uint32_t vbc_size_max;
uint32_t vbc_seg_max;
uint16_t vbc_geom_c;
uint8_t vbc_geom_h;
uint8_t vbc_geom_s;
uint32_t vbc_blk_size;
uint32_t vbc_sectors_max;
} __packed;
CTASSERT(sizeof(struct vtblk_config) == VTBLK_CFGSZ);
/*
* Fixed-size block header
*/
struct virtio_blk_hdr {
#define VBH_OP_READ 0
#define VBH_OP_WRITE 1
uint32_t vbh_type;
uint32_t vbh_ioprio;
uint64_t vbh_sector;
} __packed;
/*
* Debug printf
*/
static int pci_vtblk_debug;
#define DPRINTF(params) if (pci_vtblk_debug) printf params
#define WPRINTF(params) printf params
/*
* Per-device softc
*/
struct pci_vtblk_softc {
struct pci_devinst *vbsc_pi;
int vbsc_fd;
int vbsc_status;
int vbsc_isr;
int vbsc_lastq;
uint32_t vbsc_features;
uint64_t vbsc_pfn;
struct vring_hqueue vbsc_q;
struct vtblk_config vbsc_cfg;
};
/*
* Return the number of available descriptors in the vring taking care
* of the 16-bit index wraparound.
*/
static int
hq_num_avail(struct vring_hqueue *hq)
{
int ndesc;
if (*hq->hq_avail_idx >= hq->hq_cur_aidx)
ndesc = *hq->hq_avail_idx - hq->hq_cur_aidx;
else
ndesc = UINT16_MAX - hq->hq_cur_aidx + *hq->hq_avail_idx + 1;
assert(ndesc >= 0 && ndesc <= hq->hq_size);
return (ndesc);
}
static void
pci_vtblk_update_status(struct pci_vtblk_softc *sc, uint32_t value)
{
if (value == 0) {
DPRINTF(("vtblk: device reset requested !\n"));
}
sc->vbsc_status = value;
}
static void
pci_vtblk_proc(struct pci_vtblk_softc *sc, struct vring_hqueue *hq)
{
struct iovec iov[VTBLK_MAXSEGS];
struct virtio_blk_hdr *vbh;
struct virtio_desc *vd, *vid;
struct virtio_used *vu;
uint8_t *status;
int i;
int err;
int iolen;
int nsegs;
int uidx, aidx, didx;
int writeop;
off_t offset;
uidx = *hq->hq_used_idx;
aidx = hq->hq_cur_aidx;
didx = hq->hq_avail_ring[aidx % hq->hq_size];
assert(didx >= 0 && didx < hq->hq_size);
vd = &hq->hq_dtable[didx];
/*
* Verify that the descriptor is indirect, and obtain
* the pointer to the indirect descriptor.
* There has to be space for at least 3 descriptors
* in the indirect descriptor array: the block header,
* 1 or more data descriptors, and a status byte.
*/
assert(vd->vd_flags & VRING_DESC_F_INDIRECT);
nsegs = vd->vd_len / sizeof(struct virtio_desc);
assert(nsegs >= 3);
assert(nsegs < VTBLK_MAXSEGS + 2);
vid = paddr_guest2host(vd->vd_addr);
assert((vid->vd_flags & VRING_DESC_F_INDIRECT) == 0);
/*
* The first descriptor will be the read-only fixed header
*/
vbh = paddr_guest2host(vid[0].vd_addr);
assert(vid[0].vd_len == sizeof(struct virtio_blk_hdr));
assert(vid[0].vd_flags & VRING_DESC_F_NEXT);
assert((vid[0].vd_flags & VRING_DESC_F_WRITE) == 0);
writeop = (vbh->vbh_type == VBH_OP_WRITE);
offset = vbh->vbh_sector * DEV_BSIZE;
/*
* Build up the iovec based on the guest's data descriptors
*/
for (i = 1, iolen = 0; i < nsegs - 1; i++) {
iov[i-1].iov_base = paddr_guest2host(vid[i].vd_addr);
iov[i-1].iov_len = vid[i].vd_len;
iolen += vid[i].vd_len;
assert(vid[i].vd_flags & VRING_DESC_F_NEXT);
assert((vid[i].vd_flags & VRING_DESC_F_INDIRECT) == 0);
/*
* - write op implies read-only descriptor,
* - read op implies write-only descriptor,
* therefore test the inverse of the descriptor bit
* to the op.
*/
assert(((vid[i].vd_flags & VRING_DESC_F_WRITE) == 0) ==
writeop);
}
/* Lastly, get the address of the status byte */
status = paddr_guest2host(vid[nsegs - 1].vd_addr);
assert(vid[nsegs - 1].vd_len == 1);
assert((vid[nsegs - 1].vd_flags & VRING_DESC_F_NEXT) == 0);
assert(vid[nsegs - 1].vd_flags & VRING_DESC_F_WRITE);
DPRINTF(("virtio-block: %s op, %d bytes, %d segs, offset %ld\n\r",
writeop ? "write" : "read", iolen, nsegs - 2, offset));
if (writeop){
err = pwritev(sc->vbsc_fd, iov, nsegs - 2, offset);
} else {
err = preadv(sc->vbsc_fd, iov, nsegs - 2, offset);
}
*status = err < 0 ? VTBLK_S_IOERR : VTBLK_S_OK;
/*
* Return the single indirect descriptor back to the host
*/
vu = &hq->hq_used_ring[uidx % hq->hq_size];
vu->vu_idx = didx;
vu->vu_tlen = 1;
hq->hq_cur_aidx++;
*hq->hq_used_idx += 1;
}
static void
pci_vtblk_qnotify(struct pci_vtblk_softc *sc)
{
struct vring_hqueue *hq = &sc->vbsc_q;
int i;
int ndescs;
/*
* Calculate number of ring entries to process
*/
ndescs = hq_num_avail(hq);
if (ndescs == 0)
return;
/*
* Run through all the entries, placing them into iovecs and
* sending when an end-of-packet is found
*/
for (i = 0; i < ndescs; i++)
pci_vtblk_proc(sc, hq);
/*
* Generate an interrupt if able
*/
if ((*hq->hq_avail_flags & VRING_AVAIL_F_NO_INTERRUPT) == 0 &&
sc->vbsc_isr == 0) {
sc->vbsc_isr = 1;
pci_generate_msi(sc->vbsc_pi, 0);
}
}
static void
pci_vtblk_ring_init(struct pci_vtblk_softc *sc, uint64_t pfn)
{
struct vring_hqueue *hq;
sc->vbsc_pfn = pfn << VRING_PFN;
/*
* Set up host pointers to the various parts of the
* queue
*/
hq = &sc->vbsc_q;
hq->hq_size = VTBLK_RINGSZ;
hq->hq_dtable = paddr_guest2host(pfn << VRING_PFN);
hq->hq_avail_flags = (uint16_t *)(hq->hq_dtable + hq->hq_size);
hq->hq_avail_idx = hq->hq_avail_flags + 1;
hq->hq_avail_ring = hq->hq_avail_flags + 2;
hq->hq_used_flags = (uint16_t *)roundup2((uintptr_t)hq->hq_avail_ring,
VRING_ALIGN);
hq->hq_used_idx = hq->hq_used_flags + 1;
hq->hq_used_ring = (struct virtio_used *)(hq->hq_used_flags + 2);
/*
* Initialize queue indexes
*/
hq->hq_cur_aidx = 0;
}
static int
pci_vtblk_init(struct vmctx *ctx, struct pci_devinst *pi, char *opts)
{
struct stat sbuf;
struct pci_vtblk_softc *sc;
off_t size;
int fd;
int sectsz;
if (opts == NULL) {
printf("virtio-block: backing device required\n");
return (1);
}
/*
* Access to guest memory is required. Fail if
* memory not mapped
*/
if (paddr_guest2host(0) == NULL)
return (1);
/*
* The supplied backing file has to exist
*/
fd = open(opts, O_RDWR);
if (fd < 0) {
perror("Could not open backing file");
return (1);
}
if (fstat(fd, &sbuf) < 0) {
perror("Could not stat backing file");
close(fd);
return (1);
}
/*
* Deal with raw devices
*/
size = sbuf.st_size;
sectsz = DEV_BSIZE;
if (S_ISCHR(sbuf.st_mode)) {
if (ioctl(fd, DIOCGMEDIASIZE, &size) < 0 ||
ioctl(fd, DIOCGSECTORSIZE, &sectsz)) {
perror("Could not fetch dev blk/sector size");
close(fd);
return (1);
}
assert(size != 0);
assert(sectsz != 0);
}
sc = malloc(sizeof(struct pci_vtblk_softc));
memset(sc, 0, sizeof(struct pci_vtblk_softc));
pi->pi_arg = sc;
sc->vbsc_pi = pi;
sc->vbsc_fd = fd;
/* setup virtio block config space */
sc->vbsc_cfg.vbc_capacity = size / sectsz;
sc->vbsc_cfg.vbc_seg_max = VTBLK_MAXSEGS;
sc->vbsc_cfg.vbc_blk_size = sectsz;
sc->vbsc_cfg.vbc_size_max = 0; /* not negotiated */
sc->vbsc_cfg.vbc_geom_c = 0; /* no geometry */
sc->vbsc_cfg.vbc_geom_h = 0;
sc->vbsc_cfg.vbc_geom_s = 0;
sc->vbsc_cfg.vbc_sectors_max = 0;
/* initialize config space */
pci_set_cfgdata16(pi, PCIR_DEVICE, VIRTIO_DEV_BLOCK);
pci_set_cfgdata16(pi, PCIR_VENDOR, VIRTIO_VENDOR);
pci_set_cfgdata8(pi, PCIR_CLASS, PCIC_STORAGE);
pci_set_cfgdata16(pi, PCIR_SUBDEV_0, VIRTIO_TYPE_BLOCK);
pci_emul_add_msicap(pi, 1);
pci_emul_alloc_bar(pi, 0, PCIBAR_IO, VTBLK_REGSZ);
return (0);
}
static void
pci_vtblk_write(struct vmctx *ctx, int vcpu, struct pci_devinst *pi,
int baridx, uint64_t offset, int size, uint64_t value)
{
struct pci_vtblk_softc *sc = pi->pi_arg;
assert(baridx == 0);
if (offset + size > VTBLK_REGSZ) {
DPRINTF(("vtblk_write: 2big, offset %ld size %d\n",
offset, size));
return;
}
switch (offset) {
case VTCFG_R_GUESTCAP:
assert(size == 4);
sc->vbsc_features = value & VTBLK_S_HOSTCAPS;
break;
case VTCFG_R_PFN:
assert(size == 4);
pci_vtblk_ring_init(sc, value);
break;
case VTCFG_R_QSEL:
assert(size == 2);
sc->vbsc_lastq = value;
break;
case VTCFG_R_QNOTIFY:
assert(size == 2);
assert(value == 0);
pci_vtblk_qnotify(sc);
break;
case VTCFG_R_STATUS:
assert(size == 1);
pci_vtblk_update_status(sc, value);
break;
case VTCFG_R_HOSTCAP:
case VTCFG_R_QNUM:
case VTCFG_R_ISR:
case VTBLK_R_CFG ... VTBLK_R_CFG_END:
DPRINTF(("vtblk: write to readonly reg %ld\n\r", offset));
break;
default:
DPRINTF(("vtblk: unknown i/o write offset %ld\n\r", offset));
value = 0;
break;
}
}
uint64_t
pci_vtblk_read(struct vmctx *ctx, int vcpu, struct pci_devinst *pi,
int baridx, uint64_t offset, int size)
{
struct pci_vtblk_softc *sc = pi->pi_arg;
void *ptr;
uint32_t value;
assert(baridx == 0);
if (offset + size > VTBLK_REGSZ) {
DPRINTF(("vtblk_read: 2big, offset %ld size %d\n",
offset, size));
return (0);
}
switch (offset) {
case VTCFG_R_HOSTCAP:
assert(size == 4);
value = VTBLK_S_HOSTCAPS;
break;
case VTCFG_R_GUESTCAP:
assert(size == 4);
value = sc->vbsc_features; /* XXX never read ? */
break;
case VTCFG_R_PFN:
assert(size == 4);
value = sc->vbsc_pfn >> VRING_PFN;
break;
case VTCFG_R_QNUM:
value = (sc->vbsc_lastq == 0) ? VTBLK_RINGSZ: 0;
break;
case VTCFG_R_QSEL:
assert(size == 2);
value = sc->vbsc_lastq; /* XXX never read ? */
break;
case VTCFG_R_QNOTIFY:
assert(size == 2);
value = 0; /* XXX never read ? */
break;
case VTCFG_R_STATUS:
assert(size == 1);
value = sc->vbsc_status;
break;
case VTCFG_R_ISR:
assert(size == 1);
value = sc->vbsc_isr;
sc->vbsc_isr = 0; /* a read clears this flag */
break;
case VTBLK_R_CFG ... VTBLK_R_CFG_END:
assert(size + offset <= (VTBLK_R_CFG_END + 1));
ptr = (uint8_t *)&sc->vbsc_cfg + offset - VTBLK_R_CFG;
if (size == 1) {
value = *(uint8_t *) ptr;
} else if (size == 2) {
value = *(uint16_t *) ptr;
} else {
value = *(uint32_t *) ptr;
}
break;
default:
DPRINTF(("vtblk: unknown i/o read offset %ld\n\r", offset));
value = 0;
break;
}
return (value);
}
struct pci_devemu pci_de_vblk = {
.pe_emu = "virtio-blk",
.pe_init = pci_vtblk_init,
.pe_barwrite = pci_vtblk_write,
.pe_barread = pci_vtblk_read
};
PCI_EMUL_SET(pci_de_vblk);

View File

@ -0,0 +1,781 @@
/*-
* Copyright (c) 2011 NetApp, Inc.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* $FreeBSD$
*/
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
#include <sys/param.h>
#include <sys/linker_set.h>
#include <sys/select.h>
#include <sys/uio.h>
#include <sys/ioctl.h>
#include <errno.h>
#include <fcntl.h>
#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>
#include <string.h>
#include <strings.h>
#include <unistd.h>
#include <assert.h>
#include <md5.h>
#include <pthread.h>
#include "bhyverun.h"
#include "pci_emul.h"
#include "mevent.h"
#include "virtio.h"
#define VTNET_RINGSZ 256
#define VTNET_MAXSEGS 32
/*
* PCI config-space register offsets
*/
#define VTNET_R_CFG0 20
#define VTNET_R_CFG1 21
#define VTNET_R_CFG2 22
#define VTNET_R_CFG3 23
#define VTNET_R_CFG4 24
#define VTNET_R_CFG5 25
#define VTNET_R_CFG6 26
#define VTNET_R_CFG7 27
#define VTNET_R_MAX 27
#define VTNET_REGSZ VTNET_R_MAX+1
/*
* Host capabilities
*/
#define VTNET_S_HOSTCAPS \
( 0x00000020 | /* host supplies MAC */ \
0x00008000 | /* host can merge Rx buffers */ \
0x00010000 ) /* config status available */
/*
* Queue definitions.
*/
#define VTNET_RXQ 0
#define VTNET_TXQ 1
#define VTNET_CTLQ 2
#define VTNET_MAXQ 3
struct vring_hqueue {
/* Internal state */
uint16_t hq_size;
uint16_t hq_cur_aidx; /* trails behind 'avail_idx' */
/* Host-context pointers to the queue */
struct virtio_desc *hq_dtable;
uint16_t *hq_avail_flags;
uint16_t *hq_avail_idx; /* monotonically increasing */
uint16_t *hq_avail_ring;
uint16_t *hq_used_flags;
uint16_t *hq_used_idx; /* monotonically increasing */
struct virtio_used *hq_used_ring;
};
/*
* Fixed network header size
*/
struct virtio_net_rxhdr {
uint8_t vrh_flags;
uint8_t vrh_gso_type;
uint16_t vrh_hdr_len;
uint16_t vrh_gso_size;
uint16_t vrh_csum_start;
uint16_t vrh_csum_offset;
uint16_t vrh_bufs;
} __packed;
/*
* Debug printf
*/
static int pci_vtnet_debug;
#define DPRINTF(params) if (pci_vtnet_debug) printf params
#define WPRINTF(params) printf params
/*
* Per-device softc
*/
struct pci_vtnet_softc {
struct pci_devinst *vsc_pi;
pthread_mutex_t vsc_mtx;
struct mevent *vsc_mevp;
int vsc_curq;
int vsc_status;
int vsc_isr;
int vsc_tapfd;
int vsc_rx_ready;
int vsc_rxpend;
uint32_t vsc_features;
uint8_t vsc_macaddr[6];
uint64_t vsc_pfn[VTNET_MAXQ];
struct vring_hqueue vsc_hq[VTNET_MAXQ];
};
/*
* Return the number of available descriptors in the vring taking care
* of the 16-bit index wraparound.
*/
static int
hq_num_avail(struct vring_hqueue *hq)
{
int ndesc;
if (*hq->hq_avail_idx >= hq->hq_cur_aidx)
ndesc = *hq->hq_avail_idx - hq->hq_cur_aidx;
else
ndesc = UINT16_MAX - hq->hq_cur_aidx + *hq->hq_avail_idx + 1;
assert(ndesc >= 0 && ndesc <= hq->hq_size);
return (ndesc);
}
static uint16_t
pci_vtnet_qsize(int qnum)
{
/* XXX no ctl queue currently */
if (qnum == VTNET_CTLQ) {
return (0);
}
/* XXX fixed currently. Maybe different for tx/rx/ctl */
return (VTNET_RINGSZ);
}
static void
pci_vtnet_ring_reset(struct pci_vtnet_softc *sc, int ring)
{
struct vring_hqueue *hq;
assert(ring < VTNET_MAXQ);
hq = &sc->vsc_hq[ring];
/*
* Reset all soft state
*/
hq->hq_cur_aidx = 0;
}
static void
pci_vtnet_update_status(struct pci_vtnet_softc *sc, uint32_t value)
{
if (value == 0) {
DPRINTF(("vtnet: device reset requested !\n"));
pci_vtnet_ring_reset(sc, VTNET_RXQ);
pci_vtnet_ring_reset(sc, VTNET_TXQ);
sc->vsc_rx_ready = 0;
}
sc->vsc_status = value;
}
/*
* Called to send a buffer chain out to the tap device
*/
static void
pci_vtnet_tap_tx(struct pci_vtnet_softc *sc, struct iovec *iov, int iovcnt,
int len)
{
char pad[60];
if (sc->vsc_tapfd == -1)
return;
/*
* If the length is < 60, pad out to that and add the
* extra zero'd segment to the iov. It is guaranteed that
* there is always an extra iov available by the caller.
*/
if (len < 60) {
memset(pad, 0, 60 - len);
iov[iovcnt].iov_base = pad;
iov[iovcnt].iov_len = 60 - len;
iovcnt++;
}
(void) writev(sc->vsc_tapfd, iov, iovcnt);
}
/*
* Called when there is read activity on the tap file descriptor.
* Each buffer posted by the guest is assumed to be able to contain
* an entire ethernet frame + rx header.
* MP note: the dummybuf is only used for discarding frames, so there
* is no need for it to be per-vtnet or locked.
*/
static uint8_t dummybuf[2048];
static void
pci_vtnet_tap_rx(struct pci_vtnet_softc *sc)
{
struct virtio_desc *vd;
struct virtio_used *vu;
struct vring_hqueue *hq;
struct virtio_net_rxhdr *vrx;
uint8_t *buf;
int i;
int len;
int ndescs;
int didx, uidx, aidx; /* descriptor, avail and used index */
/*
* Should never be called without a valid tap fd
*/
assert(sc->vsc_tapfd != -1);
/*
* But, will be called when the rx ring hasn't yet
* been set up.
*/
if (sc->vsc_rx_ready == 0) {
/*
* Drop the packet and try later.
*/
(void) read(sc->vsc_tapfd, dummybuf, sizeof(dummybuf));
return;
}
/*
* Calculate the number of available rx buffers
*/
hq = &sc->vsc_hq[VTNET_RXQ];
ndescs = hq_num_avail(hq);
if (ndescs == 0) {
/*
* Need to wait for host notification to read
*/
if (sc->vsc_rxpend == 0) {
WPRINTF(("vtnet: no rx descriptors !\n"));
sc->vsc_rxpend = 1;
}
/*
* Drop the packet and try later
*/
(void) read(sc->vsc_tapfd, dummybuf, sizeof(dummybuf));
return;
}
aidx = hq->hq_cur_aidx;
uidx = *hq->hq_used_idx;
for (i = 0; i < ndescs; i++) {
/*
* 'aidx' indexes into the an array of descriptor indexes
*/
didx = hq->hq_avail_ring[aidx % hq->hq_size];
assert(didx >= 0 && didx < hq->hq_size);
vd = &hq->hq_dtable[didx];
/*
* Get a pointer to the rx header, and use the
* data immediately following it for the packet buffer.
*/
vrx = (struct virtio_net_rxhdr *)paddr_guest2host(vd->vd_addr);
buf = (uint8_t *)(vrx + 1);
len = read(sc->vsc_tapfd, buf,
vd->vd_len - sizeof(struct virtio_net_rxhdr));
if (len < 0 && errno == EWOULDBLOCK) {
break;
}
/*
* The only valid field in the rx packet header is the
* number of buffers, which is always 1 without TSO
* support.
*/
memset(vrx, 0, sizeof(struct virtio_net_rxhdr));
vrx->vrh_bufs = 1;
/*
* Write this descriptor into the used ring
*/
vu = &hq->hq_used_ring[uidx % hq->hq_size];
vu->vu_idx = didx;
vu->vu_tlen = len + sizeof(struct virtio_net_rxhdr);
uidx++;
aidx++;
}
/*
* Update the used pointer, and signal an interrupt if allowed
*/
*hq->hq_used_idx = uidx;
hq->hq_cur_aidx = aidx;
if ((*hq->hq_avail_flags & VRING_AVAIL_F_NO_INTERRUPT) == 0) {
sc->vsc_isr |= 1;
pci_generate_msi(sc->vsc_pi, 0);
}
}
static void
pci_vtnet_tap_callback(int fd, enum ev_type type, void *param)
{
struct pci_vtnet_softc *sc = param;
pthread_mutex_lock(&sc->vsc_mtx);
pci_vtnet_tap_rx(sc);
pthread_mutex_unlock(&sc->vsc_mtx);
}
static void
pci_vtnet_ping_rxq(struct pci_vtnet_softc *sc)
{
/*
* A qnotify means that the rx process can now begin
*/
if (sc->vsc_rx_ready == 0) {
sc->vsc_rx_ready = 1;
}
/*
* If the rx queue was empty, attempt to receive a
* packet that was previously blocked due to no rx bufs
* available
*/
if (sc->vsc_rxpend) {
WPRINTF(("vtnet: rx resumed\n\r"));
sc->vsc_rxpend = 0;
pci_vtnet_tap_rx(sc);
}
}
static void
pci_vtnet_proctx(struct pci_vtnet_softc *sc, struct vring_hqueue *hq)
{
struct iovec iov[VTNET_MAXSEGS + 1];
struct virtio_desc *vd;
struct virtio_used *vu;
int i;
int plen;
int tlen;
int uidx, aidx, didx;
uidx = *hq->hq_used_idx;
aidx = hq->hq_cur_aidx;
didx = hq->hq_avail_ring[aidx % hq->hq_size];
assert(didx >= 0 && didx < hq->hq_size);
vd = &hq->hq_dtable[didx];
/*
* Run through the chain of descriptors, ignoring the
* first header descriptor. However, include the header
* length in the total length that will be put into the
* used queue.
*/
tlen = vd->vd_len;
vd = &hq->hq_dtable[vd->vd_next];
for (i = 0, plen = 0;
i < VTNET_MAXSEGS;
i++, vd = &hq->hq_dtable[vd->vd_next]) {
iov[i].iov_base = paddr_guest2host(vd->vd_addr);
iov[i].iov_len = vd->vd_len;
plen += vd->vd_len;
tlen += vd->vd_len;
if ((vd->vd_flags & VRING_DESC_F_NEXT) == 0)
break;
}
assert(i < VTNET_MAXSEGS);
DPRINTF(("virtio: packet send, %d bytes, %d segs\n\r", plen, i + 1));
pci_vtnet_tap_tx(sc, iov, i + 1, plen);
/*
* Return this chain back to the host
*/
vu = &hq->hq_used_ring[uidx % hq->hq_size];
vu->vu_idx = didx;
vu->vu_tlen = tlen;
hq->hq_cur_aidx = aidx + 1;
*hq->hq_used_idx = uidx + 1;
/*
* Generate an interrupt if able
*/
if ((*hq->hq_avail_flags & VRING_AVAIL_F_NO_INTERRUPT) == 0) {
sc->vsc_isr |= 1;
pci_generate_msi(sc->vsc_pi, 0);
}
}
static void
pci_vtnet_ping_txq(struct pci_vtnet_softc *sc)
{
struct vring_hqueue *hq = &sc->vsc_hq[VTNET_TXQ];
int i;
int ndescs;
/*
* Calculate number of ring entries to process
*/
ndescs = hq_num_avail(hq);
if (ndescs == 0)
return;
/*
* Run through all the entries, placing them into iovecs and
* sending when an end-of-packet is found
*/
for (i = 0; i < ndescs; i++)
pci_vtnet_proctx(sc, hq);
}
static void
pci_vtnet_ping_ctlq(struct pci_vtnet_softc *sc)
{
DPRINTF(("vtnet: control qnotify!\n\r"));
}
static void
pci_vtnet_ring_init(struct pci_vtnet_softc *sc, uint64_t pfn)
{
struct vring_hqueue *hq;
int qnum = sc->vsc_curq;
assert(qnum < VTNET_MAXQ);
sc->vsc_pfn[qnum] = pfn << VRING_PFN;
/*
* Set up host pointers to the various parts of the
* queue
*/
hq = &sc->vsc_hq[qnum];
hq->hq_size = pci_vtnet_qsize(qnum);
hq->hq_dtable = paddr_guest2host(pfn << VRING_PFN);
hq->hq_avail_flags = (uint16_t *)(hq->hq_dtable + hq->hq_size);
hq->hq_avail_idx = hq->hq_avail_flags + 1;
hq->hq_avail_ring = hq->hq_avail_flags + 2;
hq->hq_used_flags = (uint16_t *)roundup2((uintptr_t)hq->hq_avail_ring,
VRING_ALIGN);
hq->hq_used_idx = hq->hq_used_flags + 1;
hq->hq_used_ring = (struct virtio_used *)(hq->hq_used_flags + 2);
/*
* Initialize queue indexes
*/
hq->hq_cur_aidx = 0;
}
static int
pci_vtnet_init(struct vmctx *ctx, struct pci_devinst *pi, char *opts)
{
MD5_CTX mdctx;
unsigned char digest[16];
char nstr[80];
struct pci_vtnet_softc *sc;
/*
* Access to guest memory is required. Fail if
* memory not mapped
*/
if (paddr_guest2host(0) == NULL)
return (1);
sc = malloc(sizeof(struct pci_vtnet_softc));
memset(sc, 0, sizeof(struct pci_vtnet_softc));
pi->pi_arg = sc;
sc->vsc_pi = pi;
pthread_mutex_init(&sc->vsc_mtx, NULL);
/*
* Attempt to open the tap device
*/
sc->vsc_tapfd = -1;
if (opts != NULL) {
char tbuf[80];
strcpy(tbuf, "/dev/");
strlcat(tbuf, opts, sizeof(tbuf));
sc->vsc_tapfd = open(tbuf, O_RDWR);
if (sc->vsc_tapfd == -1) {
WPRINTF(("open of tap device %s failed\n", tbuf));
} else {
/*
* Set non-blocking and register for read
* notifications with the event loop
*/
int opt = 1;
if (ioctl(sc->vsc_tapfd, FIONBIO, &opt) < 0) {
WPRINTF(("tap device O_NONBLOCK failed\n"));
close(sc->vsc_tapfd);
sc->vsc_tapfd = -1;
}
sc->vsc_mevp = mevent_add(sc->vsc_tapfd,
EVF_READ,
pci_vtnet_tap_callback,
sc);
if (sc->vsc_mevp == NULL) {
WPRINTF(("Could not register event\n"));
close(sc->vsc_tapfd);
sc->vsc_tapfd = -1;
}
}
}
/*
* The MAC address is the standard NetApp OUI of 00-a0-98,
* followed by an MD5 of the vm name. The slot/func number is
* prepended to this for slots other than 1:0, so that
* a bootloader can netboot from the equivalent of slot 1.
*/
if (pi->pi_slot == 1 && pi->pi_func == 0) {
strncpy(nstr, vmname, sizeof(nstr));
} else {
snprintf(nstr, sizeof(nstr), "%d-%d-%s", pi->pi_slot,
pi->pi_func, vmname);
}
MD5Init(&mdctx);
MD5Update(&mdctx, nstr, strlen(nstr));
MD5Final(digest, &mdctx);
sc->vsc_macaddr[0] = 0x00;
sc->vsc_macaddr[1] = 0xa0;
sc->vsc_macaddr[2] = 0x98;
sc->vsc_macaddr[3] = digest[0];
sc->vsc_macaddr[4] = digest[1];
sc->vsc_macaddr[5] = digest[2];
/* initialize config space */
pci_set_cfgdata16(pi, PCIR_DEVICE, VIRTIO_DEV_NET);
pci_set_cfgdata16(pi, PCIR_VENDOR, VIRTIO_VENDOR);
pci_set_cfgdata8(pi, PCIR_CLASS, PCIC_NETWORK);
pci_set_cfgdata16(pi, PCIR_SUBDEV_0, VIRTIO_TYPE_NET);
pci_emul_add_msicap(pi, 1);
pci_emul_alloc_bar(pi, 0, PCIBAR_IO, VTNET_REGSZ);
return (0);
}
/*
* Function pointer array to handle queue notifications
*/
static void (*pci_vtnet_qnotify[VTNET_MAXQ])(struct pci_vtnet_softc *) = {
pci_vtnet_ping_rxq,
pci_vtnet_ping_txq,
pci_vtnet_ping_ctlq
};
static void
pci_vtnet_write(struct vmctx *ctx, int vcpu, struct pci_devinst *pi,
int baridx, uint64_t offset, int size, uint64_t value)
{
struct pci_vtnet_softc *sc = pi->pi_arg;
void *ptr;
assert(baridx == 0);
if (offset + size > VTNET_REGSZ) {
DPRINTF(("vtnet_write: 2big, offset %ld size %d\n",
offset, size));
return;
}
pthread_mutex_lock(&sc->vsc_mtx);
switch (offset) {
case VTCFG_R_GUESTCAP:
assert(size == 4);
sc->vsc_features = value & VTNET_S_HOSTCAPS;
break;
case VTCFG_R_PFN:
assert(size == 4);
pci_vtnet_ring_init(sc, value);
break;
case VTCFG_R_QSEL:
assert(size == 2);
assert(value < VTNET_MAXQ);
sc->vsc_curq = value;
break;
case VTCFG_R_QNOTIFY:
assert(size == 2);
assert(value < VTNET_MAXQ);
(*pci_vtnet_qnotify[value])(sc);
break;
case VTCFG_R_STATUS:
assert(size == 1);
pci_vtnet_update_status(sc, value);
break;
case VTNET_R_CFG0:
case VTNET_R_CFG1:
case VTNET_R_CFG2:
case VTNET_R_CFG3:
case VTNET_R_CFG4:
case VTNET_R_CFG5:
assert((size + offset) <= (VTNET_R_CFG5 + 1));
ptr = &sc->vsc_macaddr[offset - VTNET_R_CFG0];
/*
* The driver is allowed to change the MAC address
*/
sc->vsc_macaddr[offset - VTNET_R_CFG0] = value;
if (size == 1) {
*(uint8_t *) ptr = value;
} else if (size == 2) {
*(uint16_t *) ptr = value;
} else {
*(uint32_t *) ptr = value;
}
break;
case VTCFG_R_HOSTCAP:
case VTCFG_R_QNUM:
case VTCFG_R_ISR:
case VTNET_R_CFG6:
case VTNET_R_CFG7:
DPRINTF(("vtnet: write to readonly reg %ld\n\r", offset));
break;
default:
DPRINTF(("vtnet: unknown i/o write offset %ld\n\r", offset));
value = 0;
break;
}
pthread_mutex_unlock(&sc->vsc_mtx);
}
uint64_t
pci_vtnet_read(struct vmctx *ctx, int vcpu, struct pci_devinst *pi,
int baridx, uint64_t offset, int size)
{
struct pci_vtnet_softc *sc = pi->pi_arg;
void *ptr;
uint64_t value;
assert(baridx == 0);
if (offset + size > VTNET_REGSZ) {
DPRINTF(("vtnet_read: 2big, offset %ld size %d\n",
offset, size));
return (0);
}
pthread_mutex_lock(&sc->vsc_mtx);
switch (offset) {
case VTCFG_R_HOSTCAP:
assert(size == 4);
value = VTNET_S_HOSTCAPS;
break;
case VTCFG_R_GUESTCAP:
assert(size == 4);
value = sc->vsc_features; /* XXX never read ? */
break;
case VTCFG_R_PFN:
assert(size == 4);
value = sc->vsc_pfn[sc->vsc_curq] >> VRING_PFN;
break;
case VTCFG_R_QNUM:
assert(size == 2);
value = pci_vtnet_qsize(sc->vsc_curq);
break;
case VTCFG_R_QSEL:
assert(size == 2);
value = sc->vsc_curq; /* XXX never read ? */
break;
case VTCFG_R_QNOTIFY:
assert(size == 2);
value = sc->vsc_curq; /* XXX never read ? */
break;
case VTCFG_R_STATUS:
assert(size == 1);
value = sc->vsc_status;
break;
case VTCFG_R_ISR:
assert(size == 1);
value = sc->vsc_isr;
sc->vsc_isr = 0; /* a read clears this flag */
break;
case VTNET_R_CFG0:
case VTNET_R_CFG1:
case VTNET_R_CFG2:
case VTNET_R_CFG3:
case VTNET_R_CFG4:
case VTNET_R_CFG5:
assert((size + offset) <= (VTNET_R_CFG5 + 1));
ptr = &sc->vsc_macaddr[offset - VTNET_R_CFG0];
if (size == 1) {
value = *(uint8_t *) ptr;
} else if (size == 2) {
value = *(uint16_t *) ptr;
} else {
value = *(uint32_t *) ptr;
}
break;
case VTNET_R_CFG6:
assert(size != 4);
value = 0x01; /* XXX link always up */
break;
case VTNET_R_CFG7:
assert(size == 1);
value = 0; /* XXX link status in LSB */
break;
default:
DPRINTF(("vtnet: unknown i/o read offset %ld\n\r", offset));
value = 0;
break;
}
pthread_mutex_unlock(&sc->vsc_mtx);
return (value);
}
struct pci_devemu pci_de_vnet = {
.pe_emu = "virtio-net",
.pe_init = pci_vtnet_init,
.pe_barwrite = pci_vtnet_write,
.pe_barread = pci_vtnet_read
};
PCI_EMUL_SET(pci_de_vnet);

198
usr.sbin/bhyve/pit_8254.c Normal file
View File

@ -0,0 +1,198 @@
/*-
* Copyright (c) 2011 NetApp, Inc.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* $FreeBSD$
*/
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
#include <sys/time.h>
#include <machine/clock.h>
#include <stdio.h>
#include <assert.h>
#include "bhyverun.h"
#include "inout.h"
#include "pit_8254.h"
#define TIMER_SEL_MASK 0xc0
#define TIMER_RW_MASK 0x30
#define TIMER_MODE_MASK 0x0f
#define TIMER_SEL_READBACK 0xc0
#define TIMER_DIV(freq, hz) (((freq) + (hz) / 2) / (hz))
#define PIT_8254_FREQ 1193182
static const int nsecs_per_tick = 1000000000 / PIT_8254_FREQ;
struct counter {
struct timeval tv; /* uptime when counter was loaded */
uint16_t initial; /* initial counter value */
uint8_t cr[2];
uint8_t ol[2];
int crbyte;
int olbyte;
};
static void
timevalfix(struct timeval *t1)
{
if (t1->tv_usec < 0) {
t1->tv_sec--;
t1->tv_usec += 1000000;
}
if (t1->tv_usec >= 1000000) {
t1->tv_sec++;
t1->tv_usec -= 1000000;
}
}
static void
timevalsub(struct timeval *t1, const struct timeval *t2)
{
t1->tv_sec -= t2->tv_sec;
t1->tv_usec -= t2->tv_usec;
timevalfix(t1);
}
static void
latch(struct counter *c)
{
struct timeval tv2;
uint16_t lval;
uint64_t delta_nsecs, delta_ticks;
/* cannot latch a new value until the old one has been consumed */
if (c->olbyte != 0)
return;
if (c->initial == 0 || c->initial == 1) {
/*
* XXX the program that runs the VM can be stopped and
* restarted at any time. This means that state that was
* created by the guest is destroyed between invocations
* of the program.
*
* If the counter's initial value is not programmed we
* assume a value that would be set to generate 'guest_hz'
* interrupts per second.
*/
c->initial = TIMER_DIV(PIT_8254_FREQ, guest_hz);
gettimeofday(&c->tv, NULL);
}
(void)gettimeofday(&tv2, NULL);
timevalsub(&tv2, &c->tv);
delta_nsecs = tv2.tv_sec * 1000000000 + tv2.tv_usec * 1000;
delta_ticks = delta_nsecs / nsecs_per_tick;
lval = c->initial - delta_ticks % c->initial;
c->olbyte = 2;
c->ol[1] = lval; /* LSB */
c->ol[0] = lval >> 8; /* MSB */
}
static int
pit_8254_handler(struct vmctx *ctx, int vcpu, int in, int port, int bytes,
uint32_t *eax, void *arg)
{
int sel, rw, mode;
uint8_t val;
struct counter *c;
static struct counter counter[3];
if (bytes != 1)
return (-1);
val = *eax;
if (port == TIMER_MODE) {
assert(in == 0);
sel = val & TIMER_SEL_MASK;
rw = val & TIMER_RW_MASK;
mode = val & TIMER_MODE_MASK;
if (sel == TIMER_SEL_READBACK)
return (-1);
if (rw != TIMER_LATCH && rw != TIMER_16BIT)
return (-1);
if (rw != TIMER_LATCH) {
/*
* Counter mode is not affected when issuing a
* latch command.
*/
if (mode != TIMER_RATEGEN && mode != TIMER_SQWAVE)
return (-1);
}
c = &counter[sel >> 6];
if (rw == TIMER_LATCH)
latch(c);
else
c->olbyte = 0; /* reset latch after reprogramming */
return (0);
}
/* counter ports */
assert(port >= TIMER_CNTR0 && port <= TIMER_CNTR2);
c = &counter[port - TIMER_CNTR0];
if (in) {
/*
* XXX
* The spec says that once the output latch is completely
* read it should revert to "following" the counter. We don't
* do this because it is hard and any reasonable OS should
* always latch the counter before trying to read it.
*/
if (c->olbyte == 0)
c->olbyte = 2;
*eax = c->ol[--c->olbyte];
} else {
c->cr[c->crbyte++] = *eax;
if (c->crbyte == 2) {
c->crbyte = 0;
c->initial = c->cr[0] | (uint16_t)c->cr[1] << 8;
if (c->initial == 0)
c->initial = 0xffff;
gettimeofday(&c->tv, NULL);
}
}
return (0);
}
INOUT_PORT(8254, TIMER_MODE, IOPORT_F_OUT, pit_8254_handler);
INOUT_PORT(8254, TIMER_CNTR0, IOPORT_F_INOUT, pit_8254_handler);
INOUT_PORT(8254, TIMER_CNTR1, IOPORT_F_INOUT, pit_8254_handler);
INOUT_PORT(8254, TIMER_CNTR2, IOPORT_F_INOUT, pit_8254_handler);

45
usr.sbin/bhyve/pit_8254.h Normal file
View File

@ -0,0 +1,45 @@
/*-
* Copyright (c) 2011 NetApp, Inc.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* $FreeBSD$
*/
#ifndef _PIT_8254_H_
#define _PIT_8254_H_
/*
* Borrowed from amd64/include/timerreg.h because in that file it is
* conditionally compiled for #ifdef _KERNEL only.
*/
#include <dev/ic/i8253reg.h>
#define IO_TIMER1 0x40 /* 8253 Timer #1 */
#define TIMER_CNTR0 (IO_TIMER1 + TIMER_REG_CNTR0)
#define TIMER_CNTR1 (IO_TIMER1 + TIMER_REG_CNTR1)
#define TIMER_CNTR2 (IO_TIMER1 + TIMER_REG_CNTR2)
#define TIMER_MODE (IO_TIMER1 + TIMER_REG_MODE)
#endif /* _PIT_8254_H_ */

108
usr.sbin/bhyve/pmtmr.c Normal file
View File

@ -0,0 +1,108 @@
/*-
* Copyright (c) 2012 NetApp, Inc.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* $FreeBSD$
*/
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
#include <sys/types.h>
#include <sys/sysctl.h>
#include <sys/time.h>
#include <machine/cpufunc.h>
#include <stdio.h>
#include <time.h>
#include <assert.h>
#include <pthread.h>
#include "inout.h"
/*
* The ACPI Power Management timer is a free-running 24- or 32-bit
* timer with a frequency of 3.579545MHz
*
* This implementation will be 32-bits
*/
#define IO_PMTMR 0x408 /* 4-byte i/o port for the timer */
#define PMTMR_FREQ 3579545 /* 3.579545MHz */
static pthread_mutex_t pmtmr_mtx;
static uint64_t pmtmr_tscf;
static uint64_t pmtmr_old;
static uint64_t pmtmr_tsc_old;
static uint32_t
pmtmr_val(void)
{
uint64_t pmtmr_tsc_new;
uint64_t pmtmr_new;
static int inited = 0;
if (!inited) {
size_t len;
uint32_t tmpf;
inited = 1;
pthread_mutex_init(&pmtmr_mtx, NULL);
len = sizeof(tmpf);
sysctlbyname("machdep.tsc_freq", &tmpf, &len,
NULL, 0);
pmtmr_tscf = tmpf;
pmtmr_tsc_old = rdtsc();
pmtmr_old = pmtmr_tsc_old / pmtmr_tscf * PMTMR_FREQ;
return (pmtmr_old);
}
pthread_mutex_lock(&pmtmr_mtx);
pmtmr_tsc_new = rdtsc();
pmtmr_new = (pmtmr_tsc_new - pmtmr_tsc_old) * PMTMR_FREQ / pmtmr_tscf +
pmtmr_old;
pmtmr_old = pmtmr_new;
pmtmr_tsc_old = pmtmr_tsc_new;
pthread_mutex_unlock(&pmtmr_mtx);
return (pmtmr_new);
}
static int
pmtmr_handler(struct vmctx *ctx, int vcpu, int in, int port, int bytes,
uint32_t *eax, void *arg)
{
assert(in == 1);
if (bytes != 4)
return (-1);
*eax = pmtmr_val();
return (0);
}
INOUT_PORT(pmtmr, IO_PMTMR, IOPORT_F_IN, pmtmr_handler);

51
usr.sbin/bhyve/post.c Normal file
View File

@ -0,0 +1,51 @@
/*-
* Copyright (c) 2011 NetApp, Inc.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* $FreeBSD$
*/
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
#include <sys/types.h>
#include <assert.h>
#include "inout.h"
static int
post_data_handler(struct vmctx *ctx, int vcpu, int in, int port, int bytes,
uint32_t *eax, void *arg)
{
assert(in == 1);
if (bytes != 1)
return (-1);
*eax = 0xff; /* return some garbage */
return (0);
}
INOUT_PORT(post, 0x84, IOPORT_F_IN, post_data_handler);

274
usr.sbin/bhyve/rtc.c Normal file
View File

@ -0,0 +1,274 @@
/*-
* Copyright (c) 2011 NetApp, Inc.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* $FreeBSD$
*/
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
#include <sys/types.h>
#include <sys/time.h>
#include <stdio.h>
#include <time.h>
#include <assert.h>
#include "inout.h"
#define IO_RTC 0x70
#define RTC_SEC 0x00 /* seconds */
#define RTC_MIN 0x02
#define RTC_HRS 0x04
#define RTC_WDAY 0x06
#define RTC_DAY 0x07
#define RTC_MONTH 0x08
#define RTC_YEAR 0x09
#define RTC_CENTURY 0x32 /* current century */
#define RTC_STATUSA 0xA
#define RTCSA_TUP 0x80 /* time update, don't look now */
#define RTC_STATUSB 0xB
#define RTCSB_DST 0x01
#define RTCSB_24HR 0x02
#define RTCSB_BIN 0x04 /* 0 = BCD, 1 = Binary */
#define RTCSB_PINTR 0x40 /* 1 = enable periodic clock interrupt */
#define RTCSB_HALT 0x80 /* stop clock updates */
#define RTC_INTR 0x0c /* status register C (R) interrupt source */
#define RTC_STATUSD 0x0d /* status register D (R) Lost Power */
#define RTCSD_PWR 0x80 /* clock power OK */
#define RTC_DIAG 0x0e
#define RTC_RSTCODE 0x0f
#define RTC_EQUIPMENT 0x14
static int addr;
/* XXX initialize these to default values as they would be from BIOS */
static uint8_t status_a, status_b, rstcode;
static u_char const bin2bcd_data[] = {
0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09,
0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19,
0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, 0x28, 0x29,
0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, 0x38, 0x39,
0x40, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, 0x48, 0x49,
0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, 0x58, 0x59,
0x60, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69,
0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, 0x78, 0x79,
0x80, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87, 0x88, 0x89,
0x90, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97, 0x98, 0x99
};
#define bin2bcd(bin) (bin2bcd_data[bin])
#define rtcout(val) ((status_b & RTCSB_BIN) ? (val) : bin2bcd((val)))
static void
timevalfix(struct timeval *t1)
{
if (t1->tv_usec < 0) {
t1->tv_sec--;
t1->tv_usec += 1000000;
}
if (t1->tv_usec >= 1000000) {
t1->tv_sec++;
t1->tv_usec -= 1000000;
}
}
static void
timevalsub(struct timeval *t1, const struct timeval *t2)
{
t1->tv_sec -= t2->tv_sec;
t1->tv_usec -= t2->tv_usec;
timevalfix(t1);
}
static int
rtc_addr_handler(struct vmctx *ctx, int vcpu, int in, int port, int bytes,
uint32_t *eax, void *arg)
{
assert(in == 0);
if (bytes != 1)
return (-1);
switch (*eax) {
case RTC_SEC:
case RTC_MIN:
case RTC_HRS:
case RTC_WDAY:
case RTC_DAY:
case RTC_MONTH:
case RTC_YEAR:
case RTC_CENTURY:
case RTC_STATUSA:
case RTC_STATUSB:
case RTC_INTR:
case RTC_STATUSD:
case RTC_DIAG:
case RTC_RSTCODE:
case RTC_EQUIPMENT:
break;
default:
return (-1);
}
addr = *eax;
return (0);
}
static int
rtc_data_handler(struct vmctx *ctx, int vcpu, int in, int port, int bytes,
uint32_t *eax, void *arg)
{
int hour;
time_t t;
struct timeval cur, delta;
static struct timeval last;
static struct tm tm;
if (bytes != 1)
return (-1);
gettimeofday(&cur, NULL);
/*
* Increment the cached time only once per second so we can guarantee
* that the guest has at least one second to read the hour:min:sec
* separately and still get a coherent view of the time.
*/
delta = cur;
timevalsub(&delta, &last);
if (delta.tv_sec >= 1 && (status_b & RTCSB_HALT) == 0) {
t = cur.tv_sec;
localtime_r(&t, &tm);
last = cur;
}
if (in) {
switch (addr) {
case RTC_SEC:
*eax = rtcout(tm.tm_sec);
return (0);
case RTC_MIN:
*eax = rtcout(tm.tm_min);
return (0);
case RTC_HRS:
if (status_b & RTCSB_24HR)
hour = tm.tm_hour;
else
hour = (tm.tm_hour % 12) + 1;
*eax = rtcout(hour);
/*
* If we are representing time in the 12-hour format
* then set the MSB to indicate PM.
*/
if ((status_b & RTCSB_24HR) == 0 && tm.tm_hour >= 12)
*eax |= 0x80;
return (0);
case RTC_WDAY:
*eax = rtcout(tm.tm_wday + 1);
return (0);
case RTC_DAY:
*eax = rtcout(tm.tm_mday);
return (0);
case RTC_MONTH:
*eax = rtcout(tm.tm_mon + 1);
return (0);
case RTC_YEAR:
*eax = rtcout(tm.tm_year % 100);
return (0);
case RTC_CENTURY:
*eax = rtcout(tm.tm_year / 100);
break;
case RTC_STATUSA:
*eax = status_a;
return (0);
case RTC_INTR:
*eax = 0;
return (0);
case RTC_STATUSD:
*eax = RTCSD_PWR;
return (0);
case RTC_DIAG:
*eax = 0;
return (0);
case RTC_RSTCODE:
*eax = rstcode;
return (0);
case RTC_EQUIPMENT:
*eax = 0;
return (0);
default:
return (-1);
}
}
switch (addr) {
case RTC_STATUSA:
status_a = *eax & ~RTCSA_TUP;
break;
case RTC_STATUSB:
/* XXX not implemented yet XXX */
if (*eax & RTCSB_PINTR)
return (-1);
status_b = *eax;
break;
case RTC_RSTCODE:
rstcode = *eax;
break;
case RTC_SEC:
case RTC_MIN:
case RTC_HRS:
case RTC_WDAY:
case RTC_DAY:
case RTC_MONTH:
case RTC_YEAR:
case RTC_CENTURY:
/*
* Ignore writes to the time of day registers
*/
break;
default:
return (-1);
}
return (0);
}
INOUT_PORT(rtc, IO_RTC, IOPORT_F_OUT, rtc_addr_handler);
INOUT_PORT(rtc, IO_RTC + 1, IOPORT_F_INOUT, rtc_data_handler);

119
usr.sbin/bhyve/spinup_ap.c Normal file
View File

@ -0,0 +1,119 @@
/*-
* Copyright (c) 2012 NetApp, Inc.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* $FreeBSD$
*/
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
#include <sys/param.h>
#include <sys/types.h>
#include <machine/vmm.h>
#include <vmmapi.h>
#include <stdio.h>
#include <stdlib.h>
#include <assert.h>
#include "bhyverun.h"
#include "spinup_ap.h"
static void
spinup_ap_realmode(struct vmctx *ctx, int newcpu, uint64_t *rip)
{
int vector, error;
uint16_t cs;
uint64_t desc_base;
uint32_t desc_limit, desc_access;
vector = *rip >> PAGE_SHIFT;
*rip = 0;
/*
* Update the %cs and %rip of the guest so that it starts
* executing real mode code at at 'vector << 12'.
*/
error = vm_set_register(ctx, newcpu, VM_REG_GUEST_RIP, *rip);
assert(error == 0);
error = vm_get_desc(ctx, newcpu, VM_REG_GUEST_CS, &desc_base,
&desc_limit, &desc_access);
assert(error == 0);
desc_base = vector << PAGE_SHIFT;
error = vm_set_desc(ctx, newcpu, VM_REG_GUEST_CS,
desc_base, desc_limit, desc_access);
assert(error == 0);
cs = (vector << PAGE_SHIFT) >> 4;
error = vm_set_register(ctx, newcpu, VM_REG_GUEST_CS, cs);
assert(error == 0);
}
int
spinup_ap(struct vmctx *ctx, int vcpu, int newcpu, uint64_t rip)
{
int error;
assert(newcpu != 0);
assert(newcpu < guest_ncpus);
error = vcpu_reset(ctx, newcpu);
assert(error == 0);
/* Set up capabilities */
if (fbsdrun_vmexit_on_hlt()) {
error = vm_set_capability(ctx, newcpu, VM_CAP_HALT_EXIT, 1);
assert(error == 0);
}
if (fbsdrun_vmexit_on_pause()) {
error = vm_set_capability(ctx, newcpu, VM_CAP_PAUSE_EXIT, 1);
assert(error == 0);
}
if (fbsdrun_disable_x2apic())
error = vm_set_x2apic_state(ctx, newcpu, X2APIC_DISABLED);
else
error = vm_set_x2apic_state(ctx, newcpu, X2APIC_ENABLED);
assert(error == 0);
/*
* Enable the 'unrestricted guest' mode for 'newcpu'.
*
* Set up the processor state in power-on 16-bit mode, with the CS:IP
* init'd to the specified low-mem 4K page.
*/
error = vm_set_capability(ctx, newcpu, VM_CAP_UNRESTRICTED_GUEST, 1);
assert(error == 0);
spinup_ap_realmode(ctx, newcpu, &rip);
fbsdrun_addcpu(ctx, newcpu, rip);
return (newcpu);
}

View File

@ -0,0 +1,34 @@
/*-
* Copyright (c) 2012 NetApp, Inc.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* $FreeBSD$
*/
#ifndef _SPINUP_AP_H_
#define _SPINUP_AP_H_
int spinup_ap(struct vmctx *ctx, int vcpu, int newcpu, uint64_t rip);
#endif

60
usr.sbin/bhyve/uart.c Normal file
View File

@ -0,0 +1,60 @@
/*-
* Copyright (c) 2011 NetApp, Inc.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* $FreeBSD$
*/
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
#include <sys/types.h>
#include <assert.h>
#include "inout.h"
#define COM1 0x3F8
#define COM2 0x2F8
#define REG_IIR 2
static int
com_handler(struct vmctx *ctx, int vcpu, int in, int port, int bytes,
uint32_t *eax, void *arg)
{
assert(in);
if (bytes != 1)
return (-1);
/*
* COM port is not implemented so we return 0xFF for all registers
*/
*eax = 0xFF;
return (0);
}
INOUT_PORT(uart, COM1 + REG_IIR, IOPORT_F_IN, com_handler);
INOUT_PORT(uart, COM2 + REG_IIR, IOPORT_F_IN, com_handler);

85
usr.sbin/bhyve/virtio.h Normal file
View File

@ -0,0 +1,85 @@
/*-
* Copyright (c) 2011 NetApp, Inc.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* $FreeBSD$
*/
#ifndef _VIRTIO_H_
#define _VIRTIO_H_
#define VRING_ALIGN 4096
#define VRING_DESC_F_NEXT (1 << 0)
#define VRING_DESC_F_WRITE (1 << 1)
#define VRING_DESC_F_INDIRECT (1 << 2)
#define VRING_AVAIL_F_NO_INTERRUPT 1
struct virtio_desc {
uint64_t vd_addr;
uint32_t vd_len;
uint16_t vd_flags;
uint16_t vd_next;
} __packed;
struct virtio_used {
uint32_t vu_idx;
uint32_t vu_tlen;
} __packed;
/*
* PFN register shift amount
*/
#define VRING_PFN 12
/*
* Virtio device types
*/
#define VIRTIO_TYPE_NET 1
#define VIRTIO_TYPE_BLOCK 2
/*
* PCI vendor/device IDs
*/
#define VIRTIO_VENDOR 0x1AF4
#define VIRTIO_DEV_NET 0x1000
#define VIRTIO_DEV_BLOCK 0x1001
/*
* PCI config space constants
*/
#define VTCFG_R_HOSTCAP 0
#define VTCFG_R_GUESTCAP 4
#define VTCFG_R_PFN 8
#define VTCFG_R_QNUM 12
#define VTCFG_R_QSEL 14
#define VTCFG_R_QNOTIFY 16
#define VTCFG_R_STATUS 18
#define VTCFG_R_ISR 19
#define VTCFG_R_CFG0 20 /* No MSI-X */
#define VTCFG_R_CFG1 24 /* With MSI-X */
#define VTCFG_R_MSIX 20
#endif /* _VIRTIO_H_ */

48
usr.sbin/bhyve/xmsr.c Normal file
View File

@ -0,0 +1,48 @@
/*-
* Copyright (c) 2011 NetApp, Inc.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* $FreeBSD$
*/
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
#include <sys/types.h>
#include <machine/vmm.h>
#include <vmmapi.h>
#include <stdio.h>
#include <stdlib.h>
#include "xmsr.h"
int
emulate_wrmsr(struct vmctx *ctx, int vcpu, uint32_t code, uint64_t val)
{
printf("Unknown WRMSR code %x, val %lx, cpu %d\n", code, val, vcpu);
exit(1);
}

34
usr.sbin/bhyve/xmsr.h Normal file
View File

@ -0,0 +1,34 @@
/*-
* Copyright (c) 2011 NetApp, Inc.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* $FreeBSD$
*/
#ifndef _XMSR_H_
#define _XMSR_H_
int emulate_wrmsr(struct vmctx *ctx, int vcpu, uint32_t code, uint64_t val);
#endif

Some files were not shown because too many files have changed in this diff Show More