Import of bhyve hypervisor and utilities, part 1.

vmm.ko - kernel module for VT-x, VT-d and hypervisor control
  bhyve  - user-space sequencer and i/o emulation
  vmmctl - dump of hypervisor register state
  libvmm - front-end to vmm.ko chardev interface

bhyve was designed and implemented by Neel Natu.

Thanks to the following folks from NetApp who helped to make this available:
	Joe CaraDonna
	Peter Snyder
	Jeff Heller
	Sandeep Mann
	Steve Miller
	Brian Pawlowski
This commit is contained in:
grehan 2011-05-13 04:54:01 +00:00
parent 1430f46faf
commit d45b7f14ae
84 changed files with 19016 additions and 0 deletions

View File

@ -102,6 +102,7 @@ SUBDIR= ${SUBDIR_ORDERED} \
${_libusbhid} \
${_libusb} \
${_libvgl} \
${_libvmmapi} \
libwrap \
liby \
libz \
@ -177,6 +178,7 @@ _libncp= libncp
.endif
_libsmb= libsmb
_libvgl= libvgl
_libvmmapi= libvmmapi
.endif
.if ${MACHINE_ARCH} == "powerpc"

9
lib/libvmmapi/Makefile Normal file
View File

@ -0,0 +1,9 @@
# $FreeBSD$
# Build libvmmapi, the userland front-end to the vmm.ko /dev/vmm interface.
LIB= vmmapi
# vmmapi.c: ioctl/sysctl wrappers; vmmapi_freebsd.c: FreeBSD guest register
# setup; mptable.c: MP configuration table builder.
SRCS= vmmapi.c vmmapi_freebsd.c mptable.c
INCS= vmmapi.h
# Allow headers in this directory to be found via "..." includes.
CFLAGS+= -I${.CURDIR}
.include <bsd.lib.mk>

336
lib/libvmmapi/mptable.c Normal file
View File

@ -0,0 +1,336 @@
/*-
* Copyright (c) 2011 NetApp, Inc.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* $FreeBSD$
*/
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
#include <sys/types.h>
#include <sys/mman.h>
#include <stdio.h>
#include <string.h>
#include <machine/vmm.h>
#include <machine/vmm_dev.h>
#include "vmmapi.h"
#include "mptable.h"
#define LAPIC_PADDR (0xFEE00000)
#define LAPIC_VERSION (16)
#define IOAPIC_PADDR (0xFEC00000)
#define IOAPIC_VERSION (0x11)
/*
 * 'errno' must be declared by including <errno.h>; declaring it by hand
 * ("extern int errno;") is wrong where errno is a macro or thread-local.
 */
#include <errno.h>

/*
 * Compute the MP-spec checksum for 'len' bytes at 'base': the value that
 * makes all bytes of the structure (checksum included) sum to 0 mod 256.
 * The checksum field itself must be 0 when this is called.
 */
static uint8_t
mp_compute_checksum(void *base, size_t len)
{
    uint8_t *bytes = base;
    uint8_t sum = 0;

    for (; len > 0; len--) {
        sum += *bytes++;
    }
    return (256 - sum);
}
/*
 * Initialize the MP Floating Pointer structure at 'mpfp'.  The MP config
 * table is assumed to immediately follow it, at mpfp_gpa + sizeof(*mpfp).
 */
static void
mp_build_mpfp(struct mp_floating_pointer *mpfp, vm_paddr_t mpfp_gpa)
{
    memset(mpfp, 0, sizeof(*mpfp));
    memcpy(mpfp->signature, MPFP_SIGNATURE, MPFP_SIGNATURE_LEN);
    mpfp->mptable_paddr = mpfp_gpa + sizeof(*mpfp);
    /*
     * Length is counted in 16-byte paragraphs and the floating pointer
     * structure is exactly one paragraph.  The original code left this 0,
     * which a spec-conforming guest would reject.
     */
    mpfp->length = 1;
    mpfp->specrev = MP_SPECREV;
    mpfp->feature2 = 0;
    /* Checksum must be computed last, over the fully populated struct. */
    mpfp->checksum = mp_compute_checksum(mpfp, sizeof(*mpfp));
}
/*
 * Fill in the static parts of the MP Configuration Table header.  The
 * caller is responsible for nr_entries, length and checksum once all the
 * entries following the header have been laid down.
 */
static void
mp_build_mpch(struct mp_config_hdr *mpch)
{
    memset(mpch, 0, sizeof(*mpch));
    memcpy(mpch->signature, MPCH_SIGNATURE, MPCH_SIGNATURE_LEN);
    memcpy(mpch->oemid, MPCH_OEMID, MPCH_OEMID_LEN);
    memcpy(mpch->prodid, MPCH_PRODID, MPCH_PRODID_LEN);
    mpch->specrev = MP_SPECREV;
    mpch->lapic_paddr = LAPIC_PADDR;
}
/*
 * Emit one processor entry per vcpu starting at 'mpep'.  CPU 0 is marked
 * as the bootstrap processor; all CPUs are marked enabled.
 */
static void
mp_build_proc_entries(struct mpe_proc *mpep, int num_proc)
{
    int i;

    for (i = 0; i < num_proc; i++, mpep++) {
        memset(mpep, 0, sizeof(*mpep));
        mpep->entry_type = MP_ENTRY_PROC;
        mpep->lapic_id = i;    /* XXX assumes lapic id == cpu index */
        mpep->lapic_version = LAPIC_VERSION;
        mpep->proc_flags = MPEP_FLAGS_EN;
        if (i == 0)
            mpep->proc_flags |= MPEP_FLAGS_BSP;
        mpep->proc_signature = MPEP_SIGNATURE;
        mpep->feature_flags = MPEP_FEATURES;
    }
}
static void
mp_build_bus_entries(struct mpe_bus *mpeb)
{
memset(mpeb, 0, sizeof(*mpeb));
mpeb->entry_type = MP_ENTRY_BUS;
mpeb->busid = MPE_BUSID_ISA;
memcpy(mpeb->busname, MPE_BUSNAME_ISA, MPE_BUSNAME_LEN);
mpeb++;
memset(mpeb, 0, sizeof(*mpeb));
mpeb->entry_type = MP_ENTRY_BUS;
mpeb->busid = MPE_BUSID_PCI;
memcpy(mpeb->busname, MPE_BUSNAME_PCI, MPE_BUSNAME_LEN);
}
/*
 * Emit the single I/O APIC entry at 'mpei', enabled, at the conventional
 * physical address IOAPIC_PADDR.
 */
static void
mp_build_ioapic_entries(struct mpe_ioapic *mpei)
{
    memset(mpei, 0, sizeof(*mpei));
    mpei->entry_type = MP_ENTRY_IOAPIC;
    mpei->ioapic_paddr = IOAPIC_PADDR;
    mpei->ioapic_id = MPE_IOAPIC_ID;
    mpei->ioapic_version = IOAPIC_VERSION;
    mpei->ioapic_flags = MPE_IOAPIC_FLAG_EN;
}
/*
 * Populate 'num_pins' I/O interrupt-assignment entries at 'mpeii', one per
 * I/O APIC input pin, following the MP-spec default configuration:
 * ISA IRQs identity-mapped onto pins, with a few special cases.
 */
static void
mp_build_ioint_entries(struct mpe_ioint *mpeii, int num_pins)
{
    int pin;

    /*
     * The following config is taken from kernel mptable.c
     * mptable_parse_default_config_ints(...), for now
     * just use the default config, tweak later if needed.
     */

    /* Run through all 16 pins. */
    for (pin = 0; pin < num_pins; pin++) {
        memset(mpeii, 0, sizeof(*mpeii));
        mpeii->entry_type = MP_ENTRY_IOINT;
        mpeii->src_bus_id = MPE_BUSID_ISA;
        mpeii->dst_apic_id = MPE_IOAPIC_ID;

        /*
         * All default configs route IRQs from bus 0 to the first 16 pins
         * of the first I/O APIC with an APIC ID of 2.
         */
        mpeii->dst_apic_intin = pin;
        switch (pin) {
        case 0:
            /* Pin 0 is an ExtINT pin. */
            mpeii->intr_type = MPEII_INTR_EXTINT;
            break;
        case 2:
            /* IRQ 0 is routed to pin 2. */
            mpeii->intr_type = MPEII_INTR_INT;
            mpeii->src_bus_irq = 0;
            break;
        case 5:
        case 10:
        case 11:
            /*
             * PCI Irqs set to level triggered.
             */
            mpeii->intr_flags = MPEII_FLAGS_TRIGMODE_LEVEL;
            mpeii->src_bus_id = MPE_BUSID_PCI;
            /*
             * FALLTHROUGH (intentional): PCI pins still need the INT
             * type and identity irq assignment from the default case.
             */
        default:
            /* All other pins are identity mapped. */
            mpeii->intr_type = MPEII_INTR_INT;
            mpeii->src_bus_irq = pin;
            break;
        }
        mpeii++;
    }
}
/*
 * Copy 'bytes' bytes from 'src' into 'dest' and NUL-terminate it; 'dest'
 * must have room for bytes + 1 chars.  Wrapped in do/while(0) so it acts
 * as a single statement (safe after an unbraced if), and it terminates
 * 'dest' itself instead of silently writing to a variable named 'str'
 * that happened to exist at every call site.
 */
#define COPYSTR(dest, src, bytes)                   \
    do {                                            \
        memcpy((dest), (src), (bytes));             \
        ((char *)(dest))[(bytes)] = '\0';           \
    } while (0)
static void
mptable_dump(struct mp_floating_pointer *mpfp, struct mp_config_hdr *mpch)
{
static char str[16];
int i;
char *cur;
union mpe {
struct mpe_proc *proc;
struct mpe_bus *bus;
struct mpe_ioapic *ioapic;
struct mpe_ioint *ioint;
struct mpe_lint *lnit;
char *p;
};
union mpe mpe;
printf(" MP Floating Pointer :\n");
COPYSTR(str, mpfp->signature, 4);
printf(" signature: %s\n", str);
printf(" mpch paddr: %x\n", mpfp->mptable_paddr);
printf(" length: %x\n", mpfp->length);
printf(" specrec: %x\n", mpfp->specrev);
printf(" checksum: %x\n", mpfp->checksum);
printf(" feature1: %x\n", mpfp->feature1);
printf(" feature2: %x\n", mpfp->feature2);
printf(" feature3: %x\n", mpfp->feature3);
printf(" feature4: %x\n", mpfp->feature4);
printf(" MP Configuration Header :\n");
COPYSTR(str, mpch->signature, 4);
printf(" signature: %s\n", str);
printf(" length: %x\n", mpch->length);
printf(" specrec: %x\n", mpch->specrev);
printf(" checksum: %x\n", mpch->checksum);
COPYSTR(str, mpch->oemid, MPCH_OEMID_LEN);
printf(" oemid: %s\n", str);
COPYSTR(str, mpch->prodid, MPCH_PRODID_LEN);
printf(" prodid: %s\n", str);
printf(" oem_ptr: %x\n", mpch->oem_ptr);
printf(" oem_sz: %x\n", mpch->oem_sz);
printf(" nr_entries: %x\n", mpch->nr_entries);
printf(" apic paddr: %x\n", mpch->lapic_paddr);
printf(" ext_length: %x\n", mpch->ext_length);
printf(" ext_checksum: %x\n", mpch->ext_checksum);
cur = (char *)mpch + sizeof(*mpch);
for (i = 0; i < mpch->nr_entries; i++) {
mpe.p = cur;
switch(*mpe.p) {
case MP_ENTRY_PROC:
printf(" MP Processor Entry :\n");
printf(" lapic_id: %x\n", mpe.proc->lapic_id);
printf(" lapic_version: %x\n", mpe.proc->lapic_version);
printf(" proc_flags: %x\n", mpe.proc->proc_flags);
printf(" proc_signature: %x\n", mpe.proc->proc_signature);
printf(" feature_flags: %x\n", mpe.proc->feature_flags);
cur += sizeof(struct mpe_proc);
break;
case MP_ENTRY_BUS:
printf(" MP Bus Entry :\n");
printf(" busid: %x\n", mpe.bus->busid);
COPYSTR(str, mpe.bus->busname, MPE_BUSNAME_LEN);
printf(" busname: %s\n", str);
cur += sizeof(struct mpe_bus);
break;
case MP_ENTRY_IOAPIC:
printf(" MP IOAPIC Entry :\n");
printf(" ioapi_id: %x\n", mpe.ioapic->ioapic_id);
printf(" ioapi_version: %x\n", mpe.ioapic->ioapic_version);
printf(" ioapi_flags: %x\n", mpe.ioapic->ioapic_flags);
printf(" ioapi_paddr: %x\n", mpe.ioapic->ioapic_paddr);
cur += sizeof(struct mpe_ioapic);
break;
case MP_ENTRY_IOINT:
printf(" MP IO Interrupt Entry :\n");
printf(" intr_type: %x\n", mpe.ioint->intr_type);
printf(" intr_flags: %x\n", mpe.ioint->intr_flags);
printf(" src_bus_id: %x\n", mpe.ioint->src_bus_id);
printf(" src_bus_irq: %x\n", mpe.ioint->src_bus_irq);
printf(" dst_apic_id: %x\n", mpe.ioint->dst_apic_id);
printf(" dst_apic_intin: %x\n", mpe.ioint->dst_apic_intin);
cur += sizeof(struct mpe_ioint);
break;
case MP_ENTRY_LINT:
printf(" MP Local Interrupt Entry :\n");
cur += sizeof(struct mpe_lint);
break;
}
}
}
/*
 * Build an MP floating pointer structure plus configuration table for
 * 'ncpu' vcpus in guest memory at [gpa, gpa + len), optionally publishing
 * 'oemsz' bytes of OEM data from 'oemp' after the base table.
 *
 * Returns 0 on success, -1 if the guest memory could not be mapped.
 */
int
vm_build_mptable(struct vmctx *ctx, vm_paddr_t gpa, int len, int ncpu,
    void *oemp, int oemsz)
{
    struct mp_config_hdr *mpch;
    char *mapaddr;
    char *startaddr;
    int error;

    mapaddr = vm_map_memory(ctx, gpa, len);
    if (mapaddr == MAP_FAILED) {
        printf("%s\n", strerror(errno));
        /* The original returned 'error' uninitialized here. */
        error = -1;
        goto err;
    }
    startaddr = mapaddr;

    mp_build_mpfp((struct mp_floating_pointer *)mapaddr, gpa);
    mapaddr += sizeof(struct mp_floating_pointer);

    mpch = (struct mp_config_hdr *)mapaddr;
    mp_build_mpch(mpch);
    mapaddr += sizeof(struct mp_config_hdr);

    mp_build_proc_entries((struct mpe_proc *)mapaddr, ncpu);
    mapaddr += (sizeof(struct mpe_proc) * ncpu);
    mpch->nr_entries += ncpu;

    mp_build_bus_entries((struct mpe_bus *)mapaddr);
    mapaddr += (sizeof(struct mpe_bus) * MPE_NUM_BUSES);
    mpch->nr_entries += MPE_NUM_BUSES;

#if 0
    mp_build_ioapic_entries((struct mpe_ioapic *)mapaddr);
    mapaddr += sizeof(struct mpe_ioapic);
    mpch->nr_entries++;

    mp_build_ioint_entries((struct mpe_ioint *)mapaddr, MPEII_MAX_IRQ);
    mapaddr += sizeof(struct mpe_ioint) * MPEII_MAX_IRQ;
    mpch->nr_entries += MPEII_MAX_IRQ;
#endif

    if (oemp) {
        /* oem_ptr is a guest-physical address; the OEM table follows
         * the base table and is not counted in mpch->length. */
        mpch->oem_ptr = mapaddr - startaddr + gpa;
        mpch->oem_sz = oemsz;
        memcpy(mapaddr, oemp, oemsz);
    }

    mpch->length = mapaddr - (char *)mpch;
    /*
     * The base-table checksum covers the entire table (mpch->length
     * bytes), not just the header, per the MP specification.
     */
    mpch->checksum = mp_compute_checksum(mpch, mpch->length);
    /* mptable_dump((struct mp_floating_pointer *)startaddr, mpch); */
    error = 0;
err:
    return (error);
}

171
lib/libvmmapi/mptable.h Normal file
View File

@ -0,0 +1,171 @@
/*-
* Copyright (c) 2011 NetApp, Inc.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* $FreeBSD$
*/
#ifndef _MPTABLE_h_    /* NOTE(review): leading-underscore guard names are reserved identifiers */
#define _MPTABLE_h_

/*
 * On-disk/in-memory layouts for the Intel MultiProcessor Specification
 * tables that vm_build_mptable() lays down in guest memory.
 *
 * NOTE(review): these structs rely on the compiler producing the exact
 * MP-spec byte layout with no padding (true for the field orders below on
 * x86, but there is no explicit packing attribute) — confirm.
 */

#define MP_SPECREV (4) // MP spec revision 1.1

/*
 * MP Floating Pointer Structure
 */
#define MPFP_SIGNATURE "_MP_"
#define MPFP_SIGNATURE_LEN (4)
#define MPFP_FEATURE2 (0x80) // IMCR is present
struct mp_floating_pointer {
    uint8_t signature[MPFP_SIGNATURE_LEN];  /* "_MP_" */
    uint32_t mptable_paddr;                 /* guest-physical addr of config table */
    uint8_t length;                         /* structure length in 16-byte units */
    uint8_t specrev;
    uint8_t checksum;                       /* makes all bytes sum to 0 */
    uint8_t feature1;                       /* 0 => config table present */
    uint8_t feature2;
    uint8_t feature3;
    uint8_t feature4;
    uint8_t feature5;
};

/*
 * MP Configuration Table Header
 */
#define MPCH_SIGNATURE "PCMP"
#define MPCH_SIGNATURE_LEN (4)

#define MPCH_OEMID "NETAPP  "
#define MPCH_OEMID_LEN (8)
#define MPCH_PRODID "vFiler      "
#define MPCH_PRODID_LEN (12)

struct mp_config_hdr {
    uint8_t signature[MPCH_SIGNATURE_LEN];  /* "PCMP" */
    uint16_t length;                        /* base table length incl. header */
    uint8_t specrev;
    uint8_t checksum;                       /* over 'length' bytes */
    uint8_t oemid[MPCH_OEMID_LEN];          /* space-padded, not NUL-terminated */
    uint8_t prodid[MPCH_PRODID_LEN];        /* space-padded, not NUL-terminated */
    uint32_t oem_ptr;                       /* guest-physical addr of OEM table (0 if none) */
    uint16_t oem_sz;
    uint16_t nr_entries;                    /* count of entries following the header */
    uint32_t lapic_paddr;
    uint16_t ext_length;                    /* extended table, unused here */
    uint8_t ext_checksum;
    uint8_t reserved;
};

/* Entry type codes (first byte of every table entry). */
#define MP_ENTRY_PROC (0)
#define MP_ENTRY_BUS (1)
#define MP_ENTRY_IOAPIC (2)
#define MP_ENTRY_IOINT (3)
#define MP_ENTRY_LINT (4)

/*
 * MP Processor Entry
 */

#define MPEP_FLAGS_EN (0x1)     /* processor enabled */
#define MPEP_FLAGS_BSP (0x2)    /* bootstrap processor */

#define MPEP_SIG_FAMILY (6)
#define MPEP_SIG_MODEL (26)
#define MPEP_SIG_STEPPING (5)
#define MPEP_SIGNATURE ((MPEP_SIG_FAMILY << 8) | (MPEP_SIG_MODEL << 4) \
 | (MPEP_SIG_STEPPING))

#define MPEP_FEATURES (0xBFEBFBFF) // Value from Intel i7 CPUID

struct mpe_proc {
    uint8_t entry_type;     /* MP_ENTRY_PROC */
    uint8_t lapic_id;
    uint8_t lapic_version;
    uint8_t proc_flags;     /* MPEP_FLAGS_* */
    uint32_t proc_signature;
    uint32_t feature_flags;
    uint8_t reserved[8];
};

/*
 * MP Bus Entry
 */

#define MPE_NUM_BUSES (2)
#define MPE_BUSNAME_LEN (6)
#define MPE_BUSID_ISA (0)
#define MPE_BUSID_PCI (1)
#define MPE_BUSNAME_ISA "ISA   "
#define MPE_BUSNAME_PCI "PCI   "
struct mpe_bus {
    uint8_t entry_type;     /* MP_ENTRY_BUS */
    uint8_t busid;
    uint8_t busname[MPE_BUSNAME_LEN];   /* space-padded */
};

/*
 * MP IO APIC Entry
 */
#define MPE_IOAPIC_ID (2)
#define MPE_IOAPIC_FLAG_EN (1)
struct mpe_ioapic {
    uint8_t entry_type;     /* MP_ENTRY_IOAPIC */
    uint8_t ioapic_id;
    uint8_t ioapic_version;
    uint8_t ioapic_flags;   /* MPE_IOAPIC_FLAG_* */
    uint32_t ioapic_paddr;
};

/*
 * MP IO Interrupt Assignment Entry
 */
#define MPEII_INTR_INT (0)
#define MPEII_INTR_NMI (1)
#define MPEII_INTR_SMI (2)
#define MPEII_INTR_EXTINT (3)
#define MPEII_PCI_IRQ_MASK (0x0c20U) /* IRQ 5,10,11 are PCI connected */
#define MPEII_MAX_IRQ (16)
#define MPEII_FLAGS_TRIGMODE_LEVEL (0x3)
struct mpe_ioint {
    uint8_t entry_type;     /* MP_ENTRY_IOINT */
    uint8_t intr_type;      /* MPEII_INTR_* */
    uint16_t intr_flags;    /* polarity/trigger mode */
    uint8_t src_bus_id;
    uint8_t src_bus_irq;
    uint8_t dst_apic_id;
    uint8_t dst_apic_intin;
};

/*
 * MP Local Interrupt Assignment Entry
 */
struct mpe_lint {
    uint8_t entry_type;     /* MP_ENTRY_LINT */
};

int vm_build_mptable(struct vmctx *ctxt, vm_paddr_t gpa, int len,
 int ncpu, void *oemp, int oemsz);
#endif /* _MPTABLE_h_ */

647
lib/libvmmapi/vmmapi.c Normal file
View File

@ -0,0 +1,647 @@
/*-
* Copyright (c) 2011 NetApp, Inc.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* $FreeBSD$
*/
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
#include <sys/types.h>
#include <sys/sysctl.h>
#include <sys/ioctl.h>
#include <sys/mman.h>
#include <machine/specialreg.h>
#include <stdio.h>
#include <stdlib.h>
#include <assert.h>
#include <string.h>
#include <fcntl.h>
#include <unistd.h>
#include <machine/vmm.h>
#include <machine/vmm_dev.h>
#include "vmmapi.h"
#include "mptable.h"
/* CR4.VMXE bit; defined here in case machine/specialreg.h predates it. */
#ifndef CR4_VMXE
#define CR4_VMXE (1UL << 13)
#endif

/* Guest-physical window where vm_build_tables() places the MP tables. */
#define BIOS_ROM_BASE (0xf0000)
#define BIOS_ROM_SIZE (0x10000)

/*
 * Handle for an open VM: the /dev/vmm/<name> file descriptor plus the VM
 * name (stored in the same allocation, right after the struct, by
 * vm_open()).
 */
struct vmctx {
    int fd;
    char *name;
};

/* Create/destroy a kernel VM instance via the hw.vmm sysctl interface. */
#define CREATE(x) sysctlbyname("hw.vmm.create", NULL, NULL, (x), strlen((x)))
#define DESTROY(x) sysctlbyname("hw.vmm.destroy", NULL, NULL, (x), strlen((x)))
/*
 * Open the character device for VM 'name' and return its fd, or -1 (with
 * errno set by open(2)) on failure.
 */
static int
vm_device_open(const char *name)
{
    char *path;
    size_t pathlen;
    int fd;

    pathlen = strlen("/dev/vmm/") + strlen(name) + 1;
    path = malloc(pathlen);
    assert(path != NULL);
    snprintf(path, pathlen, "/dev/vmm/%s", name);

    /* Open the device file */
    fd = open(path, O_RDWR, 0);

    free(path);
    return (fd);
}
/*
 * Ask vmm.ko to create a new VM named 'name'.  Returns the sysctl result
 * (0 on success).
 */
int
vm_create(const char *name)
{
    /* Equivalent to the CREATE() macro, spelled out for clarity. */
    return (sysctlbyname("hw.vmm.create", NULL, NULL, name, strlen(name)));
}
/*
 * Open an existing VM by name.  On success returns a context that also
 * holds a copy of the name in the same allocation; on failure the partial
 * context (and the kernel VM, via vm_destroy) is torn down and NULL is
 * returned.
 */
struct vmctx *
vm_open(const char *name)
{
    struct vmctx *vm;

    /* One allocation: the name copy lives right after the struct. */
    vm = malloc(sizeof(struct vmctx) + strlen(name) + 1);
    assert(vm != NULL);

    vm->name = (char *)(vm + 1);
    strcpy(vm->name, name);
    vm->fd = vm_device_open(vm->name);
    if (vm->fd < 0) {
        vm_destroy(vm);
        return (NULL);
    }
    return (vm);
}
/*
 * Destroy the kernel VM associated with 'vm' and release the userland
 * context.  Safe to call on a context whose device open failed (fd < 0).
 */
void
vm_destroy(struct vmctx *vm)
{
    assert(vm != NULL);

    /* Tear down the kernel side first, then the local resources. */
    DESTROY(vm->name);
    if (vm->fd >= 0)
        close(vm->fd);
    free(vm);
}
/*
 * Look up the memory segment at guest-physical address 'gpa', returning
 * its host-physical address and length through the out parameters.
 * Returns the ioctl result; note the outputs are stored even on failure
 * (matching historical behavior).
 */
int
vm_get_memory_seg(struct vmctx *ctx, vm_paddr_t gpa,
    vm_paddr_t *ret_hpa, size_t *ret_len)
{
    struct vm_memory_segment seg;
    int rv;

    memset(&seg, 0, sizeof(seg));
    seg.gpa = gpa;
    rv = ioctl(ctx->fd, VM_GET_MEMORY_SEG, &seg);
    *ret_hpa = seg.hpa;
    *ret_len = seg.len;
    return (rv);
}
/*
 * Create and optionally map 'len' bytes of memory at guest physical
 * address 'gpa'.  If 'mapaddr' is non-NULL the segment is also mmap'ed
 * into this process; on mmap failure *mapaddr is MAP_FAILED while the
 * return value stays 0 (caller must check both).
 */
int
vm_setup_memory(struct vmctx *ctx, vm_paddr_t gpa, size_t len, char **mapaddr)
{
    struct vm_memory_segment seg;
    int rv;

    memset(&seg, 0, sizeof(seg));
    seg.gpa = gpa;
    seg.len = len;
    rv = ioctl(ctx->fd, VM_MAP_MEMORY, &seg);
    if (rv == 0 && mapaddr != NULL) {
        *mapaddr = mmap(NULL, len, PROT_READ | PROT_WRITE, MAP_SHARED,
            ctx->fd, gpa);
    }
    return (rv);
}
/*
 * Map 'len' bytes of (already created) guest memory at guest physical
 * address 'gpa' into this process.  Returns MAP_FAILED on error.
 */
char *
vm_map_memory(struct vmctx *ctx, vm_paddr_t gpa, size_t len)
{
    char *p;

    p = mmap(NULL, len, PROT_READ | PROT_WRITE, MAP_SHARED, ctx->fd, gpa);
    return (p);
}
/*
 * Set a guest segment descriptor (base/limit/access) for 'reg' on 'vcpu'.
 * Returns the ioctl result.
 */
int
vm_set_desc(struct vmctx *ctx, int vcpu, int reg,
    uint64_t base, uint32_t limit, uint32_t access)
{
    struct vm_seg_desc sd;

    memset(&sd, 0, sizeof(sd));
    sd.cpuid = vcpu;
    sd.regnum = reg;
    sd.desc.base = base;
    sd.desc.limit = limit;
    sd.desc.access = access;
    return (ioctl(ctx->fd, VM_SET_SEGMENT_DESCRIPTOR, &sd));
}
/*
 * Fetch a guest segment descriptor for 'reg' on 'vcpu'.  The out
 * parameters are written only when the ioctl succeeds.
 */
int
vm_get_desc(struct vmctx *ctx, int vcpu, int reg,
    uint64_t *base, uint32_t *limit, uint32_t *access)
{
    struct vm_seg_desc sd;
    int rv;

    memset(&sd, 0, sizeof(sd));
    sd.cpuid = vcpu;
    sd.regnum = reg;
    rv = ioctl(ctx->fd, VM_GET_SEGMENT_DESCRIPTOR, &sd);
    if (rv == 0) {
        *base = sd.desc.base;
        *limit = sd.desc.limit;
        *access = sd.desc.access;
    }
    return (rv);
}
/*
 * Set one guest register on 'vcpu' to 'val'.  Returns the ioctl result.
 */
int
vm_set_register(struct vmctx *ctx, int vcpu, int reg, uint64_t val)
{
    struct vm_register vr;

    memset(&vr, 0, sizeof(vr));
    vr.cpuid = vcpu;
    vr.regnum = reg;
    vr.regval = val;
    return (ioctl(ctx->fd, VM_SET_REGISTER, &vr));
}
/*
 * Read one guest register from 'vcpu' into *ret_val.  Note *ret_val is
 * stored unconditionally (zero on ioctl failure), matching historical
 * behavior.
 */
int
vm_get_register(struct vmctx *ctx, int vcpu, int reg, uint64_t *ret_val)
{
    struct vm_register vr;
    int rv;

    memset(&vr, 0, sizeof(vr));
    vr.cpuid = vcpu;
    vr.regnum = reg;
    rv = ioctl(ctx->fd, VM_GET_REGISTER, &vr);
    *ret_val = vr.regval;
    return (rv);
}
int
vm_get_pinning(struct vmctx *ctx, int vcpu, int *host_cpuid)
{
int error;
struct vm_pin vmpin;
bzero(&vmpin, sizeof(vmpin));
vmpin.vm_cpuid = vcpu;
error = ioctl(ctx->fd, VM_GET_PINNING, &vmpin);
*host_cpuid = vmpin.host_cpuid;
return (error);
}
int
vm_set_pinning(struct vmctx *ctx, int vcpu, int host_cpuid)
{
int error;
struct vm_pin vmpin;
bzero(&vmpin, sizeof(vmpin));
vmpin.vm_cpuid = vcpu;
vmpin.host_cpuid = host_cpuid;
error = ioctl(ctx->fd, VM_SET_PINNING, &vmpin);
return (error);
}
/*
 * Run 'vcpu' starting at 'rip' until it exits, copying exit information
 * into *vmexit.  The exit struct is copied out even on ioctl failure,
 * matching historical behavior.
 */
int
vm_run(struct vmctx *ctx, int vcpu, uint64_t rip, struct vm_exit *vmexit)
{
    struct vm_run vmrun;
    int rv;

    memset(&vmrun, 0, sizeof(vmrun));
    vmrun.cpuid = vcpu;
    vmrun.rip = rip;
    rv = ioctl(ctx->fd, VM_RUN, &vmrun);
    memcpy(vmexit, &vmrun.vm_exit, sizeof(struct vm_exit));
    return (rv);
}
/*
 * Common helper for event injection; 'error_code_valid' selects whether
 * 'error_code' is pushed with the event.
 */
static int
vm_inject_event_real(struct vmctx *ctx, int vcpu, enum vm_event_type type,
    int vector, int error_code, int error_code_valid)
{
    struct vm_event ev;

    memset(&ev, 0, sizeof(ev));
    ev.cpuid = vcpu;
    ev.type = type;
    ev.vector = vector;
    ev.error_code = error_code;
    ev.error_code_valid = error_code_valid;
    return (ioctl(ctx->fd, VM_INJECT_EVENT, &ev));
}
/*
 * Inject an event/exception with no error code.
 */
int
vm_inject_event(struct vmctx *ctx, int vcpu, enum vm_event_type type,
    int vector)
{
    return (vm_inject_event_real(ctx, vcpu, type, vector, 0, 0));
}
/*
 * Inject an event/exception that carries an error code.
 */
int
vm_inject_event2(struct vmctx *ctx, int vcpu, enum vm_event_type type,
    int vector, int error_code)
{
    return (vm_inject_event_real(ctx, vcpu, type, vector, error_code, 1));
}
/*
 * Build the guest MP tables in the BIOS ROM window for 'ncpu' vcpus,
 * with optional OEM table data.
 */
int
vm_build_tables(struct vmctx *ctxt, int ncpu, void *oemtbl, int oemtblsz)
{
    return (vm_build_mptable(ctxt, BIOS_ROM_BASE, BIOS_ROM_SIZE, ncpu,
        oemtbl, oemtblsz));
}
int
vm_lapic_irq(struct vmctx *ctx, int vcpu, int vector)
{
struct vm_lapic_irq vmirq;
bzero(&vmirq, sizeof(vmirq));
vmirq.cpuid = vcpu;
vmirq.vector = vector;
return (ioctl(ctx->fd, VM_LAPIC_IRQ, &vmirq));
}
int
vm_inject_nmi(struct vmctx *ctx, int vcpu)
{
struct vm_nmi vmnmi;
bzero(&vmnmi, sizeof(vmnmi));
vmnmi.cpuid = vcpu;
return (ioctl(ctx->fd, VM_INJECT_NMI, &vmnmi));
}
/*
 * Translate a capability name to its VM_CAP_* code; returns -1 for an
 * unknown name or a NULL argument.
 */
int
vm_capability_name2type(const char *capname)
{
    static const struct {
        const char *name;
        int type;
    } capstrmap[] = {
        { "hlt_exit",           VM_CAP_HALT_EXIT },
        { "mtrap_exit",         VM_CAP_MTRAP_EXIT },
        { "pause_exit",         VM_CAP_PAUSE_EXIT },
        { "unrestricted_guest", VM_CAP_UNRESTRICTED_GUEST },
    };
    size_t i;

    if (capname == NULL)
        return (-1);
    for (i = 0; i < sizeof(capstrmap) / sizeof(capstrmap[0]); i++) {
        if (strcmp(capstrmap[i].name, capname) == 0)
            return (capstrmap[i].type);
    }
    return (-1);
}
int
vm_get_capability(struct vmctx *ctx, int vcpu, enum vm_cap_type cap,
int *retval)
{
int error;
struct vm_capability vmcap;
bzero(&vmcap, sizeof(vmcap));
vmcap.cpuid = vcpu;
vmcap.captype = cap;
error = ioctl(ctx->fd, VM_GET_CAPABILITY, &vmcap);
*retval = vmcap.capval;
return (error);
}
int
vm_set_capability(struct vmctx *ctx, int vcpu, enum vm_cap_type cap, int val)
{
struct vm_capability vmcap;
bzero(&vmcap, sizeof(vmcap));
vmcap.cpuid = vcpu;
vmcap.captype = cap;
vmcap.capval = val;
return (ioctl(ctx->fd, VM_SET_CAPABILITY, &vmcap));
}
int
vm_assign_pptdev(struct vmctx *ctx, int bus, int slot, int func)
{
struct vm_pptdev pptdev;
bzero(&pptdev, sizeof(pptdev));
pptdev.bus = bus;
pptdev.slot = slot;
pptdev.func = func;
return (ioctl(ctx->fd, VM_BIND_PPTDEV, &pptdev));
}
int
vm_unassign_pptdev(struct vmctx *ctx, int bus, int slot, int func)
{
struct vm_pptdev pptdev;
bzero(&pptdev, sizeof(pptdev));
pptdev.bus = bus;
pptdev.slot = slot;
pptdev.func = func;
return (ioctl(ctx->fd, VM_UNBIND_PPTDEV, &pptdev));
}
/*
 * Map 'len' bytes of a passthrough device's MMIO at host-physical 'hpa'
 * into the guest at guest-physical 'gpa'.
 */
int
vm_map_pptdev_mmio(struct vmctx *ctx, int bus, int slot, int func,
    vm_paddr_t gpa, size_t len, vm_paddr_t hpa)
{
    struct vm_pptdev_mmio mmio;

    memset(&mmio, 0, sizeof(mmio));
    mmio.bus = bus;
    mmio.slot = slot;
    mmio.func = func;
    mmio.gpa = gpa;
    mmio.len = len;
    mmio.hpa = hpa;
    return (ioctl(ctx->fd, VM_MAP_PPTDEV_MMIO, &mmio));
}
int
vm_setup_msi(struct vmctx *ctx, int vcpu, int bus, int slot, int func,
int destcpu, int vector, int numvec)
{
struct vm_pptdev_msi pptmsi;
bzero(&pptmsi, sizeof(pptmsi));
pptmsi.vcpu = vcpu;
pptmsi.bus = bus;
pptmsi.slot = slot;
pptmsi.func = func;
pptmsi.destcpu = destcpu;
pptmsi.vector = vector;
pptmsi.numvec = numvec;
return (ioctl(ctx->fd, VM_PPTDEV_MSI, &pptmsi));
}
/*
 * Fetch the statistics buffer for 'vcpu'.  Returns a pointer into static
 * storage (not MT-safe) or NULL on ioctl failure; optionally reports the
 * entry count and sample time through the out parameters.
 */
uint64_t *
vm_get_stats(struct vmctx *ctx, int vcpu, struct timeval *ret_tv,
    int *ret_entries)
{
    static struct vm_stats vmstats;

    vmstats.cpuid = vcpu;
    if (ioctl(ctx->fd, VM_STATS, &vmstats) != 0)
        return (NULL);
    if (ret_entries)
        *ret_entries = vmstats.num_entries;
    if (ret_tv)
        *ret_tv = vmstats.tv;
    return (vmstats.statbuf);
}
/*
 * Return the human-readable description for statistic 'index', or NULL if
 * the ioctl fails.  The returned pointer refers to static storage (not
 * MT-safe).  The unused local 'error' from the original was removed.
 */
const char *
vm_get_stat_desc(struct vmctx *ctx, int index)
{
    static struct vm_stat_desc statdesc;

    statdesc.index = index;
    if (ioctl(ctx->fd, VM_STAT_DESC, &statdesc) == 0)
        return (statdesc.desc);
    else
        return (NULL);
}
/*
* From Intel Vol 3a:
* Table 9-1. IA-32 Processor States Following Power-up, Reset or INIT
*/
int
vcpu_reset(struct vmctx *vmctx, int vcpu)
{
int error;
uint64_t rflags, rip, cr0, cr4, zero, desc_base, rdx;
uint32_t desc_access, desc_limit;
uint16_t sel;
zero = 0;
rflags = 0x2;
error = vm_set_register(vmctx, vcpu, VM_REG_GUEST_RFLAGS, rflags);
if (error)
goto done;
rip = 0xfff0;
if ((error = vm_set_register(vmctx, vcpu, VM_REG_GUEST_RIP, rip)) != 0)
goto done;
cr0 = CR0_NE;
if ((error = vm_set_register(vmctx, vcpu, VM_REG_GUEST_CR0, cr0)) != 0)
goto done;
if ((error = vm_set_register(vmctx, vcpu, VM_REG_GUEST_CR3, zero)) != 0)
goto done;
cr4 = CR4_VMXE;
if ((error = vm_set_register(vmctx, vcpu, VM_REG_GUEST_CR4, cr4)) != 0)
goto done;
/*
* CS: present, r/w, accessed, 16-bit, byte granularity, usable
*/
desc_base = 0xffff0000;
desc_limit = 0xffff;
desc_access = 0x0093;
error = vm_set_desc(vmctx, vcpu, VM_REG_GUEST_CS,
desc_base, desc_limit, desc_access);
if (error)
goto done;
sel = 0xf000;
if ((error = vm_set_register(vmctx, vcpu, VM_REG_GUEST_CS, sel)) != 0)
goto done;
/*
* SS,DS,ES,FS,GS: present, r/w, accessed, 16-bit, byte granularity
*/
desc_base = 0;
desc_limit = 0xffff;
desc_access = 0x0093;
error = vm_set_desc(vmctx, vcpu, VM_REG_GUEST_SS,
desc_base, desc_limit, desc_access);
if (error)
goto done;
error = vm_set_desc(vmctx, vcpu, VM_REG_GUEST_DS,
desc_base, desc_limit, desc_access);
if (error)
goto done;
error = vm_set_desc(vmctx, vcpu, VM_REG_GUEST_ES,
desc_base, desc_limit, desc_access);
if (error)
goto done;
error = vm_set_desc(vmctx, vcpu, VM_REG_GUEST_FS,
desc_base, desc_limit, desc_access);
if (error)
goto done;
error = vm_set_desc(vmctx, vcpu, VM_REG_GUEST_GS,
desc_base, desc_limit, desc_access);
if (error)
goto done;
sel = 0;
if ((error = vm_set_register(vmctx, vcpu, VM_REG_GUEST_SS, sel)) != 0)
goto done;
if ((error = vm_set_register(vmctx, vcpu, VM_REG_GUEST_DS, sel)) != 0)
goto done;
if ((error = vm_set_register(vmctx, vcpu, VM_REG_GUEST_ES, sel)) != 0)
goto done;
if ((error = vm_set_register(vmctx, vcpu, VM_REG_GUEST_FS, sel)) != 0)
goto done;
if ((error = vm_set_register(vmctx, vcpu, VM_REG_GUEST_GS, sel)) != 0)
goto done;
/* General purpose registers */
rdx = 0xf00;
if ((error = vm_set_register(vmctx, vcpu, VM_REG_GUEST_RAX, zero)) != 0)
goto done;
if ((error = vm_set_register(vmctx, vcpu, VM_REG_GUEST_RBX, zero)) != 0)
goto done;
if ((error = vm_set_register(vmctx, vcpu, VM_REG_GUEST_RCX, zero)) != 0)
goto done;
if ((error = vm_set_register(vmctx, vcpu, VM_REG_GUEST_RDX, rdx)) != 0)
goto done;
if ((error = vm_set_register(vmctx, vcpu, VM_REG_GUEST_RSI, zero)) != 0)
goto done;
if ((error = vm_set_register(vmctx, vcpu, VM_REG_GUEST_RDI, zero)) != 0)
goto done;
if ((error = vm_set_register(vmctx, vcpu, VM_REG_GUEST_RBP, zero)) != 0)
goto done;
if ((error = vm_set_register(vmctx, vcpu, VM_REG_GUEST_RSP, zero)) != 0)
goto done;
/* GDTR, IDTR */
desc_base = 0;
desc_limit = 0xffff;
desc_access = 0;
error = vm_set_desc(vmctx, vcpu, VM_REG_GUEST_GDTR,
desc_base, desc_limit, desc_access);
if (error != 0)
goto done;
error = vm_set_desc(vmctx, vcpu, VM_REG_GUEST_IDTR,
desc_base, desc_limit, desc_access);
if (error != 0)
goto done;
/* TR */
desc_base = 0;
desc_limit = 0xffff;
desc_access = 0x0000008b;
error = vm_set_desc(vmctx, vcpu, VM_REG_GUEST_TR, 0, 0, desc_access);
if (error)
goto done;
sel = 0;
if ((error = vm_set_register(vmctx, vcpu, VM_REG_GUEST_TR, sel)) != 0)
goto done;
/* LDTR */
desc_base = 0;
desc_limit = 0xffff;
desc_access = 0x00000082;
error = vm_set_desc(vmctx, vcpu, VM_REG_GUEST_LDTR, desc_base,
desc_limit, desc_access);
if (error)
goto done;
sel = 0;
if ((error = vm_set_register(vmctx, vcpu, VM_REG_GUEST_LDTR, 0)) != 0)
goto done;
/* XXX cr2, debug registers */
error = 0;
done:
return (error);
}

98
lib/libvmmapi/vmmapi.h Normal file
View File

@ -0,0 +1,98 @@
/*-
* Copyright (c) 2011 NetApp, Inc.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* $FreeBSD$
*/
#ifndef _VMMAPI_H_    /* NOTE(review): leading-underscore guard names are reserved identifiers */
#define _VMMAPI_H_

/*
 * Public interface to libvmmapi: thin userland wrappers around the
 * vmm.ko /dev/vmm chardev ioctls plus MP-table and register-setup
 * helpers.  Requires machine/vmm.h (vm_paddr_t, struct vm_exit, enums)
 * to be included first.
 */

struct vmctx;    /* opaque handle returned by vm_open() */

/* VM lifecycle. */
int vm_create(const char *name);
struct vmctx *vm_open(const char *name);
void vm_destroy(struct vmctx *ctx);

/* Guest memory management. */
int vm_get_memory_seg(struct vmctx *ctx, vm_paddr_t gpa,
 vm_paddr_t *ret_hpa, size_t *ret_len);
/*
 * Create a memory segment of 'len' bytes in the guest physical address space
 * at offset 'gpa'.
 *
 * If 'mapaddr' is not NULL then this region is mmap'ed into the address
 * space of the calling process. If there is an mmap error then *mapaddr
 * will be set to MAP_FAILED.
 */
int vm_setup_memory(struct vmctx *ctx, vm_paddr_t gpa, size_t len,
 char **mapaddr);
char * vm_map_memory(struct vmctx *ctx, vm_paddr_t gpa, size_t len);

/* Per-vcpu register and segment-descriptor access. */
int vm_set_desc(struct vmctx *ctx, int vcpu, int reg,
 uint64_t base, uint32_t limit, uint32_t access);
int vm_get_desc(struct vmctx *ctx, int vcpu, int reg,
 uint64_t *base, uint32_t *limit, uint32_t *access);
int vm_set_register(struct vmctx *ctx, int vcpu, int reg, uint64_t val);
int vm_get_register(struct vmctx *ctx, int vcpu, int reg, uint64_t *retval);

/* Pin a vcpu to a host cpu. */
int vm_get_pinning(struct vmctx *ctx, int vcpu, int *host_cpuid);
int vm_set_pinning(struct vmctx *ctx, int vcpu, int host_cpuid);

/* Run a vcpu until the next exit; exit info is returned in *ret_vmexit. */
int vm_run(struct vmctx *ctx, int vcpu, uint64_t rip,
 struct vm_exit *ret_vmexit);

/* Build guest MP tables (see mptable.c). */
int vm_build_tables(struct vmctx *ctxt, int ncpus, void *oemtbl,
 int oemtblsz);

/* Event/interrupt injection. */
int vm_inject_event(struct vmctx *ctx, int vcpu, enum vm_event_type type,
 int vector);
int vm_inject_event2(struct vmctx *ctx, int vcpu, enum vm_event_type type,
 int vector, int error_code);
int vm_lapic_irq(struct vmctx *ctx, int vcpu, int vector);
int vm_inject_nmi(struct vmctx *ctx, int vcpu);

/* VM capability control; name2type returns -1 for unknown names. */
int vm_capability_name2type(const char *capname);
int vm_get_capability(struct vmctx *ctx, int vcpu, enum vm_cap_type cap,
 int *retval);
int vm_set_capability(struct vmctx *ctx, int vcpu, enum vm_cap_type cap,
 int val);

/* PCI passthrough device management. */
int vm_assign_pptdev(struct vmctx *ctx, int bus, int slot, int func);
int vm_unassign_pptdev(struct vmctx *ctx, int bus, int slot, int func);
int vm_map_pptdev_mmio(struct vmctx *ctx, int bus, int slot, int func,
 vm_paddr_t gpa, size_t len, vm_paddr_t hpa);
int vm_setup_msi(struct vmctx *ctx, int vcpu, int bus, int slot, int func,
 int dest, int vector, int numvec);

/*
 * Return a pointer to the statistics buffer. Note that this is not MT-safe.
 */
uint64_t *vm_get_stats(struct vmctx *ctx, int vcpu, struct timeval *ret_tv,
 int *ret_entries);
const char *vm_get_stat_desc(struct vmctx *ctx, int index);

/* Reset vcpu register state */
int vcpu_reset(struct vmctx *ctx, int vcpu);

/*
 * FreeBSD specific APIs
 */
int vm_setup_freebsd_registers(struct vmctx *ctx, int vcpu,
 uint64_t rip, uint64_t cr3, uint64_t gdtbase,
 uint64_t rsp);
void vm_setup_freebsd_gdt(uint64_t *gdtr);
#endif /* _VMMAPI_H_ */

View File

@ -0,0 +1,187 @@
/*-
* Copyright (c) 2011 NetApp, Inc.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* $FreeBSD$
*/
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
#include <sys/types.h>
#include <machine/specialreg.h>
#include <machine/segments.h>
#include <machine/vmm.h>
#include "vmmapi.h"
#ifndef CR4_VMXE
#define CR4_VMXE (1UL << 13)
#endif
#define DESC_UNUSABLE 0x00010000
#define GUEST_NULL_SEL 0
#define GUEST_CODE_SEL 1
#define GUEST_DATA_SEL 2
#define GUEST_GDTR_LIMIT (3 * 8 - 1)
/*
 * Populate a minimal guest GDT: a null descriptor followed by 64-bit
 * code and data descriptors for booting FreeBSD in long mode.
 */
void
vm_setup_freebsd_gdt(uint64_t *gdtr)
{
	/* Code: present, DPL 0, long-mode (L) bit set. */
	const uint64_t code_desc = 0x0020980000000000;
	/* Data: present, DPL 0. */
	const uint64_t data_desc = 0x0000900000000000;

	gdtr[GUEST_NULL_SEL] = 0;
	gdtr[GUEST_CODE_SEL] = code_desc;
	gdtr[GUEST_DATA_SEL] = data_desc;
}
/*
 * Setup the 'vcpu' register set such that it will begin execution at
 * 'rip' in long mode.
 *
 * 'cr3' is the guest physical address of the page table root,
 * 'gdtbase' the guest physical address of a GDT laid out by
 * vm_setup_freebsd_gdt(), and 'rsp' the initial stack pointer.
 *
 * Returns 0 on success, or the error of the first vm_set_register() /
 * vm_set_desc() call that fails (guest state is then partially set up).
 */
int
vm_setup_freebsd_registers(struct vmctx *vmctx, int vcpu,
			   uint64_t rip, uint64_t cr3, uint64_t gdtbase,
			   uint64_t rsp)
{
	int error;
	uint64_t cr0, cr4, efer, rflags, desc_base;
	uint32_t desc_access, desc_limit;
	uint16_t gsel;

	/* Protected mode with paging; native x87 error reporting. */
	cr0 = CR0_PE | CR0_PG | CR0_NE;
	if ((error = vm_set_register(vmctx, vcpu, VM_REG_GUEST_CR0, cr0)) != 0)
		goto done;

	/* PAE is mandatory in long mode; VMXE because VT-x is in use. */
	cr4 = CR4_PAE | CR4_VMXE;
	if ((error = vm_set_register(vmctx, vcpu, VM_REG_GUEST_CR4, cr4)) != 0)
		goto done;

	/*
	 * Long mode enabled and active. (Consistency fix: this check now
	 * uses the same explicit '!= 0' form as its siblings.)
	 */
	efer = EFER_LME | EFER_LMA;
	if ((error = vm_set_register(vmctx, vcpu, VM_REG_GUEST_EFER,
	    efer)) != 0)
		goto done;

	/* Only the architecturally reserved always-one bit is set. */
	rflags = 0x2;
	if ((error = vm_set_register(vmctx, vcpu, VM_REG_GUEST_RFLAGS,
	    rflags)) != 0)
		goto done;

	/* 64-bit code segment: present, DPL 0, long-mode bit. */
	desc_base = 0;
	desc_limit = 0;
	desc_access = 0x0000209B;
	error = vm_set_desc(vmctx, vcpu, VM_REG_GUEST_CS,
			    desc_base, desc_limit, desc_access);
	if (error != 0)
		goto done;

	/* Flat data segments: present, DPL 0, read/write. */
	desc_access = 0x00000093;
	error = vm_set_desc(vmctx, vcpu, VM_REG_GUEST_DS,
			    desc_base, desc_limit, desc_access);
	if (error != 0)
		goto done;
	error = vm_set_desc(vmctx, vcpu, VM_REG_GUEST_ES,
			    desc_base, desc_limit, desc_access);
	if (error != 0)
		goto done;
	error = vm_set_desc(vmctx, vcpu, VM_REG_GUEST_FS,
			    desc_base, desc_limit, desc_access);
	if (error != 0)
		goto done;
	error = vm_set_desc(vmctx, vcpu, VM_REG_GUEST_GS,
			    desc_base, desc_limit, desc_access);
	if (error != 0)
		goto done;
	error = vm_set_desc(vmctx, vcpu, VM_REG_GUEST_SS,
			    desc_base, desc_limit, desc_access);
	if (error != 0)
		goto done;

	/*
	 * XXX TR is pointing to null selector even though we set the
	 * TSS segment to be usable with a base address and limit of 0.
	 */
	desc_access = 0x0000008b;	/* present, busy TSS */
	error = vm_set_desc(vmctx, vcpu, VM_REG_GUEST_TR, 0, 0, desc_access);
	if (error != 0)
		goto done;

	error = vm_set_desc(vmctx, vcpu, VM_REG_GUEST_LDTR, 0, 0,
			    DESC_UNUSABLE);
	if (error != 0)
		goto done;

	/* Load the selectors that correspond to the descriptors above. */
	gsel = GSEL(GUEST_CODE_SEL, SEL_KPL);
	if ((error = vm_set_register(vmctx, vcpu, VM_REG_GUEST_CS, gsel)) != 0)
		goto done;

	gsel = GSEL(GUEST_DATA_SEL, SEL_KPL);
	if ((error = vm_set_register(vmctx, vcpu, VM_REG_GUEST_DS, gsel)) != 0)
		goto done;
	if ((error = vm_set_register(vmctx, vcpu, VM_REG_GUEST_ES, gsel)) != 0)
		goto done;
	if ((error = vm_set_register(vmctx, vcpu, VM_REG_GUEST_FS, gsel)) != 0)
		goto done;
	if ((error = vm_set_register(vmctx, vcpu, VM_REG_GUEST_GS, gsel)) != 0)
		goto done;
	if ((error = vm_set_register(vmctx, vcpu, VM_REG_GUEST_SS, gsel)) != 0)
		goto done;

	/* XXX TR is pointing to the null selector */
	if ((error = vm_set_register(vmctx, vcpu, VM_REG_GUEST_TR, 0)) != 0)
		goto done;

	/* LDTR is pointing to the null selector */
	if ((error = vm_set_register(vmctx, vcpu, VM_REG_GUEST_LDTR, 0)) != 0)
		goto done;

	/* entry point */
	if ((error = vm_set_register(vmctx, vcpu, VM_REG_GUEST_RIP, rip)) != 0)
		goto done;

	/* page table base */
	if ((error = vm_set_register(vmctx, vcpu, VM_REG_GUEST_CR3, cr3)) != 0)
		goto done;

	/* GDTR covers the 3 descriptors laid down by vm_setup_freebsd_gdt(). */
	desc_base = gdtbase;
	desc_limit = GUEST_GDTR_LIMIT;
	error = vm_set_desc(vmctx, vcpu, VM_REG_GUEST_GDTR,
			    desc_base, desc_limit, 0);
	if (error != 0)
		goto done;

	if ((error = vm_set_register(vmctx, vcpu, VM_REG_GUEST_RSP, rsp)) != 0)
		goto done;

	error = 0;
done:
	return (error);
}

View File

@ -155,6 +155,7 @@ LIBUSB?= ${DESTDIR}${LIBDIR}/libusb.a
LIBUTIL?= ${DESTDIR}${LIBDIR}/libutil.a
LIBUUTIL?= ${DESTDIR}${LIBDIR}/libuutil.a
LIBVGL?= ${DESTDIR}${LIBDIR}/libvgl.a
LIBVMMAPI?= ${DESTDIR}${LIBDIR}/libvmmapi.a
LIBWRAP?= ${DESTDIR}${LIBDIR}/libwrap.a
LIBXPG4?= ${DESTDIR}${LIBDIR}/libxpg4.a
LIBY?= ${DESTDIR}${LIBDIR}/liby.a

View File

@ -297,6 +297,7 @@
*/
#define APICBASE_RESERVED 0x000006ff
#define APICBASE_BSP 0x00000100
#define APICBASE_X2APIC 0x00000400
#define APICBASE_ENABLED 0x00000800
#define APICBASE_ADDRESS 0xfffff000

268
sys/amd64/include/vmm.h Normal file
View File

@ -0,0 +1,268 @@
/*-
* Copyright (c) 2011 NetApp, Inc.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* $FreeBSD: vmm.h 482 2011-05-09 21:22:43Z grehan $
*/
#ifndef _VMM_H_
#define _VMM_H_
#ifdef _KERNEL
#define VM_MAX_NAMELEN 32 /* maximum length of a vm name */
struct vm;
struct vm_memory_segment;
struct seg_desc;
struct vm_exit;
struct vm_run;
struct vlapic;
/* Module-wide init/teardown for a hardware-assist backend. */
typedef int (*vmm_init_func_t)(void);
typedef int (*vmm_cleanup_func_t)(void);
typedef void * (*vmi_init_func_t)(struct vm *vm); /* instance specific apis */
typedef int (*vmi_run_func_t)(void *vmi, int vcpu, register_t rip,
    struct vm_exit *vmexit);
typedef void (*vmi_cleanup_func_t)(void *vmi);
/* Map guest physical 'gpa' to host physical 'hpa' in the nested page tables. */
typedef int (*vmi_mmap_func_t)(void *vmi, vm_paddr_t gpa, vm_paddr_t hpa,
    size_t length, vm_memattr_t attr,
    int prot, boolean_t superpages_ok);
/* Per-vcpu register and segment descriptor accessors. */
typedef int (*vmi_get_register_t)(void *vmi, int vcpu, int num,
    uint64_t *retval);
typedef int (*vmi_set_register_t)(void *vmi, int vcpu, int num,
    uint64_t val);
typedef int (*vmi_get_desc_t)(void *vmi, int vcpu, int num,
    struct seg_desc *desc);
typedef int (*vmi_set_desc_t)(void *vmi, int vcpu, int num,
    struct seg_desc *desc);
/* Event/NMI injection and optional-capability accessors. */
typedef int (*vmi_inject_event_t)(void *vmi, int vcpu,
    int type, int vector,
    uint32_t code, int code_valid);
typedef int (*vmi_inject_nmi_t)(void *vmi, int vcpu);
typedef int (*vmi_get_cap_t)(void *vmi, int vcpu, int num, int *retval);
typedef int (*vmi_set_cap_t)(void *vmi, int vcpu, int num, int val);
/*
 * Operations vector implemented by each hardware-assist backend
 * (vmm_ops_intel for VT-x, vmm_ops_amd for SVM).
 */
struct vmm_ops {
	vmm_init_func_t init; /* module wide initialization */
	vmm_cleanup_func_t cleanup;
	vmi_init_func_t vminit; /* vm-specific initialization */
	vmi_run_func_t vmrun;
	vmi_cleanup_func_t vmcleanup;
	vmi_mmap_func_t vmmmap;
	vmi_get_register_t vmgetreg;
	vmi_set_register_t vmsetreg;
	vmi_get_desc_t vmgetdesc;
	vmi_set_desc_t vmsetdesc;
	vmi_inject_event_t vminject;
	vmi_inject_nmi_t vmnmi;
	vmi_get_cap_t vmgetcap;
	vmi_set_cap_t vmsetcap;
};
extern struct vmm_ops vmm_ops_intel;
extern struct vmm_ops vmm_ops_amd;
/* VM lifecycle. */
struct vm *vm_create(const char *name);
void vm_destroy(struct vm *vm);
const char *vm_name(struct vm *vm);
/* Guest physical memory management and translation. */
int vm_malloc(struct vm *vm, vm_paddr_t gpa, size_t len, vm_paddr_t *ret_hpa);
int vm_map_mmio(struct vm *vm, vm_paddr_t gpa, size_t len, vm_paddr_t hpa);
int vm_unmap_mmio(struct vm *vm, vm_paddr_t gpa, size_t len);
vm_paddr_t vm_gpa2hpa(struct vm *vm, vm_paddr_t gpa, size_t size);
int vm_gpabase2memseg(struct vm *vm, vm_paddr_t gpabase,
    struct vm_memory_segment *seg);
/* Per-vcpu state accessors. */
int vm_get_register(struct vm *vm, int vcpu, int reg, uint64_t *retval);
int vm_set_register(struct vm *vm, int vcpu, int reg, uint64_t val);
int vm_get_seg_desc(struct vm *vm, int vcpu, int reg,
    struct seg_desc *ret_desc);
int vm_set_seg_desc(struct vm *vm, int vcpu, int reg,
    struct seg_desc *desc);
int vm_get_pinning(struct vm *vm, int vcpu, int *cpuid);
int vm_set_pinning(struct vm *vm, int vcpu, int cpuid);
/* Execution and event injection. */
int vm_run(struct vm *vm, struct vm_run *vmrun);
int vm_inject_event(struct vm *vm, int vcpu, int type,
    int vector, uint32_t error_code, int error_code_valid);
int vm_inject_nmi(struct vm *vm, int vcpu);
uint64_t *vm_guest_msrs(struct vm *vm, int cpu);
struct vlapic *vm_lapic(struct vm *vm, int cpu);
int vm_get_capability(struct vm *vm, int vcpu, int type, int *val);
int vm_set_capability(struct vm *vm, int vcpu, int type, int val);
void vm_activate_cpu(struct vm *vm, int vcpu);
cpumask_t vm_active_cpus(struct vm *vm);
/*
 * Return 1 if device indicated by bus/slot/func is supposed to be a
 * pci passthrough device.
 *
 * Return 0 otherwise.
 */
int vmm_is_pptdev(int bus, int slot, int func);
void *vm_iommu_domain(struct vm *vm);
/* Per-vcpu run state tracked by vm_set_run_state()/vm_get_run_state(). */
#define VCPU_STOPPED 0
#define VCPU_RUNNING 1
void vm_set_run_state(struct vm *vm, int vcpu, int running);
int vm_get_run_state(struct vm *vm, int vcpu, int *hostcpu);
void *vcpu_stats(struct vm *vm, int vcpu);
/*
 * Return non-zero if 'vcpu' is currently executing; 'hostcpu' is
 * passed through to vm_get_run_state().
 */
static int __inline
vcpu_is_running(struct vm *vm, int vcpu, int *hostcpu)
{
	int state;

	state = vm_get_run_state(vm, vcpu, hostcpu);
	return (state == VCPU_RUNNING);
}
static cpumask_t __inline
vcpu_mask(int vcpuid)
{
return ((cpumask_t)1 << vcpuid);
}
#endif /* KERNEL */
#define VM_MAXCPU 8 /* maximum virtual cpus */
/*
 * Identifiers for events that can be injected into the VM
 */
enum vm_event_type {
	VM_EVENT_NONE,
	VM_HW_INTR,
	VM_NMI,
	VM_HW_EXCEPTION,
	VM_SW_INTR,
	VM_PRIV_SW_EXCEPTION,
	VM_SW_EXCEPTION,
	VM_EVENT_MAX
};
/*
 * Identifiers for architecturally defined registers.
 */
enum vm_reg_name {
	VM_REG_GUEST_RAX,
	VM_REG_GUEST_RBX,
	VM_REG_GUEST_RCX,
	VM_REG_GUEST_RDX,
	VM_REG_GUEST_RSI,
	VM_REG_GUEST_RDI,
	VM_REG_GUEST_RBP,
	VM_REG_GUEST_R8,
	VM_REG_GUEST_R9,
	VM_REG_GUEST_R10,
	VM_REG_GUEST_R11,
	VM_REG_GUEST_R12,
	VM_REG_GUEST_R13,
	VM_REG_GUEST_R14,
	VM_REG_GUEST_R15,
	VM_REG_GUEST_CR0,
	VM_REG_GUEST_CR3,
	VM_REG_GUEST_CR4,
	VM_REG_GUEST_DR7,
	VM_REG_GUEST_RSP,
	VM_REG_GUEST_RIP,
	VM_REG_GUEST_RFLAGS,
	VM_REG_GUEST_ES,
	VM_REG_GUEST_CS,
	VM_REG_GUEST_SS,
	VM_REG_GUEST_DS,
	VM_REG_GUEST_FS,
	VM_REG_GUEST_GS,
	VM_REG_GUEST_LDTR,
	VM_REG_GUEST_TR,
	VM_REG_GUEST_IDTR,
	VM_REG_GUEST_GDTR,
	VM_REG_GUEST_EFER,
	VM_REG_LAST
};
/*
 * Identifiers for optional vmm capabilities
 */
enum vm_cap_type {
	VM_CAP_HALT_EXIT,
	VM_CAP_MTRAP_EXIT,
	VM_CAP_PAUSE_EXIT,
	VM_CAP_UNRESTRICTED_GUEST,
	VM_CAP_MAX
};
/*
 * The 'access' field has the format specified in Table 21-2 of the Intel
 * Architecture Manual vol 3b.
 *
 * XXX The contents of the 'access' field are architecturally defined except
 * bit 16 - Segment Unusable.
 */
struct seg_desc {
	uint64_t base;
	uint32_t limit;
	uint32_t access;
};
/* Reason for a return from vm_run(); selects the union arm in vm_exit. */
enum vm_exitcode {
	VM_EXITCODE_INOUT,
	VM_EXITCODE_VMX,
	VM_EXITCODE_BOGUS,
	VM_EXITCODE_RDMSR,
	VM_EXITCODE_WRMSR,
	VM_EXITCODE_HLT,
	VM_EXITCODE_MTRAP,
	VM_EXITCODE_PAUSE,
	VM_EXITCODE_MAX,
};
/* Exit information returned to the caller of vm_run(). */
struct vm_exit {
	enum vm_exitcode exitcode;
	int inst_length; /* 0 means unknown */
	uint64_t rip;
	union {
		/* Valid for VM_EXITCODE_INOUT. */
		struct {
			uint16_t bytes:3; /* 1 or 2 or 4 */
			uint16_t in:1; /* out is 0, in is 1 */
			uint16_t string:1;
			uint16_t rep:1;
			uint16_t port;
			uint32_t eax; /* valid for out */
		} inout;
		/*
		 * VMX specific payload. Used when there is no "better"
		 * exitcode to represent the VM-exit.
		 */
		struct {
			int error; /* vmx inst error */
			uint32_t exit_reason;
			uint64_t exit_qualification;
		} vmx;
		/* Valid for VM_EXITCODE_RDMSR/VM_EXITCODE_WRMSR. */
		struct {
			uint32_t code; /* ecx value */
			uint64_t wval;
		} msr;
	} u;
};
#endif /* _VMM_H_ */

191
sys/amd64/include/vmm_dev.h Normal file
View File

@ -0,0 +1,191 @@
/*-
* Copyright (c) 2011 NetApp, Inc.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* $FreeBSD: vmm_dev.h 482 2011-05-09 21:22:43Z grehan $
*/
#ifndef _VMM_DEV_H_
#define _VMM_DEV_H_
#ifdef _KERNEL
void vmmdev_init(void);
void vmmdev_cleanup(void);
#endif
/*
 * Argument structures for the /dev/vmm ioctls defined at the bottom of
 * this header. Each pairs with one VM_* ioctl below.
 */
struct vm_memory_segment {
	vm_paddr_t hpa; /* out */
	vm_paddr_t gpa; /* in */
	size_t len; /* in */
};
struct vm_register {
	int cpuid;
	int regnum; /* enum vm_reg_name */
	uint64_t regval;
};
struct vm_seg_desc { /* data or code segment */
	int cpuid;
	int regnum; /* enum vm_reg_name */
	struct seg_desc desc;
};
struct vm_pin {
	int vm_cpuid;
	int host_cpuid; /* -1 to unpin */
};
struct vm_run {
	int cpuid;
	uint64_t rip; /* start running here */
	struct vm_exit vm_exit;
};
/* Argument for VM_INJECT_EVENT. */
struct vm_event {
	int cpuid;
	enum vm_event_type type;
	int vector;
	uint32_t error_code;
	int error_code_valid;
};
struct vm_lapic_irq {
	int cpuid;
	int vector;
};
struct vm_capability {
	int cpuid;
	enum vm_cap_type captype;
	int capval;
	int allcpus;
};
/* PCI passthrough device identification and mapping. */
struct vm_pptdev {
	int bus;
	int slot;
	int func;
};
struct vm_pptdev_mmio {
	int bus;
	int slot;
	int func;
	vm_paddr_t gpa;
	vm_paddr_t hpa;
	size_t len;
};
struct vm_pptdev_msi {
	int vcpu;
	int bus;
	int slot;
	int func;
	int numvec; /* 0 means disabled */
	int vector;
	int destcpu;
};
struct vm_nmi {
	int cpuid;
};
#define MAX_VM_STATS 64
struct vm_stats {
	int cpuid; /* in */
	int num_entries; /* out */
	struct timeval tv;
	uint64_t statbuf[MAX_VM_STATS];
};
struct vm_stat_desc {
	int index; /* in */
	char desc[128]; /* out */
};
/*
 * Ioctl command numbers; each is used by exactly one VM_* define below,
 * so keep the two lists in the same order.
 */
enum {
	IOCNUM_RUN,
	IOCNUM_SET_PINNING,
	IOCNUM_GET_PINNING,
	IOCNUM_MAP_MEMORY,
	IOCNUM_GET_MEMORY_SEG,
	IOCNUM_SET_REGISTER,
	IOCNUM_GET_REGISTER,
	IOCNUM_SET_SEGMENT_DESCRIPTOR,
	IOCNUM_GET_SEGMENT_DESCRIPTOR,
	IOCNUM_INJECT_EVENT,
	IOCNUM_LAPIC_IRQ,
	IOCNUM_SET_CAPABILITY,
	IOCNUM_GET_CAPABILITY,
	IOCNUM_BIND_PPTDEV,
	IOCNUM_UNBIND_PPTDEV,
	IOCNUM_MAP_PPTDEV_MMIO,
	IOCNUM_PPTDEV_MSI,
	IOCNUM_INJECT_NMI,
	IOCNUM_VM_STATS,
	IOCNUM_VM_STAT_DESC,
};
#define VM_RUN \
	_IOWR('v', IOCNUM_RUN, struct vm_run)
#define VM_SET_PINNING \
	_IOW('v', IOCNUM_SET_PINNING, struct vm_pin)
#define VM_GET_PINNING \
	_IOWR('v', IOCNUM_GET_PINNING, struct vm_pin)
#define VM_MAP_MEMORY \
	_IOWR('v', IOCNUM_MAP_MEMORY, struct vm_memory_segment)
#define VM_GET_MEMORY_SEG \
	_IOWR('v', IOCNUM_GET_MEMORY_SEG, struct vm_memory_segment)
#define VM_SET_REGISTER \
	_IOW('v', IOCNUM_SET_REGISTER, struct vm_register)
#define VM_GET_REGISTER \
	_IOWR('v', IOCNUM_GET_REGISTER, struct vm_register)
#define VM_SET_SEGMENT_DESCRIPTOR \
	_IOW('v', IOCNUM_SET_SEGMENT_DESCRIPTOR, struct vm_seg_desc)
#define VM_GET_SEGMENT_DESCRIPTOR \
	_IOWR('v', IOCNUM_GET_SEGMENT_DESCRIPTOR, struct vm_seg_desc)
#define VM_INJECT_EVENT \
	_IOW('v', IOCNUM_INJECT_EVENT, struct vm_event)
#define VM_LAPIC_IRQ \
	_IOW('v', IOCNUM_LAPIC_IRQ, struct vm_lapic_irq)
#define VM_SET_CAPABILITY \
	_IOW('v', IOCNUM_SET_CAPABILITY, struct vm_capability)
#define VM_GET_CAPABILITY \
	_IOWR('v', IOCNUM_GET_CAPABILITY, struct vm_capability)
#define VM_BIND_PPTDEV \
	_IOW('v', IOCNUM_BIND_PPTDEV, struct vm_pptdev)
#define VM_UNBIND_PPTDEV \
	_IOW('v', IOCNUM_UNBIND_PPTDEV, struct vm_pptdev)
#define VM_MAP_PPTDEV_MMIO \
	_IOW('v', IOCNUM_MAP_PPTDEV_MMIO, struct vm_pptdev_mmio)
#define VM_PPTDEV_MSI \
	_IOW('v', IOCNUM_PPTDEV_MSI, struct vm_pptdev_msi)
#define VM_INJECT_NMI \
	_IOW('v', IOCNUM_INJECT_NMI, struct vm_nmi)
#define VM_STATS \
	_IOWR('v', IOCNUM_VM_STATS, struct vm_stats)
#define VM_STAT_DESC \
	_IOWR('v', IOCNUM_VM_STAT_DESC, struct vm_stat_desc)
#endif

247
sys/amd64/vmm/amd/amdv.c Normal file
View File

@ -0,0 +1,247 @@
/*-
* Copyright (c) 2011 NetApp, Inc.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* $FreeBSD$
*/
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/errno.h>
#include <machine/vmm.h>
#include "io/iommu.h"
/*
 * AMD SVM support is not implemented yet: every handler below logs a
 * diagnostic and returns an error.
 */
static int
amdv_init(void)
{
	printf("amdv_init: not implemented\n");
	return (ENXIO);
}
static int
amdv_cleanup(void)
{
	printf("amdv_cleanup: not implemented\n");
	return (ENXIO);
}
static void *
amdv_vminit(struct vm *vm)
{
	printf("amdv_vminit: not implemented\n");
	return (NULL);
}
static int
amdv_vmrun(void *arg, int vcpu, register_t rip, struct vm_exit *vmexit)
{
	printf("amdv_vmrun: not implemented\n");
	return (ENXIO);
}
static void
amdv_vmcleanup(void *arg)
{
	printf("amdv_vmcleanup: not implemented\n");
	return;
}
static int
amdv_vmmmap(void *arg, vm_paddr_t gpa, vm_paddr_t hpa, size_t length,
    vm_memattr_t attr, int prot, boolean_t spok)
{
	printf("amdv_vmmmap: not implemented\n");
	return (EINVAL);
}
static int
amdv_getreg(void *arg, int vcpu, int regnum, uint64_t *retval)
{
	printf("amdv_getreg: not implemented\n");
	return (EINVAL);
}
static int
amdv_setreg(void *arg, int vcpu, int regnum, uint64_t val)
{
	printf("amdv_setreg: not implemented\n");
	return (EINVAL);
}
static int
amdv_getdesc(void *vmi, int vcpu, int num, struct seg_desc *desc)
{
	printf("amdv_get_desc: not implemented\n");
	return (EINVAL);
}
/*
 * Stub: setting a guest segment descriptor is not implemented for AMD
 * SVM. (The diagnostic previously said "amdv_get_desc" — a copy/paste
 * slip from the getdesc stub above.)
 */
static int
amdv_setdesc(void *vmi, int vcpu, int num, struct seg_desc *desc)
{
	printf("amdv_setdesc: not implemented\n");
	return (EINVAL);
}
/* Remaining SVM entry points: diagnose and fail with EINVAL. */
static int
amdv_inject_event(void *vmi, int vcpu, int type, int vector,
    uint32_t error_code, int error_code_valid)
{
	printf("amdv_inject_event: not implemented\n");
	return (EINVAL);
}
static int
amdv_nmi(void *arg, int vcpu)
{
	printf("amdv_nmi: not implemented\n");
	return (EINVAL);
}
static int
amdv_getcap(void *arg, int vcpu, int type, int *retval)
{
	printf("amdv_getcap: not implemented\n");
	return (EINVAL);
}
static int
amdv_setcap(void *arg, int vcpu, int type, int val)
{
	printf("amdv_setcap: not implemented\n");
	return (EINVAL);
}
struct vmm_ops vmm_ops_amd = {
amdv_init,
amdv_cleanup,
amdv_vminit,
amdv_vmrun,
amdv_vmcleanup,
amdv_vmmmap,
amdv_getreg,
amdv_setreg,
amdv_getdesc,
amdv_setdesc,
amdv_inject_event,
amdv_nmi,
amdv_getcap,
amdv_setcap
};
/*
 * AMD IOMMU support is likewise unimplemented; these stubs populate
 * iommu_ops_amd (declared in io/iommu.h).
 */
static int
amd_iommu_init(void)
{
	printf("amd_iommu_init: not implemented\n");
	return (ENXIO);
}
static void
amd_iommu_cleanup(void)
{
	printf("amd_iommu_cleanup: not implemented\n");
}
static void
amd_iommu_enable(void)
{
	printf("amd_iommu_enable: not implemented\n");
}
static void
amd_iommu_disable(void)
{
	printf("amd_iommu_disable: not implemented\n");
}
static void *
amd_iommu_create_domain(vm_paddr_t maxaddr)
{
	printf("amd_iommu_create_domain: not implemented\n");
	return (NULL);
}
static void
amd_iommu_destroy_domain(void *domain)
{
	printf("amd_iommu_destroy_domain: not implemented\n");
}
static uint64_t
amd_iommu_create_mapping(void *domain, vm_paddr_t gpa, vm_paddr_t hpa,
    uint64_t len)
{
	printf("amd_iommu_create_mapping: not implemented\n");
	return (0);
}
static void
amd_iommu_add_device(void *domain, int bus, int slot, int func)
{
	printf("amd_iommu_add_device: not implemented\n");
}
static void
amd_iommu_remove_device(void *domain, int bus, int slot, int func)
{
	printf("amd_iommu_remove_device: not implemented\n");
}
/* NOTE: ordering must match the member layout in io/iommu.h. */
struct iommu_ops iommu_ops_amd = {
	amd_iommu_init,
	amd_iommu_cleanup,
	amd_iommu_enable,
	amd_iommu_disable,
	amd_iommu_create_domain,
	amd_iommu_destroy_domain,
	amd_iommu_create_mapping,
	amd_iommu_add_device,
	amd_iommu_remove_device,
};

312
sys/amd64/vmm/intel/ept.c Normal file
View File

@ -0,0 +1,312 @@
/*-
* Copyright (c) 2011 NetApp, Inc.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* $FreeBSD$
*/
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
#include <sys/types.h>
#include <sys/errno.h>
#include <sys/systm.h>
#include <sys/malloc.h>
#include <sys/smp.h>
#include <vm/vm.h>
#include <vm/pmap.h>
#include <machine/param.h>
#include <machine/cpufunc.h>
#include <machine/pmap.h>
#include <machine/vmparam.h>
#include <machine/vmm.h>
#include "vmx_cpufunc.h"
#include "vmx_msr.h"
#include "vmx.h"
#include "ept.h"
/* Capability bits reported in MSR_VMX_EPT_VPID_CAP. */
#define EPT_PWL4(cap) ((cap) & (1UL << 6))
#define EPT_MEMORY_TYPE_WB(cap) ((cap) & (1UL << 14))
#define EPT_PDE_SUPERPAGE(cap) ((cap) & (1UL << 16)) /* 2MB pages */
#define EPT_PDPTE_SUPERPAGE(cap) ((cap) & (1UL << 17)) /* 1GB pages */
#define INVVPID_SUPPORTED(cap) ((cap) & (1UL << 32))
#define INVEPT_SUPPORTED(cap) ((cap) & (1UL << 20))
#define INVVPID_ALL_TYPES_MASK 0xF0000000000UL
#define INVVPID_ALL_TYPES_SUPPORTED(cap) \
	(((cap) & INVVPID_ALL_TYPES_MASK) == INVVPID_ALL_TYPES_MASK)
#define INVEPT_ALL_TYPES_MASK 0x6000000UL
#define INVEPT_ALL_TYPES_SUPPORTED(cap) \
	(((cap) & INVEPT_ALL_TYPES_MASK) == INVEPT_ALL_TYPES_MASK)
/* EPT paging-structure entry bits. */
#define EPT_PG_RD (1 << 0)
#define EPT_PG_WR (1 << 1)
#define EPT_PG_EX (1 << 2)
#define EPT_PG_MEMORY_TYPE(x) ((x) << 3)
#define EPT_PG_IGNORE_PAT (1 << 6)
#define EPT_PG_SUPERPAGE (1 << 7)
#define EPT_ADDR_MASK ((uint64_t)-1 << 12)
MALLOC_DECLARE(M_VMX);
/* Bit n set => mappings of size 2^n bytes are usable (set by ept_init). */
static uint64_t page_sizes_mask;
/*
 * Verify that the processor's EPT/VPID capabilities are sufficient for
 * vmm and record the usable page sizes in 'page_sizes_mask'.
 *
 * Returns 0 on success, EINVAL if any required capability is missing.
 */
int
ept_init(void)
{
	uint64_t cap;
	int shift;

	cap = rdmsr(MSR_VMX_EPT_VPID_CAP);

	/*
	 * Verify that:
	 * - page walk length is 4 steps
	 * - extended page tables can be laid out in write-back memory
	 * - invvpid instruction with all possible types is supported
	 * - invept instruction with all possible types is supported
	 */
	if (!EPT_PWL4(cap) || !EPT_MEMORY_TYPE_WB(cap) ||
	    !INVVPID_SUPPORTED(cap) || !INVVPID_ALL_TYPES_SUPPORTED(cap) ||
	    !INVEPT_SUPPORTED(cap) || !INVEPT_ALL_TYPES_SUPPORTED(cap))
		return (EINVAL);

	/* 4KB pages are always usable. */
	shift = PAGE_SHIFT;
	page_sizes_mask = 1UL << shift;

	/* 2MB superpages, if the processor supports leaf PDEs. */
	shift += 9;
	if (EPT_PDE_SUPERPAGE(cap))
		page_sizes_mask |= 1UL << shift;

	/* 1GB superpages, if the processor supports leaf PDPTEs. */
	shift += 9;
	if (EPT_PDPTE_SUPERPAGE(cap))
		page_sizes_mask |= 1UL << shift;

	return (0);
}
/*
 * Create a single mapping (the largest one that fits) covering the
 * start of the region [gpa, gpa + length) in the EPT rooted at 'ptp',
 * and return the number of bytes mapped (1 << ptpshift). The caller
 * loops until the whole region is covered (see ept_vmmmap()).
 */
static size_t
ept_create_mapping(uint64_t *ptp, vm_paddr_t gpa, vm_paddr_t hpa, size_t length,
    vm_memattr_t attr, vm_prot_t prot, boolean_t spok)
{
	int spshift, ptpshift, ptpindex, nlevels;

	/*
	 * Compute the size of the mapping that we can accommodate.
	 *
	 * This is based on three factors:
	 * - super page sizes supported by the processor
	 * - alignment of the region starting at 'gpa' and 'hpa'
	 * - length of the region 'len'
	 */
	spshift = PAGE_SHIFT;
	if (spok)
		spshift += (EPT_PWLEVELS - 1) * 9;
	while (spshift >= PAGE_SHIFT) {
		uint64_t spsize = 1UL << spshift;
		if ((page_sizes_mask & spsize) != 0 &&
		    (gpa & (spsize - 1)) == 0 &&
		    (hpa & (spsize - 1)) == 0 &&
		    length >= spsize) {
			break;
		}
		spshift -= 9;
	}
	if (spshift < PAGE_SHIFT) {
		panic("Invalid spshift for gpa 0x%016lx, hpa 0x%016lx, "
		      "length 0x%016lx, page_sizes_mask 0x%016lx",
		      gpa, hpa, length, page_sizes_mask);
	}
	/* Walk from the PML4 down to the level chosen above. */
	nlevels = EPT_PWLEVELS;
	while (--nlevels >= 0) {
		ptpshift = PAGE_SHIFT + nlevels * 9;
		ptpindex = (gpa >> ptpshift) & 0x1FF;
		/* We have reached the leaf mapping */
		if (spshift >= ptpshift)
			break;
		/*
		 * We are working on a non-leaf page table page.
		 *
		 * Create the next level page table page if necessary and point
		 * to it from the current page table.
		 */
		if (ptp[ptpindex] == 0) {
			void *nlp = malloc(PAGE_SIZE, M_VMX, M_WAITOK | M_ZERO);
			ptp[ptpindex] = vtophys(nlp);
			ptp[ptpindex] |= EPT_PG_RD | EPT_PG_WR | EPT_PG_EX;
		}
		/* Work our way down to the next level page table page */
		ptp = (uint64_t *)PHYS_TO_DMAP(ptp[ptpindex] & EPT_ADDR_MASK);
	}
	if ((gpa & ((1UL << ptpshift) - 1)) != 0) {
		panic("ept_create_mapping: gpa 0x%016lx and ptpshift %d "
		      "mismatch\n", gpa, ptpshift);
	}
	/* Do the mapping */
	ptp[ptpindex] = hpa;
	/* Apply the access controls */
	if (prot & VM_PROT_READ)
		ptp[ptpindex] |= EPT_PG_RD;
	if (prot & VM_PROT_WRITE)
		ptp[ptpindex] |= EPT_PG_WR;
	if (prot & VM_PROT_EXECUTE)
		ptp[ptpindex] |= EPT_PG_EX;
	/*
	 * XXX should we enforce this memory type by setting the ignore PAT
	 * bit to 1.
	 */
	ptp[ptpindex] |= EPT_PG_MEMORY_TYPE(attr);
	/* A leaf above the 4KB level needs the superpage bit set. */
	if (nlevels > 0)
		ptp[ptpindex] |= EPT_PG_SUPERPAGE;
	return (1UL << ptpshift);
}
/*
 * Recursive teardown of the EPT hierarchy, one helper per paging level.
 * Each helper frees the lower-level page table page referenced by an
 * entry, unless the entry is zero or a superpage leaf (which references
 * guest memory, not a page table page).
 */
static void
ept_free_pt_entry(pt_entry_t pte)
{
	if (pte == 0)
		return;
	/* sanity check */
	if ((pte & EPT_PG_SUPERPAGE) != 0)
		panic("ept_free_pt_entry: pte cannot have superpage bit");
	return;
}
static void
ept_free_pd_entry(pd_entry_t pde)
{
	pt_entry_t *pt;
	int i;

	if (pde == 0)
		return;
	if ((pde & EPT_PG_SUPERPAGE) == 0) {
		pt = (pt_entry_t *)PHYS_TO_DMAP(pde & EPT_ADDR_MASK);
		for (i = 0; i < NPTEPG; i++)
			ept_free_pt_entry(pt[i]);
		free(pt, M_VMX); /* free the page table page */
	}
}
static void
ept_free_pdp_entry(pdp_entry_t pdpe)
{
	pd_entry_t *pd;
	int i;

	if (pdpe == 0)
		return;
	if ((pdpe & EPT_PG_SUPERPAGE) == 0) {
		pd = (pd_entry_t *)PHYS_TO_DMAP(pdpe & EPT_ADDR_MASK);
		for (i = 0; i < NPDEPG; i++)
			ept_free_pd_entry(pd[i]);
		free(pd, M_VMX); /* free the page directory page */
	}
}
static void
ept_free_pml4_entry(pml4_entry_t pml4e)
{
	pdp_entry_t *pdp;
	int i;

	if (pml4e == 0)
		return;
	if ((pml4e & EPT_PG_SUPERPAGE) == 0) {
		pdp = (pdp_entry_t *)PHYS_TO_DMAP(pml4e & EPT_ADDR_MASK);
		for (i = 0; i < NPDPEPG; i++)
			ept_free_pdp_entry(pdp[i]);
		free(pdp, M_VMX); /* free the page directory ptr page */
	}
}
/* Release every page table page backing the EPT of 'vmx'. */
void
ept_vmcleanup(struct vmx *vmx)
{
	int idx;

	for (idx = 0; idx < NPML4EPG; idx++)
		ept_free_pml4_entry(vmx->pml4ept[idx]);
}
/*
 * Map the guest physical range [gpa, gpa + len) to host physical
 * addresses starting at 'hpa', one largest-possible mapping at a time.
 * 'arg' is the backend instance (struct vmx *). Always returns 0;
 * invalid requests panic inside ept_create_mapping().
 */
int
ept_vmmmap(void *arg, vm_paddr_t gpa, vm_paddr_t hpa, size_t len,
    vm_memattr_t attr, int prot, boolean_t spok)
{
	struct vmx *vmx;
	size_t mapped;

	vmx = arg;
	for (; len > 0; len -= mapped, gpa += mapped, hpa += mapped) {
		mapped = ept_create_mapping(vmx->pml4ept, gpa, hpa, len,
		    attr, prot, spok);
	}

	return (0);
}
/*
 * smp_rendezvous() callback: invalidate cached translations for the
 * single EPT context described by 'arg' on the executing cpu.
 */
static void
invept_single_context(void *arg)
{
	struct invept_desc desc = *(struct invept_desc *)arg;

	invept(INVEPT_TYPE_SINGLE_CONTEXT, desc);
}
/*
 * Invalidate EPT translations for the context rooted at 'pml4ept' on
 * all host cpus (smp_rendezvous runs the callback everywhere).
 */
void
ept_invalidate_mappings(u_long pml4ept)
{
	struct invept_desc invept_desc = { 0 };

	invept_desc.eptp = EPTP(pml4ept);
	smp_rendezvous(NULL, invept_single_context, NULL, &invept_desc);
}

42
sys/amd64/vmm/intel/ept.h Normal file
View File

@ -0,0 +1,42 @@
/*-
* Copyright (c) 2011 NetApp, Inc.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* $FreeBSD$
*/
#ifndef _EPT_H_
#define _EPT_H_
struct vmx;
#define EPT_PWLEVELS 4 /* page walk levels */
/* Build an EPTP value from the physical address of the root PML4 page. */
#define EPTP(pml4) ((pml4) | (EPT_PWLEVELS - 1) << 3 | PAT_WRITE_BACK)
int ept_init(void);
int ept_vmmmap(void *arg, vm_paddr_t gpa, vm_paddr_t hpa, size_t length,
    vm_memattr_t attr, int prot, boolean_t allow_superpage_mappings);
void ept_invalidate_mappings(u_long ept_pml4);
void ept_vmcleanup(struct vmx *vmx);
#endif

451
sys/amd64/vmm/intel/vmcs.c Normal file
View File

@ -0,0 +1,451 @@
/*-
* Copyright (c) 2011 NetApp, Inc.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* $FreeBSD$
*/
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/pcpu.h>
#include <vm/vm.h>
#include <vm/pmap.h>
#include <machine/segments.h>
#include <machine/pmap.h>
#include <machine/vmm.h>
#include "vmcs.h"
#include "vmx_cpufunc.h"
#include "ept.h"
#include "vmx.h"
/*
 * Apply the mandatory fixed-bit adjustments to values destined for the
 * guest CR0/CR4 fields; all other encodings pass through unchanged.
 */
static uint64_t
vmcs_fix_regval(uint32_t encoding, uint64_t val)
{

	if (encoding == VMCS_GUEST_CR0)
		val = vmx_fix_cr0(val);
	else if (encoding == VMCS_GUEST_CR4)
		val = vmx_fix_cr4(val);

	return (val);
}
/*
 * Translate a VM_REG_GUEST_* register identifier into the corresponding
 * VMCS field encoding.  Returns (uint32_t)-1 for identifiers that have
 * no VMCS field (callers treat that as EINVAL).
 */
static uint32_t
vmcs_field_encoding(int ident)
{
	switch (ident) {
	case VM_REG_GUEST_CR0:
		return (VMCS_GUEST_CR0);
	case VM_REG_GUEST_CR3:
		return (VMCS_GUEST_CR3);
	case VM_REG_GUEST_CR4:
		return (VMCS_GUEST_CR4);
	case VM_REG_GUEST_DR7:
		return (VMCS_GUEST_DR7);
	case VM_REG_GUEST_RSP:
		return (VMCS_GUEST_RSP);
	case VM_REG_GUEST_RIP:
		return (VMCS_GUEST_RIP);
	case VM_REG_GUEST_RFLAGS:
		return (VMCS_GUEST_RFLAGS);
	/* Segment registers map to their selector fields only. */
	case VM_REG_GUEST_ES:
		return (VMCS_GUEST_ES_SELECTOR);
	case VM_REG_GUEST_CS:
		return (VMCS_GUEST_CS_SELECTOR);
	case VM_REG_GUEST_SS:
		return (VMCS_GUEST_SS_SELECTOR);
	case VM_REG_GUEST_DS:
		return (VMCS_GUEST_DS_SELECTOR);
	case VM_REG_GUEST_FS:
		return (VMCS_GUEST_FS_SELECTOR);
	case VM_REG_GUEST_GS:
		return (VMCS_GUEST_GS_SELECTOR);
	case VM_REG_GUEST_TR:
		return (VMCS_GUEST_TR_SELECTOR);
	case VM_REG_GUEST_LDTR:
		return (VMCS_GUEST_LDTR_SELECTOR);
	case VM_REG_GUEST_EFER:
		return (VMCS_GUEST_IA32_EFER);
	default:
		return (-1);
	}
}
/*
 * For segment/descriptor-table register 'seg', return the VMCS field
 * encodings of its base, limit and access-rights fields via the out
 * parameters.  IDTR/GDTR have no access-rights field, so 'acc' is set
 * to VMCS_INVALID_ENCODING for them.  Returns EINVAL for anything that
 * is not a segment or descriptor-table register.
 */
static int
vmcs_seg_desc_encoding(int seg, uint32_t *base, uint32_t *lim, uint32_t *acc)
{
	switch (seg) {
	case VM_REG_GUEST_ES:
		*base = VMCS_GUEST_ES_BASE;
		*lim = VMCS_GUEST_ES_LIMIT;
		*acc = VMCS_GUEST_ES_ACCESS_RIGHTS;
		break;
	case VM_REG_GUEST_CS:
		*base = VMCS_GUEST_CS_BASE;
		*lim = VMCS_GUEST_CS_LIMIT;
		*acc = VMCS_GUEST_CS_ACCESS_RIGHTS;
		break;
	case VM_REG_GUEST_SS:
		*base = VMCS_GUEST_SS_BASE;
		*lim = VMCS_GUEST_SS_LIMIT;
		*acc = VMCS_GUEST_SS_ACCESS_RIGHTS;
		break;
	case VM_REG_GUEST_DS:
		*base = VMCS_GUEST_DS_BASE;
		*lim = VMCS_GUEST_DS_LIMIT;
		*acc = VMCS_GUEST_DS_ACCESS_RIGHTS;
		break;
	case VM_REG_GUEST_FS:
		*base = VMCS_GUEST_FS_BASE;
		*lim = VMCS_GUEST_FS_LIMIT;
		*acc = VMCS_GUEST_FS_ACCESS_RIGHTS;
		break;
	case VM_REG_GUEST_GS:
		*base = VMCS_GUEST_GS_BASE;
		*lim = VMCS_GUEST_GS_LIMIT;
		*acc = VMCS_GUEST_GS_ACCESS_RIGHTS;
		break;
	case VM_REG_GUEST_TR:
		*base = VMCS_GUEST_TR_BASE;
		*lim = VMCS_GUEST_TR_LIMIT;
		*acc = VMCS_GUEST_TR_ACCESS_RIGHTS;
		break;
	case VM_REG_GUEST_LDTR:
		*base = VMCS_GUEST_LDTR_BASE;
		*lim = VMCS_GUEST_LDTR_LIMIT;
		*acc = VMCS_GUEST_LDTR_ACCESS_RIGHTS;
		break;
	case VM_REG_GUEST_IDTR:
		*base = VMCS_GUEST_IDTR_BASE;
		*lim = VMCS_GUEST_IDTR_LIMIT;
		*acc = VMCS_INVALID_ENCODING;	/* IDTR has no access rights */
		break;
	case VM_REG_GUEST_GDTR:
		*base = VMCS_GUEST_GDTR_BASE;
		*lim = VMCS_GUEST_GDTR_LIMIT;
		*acc = VMCS_INVALID_ENCODING;	/* GDTR has no access rights */
		break;
	default:
		return (EINVAL);
	}
	return (0);
}
/*
 * Read register 'ident' from the VMCS into *retval.
 *
 * Returns EINVAL for an unknown identifier, otherwise the vmread result.
 */
int
vmcs_getreg(struct vmcs *vmcs, int ident, uint64_t *retval)
{
	uint32_t encoding;
	int error;

	/*
	 * If we need to get at vmx-specific state in the VMCS we can bypass
	 * the translation of 'ident' to 'encoding' by simply setting the
	 * sign bit. As it so happens the upper 16 bits are reserved (i.e
	 * set to 0) in the encodings for the VMCS so we are free to use the
	 * sign bit.
	 */
	encoding = (ident < 0) ? (ident & 0x7fffffff) :
	    vmcs_field_encoding(ident);
	if (encoding == (uint32_t)-1)
		return (EINVAL);

	VMPTRLD(vmcs);
	error = vmread(encoding, retval);
	VMCLEAR(vmcs);
	return (error);
}
/*
 * Write 'val' to register 'ident' in the VMCS.  A negative 'ident'
 * carries a raw VMCS field encoding in its low 31 bits (see
 * vmcs_getreg).  CR0/CR4 values are adjusted for the mandatory
 * fixed bits before being written.
 */
int
vmcs_setreg(struct vmcs *vmcs, int ident, uint64_t val)
{
	uint32_t encoding;
	int error;

	encoding = (ident < 0) ? (ident & 0x7fffffff) :
	    vmcs_field_encoding(ident);
	if (encoding == (uint32_t)-1)
		return (EINVAL);

	val = vmcs_fix_regval(encoding, val);

	VMPTRLD(vmcs);
	error = vmwrite(encoding, val);
	VMCLEAR(vmcs);
	return (error);
}
int
vmcs_setdesc(struct vmcs *vmcs, int seg, struct seg_desc *desc)
{
int error;
uint32_t base, limit, access;
error = vmcs_seg_desc_encoding(seg, &base, &limit, &access);
if (error != 0)
panic("vmcs_setdesc: invalid segment register %d", seg);
VMPTRLD(vmcs);
if ((error = vmwrite(base, desc->base)) != 0)
goto done;
if ((error = vmwrite(limit, desc->limit)) != 0)
goto done;
if (access != VMCS_INVALID_ENCODING) {
if ((error = vmwrite(access, desc->access)) != 0)
goto done;
}
done:
VMCLEAR(vmcs);
return (error);
}
int
vmcs_getdesc(struct vmcs *vmcs, int seg, struct seg_desc *desc)
{
int error;
uint32_t base, limit, access;
uint64_t u64;
error = vmcs_seg_desc_encoding(seg, &base, &limit, &access);
if (error != 0)
panic("vmcs_getdesc: invalid segment register %d", seg);
VMPTRLD(vmcs);
if ((error = vmread(base, &u64)) != 0)
goto done;
desc->base = u64;
if ((error = vmread(limit, &u64)) != 0)
goto done;
desc->limit = u64;
if (access != VMCS_INVALID_ENCODING) {
if ((error = vmread(access, &u64)) != 0)
goto done;
desc->access = u64;
}
done:
VMCLEAR(vmcs);
return (error);
}
/*
 * Point the VMCS guest-MSR save/restore machinery at the array of
 * 'g_count' msr_entry records starting at physical address 'g_area'.
 */
int
vmcs_set_msr_save(struct vmcs *vmcs, u_long g_area, u_int g_count)
{
	int error;

	VMPTRLD(vmcs);

	/*
	 * Guest MSRs are saved in the VM-exit MSR-store area.
	 * Guest MSRs are loaded from the VM-entry MSR-load area.
	 * Both areas point to the same location in memory.
	 */
	error = vmwrite(VMCS_EXIT_MSR_STORE, g_area);
	if (error == 0)
		error = vmwrite(VMCS_EXIT_MSR_STORE_COUNT, g_count);
	if (error == 0)
		error = vmwrite(VMCS_ENTRY_MSR_LOAD, g_area);
	if (error == 0)
		error = vmwrite(VMCS_ENTRY_MSR_LOAD_COUNT, g_count);

	VMCLEAR(vmcs);
	return (error);
}
/*
 * Initialize a VMCS with the caller-supplied VMX controls and the
 * invariant host state: execution/exit/entry controls, guest and host
 * IA32_PAT, host IA32_EFER/CR0/CR4, host segment selectors, host
 * RIP/RSP, the EPT pointer derived from 'ept_pml4', the VPID, the MSR
 * bitmap address, the exception bitmap and the VMCS link pointer.
 * Returns 0 on success or the first vmwrite error encountered.
 */
int
vmcs_set_defaults(struct vmcs *vmcs,
		  u_long host_rip, u_long host_rsp, u_long ept_pml4,
		  uint32_t pinbased_ctls, uint32_t procbased_ctls,
		  uint32_t procbased_ctls2, uint32_t exit_ctls,
		  uint32_t entry_ctls, u_long msr_bitmap, uint16_t vpid)
{
	int error, codesel, datasel, tsssel;
	u_long cr0, cr4, efer;
	uint64_t eptp, pat;
	uint32_t exc_bitmap;

	/* Host kernel code/data/TSS selectors. */
	codesel = GSEL(GCODE_SEL, SEL_KPL);
	datasel = GSEL(GDATA_SEL, SEL_KPL);
	tsssel = GSEL(GPROC0_SEL, SEL_KPL);

	/*
	 * Make sure we have a "current" VMCS to work with.
	 */
	VMPTRLD(vmcs);

	/*
	 * Load the VMX controls
	 */
	if ((error = vmwrite(VMCS_PIN_BASED_CTLS, pinbased_ctls)) != 0)
		goto done;
	if ((error = vmwrite(VMCS_PRI_PROC_BASED_CTLS, procbased_ctls)) != 0)
		goto done;
	if ((error = vmwrite(VMCS_SEC_PROC_BASED_CTLS, procbased_ctls2)) != 0)
		goto done;
	if ((error = vmwrite(VMCS_EXIT_CTLS, exit_ctls)) != 0)
		goto done;
	if ((error = vmwrite(VMCS_ENTRY_CTLS, entry_ctls)) != 0)
		goto done;

	/* Guest state */

	/* Initialize guest IA32_PAT MSR with the default value */
	pat = PAT_VALUE(0, PAT_WRITE_BACK)	|
	      PAT_VALUE(1, PAT_WRITE_THROUGH)	|
	      PAT_VALUE(2, PAT_UNCACHED)	|
	      PAT_VALUE(3, PAT_UNCACHEABLE)	|
	      PAT_VALUE(4, PAT_WRITE_BACK)	|
	      PAT_VALUE(5, PAT_WRITE_THROUGH)	|
	      PAT_VALUE(6, PAT_UNCACHED)	|
	      PAT_VALUE(7, PAT_UNCACHEABLE);
	if ((error = vmwrite(VMCS_GUEST_IA32_PAT, pat)) != 0)
		goto done;

	/* Host state */

	/* Initialize host IA32_PAT MSR from the live register value. */
	pat = rdmsr(MSR_PAT);
	if ((error = vmwrite(VMCS_HOST_IA32_PAT, pat)) != 0)
		goto done;

	/* Load the IA32_EFER MSR */
	efer = rdmsr(MSR_EFER);
	if ((error = vmwrite(VMCS_HOST_IA32_EFER, efer)) != 0)
		goto done;

	/* Load the control registers */
	cr0 = rcr0();
	if ((error = vmwrite(VMCS_HOST_CR0, cr0)) != 0)
		goto done;
	cr4 = rcr4();
	if ((error = vmwrite(VMCS_HOST_CR4, cr4)) != 0)
		goto done;

	/* Load the segment selectors */
	if ((error = vmwrite(VMCS_HOST_ES_SELECTOR, datasel)) != 0)
		goto done;
	if ((error = vmwrite(VMCS_HOST_CS_SELECTOR, codesel)) != 0)
		goto done;
	if ((error = vmwrite(VMCS_HOST_SS_SELECTOR, datasel)) != 0)
		goto done;
	if ((error = vmwrite(VMCS_HOST_DS_SELECTOR, datasel)) != 0)
		goto done;
	if ((error = vmwrite(VMCS_HOST_FS_SELECTOR, datasel)) != 0)
		goto done;
	if ((error = vmwrite(VMCS_HOST_GS_SELECTOR, datasel)) != 0)
		goto done;
	if ((error = vmwrite(VMCS_HOST_TR_SELECTOR, tsssel)) != 0)
		goto done;

	/*
	 * Load the Base-Address for %fs and idtr.
	 *
	 * Note that we exclude %gs, tss and gdtr here because their base
	 * address is pcpu specific.
	 */
	if ((error = vmwrite(VMCS_HOST_FS_BASE, 0)) != 0)
		goto done;
	if ((error = vmwrite(VMCS_HOST_IDTR_BASE, r_idt.rd_base)) != 0)
		goto done;

	/* instruction pointer */
	if ((error = vmwrite(VMCS_HOST_RIP, host_rip)) != 0)
		goto done;

	/* stack pointer */
	if ((error = vmwrite(VMCS_HOST_RSP, host_rsp)) != 0)
		goto done;

	/* eptp */
	eptp = EPTP(ept_pml4);
	if ((error = vmwrite(VMCS_EPTP, eptp)) != 0)
		goto done;

	/* vpid */
	if ((error = vmwrite(VMCS_VPID, vpid)) != 0)
		goto done;

	/* msr bitmap */
	if ((error = vmwrite(VMCS_MSR_BITMAP, msr_bitmap)) != 0)
		goto done;

	/* exception bitmap: intercept machine-check exceptions only */
	exc_bitmap = 1 << IDT_MC;
	if ((error = vmwrite(VMCS_EXCEPTION_BITMAP, exc_bitmap)) != 0)
		goto done;

	/* link pointer: all 1s (no shadow VMCS in use) */
	if ((error = vmwrite(VMCS_LINK_POINTER, ~0)) != 0)
		goto done;
done:
	VMCLEAR(vmcs);
	return (error);
}
/*
 * Read a field from the current VMCS, panicking on failure.  The
 * caller is responsible for having made a VMCS current beforehand.
 */
uint64_t
vmcs_read(uint32_t encoding)
{
	uint64_t val;
	int rc;

	rc = vmread(encoding, &val);
	if (rc != 0)
		panic("vmcs_read(%u) error %d", encoding, rc);

	return (val);
}

324
sys/amd64/vmm/intel/vmcs.h Normal file
View File

@ -0,0 +1,324 @@
/*-
* Copyright (c) 2011 NetApp, Inc.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* $FreeBSD$
*/
#ifndef _VMCS_H_
#define	_VMCS_H_
#ifdef _KERNEL
/*
 * Software view of a VMCS region: one page whose layout beyond the
 * identifier and abort code is processor-implementation specific.
 */
struct vmcs {
	uint32_t	identifier;	/* revision identifier */
	uint32_t	abort_code;	/* set by the cpu on VMX abort */
	char		_impl_specific[PAGE_SIZE - sizeof(uint32_t) * 2];
};
CTASSERT(sizeof(struct vmcs) == PAGE_SIZE);

/* MSR save region is composed of an array of 'struct msr_entry' */
struct msr_entry {
	uint32_t	index;		/* MSR number */
	uint32_t	reserved;
	uint64_t	val;		/* MSR contents */
};

int	vmcs_set_msr_save(struct vmcs *vmcs, u_long g_area, u_int g_count);
int	vmcs_set_defaults(struct vmcs *vmcs, u_long host_rip, u_long host_rsp,
			  u_long ept_pml4,
			  uint32_t pinbased_ctls, uint32_t procbased_ctls,
			  uint32_t procbased_ctls2, uint32_t exit_ctls,
			  uint32_t entry_ctls, u_long msr_bitmap,
			  uint16_t vpid);
int	vmcs_getreg(struct vmcs *vmcs, int ident, uint64_t *retval);
int	vmcs_setreg(struct vmcs *vmcs, int ident, uint64_t val);
int	vmcs_getdesc(struct vmcs *vmcs, int ident,
		     struct seg_desc *desc);
int	vmcs_setdesc(struct vmcs *vmcs, int ident,
		     struct seg_desc *desc);
/* Read one field of the current VMCS; panics on failure. */
uint64_t vmcs_read(uint32_t encoding);

/* Convenience accessors for frequently used exit-information fields. */
#define	vmexit_instruction_length()	vmcs_read(VMCS_EXIT_INSTRUCTION_LENGTH)
#define	vmcs_guest_rip()		vmcs_read(VMCS_GUEST_RIP)
#define	vmcs_instruction_error()	vmcs_read(VMCS_INSTRUCTION_ERROR)
#define	vmcs_exit_reason()		(vmcs_read(VMCS_EXIT_REASON) & 0xffff)
#define	vmcs_exit_qualification()	vmcs_read(VMCS_EXIT_QUALIFICATION)
#endif	/* _KERNEL */
#define VMCS_IDENT(encoding) ((encoding) | 0x80000000)
/*
* VMCS field encodings from Appendix H, Intel Architecture Manual Vol3B.
*/
#define VMCS_INVALID_ENCODING 0xffffffff
/* 16-bit control fields */
#define VMCS_VPID 0x00000000
/* 16-bit guest-state fields */
#define VMCS_GUEST_ES_SELECTOR 0x00000800
#define VMCS_GUEST_CS_SELECTOR 0x00000802
#define VMCS_GUEST_SS_SELECTOR 0x00000804
#define VMCS_GUEST_DS_SELECTOR 0x00000806
#define VMCS_GUEST_FS_SELECTOR 0x00000808
#define VMCS_GUEST_GS_SELECTOR 0x0000080A
#define VMCS_GUEST_LDTR_SELECTOR 0x0000080C
#define VMCS_GUEST_TR_SELECTOR 0x0000080E
/* 16-bit host-state fields */
#define VMCS_HOST_ES_SELECTOR 0x00000C00
#define VMCS_HOST_CS_SELECTOR 0x00000C02
#define VMCS_HOST_SS_SELECTOR 0x00000C04
#define VMCS_HOST_DS_SELECTOR 0x00000C06
#define VMCS_HOST_FS_SELECTOR 0x00000C08
#define VMCS_HOST_GS_SELECTOR 0x00000C0A
#define VMCS_HOST_TR_SELECTOR 0x00000C0C
/* 64-bit control fields */
#define VMCS_IO_BITMAP_A 0x00002000
#define VMCS_IO_BITMAP_B 0x00002002
#define VMCS_MSR_BITMAP 0x00002004
#define VMCS_EXIT_MSR_STORE 0x00002006
#define VMCS_EXIT_MSR_LOAD 0x00002008
#define VMCS_ENTRY_MSR_LOAD 0x0000200A
#define VMCS_EXECUTIVE_VMCS 0x0000200C
#define VMCS_TSC_OFFSET 0x00002010
#define VMCS_VIRTUAL_APIC 0x00002012
#define VMCS_APIC_ACCESS 0x00002014
#define VMCS_EPTP 0x0000201A
/* 64-bit read-only fields */
#define VMCS_GUEST_PHYSICAL_ADDRESS 0x00002400
/* 64-bit guest-state fields */
#define VMCS_LINK_POINTER 0x00002800
#define VMCS_GUEST_IA32_DEBUGCTL 0x00002802
#define VMCS_GUEST_IA32_PAT 0x00002804
#define VMCS_GUEST_IA32_EFER 0x00002806
#define VMCS_GUEST_IA32_PERF_GLOBAL_CTRL 0x00002808
#define VMCS_GUEST_PDPTE0 0x0000280A
#define VMCS_GUEST_PDPTE1 0x0000280C
#define VMCS_GUEST_PDPTE2 0x0000280E
#define VMCS_GUEST_PDPTE3 0x00002810
/* 64-bit host-state fields */
#define VMCS_HOST_IA32_PAT 0x00002C00
#define VMCS_HOST_IA32_EFER 0x00002C02
#define VMCS_HOST_IA32_PERF_GLOBAL_CTRL 0x00002C04
/* 32-bit control fields */
#define VMCS_PIN_BASED_CTLS 0x00004000
#define VMCS_PRI_PROC_BASED_CTLS 0x00004002
#define VMCS_EXCEPTION_BITMAP 0x00004004
#define VMCS_PF_ERROR_MASK 0x00004006
#define VMCS_PF_ERROR_MATCH 0x00004008
#define VMCS_CR3_TARGET_COUNT 0x0000400A
#define VMCS_EXIT_CTLS 0x0000400C
#define VMCS_EXIT_MSR_STORE_COUNT 0x0000400E
#define VMCS_EXIT_MSR_LOAD_COUNT 0x00004010
#define VMCS_ENTRY_CTLS 0x00004012
#define VMCS_ENTRY_MSR_LOAD_COUNT 0x00004014
#define VMCS_ENTRY_INTR_INFO 0x00004016
#define VMCS_ENTRY_EXCEPTION_ERROR 0x00004018
#define VMCS_ENTRY_INST_LENGTH 0x0000401A
#define VMCS_TPR_THRESHOLD 0x0000401C
#define VMCS_SEC_PROC_BASED_CTLS 0x0000401E
#define VMCS_PLE_GAP 0x00004020
#define VMCS_PLE_WINDOW 0x00004022
/* 32-bit read-only data fields */
#define VMCS_INSTRUCTION_ERROR 0x00004400
#define VMCS_EXIT_REASON 0x00004402
#define VMCS_EXIT_INTERRUPTION_INFO 0x00004404
#define VMCS_EXIT_INTERRUPTION_ERROR 0x00004406
#define VMCS_IDT_VECTORING_INFO 0x00004408
#define VMCS_IDT_VECTORING_ERROR 0x0000440A
#define VMCS_EXIT_INSTRUCTION_LENGTH 0x0000440C
#define VMCS_EXIT_INSTRUCTION_INFO 0x0000440E
/* 32-bit guest-state fields */
#define VMCS_GUEST_ES_LIMIT 0x00004800
#define VMCS_GUEST_CS_LIMIT 0x00004802
#define VMCS_GUEST_SS_LIMIT 0x00004804
#define VMCS_GUEST_DS_LIMIT 0x00004806
#define VMCS_GUEST_FS_LIMIT 0x00004808
#define VMCS_GUEST_GS_LIMIT 0x0000480A
#define VMCS_GUEST_LDTR_LIMIT 0x0000480C
#define VMCS_GUEST_TR_LIMIT 0x0000480E
#define VMCS_GUEST_GDTR_LIMIT 0x00004810
#define VMCS_GUEST_IDTR_LIMIT 0x00004812
#define VMCS_GUEST_ES_ACCESS_RIGHTS 0x00004814
#define VMCS_GUEST_CS_ACCESS_RIGHTS 0x00004816
#define VMCS_GUEST_SS_ACCESS_RIGHTS 0x00004818
#define VMCS_GUEST_DS_ACCESS_RIGHTS 0x0000481A
#define VMCS_GUEST_FS_ACCESS_RIGHTS 0x0000481C
#define VMCS_GUEST_GS_ACCESS_RIGHTS 0x0000481E
#define VMCS_GUEST_LDTR_ACCESS_RIGHTS 0x00004820
#define VMCS_GUEST_TR_ACCESS_RIGHTS 0x00004822
#define VMCS_GUEST_INTERRUPTIBILITY 0x00004824
#define VMCS_GUEST_ACTIVITY 0x00004826
#define VMCS_GUEST_SMBASE 0x00004828
#define VMCS_GUEST_IA32_SYSENTER_CS 0x0000482A
#define VMCS_PREEMPTION_TIMER_VALUE 0x0000482E
/* 32-bit host state fields */
#define VMCS_HOST_IA32_SYSENTER_CS 0x00004C00
/* Natural Width control fields */
#define VMCS_CR0_MASK 0x00006000
#define VMCS_CR4_MASK 0x00006002
#define VMCS_CR0_SHADOW 0x00006004
#define VMCS_CR4_SHADOW 0x00006006
#define VMCS_CR3_TARGET0 0x00006008
#define VMCS_CR3_TARGET1 0x0000600A
#define VMCS_CR3_TARGET2 0x0000600C
#define VMCS_CR3_TARGET3 0x0000600E
/* Natural Width read-only fields */
#define VMCS_EXIT_QUALIFICATION 0x00006400
#define VMCS_IO_RCX 0x00006402
#define VMCS_IO_RSI 0x00006404
#define VMCS_IO_RDI 0x00006406
#define VMCS_IO_RIP 0x00006408
#define VMCS_GUEST_LINEAR_ADDRESS 0x0000640A
/* Natural Width guest-state fields */
#define VMCS_GUEST_CR0 0x00006800
#define VMCS_GUEST_CR3 0x00006802
#define VMCS_GUEST_CR4 0x00006804
#define VMCS_GUEST_ES_BASE 0x00006806
#define VMCS_GUEST_CS_BASE 0x00006808
#define VMCS_GUEST_SS_BASE 0x0000680A
#define VMCS_GUEST_DS_BASE 0x0000680C
#define VMCS_GUEST_FS_BASE 0x0000680E
#define VMCS_GUEST_GS_BASE 0x00006810
#define VMCS_GUEST_LDTR_BASE 0x00006812
#define VMCS_GUEST_TR_BASE 0x00006814
#define VMCS_GUEST_GDTR_BASE 0x00006816
#define VMCS_GUEST_IDTR_BASE 0x00006818
#define VMCS_GUEST_DR7 0x0000681A
#define VMCS_GUEST_RSP 0x0000681C
#define VMCS_GUEST_RIP 0x0000681E
#define VMCS_GUEST_RFLAGS 0x00006820
#define VMCS_GUEST_PENDING_DBG_EXCEPTIONS 0x00006822
#define VMCS_GUEST_IA32_SYSENTER_ESP 0x00006824
#define VMCS_GUEST_IA32_SYSENTER_EIP 0x00006826
/* Natural Width host-state fields */
#define VMCS_HOST_CR0 0x00006C00
#define VMCS_HOST_CR3 0x00006C02
#define VMCS_HOST_CR4 0x00006C04
#define VMCS_HOST_FS_BASE 0x00006C06
#define VMCS_HOST_GS_BASE 0x00006C08
#define VMCS_HOST_TR_BASE 0x00006C0A
#define VMCS_HOST_GDTR_BASE 0x00006C0C
#define VMCS_HOST_IDTR_BASE 0x00006C0E
#define VMCS_HOST_IA32_SYSENTER_ESP 0x00006C10
#define VMCS_HOST_IA32_SYSENTER_EIP 0x00006C12
#define VMCS_HOST_RSP 0x00006C14
#define VMCS_HOST_RIP 0x00006c16
/*
* VM instruction error numbers
*/
#define VMRESUME_WITH_NON_LAUNCHED_VMCS 5
/*
* VMCS exit reasons
*/
#define EXIT_REASON_EXCEPTION 0
#define EXIT_REASON_EXT_INTR 1
#define EXIT_REASON_TRIPLE_FAULT 2
#define EXIT_REASON_INIT 3
#define EXIT_REASON_SIPI 4
#define EXIT_REASON_IO_SMI 5
#define EXIT_REASON_SMI 6
#define EXIT_REASON_INTR_WINDOW 7
#define EXIT_REASON_NMI_WINDOW 8
#define EXIT_REASON_TASK_SWITCH 9
#define EXIT_REASON_CPUID 10
#define EXIT_REASON_GETSEC 11
#define EXIT_REASON_HLT 12
#define EXIT_REASON_INVD 13
#define EXIT_REASON_INVLPG 14
#define EXIT_REASON_RDPMC 15
#define EXIT_REASON_RDTSC 16
#define EXIT_REASON_RSM 17
#define EXIT_REASON_VMCALL 18
#define EXIT_REASON_VMCLEAR 19
#define EXIT_REASON_VMLAUNCH 20
#define EXIT_REASON_VMPTRLD 21
#define EXIT_REASON_VMPTRST 22
#define EXIT_REASON_VMREAD 23
#define EXIT_REASON_VMRESUME 24
#define EXIT_REASON_VMWRITE 25
#define EXIT_REASON_VMXOFF 26
#define EXIT_REASON_VMXON 27
#define EXIT_REASON_CR_ACCESS 28
#define EXIT_REASON_DR_ACCESS 29
#define EXIT_REASON_INOUT 30
#define EXIT_REASON_RDMSR 31
#define EXIT_REASON_WRMSR 32
#define EXIT_REASON_INVAL_VMCS 33
#define EXIT_REASON_INVAL_MSR 34
#define EXIT_REASON_MWAIT 36
#define EXIT_REASON_MTF 37
#define EXIT_REASON_MONITOR 39
#define EXIT_REASON_PAUSE 40
#define EXIT_REASON_MCE 41
#define EXIT_REASON_TPR 43
#define EXIT_REASON_APIC 44
#define EXIT_REASON_GDTR_IDTR 46
#define EXIT_REASON_LDTR_TR 47
#define EXIT_REASON_EPT_FAULT 48
#define EXIT_REASON_EPT_MISCONFIG 49
#define EXIT_REASON_INVEPT 50
#define EXIT_REASON_RDTSCP 51
#define EXIT_REASON_VMX_PREEMPT 52
#define EXIT_REASON_INVVPID 53
#define EXIT_REASON_WBINVD 54
#define EXIT_REASON_XSETBV 55
/*
* VMCS interrupt information fields
*/
#define VMCS_INTERRUPTION_INFO_VALID (1 << 31)
#define VMCS_INTERRUPTION_INFO_HW_INTR (0 << 8)
#define VMCS_INTERRUPTION_INFO_NMI (2 << 8)
/*
* VMCS Guest interruptibility field
*/
#define VMCS_INTERRUPTIBILITY_STI_BLOCKING (1 << 0)
#define VMCS_INTERRUPTIBILITY_MOVSS_BLOCKING (1 << 1)
#define VMCS_INTERRUPTIBILITY_SMI_BLOCKING (1 << 2)
#define VMCS_INTERRUPTIBILITY_NMI_BLOCKING (1 << 3)
/*
* Exit qualification for EXIT_REASON_INVAL_VMCS
*/
#define EXIT_QUAL_NMI_WHILE_STI_BLOCKING 3
#endif

1673
sys/amd64/vmm/intel/vmx.c Normal file

File diff suppressed because it is too large Load Diff

115
sys/amd64/vmm/intel/vmx.h Normal file
View File

@ -0,0 +1,115 @@
/*-
* Copyright (c) 2011 NetApp, Inc.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* $FreeBSD$
*/
#ifndef _VMX_H_
#define _VMX_H_
#include "vmcs.h"
#define GUEST_MSR_MAX_ENTRIES 64 /* arbitrary */
/*
 * Per-vcpu register save area used by the assembly entry/exit code
 * (vmx_setjmp/vmx_launch/vmx_resume/vmx_longjmp).
 */
struct vmxctx {
	register_t	guest_rdi;		/* Guest state */
	register_t	guest_rsi;
	register_t	guest_rdx;
	register_t	guest_rcx;
	register_t	guest_r8;
	register_t	guest_r9;
	register_t	guest_rax;
	register_t	guest_rbx;
	register_t	guest_rbp;
	register_t	guest_r10;
	register_t	guest_r11;
	register_t	guest_r12;
	register_t	guest_r13;
	register_t	guest_r14;
	register_t	guest_r15;
	register_t	guest_cr2;

	register_t	host_r15;		/* Host state */
	register_t	host_r14;
	register_t	host_r13;
	register_t	host_r12;
	register_t	host_rbp;
	register_t	host_rsp;
	register_t	host_rbx;
	register_t	host_rip;
	/*
	 * XXX todo debug registers and fpu state
	 */

	/* NOTE(review): presumably the vmlaunch/vmresume failure code — confirm in vmx.c */
	int		launch_error;
};

/* Per-vcpu optional-capability state. */
struct vmxcap {
	int	set;		/* assumes a bitmask of explicitly-set caps — TODO confirm */
	uint32_t proc_ctls;	/* primary processor-based execution controls */
};

/* Per-vcpu run state. */
struct vmxstate {
	int	request_nmi;	/* NMI injection requested for this vcpu */
	int	lastcpu;	/* host cpu that this 'vcpu' last ran on */
	uint16_t vpid;		/* virtual processor id for this vcpu */
};

/* virtual machine softc */
struct vmx {
	pml4_entry_t	pml4ept[NPML4EPG];	/* EPT page-table root */
	struct vmcs	vmcs[VM_MAXCPU];	/* one vmcs per virtual cpu */
	char		msr_bitmap[PAGE_SIZE];	/* shared by all vcpus */
	struct msr_entry guest_msrs[VM_MAXCPU][GUEST_MSR_MAX_ENTRIES];
	struct vmxctx	ctx[VM_MAXCPU];
	struct vmxcap	cap[VM_MAXCPU];
	struct vmxstate	state[VM_MAXCPU];
	struct vm	*vm;			/* back pointer to generic vm */
};
/* Alignment invariants relied upon above, checked at compile time. */
CTASSERT((offsetof(struct vmx, pml4ept) & PAGE_MASK) == 0);
CTASSERT((offsetof(struct vmx, vmcs) & PAGE_MASK) == 0);
CTASSERT((offsetof(struct vmx, msr_bitmap) & PAGE_MASK) == 0);
CTASSERT((offsetof(struct vmx, guest_msrs) & 15) == 0);
#define VMX_RETURN_DIRECT 0
#define VMX_RETURN_LONGJMP 1
#define VMX_RETURN_VMRESUME 2
#define VMX_RETURN_VMLAUNCH 3
/*
* vmx_setjmp() returns:
* - 0 when it returns directly
* - 1 when it returns from vmx_longjmp
* - 2 when it returns from vmx_resume (which would only be in the error case)
* - 3 when it returns from vmx_launch (which would only be in the error case)
*/
int vmx_setjmp(struct vmxctx *ctx);
void vmx_longjmp(void); /* returns via vmx_setjmp */
void vmx_launch(struct vmxctx *ctx) __dead2; /* may return via vmx_setjmp */
void vmx_resume(struct vmxctx *ctx) __dead2; /* may return via vmx_setjmp */
u_long vmx_fix_cr0(u_long cr0);
u_long vmx_fix_cr4(u_long cr4);
#endif

View File

@ -0,0 +1,92 @@
/*-
* Copyright (c) 2011 NetApp, Inc.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* $FreeBSD$
*/
#ifndef _VMX_CONTROLS_H_
#define _VMX_CONTROLS_H_
/* Pin-Based VM-Execution Controls */
#define PINBASED_EXTINT_EXITING (1 << 0)
#define PINBASED_NMI_EXITING (1 << 3)
#define PINBASED_VIRTUAL_NMI (1 << 5)
#define PINBASED_PREMPTION_TIMER (1 << 6)
/* Primary Processor-Based VM-Execution Controls */
#define PROCBASED_INT_WINDOW_EXITING (1 << 2)
#define PROCBASED_TSC_OFFSET (1 << 3)
#define PROCBASED_HLT_EXITING (1 << 7)
#define PROCBASED_INVLPG_EXITING (1 << 9)
#define PROCBASED_MWAIT_EXITING (1 << 10)
#define PROCBASED_RDPMC_EXITING (1 << 11)
#define PROCBASED_RDTSC_EXITING (1 << 12)
#define PROCBASED_CR3_LOAD_EXITING (1 << 15)
#define PROCBASED_CR3_STORE_EXITING (1 << 16)
#define PROCBASED_CR8_LOAD_EXITING (1 << 19)
#define PROCBASED_CR8_STORE_EXITING (1 << 20)
#define PROCBASED_USE_TPR_SHADOW (1 << 21)
#define PROCBASED_NMI_WINDOW_EXITING (1 << 22)
#define PROCBASED_MOV_DR_EXITING (1 << 23)
#define PROCBASED_IO_EXITING (1 << 24)
#define PROCBASED_IO_BITMAPS (1 << 25)
#define PROCBASED_MTF (1 << 27)
#define PROCBASED_MSR_BITMAPS (1 << 28)
#define PROCBASED_MONITOR_EXITING (1 << 29)
#define PROCBASED_PAUSE_EXITING (1 << 30)
#define PROCBASED_SECONDARY_CONTROLS (1 << 31)
/* Secondary Processor-Based VM-Execution Controls */
#define PROCBASED2_VIRTUALIZE_APIC (1 << 0)
#define PROCBASED2_ENABLE_EPT (1 << 1)
#define PROCBASED2_DESC_TABLE_EXITING (1 << 2)
#define PROCBASED2_ENABLE_RDTSCP (1 << 3)
#define PROCBASED2_VIRTUALIZE_X2APIC (1 << 4)
#define PROCBASED2_ENABLE_VPID (1 << 5)
#define PROCBASED2_WBINVD_EXITING (1 << 6)
#define PROCBASED2_UNRESTRICTED_GUEST (1 << 7)
#define PROCBASED2_PAUSE_LOOP_EXITING (1 << 10)
/* VM Exit Controls */
#define VM_EXIT_SAVE_DEBUG_CONTROLS (1 << 2)
#define VM_EXIT_HOST_LMA (1 << 9)
#define VM_EXIT_LOAD_PERF_GLOBAL_CTRL (1 << 12)
#define VM_EXIT_ACKNOWLEDGE_INTERRUPT (1 << 15)
#define VM_EXIT_SAVE_PAT (1 << 18)
#define VM_EXIT_LOAD_PAT (1 << 19)
#define VM_EXIT_SAVE_EFER (1 << 20)
#define VM_EXIT_LOAD_EFER (1 << 21)
#define VM_EXIT_SAVE_PREEMPTION_TIMER (1 << 22)
/* VM Entry Controls */
#define VM_ENTRY_LOAD_DEBUG_CONTROLS (1 << 2)
#define VM_ENTRY_GUEST_LMA (1 << 9)
#define VM_ENTRY_INTO_SMM (1 << 10)
#define VM_ENTRY_DEACTIVATE_DUAL_MONITOR (1 << 11)
#define VM_ENTRY_LOAD_PERF_GLOBAL_CTRL (1 << 13)
#define VM_ENTRY_LOAD_PAT (1 << 14)
#define VM_ENTRY_LOAD_EFER (1 << 15)
#endif

View File

@ -0,0 +1,199 @@
/*-
* Copyright (c) 2011 NetApp, Inc.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* $FreeBSD$
*/
#ifndef _VMX_CPUFUNC_H_
#define _VMX_CPUFUNC_H_
struct vmcs;
/*
* Section 5.2 "Conventions" from Intel Architecture Manual 2B.
*
* error
* VMsucceed 0
* VMFailInvalid 1
* VMFailValid 2 see also VMCS VM-Instruction Error Field
*/
#define VM_SUCCESS 0
#define VM_FAIL_INVALID 1
#define VM_FAIL_VALID 2
/*
 * Decode the VMX status flags left by the immediately preceding VMX
 * instruction into 'varname' (see the VMsucceed/VMFailInvalid/
 * VMFailValid table above): CF set -> 1, ZF set -> 2, otherwise 0.
 *
 * NOTE(review): this must execute before anything clobbers the flags
 * set by the VMX instruction — keep it adjacent to the inline asm that
 * runs the instruction.
 */
#define	VMX_SET_ERROR_CODE(varname)					\
	do {								\
		__asm __volatile("	jnc 1f;"			\
				 "	mov $1, %0;"	/* CF: error = 1 */ \
				 "	jmp 3f;"			\
				 "1:	jnz 2f;"			\
				 "	mov $2, %0;"	/* ZF: error = 2 */ \
				 "	jmp 3f;"			\
				 "2:	mov $0, %0;"			\
				 "3:	nop"				\
				 :"=r" (varname));			\
	} while (0)
/* returns 0 on success and non-zero on failure */
static __inline int
vmxon(char *region)
{
	int error;
	uint64_t addr;

	/* VMXON takes the physical address of the VMXON region. */
	addr = vtophys(region);
	__asm __volatile("vmxon %0" : : "m" (*(uint64_t *)&addr) : "memory");
	VMX_SET_ERROR_CODE(error);
	return (error);
}

/* returns 0 on success and non-zero on failure */
static __inline int
vmclear(struct vmcs *vmcs)
{
	int error;
	uint64_t addr;

	/* VMCLEAR takes the physical address of the VMCS. */
	addr = vtophys(vmcs);
	__asm __volatile("vmclear %0" : : "m" (*(uint64_t *)&addr) : "memory");
	VMX_SET_ERROR_CODE(error);
	return (error);
}

/* Leave VMX operation on this cpu. */
static __inline void
vmxoff(void)
{
	__asm __volatile("vmxoff");
}
/*
 * Store the current-VMCS pointer into *addr.
 *
 * VMPTRST writes its memory operand, so *addr must be declared as an
 * output ("=m") constraint.  The original input-only "m" constraint
 * let the compiler assume *addr was unchanged and relied solely on the
 * "memory" clobber for correctness.
 */
static __inline void
vmptrst(uint64_t *addr)
{

	__asm __volatile("vmptrst %0" : "=m" (*addr) : : "memory");
}
/*
 * Make 'vmcs' the current VMCS on this cpu.
 * Returns 0 on success and non-zero on failure.
 */
static __inline int
vmptrld(struct vmcs *vmcs)
{
	int error;
	uint64_t addr;

	/* VMPTRLD takes the physical address of the VMCS. */
	addr = vtophys(vmcs);
	__asm __volatile("vmptrld %0" : : "m" (*(uint64_t *)&addr) : "memory");
	VMX_SET_ERROR_CODE(error);
	return (error);
}

/*
 * Write 'val' to the current-VMCS field identified by encoding 'reg'.
 * Returns 0 on success and non-zero on failure.
 */
static __inline int
vmwrite(uint64_t reg, uint64_t val)
{
	int error;

	__asm __volatile("vmwrite %0, %1" : : "r" (val), "r" (reg) : "memory");
	VMX_SET_ERROR_CODE(error);
	return (error);
}
/*
 * Read the current-VMCS field identified by encoding 'r' into *addr.
 * Returns 0 on success and non-zero on failure.
 *
 * VMREAD writes its memory operand, so *addr must be an output ("=m")
 * constraint; the original input-only constraint relied on the
 * "memory" clobber to keep the store from being optimized away.
 */
static __inline int
vmread(uint64_t r, uint64_t *addr)
{
	int error;

	__asm __volatile("vmread %1, %0" : "=m" (*addr) : "r" (r) : "memory");
	VMX_SET_ERROR_CODE(error);
	return (error);
}
/*
 * Panic-on-failure wrapper around vmclear().
 *
 * Exits the critical section entered by VMPTRLD(): the two functions are
 * used as a bracketing pair around manipulation of the current VMCS.
 */
static void __inline
VMCLEAR(struct vmcs *vmcs)
{
	int err;

	err = vmclear(vmcs);
	if (err != 0)
		panic("%s: vmclear(%p) error %d", __func__, vmcs, err);

	critical_exit();
}
/*
 * Panic-on-failure wrapper around vmptrld().
 *
 * Enters a critical section that remains held until the matching
 * VMCLEAR() call, preventing preemption while the VMCS is current.
 */
static void __inline
VMPTRLD(struct vmcs *vmcs)
{
	int err;

	critical_enter();

	err = vmptrld(vmcs);
	if (err != 0)
		panic("%s: vmptrld(%p) error %d", __func__, vmcs, err);
}
/* invvpid invalidation granularities */
#define	INVVPID_TYPE_ADDRESS		0UL
#define	INVVPID_TYPE_SINGLE_CONTEXT	1UL
#define	INVVPID_TYPE_ALL_CONTEXTS	2UL

/* Descriptor operand for invvpid; layout is architecturally fixed. */
struct invvpid_desc {
	uint16_t	vpid;
	uint16_t	_res1;
	uint32_t	_res2;
	uint64_t	linear_addr;
};
CTASSERT(sizeof(struct invvpid_desc) == 16);

/*
 * Invalidate guest TLB mappings tagged by VPID with granularity 'type'.
 * 'desc' is passed by value; its address is used as the memory operand.
 * Panics on failure since a stale TLB entry would be unrecoverable.
 */
static void __inline
invvpid(uint64_t type, struct invvpid_desc desc)
{
	int error;

	__asm __volatile("invvpid %0, %1" :: "m" (desc), "r" (type) : "memory");
	VMX_SET_ERROR_CODE(error);
	if (error)
		panic("invvpid error %d", error);
}
/* invept invalidation granularities */
#define	INVEPT_TYPE_SINGLE_CONTEXT	1UL
#define	INVEPT_TYPE_ALL_CONTEXTS	2UL

/* Descriptor operand for invept; layout is architecturally fixed. */
struct invept_desc {
	uint64_t	eptp;
	uint64_t	_res;
};
CTASSERT(sizeof(struct invept_desc) == 16);

/*
 * Invalidate EPT-derived translations with granularity 'type'.
 * 'desc' is passed by value; its address is used as the memory operand.
 * Panics on failure since a stale translation would be unrecoverable.
 */
static void __inline
invept(uint64_t type, struct invept_desc desc)
{
	int error;

	__asm __volatile("invept %0, %1" :: "m" (desc), "r" (type) : "memory");
	VMX_SET_ERROR_CODE(error);
	if (error)
		panic("invept error %d", error);
}
#endif

View File

@ -0,0 +1,81 @@
/*-
* Copyright (c) 2011 NetApp, Inc.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* $FreeBSD$
*/
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/malloc.h>
#include <sys/assym.h>
#include <vm/vm.h>
#include <vm/pmap.h>
#include <machine/pmap.h>
#include <machine/vmm.h>
#include "vmx.h"
#include "vmx_cpufunc.h"
/*
 * Generate assembler symbols for the 'struct vmxctx' field offsets and the
 * VMX return/error constants.  The generated vmx_assym.s is included by
 * vmx_support.S so the assembly and C views of the context stay in sync.
 */

/* Guest register save area offsets. */
ASSYM(VMXCTX_GUEST_RDI, offsetof(struct vmxctx, guest_rdi));
ASSYM(VMXCTX_GUEST_RSI, offsetof(struct vmxctx, guest_rsi));
ASSYM(VMXCTX_GUEST_RDX, offsetof(struct vmxctx, guest_rdx));
ASSYM(VMXCTX_GUEST_RCX, offsetof(struct vmxctx, guest_rcx));
ASSYM(VMXCTX_GUEST_R8, offsetof(struct vmxctx, guest_r8));
ASSYM(VMXCTX_GUEST_R9, offsetof(struct vmxctx, guest_r9));
ASSYM(VMXCTX_GUEST_RAX, offsetof(struct vmxctx, guest_rax));
ASSYM(VMXCTX_GUEST_RBX, offsetof(struct vmxctx, guest_rbx));
ASSYM(VMXCTX_GUEST_RBP, offsetof(struct vmxctx, guest_rbp));
ASSYM(VMXCTX_GUEST_R10, offsetof(struct vmxctx, guest_r10));
ASSYM(VMXCTX_GUEST_R11, offsetof(struct vmxctx, guest_r11));
ASSYM(VMXCTX_GUEST_R12, offsetof(struct vmxctx, guest_r12));
ASSYM(VMXCTX_GUEST_R13, offsetof(struct vmxctx, guest_r13));
ASSYM(VMXCTX_GUEST_R14, offsetof(struct vmxctx, guest_r14));
ASSYM(VMXCTX_GUEST_R15, offsetof(struct vmxctx, guest_r15));
ASSYM(VMXCTX_GUEST_CR2, offsetof(struct vmxctx, guest_cr2));

/* Host callee-saved register area offsets (vmx_setjmp/vmx_return). */
ASSYM(VMXCTX_HOST_R15, offsetof(struct vmxctx, host_r15));
ASSYM(VMXCTX_HOST_R14, offsetof(struct vmxctx, host_r14));
ASSYM(VMXCTX_HOST_R13, offsetof(struct vmxctx, host_r13));
ASSYM(VMXCTX_HOST_R12, offsetof(struct vmxctx, host_r12));
ASSYM(VMXCTX_HOST_RBP, offsetof(struct vmxctx, host_rbp));
ASSYM(VMXCTX_HOST_RSP, offsetof(struct vmxctx, host_rsp));
ASSYM(VMXCTX_HOST_RBX, offsetof(struct vmxctx, host_rbx));
ASSYM(VMXCTX_HOST_RIP, offsetof(struct vmxctx, host_rip));

ASSYM(VMXCTX_LAUNCH_ERROR, offsetof(struct vmxctx, launch_error));

/* VM-instruction error codes and vmx_setjmp return values. */
ASSYM(VM_SUCCESS, VM_SUCCESS);
ASSYM(VM_FAIL_INVALID, VM_FAIL_INVALID);
ASSYM(VM_FAIL_VALID, VM_FAIL_VALID);

ASSYM(VMX_RETURN_DIRECT, VMX_RETURN_DIRECT);
ASSYM(VMX_RETURN_LONGJMP, VMX_RETURN_LONGJMP);
ASSYM(VMX_RETURN_VMRESUME, VMX_RETURN_VMRESUME);
ASSYM(VMX_RETURN_VMLAUNCH, VMX_RETURN_VMLAUNCH);

View File

@ -0,0 +1,172 @@
/*-
* Copyright (c) 2011 NetApp, Inc.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* $FreeBSD$
*/
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
#include <sys/param.h>
#include <sys/systm.h>
#include <machine/cpufunc.h>
#include "vmx_msr.h"
/*
 * Return TRUE if the "allowed 1-setting" bit for 'bitpos' -- found in the
 * high 32 bits of the capability MSR value 'msr_val' -- is set.
 */
static boolean_t
vmx_ctl_allows_one_setting(uint64_t msr_val, int bitpos)
{

	return (((msr_val >> (bitpos + 32)) & 1) ? TRUE : FALSE);
}
/*
 * Return TRUE if the "allowed 0-setting" for 'bitpos' holds, i.e. the
 * corresponding bit in the low 32 bits of 'msr_val' is clear.
 */
static boolean_t
vmx_ctl_allows_zero_setting(uint64_t msr_val, int bitpos)
{

	return (((msr_val >> bitpos) & 1) == 0 ? TRUE : FALSE);
}
/*
 * Return the 32-bit VMCS revision identifier, i.e. the low half of
 * MSR_VMX_BASIC.
 */
uint32_t
vmx_revision(void)
{

	return ((uint32_t)rdmsr(MSR_VMX_BASIC));
}
/*
* Generate a bitmask to be used for the VMCS execution control fields.
*
* The caller specifies what bits should be set to one in 'ones_mask'
* and what bits should be set to zero in 'zeros_mask'. The don't-care
* bits are set to the default value. The default values are obtained
* based on "Algorithm 3" in Section 27.5.1 "Algorithms for Determining
* VMX Capabilities".
*
* Returns zero on success and non-zero on error.
*/
/*
 * Compute the value for a VMCS execution control field.
 *
 * ctl_reg:      capability MSR for the control (e.g. MSR_VMX_PROCBASED_CTLS)
 * true_ctl_reg: corresponding "true" capability MSR, consulted only when
 *               bit 55 of MSR_VMX_BASIC says the true controls exist
 * ones_mask:    bits the caller requires to be 1
 * zeros_mask:   bits the caller requires to be 0
 * retval:       in/out -- don't-care bits are set to their default values
 *
 * Returns 0 on success, EINVAL if the masks conflict or a required setting
 * is not supported by the hardware.  The step labels in the comments refer
 * to "Algorithm 3" cited in the block comment above.
 */
int
vmx_set_ctlreg(int ctl_reg, int true_ctl_reg, uint32_t ones_mask,
	       uint32_t zeros_mask, uint32_t *retval)
{
	int i;
	uint64_t val, trueval;
	boolean_t true_ctls_avail, one_allowed, zero_allowed;

	/* We cannot ask the same bit to be set to both '1' and '0' */
	if ((ones_mask ^ zeros_mask) != (ones_mask | zeros_mask))
		return (EINVAL);

	/* Bit 55 of MSR_VMX_BASIC advertises the "true" capability MSRs. */
	if (rdmsr(MSR_VMX_BASIC) & (1UL << 55))
		true_ctls_avail = TRUE;
	else
		true_ctls_avail = FALSE;

	val = rdmsr(ctl_reg);
	if (true_ctls_avail)
		trueval = rdmsr(true_ctl_reg);		/* step c */
	else
		trueval = val;				/* step a */

	for (i = 0; i < 32; i++) {
		one_allowed = vmx_ctl_allows_one_setting(trueval, i);
		zero_allowed = vmx_ctl_allows_zero_setting(trueval, i);

		KASSERT(one_allowed || zero_allowed,
			("invalid zero/one setting for bit %d of ctl 0x%0x, "
			 "truectl 0x%0x\n", i, ctl_reg, true_ctl_reg));

		if (zero_allowed && !one_allowed) {		/* b(i),c(i) */
			/* Bit is fixed to 0: caller must not demand a 1. */
			if (ones_mask & (1 << i))
				return (EINVAL);
			*retval &= ~(1 << i);
		} else if (one_allowed && !zero_allowed) {	/* b(i),c(i) */
			/* Bit is fixed to 1: caller must not demand a 0. */
			if (zeros_mask & (1 << i))
				return (EINVAL);
			*retval |= 1 << i;
		} else {
			/* Bit is flexible: honor caller, else defaults. */
			if (zeros_mask & (1 << i))	/* b(ii),c(ii) */
				*retval &= ~(1 << i);
			else if (ones_mask & (1 << i)) /* b(ii), c(ii) */
				*retval |= 1 << i;
			else if (!true_ctls_avail)
				*retval &= ~(1 << i);	/* b(iii) */
			else if (vmx_ctl_allows_zero_setting(val, i))/* c(iii)*/
				*retval &= ~(1 << i);
			else if (vmx_ctl_allows_one_setting(val, i)) /* c(iv) */
				*retval |= 1 << i;
			else {
				panic("vmx_set_ctlreg: unable to determine "
				      "correct value of ctl bit %d for msr "
				      "0x%0x and true msr 0x%0x", i, ctl_reg,
				      true_ctl_reg);
			}
		}
	}

	return (0);
}
/*
 * Initialize the 4KB MSR bitmap page with all bits set so that every MSR
 * read and write causes a VM exit.
 */
void
msr_bitmap_initialize(char *bitmap)
{
	int i;

	for (i = 0; i < PAGE_SIZE; i++)
		bitmap[i] = 0xff;
}
/*
 * Change the intercept state for 'msr' in the MSR bitmap: a clear bit in
 * the bitmap allows guest access without a VM exit.  'access' is a
 * combination of MSR_BITMAP_ACCESS_{READ,WRITE}.
 *
 * Returns 0 on success and EINVAL if 'msr' is outside the two
 * architecturally defined ranges.
 */
int
msr_bitmap_change_access(char *bitmap, u_int msr, int access)
{
	int rdbyte, wrbyte, bit;

	/*
	 * Low MSRs occupy bytes 0-1023 of the read bitmap, high MSRs
	 * (0xC0000000 based) bytes 1024-2047.  'msr' is unsigned so only
	 * the upper bounds need checking.
	 */
	if (msr <= 0x00001FFF)
		rdbyte = msr / 8;
	else if (msr >= 0xC0000000 && msr <= 0xC0001FFF)
		rdbyte = 1024 + (msr - 0xC0000000) / 8;
	else
		return (EINVAL);

	bit = msr & 0x7;
	wrbyte = rdbyte + 2048;		/* write bitmap follows read bitmap */

	if (access & MSR_BITMAP_ACCESS_READ)
		bitmap[rdbyte] &= ~(1 << bit);
	else
		bitmap[rdbyte] |= 1 << bit;

	if (access & MSR_BITMAP_ACCESS_WRITE)
		bitmap[wrbyte] &= ~(1 << bit);
	else
		bitmap[wrbyte] |= 1 << bit;

	return (0);
}

View File

@ -0,0 +1,78 @@
/*-
* Copyright (c) 2011 NetApp, Inc.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* $FreeBSD$
*/
#ifndef _VMX_MSR_H_
#define	_VMX_MSR_H_

/*
 * VMX capability MSR numbers.  The MSR_VMX_TRUE_* variants exist only
 * when bit 55 of MSR_VMX_BASIC is set; see vmx_set_ctlreg().
 */
#define	MSR_VMX_BASIC			0x480
#define	MSR_VMX_EPT_VPID_CAP		0x48C

#define	MSR_VMX_PROCBASED_CTLS		0x482
#define	MSR_VMX_TRUE_PROCBASED_CTLS	0x48E

#define	MSR_VMX_PINBASED_CTLS		0x481
#define	MSR_VMX_TRUE_PINBASED_CTLS	0x48D

#define	MSR_VMX_PROCBASED_CTLS2	0x48B

#define	MSR_VMX_EXIT_CTLS		0x483
#define	MSR_VMX_TRUE_EXIT_CTLS		0x48f

#define	MSR_VMX_ENTRY_CTLS		0x484
#define	MSR_VMX_TRUE_ENTRY_CTLS	0x490

/* Fixed-bit requirements for CR0/CR4 while in VMX operation. */
#define	MSR_VMX_CR0_FIXED0		0x486
#define	MSR_VMX_CR0_FIXED1		0x487

#define	MSR_VMX_CR4_FIXED0		0x488
#define	MSR_VMX_CR4_FIXED1		0x489

uint32_t vmx_revision(void);

int	vmx_set_ctlreg(int ctl_reg, int true_ctl_reg, uint32_t ones_mask,
		       uint32_t zeros_mask, uint32_t *retval);

/*
 * According to Section 21.10.4 "Software Access to Related Structures",
 * changes to data structures pointed to by the VMCS must be made only when
 * there is no logical processor with a current VMCS that points to the
 * data structure.
 *
 * This pretty much limits us to configuring the MSR bitmap before VMCS
 * initialization for SMP VMs. Unless of course we do it the hard way - which
 * would involve some form of synchronization between the vcpus to vmclear
 * all VMCSs' that point to the bitmap.
 */
#define	MSR_BITMAP_ACCESS_NONE	0x0
#define	MSR_BITMAP_ACCESS_READ	0x1
#define	MSR_BITMAP_ACCESS_WRITE	0x2
#define	MSR_BITMAP_ACCESS_RW	(MSR_BITMAP_ACCESS_READ|MSR_BITMAP_ACCESS_WRITE)

void	msr_bitmap_initialize(char *bitmap);
int	msr_bitmap_change_access(char *bitmap, u_int msr, int access);

#endif

View File

@ -0,0 +1,204 @@
/*-
* Copyright (c) 2011 NetApp, Inc.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* $FreeBSD$
*/
#include <machine/asmacros.h>
#include "vmx_assym.s"
/*
 * Assumes that %rdi holds a pointer to the 'vmxctx'.
 *
 * Loads every guest general-purpose register (and %cr2) from the vmxctx
 * save area in preparation for vmlaunch/vmresume.  %rdi itself is restored
 * last since it addresses the save area.  Clobbers no host state that the
 * callers have not already saved via vmx_setjmp().
 */
#define	VMX_GUEST_RESTORE						\
	/*								\
	 * Make sure that interrupts are disabled before restoring CR2.	\
	 * Otherwise there could be a page fault during the interrupt	\
	 * handler execution that would end up trashing CR2.		\
	 */								\
	cli;								\
	movq	VMXCTX_GUEST_CR2(%rdi),%rsi;				\
	movq	%rsi,%cr2;						\
	movq	VMXCTX_GUEST_RSI(%rdi),%rsi;				\
	movq	VMXCTX_GUEST_RDX(%rdi),%rdx;				\
	movq	VMXCTX_GUEST_RCX(%rdi),%rcx;				\
	movq	VMXCTX_GUEST_R8(%rdi),%r8;				\
	movq	VMXCTX_GUEST_R9(%rdi),%r9;				\
	movq	VMXCTX_GUEST_RAX(%rdi),%rax;				\
	movq	VMXCTX_GUEST_RBX(%rdi),%rbx;				\
	movq	VMXCTX_GUEST_RBP(%rdi),%rbp;				\
	movq	VMXCTX_GUEST_R10(%rdi),%r10;				\
	movq	VMXCTX_GUEST_R11(%rdi),%r11;				\
	movq	VMXCTX_GUEST_R12(%rdi),%r12;				\
	movq	VMXCTX_GUEST_R13(%rdi),%r13;				\
	movq	VMXCTX_GUEST_R14(%rdi),%r14;				\
	movq	VMXCTX_GUEST_R15(%rdi),%r15;				\
	movq	VMXCTX_GUEST_RDI(%rdi),%rdi; /* restore rdi the last */
/*
 * Decode the RFLAGS left behind by a failed vmlaunch/vmresume into 'reg'
 * (CF -> VM_FAIL_INVALID, ZF -> VM_FAIL_VALID, neither -> VM_SUCCESS) and
 * record it in the vmxctx, which %rsp points at on this path.
 */
#define	VM_INSTRUCTION_ERROR(reg)					\
	jnc 	1f;							\
	movl 	$VM_FAIL_INVALID,reg;		/* CF is set */		\
	jmp 	3f;							\
1:	jnz 	2f;							\
	movl 	$VM_FAIL_VALID,reg;		/* ZF is set */		\
	jmp 	3f;							\
2:	movl 	$VM_SUCCESS,reg;					\
3:	movl	reg,VMXCTX_LAUNCH_ERROR(%rsp)
.text
/*
 * int vmx_setjmp(ctxp)
 * %rdi = ctxp
 *
 * Save the host's callee-saved registers, stack pointer and return
 * address into the vmxctx so that a later vmx_return() can resume
 * execution here.
 *
 * Return value is '0' (VMX_RETURN_DIRECT) when it returns directly from
 * here.  Return value is non-zero when it returns after a vm exit through
 * vmx_longjmp, or after a failed vmlaunch/vmresume.
 */
ENTRY(vmx_setjmp)
	movq	(%rsp),%rax			/* return address */
	movq    %r15,VMXCTX_HOST_R15(%rdi)
	movq    %r14,VMXCTX_HOST_R14(%rdi)
	movq    %r13,VMXCTX_HOST_R13(%rdi)
	movq    %r12,VMXCTX_HOST_R12(%rdi)
	movq    %rbp,VMXCTX_HOST_RBP(%rdi)
	movq    %rsp,VMXCTX_HOST_RSP(%rdi)
	movq    %rbx,VMXCTX_HOST_RBX(%rdi)
	movq    %rax,VMXCTX_HOST_RIP(%rdi)

	/*
	 * XXX save host debug registers
	 */
	movl	$VMX_RETURN_DIRECT,%eax
	ret
END(vmx_setjmp)
/*
 * void vmx_return(struct vmxctx *ctxp, int retval)
 * %rdi = ctxp
 * %rsi = retval
 * Return to vmm context through vmx_setjmp() with a value of 'retval':
 * restores the callee-saved registers and stack captured by vmx_setjmp(),
 * overwrites the saved return address slot, and returns 'retval' in %eax.
 * Does not return to its own caller.
 */
ENTRY(vmx_return)
	/* Restore host context. */
	movq	VMXCTX_HOST_R15(%rdi),%r15
	movq	VMXCTX_HOST_R14(%rdi),%r14
	movq	VMXCTX_HOST_R13(%rdi),%r13
	movq	VMXCTX_HOST_R12(%rdi),%r12
	movq	VMXCTX_HOST_RBP(%rdi),%rbp
	movq	VMXCTX_HOST_RSP(%rdi),%rsp
	movq	VMXCTX_HOST_RBX(%rdi),%rbx
	movq	VMXCTX_HOST_RIP(%rdi),%rax
	movq	%rax,(%rsp)			/* return address */

	/*
	 * XXX restore host debug registers
	 */
	movl	%esi,%eax
	ret
END(vmx_return)
/*
 * void vmx_longjmp(void)
 * %rsp points to the struct vmxctx
 *
 * VM-exit trampoline: saves the guest general-purpose registers and %cr2
 * (which the VMCS does not save automatically) into the vmxctx, then
 * unwinds to vmx_setjmp() via vmx_return() with VMX_RETURN_LONGJMP.
 */
ENTRY(vmx_longjmp)
	/*
	 * Save guest state that is not automatically saved in the vmcs.
	 */
	movq	%rdi,VMXCTX_GUEST_RDI(%rsp)
	movq	%rsi,VMXCTX_GUEST_RSI(%rsp)
	movq	%rdx,VMXCTX_GUEST_RDX(%rsp)
	movq	%rcx,VMXCTX_GUEST_RCX(%rsp)
	movq	%r8,VMXCTX_GUEST_R8(%rsp)
	movq	%r9,VMXCTX_GUEST_R9(%rsp)
	movq	%rax,VMXCTX_GUEST_RAX(%rsp)
	movq	%rbx,VMXCTX_GUEST_RBX(%rsp)
	movq	%rbp,VMXCTX_GUEST_RBP(%rsp)
	movq	%r10,VMXCTX_GUEST_R10(%rsp)
	movq	%r11,VMXCTX_GUEST_R11(%rsp)
	movq	%r12,VMXCTX_GUEST_R12(%rsp)
	movq	%r13,VMXCTX_GUEST_R13(%rsp)
	movq	%r14,VMXCTX_GUEST_R14(%rsp)
	movq	%r15,VMXCTX_GUEST_R15(%rsp)

	movq	%cr2,%rdi
	movq	%rdi,VMXCTX_GUEST_CR2(%rsp)

	/* %rsp is the vmxctx pointer on this path. */
	movq	%rsp,%rdi
	movq	$VMX_RETURN_LONGJMP,%rsi
	callq	vmx_return
END(vmx_longjmp)
/*
 * void vmx_resume(struct vmxctx *ctxp)
 * %rdi = ctxp
 *
 * Restore guest registers and execute 'vmresume' on the current VMCS.
 * On success control re-enters the guest and eventually comes back via
 * vmx_longjmp().  Although the return type is a 'void' this function may
 * return indirectly through vmx_setjmp() with a return value of 2
 * (VMX_RETURN_VMRESUME) if vmresume itself fails.
 */
ENTRY(vmx_resume)
	/*
	 * Restore guest state that is not automatically loaded from the vmcs.
	 */
	VMX_GUEST_RESTORE

	vmresume

	/*
	 * Capture the reason why vmresume failed.
	 */
	VM_INSTRUCTION_ERROR(%eax)

	/* Return via vmx_setjmp with return value of VMX_RETURN_VMRESUME */
	movq	%rsp,%rdi
	movq	$VMX_RETURN_VMRESUME,%rsi
	callq	vmx_return
END(vmx_resume)
/*
 * void vmx_launch(struct vmxctx *ctxp)
 * %rdi = ctxp
 *
 * Restore guest registers and execute 'vmlaunch' on the current VMCS.
 * On success control enters the guest and eventually comes back via
 * vmx_longjmp().  Although the return type is a 'void' this function may
 * return indirectly through vmx_setjmp() with a return value of 3
 * (VMX_RETURN_VMLAUNCH) if vmlaunch itself fails.
 */
ENTRY(vmx_launch)
	/*
	 * Restore guest state that is not automatically loaded from the vmcs.
	 */
	VMX_GUEST_RESTORE

	vmlaunch

	/*
	 * Capture the reason why vmlaunch failed.
	 */
	VM_INSTRUCTION_ERROR(%eax)

	/* Return via vmx_setjmp with return value of VMX_RETURN_VMLAUNCH */
	movq	%rsp,%rdi
	movq	$VMX_RETURN_VMLAUNCH,%rsi
	callq	vmx_return
END(vmx_launch)

637
sys/amd64/vmm/intel/vtd.c Normal file
View File

@ -0,0 +1,637 @@
/*-
* Copyright (c) 2011 NetApp, Inc.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* $FreeBSD$
*/
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
#include <sys/param.h>
#include <sys/kernel.h>
#include <sys/systm.h>
#include <sys/malloc.h>
#include <vm/vm.h>
#include <vm/pmap.h>
#include <dev/pci/pcireg.h>
#include <machine/pmap.h>
#include <machine/vmparam.h>
#include <machine/pci_cfgreg.h>
#include "io/iommu.h"
/*
* Documented in the "Intel Virtualization Technology for Directed I/O",
* Architecture Spec, September 2008.
*/
/* Section 10.4 "Register Descriptions" */
/*
 * Memory-mapped DMA-remapping hardware registers; Section 10.4
 * "Register Descriptions".  Accessed through PHYS_TO_DMAP pointers set up
 * by the ident routines below.
 */
struct vtdmap {
	volatile uint32_t	version;	/* version register */
	volatile uint32_t	res0;		/* reserved */
	volatile uint64_t	cap;		/* capability register */
	volatile uint64_t	ext_cap;	/* extended capability */
	volatile uint32_t	gcr;		/* global command */
	volatile uint32_t	gsr;		/* global status */
	volatile uint64_t	rta;		/* root table address */
	volatile uint64_t	ccr;		/* context command */
};

/* Capability register fields. */
#define	VTD_CAP_SAGAW(cap)	(((cap) >> 8) & 0x1F)
#define	VTD_CAP_ND(cap)		((cap) & 0x7)
#define	VTD_CAP_CM(cap)		(((cap) >> 7) & 0x1)
#define	VTD_CAP_SPS(cap)	(((cap) >> 34) & 0xF)
#define	VTD_CAP_RWBF(cap)	(((cap) >> 4) & 0x1)

/* Extended capability register fields. */
#define	VTD_ECAP_DI(ecap)	(((ecap) >> 2) & 0x1)
#define	VTD_ECAP_COHERENCY(ecap) ((ecap) & 0x1)
#define	VTD_ECAP_IRO(ecap)	(((ecap) >> 8) & 0x3FF)

/* Global command register bits and their status counterparts. */
#define	VTD_GCR_WBF		(1 << 27)
#define	VTD_GCR_SRTP		(1 << 30)
#define	VTD_GCR_TE		(1 << 31)

#define	VTD_GSR_WBFS		(1 << 27)
#define	VTD_GSR_RTPS		(1 << 30)
#define	VTD_GSR_TES		(1 << 31)

#define	VTD_CCR_ICC		(1UL << 63)	/* invalidate context cache */
#define	VTD_CCR_CIRG_GLOBAL	(1UL << 61)	/* global invalidation */

#define	VTD_IIR_IVT		(1UL << 63)	/* invalidation IOTLB */
#define	VTD_IIR_IIRG_GLOBAL	(1ULL << 60)	/* global IOTLB invalidation */
#define	VTD_IIR_IIRG_DOMAIN	(2ULL << 60)	/* domain IOTLB invalidation */
#define	VTD_IIR_IIRG_PAGE	(3ULL << 60)	/* page IOTLB invalidation */
#define	VTD_IIR_DRAIN_READS	(1ULL << 49)	/* drain pending DMA reads */
#define	VTD_IIR_DRAIN_WRITES	(1ULL << 48)	/* drain pending DMA writes */
#define	VTD_IIR_DOMAIN_P	32

/* Root-entry, context-entry and page-table-entry bits. */
#define	VTD_ROOT_PRESENT	0x1
#define	VTD_CTX_PRESENT		0x1
#define	VTD_CTX_TT_ALL		(1UL << 2)

#define	VTD_PTE_RD		(1UL << 0)
#define	VTD_PTE_WR		(1UL << 1)
#define	VTD_PTE_SUPERPAGE	(1UL << 7)
#define	VTD_PTE_ADDR_M		(0x000FFFFFFFFFF000UL)

/*
 * A DMA-remapping domain: one translation context with its own page
 * tables, shared by all devices added to it via vtd_add_device().
 */
struct domain {
	uint64_t	*ptp;		/* first level page table page */
	int		pt_levels;	/* number of page table levels */
	int		addrwidth;	/* 'AW' field in context entry */
	int		spsmask;	/* supported super page sizes */
	u_int		id;		/* domain id */
	vm_paddr_t	maxaddr;	/* highest address to be mapped */
	SLIST_ENTRY(domain) next;
};

static SLIST_HEAD(, domain) domhead;	/* all domains created so far */

#define	DRHD_MAX_UNITS	8
static int		drhd_num;	/* number of DRHD units found */
static struct vtdmap	*vtdmaps[DRHD_MAX_UNITS];
static int		max_domains;
typedef int		(*drhd_ident_func_t)(void);

/* Shared root table and one context table per PCI bus, all page aligned. */
static uint64_t root_table[PAGE_SIZE / sizeof(uint64_t)] __aligned(4096);
static uint64_t ctx_tables[256][PAGE_SIZE / sizeof(uint64_t)] __aligned(4096);

static MALLOC_DEFINE(M_VTD, "vtd", "vtd");
/*
 * Config space register definitions from the "Intel 5520 and 5500" datasheet.
 *
 * Probe for the VT-d units of an Intel "Tylersburg" IOH (device id 0x342E
 * at bus 0, slot 20, func 0).  Handles the dual-IOH configuration by also
 * checking the non-legacy IOH's bus.  Fills in vtdmaps[] and returns the
 * number of enabled units found (0 if this is not a Tylersburg system).
 */
static int
tylersburg_vtd_ident(void)
{
	int units, nlbus;
	uint16_t did, vid;
	uint32_t miscsts, vtbar;

	const int bus = 0;
	const int slot = 20;
	const int func = 0;

	units = 0;

	vid = pci_cfgregread(bus, slot, func, PCIR_VENDOR, 2);
	did = pci_cfgregread(bus, slot, func, PCIR_DEVICE, 2);
	if (vid != 0x8086 || did != 0x342E)
		goto done;

	/*
	 * Check if this is a dual IOH configuration.
	 */
	miscsts = pci_cfgregread(bus, slot, func, 0x9C, 4);
	if (miscsts & (1 << 25))
		nlbus = pci_cfgregread(bus, slot, func, 0x160, 1);
	else
		nlbus = -1;

	vtbar = pci_cfgregread(bus, slot, func, 0x180, 4);
	if (vtbar & 0x1) {
		/* Bit 0 is the enable bit; upper bits hold the phys addr. */
		vtdmaps[units++] = (struct vtdmap *)
			PHYS_TO_DMAP(vtbar & 0xffffe000);
	} else if (bootverbose)
		printf("VT-d unit in legacy IOH is disabled!\n");

	if (nlbus != -1) {
		vtbar = pci_cfgregread(nlbus, slot, func, 0x180, 4);
		if (vtbar & 0x1) {
			vtdmaps[units++] = (struct vtdmap *)
				PHYS_TO_DMAP(vtbar & 0xffffe000);
		} else if (bootverbose)
			printf("VT-d unit in non-legacy IOH is disabled!\n");
	}
done:
	return (units);
}
/* NULL-terminated list of platform probe routines tried by vtd_init(). */
static drhd_ident_func_t drhd_ident_funcs[] = {
	tylersburg_vtd_ident,
	NULL
};
/*
 * Return the number of domains supported by this DRHD unit.  The 3-bit
 * ND capability field encodes 16 * 4^ND domains: 16, 64, 256, 1024,
 * 4K, 16K or 64K for ND = 0..6; ND = 7 is invalid.
 */
static int
vtd_max_domains(struct vtdmap *vtdmap)
{
	int nd;

	nd = VTD_CAP_ND(vtdmap->cap);
	if (nd >= 0 && nd <= 6)
		return (16 << (2 * nd));

	panic("vtd_max_domains: invalid value of nd (0x%0x)", nd);
}
/*
 * Allocate an unused domain id by linear search of the existing domains.
 * Panics if all 'max_domains' ids are in use.
 */
static u_int
domain_id(void)
{
	u_int id;
	struct domain *dom;

	/* Skip domain id 0 - it is reserved when Caching Mode field is set */
	for (id = 1; id < max_domains; id++) {
		SLIST_FOREACH(dom, &domhead, next) {
			if (dom->id == id)
				break;
		}
		/* SLIST_FOREACH leaves 'dom' NULL when no domain has 'id'. */
		if (dom == NULL)
			break;		/* found it */
	}
	
	if (id >= max_domains)
		panic("domain ids exhausted");

	return (id);
}
/*
 * Make CPU-written remapping structures visible to the DMA-remapping
 * hardware: flush CPU caches if the unit is not coherent, and issue a
 * write-buffer flush (spinning until it completes) if the unit needs one.
 */
static void
vtd_wbflush(struct vtdmap *vtdmap)
{

	if (VTD_ECAP_COHERENCY(vtdmap->ext_cap) == 0)
		pmap_invalidate_cache();

	if (VTD_CAP_RWBF(vtdmap->cap)) {
		vtdmap->gcr = VTD_GCR_WBF;
		/* busy-wait until hardware clears the flush-pending status */
		while ((vtdmap->gsr & VTD_GSR_WBFS) != 0)
			;
	}
}
/*
 * Issue a global context-cache invalidation and spin until the hardware
 * clears the ICC bit to signal completion.
 */
static void
vtd_ctx_global_invalidate(struct vtdmap *vtdmap)
{

	vtdmap->ccr = VTD_CCR_ICC | VTD_CCR_CIRG_GLOBAL;
	while ((vtdmap->ccr & VTD_CCR_ICC) != 0)
		;
}
/*
 * Issue a global IOTLB invalidation, draining pending DMA reads and
 * writes, and spin until the hardware clears the IVT bit.  The IOTLB
 * register block lives at the offset given by the ECAP IRO field (in
 * 16-byte units); the invalidate register is 8 bytes into that block.
 */
static void
vtd_iotlb_global_invalidate(struct vtdmap *vtdmap)
{
	int offset;
	volatile uint64_t *iotlb_reg, val;

	vtd_wbflush(vtdmap);

	offset = VTD_ECAP_IRO(vtdmap->ext_cap) * 16;
	iotlb_reg = (volatile uint64_t *)((caddr_t)vtdmap + offset + 8);

	*iotlb_reg =  VTD_IIR_IVT | VTD_IIR_IIRG_GLOBAL |
		      VTD_IIR_DRAIN_READS | VTD_IIR_DRAIN_WRITES;

	while (1) {
		val = *iotlb_reg;
		if ((val & VTD_IIR_IVT) == 0)
			break;
	}
}
/*
 * Turn on DMA remapping and spin until the status register confirms it.
 */
static void
vtd_translation_enable(struct vtdmap *vtdmap)
{

	vtdmap->gcr = VTD_GCR_TE;
	while ((vtdmap->gsr & VTD_GSR_TES) == 0)
		;
}
/*
 * Turn off DMA remapping and spin until the status register confirms it.
 */
static void
vtd_translation_disable(struct vtdmap *vtdmap)
{

	vtdmap->gcr = 0;
	while ((vtdmap->gsr & VTD_GSR_TES) != 0)
		;
}
/*
 * Probe for DMA-remapping hardware and set up the shared root table.
 * Returns 0 on success, ENXIO if no supported hardware was found.
 */
static int
vtd_init(void)
{
	int i, units;
	struct vtdmap *vtdmap;
	vm_paddr_t ctx_paddr;

	/* Try each platform probe routine until one finds hardware. */
	for (i = 0; drhd_ident_funcs[i] != NULL; i++) {
		units = (*drhd_ident_funcs[i])();
		if (units > 0)
			break;
	}

	if (units <= 0)
		return (ENXIO);

	drhd_num = units;
	vtdmap = vtdmaps[0];

	/*
	 * Caching Mode is not handled by this driver; see domain_id() for
	 * the related domain-0 reservation.
	 */
	if (VTD_CAP_CM(vtdmap->cap) != 0)
		panic("vtd_init: invalid caching mode");

	/* NOTE(review): max_domains derives from unit 0 only -- assumes all
	 * units have identical capabilities; confirm. */
	max_domains = vtd_max_domains(vtdmap);

	/*
	 * Set up the root-table to point to the context-entry tables
	 */
	for (i = 0; i < 256; i++) {
		ctx_paddr = vtophys(ctx_tables[i]);
		if (ctx_paddr & PAGE_MASK)
			panic("ctx table (0x%0lx) not page aligned", ctx_paddr);

		/* Root entries are 128 bits; the low word holds the ptr. */
		root_table[i * 2] = ctx_paddr | VTD_ROOT_PRESENT;
	}

	return (0);
}
/* Teardown hook for the iommu_ops interface; nothing to do yet. */
static void
vtd_cleanup(void)
{
}
/*
 * Program the root table into every DRHD unit, invalidate its caches and
 * enable translation.
 */
static void
vtd_enable(void)
{
	int i;
	struct vtdmap *vtdmap;

	for (i = 0; i < drhd_num; i++) {
		vtdmap = vtdmaps[i];
		vtd_wbflush(vtdmap);

		/* Update the root table address */
		vtdmap->rta = vtophys(root_table);
		vtdmap->gcr = VTD_GCR_SRTP;
		/* busy-wait until the root-table pointer is latched */
		while ((vtdmap->gsr & VTD_GSR_RTPS) == 0)
			;

		vtd_ctx_global_invalidate(vtdmap);
		vtd_iotlb_global_invalidate(vtdmap);

		vtd_translation_enable(vtdmap);
	}
}
/* Disable DMA translation on every DRHD unit. */
static void
vtd_disable(void)
{
	int i;
	struct vtdmap *vtdmap;

	for (i = 0; i < drhd_num; i++) {
		vtdmap = vtdmaps[i];
		vtd_translation_disable(vtdmap);
	}
}
/*
 * Attach pci device 'bus/slot/func' to the domain 'arg' by installing a
 * context-table entry pointing at the domain's page tables.  Panics if
 * the bsf is out of range or the device already belongs to a domain.
 */
static void
vtd_add_device(void *arg, int bus, int slot, int func)
{
	int idx;
	uint64_t *ctxp;
	struct domain *dom = arg;
	vm_paddr_t pt_paddr;
	struct vtdmap *vtdmap;

	if (bus < 0 || bus > PCI_BUSMAX ||
	    slot < 0 || slot > PCI_SLOTMAX ||
	    func < 0 || func > PCI_FUNCMAX)
		panic("vtd_add_device: invalid bsf %d/%d/%d", bus, slot, func);

	vtdmap = vtdmaps[0];
	ctxp = ctx_tables[bus];
	pt_paddr = vtophys(dom->ptp);
	/* Context entries are 128 bits; two uint64_t slots per device. */
	idx = (slot << 3 | func) * 2;

	if (ctxp[idx] & VTD_CTX_PRESENT) {
		panic("vtd_add_device: device %d/%d/%d is already owned by "
		      "domain %d", bus, slot, func,
		      (uint16_t)(ctxp[idx + 1] >> 8));
	}

	/*
	 * Order is important. The 'present' bit is set only after all fields
	 * of the context pointer are initialized.
	 */
	ctxp[idx + 1] = dom->addrwidth | (dom->id << 8);

	if (VTD_ECAP_DI(vtdmap->ext_cap))
		ctxp[idx] = VTD_CTX_TT_ALL;
	else
		ctxp[idx] = 0;

	ctxp[idx] |= pt_paddr | VTD_CTX_PRESENT;

	/*
	 * 'Not Present' entries are not cached in either the Context Cache
	 * or in the IOTLB, so there is no need to invalidate either of them.
	 */
}
/*
 * Detach pci device 'bus/slot/func' from its domain by clearing its
 * context-table entry, then invalidate the context cache and IOTLB on
 * every DRHD unit.  Panics if the bsf is out of range.
 */
static void
vtd_remove_device(void *arg, int bus, int slot, int func)
{
	int i, idx;
	uint64_t *ctxp;
	struct vtdmap *vtdmap;

	if (bus < 0 || bus > PCI_BUSMAX ||
	    slot < 0 || slot > PCI_SLOTMAX ||
	    func < 0 || func > PCI_FUNCMAX)
		/* Fixed copy-paste: this panic used to report vtd_add_device. */
		panic("vtd_remove_device: invalid bsf %d/%d/%d",
		      bus, slot, func);

	ctxp = ctx_tables[bus];
	/* Context entries are 128 bits; two uint64_t slots per device. */
	idx = (slot << 3 | func) * 2;

	/*
	 * Order is important. The 'present' bit must be cleared first.
	 */
	ctxp[idx] = 0;
	ctxp[idx + 1] = 0;

	/*
	 * Invalidate the Context Cache and the IOTLB.
	 *
	 * XXX use device-selective invalidation for Context Cache
	 * XXX use domain-selective invalidation for IOTLB
	 */
	for (i = 0; i < drhd_num; i++) {
		vtdmap = vtdmaps[i];
		vtd_ctx_global_invalidate(vtdmap);
		vtd_iotlb_global_invalidate(vtdmap);
	}
}
/*
 * Install a 'gpa' -> 'hpa' translation in domain 'arg' and return the
 * size of the mapping actually created (a power of two, at least a page).
 * The caller is expected to loop, advancing by the returned size, until
 * 'len' bytes are covered.  Panics on unaligned arguments.
 */
static uint64_t
vtd_create_mapping(void *arg, vm_paddr_t gpa, vm_paddr_t hpa, uint64_t len)
{
	struct domain *dom;
	int i, spshift, ptpshift, ptpindex, nlevels;
	uint64_t spsize, *ptp;

	dom = arg;
	ptpindex = 0;
	ptpshift = 0;

	if (gpa & PAGE_MASK)
		panic("vtd_create_mapping: unaligned gpa 0x%0lx", gpa);

	if (hpa & PAGE_MASK)
		panic("vtd_create_mapping: unaligned hpa 0x%0lx", hpa);

	if (len & PAGE_MASK)
		panic("vtd_create_mapping: unaligned len 0x%0lx", len);

	/*
	 * Compute the size of the mapping that we can accommodate.
	 *
	 * This is based on three factors:
	 * - supported super page size
	 * - alignment of the region starting at 'gpa' and 'hpa'
	 * - length of the region 'len'
	 *
	 * Candidate shifts are 48, 39, 30, 21 (1GB-superpage-style steps of
	 * 9 bits); fall through to 12 (a base page) if none fits.
	 */
	spshift = 48;
	for (i = 3; i >= 0; i--) {
		spsize = 1UL << spshift;
		if ((dom->spsmask & (1 << i)) != 0 &&
		    (gpa & (spsize - 1)) == 0 &&
		    (hpa & (spsize - 1)) == 0 &&
		    (len >= spsize)) {
			break;
		}
		spshift -= 9;
	}

	ptp = dom->ptp;
	nlevels = dom->pt_levels;
	while (--nlevels >= 0) {
		ptpshift = 12 + nlevels * 9;
		ptpindex = (gpa >> ptpshift) & 0x1FF;

		/* We have reached the leaf mapping */
		if (spshift >= ptpshift) {
			break;
		}

		/*
		 * We are working on a non-leaf page table page.
		 *
		 * Create a downstream page table page if necessary and point
		 * to it from the current page table.
		 */
		if (ptp[ptpindex] == 0) {
			void *nlp = malloc(PAGE_SIZE, M_VTD, M_WAITOK | M_ZERO);
			ptp[ptpindex] = vtophys(nlp)| VTD_PTE_RD | VTD_PTE_WR;
		}

		ptp = (uint64_t *)PHYS_TO_DMAP(ptp[ptpindex] & VTD_PTE_ADDR_M);
	}

	if ((gpa & ((1UL << ptpshift) - 1)) != 0)
		panic("gpa 0x%lx and ptpshift %d mismatch", gpa, ptpshift);

	/*
	 * Create a 'gpa' -> 'hpa' mapping
	 */
	ptp[ptpindex] = hpa | VTD_PTE_RD | VTD_PTE_WR;

	/* A leaf above the lowest level is a superpage mapping. */
	if (nlevels > 0)
		ptp[ptpindex] |= VTD_PTE_SUPERPAGE;

	return (1UL << ptpshift);
}
/*
 * Create a new remapping domain capable of mapping guest physical
 * addresses up to 'maxaddr'.  Chooses the smallest supported AGAW (and
 * hence page-table depth) that covers the range, allocates the top-level
 * page table page, and links the domain onto 'domhead'.
 * Returns an opaque pointer consumed by the other iommu_ops entry points.
 */
static void *
vtd_create_domain(vm_paddr_t maxaddr)
{
	struct domain *dom;
	vm_paddr_t addr;
	int tmp, i, gaw, agaw, sagaw, res, pt_levels, addrwidth;
	struct vtdmap *vtdmap;

	if (drhd_num <= 0)
		panic("vtd_create_domain: no dma remapping hardware available");

	vtdmap = vtdmaps[0];

	/*
	 * Calculate AGAW.
	 * Section 3.4.2 "Adjusted Guest Address Width", Architecture Spec.
	 *
	 * NOTE(review): the loop exits with 'gaw' one greater than the bit
	 * index whose power of two first reaches maxaddr -- confirm this
	 * matches the intended GAW before the round-up below.
	 */
	addr = 0;
	for (gaw = 0; addr < maxaddr; gaw++)
		addr = 1ULL << gaw;

	/* Round gaw up to the next value of the form 12 + 9 * N, cap at 64. */
	res = (gaw - 12) % 9;
	if (res == 0)
		agaw = gaw;
	else
		agaw = gaw + 9 - res;

	if (agaw > 64)
		agaw = 64;

	/*
	 * Select the smallest Supported AGAW and the corresponding number
	 * of page table levels.  SAGAW bit i corresponds to a 2-level +
	 * i table (30 + 9 * i address bits).
	 */
	pt_levels = 2;
	sagaw = 30;
	addrwidth = 0;
	tmp = VTD_CAP_SAGAW(vtdmap->cap);
	for (i = 0; i < 5; i++) {
		if ((tmp & (1 << i)) != 0 && sagaw >= agaw)
			break;
		pt_levels++;
		addrwidth++;
		sagaw += 9;
		if (sagaw > 64)
			sagaw = 64;
	}

	if (i >= 5) {
		panic("vtd_create_domain: SAGAW 0x%lx does not support AGAW %d",
		      VTD_CAP_SAGAW(vtdmap->cap), agaw);
	}

	dom = malloc(sizeof(struct domain), M_VTD, M_ZERO | M_WAITOK);
	dom->pt_levels = pt_levels;
	dom->addrwidth = addrwidth;
	dom->spsmask = VTD_CAP_SPS(vtdmap->cap);
	dom->id = domain_id();
	dom->maxaddr = maxaddr;
	dom->ptp = malloc(PAGE_SIZE, M_VTD, M_ZERO | M_WAITOK);
	if ((uintptr_t)dom->ptp & PAGE_MASK)
		panic("vtd_create_domain: ptp (%p) not page aligned", dom->ptp);

	SLIST_INSERT_HEAD(&domhead, dom, next);

	return (dom);
}
/*
 * Recursively free the page table rooted at 'ptp', which sits 'level'
 * levels above the leaves.  Superpage and not-present entries have no
 * downstream table and are skipped.  The page is zeroed before freeing
 * so the hardware never sees a stale-but-present entry.
 */
static void
vtd_free_ptp(uint64_t *ptp, int level)
{
	int i;
	uint64_t *nlp;

	if (level > 1) {
		for (i = 0; i < 512; i++) {
			if ((ptp[i] & (VTD_PTE_RD | VTD_PTE_WR)) == 0)
				continue;
			if ((ptp[i] & VTD_PTE_SUPERPAGE) != 0)
				continue;
			nlp = (uint64_t *)PHYS_TO_DMAP(ptp[i] & VTD_PTE_ADDR_M);
			vtd_free_ptp(nlp, level - 1);
		}
	}

	bzero(ptp, PAGE_SIZE);
	free(ptp, M_VTD);
}
/*
 * Destroy a domain created by vtd_create_domain(): unlink it, free its
 * page-table hierarchy and release the domain structure.  The caller is
 * responsible for having removed all devices from the domain first.
 */
static void
vtd_destroy_domain(void *arg)
{
	struct domain *dom;
	
	dom = arg;

	SLIST_REMOVE(&domhead, dom, domain, next);
	vtd_free_ptp(dom->ptp, dom->pt_levels);
	free(dom, M_VTD);
}
/*
 * Intel VT-d backend for the iommu_ops interface; entries are in the
 * order declared by 'struct iommu_ops' (init, cleanup, enable, disable,
 * create_domain, destroy_domain, create_mapping, add_device,
 * remove_device).
 */
struct iommu_ops iommu_ops_intel = {
	vtd_init,
	vtd_cleanup,
	vtd_enable,
	vtd_disable,
	vtd_create_domain,
	vtd_destroy_domain,
	vtd_create_mapping,
	vtd_add_device,
	vtd_remove_device,
};

230
sys/amd64/vmm/io/iommu.c Normal file
View File

@ -0,0 +1,230 @@
/*-
* Copyright (c) 2011 NetApp, Inc.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* $FreeBSD$
*/
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
#include <sys/param.h>
#include <sys/types.h>
#include <sys/systm.h>
#include <sys/bus.h>
#include <dev/pci/pcivar.h>
#include <dev/pci/pcireg.h>
#include <machine/md_var.h>
#include "vmm_util.h"
#include "iommu.h"
static boolean_t iommu_avail;	/* TRUE once the backend initialized OK */
static struct iommu_ops *ops;	/* vendor backend (Intel VT-d or AMD) */
static void *host_domain;	/* 1:1-mapped domain for host-owned devices */
/* Initialize the vendor backend; ENXIO when no backend was selected. */
static __inline int
IOMMU_INIT(void)
{

	if (ops == NULL)
		return (ENXIO);
	return (ops->init());
}
/* Let the backend release its state; no-op unless it came up. */
static __inline void
IOMMU_CLEANUP(void)
{

	if (ops == NULL || !iommu_avail)
		return;
	ops->cleanup();
}
/* Create a translation domain covering [0, maxaddr); NULL without IOMMU. */
static __inline void *
IOMMU_CREATE_DOMAIN(vm_paddr_t maxaddr)
{

	if (ops == NULL || !iommu_avail)
		return (NULL);
	return (ops->create_domain(maxaddr));
}
/* Destroy a domain previously created by IOMMU_CREATE_DOMAIN(). */
static __inline void
IOMMU_DESTROY_DOMAIN(void *dom)
{

	if (ops == NULL || !iommu_avail)
		return;
	ops->destroy_domain(dom);
}
/*
 * Map 'len' bytes gpa->hpa in 'domain'; returns the number of bytes the
 * backend actually mapped in this call.
 */
static __inline uint64_t
IOMMU_CREATE_MAPPING(void *domain, vm_paddr_t gpa, vm_paddr_t hpa, uint64_t len)
{

	if (ops == NULL || !iommu_avail)
		return (len);		/* XXX no IOMMU: claim it all mapped */
	return (ops->create_mapping(domain, gpa, hpa, len));
}
/* Attach PCI device bus/slot/func to 'domain'. */
static __inline void
IOMMU_ADD_DEVICE(void *domain, int bus, int slot, int func)
{

	if (ops == NULL || !iommu_avail)
		return;
	ops->add_device(domain, bus, slot, func);
}
/* Detach PCI device bus/slot/func from 'domain'. */
static __inline void
IOMMU_REMOVE_DEVICE(void *domain, int bus, int slot, int func)
{

	if (ops == NULL || !iommu_avail)
		return;
	ops->remove_device(domain, bus, slot, func);
}
/* Turn on DMA remapping in the backend hardware. */
static __inline void
IOMMU_ENABLE(void)
{

	if (ops == NULL || !iommu_avail)
		return;
	ops->enable();
}
/* Turn off DMA remapping in the backend hardware. */
static __inline void
IOMMU_DISABLE(void)
{

	if (ops == NULL || !iommu_avail)
		return;
	ops->disable();
}
/*
 * Probe for an IOMMU backend, create the host identity-mapped domain
 * and place every PCI device not reserved for passthrough into it,
 * then enable DMA remapping. Silently returns if no IOMMU is present.
 */
void
iommu_init(void)
{
	int error, bus, slot, func;
	vm_paddr_t maxaddr;
	const char *name;
	device_t dev;
	/* Select the vendor backend by CPU vendor. */
	if (vmm_is_intel())
		ops = &iommu_ops_intel;
	else if (vmm_is_amd())
		ops = &iommu_ops_amd;
	else
		ops = NULL;
	error = IOMMU_INIT();
	if (error)
		return;
	iommu_avail = TRUE;
	/*
	 * Create a domain for the devices owned by the host
	 */
	maxaddr = ptoa(Maxmem);
	host_domain = IOMMU_CREATE_DOMAIN(maxaddr);
	if (host_domain == NULL)
		panic("iommu_init: unable to create a host domain");
	/*
	 * Create 1:1 mappings from '0' to 'Maxmem' for devices assigned to
	 * the host
	 */
	iommu_create_mapping(host_domain, 0, 0, maxaddr);
	/* Walk the whole of PCI segment 0 (bus 0..PCI_BUSMAX). */
	for (bus = 0; bus <= PCI_BUSMAX; bus++) {
		for (slot = 0; slot <= PCI_SLOTMAX; slot++) {
			for (func = 0; func <= PCI_FUNCMAX; func++) {
				dev = pci_find_dbsf(0, bus, slot, func);
				if (dev == NULL)
					continue;
				/* skip passthrough devices */
				name = device_get_name(dev);
				if (name != NULL && strcmp(name, "ppt") == 0)
					continue;
				/* everything else belongs to the host domain */
				iommu_add_device(host_domain, bus, slot, func);
			}
		}
	}
	IOMMU_ENABLE();
}
/*
 * Undo iommu_init(): disable remapping first, then destroy the host
 * domain and let the backend release its state.
 */
void
iommu_cleanup(void)
{
	IOMMU_DISABLE();
	IOMMU_DESTROY_DOMAIN(host_domain);
	IOMMU_CLEANUP();
}
/* Public entry point: create a translation domain covering [0, maxaddr). */
void *
iommu_create_domain(vm_paddr_t maxaddr)
{

	return (IOMMU_CREATE_DOMAIN(maxaddr));
}
/* Public entry point: destroy a domain made by iommu_create_domain(). */
void
iommu_destroy_domain(void *dom)
{

	IOMMU_DESTROY_DOMAIN(dom);
}
/*
 * Map 'len' bytes from guest-physical 'gpa' to host-physical 'hpa' in
 * domain 'dom'. The backend may map less than requested per call (one
 * page or one superpage at a time), so iterate until done.
 */
void
iommu_create_mapping(void *dom, vm_paddr_t gpa, vm_paddr_t hpa, size_t len)
{
	uint64_t mapped, remaining;
	remaining = len;
	while (remaining > 0) {
		mapped = IOMMU_CREATE_MAPPING(dom, gpa, hpa, remaining);
		/*
		 * Bail out if the backend made no progress; without this
		 * guard a zero return value would spin forever.
		 */
		if (mapped == 0)
			break;
		gpa += mapped;
		hpa += mapped;
		remaining -= mapped;
	}
}
/* Public entry point: attach a PCI device to 'dom'. */
void
iommu_add_device(void *dom, int bus, int slot, int func)
{

	IOMMU_ADD_DEVICE(dom, bus, slot, func);
}
/* Public entry point: detach a PCI device from 'dom'. */
void
iommu_remove_device(void *dom, int bus, int slot, int func)
{

	IOMMU_REMOVE_DEVICE(dom, bus, slot, func);
}

67
sys/amd64/vmm/io/iommu.h Normal file
View File

@ -0,0 +1,67 @@
/*-
* Copyright (c) 2011 NetApp, Inc.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* $FreeBSD$
*/
#ifndef _IO_IOMMU_H_
#define _IO_IOMMU_H_
/* Hook signatures implemented by a vendor backend (Intel VT-d or AMD). */
typedef int (*iommu_init_func_t)(void);
typedef void (*iommu_cleanup_func_t)(void);
typedef void (*iommu_enable_func_t)(void);
typedef void (*iommu_disable_func_t)(void);
typedef void *(*iommu_create_domain_t)(vm_paddr_t maxaddr);
typedef void (*iommu_destroy_domain_t)(void *domain);
/* Returns the number of bytes actually mapped by this call. */
typedef uint64_t (*iommu_create_mapping_t)(void *domain, vm_paddr_t gpa,
					   vm_paddr_t hpa, uint64_t len);
typedef void (*iommu_add_device_t)(void *domain, int bus, int slot, int func);
typedef void (*iommu_remove_device_t)(void *dom, int bus, int slot, int func);
struct iommu_ops {
	iommu_init_func_t init; /* module wide */
	iommu_cleanup_func_t cleanup;
	iommu_enable_func_t enable;
	iommu_disable_func_t disable;
	iommu_create_domain_t create_domain; /* domain-specific */
	iommu_destroy_domain_t destroy_domain;
	iommu_create_mapping_t create_mapping;
	iommu_add_device_t add_device;
	iommu_remove_device_t remove_device;
};
extern struct iommu_ops iommu_ops_intel;
extern struct iommu_ops iommu_ops_amd;
/* Vendor-neutral wrappers; safe no-ops when no IOMMU is available. */
void iommu_init(void);
void iommu_cleanup(void);
void *iommu_create_domain(vm_paddr_t maxaddr);
void iommu_destroy_domain(void *dom);
void iommu_create_mapping(void *dom, vm_paddr_t gpa, vm_paddr_t hpa,
			  size_t len);
void iommu_add_device(void *dom, int bus, int slot, int func);
void iommu_remove_device(void *dom, int bus, int slot, int func);
#endif

449
sys/amd64/vmm/io/ppt.c Normal file
View File

@ -0,0 +1,449 @@
/*-
* Copyright (c) 2011 NetApp, Inc.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* $FreeBSD$
*/
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/module.h>
#include <sys/bus.h>
#include <sys/pciio.h>
#include <sys/rman.h>
#include <dev/pci/pcivar.h>
#include <dev/pci/pcireg.h>
#include <machine/resource.h>
#include <machine/vmm.h>
#include <machine/vmm_dev.h>
#include "vmm_lapic.h"
#include "vmm_ktr.h"
#include "iommu.h"
#include "ppt.h"
#define MAX_PPTDEVS (sizeof(pptdevs) / sizeof(pptdevs[0]))
#define MAX_MMIOSEGS (PCIR_MAX_BAR_0 + 1)	/* one segment per BAR */
#define MAX_MSIMSGS 32				/* MSI messages per device */
/* Per-message argument handed to the pptintr() filter. */
struct pptintr_arg { /* pptintr(pptintr_arg) */
	struct pptdev *pptdev;
	int msg;		/* index of this MSI message */
};
/* State of each PCI device that may be passed through to a guest. */
static struct pptdev {
	device_t dev;
	struct vm *vm; /* owner of this device */
	struct vm_memory_segment mmio[MAX_MMIOSEGS];
	struct {
		int num_msgs; /* guest state */
		int vector;
		int vcpu;
		int startrid; /* host state */
		struct resource *res[MAX_MSIMSGS];
		void *cookie[MAX_MSIMSGS];
		struct pptintr_arg arg[MAX_MSIMSGS];
	} msi;
} pptdevs[32];
static int num_pptdevs;	/* entries of pptdevs[] in use */
/*
 * newbus probe: claim a PCI function only if the administrator listed
 * it as a passthrough device and it is a normal (endpoint) function.
 */
static int
ppt_probe(device_t dev)
{
	struct pci_devinfo *dinfo;
	int b, s, f;

	dinfo = (struct pci_devinfo *)device_get_ivars(dev);
	b = pci_get_bus(dev);
	s = pci_get_slot(dev);
	f = pci_get_function(dev);
	/*
	 * To qualify as a pci passthrough device a device must:
	 * - be allowed by administrator to be used in this role
	 * - be an endpoint device
	 */
	if (!vmm_is_pptdev(b, s, f))
		return (ENXIO);
	if ((dinfo->cfg.hdrtype & PCIM_HDRTYPE) != PCIM_HDRTYPE_NORMAL)
		return (ENXIO);
	return (0);
}
/* newbus attach: record the device in the next free pptdevs[] slot. */
static int
ppt_attach(device_t dev)
{
	int idx;

	if (num_pptdevs >= MAX_PPTDEVS) {
		printf("ppt_attach: maximum number of pci passthrough devices "
		    "exceeded\n");
		return (ENXIO);
	}
	idx = num_pptdevs++;
	pptdevs[idx].dev = dev;
	if (bootverbose)
		device_printf(dev, "attached\n");
	return (0);
}
/* newbus detach: currently unconditional (see XXX below). */
static int
ppt_detach(device_t dev)
{
	/*
	 * XXX check whether there are any pci passthrough devices assigned
	 * to guests before we allow this driver to detach.
	 */
	return (0);
}
/* newbus glue: register 'ppt' as a driver on the pci bus. */
static device_method_t ppt_methods[] = {
	/* Device interface */
	DEVMETHOD(device_probe, ppt_probe),
	DEVMETHOD(device_attach, ppt_attach),
	DEVMETHOD(device_detach, ppt_detach),
	{0, 0}
};
static devclass_t ppt_devclass;
DEFINE_CLASS_0(ppt, ppt_driver, ppt_methods, 0);
DRIVER_MODULE(ppt, pci, ppt_driver, ppt_devclass, NULL, NULL);
/* Look up the pptdev entry for PCI address bus/slot/func; NULL if none. */
static struct pptdev *
ppt_find(int bus, int slot, int func)
{
	device_t dev;
	int i;

	for (i = 0; i < num_pptdevs; i++) {
		dev = pptdevs[i].dev;
		if (pci_get_bus(dev) == bus &&
		    pci_get_slot(dev) == slot &&
		    pci_get_function(dev) == func)
			return (&pptdevs[i]);
	}
	return (NULL);
}
/* Remove every active guest MMIO mapping of 'ppt' and clear the slots. */
static void
ppt_unmap_mmio(struct vm *vm, struct pptdev *ppt)
{
	struct vm_memory_segment *seg;
	int idx;

	for (idx = 0; idx < MAX_MMIOSEGS; idx++) {
		seg = &ppt->mmio[idx];
		if (seg->len == 0)
			continue;	/* slot not in use */
		(void)vm_unmap_mmio(vm, seg->gpa, seg->len);
		bzero(seg, sizeof(struct vm_memory_segment));
	}
}
/*
 * Release all interrupt resources held for 'ppt': tear down each
 * handler, release each IRQ resource, and release the MSI allocation
 * when one was made (startrid == 1 implies MSI; 0 implies legacy INTx).
 */
static void
ppt_teardown_msi(struct pptdev *ppt)
{
	int i, rid;
	void *cookie;
	struct resource *res;
	if (ppt->msi.num_msgs == 0)
		return;
	for (i = 0; i < ppt->msi.num_msgs; i++) {
		rid = ppt->msi.startrid + i;
		res = ppt->msi.res[i];
		cookie = ppt->msi.cookie[i];
		/* Handler must go before the resource it is attached to. */
		if (cookie != NULL)
			bus_teardown_intr(ppt->dev, res, cookie);
		if (res != NULL)
			bus_release_resource(ppt->dev, SYS_RES_IRQ, rid, res);
		ppt->msi.res[i] = NULL;
		ppt->msi.cookie[i] = NULL;
	}
	if (ppt->msi.startrid == 1)
		pci_release_msi(ppt->dev);
	ppt->msi.num_msgs = 0;
}
/*
 * Give ownership of PCI device bus/slot/func to 'vm' and place it in
 * the VM's IOMMU domain. EBUSY if another VM owns it, ENOENT if the
 * device is not a registered passthrough device.
 */
int
ppt_assign_device(struct vm *vm, int bus, int slot, int func)
{
	struct pptdev *ppt;

	ppt = ppt_find(bus, slot, func);
	if (ppt == NULL)
		return (ENOENT);
	/*
	 * If this device is owned by a different VM then we
	 * cannot change its owner.
	 */
	if (ppt->vm != NULL && ppt->vm != vm)
		return (EBUSY);
	ppt->vm = vm;
	iommu_add_device(vm_iommu_domain(vm), bus, slot, func);
	return (0);
}
/*
 * Release PCI device bus/slot/func from 'vm': unmap its guest MMIO,
 * free its interrupt resources, and detach it from the VM's IOMMU
 * domain. EBUSY if 'vm' is not the owner, ENOENT if unknown.
 */
int
ppt_unassign_device(struct vm *vm, int bus, int slot, int func)
{
	struct pptdev *ppt;

	ppt = ppt_find(bus, slot, func);
	if (ppt == NULL)
		return (ENOENT);
	/* Only the owning VM may release the device. */
	if (ppt->vm != vm)
		return (EBUSY);
	ppt_unmap_mmio(vm, ppt);
	ppt_teardown_msi(ppt);
	iommu_remove_device(vm_iommu_domain(vm), bus, slot, func);
	ppt->vm = NULL;
	return (0);
}
/* Release every passthrough device currently owned by 'vm'. */
int
ppt_unassign_all(struct vm *vm)
{
	device_t dev;
	int i;

	for (i = 0; i < num_pptdevs; i++) {
		if (pptdevs[i].vm != vm)
			continue;
		dev = pptdevs[i].dev;
		ppt_unassign_device(vm, pci_get_bus(dev), pci_get_slot(dev),
		    pci_get_function(dev));
	}
	return (0);
}
/*
 * Map 'len' bytes of host-physical MMIO at 'hpa' into the guest at
 * 'gpa' for a device owned by 'vm', recording the mapping in the first
 * free mmio[] slot. Returns EBUSY if 'vm' does not own the device,
 * ENOSPC when all MAX_MMIOSEGS slots are in use, ENOENT if unknown.
 */
int
ppt_map_mmio(struct vm *vm, int bus, int slot, int func,
	     vm_paddr_t gpa, size_t len, vm_paddr_t hpa)
{
	int i, error;
	struct vm_memory_segment *seg;
	struct pptdev *ppt;
	ppt = ppt_find(bus, slot, func);
	if (ppt != NULL) {
		if (ppt->vm != vm)
			return (EBUSY);
		for (i = 0; i < MAX_MMIOSEGS; i++) {
			seg = &ppt->mmio[i];
			/* len == 0 marks a free slot. */
			if (seg->len == 0) {
				error = vm_map_mmio(vm, gpa, len, hpa);
				if (error == 0) {
					seg->gpa = gpa;
					seg->len = len;
					seg->hpa = hpa;
				}
				return (error);
			}
		}
		return (ENOSPC);
	}
	return (ENOENT);
}
/*
 * Interrupt filter for a passthrough device: forward the interrupt to
 * the guest as 'msi.vector + msg' on the configured vcpu's local APIC.
 */
static int
pptintr(void *arg)
{
	int vec;
	struct pptdev *ppt;
	struct pptintr_arg *pptarg;
	pptarg = arg;
	ppt = pptarg->pptdev;
	vec = ppt->msi.vector + pptarg->msg;
	if (ppt->vm != NULL)
		(void) lapic_set_intr(ppt->vm, ppt->msi.vcpu, vec);
	else {
		/*
		 * XXX
		 * This is not expected to happen - panic?
		 */
	}
	/*
	 * For legacy interrupts give other filters a chance in case
	 * the interrupt was not generated by the passthrough device.
	 */
	if (ppt->msi.startrid == 0)
		return (FILTER_STRAY);
	else
		return (FILTER_HANDLED);
}
/*
* XXX
* When we try to free the MSI resource the kernel will bind the thread to
* the host cpu was originally handling the MSI. The function freeing the
* MSI vector (apic_free_vector()) will panic the kernel if the thread
* is already bound to a cpu.
*
* So, we temporarily unbind the vcpu thread before freeing the MSI resource.
*/
/*
 * Wrapper around ppt_teardown_msi() that temporarily clears any cpu
 * pinning of 'vcpu' for the duration of the teardown (rationale in the
 * comment above), then restores the previous pinning.
 */
static void
PPT_TEARDOWN_MSI(struct vm *vm, int vcpu, struct pptdev *ppt)
{
	int pincpu = -1;
	vm_get_pinning(vm, vcpu, &pincpu);
	if (pincpu >= 0)
		vm_set_pinning(vm, vcpu, -1);
	ppt_teardown_msi(ppt);
	if (pincpu >= 0)
		vm_set_pinning(vm, vcpu, pincpu);
}
/*
 * Configure guest interrupt delivery for a passthrough device: free any
 * previous setup, then allocate 'numvec' MSI messages (or fall back to
 * the shared legacy INTx line when the device has no MSI capability)
 * and attach pptintr() to each. 'numvec' == 0 just tears down.
 * Returns 0 on success or EINVAL/ENOENT/EBUSY/ENOSPC/ENXIO on failure.
 */
int
ppt_setup_msi(struct vm *vm, int vcpu, int bus, int slot, int func,
	      int destcpu, int vector, int numvec)
{
	int i, rid, flags;
	int msi_count, startrid, error, tmp;
	struct pptdev *ppt;
	/* Range-check the guest-supplied parameters. */
	if ((destcpu >= VM_MAXCPU || destcpu < 0) ||
	    (vector < 0 || vector > 255) ||
	    (numvec < 0 || numvec > MAX_MSIMSGS))
		return (EINVAL);
	ppt = ppt_find(bus, slot, func);
	if (ppt == NULL)
		return (ENOENT);
	if (ppt->vm != vm)		/* Make sure we own this device */
		return (EBUSY);
	/* Free any allocated resources */
	PPT_TEARDOWN_MSI(vm, vcpu, ppt);
	if (numvec == 0)		/* nothing more to do */
		return (0);
	flags = RF_ACTIVE;
	msi_count = pci_msi_count(ppt->dev);
	if (msi_count == 0) {
		startrid = 0;		/* legacy interrupt */
		msi_count = 1;
		flags |= RF_SHAREABLE;
	} else
		startrid = 1;		/* MSI */
	/*
	 * The device must be capable of supporting the number of vectors
	 * the guest wants to allocate.
	 */
	if (numvec > msi_count)
		return (EINVAL);
	/*
	 * Make sure that we can allocate all the MSI vectors that are needed
	 * by the guest.
	 */
	if (startrid == 1) {
		tmp = numvec;
		error = pci_alloc_msi(ppt->dev, &tmp);
		if (error)
			return (error);
		else if (tmp != numvec) {
			pci_release_msi(ppt->dev);
			return (ENOSPC);
		} else {
			/* success */
		}
	}
	ppt->msi.vector = vector;
	ppt->msi.vcpu = destcpu;
	ppt->msi.startrid = startrid;
	/*
	 * Allocate the irq resource and attach it to the interrupt handler.
	 */
	for (i = 0; i < numvec; i++) {
		/* num_msgs is kept current so a partial setup tears down. */
		ppt->msi.num_msgs = i + 1;
		ppt->msi.cookie[i] = NULL;
		rid = startrid + i;
		ppt->msi.res[i] = bus_alloc_resource_any(ppt->dev, SYS_RES_IRQ,
							 &rid, flags);
		if (ppt->msi.res[i] == NULL)
			break;
		ppt->msi.arg[i].pptdev = ppt;
		ppt->msi.arg[i].msg = i;
		error = bus_setup_intr(ppt->dev, ppt->msi.res[i],
				       INTR_TYPE_NET | INTR_MPSAFE | INTR_FAST,
				       pptintr, NULL, &ppt->msi.arg[i],
				       &ppt->msi.cookie[i]);
		if (error != 0)
			break;
	}
	/* Partial success is failure: undo everything. */
	if (i < numvec) {
		PPT_TEARDOWN_MSI(vm, vcpu, ppt);
		return (ENXIO);
	}
	return (0);
}

40
sys/amd64/vmm/io/ppt.h Normal file
View File

@ -0,0 +1,40 @@
/*-
* Copyright (c) 2011 NetApp, Inc.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* $FreeBSD$
*/
#ifndef _IO_PPT_H_
#define _IO_PPT_H_
/* PCI passthrough device control, exported to the vmm core. */
int ppt_assign_device(struct vm *vm, int bus, int slot, int func);
int ppt_unassign_device(struct vm *vm, int bus, int slot, int func);
int ppt_unassign_all(struct vm *vm);
int ppt_map_mmio(struct vm *vm, int bus, int slot, int func,
		 vm_paddr_t gpa, size_t len, vm_paddr_t hpa);
int ppt_setup_msi(struct vm *vm, int vcpu, int bus, int slot, int func,
		  int destcpu, int vector, int numvec);
#endif

270
sys/amd64/vmm/io/vdev.c Normal file
View File

@ -0,0 +1,270 @@
/*-
* Copyright (c) 2011 NetApp, Inc.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* $FreeBSD$
*/
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
#include <sys/param.h>
#include <sys/kernel.h>
#include <sys/systm.h>
#include <sys/malloc.h>
#include "vdev.h"
/* A registered virtual device: its ops vector and opaque softc. */
struct vdev {
	SLIST_ENTRY(vdev) entry;
	struct vdev_ops *ops;
	void *dev;
};
static SLIST_HEAD(, vdev) vdev_head;
static int vdev_count;
/* An MMIO range claimed by a virtual device. */
struct vdev_region {
	SLIST_ENTRY(vdev_region) entry;
	struct vdev_ops *ops;
	void *dev;
	struct io_region *io;
};
static SLIST_HEAD(, vdev_region) region_head;
static int region_count;
static MALLOC_DEFINE(M_VDEV, "vdev", "vdev");
/* Events broadcast by vdev_system_event(). */
#define VDEV_INIT (0)
#define VDEV_RESET (1)
#define VDEV_HALT (2)
// static const char* vdev_event_str[] = {"VDEV_INIT", "VDEV_RESET", "VDEV_HALT"};
/*
 * Broadcast 'event' (VDEV_INIT/RESET/HALT) to every registered vdev,
 * stopping at the first device that reports an error. Returns 0 on
 * success or the failing device's error code.
 */
static int
vdev_system_event(int event)
{
	struct vdev *vd;
	int rc;
	// TODO: locking
	SLIST_FOREACH(vd, &vdev_head, entry) {
		// printf("%s : %s Device %s\n", __func__, vdev_event_str[event], vd->ops->name);
		/*
		 * Initialize 'rc' so an unrecognized event does not read
		 * an indeterminate value (the default case sets nothing).
		 */
		rc = 0;
		switch (event) {
		case VDEV_INIT:
			rc = vd->ops->init(vd->dev);
			break;
		case VDEV_RESET:
			rc = vd->ops->reset(vd->dev);
			break;
		case VDEV_HALT:
			rc = vd->ops->halt(vd->dev);
			break;
		default:
			break;
		}
		if (rc) {
			printf("vdev %s init failed rc=%d\n",
			       vd->ops->name, rc);
			return rc;
		}
	}
	return 0;
}
/* Broadcast VDEV_INIT to every registered device. */
int
vdev_init(void)
{

	return (vdev_system_event(VDEV_INIT));
}
/* Broadcast VDEV_RESET to every registered device. */
int
vdev_reset(void)
{

	return (vdev_system_event(VDEV_RESET));
}
/* Broadcast VDEV_HALT to every registered device. */
int
vdev_halt(void)
{

	return (vdev_system_event(VDEV_HALT));
}
/* Reset the vdev framework: empty device and region lists. */
void
vdev_vm_init(void)
{

	SLIST_INIT(&vdev_head);
	SLIST_INIT(&region_head);
	vdev_count = 0;
	region_count = 0;
}
void
vdev_vm_cleanup(void)
{
struct vdev *vd;
// TODO: locking
while (!SLIST_EMPTY(&vdev_head)) {
vd = SLIST_FIRST(&vdev_head);
SLIST_REMOVE_HEAD(&vdev_head, entry);
free(vd, M_VDEV);
vdev_count--;
}
}
/* Add a virtual device ('ops' + opaque softc 'dev') to the global list. */
int
vdev_register(struct vdev_ops *ops, void *dev)
{
	struct vdev *vd;

	vd = malloc(sizeof(*vd), M_VDEV, M_WAITOK | M_ZERO);
	vd->dev = dev;
	vd->ops = ops;
	// TODO: locking
	SLIST_INSERT_HEAD(&vdev_head, vd, entry);
	vdev_count++;
	return (0);
}
/*
 * Remove the vdev whose softc is 'dev' from the global list and free
 * it. No-op if the device was never registered.
 */
void
vdev_unregister(void *dev)
{
	struct vdev *vd, *found;
	found = NULL;
	// TODO: locking
	SLIST_FOREACH(vd, &vdev_head, entry) {
		if (vd->dev == dev) {
			found = vd;
			/* First match is enough; stop scanning. */
			break;
		}
	}
	if (found) {
		SLIST_REMOVE(&vdev_head, found, vdev, entry);
		free(found, M_VDEV);
		/*
		 * Keep the count in step with the list, matching
		 * vdev_register()/vdev_vm_cleanup(); previously the
		 * counter was never decremented here.
		 */
		vdev_count--;
	}
}
#define IN_RANGE(val, start, end) \
	(((val) >= (start)) && ((val) < (end)))
/*
 * Find a registered region that entirely contains the range 'io'.
 * A NULL 'dev' acts as a wildcard matching any device's region:
 * vdev_memrw() relies on this for MMIO dispatch. (The original test
 * '(dev && dev == region->dev)' could never match with a NULL 'dev',
 * so every vdev_memread/memwrite lookup failed.)
 */
static struct vdev_region*
vdev_find_region(struct io_region *io, void *dev)
{
	struct vdev_region *region, *found;
	uint64_t region_base;
	uint64_t region_end;
	found = NULL;
	// TODO: locking
	// FIXME: we should verify we are in the context the current
	// vcpu here as well.
	SLIST_FOREACH(region, &region_head, entry) {
		region_base = region->io->base;
		region_end = region_base + region->io->len;
		if (IN_RANGE(io->base, region_base, region_end) &&
		    IN_RANGE(io->base+io->len, region_base, region_end+1) &&
		    (dev == NULL || dev == region->dev)) {
			found = region;
			break;
		}
	}
	return found;
}
int
vdev_register_region(struct vdev_ops *ops, void *dev, struct io_region *io)
{
struct vdev_region *region;
region = vdev_find_region(io, dev);
if (region) {
return -EEXIST;
}
region = malloc(sizeof(*region), M_VDEV, M_WAITOK | M_ZERO);
region->io = io;
region->ops = ops;
region->dev = dev;
// TODO: locking
SLIST_INSERT_HEAD(&region_head, region, entry);
region_count++;
return 0;
}
void
vdev_unregister_region(void *dev, struct io_region *io)
{
struct vdev_region *region;
region = vdev_find_region(io, dev);
if (region) {
SLIST_REMOVE(&region_head, region, vdev_region, entry);
free(region, M_VDEV);
region_count--;
}
}
/*
 * Common MMIO dispatch: find the region covering [gpa, gpa+size),
 * check its read/write permission, and call the owning device's
 * memread/memwrite hook. Returns the hook's result, -EINVAL when no
 * region covers the access, -EPERM when the access type is not allowed.
 */
static int
vdev_memrw(uint64_t gpa, opsize_t size, uint64_t *data, int read)
{
	struct vdev_region *region;
	struct io_region io;
	region_attr_t attr;
	int rc;
	io.base = gpa;
	io.len = size;
	/* NULL dev: match a region belonging to any device. */
	region = vdev_find_region(&io, NULL);
	if (!region)
		return -EINVAL;
	attr = (read) ? MMIO_READ : MMIO_WRITE;
	if (!(region->io->attr & attr))
		return -EPERM;
	if (read)
		rc = region->ops->memread(region->dev, gpa, size, data);
	else
		rc = region->ops->memwrite(region->dev, gpa, size, *data);
	return rc;
}
/* Guest MMIO read: dispatch through the common read/write path. */
int
vdev_memread(uint64_t gpa, opsize_t size, uint64_t *data)
{

	return (vdev_memrw(gpa, size, data, 1));
}
/* Guest MMIO write: dispatch through the common read/write path. */
int
vdev_memwrite(uint64_t gpa, opsize_t size, uint64_t data)
{

	return (vdev_memrw(gpa, size, &data, 0));
}

84
sys/amd64/vmm/io/vdev.h Normal file
View File

@ -0,0 +1,84 @@
/*-
* Copyright (c) 2011 NetApp, Inc.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* $FreeBSD$
*/
#ifndef _VDEV_H_
#define _VDEV_H_
/* Access width of a guest MMIO operation, in bytes. */
typedef enum {
	BYTE = 1,
	WORD = 2,
	DWORD = 4,
	QWORD = 8,
} opsize_t;
/* Permission bits for an MMIO region. */
typedef enum {
	MMIO_READ = 1,
	MMIO_WRITE = 2,
} region_attr_t;
/* A guest-physical MMIO range claimed by a virtual device. */
struct io_region {
	uint64_t base;
	uint64_t len;
	region_attr_t attr;
	int vcpu;
};
/* Callbacks a virtual device implements; 'dev' is its opaque softc. */
typedef int (*vdev_init_t)(void* dev);
typedef int (*vdev_reset_t)(void* dev);
typedef int (*vdev_halt_t)(void* dev);
typedef int (*vdev_memread_t)(void* dev, uint64_t gpa, opsize_t size, uint64_t *data);
typedef int (*vdev_memwrite_t)(void* dev, uint64_t gpa, opsize_t size, uint64_t data);
struct vdev_ops {
	const char *name;
	vdev_init_t init;
	vdev_reset_t reset;
	vdev_halt_t halt;
	vdev_memread_t memread;
	vdev_memwrite_t memwrite;
};
void vdev_vm_init(void);
void vdev_vm_cleanup(void);
int vdev_register(struct vdev_ops *ops, void *dev);
void vdev_unregister(void *dev);
int vdev_register_region(struct vdev_ops *ops, void *dev, struct io_region *io);
void vdev_unregister_region(void *dev, struct io_region *io);
/* Broadcast helpers and MMIO dispatch used by the vmm core. */
int vdev_init(void);
int vdev_reset(void);
int vdev_halt(void);
int vdev_memread(uint64_t gpa, opsize_t size, uint64_t *data);
int vdev_memwrite(uint64_t gpa, opsize_t size, uint64_t data);
#endif /* _VDEV_H_ */

812
sys/amd64/vmm/io/vlapic.c Normal file
View File

@ -0,0 +1,812 @@
/*-
* Copyright (c) 2011 NetApp, Inc.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* $FreeBSD$
*/
/*-
* Copyright (c) 2011 NetApp, Inc.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* $FreeBSD$
*/
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
#include <sys/param.h>
#include <sys/kernel.h>
#include <sys/malloc.h>
#include <sys/systm.h>
#include <machine/clock.h>
#include <machine/apicreg.h>
#include <machine/vmm.h>
#include "vmm_lapic.h"
#include "vmm_ktr.h"
#include "vdev.h"
#include "vlapic.h"
#define VLAPIC_CTR0(vlapic, format) \
VMM_CTR0((vlapic)->vm, (vlapic)->vcpuid, format)
#define VLAPIC_CTR1(vlapic, format, p1) \
VMM_CTR1((vlapic)->vm, (vlapic)->vcpuid, format, p1)
#define VLAPIC_CTR_IRR(vlapic, msg) \
do { \
uint32_t *irrptr = &(vlapic)->apic.irr0; \
irrptr[0] = irrptr[0]; /* silence compiler */ \
VLAPIC_CTR1((vlapic), msg " irr0 0x%08x", irrptr[0 << 2]); \
VLAPIC_CTR1((vlapic), msg " irr1 0x%08x", irrptr[1 << 2]); \
VLAPIC_CTR1((vlapic), msg " irr2 0x%08x", irrptr[2 << 2]); \
VLAPIC_CTR1((vlapic), msg " irr3 0x%08x", irrptr[3 << 2]); \
VLAPIC_CTR1((vlapic), msg " irr4 0x%08x", irrptr[4 << 2]); \
VLAPIC_CTR1((vlapic), msg " irr5 0x%08x", irrptr[5 << 2]); \
VLAPIC_CTR1((vlapic), msg " irr6 0x%08x", irrptr[6 << 2]); \
VLAPIC_CTR1((vlapic), msg " irr7 0x%08x", irrptr[7 << 2]); \
} while (0)
#define VLAPIC_CTR_ISR(vlapic, msg) \
do { \
uint32_t *isrptr = &(vlapic)->apic.isr0; \
isrptr[0] = isrptr[0]; /* silence compiler */ \
VLAPIC_CTR1((vlapic), msg " isr0 0x%08x", isrptr[0 << 2]); \
VLAPIC_CTR1((vlapic), msg " isr1 0x%08x", isrptr[1 << 2]); \
VLAPIC_CTR1((vlapic), msg " isr2 0x%08x", isrptr[2 << 2]); \
VLAPIC_CTR1((vlapic), msg " isr3 0x%08x", isrptr[3 << 2]); \
VLAPIC_CTR1((vlapic), msg " isr4 0x%08x", isrptr[4 << 2]); \
VLAPIC_CTR1((vlapic), msg " isr5 0x%08x", isrptr[5 << 2]); \
VLAPIC_CTR1((vlapic), msg " isr6 0x%08x", isrptr[6 << 2]); \
VLAPIC_CTR1((vlapic), msg " isr7 0x%08x", isrptr[7 << 2]); \
} while (0)
static MALLOC_DEFINE(M_VLAPIC, "vlapic", "vlapic");
#define PRIO(x) ((x) >> 4)	/* priority class = high nibble of a vector */
#define VLAPIC_VERSION (16)
#define VLAPIC_MAXLVT_ENTRIES (5)	/* index of the highest LVT entry */
/* Software model of one vcpu's local APIC. */
struct vlapic {
	struct vm *vm;			/* owning virtual machine */
	int vcpuid;			/* vcpu this APIC belongs to */
	struct io_region *mmio;		/* guest-physical register window */
	struct vdev_ops *ops;		/* vdev hooks for this device */
	struct LAPIC apic;		/* architectural register file */
	int esr_update;			/* pending ESR update flag */
	int divisor;			/* timer divide value from the DCR */
	int ccr_ticks;			/* bookkeeping for the CCR timer */
	/*
	 * The 'isrvec_stk' is a stack of vectors injected by the local apic.
	 * A vector is popped from the stack when the processor does an EOI.
	 * The vector on the top of the stack is used to compute the
	 * Processor Priority in conjunction with the TPR.
	 */
	uint8_t isrvec_stk[ISRVEC_STK_SIZE];
	int isrvec_stk_top;
};
/*
 * Set the mask bit in 'num_lvt' consecutive LVT registers starting at
 * 'lvts'. The stride of 4 uint32_t's steps 16 bytes per register,
 * matching the LAPIC register spacing in struct LAPIC.
 */
static void
vlapic_mask_lvts(uint32_t *lvts, int num_lvt)
{
	int i;
	for (i = 0; i < num_lvt; i++) {
		*lvts |= APIC_LVT_M;
		lvts += 4;
	}
}
#if 0
/* Debug helper (currently compiled out): decode and print one LVT entry. */
static inline void
vlapic_dump_lvt(uint32_t offset, uint32_t *lvt)
{
	printf("Offset %x: lvt %08x (V:%02x DS:%x M:%x)\n", offset,
	    *lvt, *lvt & APIC_LVTT_VECTOR, *lvt & APIC_LVTT_DS,
	    *lvt & APIC_LVTT_M);
}
#endif
/* Return the current count of the virtual LAPIC timer. */
static uint64_t
vlapic_get_ccr(struct vlapic *vlapic)
{

	return (vlapic->apic.ccr_timer);
}
/*
 * Recompute the error-status register.  Error tracking is not implemented,
 * so ESR is simply cleared.
 */
static void
vlapic_update_errors(struct vlapic *vlapic)
{
	struct LAPIC *lapic = &vlapic->apic;
	lapic->esr = 0; // XXX
}
/*
 * Establish the architectural power-on/INIT values of the register file.
 * Called with the register page already zeroed by vlapic_op_reset().
 */
static void
vlapic_init_ipi(struct vlapic *vlapic)
{
	struct LAPIC *lapic = &vlapic->apic;

	/*
	 * Version register: version in bits 7:0, Max LVT entry (number of
	 * LVT entries minus one) in bits 23:16.  The original code used
	 * '<' instead of '<<', which OR'd the boolean 1 into the version
	 * field instead of placing the LVT count in bits 23:16.
	 */
	lapic->version = VLAPIC_VERSION;
	lapic->version |= (VLAPIC_MAXLVT_ENTRIES << MAXLVTSHIFT);

	/* Flat model, all destination bits set, as after reset. */
	lapic->dfr = 0xffffffff;
	lapic->svr = APIC_SVR_VECTOR;

	/* All LVT entries (timer through error) come up masked. */
	vlapic_mask_lvts(&lapic->lvt_timer, VLAPIC_MAXLVT_ENTRIES+1);
}
/*
 * Device reset: clear the whole register page, then re-establish the
 * power-on values (APIC id, arbitration priority, and the defaults
 * installed by vlapic_init_ipi()).
 */
static int
vlapic_op_reset(void* dev)
{
	struct vlapic *vl = dev;
	struct LAPIC *regs = &vl->apic;

	memset(regs, 0, sizeof(*regs));
	regs->id = vl->vcpuid << 24;
	regs->apr = vl->vcpuid;
	vlapic_init_ipi(vl);

	return 0;
}
/* Device init: register the MMIO region, then perform a full reset. */
static int
vlapic_op_init(void* dev)
{
	struct vlapic *vl = dev;

	vdev_register_region(vl->ops, vl, vl->mmio);
	return vlapic_op_reset(dev);
}
/* Device halt: tear down the MMIO registration. */
static int
vlapic_op_halt(void* dev)
{
	struct vlapic *vl = dev;

	vdev_unregister_region(vl, vl->mmio);
	return 0;
}
/*
 * Mark 'vector' as pending in the IRR.  This may run on a different host
 * cpu than the one executing the vcpu, hence the atomic update.  IRR
 * registers are 16 bytes apart, hence the '* 4' index scaling.
 */
void
vlapic_set_intr_ready(struct vlapic *vlapic, int vector)
{
	struct LAPIC *lapic = &vlapic->apic;
	uint32_t *irrptr;
	int idx;

	if (vector < 0 || vector >= 256)
		panic("vlapic_set_intr_ready: invalid vector %d\n", vector);

	idx = (vector / 32) * 4;
	irrptr = &lapic->irr0;
	atomic_set_int(&irrptr[idx], 1 << (vector % 32));
	VLAPIC_CTR_IRR(vlapic, "vlapic_set_intr_ready");
}
/* Timer input clock: the guest timer is driven off the host TSC rate. */
#define VLAPIC_BUS_FREQ	tsc_freq
/*
 * Decode the divide value from the DCR: bits 0,1 and 3 of the register
 * concatenate into bits 0..2 of the result.  Fully parenthesized so the
 * macro composes safely inside larger expressions (the original expansion
 * had no outer parentheses and did not parenthesize 'x').
 */
#define VLAPIC_DCR(x)	((((x)->dcr_timer & 0x8) >> 1) | ((x)->dcr_timer & 0x3))
/*
 * Translate the divide configuration encoded in the DCR (bits 0,1,3)
 * into the actual divisor applied to the timer's input clock.
 */
static int
vlapic_timer_divisor(uint32_t dcr)
{
	switch (dcr & 0xB) {
	case APIC_TDCR_2:
		return (2);
	case APIC_TDCR_4:
		return (4);
	case APIC_TDCR_8:
		return (8);
	case APIC_TDCR_16:
		return (16);
	case APIC_TDCR_32:
		return (32);
	case APIC_TDCR_64:
		return (64);
	case APIC_TDCR_128:
		return (128);
	default:
		panic("vlapic_timer_divisor: invalid dcr 0x%08x", dcr);
	}
}
/*
 * (Re)arm the timer: load CCR with ICR minus the portion of the period
 * that has already 'elapsed', and record the host tick count so that
 * vlapic_timer_tick() can compute deltas from it.
 */
static void
vlapic_start_timer(struct vlapic *vlapic, uint32_t elapsed)
{
	uint32_t icr_timer;

	icr_timer = vlapic->apic.icr_timer;

	vlapic->ccr_ticks = ticks;
	if (elapsed < icr_timer)
		vlapic->apic.ccr_timer = icr_timer - elapsed;
	else {
		/*
		 * This can happen when the guest is trying to run its local
		 * apic timer higher that the setting of 'hz' in the host.
		 *
		 * We deal with this by running the guest local apic timer
		 * at the rate of the host's 'hz' setting.
		 */
		vlapic->apic.ccr_timer = 0;
	}
}
/*
 * Return a pointer to the LVT register selected by 'offset'.  LVT
 * registers are 16 bytes apart, starting at lvt_timer; '>> 2' converts
 * the byte offset into a uint32_t index.  Panics on an out-of-range
 * offset.  (Also drops a stray double semicolon from the return.)
 */
static __inline uint32_t *
vlapic_get_lvt(struct vlapic *vlapic, uint32_t offset)
{
	struct LAPIC *lapic = &vlapic->apic;
	int i;

	if (offset < APIC_OFFSET_TIMER_LVT || offset > APIC_OFFSET_ERROR_LVT) {
		panic("vlapic_get_lvt: invalid LVT\n");
	}
	i = (offset - APIC_OFFSET_TIMER_LVT) >> 2;
	return ((&lapic->lvt_timer) + i);
}
#if 1
/*
 * Debug helper: print the ISR registers and the stack of in-service
 * vectors so invariant violations can be diagnosed before panicking.
 */
static void
dump_isrvec_stk(struct vlapic *vlapic)
{
	int i;
	uint32_t *isrptr;

	isrptr = &vlapic->apic.isr0;
	for (i = 0; i < 8; i++)
		printf("ISR%d 0x%08x\n", i, isrptr[i * 4]);

	for (i = 0; i <= vlapic->isrvec_stk_top; i++)
		printf("isrvec_stk[%d] = %d\n", i, vlapic->isrvec_stk[i]);
}
#endif
/*
 * Algorithm adopted from section "Interrupt, Task and Processor Priority"
 * in Intel Architecture Manual Vol 3a.
 *
 * PPR = max(TPR, priority class of the highest in-service vector).
 */
static void
vlapic_update_ppr(struct vlapic *vlapic)
{
	int isrvec, tpr, ppr;

	/*
	 * Note that the value on the stack at index 0 is always 0.
	 *
	 * This is a placeholder for the value of ISRV when none of the
	 * bits is set in the ISRx registers.
	 */
	isrvec = vlapic->isrvec_stk[vlapic->isrvec_stk_top];
	tpr = vlapic->apic.tpr;

#if 1
	/* Sanity checks: validate the isrvec stack against the ISRx regs. */
	{
		int i, lastprio, curprio, vector, idx;
		uint32_t *isrptr;

		if (vlapic->isrvec_stk_top == 0 && isrvec != 0)
			panic("isrvec_stk is corrupted: %d", isrvec);

		/*
		 * Make sure that the priority of the nested interrupts is
		 * always increasing.
		 */
		lastprio = -1;
		for (i = 1; i <= vlapic->isrvec_stk_top; i++) {
			curprio = PRIO(vlapic->isrvec_stk[i]);
			if (curprio <= lastprio) {
				dump_isrvec_stk(vlapic);
				panic("isrvec_stk does not satisfy invariant");
			}
			lastprio = curprio;
		}

		/*
		 * Make sure that each bit set in the ISRx registers has a
		 * corresponding entry on the isrvec stack.
		 */
		i = 1;
		isrptr = &vlapic->apic.isr0;
		for (vector = 0; vector < 256; vector++) {
			idx = (vector / 32) * 4;
			if (isrptr[idx] & (1 << (vector % 32))) {
				if (i > vlapic->isrvec_stk_top ||
				    vlapic->isrvec_stk[i] != vector) {
					dump_isrvec_stk(vlapic);
					panic("ISR and isrvec_stk out of sync");
				}
				i++;
			}
		}
	}
#endif

	if (PRIO(tpr) >= PRIO(isrvec))
		ppr = tpr;
	else
		ppr = isrvec & 0xf0;

	vlapic->apic.ppr = ppr;
	VLAPIC_CTR1(vlapic, "vlapic_update_ppr 0x%02x", ppr);
}
/*
 * Handle a write to the EOI register: clear the highest in-service
 * vector's ISR bit, pop it off the isrvec stack and recompute the PPR.
 */
static void
vlapic_process_eoi(struct vlapic *vlapic)
{
	struct LAPIC *lapic = &vlapic->apic;
	uint32_t *isrptr;
	int i, idx, bitpos;

	isrptr = &lapic->isr0;

	/*
	 * The x86 architecture reserves the first 32 vectors for use
	 * by the processor, so scanning stops before isr0.
	 */
	for (i = 7; i > 0; i--) {
		idx = i * 4;
		bitpos = fls(isrptr[idx]);
		if (bitpos != 0) {
			if (vlapic->isrvec_stk_top <= 0) {
				panic("invalid vlapic isrvec_stk_top %d",
				      vlapic->isrvec_stk_top);
			}
			isrptr[idx] &= ~(1 << (bitpos - 1));
			VLAPIC_CTR_ISR(vlapic, "vlapic_process_eoi");
			vlapic->isrvec_stk_top--;
			vlapic_update_ppr(vlapic);
			return;
		}
	}
}
/*
 * Return the bits of the LVT register selected by 'mask' (non-zero means
 * the field is set).
 */
static __inline int
vlapic_get_lvt_field(uint32_t *lvt, uint32_t mask)
{
	uint32_t val;

	val = *lvt;
	return (val & mask);
}
/* True if the timer LVT is configured for periodic mode. */
static __inline int
vlapic_periodic_timer(struct vlapic *vlapic)
{
	uint32_t *timer_lvt;

	timer_lvt = vlapic_get_lvt(vlapic, APIC_OFFSET_TIMER_LVT);
	return (vlapic_get_lvt_field(timer_lvt, APIC_LVTT_TM_PERIODIC));
}
/* Deliver the timer interrupt unless the timer LVT is masked. */
static void
vlapic_fire_timer(struct vlapic *vlapic)
{
	uint32_t *timer_lvt;
	int vec;

	timer_lvt = vlapic_get_lvt(vlapic, APIC_OFFSET_TIMER_LVT);
	if (vlapic_get_lvt_field(timer_lvt, APIC_LVTT_M))
		return;
	vec = vlapic_get_lvt_field(timer_lvt, APIC_LVTT_VECTOR);
	vlapic_set_intr_ready(vlapic, vec);
}
/*
 * Handle a write to the low half of the ICR, i.e. an IPI request.
 * Fixed and NMI IPIs are delivered entirely in the kernel and return 0.
 * All other delivery modes (e.g. INIT/STARTUP during AP bringup) return 1
 * so the exit is bounced to userland for emulation.
 */
static int
lapic_process_icr(struct vlapic *vlapic, uint64_t icrval)
{
	int i;
	cpumask_t dmask, thiscpumask;
	uint32_t dest, vec, mode;

	thiscpumask = vcpu_mask(vlapic->vcpuid);

	dmask = 0;
	dest = icrval >> 32;
	vec = icrval & APIC_VECTOR_MASK;
	mode = icrval & APIC_DELMODE_MASK;

	if (mode == APIC_DELMODE_FIXED || mode == APIC_DELMODE_NMI) {
		/* Resolve the destination shorthand into a cpu mask. */
		switch (icrval & APIC_DEST_MASK) {
		case APIC_DEST_DESTFLD:
			dmask = vcpu_mask(dest);
			break;
		case APIC_DEST_SELF:
			dmask = thiscpumask;
			break;
		case APIC_DEST_ALLISELF:
			dmask = vm_active_cpus(vlapic->vm);
			break;
		case APIC_DEST_ALLESELF:
			dmask = vm_active_cpus(vlapic->vm) & ~thiscpumask;
			break;
		}

		for (i = 0; i < VM_MAXCPU; i++) {
			if (dmask & vcpu_mask(i)) {
				if (mode == APIC_DELMODE_FIXED)
					lapic_set_intr(vlapic->vm, i, vec);
				else
					vm_inject_nmi(vlapic->vm, i);
			}
		}

		return (0);	/* handled completely in the kernel */
	}

	/*
	 * XXX this assumes that the startup IPI always succeeds
	 */
	if (mode == APIC_DELMODE_STARTUP)
		vm_activate_cpu(vlapic->vm, dest);

	/*
	 * This will cause a return to userland.
	 */
	return (1);
}
/*
 * Return the highest-priority pending vector in the IRR that exceeds the
 * current PPR, or -1 if there is nothing deliverable right now.
 */
int
vlapic_pending_intr(struct vlapic *vlapic)
{
	struct LAPIC *lapic = &vlapic->apic;
	int idx, i, bitpos, vector;
	uint32_t *irrptr, val;

	irrptr = &lapic->irr0;

	/*
	 * The x86 architecture reserves the first 32 vectors for use
	 * by the processor, so scanning stops before irr0.
	 */
	for (i = 7; i > 0; i--) {
		idx = i * 4;
		val = atomic_load_acq_int(&irrptr[idx]);
		bitpos = fls(val);
		if (bitpos != 0) {
			vector = i * 32 + (bitpos - 1);
			if (PRIO(vector) > PRIO(lapic->ppr)) {
				VLAPIC_CTR1(vlapic, "pending intr %d", vector);
				return (vector);
			} else 
				break;
		}
	}
	VLAPIC_CTR0(vlapic, "no pending intr");
	return (-1);
}
/*
 * Called when the processor accepts the interrupt 'vector': move it from
 * pending (IRR) to in-service (ISR), push it on the isrvec stack and
 * recompute the PPR.
 */
void
vlapic_intr_accepted(struct vlapic *vlapic, int vector)
{
	struct LAPIC *lapic = &vlapic->apic;
	uint32_t *irrptr, *isrptr;
	int idx, stk_top;

	/*
	 * clear the ready bit for vector being accepted in irr 
	 * and set the vector as in service in isr.
	 */
	idx = (vector / 32) * 4;

	irrptr = &lapic->irr0;
	atomic_clear_int(&irrptr[idx], 1 << (vector % 32));
	VLAPIC_CTR_IRR(vlapic, "vlapic_intr_accepted");

	isrptr = &lapic->isr0;
	isrptr[idx] |= 1 << (vector % 32);
	VLAPIC_CTR_ISR(vlapic, "vlapic_intr_accepted");

	/*
	 * Update the PPR
	 */
	vlapic->isrvec_stk_top++;

	stk_top = vlapic->isrvec_stk_top;
	if (stk_top >= ISRVEC_STK_SIZE)
		panic("isrvec_stk_top overflow %d", stk_top);

	vlapic->isrvec_stk[stk_top] = vector;
	vlapic_update_ppr(vlapic);
}
/*
 * Handle a read from the virtual APIC page.  'gpa' selects a register by
 * its offset within the 4K page; reads of unimplemented or write-only
 * registers return 0.
 *
 * Fix: the offset is the low 12 bits of the address, i.e.
 * 'gpa & (PAGE_SIZE - 1)'.  The previous expression 'gpa & ~(PAGE_SIZE)'
 * cleared only bit 12, which is a no-op for an already page-relative
 * address but wrong for an absolute one.
 */
int
vlapic_op_mem_read(void* dev, uint64_t gpa, opsize_t size, uint64_t *data)
{
	struct vlapic *vlapic = (struct vlapic*)dev;
	struct LAPIC *lapic = &vlapic->apic;
	uint64_t offset = gpa & (PAGE_SIZE - 1);
	uint32_t *reg;
	int i;

	if (offset > sizeof(*lapic)) {
		*data = 0;
		return 0;
	}
	
	/* Registers are naturally 4-byte aligned. */
	offset &= ~3;
	switch(offset)
	{
		case APIC_OFFSET_ID:
			*data = lapic->id;
			break;
		case APIC_OFFSET_VER:
			*data = lapic->version;
			break;
		case APIC_OFFSET_TPR:
			*data = lapic->tpr;
			break;
		case APIC_OFFSET_APR:
			*data = lapic->apr;
			break;
		case APIC_OFFSET_PPR:
			*data = lapic->ppr;
			break;
		case APIC_OFFSET_EOI:
			*data = lapic->eoi;
			break;
		case APIC_OFFSET_LDR:
			*data = lapic->ldr;
			break;
		case APIC_OFFSET_DFR:
			*data = lapic->dfr;
			break;
		case APIC_OFFSET_SVR:
			*data = lapic->svr;
			break;
		case APIC_OFFSET_ISR0 ... APIC_OFFSET_ISR7:
			/* 16-byte register stride; '>> 2' gives uint32_t index. */
			i = (offset - APIC_OFFSET_ISR0) >> 2;
			reg = &lapic->isr0;
			*data = *(reg + i);
			break;
		case APIC_OFFSET_TMR0 ... APIC_OFFSET_TMR7:
			i = (offset - APIC_OFFSET_TMR0) >> 2;
			reg = &lapic->tmr0;
			*data = *(reg + i);
			break;
		case APIC_OFFSET_IRR0 ... APIC_OFFSET_IRR7:
			i = (offset - APIC_OFFSET_IRR0) >> 2;
			reg = &lapic->irr0;
			/* IRR is updated atomically from other host cpus. */
			*data = atomic_load_acq_int(reg + i);
			break;
		case APIC_OFFSET_ESR:
			*data = lapic->esr;
			break;
		case APIC_OFFSET_ICR_LOW: 
			*data = lapic->icr_lo;
			break;
		case APIC_OFFSET_ICR_HI: 
			*data = lapic->icr_hi;
			break;
		case APIC_OFFSET_TIMER_LVT ... APIC_OFFSET_ERROR_LVT:
			reg = vlapic_get_lvt(vlapic, offset);	
			*data = *(reg);
			break;
		case APIC_OFFSET_ICR:
			*data = lapic->icr_timer;
			break;
		case APIC_OFFSET_CCR:
			*data = vlapic_get_ccr(vlapic);
			break;
		case APIC_OFFSET_DCR:
			*data = lapic->dcr_timer;
			break;
		case APIC_OFFSET_RRR:
		default:
			*data = 0;
			break;
	}
	return 0;
}
/*
 * Handle a write to the virtual APIC page.  Writes to read-only or
 * unimplemented registers are silently dropped.  Returns non-zero when
 * the access must be completed in userland (see lapic_process_icr()).
 *
 * Fix: compute the in-page offset with 'gpa & (PAGE_SIZE - 1)' instead of
 * 'gpa & ~(PAGE_SIZE)', which cleared only bit 12 (see
 * vlapic_op_mem_read()).
 */
int
vlapic_op_mem_write(void* dev, uint64_t gpa, opsize_t size, uint64_t data)
{
	struct vlapic *vlapic = (struct vlapic*)dev;
	struct LAPIC *lapic = &vlapic->apic;
	uint64_t offset = gpa & (PAGE_SIZE - 1);
	uint32_t *reg;
	int retval;

	if (offset > sizeof(*lapic)) {
		return 0;
	}

	retval = 0;
	/* Registers are naturally 4-byte aligned. */
	offset &= ~3;
	switch(offset)
	{
		case APIC_OFFSET_ID:
			lapic->id = data;
			break;
		case APIC_OFFSET_TPR:
			lapic->tpr = data & 0xff;
			vlapic_update_ppr(vlapic);
			break;
		case APIC_OFFSET_EOI:
			vlapic_process_eoi(vlapic);
			break;
		case APIC_OFFSET_LDR:
			break;
		case APIC_OFFSET_DFR:
			break;
		case APIC_OFFSET_SVR:
			lapic->svr = data;
			break;
		case APIC_OFFSET_ICR_LOW: 
			retval = lapic_process_icr(vlapic, data);
			break;
		case APIC_OFFSET_ICR_HI:
			break;
		case APIC_OFFSET_TIMER_LVT ... APIC_OFFSET_ERROR_LVT:
			reg = vlapic_get_lvt(vlapic, offset);	
			/* While the APIC is software-disabled LVTs stay masked. */
			if (!(lapic->svr & APIC_SVR_ENABLE)) {
				data |= APIC_LVT_M;
			}
			*reg = data;
			// vlapic_dump_lvt(offset, reg);
			break;
		case APIC_OFFSET_ICR:
			lapic->icr_timer = data;
			vlapic_start_timer(vlapic, 0);
			break;
		case APIC_OFFSET_DCR:
			lapic->dcr_timer = data;
			vlapic->divisor = vlapic_timer_divisor(data);
			break;
		case APIC_OFFSET_ESR:
			vlapic_update_errors(vlapic);
			break;
		case APIC_OFFSET_VER:
		case APIC_OFFSET_APR:
		case APIC_OFFSET_PPR:
		case APIC_OFFSET_RRR:
		case APIC_OFFSET_ISR0 ... APIC_OFFSET_ISR7:
		case APIC_OFFSET_TMR0 ... APIC_OFFSET_TMR7:
		case APIC_OFFSET_IRR0 ... APIC_OFFSET_IRR7:
		case APIC_OFFSET_CCR:
		default:
			// Read only.
			break;
	}

	return (retval);
}
/*
 * Called on every host clock tick to emulate the LAPIC timer countdown.
 * Converts elapsed host ticks into CCR decrements using the bus frequency
 * and the divide configuration, firing (and, in periodic mode, re-arming)
 * the timer as the count crosses zero.
 */
void
vlapic_timer_tick(struct vlapic *vlapic)
{
	int curticks, delta, periodic;
	uint32_t ccr;
	uint32_t decrement, remainder;

	curticks = ticks;

	/* Common case */
	delta = curticks - vlapic->ccr_ticks;
	if (delta == 0)
		return;

	/* Local APIC timer is disabled */
	if (vlapic->apic.icr_timer == 0)
		return;

	/* One-shot mode and timer has already counted down to zero */
	periodic = vlapic_periodic_timer(vlapic);
	if (!periodic && vlapic->apic.ccr_timer == 0)
		return;
	/*
	 * The 'curticks' and 'ccr_ticks' are out of sync by more than
	 * 2^31 ticks. We deal with this by restarting the timer.
	 */
	if (delta < 0) {
		vlapic_start_timer(vlapic, 0);
		return;
	}

	ccr = vlapic->apic.ccr_timer;
	decrement = (VLAPIC_BUS_FREQ / vlapic->divisor) / hz;
	while (delta-- > 0) {
		if (ccr <= decrement) {
			remainder = decrement - ccr;
			vlapic_fire_timer(vlapic);
			if (periodic) {
				vlapic_start_timer(vlapic, remainder);
				ccr = vlapic->apic.ccr_timer;
			} else {
				/*
				 * One-shot timer has counted down to zero.
				 */
				ccr = 0;
				break;
			}
		} else
			ccr -= decrement;
	}

	vlapic->ccr_ticks = curticks;
	vlapic->apic.ccr_timer = ccr;
}
/* Virtual-device hooks for the local APIC. */
struct vdev_ops vlapic_dev_ops = {
	.name = "vlapic",
	.init = vlapic_op_init,
	.reset = vlapic_op_reset,
	.halt = vlapic_op_halt,
	.memread = vlapic_op_mem_read,
	.memwrite = vlapic_op_mem_write,
};
/* One MMIO region per vcpu; each maps the APIC page at DEFAULT_APIC_BASE. */
static struct io_region vlapic_mmio[VM_MAXCPU];
/*
 * Allocate and register the virtual local APIC for 'vcpuid'.  Sets up the
 * per-vcpu MMIO region covering the APIC page at DEFAULT_APIC_BASE and
 * performs the initial device reset via vlapic_op_init().
 */
struct vlapic *
vlapic_init(struct vm *vm, int vcpuid)
{
	struct vlapic 		*vlapic;

	vlapic = malloc(sizeof(struct vlapic), M_VLAPIC, M_WAITOK | M_ZERO);
	vlapic->vm = vm;
	vlapic->vcpuid = vcpuid;

	vlapic->ops = &vlapic_dev_ops;

	vlapic->mmio = vlapic_mmio + vcpuid;
	vlapic->mmio->base = DEFAULT_APIC_BASE;
	vlapic->mmio->len = PAGE_SIZE;
	vlapic->mmio->attr = MMIO_READ|MMIO_WRITE;
	vlapic->mmio->vcpu = vcpuid;

	vdev_register(&vlapic_dev_ops, vlapic);

	vlapic_op_init(vlapic);

	return (vlapic);
}
/* Unregister the vlapic from the vdev framework and free it. */
void
vlapic_cleanup(struct vlapic *vlapic)
{
	vdev_unregister(vlapic);
	free(vlapic, M_VLAPIC);
}

105
sys/amd64/vmm/io/vlapic.h Normal file
View File

@ -0,0 +1,105 @@
/*-
* Copyright (c) 2011 NetApp, Inc.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* $FreeBSD$
*/
#ifndef _VLAPIC_H_
#define _VLAPIC_H_
#include "vdev.h"
struct vm;

/*
 * Map of APIC Registers:   Offset   Description                   Access
 * (32-bit registers spaced 16 bytes apart within the 4K APIC page)
 */
#define APIC_OFFSET_ID		0x20	// Local APIC ID		R/W
#define APIC_OFFSET_VER		0x30	// Local APIC Version		R
#define APIC_OFFSET_TPR		0x80	// Task Priority Register	R/W
#define APIC_OFFSET_APR		0x90	// Arbitration Priority Register R
#define APIC_OFFSET_PPR		0xA0	// Processor Priority Register	R
#define APIC_OFFSET_EOI		0xB0	// EOI Register			W
#define APIC_OFFSET_RRR		0xC0	// Remote read			R
#define APIC_OFFSET_LDR		0xD0	// Logical Destination		R/W
#define APIC_OFFSET_DFR		0xE0	// Destination Format Register	0..27 R; 28..31 R/W
#define APIC_OFFSET_SVR		0xF0	// Spurious Interrupt Vector Reg. 0..3 R; 4..9 R/W
#define APIC_OFFSET_ISR0	0x100	// ISR  000-031			R
#define APIC_OFFSET_ISR1	0x110	// ISR  032-063			R
#define APIC_OFFSET_ISR2	0x120	// ISR  064-095			R
#define APIC_OFFSET_ISR3	0x130	// ISR  096-127			R
#define APIC_OFFSET_ISR4	0x140	// ISR  128-159			R
#define APIC_OFFSET_ISR5	0x150	// ISR  160-191			R
#define APIC_OFFSET_ISR6	0x160	// ISR  192-223			R
#define APIC_OFFSET_ISR7	0x170	// ISR  224-255			R
#define APIC_OFFSET_TMR0	0x180	// TMR  000-031			R
#define APIC_OFFSET_TMR1	0x190	// TMR  032-063			R
#define APIC_OFFSET_TMR2	0x1A0	// TMR  064-095			R
#define APIC_OFFSET_TMR3	0x1B0	// TMR  096-127			R
#define APIC_OFFSET_TMR4	0x1C0	// TMR  128-159			R
#define APIC_OFFSET_TMR5	0x1D0	// TMR  160-191			R
#define APIC_OFFSET_TMR6	0x1E0	// TMR  192-223			R
#define APIC_OFFSET_TMR7	0x1F0	// TMR  224-255			R
#define APIC_OFFSET_IRR0	0x200	// IRR  000-031			R
#define APIC_OFFSET_IRR1	0x210	// IRR  032-063			R
#define APIC_OFFSET_IRR2	0x220	// IRR  064-095			R
#define APIC_OFFSET_IRR3	0x230	// IRR  096-127			R
#define APIC_OFFSET_IRR4	0x240	// IRR  128-159			R
#define APIC_OFFSET_IRR5	0x250	// IRR  160-191			R
#define APIC_OFFSET_IRR6	0x260	// IRR  192-223			R
#define APIC_OFFSET_IRR7	0x270	// IRR  224-255			R
#define APIC_OFFSET_ESR		0x280	// Error Status Register	R
#define APIC_OFFSET_ICR_LOW	0x300	// Interrupt Command Reg. (0-31)	R/W
#define APIC_OFFSET_ICR_HI	0x310	// Interrupt Command Reg. (32-63)	R/W
#define APIC_OFFSET_TIMER_LVT	0x320	// Local Vector Table (Timer)	R/W
#define APIC_OFFSET_THERM_LVT	0x330	// Local Vector Table (Thermal)	R/W (PIV+)
#define APIC_OFFSET_PERF_LVT	0x340	// Local Vector Table (Performance) R/W (P6+)
#define APIC_OFFSET_LINT0_LVT	0x350	// Local Vector Table (LINT0)	R/W
#define APIC_OFFSET_LINT1_LVT	0x360	// Local Vector Table (LINT1)	R/W
#define APIC_OFFSET_ERROR_LVT	0x370	// Local Vector Table (ERROR)	R/W
#define APIC_OFFSET_ICR		0x380	// Initial Count Reg. for Timer	R/W
#define APIC_OFFSET_CCR		0x390	// Current Count of Timer	R
#define APIC_OFFSET_DCR		0x3E0	// Timer Divide Configuration Reg. R/W

/*
 * 16 priority levels with at most one vector injected per level.
 */
#define	ISRVEC_STK_SIZE		(16 + 1)

struct vlapic *vlapic_init(struct vm *vm, int vcpuid);
void vlapic_cleanup(struct vlapic *vlapic);

int vlapic_op_mem_write(void* dev, uint64_t gpa,
			opsize_t size, uint64_t data);

int vlapic_op_mem_read(void* dev, uint64_t gpa,
		       opsize_t size, uint64_t *data);

int vlapic_pending_intr(struct vlapic *vlapic);
void vlapic_intr_accepted(struct vlapic *vlapic, int vector);
void vlapic_set_intr_ready(struct vlapic *vlapic, int vector);
void vlapic_timer_tick(struct vlapic *vlapic);

#endif	/* _VLAPIC_H_ */

737
sys/amd64/vmm/vmm.c Normal file
View File

@ -0,0 +1,737 @@
/*-
* Copyright (c) 2011 NetApp, Inc.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* $FreeBSD$
*/
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
#include <sys/param.h>
#include <sys/kernel.h>
#include <sys/module.h>
#include <sys/sysctl.h>
#include <sys/malloc.h>
#include <sys/pcpu.h>
#include <sys/lock.h>
#include <sys/mutex.h>
#include <sys/proc.h>
#include <sys/sched.h>
#include <sys/smp.h>
#include <sys/systm.h>
#include <vm/vm.h>
#include <machine/vm.h>
#include <machine/pcb.h>
#include <machine/apicreg.h>
#include <machine/vmm.h>
#include "vmm_mem.h"
#include "vmm_util.h"
#include <machine/vmm_dev.h>
#include "vlapic.h"
#include "vmm_msr.h"
#include "vmm_ipi.h"
#include "vmm_stat.h"
#include "io/ppt.h"
#include "io/iommu.h"
struct vlapic;

/*
 * Per-vcpu software state.
 */
struct vcpu {
	int		flags;		/* VCPU_F_* below */
	int		pincpu;		/* host cpuid this vcpu is bound to */
	int		hostcpu;	/* host cpuid this vcpu last ran on */
	uint64_t	guest_msrs[VMM_MSR_NUM];	/* saved guest MSR values */
	struct vlapic	*vlapic;	/* virtual local APIC */
	int		 vcpuid;
	struct savefpu	savefpu;	/* guest fpu state */
	void		*stats;		/* vmm_stat buffer */
};
#define	VCPU_F_PINNED	0x0001		/* bound to a specific host cpu */
#define	VCPU_F_RUNNING	0x0002		/* currently executing guest code */

/* Host cpu this vcpu is pinned to, or -1 if unpinned. */
#define	VCPU_PINCPU(vm, vcpuid)	\
    ((vm->vcpu[vcpuid].flags & VCPU_F_PINNED) ? vm->vcpu[vcpuid].pincpu : -1)

#define	VCPU_UNPIN(vm, vcpuid)	(vm->vcpu[vcpuid].flags &= ~VCPU_F_PINNED)

#define	VCPU_PIN(vm, vcpuid, host_cpuid)				\
do {									\
	vm->vcpu[vcpuid].flags |= VCPU_F_PINNED;			\
	vm->vcpu[vcpuid].pincpu = host_cpuid;				\
} while(0)
#define	VM_MAX_MEMORY_SEGMENTS	2

/*
 * Top-level state of one virtual machine.
 */
struct vm {
	void		*cookie;	/* processor-specific data */
	void		*iommu;		/* iommu-specific data */
	struct vcpu	vcpu[VM_MAXCPU];
	int		num_mem_segs;	/* valid entries in mem_segs[] */
	struct vm_memory_segment mem_segs[VM_MAX_MEMORY_SEGMENTS];
	char		name[VM_MAX_NAMELEN];

	/*
	 * Mask of active vcpus.
	 * An active vcpu is one that has been started implicitly (BSP) or
	 * explicitly (AP) by sending it a startup ipi.
	 */
	cpumask_t	active_cpus;
};
/*
 * Hardware-specific backend (Intel VT-x or AMD SVM), selected at module
 * load time in vmm_init().  The dispatch macros below degrade to ENXIO
 * (or a NULL/0 result) if no backend was selected.
 */
static struct vmm_ops *ops;
#define	VMM_INIT()	(ops != NULL ? (*ops->init)() : 0)
#define	VMM_CLEANUP()	(ops != NULL ? (*ops->cleanup)() : 0)

#define	VMINIT(vm)	(ops != NULL ? (*ops->vminit)(vm): NULL)
#define	VMRUN(vmi, vcpu, rip, vmexit) \
	(ops != NULL ? (*ops->vmrun)(vmi, vcpu, rip, vmexit) : ENXIO)
#define	VMCLEANUP(vmi)	(ops != NULL ? (*ops->vmcleanup)(vmi) : NULL)
#define	VMMMAP(vmi, gpa, hpa, len, attr, prot, spm)	\
    (ops != NULL ? (*ops->vmmmap)(vmi, gpa, hpa, len, attr, prot, spm) : ENXIO)
#define	VMGETREG(vmi, vcpu, num, retval)		\
	(ops != NULL ? (*ops->vmgetreg)(vmi, vcpu, num, retval) : ENXIO)
#define	VMSETREG(vmi, vcpu, num, val)		\
	(ops != NULL ? (*ops->vmsetreg)(vmi, vcpu, num, val) : ENXIO)
#define	VMGETDESC(vmi, vcpu, num, desc)		\
	(ops != NULL ? (*ops->vmgetdesc)(vmi, vcpu, num, desc) : ENXIO)
#define	VMSETDESC(vmi, vcpu, num, desc)		\
	(ops != NULL ? (*ops->vmsetdesc)(vmi, vcpu, num, desc) : ENXIO)
#define	VMINJECT(vmi, vcpu, type, vec, ec, ecv)	\
	(ops != NULL ? (*ops->vminject)(vmi, vcpu, type, vec, ec, ecv) : ENXIO)
#define	VMNMI(vmi, vcpu)	\
	(ops != NULL ? (*ops->vmnmi)(vmi, vcpu) : ENXIO)
#define	VMGETCAP(vmi, vcpu, num, retval)	\
	(ops != NULL ? (*ops->vmgetcap)(vmi, vcpu, num, retval) : ENXIO)
#define	VMSETCAP(vmi, vcpu, num, val)		\
	(ops != NULL ? (*ops->vmsetcap)(vmi, vcpu, num, val) : ENXIO)

/* Raw FPU save/restore and CR0.TS manipulation used around guest entry. */
#define	fxrstor(addr)		__asm("fxrstor %0" : : "m" (*(addr)))
#define	fxsave(addr)		__asm __volatile("fxsave %0" : "=m" (*(addr)))
#define	fpu_start_emulating()	__asm("smsw %%ax; orb %0,%%al; lmsw %%ax" \
				      : : "n" (CR0_TS) : "ax")
#define	fpu_stop_emulating()	__asm("clts")

static MALLOC_DEFINE(M_VM, "vm", "vm");
CTASSERT(VMM_MSR_NUM <= 64);	/* msr_mask can keep track of up to 64 msrs */

/* statistics */
static VMM_STAT_DEFINE(VCPU_TOTAL_RUNTIME, "vcpu total runtime");
/* Release per-vcpu resources: the stats buffer and the virtual APIC. */
static void
vcpu_cleanup(struct vcpu *vcpu)
{

	vmm_stat_free(vcpu->stats);
	vlapic_cleanup(vcpu->vlapic);
}
/*
 * One-time initialization of a vcpu slot: create its vlapic, seed the
 * guest FPU state from the current thread's and allocate a stats buffer.
 */
static void
vcpu_init(struct vm *vm, uint32_t vcpu_id)
{
	struct vcpu *vcpu;
	
	vcpu = &vm->vcpu[vcpu_id];

	vcpu->hostcpu = -1;	/* has not run anywhere yet */
	vcpu->vcpuid = vcpu_id;
	vcpu->vlapic = vlapic_init(vm, vcpu_id);
	fpugetregs(curthread, &vcpu->savefpu);
	vcpu->stats = vmm_stat_alloc();
}
/*
 * Module-load initialization: set up IPI and memory subsystems, select
 * the hardware backend (Intel or AMD) and initialize it.  Returns ENXIO
 * on unsupported cpus.
 */
static int
vmm_init(void)
{
	int error;

	vmm_ipi_init();

	error = vmm_mem_init();
	if (error)
		return (error);
	
	if (vmm_is_intel())
		ops = &vmm_ops_intel;
	else if (vmm_is_amd())
		ops = &vmm_ops_amd;
	else
		return (ENXIO);

	vmm_msr_init();

	return (VMM_INIT());
}
/*
 * Kernel module event handler: wires up the control device, the iommu and
 * the hypervisor backend on load, and tears them down on unload.
 */
static int
vmm_handler(module_t mod, int what, void *arg)
{
	int error;

	switch (what) {
	case MOD_LOAD:
		vmmdev_init();
		iommu_init();
		error = vmm_init();
		break;
	case MOD_UNLOAD:
		vmmdev_cleanup();
		iommu_cleanup();
		vmm_ipi_cleanup();
		error = VMM_CLEANUP();
		break;
	default:
		error = 0;
		break;
	}
	return (error);
}
static moduledata_t vmm_kmod = {
	"vmm",
	vmm_handler,
	NULL
};

/*
 * Execute the module load handler after the pci passthru driver has had
 * a chance to claim devices. We need this information at the time we do
 * iommu initialization.
 */
DECLARE_MODULE(vmm, vmm_kmod, SI_SUB_CONFIGURE + 1, SI_ORDER_ANY);
MODULE_VERSION(vmm, 1);

SYSCTL_NODE(_hw, OID_AUTO, vmm, CTLFLAG_RW, NULL, NULL);
/*
 * Create a new virtual machine named 'name'.  Initializes all vcpu slots,
 * creates the iommu domain and marks the BSP active.  Returns NULL only
 * for a missing or over-long name (allocation uses M_WAITOK; the strcpy
 * is safe because the length was checked).
 */
struct vm *
vm_create(const char *name)
{
	int i;
	struct vm *vm;
	vm_paddr_t maxaddr;

	const int BSP = 0;

	if (name == NULL || strlen(name) >= VM_MAX_NAMELEN)
		return (NULL);

	vm = malloc(sizeof(struct vm), M_VM, M_WAITOK | M_ZERO);
	strcpy(vm->name, name);
	vm->cookie = VMINIT(vm);

	for (i = 0; i < VM_MAXCPU; i++) {
		vcpu_init(vm, i);
		guest_msrs_init(vm, i);
	}

	maxaddr = vmm_mem_maxaddr();
	vm->iommu = iommu_create_domain(maxaddr);
	vm_activate_cpu(vm, BSP);

	return (vm);
}
/*
 * Tear down a virtual machine: release passthru devices, guest memory,
 * per-vcpu state, the iommu domain and the backend cookie, then free it.
 */
void
vm_destroy(struct vm *vm)
{
	int i;

	ppt_unassign_all(vm);

	for (i = 0; i < vm->num_mem_segs; i++)
		vmm_mem_free(vm->mem_segs[i].hpa, vm->mem_segs[i].len);

	for (i = 0; i < VM_MAXCPU; i++)
		vcpu_cleanup(&vm->vcpu[i]);

	iommu_destroy_domain(vm->iommu);

	VMCLEANUP(vm->cookie);

	free(vm, M_VM);
}
/* Return the name the VM was created with. */
const char *
vm_name(struct vm *vm)
{

	return (vm->name);
}
/* Map guest-physical MMIO range [gpa, gpa+len) to host-physical 'hpa'. */
int
vm_map_mmio(struct vm *vm, vm_paddr_t gpa, size_t len, vm_paddr_t hpa)
{
	const boolean_t superpages_ok = TRUE;

	return (VMMMAP(vm->cookie, gpa, hpa, len, VM_MEMATTR_UNCACHEABLE,
		       VM_PROT_RW, superpages_ok));
}
/* Remove the MMIO mapping for guest-physical range [gpa, gpa+len). */
int
vm_unmap_mmio(struct vm *vm, vm_paddr_t gpa, size_t len)
{
	const boolean_t superpages_ok = TRUE;

	return (VMMMAP(vm->cookie, gpa, 0, len, VM_MEMATTR_UNCACHEABLE,
		       VM_PROT_NONE, superpages_ok));
}
/*
 * Allocate and map host memory backing guest-physical address 'gpa' for
 * 'len' bytes.  Idempotent: if the range is already backed, the existing
 * host address is returned.  The new segment is entered into both the
 * backend's page tables and the iommu domain.
 */
int
vm_malloc(struct vm *vm, vm_paddr_t gpa, size_t len, vm_paddr_t *ret_hpa)
{
	int error;
	vm_paddr_t hpa;

	const boolean_t spok = TRUE;	/* superpage mappings are ok */

	/*
	 * find the hpa if it was already vm_malloc'd.
	 */
	hpa = vm_gpa2hpa(vm, gpa, len);
	if (hpa != ((vm_paddr_t)-1))
		goto out;

	if (vm->num_mem_segs >= VM_MAX_MEMORY_SEGMENTS)
		return (E2BIG);

	hpa = vmm_mem_alloc(len);
	if (hpa == 0)
		return (ENOMEM);

	error = VMMMAP(vm->cookie, gpa, hpa, len, VM_MEMATTR_WRITE_BACK,
		       VM_PROT_ALL, spok);
	if (error) {
		/* Undo the allocation; nothing was recorded yet. */
		vmm_mem_free(hpa, len);
		return (error);
	}

	iommu_create_mapping(vm->iommu, gpa, hpa, len);

	vm->mem_segs[vm->num_mem_segs].gpa = gpa;
	vm->mem_segs[vm->num_mem_segs].hpa = hpa;
	vm->mem_segs[vm->num_mem_segs].len = len;
	vm->num_mem_segs++;
out:
	*ret_hpa = hpa;
	return (0);
}
/*
 * Translate guest-physical range [gpa, gpa+len) to a host-physical
 * address.  Returns (vm_paddr_t)-1 unless the whole range falls inside a
 * single memory segment.
 */
vm_paddr_t
vm_gpa2hpa(struct vm *vm, vm_paddr_t gpa, size_t len)
{
	int seg;
	vm_paddr_t base, limit;

	for (seg = 0; seg < vm->num_mem_segs; seg++) {
		base = vm->mem_segs[seg].gpa;
		limit = base + vm->mem_segs[seg].len;
		if (gpa >= base && gpa + len <= limit)
			return (vm->mem_segs[seg].hpa + (gpa - base));
	}
	return ((vm_paddr_t)-1);
}
/*
 * Look up the memory segment whose guest-physical base is exactly
 * 'gpabase' and copy it into '*seg'.  Returns 0 on success, -1 if no
 * segment starts at that address.
 */
int
vm_gpabase2memseg(struct vm *vm, vm_paddr_t gpabase,
		  struct vm_memory_segment *seg)
{
	int idx;

	for (idx = 0; idx < vm->num_mem_segs; idx++) {
		if (vm->mem_segs[idx].gpa != gpabase)
			continue;
		*seg = vm->mem_segs[idx];
		return (0);
	}
	return (-1);
}
/* Fetch guest register 'reg' of 'vcpu' into '*retval'. */
int
vm_get_register(struct vm *vm, int vcpu, int reg, uint64_t *retval)
{

	if (vcpu < 0 || vcpu >= VM_MAXCPU || reg >= VM_REG_LAST)
		return (EINVAL);

	return (VMGETREG(vm->cookie, vcpu, reg, retval));
}
/* Store 'val' into guest register 'reg' of 'vcpu'. */
int
vm_set_register(struct vm *vm, int vcpu, int reg, uint64_t val)
{

	if (vcpu < 0 || vcpu >= VM_MAXCPU || reg >= VM_REG_LAST)
		return (EINVAL);

	return (VMSETREG(vm->cookie, vcpu, reg, val));
}
/* True for the two descriptor-table registers (IDTR, GDTR). */
static boolean_t
is_descriptor_table(int reg)
{

	if (reg == VM_REG_GUEST_IDTR || reg == VM_REG_GUEST_GDTR)
		return (TRUE);
	return (FALSE);
}
/* True for the segment registers plus TR and LDTR. */
static boolean_t
is_segment_register(int reg)
{

	if (reg == VM_REG_GUEST_ES || reg == VM_REG_GUEST_CS ||
	    reg == VM_REG_GUEST_SS || reg == VM_REG_GUEST_DS ||
	    reg == VM_REG_GUEST_FS || reg == VM_REG_GUEST_GS ||
	    reg == VM_REG_GUEST_TR || reg == VM_REG_GUEST_LDTR)
		return (TRUE);
	return (FALSE);
}
/* Fetch the hidden descriptor state of a segment/table register. */
int
vm_get_seg_desc(struct vm *vm, int vcpu, int reg,
		struct seg_desc *desc)
{

	if (vcpu < 0 || vcpu >= VM_MAXCPU)
		return (EINVAL);
	if (!is_segment_register(reg) && !is_descriptor_table(reg))
		return (EINVAL);
	return (VMGETDESC(vm->cookie, vcpu, reg, desc));
}
/* Set the hidden descriptor state of a segment/table register. */
int
vm_set_seg_desc(struct vm *vm, int vcpu, int reg,
		struct seg_desc *desc)
{

	if (vcpu < 0 || vcpu >= VM_MAXCPU)
		return (EINVAL);
	if (!is_segment_register(reg) && !is_descriptor_table(reg))
		return (EINVAL);
	return (VMSETDESC(vm->cookie, vcpu, reg, desc));
}
/* Report the host cpu 'vcpuid' is pinned to (-1 when unpinned). */
int
vm_get_pinning(struct vm *vm, int vcpuid, int *cpuid)
{

	if (vcpuid < 0 || vcpuid >= VM_MAXCPU)
		return (EINVAL);
	*cpuid = VCPU_PINCPU(vm, vcpuid);
	return (0);
}
/*
 * Pin 'vcpuid' to 'host_cpuid' (or unpin when host_cpuid < 0) by binding
 * the current thread's scheduler affinity.
 */
int
vm_set_pinning(struct vm *vm, int vcpuid, int host_cpuid)
{
	struct thread *td;

	if (vcpuid < 0 || vcpuid >= VM_MAXCPU)
		return (EINVAL);

	td = curthread;		/* XXXSMP only safe when muxing vcpus */

	/* unpin */
	if (host_cpuid < 0) {
		VCPU_UNPIN(vm, vcpuid);
		thread_lock(td);
		sched_unbind(td);
		thread_unlock(td);
		return (0);
	}

	if (CPU_ABSENT(host_cpuid))
		return (EINVAL);

	/*
	 * XXX we should check that 'host_cpuid' has not already been pinned
	 * by another vm.
	 */
	thread_lock(td);
	sched_bind(td, host_cpuid);
	thread_unlock(td);
	VCPU_PIN(vm, vcpuid, host_cpuid);

	return (0);
}
/*
 * Load the guest FPU state with interrupts disabled.  CR0.TS is cleared
 * around the fxrstor and set again afterwards so a stray host FPU use
 * traps rather than clobbering guest state.
 */
static void
restore_guest_fpustate(struct vcpu *vcpu)
{
	register_t s;

	s = intr_disable();
	fpu_stop_emulating();
	fxrstor(&vcpu->savefpu);
	fpu_start_emulating();
	intr_restore(s);
}
/*
 * Save the guest FPU state with interrupts disabled; mirror image of
 * restore_guest_fpustate().
 */
static void
save_guest_fpustate(struct vcpu *vcpu)
{
	register_t s;

	s = intr_disable();
	fpu_stop_emulating();
	fxsave(&vcpu->savefpu);
	fpu_start_emulating();
	intr_restore(s);
}
/*
 * Enter the guest on 'vmrun->cpuid' until the next vm-exit.  Runs inside
 * a critical section; swaps MSR and FPU state around the backend's VMRUN
 * and charges the elapsed TSC cycles to the vcpu's runtime statistic.
 * The exit reason is returned in vmrun->vm_exit.
 */
int
vm_run(struct vm *vm, struct vm_run *vmrun)
{
	int error, vcpuid;
	struct vcpu *vcpu;
	struct pcb *pcb;
	uint64_t tscval;

	vcpuid = vmrun->cpuid;

	if (vcpuid < 0 || vcpuid >= VM_MAXCPU)
		return (EINVAL);

	vcpu = &vm->vcpu[vcpuid];

	critical_enter();

	tscval = rdtsc();

	/* Force a full register restore on return to userland. */
	pcb = PCPU_GET(curpcb);
	pcb->pcb_full_iret = 1;

	vcpu->hostcpu = curcpu;

	fpuexit(curthread);
	restore_guest_msrs(vm, vcpuid);	
	restore_guest_fpustate(vcpu);
	error = VMRUN(vm->cookie, vcpuid, vmrun->rip, &vmrun->vm_exit);
	save_guest_fpustate(vcpu);
	restore_host_msrs(vm, vcpuid);

	vmm_stat_incr(vm, vcpuid, VCPU_TOTAL_RUNTIME, rdtsc() - tscval);

	critical_exit();

	return (error);
}
/*
 * Queue an exception/interrupt event for injection into 'vcpuid' on the
 * next guest entry.  'code'/'code_valid' carry an optional error code.
 */
int
vm_inject_event(struct vm *vm, int vcpuid, int type,
		int vector, uint32_t code, int code_valid)
{

	if (vcpuid < 0 || vcpuid >= VM_MAXCPU)
		return (EINVAL);

	if (type <= VM_EVENT_NONE || type >= VM_EVENT_MAX)
		return (EINVAL);

	if (vector < 0 || vector > 255)
		return (EINVAL);

	return (VMINJECT(vm->cookie, vcpuid, type, vector, code, code_valid));
}
/*
 * Post an NMI to 'vcpu' and kick its host cpu so the NMI is delivered
 * promptly even if the vcpu is currently in guest mode.
 */
int
vm_inject_nmi(struct vm *vm, int vcpu)
{
	int error;

	if (vcpu < 0 || vcpu >= VM_MAXCPU)
		return (EINVAL);

	error = VMNMI(vm->cookie, vcpu);
	vm_interrupt_hostcpu(vm, vcpu);
	return (error);
}
/* Query optional capability 'type' of 'vcpu'. */
int
vm_get_capability(struct vm *vm, int vcpu, int type, int *retval)
{

	if (vcpu < 0 || vcpu >= VM_MAXCPU || type < 0 || type >= VM_CAP_MAX)
		return (EINVAL);

	return (VMGETCAP(vm->cookie, vcpu, type, retval));
}
/* Enable/disable optional capability 'type' of 'vcpu'. */
int
vm_set_capability(struct vm *vm, int vcpu, int type, int val)
{

	if (vcpu < 0 || vcpu >= VM_MAXCPU || type < 0 || type >= VM_CAP_MAX)
		return (EINVAL);

	return (VMSETCAP(vm->cookie, vcpu, type, val));
}
/* Return the saved-MSR array for vcpu 'cpu'. */
uint64_t *
vm_guest_msrs(struct vm *vm, int cpu)
{

	return (vm->vcpu[cpu].guest_msrs);
}
/* Return the virtual local APIC of vcpu 'cpu'. */
struct vlapic *
vm_lapic(struct vm *vm, int cpu)
{

	return (vm->vcpu[cpu].vlapic);
}
/*
 * Check whether pci device bus/slot/func is listed in the 'pptdevs'
 * kernel environment variable and should be reserved for passthru.
 * The list is parsed destructively (spaces temporarily NUL'd) and
 * restored afterwards.
 */
boolean_t
vmm_is_pptdev(int bus, int slot, int func)
{
	int found, b, s, f, n;
	char *val, *cp, *cp2;

	/*
	 * setenv pptdevs "1/2/3 4/5/6 7/8/9 10/11/12"
	 */
	found = 0;
	cp = val = getenv("pptdevs");
	/* NOTE(review): getenv() can return NULL; confirm freeenv(NULL)
	 * below is tolerated by the kernel environment code. */
	while (cp != NULL && *cp != '\0') {
		if ((cp2 = strchr(cp, ' ')) != NULL)
			*cp2 = '\0';

		n = sscanf(cp, "%d/%d/%d", &b, &s, &f);
		if (n == 3 && bus == b && slot == s && func == f) {
			found = 1;
			break;
		}
		
		if (cp2 != NULL)
			*cp2++ = ' ';

		cp = cp2;
	}
	freeenv(val);
	return (found);
}
/* Return the iommu domain associated with this VM. */
void *
vm_iommu_domain(struct vm *vm)
{

	return (vm->iommu);
}
/*
 * Transition 'vcpuid' between VCPU_RUNNING and VCPU_STOPPED.  Panics on
 * an invalid vcpuid or on a redundant transition (double start/stop),
 * which would indicate a sequencing bug in the caller.
 */
void
vm_set_run_state(struct vm *vm, int vcpuid, int state)
{
	struct vcpu *vcpu;

	if (vcpuid < 0 || vcpuid >= VM_MAXCPU)
		panic("vm_set_run_state: invalid vcpuid %d", vcpuid);

	vcpu = &vm->vcpu[vcpuid];

	if (state == VCPU_RUNNING) {
		if (vcpu->flags & VCPU_F_RUNNING) {
			panic("vm_set_run_state: %s[%d] is already running",
			      vm_name(vm), vcpuid);
		}
		vcpu->flags |= VCPU_F_RUNNING;
	} else {
		if ((vcpu->flags & VCPU_F_RUNNING) == 0) {
			panic("vm_set_run_state: %s[%d] is already stopped",
			      vm_name(vm), vcpuid);
		}
		vcpu->flags &= ~VCPU_F_RUNNING;
	}
}
int
vm_get_run_state(struct vm *vm, int vcpuid, int *cpuptr)
{
int retval, hostcpu;
struct vcpu *vcpu;
if (vcpuid < 0 || vcpuid >= VM_MAXCPU)
panic("vm_get_run_state: invalid vcpuid %d", vcpuid);
vcpu = &vm->vcpu[vcpuid];
if (vcpu->flags & VCPU_F_RUNNING) {
retval = VCPU_RUNNING;
hostcpu = vcpu->hostcpu;
} else {
retval = VCPU_STOPPED;
hostcpu = -1;
}
if (cpuptr)
*cpuptr = hostcpu;
return (retval);
}
/*
 * Mark 'vcpuid' as active in the vm's cpu mask.
 * Out-of-range vcpu ids are silently ignored.
 */
void
vm_activate_cpu(struct vm *vm, int vcpuid)
{

	if (vcpuid < 0 || vcpuid >= VM_MAXCPU)
		return;

	vm->active_cpus |= vcpu_mask(vcpuid);
}
/* Return the mask of vcpus that have been activated in this vm. */
cpumask_t
vm_active_cpus(struct vm *vm)
{

	return (vm->active_cpus);
}
void *
vcpu_stats(struct vm *vm, int vcpuid)
{
return (vm->vcpu[vcpuid].stats);
}

468
sys/amd64/vmm/vmm_dev.c Normal file
View File

@ -0,0 +1,468 @@
/*-
* Copyright (c) 2011 NetApp, Inc.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* $FreeBSD$
*/
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
#include <sys/param.h>
#include <sys/kernel.h>
#include <sys/queue.h>
#include <sys/lock.h>
#include <sys/mutex.h>
#include <sys/malloc.h>
#include <sys/conf.h>
#include <sys/sysctl.h>
#include <sys/libkern.h>
#include <sys/ioccom.h>
#include <sys/mman.h>
#include <sys/uio.h>
#include <vm/vm.h>
#include <vm/pmap.h>
#include <machine/pmap.h>
#include <machine/vmparam.h>
#include <machine/vmm.h>
#include "vmm_lapic.h"
#include "vmm_stat.h"
#include "io/ppt.h"
#include <machine/vmm_dev.h>
/*
 * Per-VM character device state: one softc per /dev/vmm/<name> node.
 */
struct vmmdev_softc {
	struct vm *vm; /* vm instance cookie */
	struct cdev *cdev;
	SLIST_ENTRY(vmmdev_softc) link;
};
/* List of all softc instances; protected by vmmdev_mtx. */
static SLIST_HEAD(, vmmdev_softc) head;
/* Serializes lookups, creation and destruction of vmm devices. */
static struct mtx vmmdev_mtx;
static MALLOC_DEFINE(M_VMMDEV, "vmmdev", "vmmdev");
/* The hw.vmm sysctl tree is declared elsewhere in the module. */
SYSCTL_DECL(_hw_vmm);
/*
 * Find the softc whose vm has the given name, or NULL if none exists.
 * Must be called with vmmdev_mtx held.
 */
static struct vmmdev_softc *
vmmdev_lookup(const char *name)
{
	struct vmmdev_softc *sc;
#ifdef notyet /* XXX kernel is not compiled with invariants */
	mtx_assert(&vmmdev_mtx, MA_OWNED);
#endif
	SLIST_FOREACH(sc, &head, link) {
		if (strcmp(name, vm_name(sc->vm)) == 0)
			return (sc);
	}

	return (NULL);
}
/*
 * Find the softc that owns the given character device, or NULL.
 * Must be called with vmmdev_mtx held.
 */
static struct vmmdev_softc *
vmmdev_lookup2(struct cdev *cdev)
{
	struct vmmdev_softc *sc;
#ifdef notyet /* XXX kernel is not compiled with invariants */
	mtx_assert(&vmmdev_mtx, MA_OWNED);
#endif
	SLIST_FOREACH(sc, &head, link) {
		if (sc->cdev == cdev)
			return (sc);
	}

	return (NULL);
}
static int
vmmdev_rw(struct cdev *cdev, struct uio *uio, int flags)
{
int error, off, c;
vm_paddr_t hpa, gpa;
struct vmmdev_softc *sc;
static char zerobuf[PAGE_SIZE];
error = 0;
mtx_lock(&vmmdev_mtx);
sc = vmmdev_lookup2(cdev);
while (uio->uio_resid > 0 && error == 0) {
gpa = uio->uio_offset;
off = gpa & PAGE_MASK;
c = min(uio->uio_resid, PAGE_SIZE - off);
/*
* The VM has a hole in its physical memory map. If we want to
* use 'dd' to inspect memory beyond the hole we need to
* provide bogus data for memory that lies in the hole.
*
* Since this device does not support lseek(2), dd(1) will
* read(2) blocks of data to simulate the lseek(2).
*/
hpa = vm_gpa2hpa(sc->vm, gpa, c);
if (hpa == (vm_paddr_t)-1) {
if (uio->uio_rw == UIO_READ)
error = uiomove(zerobuf, c, uio);
else
error = EFAULT;
} else
error = uiomove((void *)PHYS_TO_DMAP(hpa), c, uio);
}
mtx_unlock(&vmmdev_mtx);
return (error);
}
/*
 * ioctl(2) dispatcher for the vmm control device.  All operations run
 * with vmmdev_mtx held, except VM_RUN which drops the mutex for the
 * duration of vm_run() (see below).
 */
static int
vmmdev_ioctl(struct cdev *cdev, u_long cmd, caddr_t data, int fflag,
	     struct thread *td)
{
	int error, vcpu;
	struct vmmdev_softc *sc;
	struct vm_memory_segment *seg;
	struct vm_register *vmreg;
	struct vm_seg_desc* vmsegdesc;
	struct vm_pin *vmpin;
	struct vm_run *vmrun;
	struct vm_event *vmevent;
	struct vm_lapic_irq *vmirq;
	struct vm_capability *vmcap;
	struct vm_pptdev *pptdev;
	struct vm_pptdev_mmio *pptmmio;
	struct vm_pptdev_msi *pptmsi;
	struct vm_nmi *vmnmi;
	struct vm_stats *vmstats;
	struct vm_stat_desc *statdesc;
	mtx_lock(&vmmdev_mtx);
	sc = vmmdev_lookup2(cdev);
	if (sc == NULL) {
		mtx_unlock(&vmmdev_mtx);
		return (ENXIO);
	}
	/*
	 * Some VMM ioctls can operate only on vcpus that are not running.
	 */
	switch (cmd) {
	case VM_RUN:
	case VM_SET_PINNING:
	case VM_GET_REGISTER:
	case VM_SET_REGISTER:
	case VM_GET_SEGMENT_DESCRIPTOR:
	case VM_SET_SEGMENT_DESCRIPTOR:
	case VM_INJECT_EVENT:
	case VM_GET_CAPABILITY:
	case VM_SET_CAPABILITY:
	case VM_PPTDEV_MSI:
		/*
		 * XXX fragile, handle with care
		 * Assumes that the first field of the ioctl data is the vcpu.
		 */
		vcpu = *(int *)data;
		if (vcpu < 0 || vcpu >= VM_MAXCPU) {
			error = EINVAL;
			goto done;
		}
		if (vcpu_is_running(sc->vm, vcpu, NULL)) {
			error = EBUSY;
			goto done;
		}
		break;
	default:
		break;
	}
	switch(cmd) {
	case VM_RUN:
		/*
		 * Mark the vcpu running (so concurrent ioctls get EBUSY),
		 * then drop the mutex: vm_run() can block for a long time
		 * and must not hold vmmdev_mtx while the guest executes.
		 */
		vmrun = (struct vm_run *)data;
		vm_set_run_state(sc->vm, vmrun->cpuid, VCPU_RUNNING);
		mtx_unlock(&vmmdev_mtx);
		error = vm_run(sc->vm, vmrun);
		mtx_lock(&vmmdev_mtx);
		vm_set_run_state(sc->vm, vmrun->cpuid, VCPU_STOPPED);
		break;
	case VM_STAT_DESC: {
		/* Translate a statistic index into its description string. */
		const char *desc;
		statdesc = (struct vm_stat_desc *)data;
		desc = vmm_stat_desc(statdesc->index);
		if (desc != NULL) {
			error = 0;
			strlcpy(statdesc->desc, desc, sizeof(statdesc->desc));
		} else
			error = EINVAL;
		break;
	}
	case VM_STATS: {
		/* Copy out all statistics for the requested vcpu. */
		CTASSERT(MAX_VM_STATS >= MAX_VMM_STAT_TYPES);
		vmstats = (struct vm_stats *)data;
		getmicrotime(&vmstats->tv);
		error = vmm_stat_copy(sc->vm, vmstats->cpuid,
				      &vmstats->num_entries, vmstats->statbuf);
		break;
	}
	/* PCI passthrough device operations. */
	case VM_PPTDEV_MSI:
		pptmsi = (struct vm_pptdev_msi *)data;
		error = ppt_setup_msi(sc->vm, pptmsi->vcpu,
				      pptmsi->bus, pptmsi->slot, pptmsi->func,
				      pptmsi->destcpu, pptmsi->vector,
				      pptmsi->numvec);
		break;
	case VM_MAP_PPTDEV_MMIO:
		pptmmio = (struct vm_pptdev_mmio *)data;
		error = ppt_map_mmio(sc->vm, pptmmio->bus, pptmmio->slot,
				     pptmmio->func, pptmmio->gpa, pptmmio->len,
				     pptmmio->hpa);
		break;
	case VM_BIND_PPTDEV:
		pptdev = (struct vm_pptdev *)data;
		error = ppt_assign_device(sc->vm, pptdev->bus, pptdev->slot,
					  pptdev->func);
		break;
	case VM_UNBIND_PPTDEV:
		pptdev = (struct vm_pptdev *)data;
		error = ppt_unassign_device(sc->vm, pptdev->bus, pptdev->slot,
					    pptdev->func);
		break;
	/* Interrupt / event injection. */
	case VM_INJECT_EVENT:
		vmevent = (struct vm_event *)data;
		error = vm_inject_event(sc->vm, vmevent->cpuid, vmevent->type,
					vmevent->vector,
					vmevent->error_code,
					vmevent->error_code_valid);
		break;
	case VM_INJECT_NMI:
		vmnmi = (struct vm_nmi *)data;
		error = vm_inject_nmi(sc->vm, vmnmi->cpuid);
		break;
	case VM_LAPIC_IRQ:
		vmirq = (struct vm_lapic_irq *)data;
		error = lapic_set_intr(sc->vm, vmirq->cpuid, vmirq->vector);
		break;
	/* vcpu-to-hostcpu pinning. */
	case VM_SET_PINNING:
		vmpin = (struct vm_pin *)data;
		error = vm_set_pinning(sc->vm, vmpin->vm_cpuid,
				       vmpin->host_cpuid);
		break;
	case VM_GET_PINNING:
		vmpin = (struct vm_pin *)data;
		error = vm_get_pinning(sc->vm, vmpin->vm_cpuid,
				       &vmpin->host_cpuid);
		break;
	/* Guest physical memory map management. */
	case VM_MAP_MEMORY:
		seg = (struct vm_memory_segment *)data;
		error = vm_malloc(sc->vm, seg->gpa, seg->len, &seg->hpa);
		break;
	case VM_GET_MEMORY_SEG:
		seg = (struct vm_memory_segment *)data;
		seg->hpa = seg->len = 0;
		(void)vm_gpabase2memseg(sc->vm, seg->gpa, seg);
		error = 0;
		break;
	/* Register and segment descriptor access. */
	case VM_GET_REGISTER:
		vmreg = (struct vm_register *)data;
		error = vm_get_register(sc->vm, vmreg->cpuid, vmreg->regnum,
					&vmreg->regval);
		break;
	case VM_SET_REGISTER:
		vmreg = (struct vm_register *)data;
		error = vm_set_register(sc->vm, vmreg->cpuid, vmreg->regnum,
					vmreg->regval);
		break;
	case VM_SET_SEGMENT_DESCRIPTOR:
		vmsegdesc = (struct vm_seg_desc *)data;
		error = vm_set_seg_desc(sc->vm, vmsegdesc->cpuid,
					vmsegdesc->regnum,
					&vmsegdesc->desc);
		break;
	case VM_GET_SEGMENT_DESCRIPTOR:
		vmsegdesc = (struct vm_seg_desc *)data;
		error = vm_get_seg_desc(sc->vm, vmsegdesc->cpuid,
					vmsegdesc->regnum,
					&vmsegdesc->desc);
		break;
	/* Capability access. */
	case VM_GET_CAPABILITY:
		vmcap = (struct vm_capability *)data;
		error = vm_get_capability(sc->vm, vmcap->cpuid,
					  vmcap->captype,
					  &vmcap->capval);
		break;
	case VM_SET_CAPABILITY:
		vmcap = (struct vm_capability *)data;
		error = vm_set_capability(sc->vm, vmcap->cpuid,
					  vmcap->captype,
					  vmcap->capval);
		break;
	default:
		error = ENOTTY;
		break;
	}
done:
	mtx_unlock(&vmmdev_mtx);
	return (error);
}
/*
 * d_mmap handler: translate a guest physical offset into a host physical
 * address for the pager.  Executable mappings are refused, as is any
 * offset that does not resolve to backed guest memory.  Returns 0 on
 * success, -1 on failure.
 */
static int
vmmdev_mmap(struct cdev *cdev, vm_offset_t offset, vm_paddr_t *paddr, int nprot)
{
	struct vmmdev_softc *sc;
	int error;

	error = -1;
	mtx_lock(&vmmdev_mtx);

	sc = vmmdev_lookup2(cdev);
	if (sc == NULL || (nprot & PROT_EXEC) != 0)
		goto out;

	*paddr = vm_gpa2hpa(sc->vm, (vm_paddr_t)offset, PAGE_SIZE);
	if (*paddr != (vm_paddr_t)-1)
		error = 0;
out:
	mtx_unlock(&vmmdev_mtx);
	return (error);
}
/*
 * Tear down a vmm device instance: unlink it from the global list,
 * destroy its /dev node, destroy the backing vm and free the softc.
 * Must be called with vmmdev_mtx held.
 */
static void
vmmdev_destroy(struct vmmdev_softc *sc)
{
#ifdef notyet /* XXX kernel is not compiled with invariants */
	mtx_assert(&vmmdev_mtx, MA_OWNED);
#endif
	/*
	 * XXX must stop virtual machine instances that may be still
	 * running and cleanup their state.
	 */
	SLIST_REMOVE(&head, sc, vmmdev_softc, link);
	destroy_dev(sc->cdev);
	vm_destroy(sc->vm);
	free(sc, M_VMMDEV);
}
static int
sysctl_vmm_destroy(SYSCTL_HANDLER_ARGS)
{
int error;
char buf[VM_MAX_NAMELEN];
struct vmmdev_softc *sc;
strlcpy(buf, "beavis", sizeof(buf));
error = sysctl_handle_string(oidp, buf, sizeof(buf), req);
if (error != 0 || req->newptr == NULL)
return (error);
mtx_lock(&vmmdev_mtx);
sc = vmmdev_lookup(buf);
if (sc == NULL) {
mtx_unlock(&vmmdev_mtx);
return (EINVAL);
}
vmmdev_destroy(sc);
mtx_unlock(&vmmdev_mtx);
return (0);
}
SYSCTL_PROC(_hw_vmm, OID_AUTO, destroy, CTLTYPE_STRING | CTLFLAG_RW,
NULL, 0, sysctl_vmm_destroy, "A", NULL);
/*
 * Character device switch for /dev/vmm/<name>.  read/write share one
 * handler that copies guest physical memory; mmap exposes guest memory
 * to user space; ioctl carries all control operations.
 */
static struct cdevsw vmmdevsw = {
	.d_name		= "vmmdev",
	.d_version	= D_VERSION,
	.d_ioctl	= vmmdev_ioctl,
	.d_mmap		= vmmdev_mmap,
	.d_read		= vmmdev_rw,
	.d_write	= vmmdev_rw,
};
/*
 * hw.vmm.create sysctl handler: create a new vm instance and its
 * /dev/vmm/<name> device node.  Returns EEXIST if the name is taken and
 * EINVAL if the hypervisor could not create the vm.
 */
static int
sysctl_vmm_create(SYSCTL_HANDLER_ARGS)
{
	int error;
	struct vm *vm;
	struct vmmdev_softc *sc;
	char buf[VM_MAX_NAMELEN];
	/* Placeholder value shown on reads; replaced by the new string. */
	strlcpy(buf, "beavis", sizeof(buf));
	error = sysctl_handle_string(oidp, buf, sizeof(buf), req);
	if (error != 0 || req->newptr == NULL)
		return (error);
	mtx_lock(&vmmdev_mtx);
	sc = vmmdev_lookup(buf);
	if (sc != NULL) {
		mtx_unlock(&vmmdev_mtx);
		return (EEXIST);
	}
	/*
	 * NOTE(review): vm_create(), malloc(M_WAITOK) and make_dev() are
	 * all called while holding a default (non-sleepable-safe) mutex;
	 * if any of them can sleep this will trip WITNESS — confirm, or
	 * restructure to allocate before taking the lock.
	 */
	vm = vm_create(buf);
	if (vm == NULL) {
		mtx_unlock(&vmmdev_mtx);
		return (EINVAL);
	}
	sc = malloc(sizeof(struct vmmdev_softc), M_VMMDEV, M_WAITOK | M_ZERO);
	sc->vm = vm;
	sc->cdev = make_dev(&vmmdevsw, 0, UID_ROOT, GID_WHEEL, 0600,
			    "vmm/%s", buf);
	sc->cdev->si_drv1 = sc;
	SLIST_INSERT_HEAD(&head, sc, link);
	mtx_unlock(&vmmdev_mtx);
	return (0);
}
SYSCTL_PROC(_hw_vmm, OID_AUTO, create, CTLTYPE_STRING | CTLFLAG_RW,
	    NULL, 0, sysctl_vmm_create, "A", NULL);
/* Module-load initialization: set up the mutex guarding the device list. */
void
vmmdev_init(void)
{
	mtx_init(&vmmdev_mtx, "vmm device mutex", NULL, MTX_DEF);
}
/*
 * Module-unload cleanup: destroy every remaining vm device instance.
 * The _SAFE iterator is required because vmmdev_destroy() unlinks the
 * current element.
 */
void
vmmdev_cleanup(void)
{
	struct vmmdev_softc *cur, *next;

	mtx_lock(&vmmdev_mtx);
	SLIST_FOREACH_SAFE(cur, &head, link, next)
		vmmdev_destroy(cur);
	mtx_unlock(&vmmdev_mtx);
}

103
sys/amd64/vmm/vmm_ipi.c Normal file
View File

@ -0,0 +1,103 @@
/*-
* Copyright (c) 2011 NetApp, Inc.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* $FreeBSD$
*/
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/proc.h>
#include <sys/bus.h>
#include <machine/intr_machdep.h>
#include <machine/apicvar.h>
#include <machine/segments.h>
#include <machine/md_var.h>
#include <machine/smp.h>
#include <machine/vmm.h>
#include "vmm_ipi.h"
/* IDT handlers provided by the platform: an unused-slot marker and a
 * minimal EOI-and-return handler. */
extern inthand_t IDTVEC(rsvd), IDTVEC(justreturn);
/*
 * The default is to use the IPI_AST to interrupt a vcpu.
 */
static int ipinum = IPI_AST;
/* The vector search in vmm_ipi_init() starts just below the spurious
 * vector and assumes it is the top of the IDT range. */
CTASSERT(APIC_SPURIOUS_INT == 255);
/*
 * Try to claim an unused IDT vector for a lightweight vcpu-kick IPI.
 * Falls back to IPI_AST if every candidate vector is occupied.
 */
void
vmm_ipi_init(void)
{
	int idx;
	uintptr_t func;
	struct gate_descriptor *ip;
	/*
	 * Search backwards from the highest IDT vector available for use
	 * as our IPI vector. We install the 'justreturn' handler at that
	 * vector and use it to interrupt the vcpus.
	 *
	 * We do this because the IPI_AST is heavyweight and saves all
	 * registers in the trapframe. This is overkill for our use case
	 * which is simply to EOI the interrupt and return.
	 */
	idx = APIC_SPURIOUS_INT;
	while (--idx >= APIC_IPI_INTS) {
		ip = &idt[idx];
		/* Reassemble the handler address from the gate descriptor. */
		func = ((long)ip->gd_hioffset << 16 | ip->gd_looffset);
		if (func == (uintptr_t)&IDTVEC(rsvd)) {
			/* Found an unused slot: install our handler there. */
			ipinum = idx;
			setidt(ipinum, IDTVEC(justreturn), SDT_SYSIGT,
			       SEL_KPL, 0);
			break;
		}
	}
	if (ipinum != IPI_AST && bootverbose) {
		printf("vmm_ipi_init: installing ipi handler to interrupt "
		       "vcpus at vector %d\n", ipinum);
	}
}
/* Undo vmm_ipi_init(): return any privately claimed vector to 'rsvd'. */
void
vmm_ipi_cleanup(void)
{

	if (ipinum == IPI_AST)
		return;

	setidt(ipinum, IDTVEC(rsvd), SDT_SYSIGT, SEL_KPL, 0);
}
/*
 * Kick the host cpu that 'vcpu' is currently running on so the vcpu
 * drops out of guest context.  A stopped vcpu, or one running on the
 * calling cpu, needs no IPI.
 */
void
vm_interrupt_hostcpu(struct vm *vm, int vcpu)
{
	int hostcpu;

	if (!vcpu_is_running(vm, vcpu, &hostcpu))
		return;

	if (hostcpu != curcpu)
		ipi_selected((cpumask_t)1 << hostcpu, ipinum);
}

38
sys/amd64/vmm/vmm_ipi.h Normal file
View File

@ -0,0 +1,38 @@
/*-
* Copyright (c) 2011 NetApp, Inc.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* $FreeBSD$
*/
#ifndef _VMM_IPI_H_
#define	_VMM_IPI_H_

struct vm;

/* Claim (and later release) an IDT vector used to interrupt vcpus. */
void vmm_ipi_init(void);
void vmm_ipi_cleanup(void);
/* Send an IPI to the host cpu that 'vcpu' is currently running on. */
void vm_interrupt_hostcpu(struct vm *vm, int vcpu);
#endif

51
sys/amd64/vmm/vmm_ktr.h Normal file
View File

@ -0,0 +1,51 @@
/*-
* Copyright (c) 2011 NetApp, Inc.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* $FreeBSD$
*/
#ifndef _VMM_KTR_H_
#define	_VMM_KTR_H_

#include <sys/ktr.h>
#include <sys/pcpu.h>

/* All vmm trace records go to the generic ktr class. */
#define	KTR_VMM	KTR_GEN

/*
 * CTRn-style tracing wrappers that prefix every record with the vm name,
 * the vcpu id and the current host cpu.  VMM_CTRn takes n format args.
 */
#define	VMM_CTR0(vm, vcpuid, format) \
CTR3(KTR_VMM, "vm %s-%d(%d): " format, vm_name((vm)), (vcpuid), curcpu)
#define	VMM_CTR1(vm, vcpuid, format, p1) \
CTR4(KTR_VMM, "vm %s-%d(%d): " format, vm_name((vm)), (vcpuid), curcpu, \
     (p1))
#define	VMM_CTR2(vm, vcpuid, format, p1, p2) \
CTR5(KTR_VMM, "vm %s-%d(%d): " format, vm_name((vm)), (vcpuid), curcpu, \
     (p1), (p2))
#define	VMM_CTR3(vm, vcpuid, format, p1, p2, p3) \
CTR6(KTR_VMM, "vm %s-%d(%d): " format, vm_name((vm)), (vcpuid), curcpu, \
     (p1), (p2), (p3))
#endif

121
sys/amd64/vmm/vmm_lapic.c Normal file
View File

@ -0,0 +1,121 @@
/*-
* Copyright (c) 2011 NetApp, Inc.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* $FreeBSD$
*/
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
#include <sys/param.h>
#include <sys/systm.h>
#include <machine/vmm.h>
#include "vmm_ipi.h"
#include "vmm_lapic.h"
#include "vlapic.h"
/*
 * Forward a DWORD write at 'offset' to the vcpu's virtual local APIC.
 * Returns 1 if the access was handled, 0 otherwise.
 */
int
lapic_write(struct vm *vm, int cpu, u_int offset, uint64_t val)
{
	struct vlapic *vlapic;

	vlapic = vm_lapic(vm, cpu);

	/* vlapic_op_mem_write() returns 0 on success. */
	return (vlapic_op_mem_write(vlapic, offset, DWORD, val) == 0);
}
/*
 * Forward a DWORD read at 'offset' to the vcpu's virtual local APIC.
 * Returns 1 if the access was handled (result in '*rv'), 0 otherwise.
 */
int
lapic_read(struct vm *vm, int cpu, u_int offset, uint64_t *rv)
{
	struct vlapic *vlapic;

	vlapic = vm_lapic(vm, cpu);

	/* vlapic_op_mem_read() returns 0 on success. */
	return (vlapic_op_mem_read(vlapic, offset, DWORD, rv) == 0);
}
/* Query the vcpu's vlapic for a deliverable pending interrupt vector. */
int
lapic_pending_intr(struct vm *vm, int cpu)
{

	return (vlapic_pending_intr(vm_lapic(vm, cpu)));
}
/* Tell the vcpu's vlapic that 'vector' has been accepted by the guest. */
void
lapic_intr_accepted(struct vm *vm, int cpu, int vector)
{

	vlapic_intr_accepted(vm_lapic(vm, cpu), vector);
}
/*
 * Record interrupt 'vector' as pending in the vcpu's IRR and kick the
 * host cpu so it is noticed.  Vectors below 32 are architectural
 * exceptions and are rejected, as are out-of-range vcpu ids.
 */
int
lapic_set_intr(struct vm *vm, int cpu, int vector)
{
	struct vlapic *vlapic;

	if (cpu < 0 || cpu >= VM_MAXCPU || vector < 32 || vector > 255)
		return (EINVAL);

	vlapic = vm_lapic(vm, cpu);
	vlapic_set_intr_ready(vlapic, vector);

	vm_interrupt_hostcpu(vm, cpu);
	return (0);
}
/* Advance the vcpu's vlapic timer by one tick. */
void
lapic_timer_tick(struct vm *vm, int cpu)
{

	vlapic_timer_tick(vm_lapic(vm, cpu));
}

64
sys/amd64/vmm/vmm_lapic.h Normal file
View File

@ -0,0 +1,64 @@
/*-
* Copyright (c) 2011 NetApp, Inc.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* $FreeBSD$
*/
#ifndef _VMM_LAPIC_H_
#define	_VMM_LAPIC_H_

struct vm;

/*
 * Handle a guest DWORD access to the local APIC at 'offset'.
 * Both return 1 if the access was handled and 0 otherwise.
 */
int lapic_write(struct vm *vm, int cpu, u_int offset, uint64_t val);
int lapic_read(struct vm *vm, int cpu, u_int offset, uint64_t *retval);

/* Advance the vcpu's virtual APIC timer by one tick. */
void lapic_timer_tick(struct vm *vm, int cpu);

/*
 * Returns a vector between 32 and 255 if an interrupt is pending in the
 * IRR that can be delivered based on the current state of ISR and TPR.
 *
 * Note that the vector does not automatically transition to the ISR as a
 * result of calling this function.
 *
 * Returns -1 if there is no eligible vector that can be delivered to the
 * guest at this time.
 */
int lapic_pending_intr(struct vm *vm, int cpu);

/*
 * Transition 'vector' from IRR to ISR. This function is called with the
 * vector returned by 'lapic_pending_intr()' when the guest is able to
 * accept this interrupt (i.e. RFLAGS.IF = 1 and no conditions exist that
 * block interrupt delivery).
 */
void lapic_intr_accepted(struct vm *vm, int cpu, int vector);

/*
 * Signals to the LAPIC that an interrupt at 'vector' needs to be generated
 * to the 'cpu', the state is recorded in IRR.
 */
int lapic_set_intr(struct vm *vm, int cpu, int vector);
#endif

413
sys/amd64/vmm/vmm_mem.c Normal file
View File

@ -0,0 +1,413 @@
/*-
* Copyright (c) 2011 NetApp, Inc.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* $FreeBSD$
*/
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
#include <sys/param.h>
#include <sys/lock.h>
#include <sys/mutex.h>
#include <sys/linker.h>
#include <sys/systm.h>
#include <sys/malloc.h>
#include <sys/kernel.h>
#include <vm/vm.h>
#include <vm/pmap.h>
#include <machine/md_var.h>
#include <machine/metadata.h>
#include <machine/pc/bios.h>
#include <machine/vmparam.h>
#include <machine/pmap.h>
#include "vmm_util.h"
#include "vmm_mem.h"
static MALLOC_DEFINE(M_VMM_MEM, "vmm memory", "vmm memory");

#define	MB	(1024 * 1024)
#define	GB	(1024 * MB)

/* Maximum number of discontiguous free segments we can track. */
#define	VMM_MEM_MAXSEGS	64

/* protected by vmm_mem_mtx */
/* Free list of physical memory stolen from FreeBSD, kept sorted by base
 * address; vmm_mem_nsegs is the number of valid entries. */
static struct {
	vm_paddr_t	base;
	vm_size_t	length;
} vmm_mem_avail[VMM_MEM_MAXSEGS];

static int vmm_mem_nsegs;

/* Highest physical address covered by any stolen segment. */
static vm_paddr_t maxaddr;

static struct mtx vmm_mem_mtx;
/*
 * Steal any memory that was deliberately hidden from FreeBSD either by
 * the use of MAXMEM kernel config option or the hw.physmem loader tunable.
 */
static int
vmm_mem_steal_memory(void)
{
	int nsegs;
	caddr_t kmdp;
	uint32_t smapsize;
	uint64_t base, length;
	struct bios_smap *smapbase, *smap, *smapend;
	/*
	 * Borrowed from hammer_time() and getmemsize() in machdep.c
	 */
	kmdp = preload_search_by_type("elf kernel");
	if (kmdp == NULL)
		kmdp = preload_search_by_type("elf64 kernel");
	/*
	 * NOTE(review): 'kmdp' may still be NULL here and is passed to
	 * preload_search_info() unchecked — confirm that is tolerated.
	 */
	smapbase = (struct bios_smap *)preload_search_info(kmdp,
	    MODINFO_METADATA | MODINFOMD_SMAP);
	if (smapbase == NULL)
		panic("No BIOS smap info from loader!");
	/* The metadata record's size word immediately precedes the data. */
	smapsize = *((uint32_t *)smapbase - 1);
	smapend = (struct bios_smap *)((uintptr_t)smapbase + smapsize);
	nsegs = 0;
	for (smap = smapbase; smap < smapend; smap++) {
		/*
		 * XXX
		 * Assuming non-overlapping, monotonically increasing
		 * memory segments.
		 */
		if (smap->type != SMAP_TYPE_MEMORY)
			continue;
		if (smap->length == 0)
			break;
		/* Trim to 2MB (NBPDR) alignment for superpage mapping. */
		base = roundup(smap->base, NBPDR);
		length = rounddown(smap->length, NBPDR);
		/* Skip this segment if FreeBSD is using all of it. */
		if (base + length <= ptoa(Maxmem))
			continue;
		/*
		 * If FreeBSD is using part of this segment then adjust
		 * 'base' and 'length' accordingly.
		 */
		if (base < ptoa(Maxmem)) {
			uint64_t used;
			used = roundup(ptoa(Maxmem), NBPDR) - base;
			base += used;
			length -= used;
		}
		if (length == 0)
			continue;
		vmm_mem_avail[nsegs].base = base;
		vmm_mem_avail[nsegs].length = length;
		if (base + length > maxaddr)
			maxaddr = base + length;
		if (0 && bootverbose) {
			printf("vmm_mem_populate: index %d, base 0x%0lx, "
			       "length %ld\n",
			       nsegs, vmm_mem_avail[nsegs].base,
			       vmm_mem_avail[nsegs].length);
		}
		nsegs++;
		if (nsegs >= VMM_MEM_MAXSEGS) {
			printf("vmm_mem_populate: maximum number of vmm memory "
			       "segments reached!\n");
			return (ENOSPC);
		}
	}
	vmm_mem_nsegs = nsegs;
	return (0);
}
/*
 * Extend the kernel's direct map to cover the stolen physical range
 * [start, end) using superpages (2MB, or 1GB if ever enabled).
 * New page directory pages allocated here are never freed (see XXX).
 */
static void
vmm_mem_direct_map(vm_paddr_t start, vm_paddr_t end)
{
	vm_paddr_t addr, remaining;
	int pdpi, pdi, superpage_size;
	pml4_entry_t *pml4p;
	pdp_entry_t *pdp;
	pd_entry_t *pd;
	uint64_t page_attr_bits;
	if (end >= NBPML4)
		panic("Cannot map memory beyond %ldGB", NBPML4 / GB);
	/* XXX FreeBSD 8.1 does not use 1G superpages in the direct map */
	if (0 && vmm_supports_1G_pages())
		superpage_size = NBPDP;
	else
		superpage_size = NBPDR;
	/*
	 * Get the page directory pointer page that contains the direct
	 * map address mappings.
	 */
	pml4p = kernel_pmap->pm_pml4;
	pdp = (pdp_entry_t *)PHYS_TO_DMAP(pml4p[DMPML4I] & ~PAGE_MASK);
	page_attr_bits = PG_RW | PG_V | PG_PS | PG_G;
	addr = start;
	while (addr < end) {
		remaining = end - addr;
		pdpi = addr / NBPDP;
		/* 1GB path: currently unreachable (superpage_size is NBPDR). */
		if (superpage_size == NBPDP &&
		    remaining >= NBPDP &&
		    addr % NBPDP == 0) {
			/*
			 * If there isn't a mapping for this address then
			 * create one but if there is one already make sure
			 * it matches what we expect it to be.
			 */
			if (pdp[pdpi] == 0) {
				pdp[pdpi] = addr | page_attr_bits;
				if (0 && bootverbose) {
					printf("vmm_mem_populate: mapping "
					       "0x%lx with 1GB page at "
					       "pdpi %d\n", addr, pdpi);
				}
			} else {
				pdp_entry_t pdpe = pdp[pdpi];
				if ((pdpe & ~PAGE_MASK) != addr ||
				    (pdpe & page_attr_bits) != page_attr_bits) {
					panic("An invalid mapping 0x%016lx "
					      "already exists for 0x%016lx\n",
					      pdpe, addr);
				}
			}
			addr += NBPDP;
		} else {
			/* 2MB path: map one page directory entry at a time. */
			if (remaining < NBPDR) {
				panic("vmm_mem_populate: remaining (%ld) must "
				      "be greater than NBPDR (%d)\n",
				      remaining, NBPDR);
			}
			if (pdp[pdpi] == 0) {
				/*
				 * XXX we lose this memory forever because
				 * we do not keep track of the virtual address
				 * that would be required to free this page.
				 */
				pd = malloc(PAGE_SIZE, M_VMM_MEM,
					    M_WAITOK | M_ZERO);
				if ((uintptr_t)pd & PAGE_MASK) {
					panic("vmm_mem_populate: page directory"
					      "page not aligned on %d "
					      "boundary\n", PAGE_SIZE);
				}
				pdp[pdpi] = vtophys(pd);
				pdp[pdpi] |= PG_RW | PG_V | PG_U;
				if (0 && bootverbose) {
					printf("Creating page directory "
					       "at pdp index %d for 0x%016lx\n",
					       pdpi, addr);
				}
			}
			pdi = (addr % NBPDP) / NBPDR;
			pd = (pd_entry_t *)PHYS_TO_DMAP(pdp[pdpi] & ~PAGE_MASK);
			/*
			 * Create a new mapping if one doesn't already exist
			 * or validate it if it does.
			 */
			if (pd[pdi] == 0) {
				pd[pdi] = addr | page_attr_bits;
				if (0 && bootverbose) {
					printf("vmm_mem_populate: mapping "
					       "0x%lx with 2MB page at "
					       "pdpi %d, pdi %d\n",
					       addr, pdpi, pdi);
				}
			} else {
				pd_entry_t pde = pd[pdi];
				if ((pde & ~PAGE_MASK) != addr ||
				    (pde & page_attr_bits) != page_attr_bits) {
					panic("An invalid mapping 0x%016lx "
					      "already exists for 0x%016lx\n",
					      pde, addr);
				}
			}
			addr += NBPDR;
		}
	}
}
/*
 * Discover the physical memory hidden from FreeBSD and wire it into the
 * direct map so the hypervisor can address it.  Returns 0 on success or
 * the error from vmm_mem_steal_memory().
 */
static int
vmm_mem_populate(void)
{
	vm_paddr_t start, end;
	int i, error;

	/* populate the vmm_mem_avail[] array */
	error = vmm_mem_steal_memory();
	if (error)
		return (error);

	/*
	 * Now map the memory that was hidden from FreeBSD in
	 * the direct map VA space.
	 */
	for (i = 0; i < vmm_mem_nsegs; i++) {
		start = vmm_mem_avail[i].base;
		end = start + vmm_mem_avail[i].length;

		/* Superpage mapping requires 2MB-aligned boundaries. */
		if ((start & PDRMASK) != 0 || (end & PDRMASK) != 0) {
			panic("start (0x%016lx) and end (0x%016lx) must be "
			      "aligned on a %dMB boundary\n",
			      start, end, NBPDR / MB);
		}
		vmm_mem_direct_map(start, end);
	}

	return (0);
}
/*
 * One-time initialization of the vmm physical memory allocator:
 * set up the lock, then claim and direct-map the hidden memory.
 */
int
vmm_mem_init(void)
{

	mtx_init(&vmm_mem_mtx, "vmm_mem_mtx", NULL, MTX_DEF);

	return (vmm_mem_populate());
}
/*
 * First-fit allocator over the free segment list.  'size' must be a
 * multiple of 2MB (NBPDR).  Returns the physical address of the carved
 * region, or 0 if no segment is large enough.
 */
vm_paddr_t
vmm_mem_alloc(size_t size)
{
	vm_paddr_t pa;
	int seg;

	if ((size & PDRMASK) != 0) {
		panic("vmm_mem_alloc: size 0x%0lx must be "
		      "aligned on a 0x%0x boundary\n", size, NBPDR);
	}

	pa = 0;
	mtx_lock(&vmm_mem_mtx);
	for (seg = 0; seg < vmm_mem_nsegs; seg++) {
		if (vmm_mem_avail[seg].length < size)
			continue;

		/* Carve the allocation off the front of this segment. */
		pa = vmm_mem_avail[seg].base;
		vmm_mem_avail[seg].base += size;
		vmm_mem_avail[seg].length -= size;

		/* remove a zero length segment */
		if (vmm_mem_avail[seg].length == 0) {
			memmove(&vmm_mem_avail[seg],
				&vmm_mem_avail[seg + 1],
				(vmm_mem_nsegs - (seg + 1)) *
				sizeof(vmm_mem_avail[0]));
			vmm_mem_nsegs--;
		}
		break;
	}
	mtx_unlock(&vmm_mem_mtx);

	return (pa);
}
/*
 * Return the region [base, base + length) to the free segment list,
 * keeping the list sorted by base address and merging adjacent segments.
 * Both 'base' and 'length' must be multiples of 2MB (NBPDR).
 */
void
vmm_mem_free(vm_paddr_t base, size_t length)
{
	int i;
	if ((base & PDRMASK) != 0 || (length & PDRMASK) != 0) {
		panic("vmm_mem_free: base 0x%0lx and length 0x%0lx must be "
		      "aligned on a 0x%0x boundary\n", base, length, NBPDR);
	}
	mtx_lock(&vmm_mem_mtx);
	/* Find the insertion point that keeps the list sorted by base. */
	for (i = 0; i < vmm_mem_nsegs; i++) {
		if (vmm_mem_avail[i].base > base)
			break;
	}
	if (vmm_mem_nsegs >= VMM_MEM_MAXSEGS)
		panic("vmm_mem_free: cannot free any more segments");
	/* Create a new segment at index 'i' */
	memmove(&vmm_mem_avail[i + 1], &vmm_mem_avail[i],
		(vmm_mem_nsegs - i) * sizeof(vmm_mem_avail[0]));
	vmm_mem_avail[i].base = base;
	vmm_mem_avail[i].length = length;
	vmm_mem_nsegs++;
/* Merge adjacent segments; restart after every merge since indices shift. */
coalesce_some_more:
	for (i = 0; i < vmm_mem_nsegs - 1; i++) {
		if (vmm_mem_avail[i].base + vmm_mem_avail[i].length ==
		    vmm_mem_avail[i + 1].base) {
			vmm_mem_avail[i].length += vmm_mem_avail[i + 1].length;
			memmove(&vmm_mem_avail[i + 1], &vmm_mem_avail[i + 2],
			  (vmm_mem_nsegs - (i + 2)) * sizeof(vmm_mem_avail[0]));
			vmm_mem_nsegs--;
			goto coalesce_some_more;
		}
	}
	mtx_unlock(&vmm_mem_mtx);
}
/* Return the highest physical address covered by any stolen segment. */
vm_paddr_t
vmm_mem_maxaddr(void)
{

	return (maxaddr);
}
/* Print the free segment list as "<index> <start> <end>" lines. */
void
vmm_mem_dump(void)
{
	vm_paddr_t start, end;
	int i;

	mtx_lock(&vmm_mem_mtx);
	for (i = 0; i < vmm_mem_nsegs; i++) {
		start = vmm_mem_avail[i].base;
		end = start + vmm_mem_avail[i].length;
		printf("%-4d0x%016lx 0x%016lx\n", i, start, end);
	}
	mtx_unlock(&vmm_mem_mtx);
}

38
sys/amd64/vmm/vmm_mem.h Normal file
View File

@ -0,0 +1,38 @@
/*-
* Copyright (c) 2011 NetApp, Inc.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* $FreeBSD$
*/
#ifndef _VMM_MEM_H_
#define _VMM_MEM_H_

/* Interface to the vmm physical-memory allocator (vmm_mem.c). */
int vmm_mem_init(void);
/* Allocate superpage-aligned physical memory; 0 on failure. */
vm_paddr_t vmm_mem_alloc(size_t size);
/* Return an allocation; 'start'/'size' must be superpage aligned. */
void vmm_mem_free(vm_paddr_t start, size_t size);
vm_paddr_t vmm_mem_maxaddr(void);
/* Debug: print the free segment list to the console. */
void vmm_mem_dump(void);
#endif

264
sys/amd64/vmm/vmm_msr.c Normal file
View File

@ -0,0 +1,264 @@
/*-
* Copyright (c) 2011 NetApp, Inc.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* $FreeBSD$
*/
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
#include <sys/param.h>
#include <sys/systm.h>
#include <machine/specialreg.h>
#include <machine/apicreg.h>
#include <machine/vmm.h>
#include "vmm_lapic.h"
#include "vmm_msr.h"
/* MSR is fully emulated; the hardware register is never touched. */
#define VMM_MSR_F_EMULATE	0x01
/* Guest writes are dropped (but still reported as handled). */
#define VMM_MSR_F_READONLY	0x02

/* One entry per MSR that vmm manages on behalf of the guest. */
struct vmm_msr {
	int		num;		/* MSR number */
	int		flags;		/* VMM_MSR_F_* */
	uint64_t	hostval;	/* host value cached by vmm_msr_init() */
};

static struct vmm_msr vmm_msr[] = {
	{ MSR_LSTAR,	0 },
	{ MSR_CSTAR,	0 },
	{ MSR_STAR,	0 },
	{ MSR_SF_MASK,	0 },
	{ MSR_APICBASE,	VMM_MSR_F_EMULATE },
	{ MSR_BIOS_SIGN,VMM_MSR_F_EMULATE },
	{ MSR_MCG_CAP,	VMM_MSR_F_EMULATE | VMM_MSR_F_READONLY },
};

#define vmm_msr_num	(sizeof(vmm_msr) / sizeof(vmm_msr[0]))
/* The per-vcpu guest_msrs[] array must be able to hold the table. */
CTASSERT(VMM_MSR_NUM >= vmm_msr_num);

#define readonly_msr(idx)	\
	((vmm_msr[(idx)].flags & VMM_MSR_F_READONLY) != 0)

#define emulated_msr(idx)	\
	((vmm_msr[(idx)].flags & VMM_MSR_F_EMULATE) != 0)
/*
 * Cache the host's value of every pass-through (non-emulated) MSR so it
 * can be reloaded by restore_host_msrs() after running a guest.
 */
void
vmm_msr_init(void)
{
	int i;

	for (i = 0; i < vmm_msr_num; i++) {
		if (emulated_msr(i))
			continue;
		/*
		 * XXX this assumes that the value of the host msr does not
		 * change after we have cached it.
		 */
		vmm_msr[i].hostval = rdmsr(vmm_msr[i].num);
	}
}
/*
 * Set the initial (reset) value of every managed MSR for vcpu 'cpu'.
 * Panics if the table contains an MSR with no initialization case here,
 * which catches table/switch mismatches at first use.
 */
void
guest_msrs_init(struct vm *vm, int cpu)
{
	int i;
	uint64_t *guest_msrs;

	guest_msrs = vm_guest_msrs(vm, cpu);

	for (i = 0; i < vmm_msr_num; i++) {
		switch (vmm_msr[i].num) {
		case MSR_LSTAR:
		case MSR_CSTAR:
		case MSR_STAR:
		case MSR_SF_MASK:
		case MSR_BIOS_SIGN:
		case MSR_MCG_CAP:
			guest_msrs[i] = 0;
			break;
		case MSR_APICBASE:
			/* APIC enabled in x2APIC mode; BSP flag on vcpu 0. */
			guest_msrs[i] = DEFAULT_APIC_BASE | APICBASE_ENABLED |
					APICBASE_X2APIC;
			if (cpu == 0)
				guest_msrs[i] |= APICBASE_BSP;
			break;
		default:
			panic("guest_msrs_init: missing initialization for msr "
			      "0x%0x", vmm_msr[i].num);
		}
	}
}
/* True iff 'num' falls in the x2APIC MSR range (0x800 - 0xBFF). */
static boolean_t
x2apic_msr(u_int num)
{

	return (num >= 0x800 && num <= 0xBFF ? TRUE : FALSE);
}
/*
 * Convert an x2APIC MSR number to the corresponding APIC register
 * offset: registers are spaced 16 bytes apart starting at MSR 0x800.
 */
static u_int
x2apic_msr_to_regoff(u_int msr)
{
	u_int offset;

	offset = (msr - 0x800) << 4;
	return (offset);
}
/* True iff 'num' is the x2APIC ID register MSR (0x802). */
static boolean_t
x2apic_msr_id(u_int num)
{

	if (num != 0x802)
		return (FALSE);
	return (TRUE);
}
/*
 * Map an MSR number to its slot in the vmm_msr[] table.
 * Returns -1 if the MSR is not managed by vmm.
 */
static int
msr_num_to_idx(u_int num)
{
	int idx;

	for (idx = 0; idx < vmm_msr_num; idx++) {
		if (vmm_msr[idx].num == num)
			break;
	}

	return (idx < vmm_msr_num ? idx : -1);
}
/*
 * Handle a guest 'wrmsr' exit.  x2APIC MSR writes are forwarded to the
 * virtual local APIC; table-managed MSRs are stashed in the per-vcpu
 * guest_msrs[] (and written to hardware if not emulated).  Writes to
 * read-only MSRs are dropped but still reported as handled.
 * Returns 1 if the write was handled, 0 otherwise.
 */
int
emulate_wrmsr(struct vm *vm, int cpu, u_int num, uint64_t val)
{
	int handled, idx;
	uint64_t *guest_msrs;

	handled = 0;

	if (x2apic_msr(num))
		return (lapic_write(vm, cpu, x2apic_msr_to_regoff(num), val));

	idx = msr_num_to_idx(num);
	if (idx < 0)
		goto done;

	if (!readonly_msr(idx)) {
		guest_msrs = vm_guest_msrs(vm, cpu);

		/* Stash the value */
		guest_msrs[idx] = val;

		/* Update processor state for non-emulated MSRs */
		if (!emulated_msr(idx))
			wrmsr(vmm_msr[idx].num, val);
	}
	handled = 1;
done:
	return (handled);
}
/*
 * Handle a guest 'rdmsr' exit.  On success the 64-bit result is split
 * into the guest's RAX (low 32) and RDX (high 32) registers, matching
 * the rdmsr instruction's behavior.  Returns 1 if handled, 0 otherwise.
 */
int
emulate_rdmsr(struct vm *vm, int cpu, u_int num)
{
	int error, handled, idx;
	uint32_t eax, edx;
	uint64_t result, *guest_msrs;

	handled = 0;

	if (x2apic_msr(num)) {
		handled = lapic_read(vm, cpu, x2apic_msr_to_regoff(num),
				     &result);
		/*
		 * The version ID needs to be massaged
		 */
		if (x2apic_msr_id(num)) {
			result = result >> 24;
		}
		goto done;
	}

	idx = msr_num_to_idx(num);
	if (idx < 0)
		goto done;

	guest_msrs = vm_guest_msrs(vm, cpu);
	result = guest_msrs[idx];

	/*
	 * If this is not an emulated msr register make sure that the processor
	 * state matches our cached state.
	 */
	if (!emulated_msr(idx) && (rdmsr(num) != result)) {
		panic("emulate_rdmsr: msr 0x%0x has inconsistent cached "
		      "(0x%016lx) and actual (0x%016lx) values", num,
		      result, rdmsr(num));
	}

	handled = 1;

done:
	/* Deliver the result to the guest via RAX:RDX. */
	if (handled) {
		eax = result;
		edx = result >> 32;
		error = vm_set_register(vm, cpu, VM_REG_GUEST_RAX, eax);
		if (error)
			panic("vm_set_register(rax) error %d", error);
		error = vm_set_register(vm, cpu, VM_REG_GUEST_RDX, edx);
		if (error)
			panic("vm_set_register(rdx) error %d", error);
	}
	return (handled);
}
/*
 * Load the guest's saved copy of every pass-through MSR into the
 * hardware registers before resuming the guest.
 */
void
restore_guest_msrs(struct vm *vm, int cpu)
{
	uint64_t *guest_msrs;
	int i;

	guest_msrs = vm_guest_msrs(vm, cpu);

	for (i = 0; i < vmm_msr_num; i++) {
		if (!emulated_msr(i))
			wrmsr(vmm_msr[i].num, guest_msrs[i]);
	}
}
/*
 * Reload the host values cached by vmm_msr_init() into every
 * pass-through MSR after a guest has run.
 */
void
restore_host_msrs(struct vm *vm, int cpu)
{
	int i;

	for (i = 0; i < vmm_msr_num; i++) {
		if (!emulated_msr(i))
			wrmsr(vmm_msr[i].num, vmm_msr[i].hostval);
	}
}

42
sys/amd64/vmm/vmm_msr.h Normal file
View File

@ -0,0 +1,42 @@
/*-
* Copyright (c) 2011 NetApp, Inc.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* $FreeBSD$
*/
#ifndef _VMM_MSR_H_
#define _VMM_MSR_H_

/* Capacity of the per-vcpu saved-MSR array (must cover vmm_msr[]). */
#define VMM_MSR_NUM 16
struct vm;

/* Cache host MSR values; call once at module init. */
void vmm_msr_init(void);
/* Handle wrmsr/rdmsr exits; return non-zero if handled. */
int emulate_wrmsr(struct vm *vm, int vcpu, u_int msr, uint64_t val);
int emulate_rdmsr(struct vm *vm, int vcpu, u_int msr);
void guest_msrs_init(struct vm *vm, int cpu);
/* Swap pass-through MSRs between host and guest values. */
void restore_host_msrs(struct vm *vm, int cpu);
void restore_guest_msrs(struct vm *vm, int cpu);
#endif

103
sys/amd64/vmm/vmm_stat.c Normal file
View File

@ -0,0 +1,103 @@
/*-
* Copyright (c) 2011 NetApp, Inc.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* $FreeBSD$
*/
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
#include <sys/param.h>
#include <sys/kernel.h>
#include <sys/systm.h>
#include <sys/malloc.h>
#include <machine/vmm.h>
#include "vmm_stat.h"
/* Registry of statistic types, populated via vmm_stat_init() at SYSINIT. */
static int vstnum;
static struct vmm_stat_type *vsttab[MAX_VMM_STAT_TYPES];

static MALLOC_DEFINE(M_VMM_STAT, "vmm stat", "vmm stat");
/*
 * SYSINIT callback (see VMM_STAT_DEFINE in vmm_stat.h) that registers a
 * statistic type and assigns it a slot index in the per-vcpu buffer.
 * Types without a description, or beyond MAX_VMM_STAT_TYPES, are ignored.
 */
void
vmm_stat_init(void *arg)
{
	struct vmm_stat_type *vst = arg;

	/* We require all stats to identify themselves with a description */
	if (vst->desc == NULL)
		return;

	if (vstnum >= MAX_VMM_STAT_TYPES) {
		printf("Cannot accomodate vmm stat type \"%s\"!\n", vst->desc);
		return;
	}

	vst->index = vstnum;
	vsttab[vstnum++] = vst;
}
/*
 * Snapshot all registered counters for 'vcpu' into the caller-supplied
 * buffer 'buf' (sized for at least MAX_VMM_STAT_TYPES entries) and report
 * the count via 'num_stats'.  Returns 0, or EINVAL for a bad vcpu number.
 */
int
vmm_stat_copy(struct vm *vm, int vcpu, int *num_stats, uint64_t *buf)
{
	uint64_t *stats;
	int idx;

	if (vcpu < 0 || vcpu >= VM_MAXCPU)
		return (EINVAL);

	stats = vcpu_stats(vm, vcpu);
	for (idx = 0; idx < vstnum; idx++)
		buf[idx] = stats[idx];

	*num_stats = vstnum;
	return (0);
}
/*
 * Allocate a zeroed per-vcpu stats buffer: one 64-bit counter for each
 * registered statistic type.  Freed with vmm_stat_free().
 */
void *
vmm_stat_alloc(void)
{

	return (malloc(vstnum * sizeof(uint64_t), M_VMM_STAT,
	    M_ZERO | M_WAITOK));
}
/* Release a buffer obtained from vmm_stat_alloc(). */
void
vmm_stat_free(void *vp)
{
	free(vp, M_VMM_STAT);
}
/*
 * Return the human-readable description of the statistic at 'index',
 * or NULL if the index is outside the registered range.
 */
const char *
vmm_stat_desc(int index)
{

	if (index < 0 || index >= vstnum)
		return (NULL);
	return (vsttab[index]->desc);
}

71
sys/amd64/vmm/vmm_stat.h Normal file
View File

@ -0,0 +1,71 @@
/*-
* Copyright (c) 2011 NetApp, Inc.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 4. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* $FreeBSD$
*/
#ifndef _VMM_STAT_H_
#define _VMM_STAT_H_

struct vm;

#define MAX_VMM_STAT_TYPES	64		/* arbitrary */

/* A single named counter; 'index' is assigned by vmm_stat_init(). */
struct vmm_stat_type {
	const char *desc;		/* description of statistic */
	int index;			/* position in the stats buffer */
};

void vmm_stat_init(void *arg);

/*
 * Define a statistic type and register it automatically at module
 * load via SYSINIT; 'index' starts at -1 until registration.
 */
#define VMM_STAT_DEFINE(type, desc)		\
	struct vmm_stat_type type[1] = {	\
		{ desc, -1 }			\
	};					\
	SYSINIT(type##_stat, SI_SUB_KLD, SI_ORDER_ANY, vmm_stat_init, type)

void	*vmm_stat_alloc(void);
void 	vmm_stat_free(void *vp);

/*
 * 'buf' should be at least fit 'MAX_VMM_STAT_TYPES' entries
 */
int	vmm_stat_copy(struct vm *vm, int vcpu, int *num_stats, uint64_t *buf);
const char *vmm_stat_desc(int index);

/*
 * Bump a counter by 'x'.  Compiles to a no-op unless the module is
 * built with -DVMM_KEEP_STATS (see sys/modules/vmm/Makefile).
 */
static void __inline
vmm_stat_incr(struct vm *vm, int vcpu, struct vmm_stat_type *vst, uint64_t x)
{
#ifdef VMM_KEEP_STATS
	uint64_t *stats = vcpu_stats(vm, vcpu);
	if (vst->index >= 0)
		stats[vst->index] += x;
#endif
}
#endif

View File

@ -0,0 +1,42 @@
/*-
* Copyright (c) 2011 NetApp, Inc.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* $FreeBSD$
*/
/*
 * Interrupt stub that acknowledges the local APIC and returns without
 * entering a C handler.
 */
#define LOCORE
#include <machine/asmacros.h>

/* Local APIC end-of-interrupt register offset. */
#define LA_EOI 0xB0

	.text
	SUPERALIGN_TEXT
IDTVEC(justreturn)
	pushq	%rax
	movq	lapic, %rax
	movl	$0, LA_EOI(%rax)	/* write EOI to dismiss the interrupt */
	popq	%rax
	iretq

111
sys/amd64/vmm/vmm_util.c Normal file
View File

@ -0,0 +1,111 @@
/*-
* Copyright (c) 2011 NetApp, Inc.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* $FreeBSD$
*/
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
#include <sys/param.h>
#include <sys/libkern.h>
#include <machine/md_var.h>
#include "vmm_util.h"
/* True on CPUs whose CPUID vendor string is "GenuineIntel". */
boolean_t
vmm_is_intel(void)
{

	return (strcmp(cpu_vendor, "GenuineIntel") == 0 ? TRUE : FALSE);
}
/* True on CPUs whose CPUID vendor string is "AuthenticAMD". */
boolean_t
vmm_is_amd(void)
{

	return (strcmp(cpu_vendor, "AuthenticAMD") == 0 ? TRUE : FALSE);
}
/* True if the host CPU supports 1GB superpages. */
boolean_t
vmm_supports_1G_pages(void)
{
	unsigned int regs[4];

	/*
	 * CPUID.80000001:EDX[bit 26] = 1 indicates support for 1GB pages
	 *
	 * Both Intel and AMD support this bit.
	 */
	if (cpu_exthigh >= 0x80000001) {
		do_cpuid(0x80000001, regs);
		if (regs[3] & (1 << 26))
			return (TRUE);
	}
	return (FALSE);
}
#include <sys/proc.h>
#include <machine/frame.h>
/* Print a 64-bit register field of the trapframe. */
#define	DUMP_REG(x)	printf(#x "\t\t0x%016lx\n", (long)(tf->tf_ ## x))
/* Print a 16-bit segment selector field of the trapframe. */
#define	DUMP_SEG(x)	printf(#x "\t\t0x%04x\n", (unsigned)(tf->tf_ ## x))

/* Debug helper: dump every register saved in 'tf' to the console. */
void
dump_trapframe(struct trapframe *tf)
{
	DUMP_REG(rdi);
	DUMP_REG(rsi);
	DUMP_REG(rdx);
	DUMP_REG(rcx);
	DUMP_REG(r8);
	DUMP_REG(r9);
	DUMP_REG(rax);
	DUMP_REG(rbx);
	DUMP_REG(rbp);
	DUMP_REG(r10);
	DUMP_REG(r11);
	DUMP_REG(r12);
	DUMP_REG(r13);
	DUMP_REG(r14);
	DUMP_REG(r15);
	DUMP_REG(trapno);
	DUMP_REG(addr);
	DUMP_REG(flags);
	DUMP_REG(err);
	DUMP_REG(rip);
	DUMP_REG(rflags);
	DUMP_REG(rsp);
	DUMP_SEG(cs);
	DUMP_SEG(ss);
	DUMP_SEG(fs);
	DUMP_SEG(gs);
	DUMP_SEG(es);
	DUMP_SEG(ds);
}

40
sys/amd64/vmm/vmm_util.h Normal file
View File

@ -0,0 +1,40 @@
/*-
* Copyright (c) 2011 NetApp, Inc.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* $FreeBSD$
*/
#ifndef _VMM_UTIL_H_
#define _VMM_UTIL_H_

struct trapframe;

/* CPU vendor and feature probes (see vmm_util.c). */
boolean_t vmm_is_intel(void);
boolean_t vmm_is_amd(void);
boolean_t vmm_supports_1G_pages(void);

/* Debug: print all registers saved in a trapframe. */
void	dump_trapframe(struct trapframe *tf);

#endif

113
sys/amd64/vmm/x86.c Normal file
View File

@ -0,0 +1,113 @@
/*-
* Copyright (c) 2011 NetApp, Inc.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* $FreeBSD$
*/
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
#include <sys/types.h>
#include <machine/cpufunc.h>
#include <machine/specialreg.h>
#include "x86.h"
/*
 * Emulate the 'cpuid' instruction for a guest.  The host's cpuid results
 * are used as a base and then filtered per-leaf.  Inputs and outputs are
 * passed through the four register pointers (leaf in *eax, subleaf in
 * *ecx).  Returns 1 if the leaf was handled, 0 for unknown leaves (in
 * which case the registers are left untouched).
 */
int
x86_emulate_cpuid(uint32_t *eax, uint32_t *ebx, uint32_t *ecx, uint32_t *edx)
{
	unsigned int 	func, regs[4];

	func = *eax;

	cpuid_count(*eax, *ecx, regs);

	switch(func) {
		/* Leaves passed through from the host unchanged. */
		case CPUID_0000_0000:
		case CPUID_0000_0002:
		case CPUID_0000_0003:
		case CPUID_0000_0004:
		case CPUID_0000_000A:
			break;

		case CPUID_8000_0000:
		case CPUID_8000_0001:
		case CPUID_8000_0002:
		case CPUID_8000_0003:
		case CPUID_8000_0004:
		case CPUID_8000_0006:
		case CPUID_8000_0007:
		case CPUID_8000_0008:

			break;

		case CPUID_0000_0001:
			/*
			 * Override the APIC ID only in ebx
			 */
			regs[1] &= ~(CPUID_0000_0001_APICID_MASK);
			/*
			 * XXX fixme for MP case, set apicid properly for cpu.
			 */
			regs[1] |= (0 << CPUID_0000_0001_APICID_SHIFT);

			/*
			 * Don't expose VMX capability.
			 * Advertise x2APIC capability.
			 */
			regs[2] &= ~CPUID_0000_0001_FEAT0_VMX;
			regs[2] |= CPUID2_X2APIC;

			/*
			 * Machine check handling is done in the host.
			 * Hide MTRR capability.
			 */
			regs[3] &= ~(CPUID_MCA | CPUID_MCE | CPUID_MTRR);

			break;

		case CPUID_0000_000B:
			/*
			 * XXXSMP fixme
			 * Processor topology enumeration
			 */
			regs[0] = 0;
			regs[1] = 0;
			regs[2] = *ecx & 0xff;
			regs[3] = 0;
			break;

		default:
			return (0);
	}

	*eax = regs[0];
	*ebx = regs[1];
	*ecx = regs[2];
	*edx = regs[3];
	return (1);
}

62
sys/amd64/vmm/x86.h Normal file
View File

@ -0,0 +1,62 @@
/*-
* Copyright (c) 2011 NetApp, Inc.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* $FreeBSD$
*/
#ifndef _X86_H_
#define	_X86_H_

/* CPUID leaf numbers handled by x86_emulate_cpuid(). */
#define	CPUID_0000_0000	(0x0)
#define	CPUID_0000_0001	(0x1)
#define	CPUID_0000_0002	(0x2)
#define	CPUID_0000_0003	(0x3)
#define	CPUID_0000_0004	(0x4)
#define	CPUID_0000_000A	(0xA)
#define	CPUID_0000_000B	(0xB)
#define	CPUID_8000_0000	(0x80000000)
#define	CPUID_8000_0001	(0x80000001)
#define	CPUID_8000_0002	(0x80000002)
#define	CPUID_8000_0003	(0x80000003)
#define	CPUID_8000_0004	(0x80000004)
#define	CPUID_8000_0006	(0x80000006)
#define	CPUID_8000_0007	(0x80000007)
#define	CPUID_8000_0008	(0x80000008)

/*
 * CPUID instruction Fn0000_0001:
 */
#define	CPUID_0000_0001_APICID_MASK	(0xff<<24)
#define	CPUID_0000_0001_APICID_SHIFT	24

/*
 * CPUID instruction Fn0000_0001 ECX
 */
#define	CPUID_0000_0001_FEAT0_VMX	(1<<5)

/* Emulate cpuid for a guest; returns 1 if the leaf was handled. */
int	x86_emulate_cpuid(uint32_t *eax, uint32_t *ebx, uint32_t *ecx,
			  uint32_t *edx);

#endif

View File

@ -290,6 +290,7 @@ SUBDIR= ${_3dfx} \
${_vesa} \
vge \
vkbd \
${_vmm} \
${_vpo} \
vr \
vx \
@ -557,6 +558,7 @@ _sppp= sppp
_tmpfs= tmpfs
_twa= twa
_vesa= vesa
_vmm= vmm
_x86bios= x86bios
_wi= wi
_wpi= wpi

66
sys/modules/vmm/Makefile Normal file
View File

@ -0,0 +1,66 @@
# $FreeBSD$
#
# Kernel module build for vmm.ko.
#
# *REQUIRES* binutils 2.20.1 for VT-x instructions
AS=	/usr/local/bin/as
LD=	/usr/local/bin/ld
CFLAGS+= -B /usr/local/bin

KMOD=	vmm
SRCS=	device_if.h bus_if.h pci_if.h

# Enable the vmm_stat_incr() counters (see vmm_stat.h).
CFLAGS+= -DVMM_KEEP_STATS
CFLAGS+= -DOLD_BINUTILS
CFLAGS+= -I${.CURDIR}/../../amd64/vmm
CFLAGS+= -I${.CURDIR}/../../amd64/vmm/io
CFLAGS+= -I${.CURDIR}/../../amd64/vmm/intel

# generic vmm support
.PATH: ${.CURDIR}/../../amd64/vmm
SRCS+=	vmm.c		\
	vmm_dev.c	\
	vmm_ipi.c	\
	vmm_lapic.c	\
	vmm_mem.c	\
	vmm_msr.c	\
	vmm_stat.c	\
	vmm_util.c	\
	x86.c		\
	vmm_support.S

.PATH: ${.CURDIR}/../../amd64/vmm/io
SRCS+=	iommu.c		\
	ppt.c           \
	vdev.c		\
	vlapic.c

# intel-specific files
.PATH: ${.CURDIR}/../../amd64/vmm/intel
SRCS+=	ept.c		\
	vmcs.c		\
	vmx_msr.c	\
	vmx.c		\
	vtd.c

# amd-specific files
.PATH: ${.CURDIR}/../../amd64/vmm/amd
SRCS+=	amdv.c

# vmx_support.S needs assym offsets, so it is built by the explicit
# rules below rather than via SRCS.
OBJS=	vmx_support.o

CLEANFILES=	vmx_assym.s vmx_genassym.o

vmx_assym.s:    vmx_genassym.o
.if exists(@)
vmx_assym.s:    @/kern/genassym.sh
.endif
	sh @/kern/genassym.sh vmx_genassym.o > ${.TARGET}

vmx_support.o:	vmx_support.S vmx_assym.s
	${CC} -c -x assembler-with-cpp -DLOCORE ${CFLAGS} \
	    ${.IMPSRC} -o ${.TARGET}

vmx_genassym.o: vmx_genassym.c @ machine
	${CC} -c ${CFLAGS:N-fno-common} ${.IMPSRC}

.include <bsd.kmod.mk>

View File

@ -19,6 +19,7 @@ SUBDIR= ${_ac} \
${_auditd} \
${_auditreduce} \
${_authpf} \
${_bhyve} \
${_bluetooth} \
${_boot0cfg} \
${_boot98cfg} \
@ -194,6 +195,7 @@ SUBDIR= ${_ac} \
${_usbdevs} \
${_usbconfig} \
${_vidcontrol} \
${_vmmctl} \
vipw \
wake \
watch \
@ -477,6 +479,7 @@ _boot98cfg= boot98cfg
_acpi= acpi
.endif
_asf= asf
_bhyve= bhyve
_boot0cfg= boot0cfg
.if ${MK_TOOLCHAIN} != "no"
_btxld= btxld
@ -494,6 +497,7 @@ _ndiscvt= ndiscvt
.endif
_sicontrol= sicontrol
_spkrtest= spkrtest
_vmmctl= vmmctl
_zzz= zzz
.endif

18
usr.sbin/bhyve/Makefile Normal file
View File

@ -0,0 +1,18 @@
#
# $FreeBSD$
#
PROG= bhyve
SRCS= atpic.c consport.c dbgport.c elcr.c fbsdrun.c inout.c mevent.c
SRCS+= pci_emul.c pci_hostbridge.c pci_passthru.c pci_virtio_block.c
SRCS+= pci_virtio_net.c pit_8254.c post.c rtc.c uart.c xmsr.c
NO_MAN=
DPADD= ${LIBVMMAPI} ${LIBMD} ${LIBPTHREAD}
LDADD= -lvmmapi -lmd -lpthread
CFLAGS+= -I${.CURDIR}/../../sys
.include <bsd.prog.mk>

68
usr.sbin/bhyve/atpic.c Normal file
View File

@ -0,0 +1,68 @@
/*-
* Copyright (c) 2011 NetApp, Inc.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* $FreeBSD$
*/
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
#include <sys/param.h>
#include <stdio.h>
#include <string.h>
#include <assert.h>
#include "inout.h"
/*
* FreeBSD only writes to the 8259 interrupt controllers to put them in a
* shutdown state.
*
* So, we just ignore the writes.
*/
#define IO_ICU1 0x20
#define IO_ICU2 0xA0
#define ICU_IMR_OFFSET 1
/*
 * I/O handler for the 8259 PIC ports: accept (and discard) single-byte
 * writes, reject everything else.  Returns 0 on success, -1 on error.
 */
static int
atpic_handler(struct vmctx *ctx, int vcpu, int in, int port, int bytes,
	      uint32_t *eax, void *arg)
{

	/* Only single-byte writes are valid; reads are not supported. */
	if (bytes != 1 || in)
		return (-1);

	/* Pretend all writes to the 8259 are alright */
	return (0);
}
INOUT_PORT(atpic, IO_ICU1, IOPORT_F_INOUT, atpic_handler);
INOUT_PORT(atpic, IO_ICU1 + ICU_IMR_OFFSET, IOPORT_F_INOUT, atpic_handler);
INOUT_PORT(atpic, IO_ICU2, IOPORT_F_INOUT, atpic_handler);
INOUT_PORT(atpic, IO_ICU2 + ICU_IMR_OFFSET, IOPORT_F_INOUT, atpic_handler);

121
usr.sbin/bhyve/consport.c Normal file
View File

@ -0,0 +1,121 @@
/*-
* Copyright (c) 2011 NetApp, Inc.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* $FreeBSD$
*/
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
#include <sys/types.h>
#include <sys/select.h>
#include <stdio.h>
#include <stdlib.h>
#include <termios.h>
#include <unistd.h>
#include <stdbool.h>
#include "inout.h"
#define BVM_CONSOLE_PORT 0x220
static struct termios tio_orig, tio_new;
/* atexit() hook: restore the terminal settings saved by ttyopen(). */
static void
ttyclose(void)
{
	tcsetattr(STDIN_FILENO, TCSANOW, &tio_orig);
}
/*
 * Put stdin into raw mode for the guest console, saving the original
 * settings so ttyclose() can restore them at exit.
 */
static void
ttyopen(void)
{
	tcgetattr(STDIN_FILENO, &tio_orig);

	/*
	 * Start from the current settings rather than an uninitialized
	 * structure: cfmakeraw() only modifies the flags relevant to raw
	 * mode, so the remaining fields of tio_new (speeds, control chars
	 * outside VMIN/VTIME, etc.) must already hold valid values.
	 */
	tio_new = tio_orig;
	cfmakeraw(&tio_new);
	tcsetattr(STDIN_FILENO, TCSANOW, &tio_new);

	atexit(ttyclose);
}
/* Non-blocking poll: true if a byte can be read from stdin right now. */
static bool
tty_char_available(void)
{
	struct timeval timeout;
	fd_set readfds;

	timeout.tv_sec = 0;
	timeout.tv_usec = 0;
	FD_ZERO(&readfds);
	FD_SET(STDIN_FILENO, &readfds);

	return (select(STDIN_FILENO + 1, &readfds, NULL, NULL,
	    &timeout) > 0);
}
/*
 * Read one byte from stdin if available.
 * Returns the byte value (0-255) or -1 if no character is available,
 * on EOF, or on a read error.
 */
static int
ttyread(void)
{
	char rb;

	if (!tty_char_available())
		return (-1);

	/*
	 * Check the result of read(2): on EOF (0) or error (-1) 'rb'
	 * would be uninitialized, so report "no character" instead of
	 * returning garbage.
	 */
	if (read(STDIN_FILENO, &rb, 1) != 1)
		return (-1);

	return (rb & 0xff);
}
/* Best-effort write of one byte to stdout; the result is ignored. */
static void
ttywrite(unsigned char wb)
{
	(void) write(STDOUT_FILENO, &wb, 1);
}
/*
 * I/O port handler for the bhyve console port (0x220).
 *
 * The tty is flipped into raw mode on first access.  A 4-byte IN
 * returns the next pending input character (or -1 if none); a 4-byte
 * OUT writes the low byte to stdout.  Any other access width is
 * rejected with -1.
 */
static int
console_handler(struct vmctx *ctx, int vcpu, int in, int port, int bytes,
		uint32_t *eax, void *arg)
{
	static int opened;	/* has the tty been put into raw mode yet? */

	if (bytes != 4)
		return (-1);

	if (!opened) {
		ttyopen();
		opened = 1;
	}

	if (in)
		*eax = ttyread();
	else
		ttywrite(*eax);

	return (0);
}
INOUT_PORT(console, BVM_CONSOLE_PORT, IOPORT_F_INOUT, console_handler);

124
usr.sbin/bhyve/dbgport.c Normal file
View File

@ -0,0 +1,124 @@
/*-
* Copyright (c) 2011 NetApp, Inc.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* $FreeBSD$
*/
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
#include <sys/types.h>
#include <sys/socket.h>
#include <netinet/in.h>
#include <sys/uio.h>
#include <stdio.h>
#include <stdlib.h>
#include <fcntl.h>
#include <unistd.h>
#include <errno.h>
#include "inout.h"
#define BVM_DBG_PORT 0x224
static int listen_fd, conn_fd;
static struct sockaddr_in sin;
/*
 * Create the TCP listen socket for the in-guest gdb debug port.
 * Binds INADDR_ANY:'sport' and exits the process on any failure.
 */
void
init_dbgport(int sport)
{
	int reuse;

	conn_fd = -1;

	if ((listen_fd = socket(AF_INET, SOCK_STREAM, 0)) < 0) {
		perror("socket");
		exit(1);
	}

	/*
	 * Allow bhyve to be restarted immediately without the bind below
	 * failing while the previous listen socket sits in TIME_WAIT.
	 */
	reuse = 1;
	(void) setsockopt(listen_fd, SOL_SOCKET, SO_REUSEADDR, &reuse,
	    sizeof(reuse));

	sin.sin_len = sizeof(sin);
	sin.sin_family = AF_INET;
	sin.sin_addr.s_addr = htonl(INADDR_ANY);
	sin.sin_port = htons(sport);

	if (bind(listen_fd, (struct sockaddr *)&sin, sizeof(sin)) < 0) {
		perror("bind");
		exit(1);
	}

	if (listen(listen_fd, 1) < 0) {
		perror("listen");
		exit(1);
	}
}
/*
 * I/O port handler for the gdb debug port (0x224).
 *
 * Lazily accepts a single TCP connection (blocking until one arrives)
 * and then relays bytes between the guest and the socket: a 4-byte IN
 * returns the next byte from gdb (-1 if none pending); a 4-byte OUT
 * sends the low byte to gdb.  On EOF or a socket error the connection
 * is dropped and we loop back to wait for a new one.
 */
static int
dbg_handler(struct vmctx *ctx, int vcpu, int in, int port, int bytes,
	    uint32_t *eax, void *arg)
{
	char ch;
	int nwritten, nread, printonce;

	if (bytes != 4)
		return (-1);

again:
	printonce = 0;
	while (conn_fd < 0) {
		if (!printonce) {
			printf("Waiting for connection from gdb\r\n");
			printonce = 1;
		}
		/* Blocking accept; the fd is made non-blocking afterwards. */
		conn_fd = accept(listen_fd, NULL, NULL);
		if (conn_fd >= 0)
			fcntl(conn_fd, F_SETFL, O_NONBLOCK);
		else if (errno != EINTR)
			perror("accept");
	}

	if (in) {
		nread = read(conn_fd, &ch, 1);
		if (nread == -1 && errno == EAGAIN)
			*eax = -1;	/* no data pending */
		else if (nread == 1)
			*eax = ch;
		else {
			/* EOF or hard error: drop and re-accept. */
			close(conn_fd);
			conn_fd = -1;
			goto again;
		}
	} else {
		ch = *eax;
		nwritten = write(conn_fd, &ch, 1);
		if (nwritten != 1) {
			/* Short write or error: drop and re-accept. */
			close(conn_fd);
			conn_fd = -1;
			goto again;
		}
	}
	return (0);
}
INOUT_PORT(dbg, BVM_DBG_PORT, IOPORT_F_INOUT, dbg_handler);

36
usr.sbin/bhyve/dbgport.h Normal file
View File

@ -0,0 +1,36 @@
/*-
* Copyright (c) 2011 NetApp, Inc.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* $FreeBSD$
*/
#ifndef _DBGPORT_H_
#define	_DBGPORT_H_

/* TCP port the gdb listener binds when '-g' is not given on the cmdline. */
#define	DEFAULT_GDB_PORT	6466

/* Create the listening socket for the in-guest gdb debug i/o port. */
void	init_dbgport(int port);

#endif

65
usr.sbin/bhyve/elcr.c Normal file
View File

@ -0,0 +1,65 @@
/*-
* Copyright (c) 2011 NetApp, Inc.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* $FreeBSD$
*/
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
#include <sys/types.h>
#include "inout.h"
/*
* EISA interrupt Level Control Register.
*
* This is a 16-bit register with one bit for each of the IRQ0 through IRQ15.
* A level triggered irq is indicated by setting the corresponding bit to '1'.
*/
#define ELCR_PORT 0x4d0
static uint8_t elcr[2] = { 0x00, 0x00 };
/*
 * Handler for both bytes of the EISA Level Control Register pair
 * (ports 0x4d0/0x4d1).  Reads return the stored byte; writes update it.
 * Only byte-wide accesses are legal.
 */
static int
elcr_handler(struct vmctx *ctx, int vcpu, int in, int port, int bytes,
	     uint32_t *eax, void *arg)
{
	int which;

	if (bytes != 1)
		return (-1);

	which = port - ELCR_PORT;

	if (in) {
		*eax = elcr[which];
	} else {
		elcr[which] = *eax;
	}

	return (0);
}
INOUT_PORT(elcr, ELCR_PORT + 0, IOPORT_F_INOUT, elcr_handler);
INOUT_PORT(elcr, ELCR_PORT + 1, IOPORT_F_INOUT, elcr_handler);

650
usr.sbin/bhyve/fbsdrun.c Normal file
View File

@ -0,0 +1,650 @@
/*-
* Copyright (c) 2011 NetApp, Inc.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* $FreeBSD$
*/
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
#include <sys/types.h>
#include <sys/mman.h>
#include <sys/time.h>
#include <machine/segments.h>
#include <stdio.h>
#include <stdlib.h>
#include <libgen.h>
#include <unistd.h>
#include <assert.h>
#include <errno.h>
#include <signal.h>
#include <pthread.h>
#include <machine/vmm.h>
#include <vmmapi.h>
#include "fbsdrun.h"
#include "inout.h"
#include "dbgport.h"
#include "mevent.h"
#include "pci_emul.h"
#include "xmsr.h"
#define DEFAULT_GUEST_HZ 100
#define DEFAULT_GUEST_TSLICE 200
#define GUEST_NIO_PORT 0x488 /* guest upcalls via i/o port */
#define VMEXIT_SWITCH 0 /* force vcpu switch in mux mode */
#define VMEXIT_CONTINUE 1 /* continue from next instruction */
#define VMEXIT_RESTART 2 /* restart current instruction */
#define VMEXIT_ABORT 3 /* abort the vm run loop */
#define VMEXIT_RESET 4 /* guest machine has reset */
#define MB (1024UL * 1024)
#define GB (1024UL * MB)
typedef int (*vmexit_handler_t)(struct vmctx *, struct vm_exit *, int *vcpu);
int guest_tslice = DEFAULT_GUEST_TSLICE;
int guest_hz = DEFAULT_GUEST_HZ;
char *vmname;
u_long lomem_sz;
u_long himem_sz;
int guest_ncpus;
static int pincpu = -1;
static int guest_vcpu_mux;
static int guest_vmexit_on_hlt, guest_vmexit_on_pause;
static int foundcpus;
static char *lomem_addr;
static char *himem_addr;
static char *progname;
static const int BSP = 0;
static int cpumask;
static void *oem_tbl_start;
static int oem_tbl_size;
static void vm_loop(struct vmctx *ctx, int vcpu, uint64_t rip);
struct vm_exit vmexit[VM_MAXCPU];
struct fbsdstats {
uint64_t vmexit_bogus;
uint64_t vmexit_bogus_switch;
uint64_t vmexit_hlt;
uint64_t vmexit_pause;
uint64_t vmexit_mtrap;
uint64_t cpu_switch_rotate;
uint64_t cpu_switch_direct;
int io_reset;
} stats;
struct mt_vmm_info {
pthread_t mt_thr;
struct vmctx *mt_ctx;
int mt_vcpu;
} mt_vmm_info[VM_MAXCPU];
/*
 * Print command-line usage to stderr and exit with 'code'.
 */
static void
usage(int code)
{

	fprintf(stderr,
		"Usage: %s [-hBHP][-g <gdb port>][-z <hz>][-s <pci>][-p pincpu]"
		"[-n <pci>][-m lowmem][-M highmem] <vm>\n"
		"       -g: gdb port (default is %d and 0 means don't open)\n"
		"       -c: # cpus (default 1)\n"
		"       -p: pin vcpu 'n' to host cpu 'pincpu + n'\n"
		"       -B: inject breakpoint exception on vm entry\n"
		"       -H: vmexit from the guest on hlt\n"
		"       -P: vmexit from the guest on pause\n"
		"       -h: help\n"
		"       -z: guest hz (default is %d)\n"
		"       -s: <slot,driver,configinfo> PCI slot config\n"
		"       -n: <slot,name> PCI slot naming\n"
		"       -m: lowmem in MB\n"
		"       -M: highmem in MB\n"
		"       -x: mux vcpus to 1 hcpu\n"
		"       -t: mux vcpu timeslice hz (default %d)\n",
		progname, DEFAULT_GDB_PORT, DEFAULT_GUEST_HZ,
		DEFAULT_GUEST_TSLICE);
	exit(code);
}
/*
 * Translate a guest physical address into a pointer in the host's
 * mapping of guest memory.  Returns NULL if no guest memory is mapped
 * or the address falls outside both the below-4GB and above-4GB
 * segments.
 */
void *
paddr_guest2host(uintptr_t gaddr)
{

	if (lomem_sz == 0)
		return (NULL);

	/* Below-4GB segment. */
	if (gaddr < lomem_sz)
		return ((void *)(lomem_addr + gaddr));

	/* Above-4GB segment. */
	if (gaddr >= 4*GB && gaddr < (4*GB + himem_sz))
		return ((void *)(himem_addr + gaddr - 4*GB));

	return (NULL);
}
/*
 * Record an OEM table (e.g. a PCI irq routing table) to be handed to
 * vm_build_tables() when the guest MP tables are constructed.
 */
void
fbsdrun_add_oemtbl(void *tbl, int tblsz)
{
	oem_tbl_start = tbl;
	oem_tbl_size = tblsz;
}
/* True if the guest is configured to vmexit on the PAUSE instruction. */
int
fbsdrun_vmexit_on_pause(void)
{
	return (guest_vmexit_on_pause);
}
/* True if the guest is configured to vmexit on the HLT instruction. */
int
fbsdrun_vmexit_on_hlt(void)
{
	return (guest_vmexit_on_hlt);
}
/* True if all vcpus are multiplexed onto a single host cpu thread. */
int
fbsdrun_muxed(void)
{
	return (guest_vcpu_mux);
}
/*
 * pthread entry point for a vcpu: enter the run loop with the vcpu's
 * starting RIP.  vm_loop() only returns on a fatal error, in which
 * case the whole process exits.
 */
void *
fbsdrun_start_thread(void *param)
{
	struct mt_vmm_info *mtp = param;
	int vcpu = mtp->mt_vcpu;

	vm_loop(mtp->mt_ctx, vcpu, vmexit[vcpu].rip);

	/* not reached */
	exit(1);
	return (NULL);
}
/*
 * Register vcpu 'vcpu' with the sequencer and set its initial RIP.
 * A dedicated host thread is created for the BSP, and for every vcpu
 * when not muxing.  Exits the process if the vcpu was already added.
 */
void
fbsdrun_addcpu(struct vmctx *ctx, int vcpu, uint64_t rip)
{
	int error;

	if (cpumask & (1 << vcpu)) {
		printf("addcpu: attempting to add existing cpu %d\n", vcpu);
		exit(1);
	}

	cpumask |= 1 << vcpu;
	foundcpus++;

	/*
	 * Set up the vmexit struct to allow execution to start
	 * at the given RIP
	 */
	vmexit[vcpu].rip = rip;
	vmexit[vcpu].inst_length = 0;

	if (vcpu == BSP || !guest_vcpu_mux){
		mt_vmm_info[vcpu].mt_ctx = ctx;
		mt_vmm_info[vcpu].mt_vcpu = vcpu;
		error = pthread_create(&mt_vmm_info[vcpu].mt_thr, NULL,
		    fbsdrun_start_thread, &mt_vmm_info[vcpu]);
		assert(error == 0);
	}
}
/*
 * Round-robin to the next vcpu in muxed mode.  Assumes vcpus were
 * added in ascending order with no gaps.
 */
static int
fbsdrun_get_next_cpu(int curcpu)
{
	int next;

	next = curcpu + 1;
	if (next == foundcpus)
		next = 0;
	return (next);
}
/*
 * The guest wrote the keyboard-controller reset pulse: count it and
 * tell the run loop to terminate with a clean reset.
 */
int
vmexit_catch_reset(void)
{
	stats.io_reset++;
	return (VMEXIT_RESET);
}
/* An i/o access had no registered handler: abort the run loop. */
int
vmexit_catch_inout(void)
{
	return (VMEXIT_ABORT);
}
/*
 * Guest upcall via GUEST_NIO_PORT.  The debug knobs inside are only
 * compiled in when PG_DEBUG is defined; otherwise the notification is
 * ignored and execution simply continues.
 */
int
vmexit_handle_notify(struct vmctx *ctx, struct vm_exit *vme, int *pvcpu,
		     uint32_t eax)
{
#if PG_DEBUG /* put all types of debug here */
	if (eax == 0) {
		pause_noswitch = 1;
	} else if (eax == 1) {
		pause_noswitch = 0;
	} else {
		pause_noswitch = 0;
		if (eax == 5) {
			vm_set_capability(ctx, *pvcpu, VM_CAP_MTRAP_EXIT, 1);
		}
	}
#endif
	return (VMEXIT_CONTINUE);
}
/*
 * Dispatch an in/out vmexit to the registered i/o port handlers.
 * Guest reset (write of 0xFE to port 0x64) and host notifications
 * (writes to GUEST_NIO_PORT) are intercepted first.  On a successful
 * IN the result is written back to the guest's RAX.
 */
static int
vmexit_inout(struct vmctx *ctx, struct vm_exit *vme, int *pvcpu)
{
	int error;
	int bytes, port, in, out;
	uint32_t eax;
	int vcpu;

	vcpu = *pvcpu;

	port = vme->u.inout.port;
	bytes = vme->u.inout.bytes;
	eax = vme->u.inout.eax;
	in = vme->u.inout.in;
	out = !in;

	/* We don't deal with these */
	if (vme->u.inout.string || vme->u.inout.rep)
		return (VMEXIT_ABORT);

	/* Special case of guest reset */
	if (out && port == 0x64 && (uint8_t)eax == 0xFE)
		return (vmexit_catch_reset());

	/* Extra-special case of host notifications */
	if (out && port == GUEST_NIO_PORT)
		return (vmexit_handle_notify(ctx, vme, pvcpu, eax));

	error = emulate_inout(ctx, vcpu, in, port, bytes, &eax);
	if (error == 0 && in)
		error = vm_set_register(ctx, vcpu, VM_REG_GUEST_RAX, eax);

	if (error == 0)
		return (VMEXIT_CONTINUE);
	else {
		fprintf(stderr, "Unhandled %s%c 0x%04x\n",
			in ? "in" : "out",
			bytes == 1 ? 'b' : (bytes == 2 ? 'w' : 'l'), port);
		return (vmexit_catch_inout());
	}
}
/* RDMSR is not emulated: log the MSR number and abort the run loop. */
static int
vmexit_rdmsr(struct vmctx *ctx, struct vm_exit *vme, int *pvcpu)
{
	printf("vm exit rdmsr 0x%x, cpu %d\n", vme->u.msr.code, *pvcpu);
	return (VMEXIT_ABORT);
}
/*
 * Emulate a guest WRMSR.  In muxed mode emulate_wrmsr() may indicate
 * that a different vcpu should run next (e.g. an IPI via the APIC
 * ICR), in which case a vcpu switch is requested.
 */
static int
vmexit_wrmsr(struct vmctx *ctx, struct vm_exit *vme, int *pvcpu)
{
	int newcpu;
	int retval = VMEXIT_CONTINUE;

	newcpu = emulate_wrmsr(ctx, *pvcpu, vme->u.msr.code,vme->u.msr.wval);

	if (guest_vcpu_mux && *pvcpu != newcpu) {
		retval = VMEXIT_SWITCH;
		*pvcpu = newcpu;
	}

	return (retval);
}
/*
 * Unhandled VMX exit: dump the exit state for post-mortem debugging
 * and abort the run loop.
 */
static int
vmexit_vmx(struct vmctx *ctx, struct vm_exit *vmexit, int *pvcpu)
{
	printf("vm exit[%d]\n", *pvcpu);
	printf("\treason\t\tVMX\n");
	printf("\trip\t\t0x%016lx\n", vmexit->rip);
	printf("\tinst_length\t%d\n", vmexit->inst_length);
	printf("\terror\t\t%d\n", vmexit->u.vmx.error);
	printf("\texit_reason\t%u\n", vmexit->u.vmx.exit_reason);
	printf("\tqualification\t0x%016lx\n", vmexit->u.vmx.exit_qualification);
	return (VMEXIT_ABORT);
}
/* When set, a bogus exit restarts the same vcpu instead of switching. */
static int bogus_noswitch = 1;

/*
 * Spurious vmexit (e.g. an interrupt arriving during VM entry):
 * normally just restart the same instruction.  In muxed SMP mode with
 * bogus_noswitch cleared, rotate to another vcpu instead.
 */
static int
vmexit_bogus(struct vmctx *ctx, struct vm_exit *vmexit, int *pvcpu)
{
	stats.vmexit_bogus++;

	if (!guest_vcpu_mux || guest_ncpus == 1 || bogus_noswitch) {
		return (VMEXIT_RESTART);
	} else {
		stats.vmexit_bogus_switch++;
		vmexit->inst_length = 0;
		*pvcpu = -1;
		return (VMEXIT_SWITCH);
	}
}
/*
 * Guest executed HLT.  In muxed mode this is the cue to hand the host
 * cpu to another vcpu.  Otherwise just continue with the next
 * instruction; the HLT exit is used only to be friendly to the host
 * scheduler.
 */
static int
vmexit_hlt(struct vmctx *ctx, struct vm_exit *vmexit, int *pvcpu)
{
	stats.vmexit_hlt++;

	if (!fbsdrun_muxed())
		return (VMEXIT_CONTINUE);

	*pvcpu = -1;
	return (VMEXIT_SWITCH);
}
/* Debug knob: when set, PAUSE exits do not trigger a vcpu switch. */
static int pause_noswitch;

/*
 * Guest executed PAUSE (spin-lock backoff hint).  In muxed mode this
 * is a good point to rotate to another vcpu; otherwise continue.
 */
static int
vmexit_pause(struct vmctx *ctx, struct vm_exit *vmexit, int *pvcpu)
{
	stats.vmexit_pause++;

	if (fbsdrun_muxed() && !pause_noswitch) {
		*pvcpu = -1;
		return (VMEXIT_SWITCH);
	} else {
		return (VMEXIT_CONTINUE);
	}
}
/* Monitor trap (single-step) exit: count it and re-run the instruction. */
static int
vmexit_mtrap(struct vmctx *ctx, struct vm_exit *vmexit, int *pvcpu)
{
	stats.vmexit_mtrap++;

	return (VMEXIT_RESTART);
}
/*
 * SIGALRM handler for the mux timeslice timer.  Intentionally empty:
 * the signal's only job is to interrupt the blocking vm_run() call so
 * the run loop regains control.
 */
static void
sigalrm(int sig)
{
	return;
}
static void
setup_timeslice(void)
{
struct sigaction sa;
struct itimerval itv;
int error;
/*
* Setup a realtime timer to generate a SIGALRM at a
* frequency of 'guest_tslice' ticks per second.
*/
sigemptyset(&sa.sa_mask);
sa.sa_flags = 0;
sa.sa_handler = sigalrm;
error = sigaction(SIGALRM, &sa, NULL);
assert(error == 0);
itv.it_interval.tv_sec = 0;
itv.it_interval.tv_usec = 1000000 / guest_tslice;
itv.it_value.tv_sec = 0;
itv.it_value.tv_usec = 1000000 / guest_tslice;
error = setitimer(ITIMER_REAL, &itv, NULL);
assert(error == 0);
}
/*
 * vmexit dispatch table, indexed by exit code.  HLT and PAUSE entries
 * are filled in at startup only when those capabilities are enabled.
 */
static vmexit_handler_t handler[VM_EXITCODE_MAX] = {
	[VM_EXITCODE_INOUT]  = vmexit_inout,
	[VM_EXITCODE_VMX]    = vmexit_vmx,
	[VM_EXITCODE_BOGUS]  = vmexit_bogus,
	[VM_EXITCODE_RDMSR]  = vmexit_rdmsr,
	[VM_EXITCODE_WRMSR]  = vmexit_wrmsr,
	[VM_EXITCODE_MTRAP]  = vmexit_mtrap,
};
/*
 * Per-vcpu run loop: repeatedly enter the guest via vm_run() and hand
 * each vmexit to its handler.  The handler's return code selects the
 * next RIP (continue past / restart the instruction) or, in muxed
 * mode, a switch to another vcpu.  Returns only if vm_run() fails.
 */
static void
vm_loop(struct vmctx *ctx, int vcpu, uint64_t rip)
{
	int error, rc, prevcpu;

	if (guest_vcpu_mux)
		setup_timeslice();

	if (pincpu >= 0) {
		error = vm_set_pinning(ctx, vcpu, pincpu + vcpu);
		assert(error == 0);
	}

	while (1) {
		error = vm_run(ctx, vcpu, rip, &vmexit[vcpu]);
		if (error != 0)
			break;

		prevcpu = vcpu;
		/* The handler may change 'vcpu' to request a switch. */
		rc = (*handler[vmexit[vcpu].exitcode])(ctx, &vmexit[vcpu],
		    &vcpu);
		switch (rc) {
		case VMEXIT_SWITCH:
			assert(guest_vcpu_mux);
			if (vcpu == -1) {
				/* -1 means "any": rotate round-robin. */
				stats.cpu_switch_rotate++;
				vcpu = fbsdrun_get_next_cpu(prevcpu);
			} else {
				stats.cpu_switch_direct++;
			}
			/* fall through */
		case VMEXIT_CONTINUE:
			rip = vmexit[vcpu].rip + vmexit[vcpu].inst_length;
			break;
		case VMEXIT_RESTART:
			rip = vmexit[vcpu].rip;
			break;
		case VMEXIT_RESET:
			exit(0);
		default:
			exit(1);
		}
	}
	fprintf(stderr, "vm_run error %d, errno %d\n", error, errno);
}
/*
 * bhyve sequencer entry point: parse options, open the VM control
 * device, enable optional vmexit capabilities, map guest memory,
 * initialize the i/o subsystems, start the BSP vcpu thread and then
 * run the event dispatch loop.
 */
int
main(int argc, char *argv[])
{
	int c, error, gdb_port, inject_bkpt, tmp, err;
	struct vmctx *ctx;
	uint64_t rip;

	inject_bkpt = 0;
	progname = basename(argv[0]);
	gdb_port = DEFAULT_GDB_PORT;
	guest_ncpus = 1;

	/*
	 * Fix: 't' was handled in the switch below but missing from the
	 * getopt(3) option string, so "-t <hz>" always fell through to
	 * usage(1).  "t:" added so the mux timeslice can actually be set.
	 */
	while ((c = getopt(argc, argv, "hBHPxp:g:c:z:s:n:m:M:t:")) != -1) {
		switch (c) {
		case 'B':
			inject_bkpt = 1;
			break;
		case 'x':
			guest_vcpu_mux = 1;
			break;
		case 'p':
			pincpu = atoi(optarg);
			break;
		case 'c':
			guest_ncpus = atoi(optarg);
			break;
		case 'g':
			gdb_port = atoi(optarg);
			break;
		case 'z':
			guest_hz = atoi(optarg);
			break;
		case 't':
			guest_tslice = atoi(optarg);
			break;
		case 's':
			pci_parse_slot(optarg);
			break;
		case 'n':
			pci_parse_name(optarg);
			break;
		case 'm':
			lomem_sz = strtoul(optarg, NULL, 0) * MB;
			break;
		case 'M':
			himem_sz = strtoul(optarg, NULL, 0) * MB;
			break;
		case 'H':
			guest_vmexit_on_hlt = 1;
			break;
		case 'P':
			guest_vmexit_on_pause = 1;
			break;
		case 'h':
			usage(0);
		default:
			usage(1);
		}
	}
	argc -= optind;
	argv += optind;

	if (argc != 1)
		usage(1);

	/* No need to mux if guest is uni-processor */
	if (guest_ncpus <= 1)
		guest_vcpu_mux = 0;

	/* vmexit on hlt if guest is muxed */
	if (guest_vcpu_mux) {
		guest_vmexit_on_hlt = 1;
		guest_vmexit_on_pause = 1;
	}

	vmname = argv[0];

	ctx = vm_open(vmname);
	if (ctx == NULL) {
		perror("vm_open");
		exit(1);
	}

	if (fbsdrun_vmexit_on_hlt()) {
		err = vm_get_capability(ctx, BSP, VM_CAP_HALT_EXIT, &tmp);
		if (err < 0) {
			printf("VM exit on HLT not supported\n");
			exit(1);
		}
		vm_set_capability(ctx, BSP, VM_CAP_HALT_EXIT, 1);
		handler[VM_EXITCODE_HLT] = vmexit_hlt;
	}

	if (fbsdrun_vmexit_on_pause()) {
		/*
		 * pause exit support required for this mode
		 */
		err = vm_get_capability(ctx, BSP, VM_CAP_PAUSE_EXIT, &tmp);
		if (err < 0) {
			printf("SMP mux requested, no pause support\n");
			exit(1);
		}
		vm_set_capability(ctx, BSP, VM_CAP_PAUSE_EXIT, 1);
		handler[VM_EXITCODE_PAUSE] = vmexit_pause;
	}

	/* Map the below-4GB and (optionally) above-4GB guest segments. */
	if (lomem_sz != 0) {
		lomem_addr = vm_map_memory(ctx, 0, lomem_sz);
		if (lomem_addr == (char *) MAP_FAILED) {
			lomem_sz = 0;
		} else if (himem_sz != 0) {
			himem_addr = vm_map_memory(ctx, 4*GB, himem_sz);
			if (himem_addr == (char *) MAP_FAILED) {
				lomem_sz = 0;
				himem_sz = 0;
			}
		}
	}

	init_inout();
	init_pci(ctx);

	if (gdb_port != 0)
		init_dbgport(gdb_port);

	error = vm_get_register(ctx, BSP, VM_REG_GUEST_RIP, &rip);
	assert(error == 0);

	if (inject_bkpt) {
		error = vm_inject_event(ctx, BSP, VM_HW_EXCEPTION, IDT_BP);
		assert(error == 0);
	}

	/*
	 * build the guest tables, MP etc.
	 */
	vm_build_tables(ctx, guest_ncpus, oem_tbl_start, oem_tbl_size);

	/*
	 * Add CPU 0
	 */
	fbsdrun_addcpu(ctx, BSP, rip);

	/*
	 * Head off to the main event dispatch loop
	 */
	mevent_dispatch();

	exit(1);
}

53
usr.sbin/bhyve/fbsdrun.h Normal file
View File

@ -0,0 +1,53 @@
/*-
* Copyright (c) 2011 NetApp, Inc.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* $FreeBSD$
*/
#ifndef	_FBSDRUN_H_
#define	_FBSDRUN_H_

/* Compile-time assertion: fails the build if 'x' is false. */
#ifndef CTASSERT	/* Allow lint to override */
#define	CTASSERT(x)		_CTASSERT(x, __LINE__)
#define	_CTASSERT(x, y)		__CTASSERT(x, y)
#define	__CTASSERT(x, y)	typedef char __assert ## y[(x) ? 1 : -1]
#endif

struct vmctx;

/* Sequencer configuration, set from the command line in main(). */
extern int guest_hz;
extern int guest_tslice;
extern int guest_ncpus;
extern char *vmname;

/* Sizes of the below-4GB and above-4GB guest memory segments. */
extern u_long lomem_sz, himem_sz;

/* Translate a guest physical address to a host pointer (NULL if unmapped). */
void *paddr_guest2host(uintptr_t);

void fbsdrun_addcpu(struct vmctx *ctx, int cpu, uint64_t rip);
void fbsdrun_add_oemtbl(void *tbl, int tblsz);
int  fbsdrun_muxed(void);
int  fbsdrun_vmexit_on_hlt(void);
int  fbsdrun_vmexit_on_pause(void);
#endif

98
usr.sbin/bhyve/inout.c Normal file
View File

@ -0,0 +1,98 @@
/*-
* Copyright (c) 2011 NetApp, Inc.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* $FreeBSD$
*/
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
#include <sys/param.h>
#include <sys/linker_set.h>
#include <stdio.h>
#include <assert.h>
#include "inout.h"
SET_DECLARE(inout_port_set, struct inout_port);
#define MAX_IOPORTS (1 << 16)
static struct {
const char *name;
int flags;
inout_func_t handler;
void *arg;
} inout_handlers[MAX_IOPORTS];
/*
 * Dispatch a guest i/o access to the handler registered for 'port'.
 * Returns the handler's result, or -1 if no handler is registered or
 * the handler did not register for this direction.
 */
int
emulate_inout(struct vmctx *ctx, int vcpu, int in, int port, int bytes,
	      uint32_t *eax)
{
	inout_func_t handler;
	void *arg;
	int flags, wanted;

	assert(port < MAX_IOPORTS);

	handler = inout_handlers[port].handler;
	if (handler == NULL)
		return (-1);

	flags = inout_handlers[port].flags;
	arg = inout_handlers[port].arg;

	/* The handler must have registered for this direction. */
	wanted = in ? IOPORT_F_IN : IOPORT_F_OUT;
	if ((flags & wanted) == 0)
		return (-1);

	return ((*handler)(ctx, vcpu, in, port, bytes, eax, arg));
}
/*
 * Walk the linker set of statically declared i/o ports (INOUT_PORT
 * macro) and install each handler in the dispatch table.  'arg' is
 * always NULL for statically declared ports.
 */
void
init_inout(void)
{
	struct inout_port **iopp, *iop;

	SET_FOREACH(iopp, inout_port_set) {
		iop = *iopp;
		assert(iop->port < MAX_IOPORTS);
		inout_handlers[iop->port].name = iop->name;
		inout_handlers[iop->port].flags = iop->flags;
		inout_handlers[iop->port].handler = iop->handler;
		inout_handlers[iop->port].arg = NULL;
	}
}
/*
 * Register (or replace) the handler for a single i/o port at runtime,
 * including its callback argument.  Always succeeds.
 */
int
register_inout(struct inout_port *iop)
{
	assert(iop->port < MAX_IOPORTS);
	inout_handlers[iop->port].name = iop->name;
	inout_handlers[iop->port].flags = iop->flags;
	inout_handlers[iop->port].handler = iop->handler;
	inout_handlers[iop->port].arg = iop->arg;

	return (0);
}

64
usr.sbin/bhyve/inout.h Normal file
View File

@ -0,0 +1,64 @@
/*-
* Copyright (c) 2011 NetApp, Inc.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* $FreeBSD$
*/
#ifndef _INOUT_H_
#define	_INOUT_H_

#include <sys/linker_set.h>

struct vmctx;

/*
 * I/O port handler callback: 'in' selects read vs write, 'bytes' the
 * access width, '*eax' carries the value (in and out), and 'arg' is
 * the per-registration cookie.  Returns 0 on success, -1 on failure.
 */
typedef int (*inout_func_t)(struct vmctx *ctx, int vcpu, int in, int port,
			    int bytes, uint32_t *eax, void *arg);

/* A single i/o port registration. */
struct inout_port {
	const char	*name;
	int		port;
	int		flags;
	inout_func_t	handler;
	void		*arg;
};
#define	IOPORT_F_IN	0x1
#define	IOPORT_F_OUT	0x2
#define	IOPORT_F_INOUT	0x3

/*
 * Statically register a port handler via the linker set; picked up by
 * init_inout() at startup.
 */
#define	INOUT_PORT(name, port, flags, handler)				\
	static struct inout_port __CONCAT(__inout_port, __LINE__) = {	\
		#name,							\
		(port),							\
		(flags),						\
		(handler)						\
	};								\
	DATA_SET(inout_port_set, __CONCAT(__inout_port, __LINE__))

void	init_inout(void);
int	emulate_inout(struct vmctx *, int vcpu, int in, int port, int bytes,
		      uint32_t *eax);
int	register_inout(struct inout_port *iop);

#endif	/* _INOUT_H_ */

419
usr.sbin/bhyve/mevent.c Normal file
View File

@ -0,0 +1,419 @@
/*-
* Copyright (c) 2011 NetApp, Inc.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* $FreeBSD$
*/
/*
* Micro event library for FreeBSD, designed for a single i/o thread
* using kqueue, and having events be persistent by default.
*/
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
#include <assert.h>
#include <errno.h>
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include <sys/types.h>
#include <sys/event.h>
#include <sys/time.h>
#include <pthread.h>
#include "mevent.h"
#define MEVENT_MAX 64
#define MEV_ENABLE 1
#define MEV_DISABLE 2
#define MEV_DEL_PENDING 3
static pthread_t mevent_tid;
static int mevent_pipefd[2];
static pthread_mutex_t mevent_lmutex = PTHREAD_MUTEX_INITIALIZER;
struct mevent {
void (*me_func)(int, enum ev_type, void *);
int me_fd;
enum ev_type me_type;
void *me_param;
int me_cq;
int me_state;
int me_closefd;
LIST_ENTRY(mevent) me_list;
};
static LIST_HEAD(listhead, mevent) global_head, change_head;
/* Acquire the lock protecting the global and change event lists. */
static void
mevent_qlock(void)
{
	pthread_mutex_lock(&mevent_lmutex);
}
/* Release the event-list lock. */
static void
mevent_qunlock(void)
{
	pthread_mutex_unlock(&mevent_lmutex);
}
/*
 * Internal handler for the self-notification pipe: drain everything
 * that has been written to wake the dispatch loop.
 */
static void
mevent_pipe_read(int fd, enum ev_type type, void *param)
{
	char scratch[MEVENT_MAX];
	int n;

	/*
	 * The fd is non-blocking, so keep reading only while full
	 * buffers come back.
	 */
	do {
		n = read(fd, scratch, sizeof(scratch));
	} while (n == MEVENT_MAX);
}
/*
 * Wake the dispatch loop out of its blocking kevent(2) call by writing
 * a byte on the notification pipe.  A no-op when called from the i/o
 * thread itself or before the pipe exists.
 */
static void
mevent_notify(void)
{
	char c = '\0';	/* was uninitialized: wrote an indeterminate byte */

	/*
	 * Use pthread_equal(3) for the thread comparison: pthread_t is
	 * opaque and comparing it with '!=' is not portable.
	 */
	if (mevent_pipefd[1] != 0 &&
	    !pthread_equal(pthread_self(), mevent_tid)) {
		(void) write(mevent_pipefd[1], &c, 1);
	}
}
/* Map an mevent type to the corresponding kqueue filter. */
static int
mevent_kq_filter(struct mevent *mevp)
{
	int filter;

	filter = 0;
	switch (mevp->me_type) {
	case EVF_READ:
		filter = EVFILT_READ;
		break;
	case EVF_WRITE:
		filter = EVFILT_WRITE;
		break;
	default:
		break;
	}
	return (filter);
}
/* Map an mevent state to the corresponding kevent action flags. */
static int
mevent_kq_flags(struct mevent *mevp)
{
	int ret;

	switch (mevp->me_state) {
	case MEV_ENABLE:
		ret = EV_ADD;
		break;
	case MEV_DISABLE:
		ret = EV_DISABLE;
		break;
	case MEV_DEL_PENDING:
		ret = EV_DELETE;
		break;
	default:
		/*
		 * me_state is only ever set to the three values above;
		 * the original returned an uninitialized 'ret' here
		 * (undefined behavior) if that invariant were broken.
		 */
		assert(0);
		ret = 0;
		break;
	}

	return (ret);
}
/* Filter-specific kevent flags; none are used yet. */
static int
mevent_kq_fflags(struct mevent *mevp)
{
	/* XXX nothing yet, perhaps EV_EOF for reads ? */
	return (0);
}
/*
 * Drain the change list into a kevent changelist for the next
 * kevent(2) call.  Events flagged 'me_closefd' are removed implicitly
 * by closing their descriptor instead of an EV_DELETE.  Entries in
 * MEV_DEL_PENDING state are freed; everything else moves (back) to
 * the global list.  Returns the number of kevents filled in.
 */
static int
mevent_build(int mfd, struct kevent *kev)
{
	struct mevent *mevp, *tmpp;
	int i;

	i = 0;

	mevent_qlock();

	LIST_FOREACH_SAFE(mevp, &change_head, me_list, tmpp) {
		if (mevp->me_closefd) {
			/*
			 * A close of the file descriptor will remove the
			 * event
			 */
			close(mevp->me_fd);
		} else {
			kev[i].ident = mevp->me_fd;
			kev[i].filter = mevent_kq_filter(mevp);
			kev[i].flags = mevent_kq_flags(mevp);
			kev[i].fflags = mevent_kq_fflags(mevp);
			kev[i].data = 0;
			kev[i].udata = mevp;
			i++;
		}

		mevp->me_cq = 0;
		LIST_REMOVE(mevp, me_list);
		if (mevp->me_state == MEV_DEL_PENDING) {
			free(mevp);
		} else {
			LIST_INSERT_HEAD(&global_head, mevp, me_list);
		}

		/* The changelist array must never overflow. */
		assert(i < MEVENT_MAX);
	}

	mevent_qunlock();

	return (i);
}
/*
 * Deliver each returned kevent to its mevent callback.  The mevent
 * pointer was stashed in the kevent's udata field by mevent_build().
 */
static void
mevent_handle(struct kevent *kev, int numev)
{
	struct mevent *ev;
	int idx;

	for (idx = 0; idx < numev; idx++) {
		ev = kev[idx].udata;

		/* XXX check for EV_ERROR ? */

		(*ev->me_func)(ev->me_fd, ev->me_type, ev->me_param);
	}
}
/*
 * Register a new persistent event: 'func(fd, type, param)' will be
 * invoked whenever 'fd' becomes ready for 'type'.  Returns an opaque
 * handle, or NULL if the arguments are invalid, the fd/type pair is
 * already registered, or allocation fails.
 */
struct mevent *
mevent_add(int fd, enum ev_type type,
	   void (*func)(int, enum ev_type, void *), void *param)
{
	struct mevent *lp, *mevp;

	if (fd < 0 || func == NULL) {
		return (NULL);
	}

	mevp = NULL;

	mevent_qlock();

	/*
	 * Verify that the fd/type tuple is not present in any list
	 */
	LIST_FOREACH(lp, &global_head, me_list) {
		if (lp->me_fd == fd && lp->me_type == type) {
			goto exit;
		}
	}
	LIST_FOREACH(lp, &change_head, me_list) {
		if (lp->me_fd == fd && lp->me_type == type) {
			goto exit;
		}
	}

	/*
	 * Allocate an entry, populate it, and add it to the change list.
	 */
	mevp = malloc(sizeof(struct mevent));
	if (mevp == NULL) {
		goto exit;
	}
	memset(mevp, 0, sizeof(struct mevent));

	mevp->me_fd = fd;
	mevp->me_type = type;
	mevp->me_func = func;
	mevp->me_param = param;

	LIST_INSERT_HEAD(&change_head, mevp, me_list);
	mevp->me_cq = 1;
	mevp->me_state = MEV_ENABLE;
	mevent_notify();

exit:
	mevent_qunlock();

	return (mevp);
}
/*
 * Common helper for mevent_enable/disable: transition 'evp' to
 * 'newstate' and queue it for the dispatch loop if needed.  Returns 0
 * on success (including a no-op when the state is unchanged) or EINVAL
 * if the event is pending deletion.
 */
static int
mevent_update(struct mevent *evp, int newstate)
{
	int ret;

	ret = 0;

	/*
	 * The original examined evp->me_state before taking the queue
	 * lock, racing with the dispatch thread; do all state inspection
	 * and mutation under the lock.
	 */
	mevent_qlock();

	if (evp->me_state == MEV_DEL_PENDING) {
		/* It's not possible to enable/disable a deleted event */
		ret = EINVAL;
	} else if (evp->me_state != newstate) {
		evp->me_state = newstate;

		/*
		 * Place the entry onto the changed list if not already
		 * there.
		 */
		if (evp->me_cq == 0) {
			evp->me_cq = 1;
			LIST_REMOVE(evp, me_list);
			LIST_INSERT_HEAD(&change_head, evp, me_list);
			mevent_notify();
		}
	}

	mevent_qunlock();

	return (ret);
}
/*
 * Re-enable delivery for a previously disabled event.
 */
int
mevent_enable(struct mevent *evp)
{

	return (mevent_update(evp, MEV_ENABLE));
}
/*
 * Stop delivery for an event without deleting it.
 */
int
mevent_disable(struct mevent *evp)
{

	return (mevent_update(evp, MEV_DISABLE));
}
/*
 * Mark 'evp' for deletion by the dispatch loop; if 'closefd' is set its
 * file descriptor will also be closed.  The entry itself is freed later
 * by mevent_build().  Always returns 0.
 */
static int
mevent_delete_event(struct mevent *evp, int closefd)
{
	mevent_qlock();

	/*
	 * Place the entry onto the changed list if not already there, and
	 * mark as to be deleted.
	 */
	if (evp->me_cq == 0) {
		evp->me_cq = 1;
		LIST_REMOVE(evp, me_list);
		LIST_INSERT_HEAD(&change_head, evp, me_list);
		mevent_notify();
	}
	evp->me_state = MEV_DEL_PENDING;
	if (closefd)
		evp->me_closefd = 1;

	mevent_qunlock();

	return (0);
}
/*
 * Delete an event, leaving its descriptor open.
 */
int
mevent_delete(struct mevent *evp)
{

	return (mevent_delete_event(evp, 0));
}
/*
 * Delete an event and close its descriptor as well.
 */
int
mevent_delete_close(struct mevent *evp)
{

	return (mevent_delete_event(evp, 1));
}
/*
 * Main event loop: convert pending change-list entries to kqueue
 * changes, block in kevent(), and dispatch ready events to their
 * callbacks.  Never returns.
 */
void
mevent_dispatch(void)
{
	struct kevent changelist[MEVENT_MAX];
	struct kevent eventlist[MEVENT_MAX];
	struct mevent *pipev;
	int mfd;
	int numev;
	int ret;

	mevent_tid = pthread_self();

	mfd = kqueue();
	assert(mfd > 0);

	/*
	 * Open the pipe that will be used for other threads to force
	 * the blocking kqueue call to exit by writing to it.
	 * (The old comment also claimed the descriptor was made
	 * non-blocking; it never was.)
	 */
	ret = pipe(mevent_pipefd);
	if (ret < 0) {
		perror("pipe");
		/* exit(0) here would have reported success to the parent */
		exit(1);
	}

	/*
	 * Add internal event handler for the pipe *read* fd; writers to
	 * mevent_pipefd[1] wake this loop.
	 */
	pipev = mevent_add(mevent_pipefd[0], EVF_READ, mevent_pipe_read, NULL);
	assert(pipev != NULL);

	for (;;) {
		/*
		 * Build changelist if required.
		 * XXX the changelist can be put into the blocking call
		 * to eliminate the extra syscall. Currently better for
		 * debug.
		 */
		numev = mevent_build(mfd, changelist);
		if (numev) {
			ret = kevent(mfd, changelist, numev, NULL, 0, NULL);
			if (ret == -1) {
				perror("Error return from kevent change");
			}
		}

		/*
		 * Block awaiting events
		 */
		ret = kevent(mfd, NULL, 0, eventlist, MEVENT_MAX, NULL);
		if (ret == -1) {
			perror("Error return from kevent monitor");
		}

		/*
		 * Handle reported events
		 */
		mevent_handle(eventlist, ret);
	}
}

49
usr.sbin/bhyve/mevent.h Normal file
View File

@ -0,0 +1,49 @@
/*-
* Copyright (c) 2011 NetApp, Inc.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* $FreeBSD$
*/
#ifndef _MEVENT_H_
#define _MEVENT_H_

/* Event types that can be monitored on a file descriptor. */
enum ev_type {
	EVF_READ,
	EVF_WRITE
};

/* Opaque event handle returned by mevent_add(). */
struct mevent;

/* Register func(fd, type, param) to run when 'fd' is ready for 'type'. */
struct mevent *mevent_add(int fd, enum ev_type type,
			  void (*func)(int, enum ev_type, void *),
			  void *param);
/* Enable/disable delivery for an event; EINVAL if it is delete-pending. */
int	mevent_enable(struct mevent *evp);
int	mevent_disable(struct mevent *evp);
/* Remove an event; the _close variant also close(2)s its descriptor. */
int	mevent_delete(struct mevent *evp);
int	mevent_delete_close(struct mevent *evp);
/* Run the event loop on the calling thread; never returns. */
void	mevent_dispatch(void);

#endif	/* _MEVENT_H_ */

View File

@ -0,0 +1,180 @@
/*-
* Copyright (c) 2011 NetApp, Inc.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* $FreeBSD$
*/
/*
* Test program for the micro event library. Set up a simple TCP echo
* service.
*
* cc mevent_test.c mevent.c -lpthread
*/
#include <sys/types.h>
#include <sys/socket.h>
#include <netinet/in.h>
#include <stdio.h>
#include <stdlib.h>
#include <pthread.h>
#include "mevent.h"
#define TEST_PORT 4321
static pthread_mutex_t accept_mutex = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t accept_condvar = PTHREAD_COND_INITIALIZER;
#define MEVENT_ECHO
#ifdef MEVENT_ECHO
/* Mutex/condvar pair used to wake an echoer thread from its callback. */
struct esync {
	pthread_mutex_t e_mt;
	pthread_cond_t e_cond;
};

/*
 * Event callback: signal the echoer thread waiting on the condvar.
 */
static void
echoer_callback(int fd, enum ev_type type, void *param)
{
	struct esync *es = param;

	pthread_mutex_lock(&es->e_mt);
	pthread_cond_signal(&es->e_cond);
	pthread_mutex_unlock(&es->e_mt);
}
/*
 * Per-connection thread: block on the condvar until the event callback
 * signals readability, then echo received data back to the peer.  Tears
 * down the event and exits when the peer closes or read fails.
 */
static void *
echoer(void *param)
{
	struct esync sync;
	struct mevent *mev;
	char buf[128];
	int fd = (int)(uintptr_t) param;
	int len;

	pthread_mutex_init(&sync.e_mt, NULL);
	pthread_cond_init(&sync.e_cond, NULL);
	pthread_mutex_lock(&sync.e_mt);

	mev = mevent_add(fd, EVF_READ, echoer_callback, &sync);
	if (mev == NULL) {
		printf("Could not allocate echoer event\n");
		exit(1);
	}

	while (!pthread_cond_wait(&sync.e_cond, &sync.e_mt)) {
		len = read(fd, buf, sizeof(buf));
		if (len > 0) {
			/* NOTE(review): fd 0 is stdin; fd 1 looks intended */
			write(fd, buf, len);
			write(0, buf, len);
		} else {
			break;
		}
	}

	mevent_delete_close(mev);

	pthread_mutex_unlock(&sync.e_mt);
	pthread_mutex_destroy(&sync.e_mt);
	pthread_cond_destroy(&sync.e_cond);

	/* Original fell off the end of a non-void function */
	return (NULL);
}
#else
/*
 * Non-event variant: blocking-read the descriptor and copy everything
 * to stdout until EOF or error.
 */
static void *
echoer(void *param)
{
	char buf[128];
	int fd = (int)(uintptr_t) param;
	int len;

	while ((len = read(fd, buf, sizeof(buf))) > 0) {
		write(1, buf, len);
	}

	/* Original fell off the end of a non-void function */
	return (NULL);
}
#endif /* MEVENT_ECHO */
/*
 * Event callback for the listen socket: wake the acceptor thread.
 */
static void
acceptor_callback(int fd, enum ev_type type, void *param)
{

	pthread_mutex_lock(&accept_mutex);
	pthread_cond_signal(&accept_condvar);
	pthread_mutex_unlock(&accept_mutex);
}
/*
 * Listener thread: bind/listen on TEST_PORT, register a read event for
 * the listen socket, then loop accepting connections (woken via the
 * condvar) and spawning an echoer thread for each one.
 */
static void *
acceptor(void *param)
{
	struct sockaddr_in sin;
	pthread_t tid;
	int news;
	int s;

	if ((s = socket(AF_INET, SOCK_STREAM, 0)) < 0) {
		perror("socket");
		exit(1);
	}

	/* NOTE(review): sin_zero is left uninitialized here — harmless
	 * on FreeBSD bind(2), but worth confirming. */
	sin.sin_len = sizeof(sin);
	sin.sin_family = AF_INET;
	sin.sin_addr.s_addr = htonl(INADDR_ANY);
	sin.sin_port = htons(TEST_PORT);

	if (bind(s, (struct sockaddr *)&sin, sizeof(sin)) < 0) {
		perror("bind");
		exit(1);
	}

	if (listen(s, 1) < 0) {
		perror("listen");
		exit(1);
	}

	(void) mevent_add(s, EVF_READ, acceptor_callback, NULL);

	pthread_mutex_lock(&accept_mutex);

	while (!pthread_cond_wait(&accept_condvar, &accept_mutex)) {
		news = accept(s, NULL, NULL);
		if (news < 0) {
			perror("accept error");
		} else {
			printf("incoming connection, spawning thread\n");
			pthread_create(&tid, NULL, echoer,
				       (void *)(uintptr_t)news);
		}
	}

	/* Original fell off the end of a non-void function */
	return (NULL);
}
/*
 * Entry point: start the acceptor thread and run the event loop on the
 * main thread.  mevent_dispatch() never returns.
 * (Original used implicit-int `main()`, which is invalid since C99.)
 */
int
main(void)
{
	pthread_t tid;

	pthread_create(&tid, NULL, acceptor, NULL);

	mevent_dispatch();
	return (0);
}

976
usr.sbin/bhyve/pci_emul.c Normal file
View File

@ -0,0 +1,976 @@
/*-
* Copyright (c) 2011 NetApp, Inc.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* $FreeBSD$
*/
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
#include <sys/param.h>
#include <sys/linker_set.h>
#include <ctype.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <strings.h>
#include <assert.h>
#include <machine/vmm.h>
#include <vmmapi.h>
#include "fbsdrun.h"
#include "inout.h"
#include "pci_emul.h"
/* Config-space access mechanism #1 ports (PCI local bus spec) */
#define CONF1_ADDR_PORT 0x0cf8
#define CONF1_DATA_PORT 0x0cfc

/*
 * Write 'val' to config space of 'pi' at offset 'off' with access
 * width 'b' bytes (1, 2 or 4).
 */
#define CFGWRITE(pi,off,val,b)						\
do {									\
	if ((b) == 1) {							\
		pci_set_cfgdata8((pi),(off),(val));			\
	} else if ((b) == 2) {						\
		pci_set_cfgdata16((pi),(off),(val));			\
	} else {							\
		pci_set_cfgdata32((pi),(off),(val));			\
	}								\
} while (0)

#define MAXSLOTS 32		/* slots 0..31 on the emulated bus */

/* Per-slot emulation state, indexed by slot number. */
static struct slotinfo {
	char *si_name;			/* emulation name, e.g. "dummy" */
	char *si_param;			/* optional device config string */
	struct pci_devinst *si_devi;	/* instance, set by pci_emul_init() */
	int si_titled;			/* nonzero if an MPTable name was given */
	int si_pslot;			/* physical slot number from the name */
	char si_prefix;			/* optional leading alpha char */
	char si_suffix;			/* trailing alpha char */
} pci_slotinfo[MAXSLOTS];

/*
 * NetApp specific:
 * struct used to build an in-core OEM table to supply device names
 * to driver instances
 */
static struct mptable_pci_devnames {
#define MPT_HDR_BASE 0
#define MPT_HDR_NAME 2
	uint16_t md_hdrtype;
	uint16_t md_entries;
	uint16_t md_cksum;
	uint16_t md_pad;
#define MPT_NTAP_SIG \
	((uint32_t)(('P' << 24) | ('A' << 16) | ('T' << 8) | 'N'))
	uint32_t md_sig;
	uint32_t md_rsvd;
	struct mptable_pci_slotinfo {
		uint16_t mds_type;
		uint16_t mds_phys_slot;
		uint8_t mds_bus;
		uint8_t mds_slot;
		uint8_t mds_func;
		uint8_t mds_pad;
		uint16_t mds_vid;
		uint16_t mds_did;
		uint8_t mds_suffix[4];
		uint8_t mds_prefix[4];
		uint32_t mds_rsvd[3];
	} md_slotinfo[MAXSLOTS];
} pci_devnames;

/* Linker set of all compiled-in device emulations (see PCI_EMUL_SET). */
SET_DECLARE(pci_devemu_set, struct pci_devemu);

/* Next free address in each BAR allocation region. */
static uint64_t pci_emul_iobase;
static uint64_t pci_emul_membase32;
static uint64_t pci_emul_membase64;

#define PCI_EMUL_IOBASE 0x2000
#define PCI_EMUL_IOLIMIT 0x10000
#define PCI_EMUL_MEMBASE32 (lomem_sz)
#define PCI_EMUL_MEMLIMIT32 0xE0000000 /* 3.5GB */
#define PCI_EMUL_MEMBASE64 0xD000000000UL
#define PCI_EMUL_MEMLIMIT64 0xFD00000000UL

static int pci_emul_devices;	/* count of instantiated devices */
static int devname_elems;	/* entries used in pci_devnames */

/*
 * I/O access
 */

/*
 * Slot options are in the form:
 *
 * <slot>,<emul>[,<config>]
 *
 * slot is 0..31
 * emul is a string describing the type of PCI device e.g. virtio-net
 * config is an optional string, depending on the device, that can be
 * used for configuration.
 * Examples are:
 * 1,virtio-net,tap0
 * 3,dummy
 */
static void
pci_parse_slot_usage(char *aopt)
{
printf("Invalid PCI slot info field \"%s\"\n", aopt);
free(aopt);
}
/*
 * Parse a PCI slot option of the form <slot>,<emul>[,<config>] and
 * record the emulation name and optional config string for that slot.
 * The strdup'ed copy is intentionally kept alive: pci_slotinfo ends up
 * pointing into it.
 */
void
pci_parse_slot(char *opt)
{
	char *slot, *emul, *config;
	char *str, *cpy;
	int snum;

	/* Originally unchecked; strsep/atoi on NULL would misbehave */
	if ((str = cpy = strdup(opt)) == NULL)
		return;

	config = NULL;

	slot = strsep(&str, ",");
	emul = strsep(&str, ",");
	if (str != NULL) {
		config = strsep(&str, ",");
	}

	if (emul == NULL) {
		pci_parse_slot_usage(cpy);
		return;
	}

	/*
	 * Dropped a dead 'snum = 255' store that was immediately
	 * overwritten.  Note atoi() maps non-numeric input to 0, which
	 * is then silently accepted as slot 0.
	 */
	snum = atoi(slot);
	if (snum < 0 || snum >= MAXSLOTS) {
		pci_parse_slot_usage(cpy);
	} else {
		pci_slotinfo[snum].si_name = emul;
		pci_slotinfo[snum].si_param = config;
	}
}
/*
 *
 * PCI MPTable names are of the form:
 *
 * <slot>,[prefix]<digit><suffix>
 *
 * .. with <prefix> an alphabetic char, <digit> a 1 or 2-digit string,
 * and <suffix> a single char.
 *
 * Examples:
 * 1,e0c
 * 4,e0P
 * 6,43a
 * 7,0f
 * 10,1
 * 12,e0M
 * 2,12a
 *
 * Note that this is NetApp-specific, but is ignored on other o/s's.
 */
static void
pci_parse_name_usage(char *aopt)
{
	/* Diagnostic only; unlike pci_parse_slot_usage() does not free */
	printf("Invalid PCI slot name field \"%s\"\n", aopt);
}
/*
 * Parse an MPTable name option "<slot>,[prefix]<digit(s)><suffix>"
 * (see the format comment above) and record the naming information in
 * pci_slotinfo[slot] for later inclusion in the OEM table.
 */
void
pci_parse_name(char *opt)
{
	char csnum[4];
	char *namestr;
	char *slotend;
	char prefix, suffix;
	int i;
	int pslot;
	int snum;

	pslot = -1;
	prefix = suffix = 0;

	slotend = strchr(opt, ',');

	/*
	 * A comma must be present, and can't be the first character
	 * or no slot would be present. Also, the slot number can't be
	 * more than 2 characters.
	 */
	if (slotend == NULL || slotend == opt || (slotend - opt > 2)) {
		pci_parse_name_usage(opt);
		return;
	}

	for (i = 0; i < (slotend - opt); i++) {
		csnum[i] = opt[i];
	}
	csnum[i] = '\0';

	/* Dead 'snum = 255' store removed; atoi result is what counts */
	snum = atoi(csnum);
	if (snum < 0 || snum >= MAXSLOTS) {
		pci_parse_name_usage(opt);
		return;
	}

	namestr = slotend + 1;

	if (strlen(namestr) > 3) {
		pci_parse_name_usage(opt);
		return;
	}

	/* ctype args cast to unsigned char to avoid UB on negative chars */
	if (isalpha((unsigned char)*namestr)) {
		prefix = *namestr++;
	}

	/* isdigit() replaces the non-standard BSD isnumber() */
	if (!isdigit((unsigned char)*namestr)) {
		pci_parse_name_usage(opt);
	} else {
		pslot = *namestr++ - '0';
		if (isdigit((unsigned char)*namestr)) {
			pslot = 10 * pslot + *namestr++ - '0';
		}
		if (isalpha((unsigned char)*namestr) && *(namestr + 1) == 0) {
			suffix = *namestr;
			pci_slotinfo[snum].si_titled = 1;
			pci_slotinfo[snum].si_pslot = pslot;
			pci_slotinfo[snum].si_prefix = prefix;
			pci_slotinfo[snum].si_suffix = suffix;
		} else {
			pci_parse_name_usage(opt);
		}
	}
}
/*
 * If naming information has been supplied for this slot, populate
 * the next available mptable OEM entry with the slot's bus/slot/func,
 * vendor/device ids and the parsed prefix/slot-digit/suffix name.
 */
static void
pci_add_mptable_name(struct slotinfo *si)
{
	struct mptable_pci_slotinfo *ms;

	if (si->si_titled) {
		ms = &pci_devnames.md_slotinfo[devname_elems];
		ms->mds_type = MPT_HDR_NAME;
		ms->mds_phys_slot = si->si_pslot;
		ms->mds_bus = si->si_devi->pi_bus;
		ms->mds_slot = si->si_devi->pi_slot;
		ms->mds_func = si->si_devi->pi_func;
		ms->mds_vid = pci_get_cfgdata16(si->si_devi, PCIR_VENDOR);
		ms->mds_did = pci_get_cfgdata16(si->si_devi, PCIR_DEVICE);
		ms->mds_suffix[0] = si->si_suffix;
		ms->mds_prefix[0] = si->si_prefix;

		devname_elems++;
	}
}
/*
 * If any OEM name entries were collected, finalize the in-core table
 * header and hand the table to the sequencer for inclusion in the
 * guest's MPTable.
 */
static void
pci_finish_mptable_names(void)
{
	int size;

	if (devname_elems == 0)
		return;

	pci_devnames.md_hdrtype = MPT_HDR_BASE;
	pci_devnames.md_entries = devname_elems;
	pci_devnames.md_cksum = 0;	/* XXX */
	pci_devnames.md_sig = MPT_NTAP_SIG;

	size = (uintptr_t)&pci_devnames.md_slotinfo[devname_elems] -
	    (uintptr_t)&pci_devnames;
	fbsdrun_add_oemtbl(&pci_devnames, size);
}
/*
 * inout handler shared by all emulated I/O BARs: find the BAR of 'arg'
 * (a pci_devinst) that covers 'port' and forward the access to the
 * device's pe_ior/pe_iow callback.  Returns 0 if handled, -1 if no
 * BAR fully contains the access.
 */
static int
pci_emul_handler(struct vmctx *ctx, int vcpu, int in, int port, int bytes,
		 uint32_t *eax, void *arg)
{
	struct pci_devinst *pdi = arg;
	struct pci_devemu *pe = pdi->pi_d;
	int offset, i;

	for (i = 0; i <= PCI_BARMAX; i++) {
		if (pdi->pi_bar[i].type == PCIBAR_IO &&
		    port >= pdi->pi_bar[i].addr &&
		    port + bytes <= pdi->pi_bar[i].addr + pdi->pi_bar[i].size) {
			offset = port - pdi->pi_bar[i].addr;
			if (in)
				*eax = (*pe->pe_ior)(pdi, i, offset, bytes);
			else
				(*pe->pe_iow)(pdi, i, offset, bytes, *eax);
			return (0);
		}
	}
	return (-1);
}
/*
 * Carve 'size' bytes (a power of 2) out of the region tracked by
 * *baseptr, aligned to 'size' and not extending past 'limit'.  On
 * success the allocated address is stored in *addr, *baseptr is
 * advanced past it, and 0 is returned; -1 if the region is exhausted.
 */
static int
pci_emul_alloc_resource(uint64_t *baseptr, uint64_t limit, uint64_t size,
			uint64_t *addr)
{
	uint64_t aligned;

	assert((size & (size - 1)) == 0);	/* must be a power of 2 */

	/* round *baseptr up to the next multiple of 'size' */
	aligned = (*baseptr + (size - 1)) & ~(size - 1);
	if (aligned + size > limit)
		return (-1);

	*addr = aligned;
	*baseptr = aligned + size;
	return (0);
}
/*
 * Allocate a decoded region for BAR 'idx' of 'pdi' and program the BAR
 * register(s) in config space.  'size' is rounded up to a power of 2.
 * A PCIBAR_MEM64 BAR also consumes slot idx+1 for the high dword; for
 * PCIBAR_IO, inout handlers are registered for each port in the range.
 * 'hostbase' is only used for the peer-to-peer DMA special case below.
 * Returns 0 on success or -1 if the address region is exhausted.
 */
int
pci_emul_alloc_bar(struct pci_devinst *pdi, int idx, uint64_t hostbase,
		   enum pcibar_type type, uint64_t size)
{
	int i, error;
	uint64_t *baseptr, limit, addr, mask, lobits, bar;
	struct inout_port iop;

	assert(idx >= 0 && idx <= PCI_BARMAX);

	if ((size & (size - 1)) != 0)
		size = 1UL << flsl(size);	/* round up to a power of 2 */

	switch (type) {
	case PCIBAR_NONE:
		baseptr = NULL;
		addr = mask = lobits = 0;
		break;
	case PCIBAR_IO:
		baseptr = &pci_emul_iobase;
		limit = PCI_EMUL_IOLIMIT;
		mask = PCIM_BAR_IO_BASE;
		lobits = PCIM_BAR_IO_SPACE;
		break;
	case PCIBAR_MEM64:
		/*
		 * XXX
		 * Some drivers do not work well if the 64-bit BAR is allocated
		 * above 4GB. Allow for this by allocating small requests under
		 * 4GB unless then allocation size is larger than some arbitrary
		 * number (32MB currently).
		 */
		if (size > 32 * 1024 * 1024) {
			/*
			 * XXX special case for device requiring peer-peer DMA
			 */
			if (size == 0x100000000UL)
				baseptr = &hostbase;
			else
				baseptr = &pci_emul_membase64;
			limit = PCI_EMUL_MEMLIMIT64;
			mask = PCIM_BAR_MEM_BASE;
			lobits = PCIM_BAR_MEM_SPACE | PCIM_BAR_MEM_64 |
				 PCIM_BAR_MEM_PREFETCH;
			break;
		}
		/* fallthrough: small 64-bit BARs allocate below 4GB */
	case PCIBAR_MEM32:
		baseptr = &pci_emul_membase32;
		limit = PCI_EMUL_MEMLIMIT32;
		mask = PCIM_BAR_MEM_BASE;
		lobits = PCIM_BAR_MEM_SPACE | PCIM_BAR_MEM_32;
		break;
	default:
		printf("pci_emul_alloc_base: invalid bar type %d\n", type);
		assert(0);
	}

	if (baseptr != NULL) {
		error = pci_emul_alloc_resource(baseptr, limit, size, &addr);
		if (error != 0)
			return (error);
	}

	pdi->pi_bar[idx].type = type;
	pdi->pi_bar[idx].addr = addr;
	pdi->pi_bar[idx].size = size;

	/* Initialize the BAR register in config space */
	bar = (addr & mask) | lobits;
	pci_set_cfgdata32(pdi, PCIR_BAR(idx), bar);

	if (type == PCIBAR_MEM64) {
		/* the next BAR slot holds the high 32 bits of the address */
		assert(idx + 1 <= PCI_BARMAX);
		pdi->pi_bar[idx + 1].type = PCIBAR_MEMHI64;
		pci_set_cfgdata32(pdi, PCIR_BAR(idx + 1), bar >> 32);
	}

	/* add a handler to intercept accesses to the I/O bar */
	if (type == PCIBAR_IO) {
		iop.name = pdi->pi_name;
		iop.flags = IOPORT_F_INOUT;
		iop.handler = pci_emul_handler;
		iop.arg = pdi;

		for (i = 0; i < size; i++) {
			iop.port = addr + i;
			register_inout(&iop);
		}
	}

	return (0);
}
#define CAP_START_OFFSET 0x40	/* first byte of the capability area */

/*
 * Append a capability to pi's config-space capability list.  The list
 * is terminated by a PCIY_RESERVED marker; the new capability is linked
 * in after the current last one and followed by a fresh end marker.
 * Returns 0 on success or -1 if there is not enough config space left.
 */
static int
pci_emul_add_capability(struct pci_devinst *pi, u_char *capdata, int caplen)
{
	int i, capoff, capid, reallen;
	uint16_t sts;
	static u_char endofcap[4] = {
		PCIY_RESERVED, 0, 0, 0
	};

	assert(caplen > 0 && capdata[0] != PCIY_RESERVED);

	reallen = roundup2(caplen, 4);		/* dword aligned */

	sts = pci_get_cfgdata16(pi, PCIR_STATUS);
	if ((sts & PCIM_STATUS_CAPPRESENT) == 0) {
		/* First capability: start the list */
		capoff = CAP_START_OFFSET;
		pci_set_cfgdata8(pi, PCIR_CAP_PTR, capoff);
		pci_set_cfgdata16(pi, PCIR_STATUS, sts|PCIM_STATUS_CAPPRESENT);
	} else {
		/* Walk forward to the end-of-list marker */
		capoff = pci_get_cfgdata8(pi, PCIR_CAP_PTR);
		while (1) {
			assert((capoff & 0x3) == 0);
			capid = pci_get_cfgdata8(pi, capoff);
			if (capid == PCIY_RESERVED)
				break;
			capoff = pci_get_cfgdata8(pi, capoff + 1);
		}
	}

	/* Check if we have enough space */
	if (capoff + reallen + sizeof(endofcap) > PCI_REGMAX + 1)
		return (-1);

	/* Copy the capability */
	for (i = 0; i < caplen; i++)
		pci_set_cfgdata8(pi, capoff + i, capdata[i]);

	/* Set the next capability pointer */
	pci_set_cfgdata8(pi, capoff + 1, capoff + reallen);

	/* Copy of the reserved capability which serves as the end marker */
	for (i = 0; i < sizeof(endofcap); i++)
		pci_set_cfgdata8(pi, capoff + reallen + i, endofcap[i]);

	return (0);
}
/*
 * Look up a device emulation by name in the pci_devemu_set linker set.
 * Returns NULL if no emulation with that name was compiled in.
 */
static struct pci_devemu *
pci_emul_finddev(char *name)
{
	struct pci_devemu **pdpp, *pdp;

	SET_FOREACH(pdpp, pci_devemu_set) {
		pdp = *pdpp;
		if (!strcmp(pdp->pe_emu, name)) {
			return (pdp);
		}
	}

	return (NULL);
}
/*
 * Instantiate device emulation 'pde' at bus 0 / 'slot' / function 0.
 * On success the instance is recorded in pci_slotinfo and counted;
 * if the device's pe_init fails (or allocation fails) the instance is
 * discarded.
 */
static void
pci_emul_init(struct vmctx *ctx, struct pci_devemu *pde, int slot, char *params)
{
	struct pci_devinst *pdi;

	/*
	 * calloc() zeroes the instance (replacing malloc+bzero) and the
	 * result is checked: the original dereferenced an unchecked
	 * malloc() return.
	 */
	pdi = calloc(1, sizeof(struct pci_devinst));
	if (pdi == NULL)
		return;

	pdi->pi_vmctx = ctx;
	pdi->pi_bus = 0;
	pdi->pi_slot = slot;
	pdi->pi_func = 0;
	pdi->pi_d = pde;
	snprintf(pdi->pi_name, PI_NAMESZ, "%s-pci-%d", pde->pe_emu, slot);

	/* Disable legacy interrupts */
	pci_set_cfgdata8(pdi, PCIR_INTLINE, 255);
	pci_set_cfgdata8(pdi, PCIR_INTPIN, 0);

	pci_set_cfgdata8(pdi, PCIR_COMMAND,
	    PCIM_CMD_PORTEN | PCIM_CMD_MEMEN | PCIM_CMD_BUSMASTEREN);

	if ((*pde->pe_init)(ctx, pdi, params) != 0) {
		free(pdi);
	} else {
		pci_emul_devices++;
		pci_slotinfo[slot].si_devi = pdi;
	}
}
/*
 * Fill in an MSI capability structure advertising 'msgnum' messages
 * (must be a power of 2 in [1,32]) with 64-bit address support, linked
 * to 'nextptr' as the next capability in the list.
 */
void
pci_populate_msicap(struct msicap *msicap, int msgnum, int nextptr)
{
	int mmc;

	CTASSERT(sizeof(struct msicap) == 14);

	/* Number of msi messages must be a power of 2 between 1 and 32 */
	assert((msgnum & (msgnum - 1)) == 0 && msgnum >= 1 && msgnum <= 32);
	mmc = ffs(msgnum) - 1;		/* log2(msgnum) for the MMC field */

	bzero(msicap, sizeof(struct msicap));
	msicap->capid = PCIY_MSI;
	msicap->nextptr = nextptr;
	msicap->msgctrl = PCIM_MSICTRL_64BIT | (mmc << 1);
}
int
pci_emul_add_msicap(struct pci_devinst *pi, int msgnum)
{
struct msicap msicap;
pci_populate_msicap(&msicap, msgnum, 0);
return (pci_emul_add_capability(pi, (u_char *)&msicap, sizeof(msicap)));
}
/*
 * Handle a config write that lands inside an MSI capability starting at
 * 'capoff'.  Writes to the message control register are filtered so the
 * guest cannot alter read-only bits, and the decoded MSI state in
 * pi->pi_msi is refreshed; all writes then fall through to the shadow
 * config space.
 */
void
msicap_cfgwrite(struct pci_devinst *pi, int capoff, int offset,
		int bytes, uint32_t val)
{
	uint16_t msgctrl, rwmask, msgdata, mme;
	uint32_t addrlo;

	/*
	 * If guest is writing to the message control register make sure
	 * we do not overwrite read-only fields.
	 */
	if ((offset - capoff) == 2 && bytes == 2) {
		rwmask = PCIM_MSICTRL_MME_MASK | PCIM_MSICTRL_MSI_ENABLE;
		msgctrl = pci_get_cfgdata16(pi, offset);
		msgctrl &= ~rwmask;
		msgctrl |= val & rwmask;
		val = msgctrl;

		addrlo = pci_get_cfgdata32(pi, capoff + 4);
		/* Message data location depends on 32- vs 64-bit format */
		if (msgctrl & PCIM_MSICTRL_64BIT)
			msgdata = pci_get_cfgdata16(pi, capoff + 12);
		else
			msgdata = pci_get_cfgdata16(pi, capoff + 8);

		/*
		 * XXX check delivery mode, destination mode etc
		 */
		mme = msgctrl & PCIM_MSICTRL_MME_MASK;
		pi->pi_msi.enabled = msgctrl & PCIM_MSICTRL_MSI_ENABLE ? 1 : 0;
		if (pi->pi_msi.enabled) {
			/* Destination cpu from the address, vector from data */
			pi->pi_msi.cpu = (addrlo >> 12) & 0xff;
			pi->pi_msi.vector = msgdata & 0xff;
			pi->pi_msi.msgnum = 1 << (mme >> 4);
		} else {
			pi->pi_msi.cpu = 0;
			pi->pi_msi.vector = 0;
			pi->pi_msi.msgnum = 0;
		}
	}

	CFGWRITE(pi, offset, val, bytes);
}
/*
 * Dispatch a config write that targets the capability area to the
 * handler for the capability it falls in.  This function assumes that
 * 'offset' is within the capabilities region of the config space.
 */
static void
pci_emul_capwrite(struct pci_devinst *pi, int offset, int bytes, uint32_t val)
{
	int capid;
	uint8_t capoff, nextoff;

	/* Do not allow un-aligned writes */
	if ((offset & (bytes - 1)) != 0)
		return;

	/* Find the capability that we want to update */
	capoff = CAP_START_OFFSET;
	while (1) {
		capid = pci_get_cfgdata8(pi, capoff);
		if (capid == PCIY_RESERVED)
			break;
		nextoff = pci_get_cfgdata8(pi, capoff + 1);
		if (offset >= capoff && offset < nextoff)
			break;
		capoff = nextoff;
	}

	assert(offset >= capoff);

	/*
	 * Capability ID and Next Capability Pointer are readonly
	 */
	if (offset == capoff || offset == capoff + 1)
		return;

	switch (capid) {
	case PCIY_MSI:
		msicap_cfgwrite(pi, capoff, offset, bytes, val);
		break;
	default:
		/* writes to unhandled capabilities are silently dropped */
		break;
	}
}
/*
 * Return 1 if 'offset' falls within the capability area of pi's config
 * space, 0 otherwise.
 */
static int
pci_emul_iscap(struct pci_devinst *pi, int offset)
{
	int found;
	uint16_t sts;
	uint8_t capid, lastoff;

	found = 0;
	sts = pci_get_cfgdata16(pi, PCIR_STATUS);
	if ((sts & PCIM_STATUS_CAPPRESENT) != 0) {
		/* Find the end-of-list marker to bound the capability area */
		lastoff = pci_get_cfgdata8(pi, PCIR_CAP_PTR);
		while (1) {
			assert((lastoff & 0x3) == 0);
			capid = pci_get_cfgdata8(pi, lastoff);
			if (capid == PCIY_RESERVED)
				break;
			lastoff = pci_get_cfgdata8(pi, lastoff + 1);
		}
		if (offset >= CAP_START_OFFSET && offset <= lastoff)
			found = 1;
	}

	return (found);
}
/*
 * Initialize the BAR allocation regions, then instantiate the device
 * emulation configured for each populated slot and finalize the OEM
 * name table.
 */
void
init_pci(struct vmctx *ctx)
{
	struct pci_devemu *pde;
	int slot;

	pci_emul_iobase = PCI_EMUL_IOBASE;
	pci_emul_membase32 = PCI_EMUL_MEMBASE32;
	pci_emul_membase64 = PCI_EMUL_MEMBASE64;

	for (slot = 0; slot < MAXSLOTS; slot++) {
		struct slotinfo *si = &pci_slotinfo[slot];

		if (si->si_name == NULL)
			continue;
		pde = pci_emul_finddev(si->si_name);
		if (pde == NULL)
			continue;
		pci_emul_init(ctx, pde, slot, si->si_param);
		pci_add_mptable_name(si);
	}

	pci_finish_mptable_names();
}
/*
 * Return nonzero if the guest has enabled MSI for this device.
 */
int
pci_msi_enabled(struct pci_devinst *pi)
{

	return (pi->pi_msi.enabled);
}
/*
 * Return the number of MSI messages currently programmed, or 0 when
 * MSI is disabled.
 */
int
pci_msi_msgnum(struct pci_devinst *pi)
{

	return (pi->pi_msi.enabled ? pi->pi_msi.msgnum : 0);
}
/*
 * Inject MSI message 'msg' for this device into the guest, provided
 * MSI is enabled and the message number is within the programmed range.
 */
void
pci_generate_msi(struct pci_devinst *pi, int msg)
{

	if (!pci_msi_enabled(pi) || msg >= pci_msi_msgnum(pi))
		return;

	vm_lapic_irq(pi->pi_vmctx, pi->pi_msi.cpu, pi->pi_msi.vector + msg);
}
/* Latched decode of the last write to the config address port (0xcf8). */
static int cfgbus, cfgslot, cfgfunc, cfgoff;

/*
 * Handler for the config address port: record the target
 * bus/slot/function/register for subsequent data-port accesses.
 * Only 32-bit writes are accepted.
 */
static int
pci_emul_cfgaddr(struct vmctx *ctx, int vcpu, int in, int port, int bytes,
		 uint32_t *eax, void *arg)
{
	uint32_t x;

	assert(!in);		/* registered write-only */

	if (bytes != 4)
		return (-1);

	x = *eax;
	cfgoff = x & PCI_REGMAX;
	cfgfunc = (x >> 8) & PCI_FUNCMAX;
	cfgslot = (x >> 11) & PCI_SLOTMAX;
	cfgbus = (x >> 16) & PCI_BUSMAX;

	return (0);
}
INOUT_PORT(pci_cfgaddr, CONF1_ADDR_PORT, IOPORT_F_OUT, pci_emul_cfgaddr);
/*
 * Handler for the config data ports (0xcfc-0xcff): perform the config
 * space read or write selected by the last write to the address port.
 * Unpopulated slots and non-zero functions read as all-ones and ignore
 * writes.  Device emulations may override the default handling; BAR
 * and capability writes get special treatment.
 */
static int
pci_emul_cfgdata(struct vmctx *ctx, int vcpu, int in, int port, int bytes,
		 uint32_t *eax, void *arg)
{
	struct pci_devinst *pi;
	struct pci_devemu *pe;
	int coff, idx;
	uint64_t mask, bar;

	assert(bytes == 1 || bytes == 2 || bytes == 4);

	pi = pci_slotinfo[cfgslot].si_devi;
	/* sub-dword accesses are offset by the data-port byte lane used */
	coff = cfgoff + (port - CONF1_DATA_PORT);

#if 0
	printf("pcicfg-%s from 0x%0x of %d bytes (%d/%d/%d)\n\r",
	    in ? "read" : "write", coff, bytes, cfgbus, cfgslot, cfgfunc);
#endif

	if (pi == NULL || cfgfunc != 0) {
		if (in)
			*eax = 0xffffffff;
		return (0);
	}

	pe = pi->pi_d;

	/*
	 * Config read
	 */
	if (in) {
		/* Let the device emulation override the default handler */
		if (pe->pe_cfgread != NULL &&
		    (*pe->pe_cfgread)(ctx, vcpu, pi, coff, bytes, eax) == 0)
			return (0);

		if (bytes == 1)
			*eax = pci_get_cfgdata8(pi, coff);
		else if (bytes == 2)
			*eax = pci_get_cfgdata16(pi, coff);
		else
			*eax = pci_get_cfgdata32(pi, coff);
	} else {
		/* Let the device emulation override the default handler */
		if (pe->pe_cfgwrite != NULL &&
		    (*pe->pe_cfgwrite)(ctx, vcpu, pi, coff, bytes, *eax) == 0)
			return (0);

		/*
		 * Special handling for write to BAR registers
		 */
		if (coff >= PCIR_BAR(0) && coff < PCIR_BAR(PCI_BARMAX + 1)) {
			/*
			 * Ignore writes to BAR registers that are not
			 * 4-byte aligned.
			 */
			if (bytes != 4 || (coff & 0x3) != 0)
				return (0);
			idx = (coff - PCIR_BAR(0)) / 4;
			/* echo back the sizing mask for the BAR's type */
			switch (pi->pi_bar[idx].type) {
			case PCIBAR_NONE:
				bar = 0;
				break;
			case PCIBAR_IO:
				mask = ~(pi->pi_bar[idx].size - 1);
				mask &= PCIM_BAR_IO_BASE;
				bar = (*eax & mask) | PCIM_BAR_IO_SPACE;
				break;
			case PCIBAR_MEM32:
				mask = ~(pi->pi_bar[idx].size - 1);
				mask &= PCIM_BAR_MEM_BASE;
				bar = *eax & mask;
				bar |= PCIM_BAR_MEM_SPACE | PCIM_BAR_MEM_32;
				break;
			case PCIBAR_MEM64:
				mask = ~(pi->pi_bar[idx].size - 1);
				mask &= PCIM_BAR_MEM_BASE;
				bar = *eax & mask;
				bar |= PCIM_BAR_MEM_SPACE | PCIM_BAR_MEM_64 |
				       PCIM_BAR_MEM_PREFETCH;
				break;
			case PCIBAR_MEMHI64:
				/* size mask for the high dword; idx-1 holds
				 * the 64-bit BAR this slot extends */
				mask = ~(pi->pi_bar[idx - 1].size - 1);
				mask &= PCIM_BAR_MEM_BASE;
				bar = ((uint64_t)*eax << 32) & mask;
				bar = bar >> 32;
				break;
			default:
				assert(0);
			}
			pci_set_cfgdata32(pi, coff, bar);
		} else if (pci_emul_iscap(pi, coff)) {
			pci_emul_capwrite(pi, coff, bytes, *eax);
		} else {
			CFGWRITE(pi, coff, *eax, bytes);
		}
	}

	return (0);
}
INOUT_PORT(pci_cfgdata, CONF1_DATA_PORT+0, IOPORT_F_INOUT, pci_emul_cfgdata);
INOUT_PORT(pci_cfgdata, CONF1_DATA_PORT+1, IOPORT_F_INOUT, pci_emul_cfgdata);
INOUT_PORT(pci_cfgdata, CONF1_DATA_PORT+2, IOPORT_F_INOUT, pci_emul_cfgdata);
INOUT_PORT(pci_cfgdata, CONF1_DATA_PORT+3, IOPORT_F_INOUT, pci_emul_cfgdata);
/*
 * I/O ports to configure PCI IRQ routing. We ignore all writes to it.
 */
static int
pci_irq_port_handler(struct vmctx *ctx, int vcpu, int in, int port, int bytes,
		     uint32_t *eax, void *arg)
{
	/* registered write-only, so reads should never arrive here */
	assert(in == 0);
	return (0);
}
INOUT_PORT(pci_irq, 0xC00, IOPORT_F_OUT, pci_irq_port_handler);
INOUT_PORT(pci_irq, 0xC01, IOPORT_F_OUT, pci_irq_port_handler);
#define PCI_EMUL_TEST
#ifdef PCI_EMUL_TEST
/*
 * Define a dummy test device
 */
#define DREGSZ 20		/* size of the register window in bytes */
struct pci_emul_dsoftc {
	uint8_t regs[DREGSZ];	/* backing store for BAR 0 registers */
};

#define PCI_EMUL_MSGS 4		/* MSI messages advertised by the device */
/*
 * Instance init for the dummy test device: vendor 0x10DD, device
 * 0x0001, network class, one I/O BAR of DREGSZ bytes and an MSI
 * capability.  Returns 0 on success, 1 if softc allocation fails.
 */
int
pci_emul_dinit(struct vmctx *ctx, struct pci_devinst *pi, char *opts)
{
	int error;
	struct pci_emul_dsoftc *sc;

	/* calloc() zeroes the softc and, unlike the original malloc,
	 * its result is checked before use */
	sc = calloc(1, sizeof(struct pci_emul_dsoftc));
	if (sc == NULL)
		return (1);

	pi->pi_arg = sc;

	pci_set_cfgdata16(pi, PCIR_DEVICE, 0x0001);
	pci_set_cfgdata16(pi, PCIR_VENDOR, 0x10DD);
	pci_set_cfgdata8(pi, PCIR_CLASS, 0x02);

	error = pci_emul_alloc_bar(pi, 0, 0, PCIBAR_IO, DREGSZ);
	assert(error == 0);

	error = pci_emul_add_msicap(pi, PCI_EMUL_MSGS);
	assert(error == 0);

	return (0);
}
/*
 * I/O write handler for the dummy device's BAR 0 register window.
 * Stores 'value' at 'offset' with the given access width, then fires
 * MSI interrupts for the magic trigger values.
 */
void
pci_emul_diow(struct pci_devinst *pi, int baridx, int offset, int size,
	      uint32_t value)
{
	int i;
	struct pci_emul_dsoftc *sc = pi->pi_arg;

	if (offset + size > DREGSZ) {
		printf("diow: too large, offset %d size %d\n", offset, size);
		return;
	}

	if (size == 1) {
		sc->regs[offset] = value & 0xff;
	} else if (size == 2) {
		uint16_t v16 = value & 0xffff;

		/* memcpy replaces a potentially misaligned, aliasing-
		 * unsafe (uint16_t *) cast store */
		memcpy(&sc->regs[offset], &v16, sizeof(v16));
	} else {
		uint32_t v32 = value;

		memcpy(&sc->regs[offset], &v32, sizeof(v32));
	}

	/*
	 * Special magic value to generate an interrupt
	 */
	if (offset == 4 && size == 4 && pci_msi_enabled(pi))
		pci_generate_msi(pi, value % pci_msi_msgnum(pi));

	if (value == 0xabcdef) {
		for (i = 0; i < pci_msi_msgnum(pi); i++)
			pci_generate_msi(pi, i);
	}
}
/*
 * I/O read handler for the dummy device's BAR 0 register window:
 * return the stored value at 'offset' for the given access width,
 * or 0 if the access overruns the window.
 */
uint32_t
pci_emul_dior(struct pci_devinst *pi, int baridx, int offset, int size)
{
	struct pci_emul_dsoftc *sc = pi->pi_arg;
	uint32_t value;

	if (offset + size > DREGSZ) {
		printf("dior: too large, offset %d size %d\n", offset, size);
		return (0);
	}

	if (size == 1) {
		value = sc->regs[offset];
	} else if (size == 2) {
		uint16_t v16;

		/* memcpy replaces a potentially misaligned, aliasing-
		 * unsafe (uint16_t *) cast load */
		memcpy(&v16, &sc->regs[offset], sizeof(v16));
		value = v16;
	} else {
		uint32_t v32;

		memcpy(&v32, &sc->regs[offset], sizeof(v32));
		value = v32;
	}

	return (value);
}
/* Emulation ops for the "dummy" test device, added to the linker set */
struct pci_devemu pci_dummy = {
	.pe_emu = "dummy",
	.pe_init = pci_emul_dinit,
	.pe_iow = pci_emul_diow,
	.pe_ior = pci_emul_dior
};
PCI_EMUL_SET(pci_dummy);

#endif /* PCI_EMUL_TEST */

171
usr.sbin/bhyve/pci_emul.h Normal file
View File

@ -0,0 +1,171 @@
/*-
* Copyright (c) 2011 NetApp, Inc.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* $FreeBSD$
*/
#ifndef _PCI_EMUL_H_
#define _PCI_EMUL_H_
#include <sys/types.h>
#include <sys/queue.h>
#include <sys/kernel.h>
#include <dev/pci/pcireg.h>
#include <assert.h>
#define PCI_BARMAX PCIR_MAX_BAR_0 /* BAR registers in a Type 0 header */
#define PCIY_RESERVED 0x00
struct vmctx;
struct pci_devinst;
struct pci_devemu {
char *pe_emu; /* Name of device emulation */
/* instance creation */
int (*pe_init)(struct vmctx *, struct pci_devinst *, char *opts);
/* config space read/write callbacks */
int (*pe_cfgwrite)(struct vmctx *ctx, int vcpu,
struct pci_devinst *pi, int offset,
int bytes, uint32_t val);
int (*pe_cfgread)(struct vmctx *ctx, int vcpu,
struct pci_devinst *pi, int offset,
int bytes, uint32_t *retval);
/* I/O space read/write callbacks */
void (*pe_iow)(struct pci_devinst *pi, int baridx,
int offset, int size, uint32_t value);
uint32_t (*pe_ior)(struct pci_devinst *pi, int baridx,
int offset, int size);
};
#define PCI_EMUL_SET(x) DATA_SET(pci_devemu_set, x);
enum pcibar_type {
PCIBAR_NONE,
PCIBAR_IO,
PCIBAR_MEM32,
PCIBAR_MEM64,
PCIBAR_MEMHI64
};
struct pcibar {
enum pcibar_type type; /* io or memory */
uint64_t size;
uint64_t addr;
};
#define PI_NAMESZ 40
/*
 * Per-instance state of an emulated PCI device: its location on the
 * guest bus, the cached config space, BAR allocations and MSI state.
 */
struct pci_devinst {
	struct pci_devemu *pi_d;	/* backing device emulation */
	struct vmctx *pi_vmctx;		/* VM this device belongs to */
	uint8_t	pi_bus, pi_slot, pi_func;	/* guest-visible location */
	char	pi_name[PI_NAMESZ];
	uint16_t pi_iobase;
	int	pi_bar_getsize;	/* nonzero during a BAR sizing cycle -- TODO confirm */

	struct {
		int	enabled;	/* guest has enabled MSI */
		int	cpu;		/* MSI destination */
		int	vector;
		int	msgnum;
	} pi_msi;

	void	*pi_arg;		/* devemu-private data */
	u_char	pi_cfgdata[PCI_REGMAX + 1];	/* shadow config space */
	struct pcibar pi_bar[PCI_BARMAX + 1];
};
/*
 * MSI capability as it appears in config space (64-bit address
 * variant).  Packed: the layout must match the PCI specification
 * byte-for-byte since it is copied into pi_cfgdata directly.
 */
struct msicap {
	uint8_t		capid;		/* PCIY_MSI */
	uint8_t		nextptr;	/* link to next capability */
	uint16_t	msgctrl;
	uint32_t	addrlo;
	uint32_t	addrhi;
	uint16_t	msgdata;
} __packed;
void init_pci(struct vmctx *ctx);
void pci_parse_slot(char *opt);
void pci_parse_name(char *opt);
void pci_callback(void);
int pci_emul_alloc_bar(struct pci_devinst *pdi, int idx, uint64_t hostbase,
enum pcibar_type type, uint64_t size);
int pci_emul_add_msicap(struct pci_devinst *pi, int msgnum);
void msicap_cfgwrite(struct pci_devinst *pi, int capoff, int offset,
int bytes, uint32_t val);
void pci_generate_msi(struct pci_devinst *pi, int msgnum);
int pci_msi_enabled(struct pci_devinst *pi);
int pci_msi_msgnum(struct pci_devinst *pi);
void pci_populate_msicap(struct msicap *cap, int msgs, int nextptr);
/* Store one byte into the shadow config space at 'offset'. */
static __inline void
pci_set_cfgdata8(struct pci_devinst *pi, int offset, uint8_t val)
{
	assert(offset <= PCI_REGMAX);
	pi->pi_cfgdata[offset] = val;
}
/* Store a 16-bit value; 'offset' must be 2-byte aligned. */
static __inline void
pci_set_cfgdata16(struct pci_devinst *pi, int offset, uint16_t val)
{
	uint16_t *slot;

	assert(offset <= (PCI_REGMAX - 1) && (offset & 1) == 0);
	slot = (uint16_t *)&pi->pi_cfgdata[offset];
	*slot = val;
}
/* Store a 32-bit value; 'offset' must be 4-byte aligned. */
static __inline void
pci_set_cfgdata32(struct pci_devinst *pi, int offset, uint32_t val)
{
	uint32_t *slot;

	assert(offset <= (PCI_REGMAX - 3) && (offset & 3) == 0);
	slot = (uint32_t *)&pi->pi_cfgdata[offset];
	*slot = val;
}
/* Fetch one byte from the shadow config space at 'offset'. */
static __inline uint8_t
pci_get_cfgdata8(struct pci_devinst *pi, int offset)
{
	assert(offset <= PCI_REGMAX);
	return (pi->pi_cfgdata[offset]);
}
/* Fetch a 16-bit value; 'offset' must be 2-byte aligned. */
static __inline uint16_t
pci_get_cfgdata16(struct pci_devinst *pi, int offset)
{
	uint16_t *slot;

	assert(offset <= (PCI_REGMAX - 1) && (offset & 1) == 0);
	slot = (uint16_t *)&pi->pi_cfgdata[offset];
	return (*slot);
}
/* Fetch a 32-bit value; 'offset' must be 4-byte aligned. */
static __inline uint32_t
pci_get_cfgdata32(struct pci_devinst *pi, int offset)
{
	uint32_t *slot;

	assert(offset <= (PCI_REGMAX - 3) && (offset & 3) == 0);
	slot = (uint32_t *)&pi->pi_cfgdata[offset];
	return (*slot);
}
#endif /* _PCI_EMUL_H_ */

View File

@ -0,0 +1,52 @@
/*-
* Copyright (c) 2011 NetApp, Inc.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* $FreeBSD$
*/
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
#include "pci_emul.h"
static int
pci_hostbridge_init(struct vmctx *ctx, struct pci_devinst *pi, char *opts)
{
	/*
	 * Identify ourselves as a NetApp host bridge in config space.
	 * No BARs, capabilities or I/O handlers are required.
	 */
	pci_set_cfgdata16(pi, PCIR_VENDOR, 0x1275);	/* NetApp */
	pci_set_cfgdata16(pi, PCIR_DEVICE, 0x1275);	/* NetApp */
	pci_set_cfgdata8(pi, PCIR_CLASS, PCIC_BRIDGE);
	pci_set_cfgdata8(pi, PCIR_SUBCLASS, PCIS_BRIDGE_HOST);
	pci_set_cfgdata8(pi, PCIR_HDRTYPE, PCIM_HDRTYPE_BRIDGE);

	return (0);
}
/* Emulation descriptor; registered in the pci_devemu linker set. */
struct pci_devemu pci_de_hostbridge = {
	.pe_emu = "hostbridge",
	.pe_init = pci_hostbridge_init,
};
PCI_EMUL_SET(pci_de_hostbridge);

View File

@ -0,0 +1,508 @@
/*-
* Copyright (c) 2011 NetApp, Inc.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* $FreeBSD$
*/
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
#include <sys/param.h>
#include <sys/types.h>
#include <sys/pciio.h>
#include <sys/ioctl.h>
#include <dev/io/iodev.h>
#include <machine/iodev.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <errno.h>
#include <fcntl.h>
#include <unistd.h>
#include <machine/vmm.h>
#include <vmmapi.h>
#include "pci_emul.h"
#ifndef _PATH_DEVPCI
#define _PATH_DEVPCI "/dev/pci"
#endif
#ifndef _PATH_DEVIO
#define _PATH_DEVIO "/dev/io"
#endif
#define LEGACY_SUPPORT 1
/* Lazily-opened descriptors for /dev/pci and /dev/io, shared by all units. */
static int pcifd = -1;
static int iofd = -1;

/* Per-instance passthrough state. */
struct passthru_softc {
	struct pci_devinst *psc_pi;	/* emulated device we front for */
	struct pcibar psc_bar[PCI_BARMAX + 1];	/* the host device's real BARs */
	struct {
		int		capoff;		/* MSI capability offset (0 = none) */
		int		msgctrl;	/* cached MSI message control word */
		int		emulated;	/* capability synthesized, not real */
	} psc_msi;
	struct pcisel psc_sel;		/* host bus/slot/func selector */
};
/*
 * Length in bytes of an MSI capability with the given message-control
 * word: 10 bytes for the base structure, 4 more for 64-bit addressing.
 */
static int
msi_caplen(int msgctrl)
{
	int caplen = 10;	/* minimum length of msi capability */

	if (msgctrl & PCIM_MSICTRL_64BIT)
		caplen += 4;

#if 0
	/*
	 * Ignore the 'mask' and 'pending' bits in the MSI capability.
	 * We'll let the guest manipulate them directly.
	 */
	if (msgctrl & PCIM_MSICTRL_VECTOR)
		caplen += 10;
#endif

	return (caplen);
}
static uint32_t
read_config(const struct pcisel *sel, long reg, int width)
{
struct pci_io pi;
bzero(&pi, sizeof(pi));
pi.pi_sel = *sel;
pi.pi_reg = reg;
pi.pi_width = width;
if (ioctl(pcifd, PCIOCREAD, &pi) < 0)
return (0); /* XXX */
else
return (pi.pi_data);
}
static void
write_config(const struct pcisel *sel, long reg, int width, uint32_t data)
{
struct pci_io pi;
bzero(&pi, sizeof(pi));
pi.pi_sel = *sel;
pi.pi_reg = reg;
pi.pi_width = width;
pi.pi_data = data;
(void)ioctl(pcifd, PCIOCWRITE, &pi); /* XXX */
}
#ifdef LEGACY_SUPPORT
/*
 * Synthesize an MSI capability in the emulated config space, chained
 * in front of 'nextptr'.  Returns the offset the capability was
 * placed at.
 */
static int
passthru_add_msicap(struct pci_devinst *pi, int msgnum, int nextptr)
{
	struct msicap msicap;
	u_char *src;
	int capoff;
	size_t n;

	pci_populate_msicap(&msicap, msgnum, nextptr);

	/*
	 * XXX
	 * Copy the msi capability structure in the last 16 bytes of the
	 * config space. This is wrong because it could shadow something
	 * useful to the device.
	 */
	capoff = 256 - roundup(sizeof(msicap), 4);
	src = (u_char *)&msicap;
	for (n = 0; n < sizeof(msicap); n++)
		pci_set_cfgdata8(pi, capoff + n, src[n]);

	return (capoff);
}
#endif	/* LEGACY_SUPPORT */
/*
 * Walk the host device's capability list and mirror its MSI capability
 * into the emulated config space.  If the device has capabilities but
 * no MSI, and LEGACY_SUPPORT is enabled, an emulated MSI capability is
 * synthesized and linked at the head of the list.
 *
 * Returns 0 on success, -1 if no MSI capability could be found or
 * created ("MSI or bust").
 */
static int
cfginitmsi(struct passthru_softc *sc)
{
	int ptr, cap, sts, caplen;
	uint32_t u32;
	struct pcisel sel;
	struct pci_devinst *pi;

	pi = sc->psc_pi;
	sel = sc->psc_sel;

	/*
	 * Parse the capabilities and cache the location of the MSI
	 * capability.
	 */
	sts = read_config(&sel, PCIR_STATUS, 2);
	if (sts & PCIM_STATUS_CAPPRESENT) {
		ptr = read_config(&sel, PCIR_CAP_PTR, 1);
		while (ptr != 0 && ptr != 0xff) {
			cap = read_config(&sel, ptr + PCICAP_ID, 1);
			if (cap == PCIY_MSI) {
				/*
				 * Copy the MSI capability into the config
				 * space of the emulated pci device
				 */
				sc->psc_msi.capoff = ptr;
				sc->psc_msi.msgctrl = read_config(&sel,
								  ptr + 2, 2);
				sc->psc_msi.emulated = 0;
				caplen = msi_caplen(sc->psc_msi.msgctrl);
				/*
				 * Copies in 4-byte units, so up to 2 bytes
				 * past the capability may be copied as well.
				 */
				while (caplen > 0) {
					u32 = read_config(&sel, ptr, 4);
					pci_set_cfgdata32(pi, ptr, u32);
					caplen -= 4;
					ptr += 4;
				}
				break;
			}
			ptr = read_config(&sel, ptr + PCICAP_NEXTPTR, 1);
		}
	}

#ifdef LEGACY_SUPPORT
	/*
	 * If the passthrough device does not support MSI then craft a
	 * MSI capability for it. We link the new MSI capability at the
	 * head of the list of capabilities.
	 */
	if ((sts & PCIM_STATUS_CAPPRESENT) != 0 && sc->psc_msi.capoff == 0) {
		int origptr, msiptr;
		origptr = read_config(&sel, PCIR_CAP_PTR, 1);
		msiptr = passthru_add_msicap(pi, 1, origptr);
		sc->psc_msi.capoff = msiptr;
		sc->psc_msi.msgctrl = pci_get_cfgdata16(pi, msiptr + 2);
		sc->psc_msi.emulated = 1;
		pci_set_cfgdata8(pi, PCIR_CAP_PTR, msiptr);
	}
#endif

	if (sc->psc_msi.capoff == 0)	/* MSI or bust */
		return (-1);
	else
		return (0);
}
/*
 * Probe the host device's BARs and replicate them in the emulated
 * device: allocate a matching guest I/O or MMIO range for each, and
 * for memory BARs establish the guest-physical to host-physical MMIO
 * mapping.  Returns 0 on success, -1 on any allocation or map failure.
 */
static int
cfginitbar(struct vmctx *ctx, struct passthru_softc *sc)
{
	int i, error;
	struct pci_devinst *pi;
	struct pci_bar_io bar;
	enum pcibar_type bartype;
	uint64_t base;

	pi = sc->psc_pi;

	/*
	 * Initialize BAR registers
	 */
	for (i = 0; i <= PCI_BARMAX; i++) {
		bzero(&bar, sizeof(bar));
		bar.pbi_sel = sc->psc_sel;
		bar.pbi_reg = PCIR_BAR(i);

		/* Unimplemented BARs fail the ioctl and are skipped. */
		if (ioctl(pcifd, PCIOCGETBAR, &bar) < 0)
			continue;

		if (PCI_BAR_IO(bar.pbi_base)) {
			bartype = PCIBAR_IO;
			base = bar.pbi_base & PCIM_BAR_IO_BASE;
		} else {
			switch (bar.pbi_base & PCIM_BAR_MEM_TYPE) {
			case PCIM_BAR_MEM_64:
				bartype = PCIBAR_MEM64;
				break;
			default:
				bartype = PCIBAR_MEM32;
				break;
			}
			base = bar.pbi_base & PCIM_BAR_MEM_BASE;
		}

		/* Cache information about the "real" BAR */
		sc->psc_bar[i].type = bartype;
		sc->psc_bar[i].size = bar.pbi_length;
		sc->psc_bar[i].addr = base;

		/* Allocate the BAR in the guest I/O or MMIO space */
		error = pci_emul_alloc_bar(pi, i, base, bartype,
					   bar.pbi_length);
		if (error)
			return (-1);

		/*
		 * Map the physical MMIO space in the guest MMIO space
		 */
		if (bartype != PCIBAR_IO) {
			error = vm_map_pptdev_mmio(ctx, sc->psc_sel.pc_bus,
				sc->psc_sel.pc_dev, sc->psc_sel.pc_func,
				pi->pi_bar[i].addr, pi->pi_bar[i].size, base);
			if (error)
				return (-1);
		}

		/*
		 * 64-bit BAR takes up two slots so skip the next one.
		 */
		if (bartype == PCIBAR_MEM64) {
			i++;
			assert(i <= PCI_BARMAX);
			sc->psc_bar[i].type = PCIBAR_MEMHI64;
		}
	}
	return (0);
}
/*
 * Record the host selector for this device and initialize the BARs and
 * MSI state.  Returns 0 on success, 1 on failure.
 */
static int
cfginit(struct vmctx *ctx, struct pci_devinst *pi, int bus, int slot, int func)
{
	struct passthru_softc *sc;

	sc = pi->pi_arg;

	bzero(&sc->psc_sel, sizeof(struct pcisel));
	sc->psc_sel.pc_bus = bus;
	sc->psc_sel.pc_dev = slot;
	sc->psc_sel.pc_func = func;

	/* Both the BARs and the MSI capability must initialize cleanly. */
	if (cfginitbar(ctx, sc) != 0 || cfginitmsi(sc) != 0)
		return (1);

	return (0);
}
/*
 * Instance-creation callback for a passthrough device.  'opts' selects
 * the host device as "<bus>/<slot>/<func>".  Returns 0 on success,
 * 1 on failure.
 *
 * Fixes over the original: vm_unassign_pptdev() is only called after a
 * successful vm_assign_pptdev() (the old common error path could pass
 * uninitialized bus/slot/func when opts failed to parse), and the
 * allocation is checked.
 */
static int
passthru_init(struct vmctx *ctx, struct pci_devinst *pi, char *opts)
{
	int bus, slot, func;
	struct passthru_softc *sc;

	/* Lazily open the host config-space and I/O-port devices. */
	if (pcifd < 0) {
		pcifd = open(_PATH_DEVPCI, O_RDWR, 0);
		if (pcifd < 0)
			return (1);
	}

	if (iofd < 0) {
		iofd = open(_PATH_DEVIO, O_RDWR, 0);
		if (iofd < 0)
			return (1);
	}

	if (opts == NULL || sscanf(opts, "%d/%d/%d", &bus, &slot, &func) != 3)
		return (1);

	if (vm_assign_pptdev(ctx, bus, slot, func) != 0)
		return (1);

	sc = calloc(1, sizeof(struct passthru_softc));
	if (sc == NULL) {
		vm_unassign_pptdev(ctx, bus, slot, func);
		return (1);
	}

	pi->pi_arg = sc;
	sc->psc_pi = pi;

	/* initialize config space */
	if (cfginit(ctx, pi, bus, slot, func) != 0) {
		pi->pi_arg = NULL;
		free(sc);
		vm_unassign_pptdev(ctx, bus, slot, func);
		return (1);
	}

	return (0);
}
/* True iff config offset 'coff' falls within the type-0 BAR registers. */
static int
bar_access(int coff)
{
	return (coff >= PCIR_BAR(0) && coff < PCIR_BAR(PCI_BARMAX + 1));
}
/* True iff config offset 'coff' lies inside the device's MSI capability. */
static int
msicap_access(struct passthru_softc *sc, int coff)
{
	int start, len;

	/* No MSI capability was found or synthesized for this device. */
	if (sc->psc_msi.capoff == 0)
		return (0);

	start = sc->psc_msi.capoff;
	len = msi_caplen(sc->psc_msi.msgctrl);

	return (coff >= start && coff < start + len);
}
/*
 * Config-space read handler.  Returns -1 for ranges that are emulated
 * rather than passed through (the BARs, the MSI capability and - when
 * the capability was synthesized - the capability pointer); a -1 return
 * presumably tells the generic PCI emulation to service the access
 * itself -- TODO confirm against the pci_emul dispatch code.  All other
 * offsets are read directly from the host device; returns 0.
 */
static int
passthru_cfgread(struct vmctx *ctx, int vcpu, struct pci_devinst *pi, int coff,
		 int bytes, uint32_t *rv)
{
	struct passthru_softc *sc;

	sc = pi->pi_arg;

	/*
	 * PCI BARs and MSI capability is emulated.
	 */
	if (bar_access(coff) || msicap_access(sc, coff))
		return (-1);

#ifdef LEGACY_SUPPORT
	/*
	 * Emulate PCIR_CAP_PTR if this device does not support MSI capability
	 * natively.
	 */
	if (sc->psc_msi.emulated) {
		if (coff >= PCIR_CAP_PTR && coff < PCIR_CAP_PTR + 4)
			return (-1);
	}
#endif

	/* Everything else just read from the device's config space */
	*rv = read_config(&sc->psc_sel, coff, bytes);

	return (0);
}
/*
 * Config-space write handler.  BAR writes are deferred to the generic
 * emulation (-1 return); MSI capability writes update the emulated
 * capability and reprogram the host MSI routing; everything else is
 * written straight through to the host device.
 */
static int
passthru_cfgwrite(struct vmctx *ctx, int vcpu, struct pci_devinst *pi, int coff,
		  int bytes, uint32_t val)
{
	int error;
	struct passthru_softc *sc;

	sc = pi->pi_arg;

	/*
	 * PCI BARs are emulated
	 */
	if (bar_access(coff))
		return (-1);

	/*
	 * MSI capability is emulated
	 */
	if (msicap_access(sc, coff)) {
		msicap_cfgwrite(pi, sc->psc_msi.capoff, coff, bytes, val);

		error = vm_setup_msi(ctx, vcpu, sc->psc_sel.pc_bus,
			sc->psc_sel.pc_dev, sc->psc_sel.pc_func, pi->pi_msi.cpu,
			pi->pi_msi.vector, pi->pi_msi.msgnum);
		if (error != 0) {
			/*
			 * NOTE(review): the message says "returned error" but
			 * prints errno, not 'error' -- confirm which was meant.
			 */
			printf("vm_setup_msi returned error %d\r\n", errno);
			exit(1);
		}
		return (0);
	}

#ifdef LEGACY_SUPPORT
	/*
	 * If this device does not support MSI natively then we cannot let
	 * the guest disable legacy interrupts from the device. It is the
	 * legacy interrupt that is triggering the virtual MSI to the guest.
	 */
	if (sc->psc_msi.emulated && pci_msi_enabled(pi)) {
		if (coff == PCIR_COMMAND && bytes == 2)
			val &= ~PCIM_CMD_INTxDIS;
	}
#endif

	write_config(&sc->psc_sel, coff, bytes, val);

	return (0);
}
static void
passthru_iow(struct pci_devinst *pi, int baridx, int offset, int size,
uint32_t value)
{
struct passthru_softc *sc;
struct iodev_pio_req pio;
sc = pi->pi_arg;
bzero(&pio, sizeof(struct iodev_pio_req));
pio.access = IODEV_PIO_WRITE;
pio.port = sc->psc_bar[baridx].addr + offset;
pio.width = size;
pio.val = value;
(void)ioctl(iofd, IODEV_PIO, &pio);
}
static uint32_t
passthru_ior(struct pci_devinst *pi, int baridx, int offset, int size)
{
struct passthru_softc *sc;
struct iodev_pio_req pio;
sc = pi->pi_arg;
bzero(&pio, sizeof(struct iodev_pio_req));
pio.access = IODEV_PIO_READ;
pio.port = sc->psc_bar[baridx].addr + offset;
pio.width = size;
pio.val = 0;
(void)ioctl(iofd, IODEV_PIO, &pio);
return (pio.val);
}
/*
 * Emulation descriptor; registered in the pci_devemu linker set.
 * NOTE(review): unlike the other devemu descriptors this one is not
 * named "pci_de_*" and is not static -- consider aligning for
 * consistency.
 */
struct pci_devemu passthru = {
	.pe_emu		= "passthru",
	.pe_init	= passthru_init,
	.pe_cfgwrite	= passthru_cfgwrite,
	.pe_cfgread	= passthru_cfgread,
	.pe_iow		= passthru_iow,
	.pe_ior		= passthru_ior,
};
PCI_EMUL_SET(passthru);

View File

@ -0,0 +1,502 @@
/*-
* Copyright (c) 2011 NetApp, Inc.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* $FreeBSD$
*/
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
#include <sys/param.h>
#include <sys/linker_set.h>
#include <sys/stat.h>
#include <sys/uio.h>
#include <sys/ioctl.h>
#include <errno.h>
#include <fcntl.h>
#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>
#include <string.h>
#include <strings.h>
#include <unistd.h>
#include <assert.h>
#include <pthread.h>
#include "fbsdrun.h"
#include "pci_emul.h"
#include "virtio.h"
#define VTBLK_RINGSZ 64
#define VTBLK_CFGSZ 28
#define VTBLK_R_CFG VTCFG_R_CFG0
#define VTBLK_R_CFG_END VTBLK_R_CFG + VTBLK_CFGSZ -1
#define VTBLK_R_MAX VTBLK_R_CFG_END
#define VTBLK_REGSZ VTBLK_R_MAX+1
#define VTBLK_MAXSEGS 32
#define VTBLK_S_OK 0
#define VTBLK_S_IOERR 1
/*
* Host capabilities
*/
#define VTBLK_S_HOSTCAPS \
( 0x00000004 | /* host maximum request segments */ \
0x10000000 ) /* supports indirect descriptors */
/*
 * Host-side view of one virtio ring.  The pointers are host-mapped
 * addresses into the page-aligned ring the guest supplied via the PFN
 * register.  hq_cur_aidx trails the guest's avail index; both wrap
 * modulo 2^16.
 */
struct vring_hqueue {
	/* Internal state */
	uint16_t	hq_size;
	uint16_t	hq_cur_aidx;		/* trails behind 'avail_idx' */

	/* Host-context pointers to the queue */
	struct virtio_desc *hq_dtable;
	uint16_t	*hq_avail_flags;
	uint16_t	*hq_avail_idx;		/* monotonically increasing */
	uint16_t	*hq_avail_ring;

	uint16_t	*hq_used_flags;
	uint16_t	*hq_used_idx;		/* monotonically increasing */
	struct virtio_used *hq_used_ring;
};
/*
 * Config space: the virtio-block device configuration as exposed to
 * the guest.  The CTASSERT pins the layout to the VTBLK_CFGSZ bytes
 * the register window advertises.
 */
struct vtblk_config {
	uint64_t	vbc_capacity;	/* device size in 512-byte sectors */
	uint32_t	vbc_size_max;
	uint32_t	vbc_seg_max;	/* max data segments per request */
	uint16_t	vbc_geom_c;	/* geometry: cylinders/heads/sectors */
	uint8_t		vbc_geom_h;
	uint8_t		vbc_geom_s;
	uint32_t	vbc_blk_size;
	uint32_t	vbc_sectors_max;
} __packed;
CTASSERT(sizeof(struct vtblk_config) == VTBLK_CFGSZ);
/*
 * Fixed-size block header: the read-only first segment of every
 * virtio-block request, giving the operation and starting sector.
 */
struct virtio_blk_hdr {
#define VBH_OP_READ	0
#define VBH_OP_WRITE	1
	uint32_t	vbh_type;	/* VBH_OP_READ or VBH_OP_WRITE */
	uint32_t	vbh_ioprio;
	uint64_t	vbh_sector;	/* starting sector (512-byte units) */
} __packed;
/*
 * Debug printf.  DPRINTF is wrapped in do/while(0) so that it expands
 * to a single statement: the original bare "if (debug) printf params"
 * form is a dangling-else hazard when used inside an if/else.
 */
static int pci_vtblk_debug;
#define DPRINTF(params) do {						\
	if (pci_vtblk_debug)						\
		printf params;						\
} while (0)
#define WPRINTF(params) printf params
/*
 * Per-device softc
 */
struct pci_vtblk_softc {
	struct pci_devinst *vbsc_pi;	/* backing PCI device instance */
	int		vbsc_fd;	/* fd of the backing file/device */
	int		vbsc_status;	/* virtio status register */
	int		vbsc_isr;	/* interrupt status, cleared on read */
	int		vbsc_lastq;	/* last queue selected via QSEL */
	uint32_t	vbsc_features;	/* features acked by the guest */
	uint64_t	vbsc_pfn;	/* guest-physical ring address */
	struct vring_hqueue vbsc_q;	/* the single request queue */
	struct vtblk_config vbsc_cfg;	/* device config space contents */
};
/*
 * Number of descriptors the guest has posted on the avail ring that
 * the host has not yet consumed.  Both indices wrap modulo 2^16, so
 * the difference is computed with 16-bit modular arithmetic.
 */
static int
hq_num_avail(struct vring_hqueue *hq)
{
	uint16_t avail, cur;
	int ndesc;

	avail = *hq->hq_avail_idx;
	cur = hq->hq_cur_aidx;
	ndesc = (uint16_t)(avail - cur);

	assert(ndesc >= 0 && ndesc <= hq->hq_size);

	return (ndesc);
}
/* Record a guest write to the virtio status register. */
static void
pci_vtblk_update_status(struct pci_vtblk_softc *sc, uint32_t value)
{
	/* Writing zero to the status register is a device-reset request. */
	if (value == 0) {
		DPRINTF(("vtblk: device reset requested !\n"));
	}

	sc->vbsc_status = value;
}
/*
 * Consume one request from the avail ring and execute it synchronously.
 *
 * Each request is a single indirect descriptor whose table contains, in
 * order: a read-only virtio_blk_hdr, one or more data buffers, and a
 * final write-only status byte.  The data buffers are gathered into an
 * iovec and issued as one preadv()/pwritev() on the backing file, the
 * status byte is set to VTBLK_S_OK or VTBLK_S_IOERR, and the indirect
 * descriptor is returned on the used ring.
 *
 * NOTE(review): guest-controlled values (indirect flag, nsegs, didx,
 * per-segment flags) are validated with assert() only; in an NDEBUG
 * build a misbehaving guest would not be caught.
 */
static void
pci_vtblk_proc(struct pci_vtblk_softc *sc, struct vring_hqueue *hq)
{
	struct iovec iov[VTBLK_MAXSEGS];
	struct virtio_blk_hdr *vbh;
	struct virtio_desc *vd, *vid;
	struct virtio_used *vu;
	uint8_t *status;
	int i;
	int err;
	int iolen;
	int nsegs;
	int uidx, aidx, didx;
	int writeop;
	off_t offset;

	uidx = *hq->hq_used_idx;
	aidx = hq->hq_cur_aidx;
	didx = hq->hq_avail_ring[aidx % hq->hq_size];
	assert(didx >= 0 && didx < hq->hq_size);

	vd = &hq->hq_dtable[didx];

	/*
	 * Verify that the descriptor is indirect, and obtain
	 * the pointer to the indirect descriptor.
	 * There has to be space for at least 3 descriptors
	 * in the indirect descriptor array: the block header,
	 * 1 or more data descriptors, and a status byte.
	 */
	assert(vd->vd_flags & VRING_DESC_F_INDIRECT);

	nsegs = vd->vd_len / sizeof(struct virtio_desc);
	assert(nsegs >= 3);
	assert(nsegs < VTBLK_MAXSEGS + 2);

	vid = paddr_guest2host(vd->vd_addr);
	assert((vid->vd_flags & VRING_DESC_F_INDIRECT) == 0);

	/*
	 * The first descriptor will be the read-only fixed header
	 */
	vbh = paddr_guest2host(vid[0].vd_addr);
	assert(vid[0].vd_len == sizeof(struct virtio_blk_hdr));
	assert(vid[0].vd_flags & VRING_DESC_F_NEXT);
	assert((vid[0].vd_flags & VRING_DESC_F_WRITE) == 0);

	writeop = (vbh->vbh_type == VBH_OP_WRITE);

	/* Sector number to byte offset in the backing file. */
	offset = vbh->vbh_sector * DEV_BSIZE;

	/*
	 * Build up the iovec based on the guest's data descriptors
	 */
	for (i = 1, iolen = 0; i < nsegs - 1; i++) {
		iov[i-1].iov_base = paddr_guest2host(vid[i].vd_addr);
		iov[i-1].iov_len = vid[i].vd_len;
		iolen += vid[i].vd_len;

		assert(vid[i].vd_flags & VRING_DESC_F_NEXT);
		assert((vid[i].vd_flags & VRING_DESC_F_INDIRECT) == 0);

		/*
		 * - write op implies read-only descriptor,
		 * - read op implies write-only descriptor,
		 * therefore test the inverse of the descriptor bit
		 * to the op.
		 */
		assert(((vid[i].vd_flags & VRING_DESC_F_WRITE) == 0) ==
		       writeop);
	}

	/* Lastly, get the address of the status byte */
	status = paddr_guest2host(vid[nsegs - 1].vd_addr);
	assert(vid[nsegs - 1].vd_len == 1);
	assert((vid[nsegs - 1].vd_flags & VRING_DESC_F_NEXT) == 0);
	assert(vid[nsegs - 1].vd_flags & VRING_DESC_F_WRITE);

	DPRINTF(("virtio-block: %s op, %d bytes, %d segs, offset %ld\n\r",
		 writeop ? "write" : "read", iolen, nsegs - 2, offset));

	if (writeop){
		err = pwritev(sc->vbsc_fd, iov, nsegs - 2, offset);
	} else {
		err = preadv(sc->vbsc_fd, iov, nsegs - 2, offset);
	}

	*status = err < 0 ? VTBLK_S_IOERR : VTBLK_S_OK;

	/*
	 * Return the single indirect descriptor back to the host
	 */
	vu = &hq->hq_used_ring[uidx % hq->hq_size];
	vu->vu_idx = didx;
	vu->vu_tlen = 1;
	hq->hq_cur_aidx++;
	*hq->hq_used_idx += 1;
}
/*
 * Queue-notify handler: drain every pending request from the avail
 * ring, then raise an MSI unless the guest suppressed interrupts or
 * one is already outstanding.
 */
static void
pci_vtblk_qnotify(struct pci_vtblk_softc *sc)
{
	struct vring_hqueue *hq;
	int pending;

	hq = &sc->vbsc_q;

	pending = hq_num_avail(hq);
	if (pending == 0)
		return;

	while (pending-- > 0)
		pci_vtblk_proc(sc, hq);

	if (sc->vbsc_isr == 0 &&
	    (*hq->hq_avail_flags & VRING_AVAIL_F_NO_INTERRUPT) == 0) {
		sc->vbsc_isr = 1;
		pci_generate_msi(sc->vbsc_pi, 0);
	}
}
/*
 * Called when the guest writes the ring's page frame number.  Set up
 * host pointers to the ring components.  Per the legacy virtio layout
 * the ring is: descriptor table, then the avail ring (flags, idx and
 * hq_size ring entries), then - at the next VRING_ALIGN boundary -
 * the used ring.
 */
static void
pci_vtblk_ring_init(struct pci_vtblk_softc *sc, uint64_t pfn)
{
	struct vring_hqueue *hq;

	sc->vbsc_pfn = pfn << VRING_PFN;

	hq = &sc->vbsc_q;
	hq->hq_size = VTBLK_RINGSZ;

	hq->hq_dtable = paddr_guest2host(pfn << VRING_PFN);
	hq->hq_avail_flags = (uint16_t *)(hq->hq_dtable + hq->hq_size);
	hq->hq_avail_idx = hq->hq_avail_flags + 1;
	hq->hq_avail_ring = hq->hq_avail_flags + 2;
	/*
	 * BUG FIX: align up from the *end* of the avail ring (which has
	 * hq_size entries), not its start; the previous computation put
	 * the host's view of the used ring inside the guest's avail ring.
	 */
	hq->hq_used_flags = (uint16_t *)roundup2(
	    (uintptr_t)(hq->hq_avail_ring + hq->hq_size), VRING_ALIGN);
	hq->hq_used_idx = hq->hq_used_flags + 1;
	hq->hq_used_ring = (struct virtio_used *)(hq->hq_used_flags + 2);

	/*
	 * Initialize queue indexes
	 */
	hq->hq_cur_aidx = 0;
}
/*
 * Instance-creation callback.  'opts' names the backing file or
 * device.  Returns 0 on success, 1 on failure.
 *
 * Fix over the original: the softc allocation is checked, and the
 * backing fd is closed on that failure path (it was leaked before).
 */
static int
pci_vtblk_init(struct vmctx *ctx, struct pci_devinst *pi, char *opts)
{
	struct stat sbuf;
	struct pci_vtblk_softc *sc;
	int fd;

	if (opts == NULL) {
		printf("virtio-block: backing device required\n");
		return (1);
	}

	/*
	 * Access to guest memory is required. Fail if
	 * memory not mapped
	 */
	if (paddr_guest2host(0) == NULL)
		return (1);

	/*
	 * The supplied backing file has to exist
	 */
	fd = open(opts, O_RDWR);
	if (fd < 0) {
		perror("Could not open backing file");
		return (1);
	}

	if (fstat(fd, &sbuf) < 0) {
		perror("Could not stat backing file");
		close(fd);
		return (1);
	}

	sc = calloc(1, sizeof(struct pci_vtblk_softc));
	if (sc == NULL) {
		perror("Could not allocate virtio-block state");
		close(fd);
		return (1);
	}

	pi->pi_arg = sc;
	sc->vbsc_pi = pi;
	sc->vbsc_fd = fd;

	/* setup virtio block config space */
	sc->vbsc_cfg.vbc_capacity = sbuf.st_size / DEV_BSIZE;
	sc->vbsc_cfg.vbc_seg_max = VTBLK_MAXSEGS;
	sc->vbsc_cfg.vbc_blk_size = DEV_BSIZE;
	sc->vbsc_cfg.vbc_size_max = 0;	/* not negotiated */
	sc->vbsc_cfg.vbc_geom_c = 0;	/* no geometry */
	sc->vbsc_cfg.vbc_geom_h = 0;
	sc->vbsc_cfg.vbc_geom_s = 0;
	sc->vbsc_cfg.vbc_sectors_max = 0;

	/* initialize config space */
	pci_set_cfgdata16(pi, PCIR_DEVICE, VIRTIO_DEV_BLOCK);
	pci_set_cfgdata16(pi, PCIR_VENDOR, VIRTIO_VENDOR);
	pci_set_cfgdata8(pi, PCIR_CLASS, PCIC_STORAGE);
	pci_set_cfgdata16(pi, PCIR_SUBDEV_0, VIRTIO_TYPE_BLOCK);
	pci_emul_alloc_bar(pi, 0, 0, PCIBAR_IO, VTBLK_REGSZ);
	pci_emul_add_msicap(pi, 1);

	return (0);
}
/*
 * Handler for guest writes to the virtio-block I/O BAR registers.
 * Access widths are enforced with assert(); out-of-range offsets and
 * writes to read-only registers are logged (when debugging) and
 * dropped.
 */
static void
pci_vtblk_write(struct pci_devinst *pi, int baridx, int offset, int size,
		uint32_t value)
{
	struct pci_vtblk_softc *sc = pi->pi_arg;

	if (offset + size > VTBLK_REGSZ) {
		DPRINTF(("vtblk_write: 2big, offset %d size %d\n",
			 offset, size));
		return;
	}

	switch (offset) {
	case VTCFG_R_GUESTCAP:
		assert(size == 4);
		/* Only advertise features the host actually supports. */
		sc->vbsc_features = value & VTBLK_S_HOSTCAPS;
		break;
	case VTCFG_R_PFN:
		assert(size == 4);
		pci_vtblk_ring_init(sc, value);
		break;
	case VTCFG_R_QSEL:
		assert(size == 2);
		sc->vbsc_lastq = value;
		break;
	case VTCFG_R_QNOTIFY:
		assert(size == 2);
		assert(value == 0);	/* single queue: index must be 0 */
		pci_vtblk_qnotify(sc);
		break;
	case VTCFG_R_STATUS:
		assert(size == 1);
		pci_vtblk_update_status(sc, value);
		break;
	case VTCFG_R_HOSTCAP:
	case VTCFG_R_QNUM:
	case VTCFG_R_ISR:
	case VTBLK_R_CFG ... VTBLK_R_CFG_END:
		DPRINTF(("vtblk: write to readonly reg %d\n\r", offset));
		break;
	default:
		DPRINTF(("vtblk: unknown i/o write offset %d\n\r", offset));
		value = 0;	/* NOTE(review): dead store to a local param */
		break;
	}
}
/*
 * Handler for guest reads of the virtio-block I/O BAR registers.
 * Access widths are enforced with assert(); out-of-range offsets read
 * as 0.  Reading the ISR register clears it, per the virtio PCI
 * contract.
 *
 * Made 'static' for consistency with every other handler in this file;
 * it is referenced only through pci_de_vblk below.
 */
static uint32_t
pci_vtblk_read(struct pci_devinst *pi, int baridx, int offset, int size)
{
	struct pci_vtblk_softc *sc = pi->pi_arg;
	uint32_t value;

	if (offset + size > VTBLK_REGSZ) {
		DPRINTF(("vtblk_read: 2big, offset %d size %d\n",
			 offset, size));
		return (0);
	}

	switch (offset) {
	case VTCFG_R_HOSTCAP:
		assert(size == 4);
		value = VTBLK_S_HOSTCAPS;
		break;
	case VTCFG_R_GUESTCAP:
		assert(size == 4);
		value = sc->vbsc_features; /* XXX never read ? */
		break;
	case VTCFG_R_PFN:
		assert(size == 4);
		value = sc->vbsc_pfn >> VRING_PFN;
		break;
	case VTCFG_R_QNUM:
		value = (sc->vbsc_lastq == 0) ? VTBLK_RINGSZ: 0;
		break;
	case VTCFG_R_QSEL:
		assert(size == 2);
		value = sc->vbsc_lastq; /* XXX never read ? */
		break;
	case VTCFG_R_QNOTIFY:
		assert(size == 2);
		value = 0; /* XXX never read ? */
		break;
	case VTCFG_R_STATUS:
		assert(size == 1);
		value = sc->vbsc_status;
		break;
	case VTCFG_R_ISR:
		assert(size == 1);
		value = sc->vbsc_isr;
		sc->vbsc_isr = 0;	/* a read clears this flag */
		break;
	case VTBLK_R_CFG ... VTBLK_R_CFG_END:
		assert(size == 1);
		value = *((uint8_t *)&sc->vbsc_cfg + offset - VTBLK_R_CFG);
		break;
	default:
		DPRINTF(("vtblk: unknown i/o read offset %d\n\r", offset));
		value = 0;
		break;
	}

	return (value);
}
/* Emulation descriptor; registered in the pci_devemu linker set. */
struct pci_devemu pci_de_vblk = {
	.pe_emu = "virtio-blk",
	.pe_init = pci_vtblk_init,
	.pe_iow = pci_vtblk_write,
	.pe_ior = pci_vtblk_read,
};
PCI_EMUL_SET(pci_de_vblk);

View File

@ -0,0 +1,739 @@
/*-
* Copyright (c) 2011 NetApp, Inc.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* $FreeBSD$
*/
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
#include <sys/param.h>
#include <sys/linker_set.h>
#include <sys/select.h>
#include <sys/uio.h>
#include <sys/ioctl.h>
#include <errno.h>
#include <fcntl.h>
#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>
#include <string.h>
#include <strings.h>
#include <unistd.h>
#include <assert.h>
#include <md5.h>
#include <pthread.h>
#include "fbsdrun.h"
#include "pci_emul.h"
#include "mevent.h"
#include "virtio.h"
#define VTNET_RINGSZ 256
#define VTNET_MAXSEGS 32
/*
* PCI config-space register offsets
*/
#define VTNET_R_CFG0 20
#define VTNET_R_CFG1 21
#define VTNET_R_CFG2 22
#define VTNET_R_CFG3 23
#define VTNET_R_CFG4 24
#define VTNET_R_CFG5 25
#define VTNET_R_CFG6 26
#define VTNET_R_CFG7 27
#define VTNET_R_MAX 27
#define VTNET_REGSZ VTNET_R_MAX+1
/*
* Host capabilities
*/
#define VTNET_S_HOSTCAPS \
( 0x00000020 | /* host supplies MAC */ \
0x00008000 | /* host can merge Rx buffers */ \
0x00010000 ) /* config status available */
/*
* Queue definitions.
*/
#define VTNET_RXQ 0
#define VTNET_TXQ 1
#define VTNET_CTLQ 2
#define VTNET_MAXQ 3
/*
 * Host-side view of one virtio ring; see pci_vtnet_ring_init for how
 * the pointers are derived from the guest-supplied PFN.
 * NOTE(review): this duplicates the definition in pci_virtio_block.c;
 * a candidate for a shared header.
 */
struct vring_hqueue {
	/* Internal state */
	uint16_t	hq_size;
	uint16_t	hq_cur_aidx;		/* trails behind 'avail_idx' */

	/* Host-context pointers to the queue */
	struct virtio_desc *hq_dtable;
	uint16_t	*hq_avail_flags;
	uint16_t	*hq_avail_idx;		/* monotonically increasing */
	uint16_t	*hq_avail_ring;

	uint16_t	*hq_used_flags;
	uint16_t	*hq_used_idx;		/* monotonically increasing */
	struct virtio_used *hq_used_ring;
};
/*
 * Fixed network header size: prepended to every received frame.  Only
 * vrh_bufs is filled in by this emulation (see pci_vtnet_tap_rx); the
 * trailing vrh_bufs field presumably corresponds to the mergeable-rx-
 * buffers "num_buffers" field -- TODO confirm against the virtio spec.
 */
struct virtio_net_rxhdr {
	uint8_t		vrh_flags;
	uint8_t		vrh_gso_type;
	uint16_t	vrh_hdr_len;
	uint16_t	vrh_gso_size;
	uint16_t	vrh_csum_start;
	uint16_t	vrh_csum_offset;
	uint16_t	vrh_bufs;	/* number of merged rx buffers */
} __packed;
/*
 * Debug printf.  DPRINTF is wrapped in do/while(0) so that it expands
 * to a single statement: the original bare "if (debug) printf params"
 * form is a dangling-else hazard when used inside an if/else.
 */
static int pci_vtnet_debug;
#define DPRINTF(params) do {						\
	if (pci_vtnet_debug)						\
		printf params;						\
} while (0)
#define WPRINTF(params) printf params
/*
 * Per-device softc
 */
struct pci_vtnet_softc {
	struct pci_devinst *vsc_pi;	/* backing PCI device instance */
	pthread_mutex_t vsc_mtx;
	struct mevent	*vsc_mevp;	/* tap read event registration */

	int		vsc_curq;
	int		vsc_status;	/* virtio status register */
	int		vsc_isr;	/* interrupt status, cleared on read */
	int		vsc_tapfd;	/* tap device fd, -1 if none */
	int		vsc_rx_ready;	/* rx ring has been set up */
	int		vsc_rxpend;	/* rx stalled waiting for buffers */

	uint32_t	vsc_features;	/* features acked by the guest */
	uint8_t		vsc_macaddr[6];

	uint64_t	vsc_pfn[VTNET_MAXQ];	/* guest ring addresses */
	struct vring_hqueue vsc_hq[VTNET_MAXQ];	/* rx/tx/ctl queues */
};
/*
 * Number of descriptors the guest has posted on the avail ring that
 * the host has not yet consumed.  Both indices wrap modulo 2^16, so
 * the difference is computed with 16-bit modular arithmetic.
 */
static int
hq_num_avail(struct vring_hqueue *hq)
{
	uint16_t avail, cur;
	int ndesc;

	avail = *hq->hq_avail_idx;
	cur = hq->hq_cur_aidx;
	ndesc = (uint16_t)(avail - cur);

	assert(ndesc >= 0 && ndesc <= hq->hq_size);

	return (ndesc);
}
/*
 * Ring size for queue 'qnum'.
 * XXX no ctl queue currently.
 * XXX fixed currently. Maybe different for tx/rx/ctl.
 */
static uint16_t
pci_vtnet_qsize(int qnum)
{
	return (qnum == VTNET_CTLQ ? 0 : VTNET_RINGSZ);
}
/* Record a guest write to the virtio status register. */
static void
pci_vtnet_update_status(struct pci_vtnet_softc *sc, uint32_t value)
{
	/* Writing zero to the status register is a device-reset request. */
	if (value == 0) {
		DPRINTF(("vtnet: device reset requested !\n"));
	}

	sc->vsc_status = value;
}
/*
 * Called to send a buffer chain out to the tap device.  Frames shorter
 * than the 60-byte minimum are padded with a zero-filled segment; the
 * caller guarantees one spare iov slot for it.
 */
static void
pci_vtnet_tap_tx(struct pci_vtnet_softc *sc, struct iovec *iov, int iovcnt,
		 int len)
{
	char zeros[60];
	int padlen;

	if (sc->vsc_tapfd == -1)
		return;

	padlen = 60 - len;
	if (padlen > 0) {
		memset(zeros, 0, padlen);
		iov[iovcnt].iov_base = zeros;
		iov[iovcnt].iov_len = padlen;
		iovcnt++;
	}

	(void) writev(sc->vsc_tapfd, iov, iovcnt);
}
/*
 * Called when there is read activity on the tap file descriptor.
 * Each buffer posted by the guest is assumed to be able to contain
 * an entire ethernet frame + rx header.
 *
 * Frames are dropped (read into dummybuf) when the rx ring is not yet
 * initialized or has no free buffers; in the latter case vsc_rxpend is
 * set so the stall is logged only once.  After filling buffers, an MSI
 * is raised unless the guest suppressed interrupts.
 *
 * MP note: the dummybuf is only used for discarding frames, so there
 * is no need for it to be per-vtnet or locked.
 */
static uint8_t dummybuf[2048];

static void
pci_vtnet_tap_rx(struct pci_vtnet_softc *sc)
{
	struct virtio_desc *vd;
	struct virtio_used *vu;
	struct vring_hqueue *hq;
	struct virtio_net_rxhdr *vrx;
	uint8_t *buf;
	int i;
	int len;
	int ndescs;
	int didx, uidx, aidx;	/* descriptor, avail and used index */

	/*
	 * Should never be called without a valid tap fd
	 */
	assert(sc->vsc_tapfd != -1);

	/*
	 * But, will be called when the rx ring hasn't yet
	 * been set up.
	 */
	if (sc->vsc_rx_ready == 0) {
		/*
		 * Drop the packet and try later.
		 */
		(void) read(sc->vsc_tapfd, dummybuf, sizeof(dummybuf));
		return;
	}

	/*
	 * Calculate the number of available rx buffers
	 */
	hq = &sc->vsc_hq[VTNET_RXQ];

	ndescs = hq_num_avail(hq);

	if (ndescs == 0) {
		/*
		 * Need to wait for host notification to read
		 */
		if (sc->vsc_rxpend == 0) {
			WPRINTF(("vtnet: no rx descriptors !\n"));
			sc->vsc_rxpend = 1;
		}

		/*
		 * Drop the packet and try later
		 */
		(void) read(sc->vsc_tapfd, dummybuf, sizeof(dummybuf));
		return;
	}

	aidx = hq->hq_cur_aidx;
	uidx = *hq->hq_used_idx;
	for (i = 0; i < ndescs; i++) {
		/*
		 * 'aidx' indexes into the an array of descriptor indexes
		 */
		didx = hq->hq_avail_ring[aidx % hq->hq_size];
		assert(didx >= 0 && didx < hq->hq_size);

		vd = &hq->hq_dtable[didx];

		/*
		 * Get a pointer to the rx header, and use the
		 * data immediately following it for the packet buffer.
		 */
		vrx = (struct virtio_net_rxhdr *)paddr_guest2host(vd->vd_addr);
		buf = (uint8_t *)(vrx + 1);

		len = read(sc->vsc_tapfd, buf,
			   vd->vd_len - sizeof(struct virtio_net_rxhdr));

		/* No more frames pending on the tap: stop filling buffers. */
		if (len < 0 && errno == EWOULDBLOCK) {
			break;
		}

		/*
		 * The only valid field in the rx packet header is the
		 * number of buffers, which is always 1 without TSO
		 * support.
		 */
		memset(vrx, 0, sizeof(struct virtio_net_rxhdr));
		vrx->vrh_bufs = 1;

		/*
		 * Write this descriptor into the used ring
		 */
		vu = &hq->hq_used_ring[uidx % hq->hq_size];
		vu->vu_idx = didx;
		vu->vu_tlen = len + sizeof(struct virtio_net_rxhdr);
		uidx++;
		aidx++;
	}

	/*
	 * Update the used pointer, and signal an interrupt if allowed
	 */
	*hq->hq_used_idx = uidx;
	hq->hq_cur_aidx = aidx;

	if ((*hq->hq_avail_flags & VRING_AVAIL_F_NO_INTERRUPT) == 0) {
		sc->vsc_isr |= 1;
		pci_generate_msi(sc->vsc_pi, 0);
	}
}
/*
 * Event-loop callback invoked when the tap fd becomes readable.
 * Serializes against register accesses via the softc mutex before
 * running the receive path.
 */
static void
pci_vtnet_tap_callback(int fd, enum ev_type type, void *param)
{
	struct pci_vtnet_softc *sc;

	sc = param;

	pthread_mutex_lock(&sc->vsc_mtx);
	pci_vtnet_tap_rx(sc);
	pthread_mutex_unlock(&sc->vsc_mtx);
}
/*
 * Guest notification on the rx queue: mark the ring usable on the
 * first notify, and retry any receive that was previously dropped
 * because no rx buffers were available.
 */
static void
pci_vtnet_ping_rxq(struct pci_vtnet_softc *sc)
{
	/* The first rx qnotify marks the receive ring as set up. */
	if (sc->vsc_rx_ready == 0)
		sc->vsc_rx_ready = 1;

	/*
	 * A frame may have been discarded earlier for lack of buffers;
	 * now that the guest has posted more, pull from the tap again.
	 */
	if (sc->vsc_rxpend != 0) {
		WPRINTF(("vtnet: rx resumed\n\r"));
		sc->vsc_rxpend = 0;
		pci_vtnet_tap_rx(sc);
	}
}
/*
 * Process one descriptor chain from the tx queue: gather the data
 * segments into an iovec, hand them to the tap device, then return
 * the chain on the used ring and interrupt the guest if allowed.
 */
static void
pci_vtnet_proctx(struct pci_vtnet_softc *sc, struct vring_hqueue *hq)
{
	struct iovec iov[VTNET_MAXSEGS + 1];
	struct virtio_desc *vd;
	struct virtio_used *vu;
	int i;
	int plen;	/* payload length, excluding the virtio tx header */
	int tlen;	/* total length, including the header descriptor */
	int uidx, aidx, didx;

	uidx = *hq->hq_used_idx;
	aidx = hq->hq_cur_aidx;

	didx = hq->hq_avail_ring[aidx % hq->hq_size];
	assert(didx >= 0 && didx < hq->hq_size);

	vd = &hq->hq_dtable[didx];

	/*
	 * Run through the chain of descriptors, ignoring the
	 * first header descriptor. However, include the header
	 * length in the total length that will be put into the
	 * used queue.
	 */
	tlen = vd->vd_len;
	vd = &hq->hq_dtable[vd->vd_next];

	for (i = 0, plen = 0;
	     i < VTNET_MAXSEGS;
	     i++, vd = &hq->hq_dtable[vd->vd_next]) {
		iov[i].iov_base = paddr_guest2host(vd->vd_addr);
		iov[i].iov_len = vd->vd_len;
		plen += vd->vd_len;
		tlen += vd->vd_len;
		if ((vd->vd_flags & VRING_DESC_F_NEXT) == 0)
			break;
	}
	/*
	 * NOTE(review): a guest chain longer than VTNET_MAXSEGS trips this
	 * assert rather than being handled gracefully.
	 */
	assert(i < VTNET_MAXSEGS);

	DPRINTF(("virtio: packet send, %d bytes, %d segs\n\r", plen, i + 1));
	pci_vtnet_tap_tx(sc, iov, i + 1, plen);

	/*
	 * Return this chain back to the host
	 */
	vu = &hq->hq_used_ring[uidx % hq->hq_size];
	vu->vu_idx = didx;
	vu->vu_tlen = tlen;
	hq->hq_cur_aidx = aidx + 1;
	*hq->hq_used_idx = uidx + 1;

	/*
	 * Generate an interrupt if able
	 */
	if ((*hq->hq_avail_flags & VRING_AVAIL_F_NO_INTERRUPT) == 0) {
		sc->vsc_isr |= 1;
		pci_generate_msi(sc->vsc_pi, 0);
	}
}
/*
 * Guest notification on the tx queue: transmit every descriptor chain
 * the guest has posted since the last notify.
 */
static void
pci_vtnet_ping_txq(struct pci_vtnet_softc *sc)
{
	struct vring_hqueue *txhq;
	int done, todo;

	txhq = &sc->vsc_hq[VTNET_TXQ];

	/* Number of chains queued for transmit. */
	todo = hq_num_avail(txhq);

	/* Send each pending chain in ring order. */
	for (done = 0; done < todo; done++)
		pci_vtnet_proctx(sc, txhq);
}
/*
 * Control-queue notification. The control queue is advertised with
 * size 0 and is not implemented, so this handler is log-only.
 */
static void
pci_vtnet_ping_ctlq(struct pci_vtnet_softc *sc)
{
	DPRINTF(("vtnet: control qnotify!\n\r"));
}
/*
 * Handle a guest write to the PFN register: the ring for the currently
 * selected queue has been placed at the given page frame number, so set
 * up host-side pointers into it.
 */
static void
pci_vtnet_ring_init(struct pci_vtnet_softc *sc, uint64_t pfn)
{
	struct vring_hqueue *hq;
	int qnum = sc->vsc_curq;

	assert(qnum < VTNET_MAXQ);

	sc->vsc_pfn[qnum] = pfn << VRING_PFN;

	/*
	 * Set up host pointers to the various parts of the
	 * queue
	 */
	hq = &sc->vsc_hq[qnum];
	hq->hq_size = pci_vtnet_qsize(qnum);

	hq->hq_dtable = paddr_guest2host(pfn << VRING_PFN);
	hq->hq_avail_flags = (uint16_t *)(hq->hq_dtable + hq->hq_size);
	hq->hq_avail_idx = hq->hq_avail_flags + 1;
	hq->hq_avail_ring = hq->hq_avail_flags + 2;
	/*
	 * NOTE(review): the used ring is located at the VRING_ALIGN
	 * roundup of the avail ring's *start*, not its end; this gives
	 * the expected layout only while the descriptor table plus avail
	 * header do not end exactly on a page boundary -- confirm against
	 * the virtio ring layout specification.
	 */
	hq->hq_used_flags = (uint16_t *)roundup2((uintptr_t)hq->hq_avail_ring,
						 VRING_ALIGN);
	hq->hq_used_idx = hq->hq_used_flags + 1;
	hq->hq_used_ring = (struct virtio_used *)(hq->hq_used_flags + 2);

	/*
	 * Initialize queue indexes
	 */
	hq->hq_cur_aidx = 0;
}
/*
 * Initialize the virtio-net device: allocate the softc, open the
 * backing tap device (if 'opts' names one), derive a stable MAC
 * address from the vm name, and populate PCI config space.
 *
 * Returns 0 on success, 1 on failure.
 */
static int
pci_vtnet_init(struct vmctx *ctx, struct pci_devinst *pi, char *opts)
{
	MD5_CTX mdctx;
	unsigned char digest[16];
	char nstr[80];
	struct pci_vtnet_softc *sc;

	/*
	 * Access to guest memory is required. Fail if
	 * memory not mapped
	 */
	if (paddr_guest2host(0) == NULL)
		return (1);

	/* calloc() zero-fills the softc as malloc+memset did. */
	sc = calloc(1, sizeof(struct pci_vtnet_softc));
	if (sc == NULL)
		return (1);

	pi->pi_arg = sc;
	sc->vsc_pi = pi;

	pthread_mutex_init(&sc->vsc_mtx, NULL);

	/*
	 * Attempt to open the tap device
	 */
	sc->vsc_tapfd = -1;
	if (opts != NULL) {
		char tbuf[80];

		/*
		 * snprintf always NUL-terminates and cannot overflow the
		 * buffer, unlike the previous strcpy/strncat pair whose
		 * strncat bound lacked the "-1" for the terminator and
		 * could write one byte past the end of 'tbuf'.
		 */
		snprintf(tbuf, sizeof(tbuf), "/dev/%s", opts);

		sc->vsc_tapfd = open(tbuf, O_RDWR);
		if (sc->vsc_tapfd == -1) {
			WPRINTF(("open of tap device %s failed\n", tbuf));
		} else {
			/*
			 * Set non-blocking and register for read
			 * notifications with the event loop
			 */
			int opt = 1;
			if (ioctl(sc->vsc_tapfd, FIONBIO, &opt) < 0) {
				WPRINTF(("tap device O_NONBLOCK failed\n"));
				close(sc->vsc_tapfd);
				sc->vsc_tapfd = -1;
			}

			/*
			 * Only register with the event loop if the fd is
			 * still valid (the FIONBIO failure path above may
			 * have closed it).
			 */
			if (sc->vsc_tapfd != -1) {
				sc->vsc_mevp = mevent_add(sc->vsc_tapfd,
							  EVF_READ,
							  pci_vtnet_tap_callback,
							  sc);
				if (sc->vsc_mevp == NULL) {
					WPRINTF(("Could not register event\n"));
					close(sc->vsc_tapfd);
					sc->vsc_tapfd = -1;
				}
			}
		}
	}

	/*
	 * The MAC address is the standard NetApp OUI of 00-a0-98,
	 * followed by an MD5 of the vm name. The slot number is
	 * prepended to this for slots other than 1, so that
	 * CFE can netboot from the equivalent of slot 1.
	 */
	if (pi->pi_slot == 1) {
		/* snprintf guarantees NUL-termination; strncpy did not. */
		snprintf(nstr, sizeof(nstr), "%s", vmname);
	} else {
		snprintf(nstr, sizeof(nstr), "%d-%s", pi->pi_slot, vmname);
	}

	MD5Init(&mdctx);
	MD5Update(&mdctx, nstr, strlen(nstr));
	MD5Final(digest, &mdctx);

	sc->vsc_macaddr[0] = 0x00;
	sc->vsc_macaddr[1] = 0xa0;
	sc->vsc_macaddr[2] = 0x98;
	sc->vsc_macaddr[3] = digest[0];
	sc->vsc_macaddr[4] = digest[1];
	sc->vsc_macaddr[5] = digest[2];

	/* initialize config space */
	pci_set_cfgdata16(pi, PCIR_DEVICE, VIRTIO_DEV_NET);
	pci_set_cfgdata16(pi, PCIR_VENDOR, VIRTIO_VENDOR);
	pci_set_cfgdata8(pi, PCIR_CLASS, PCIC_NETWORK);
	pci_set_cfgdata16(pi, PCIR_SUBDEV_0, VIRTIO_TYPE_NET);
	pci_emul_alloc_bar(pi, 0, 0, PCIBAR_IO, VTNET_REGSZ);
	pci_emul_add_msicap(pi, 1);

	return (0);
}
/*
 * Function pointer array to handle queue notifications
 */
/* Indexed by queue number: VTNET_RXQ, VTNET_TXQ, VTNET_CTLQ. */
static void (*pci_vtnet_qnotify[VTNET_MAXQ])(struct pci_vtnet_softc *) = {
	pci_vtnet_ping_rxq,
	pci_vtnet_ping_txq,
	pci_vtnet_ping_ctlq
};
/*
 * Handler for guest writes to the virtio-net i/o BAR. Accesses that
 * extend past the end of register space are logged and ignored; access
 * sizes are enforced with asserts.
 */
static void
pci_vtnet_write(struct pci_devinst *pi, int baridx, int offset, int size,
		uint32_t value)
{
	struct pci_vtnet_softc *sc = pi->pi_arg;

	if (offset + size > VTNET_REGSZ) {
		DPRINTF(("vtnet_write: 2big, offset %d size %d\n",
			 offset, size));
		return;
	}

	pthread_mutex_lock(&sc->vsc_mtx);
	switch (offset) {
	case VTCFG_R_GUESTCAP:
		assert(size == 4);
		/* The guest may only accept features the host offers. */
		sc->vsc_features = value & VTNET_S_HOSTCAPS;
		break;
	case VTCFG_R_PFN:
		assert(size == 4);
		/* Ring placement for the currently selected queue. */
		pci_vtnet_ring_init(sc, value);
		break;
	case VTCFG_R_QSEL:
		assert(size == 2);
		assert(value < VTNET_MAXQ);
		sc->vsc_curq = value;
		break;
	case VTCFG_R_QNOTIFY:
		assert(size == 2);
		assert(value < VTNET_MAXQ);
		(*pci_vtnet_qnotify[value])(sc);
		break;
	case VTCFG_R_STATUS:
		assert(size == 1);
		pci_vtnet_update_status(sc, value);
		break;
	case VTNET_R_CFG0:
	case VTNET_R_CFG1:
	case VTNET_R_CFG2:
	case VTNET_R_CFG3:
	case VTNET_R_CFG4:
	case VTNET_R_CFG5:
		/*
		 * The driver is allowed to change the MAC address
		 */
		assert(size == 1);
		sc->vsc_macaddr[offset - VTNET_R_CFG0] = value;
		break;
	case VTCFG_R_HOSTCAP:
	case VTCFG_R_QNUM:
	case VTCFG_R_ISR:
	case VTNET_R_CFG6:
	case VTNET_R_CFG7:
		DPRINTF(("vtnet: write to readonly reg %d\n\r", offset));
		break;
	default:
		DPRINTF(("vtnet: unknown i/o write offset %d\n\r", offset));
		value = 0;
		break;
	}
	pthread_mutex_unlock(&sc->vsc_mtx);
}
/*
 * Handler for guest reads of the virtio-net i/o BAR. Out-of-range
 * accesses return 0; access sizes are enforced with asserts.
 */
uint32_t
pci_vtnet_read(struct pci_devinst *pi, int baridx, int offset, int size)
{
	struct pci_vtnet_softc *sc = pi->pi_arg;
	uint32_t value;

	if (offset + size > VTNET_REGSZ) {
		DPRINTF(("vtnet_read: 2big, offset %d size %d\n",
			 offset, size));
		return (0);
	}

	pthread_mutex_lock(&sc->vsc_mtx);
	switch (offset) {
	case VTCFG_R_HOSTCAP:
		assert(size == 4);
		value = VTNET_S_HOSTCAPS;
		break;
	case VTCFG_R_GUESTCAP:
		assert(size == 4);
		value = sc->vsc_features; /* XXX never read ? */
		break;
	case VTCFG_R_PFN:
		assert(size == 4);
		/* Return the page frame number programmed by the guest. */
		value = sc->vsc_pfn[sc->vsc_curq] >> VRING_PFN;
		break;
	case VTCFG_R_QNUM:
		assert(size == 2);
		value = pci_vtnet_qsize(sc->vsc_curq);
		break;
	case VTCFG_R_QSEL:
		assert(size == 2);
		value = sc->vsc_curq; /* XXX never read ? */
		break;
	case VTCFG_R_QNOTIFY:
		assert(size == 2);
		value = sc->vsc_curq; /* XXX never read ? */
		break;
	case VTCFG_R_STATUS:
		assert(size == 1);
		value = sc->vsc_status;
		break;
	case VTCFG_R_ISR:
		assert(size == 1);
		value = sc->vsc_isr;
		sc->vsc_isr = 0; /* a read clears this flag */
		break;
	case VTNET_R_CFG0:
	case VTNET_R_CFG1:
	case VTNET_R_CFG2:
	case VTNET_R_CFG3:
	case VTNET_R_CFG4:
	case VTNET_R_CFG5:
		assert(size == 1);
		value = sc->vsc_macaddr[offset - VTNET_R_CFG0];
		break;
	case VTNET_R_CFG6:
		assert(size == 1);
		value = 0x01; /* XXX link always up */
		break;
	case VTNET_R_CFG7:
		assert(size == 1);
		value = 0; /* link status is in the LSB */
		break;
	default:
		DPRINTF(("vtnet: unknown i/o read offset %d\n\r", offset));
		value = 0;
		break;
	}
	pthread_mutex_unlock(&sc->vsc_mtx);

	return (value);
}
/* PCI device-emulation registration for the virtio network device. */
struct pci_devemu pci_de_vnet = {
	.pe_emu =	"virtio-net",
	.pe_init =	pci_vtnet_init,
	.pe_iow =	pci_vtnet_write,
	.pe_ior =	pci_vtnet_read,
};
PCI_EMUL_SET(pci_de_vnet);

196
usr.sbin/bhyve/pit_8254.c Normal file
View File

@ -0,0 +1,196 @@
/*-
* Copyright (c) 2011 NetApp, Inc.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* $FreeBSD$
*/
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
#include <sys/time.h>
#include <machine/clock.h>
#include <stdio.h>
#include <assert.h>
#include "fbsdrun.h"
#include "inout.h"
#include "pit_8254.h"
#define TIMER_SEL_MASK 0xc0
#define TIMER_RW_MASK 0x30
#define TIMER_MODE_MASK 0x0f
#define TIMER_SEL_READBACK 0xc0
#define TIMER_DIV(freq, hz) (((freq) + (hz) / 2) / (hz))
#define PIT_8254_FREQ 1193182
static const int nsecs_per_tick = 1000000000 / PIT_8254_FREQ;
/* Per-counter emulation state for one 8254 channel. */
struct counter {
	struct timeval	tv;		/* uptime when counter was loaded */
	uint16_t	initial;	/* initial counter value */
	uint8_t		cr[2];		/* count register bytes, LSB first */
	uint8_t		ol[2];		/* output latch: ol[1]=LSB, ol[0]=MSB */
	int		crbyte;		/* count-register bytes written so far */
	int		olbyte;		/* latched bytes still to be read */
};
/*
 * Normalize a timeval so that tv_usec lies in [0, 1000000).
 */
static void
timevalfix(struct timeval *t)
{
	if (t->tv_usec < 0) {
		t->tv_sec -= 1;
		t->tv_usec += 1000000;
	} else if (t->tv_usec >= 1000000) {
		t->tv_sec += 1;
		t->tv_usec -= 1000000;
	}
}

/*
 * Compute t1 -= t2, leaving t1 normalized.
 */
static void
timevalsub(struct timeval *t1, const struct timeval *t2)
{
	t1->tv_sec = t1->tv_sec - t2->tv_sec;
	t1->tv_usec = t1->tv_usec - t2->tv_usec;
	timevalfix(t1);
}
/*
 * Latch the current value of a down-counter into its output latch.
 * The counter position is derived from host wall-clock time elapsed
 * since the counter was programmed, rather than an emulated clock.
 */
static void
latch(struct counter *c)
{
	struct timeval tv2;
	uint16_t lval;
	uint64_t delta_nsecs, delta_ticks;

	/* cannot latch a new value until the old one has been consumed */
	if (c->olbyte != 0)
		return;

	if (c->initial == 0 || c->initial == 1) {
		/*
		 * XXX the program that runs the VM can be stopped and
		 * restarted at any time. This means that state that was
		 * created by the guest is destroyed between invocations
		 * of the program.
		 *
		 * If the counter's initial value is not programmed we
		 * assume a value that would be set to generate 'guest_hz'
		 * interrupts per second.
		 */
		c->initial = TIMER_DIV(PIT_8254_FREQ, guest_hz);
		gettimeofday(&c->tv, NULL);
	}

	(void)gettimeofday(&tv2, NULL);
	timevalsub(&tv2, &c->tv);

	/* Convert elapsed wall-clock time to elapsed 8254 clock ticks. */
	delta_nsecs = tv2.tv_sec * 1000000000 + tv2.tv_usec * 1000;
	delta_ticks = delta_nsecs / nsecs_per_tick;

	/* Down-counter position within the current period. */
	lval = c->initial - delta_ticks % c->initial;

	/* Guest reads consume LSB first, via ol[--olbyte]. */
	c->olbyte = 2;
	c->ol[1] = lval;		/* LSB */
	c->ol[0] = lval >> 8;		/* MSB */
}
/*
 * I/O handler for the 8254 mode/command port and the three counter
 * ports. Only latch commands and 16-bit LSB/MSB access in rate
 * generator or square wave mode are supported; read-back commands are
 * rejected.
 */
static int
pit_8254_handler(struct vmctx *ctx, int vcpu, int in, int port, int bytes,
		 uint32_t *eax, void *arg)
{
	int sel, rw, mode;
	uint8_t val;
	struct counter *c;

	static struct counter counter[3];

	if (bytes != 1)
		return (-1);

	val = *eax;

	if (port == TIMER_MODE) {
		assert(in == 0);
		sel = val & TIMER_SEL_MASK;
		rw = val & TIMER_RW_MASK;
		mode = val & TIMER_MODE_MASK;

		if (sel == TIMER_SEL_READBACK)
			return (-1);
		if (rw != TIMER_LATCH && rw != TIMER_16BIT)
			return (-1);

		if (rw != TIMER_LATCH) {
			/*
			 * Counter mode is not affected when issuing a
			 * latch command.
			 */
			if (mode != TIMER_RATEGEN && mode != TIMER_SQWAVE)
				return (-1);
		}

		/* The counter number is encoded in the top two bits. */
		c = &counter[sel >> 6];
		if (rw == TIMER_LATCH)
			latch(c);
		else
			c->olbyte = 0;	/* reset latch after reprogramming */

		return (0);
	}

	/* counter ports */
	assert(port >= TIMER_CNTR0 && port <= TIMER_CNTR2);
	c = &counter[port - TIMER_CNTR0];

	if (in) {
		/*
		 * XXX
		 * The spec says that once the output latch is completely
		 * read it should revert to "following" the counter. We don't
		 * do this because it is hard and any reasonable OS should
		 * always latch the counter before trying to read it.
		 */
		if (c->olbyte == 0)
			c->olbyte = 2;
		*eax = c->ol[--c->olbyte];
	} else {
		c->cr[c->crbyte++] = *eax;
		if (c->crbyte == 2) {
			c->crbyte = 0;
			/* Reload value: LSB written first, then MSB. */
			c->initial = c->cr[0] | (uint16_t)c->cr[1] << 8;
			gettimeofday(&c->tv, NULL);
		}
	}

	return (0);
}
INOUT_PORT(8254, TIMER_MODE, IOPORT_F_OUT, pit_8254_handler);
INOUT_PORT(8254, TIMER_CNTR0, IOPORT_F_INOUT, pit_8254_handler);
INOUT_PORT(8254, TIMER_CNTR1, IOPORT_F_INOUT, pit_8254_handler);
INOUT_PORT(8254, TIMER_CNTR2, IOPORT_F_INOUT, pit_8254_handler);

45
usr.sbin/bhyve/pit_8254.h Normal file
View File

@ -0,0 +1,45 @@
/*-
* Copyright (c) 2011 NetApp, Inc.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* $FreeBSD$
*/
#ifndef _PIT_8254_H_
#define _PIT_8254_H_
/*
* Borrowed from amd64/include/timerreg.h because in that file it is
* conditionally compiled for #ifdef _KERNEL only.
*/
#include <dev/ic/i8253reg.h>
#define IO_TIMER1 0x40 /* 8253 Timer #1 */
#define TIMER_CNTR0 (IO_TIMER1 + TIMER_REG_CNTR0)
#define TIMER_CNTR1 (IO_TIMER1 + TIMER_REG_CNTR1)
#define TIMER_CNTR2 (IO_TIMER1 + TIMER_REG_CNTR2)
#define TIMER_MODE (IO_TIMER1 + TIMER_REG_MODE)
#endif /* _PIT_8254_H_ */

51
usr.sbin/bhyve/post.c Normal file
View File

@ -0,0 +1,51 @@
/*-
* Copyright (c) 2011 NetApp, Inc.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* $FreeBSD$
*/
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
#include <sys/types.h>
#include <assert.h>
#include "inout.h"
/*
 * Handler for the POST diagnostic port (0x84). The port is registered
 * input-only; single-byte reads return a dummy value.
 */
static int
post_data_handler(struct vmctx *ctx, int vcpu, int in, int port, int bytes,
		  uint32_t *eax, void *arg)
{
	assert(in == 1);

	/* Only byte-wide accesses are meaningful on this port. */
	if (bytes != 1)
		return (-1);

	*eax = 0xff;	/* return some garbage */

	return (0);
}
INOUT_PORT(post, 0x84, IOPORT_F_IN, post_data_handler);

268
usr.sbin/bhyve/rtc.c Normal file
View File

@ -0,0 +1,268 @@
/*-
* Copyright (c) 2011 NetApp, Inc.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* $FreeBSD$
*/
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
#include <sys/types.h>
#include <sys/time.h>
#include <stdio.h>
#include <time.h>
#include <assert.h>
#include "inout.h"
#define IO_RTC 0x70
#define RTC_SEC 0x00 /* seconds */
#define RTC_MIN 0x02
#define RTC_HRS 0x04
#define RTC_WDAY 0x06
#define RTC_DAY 0x07
#define RTC_MONTH 0x08
#define RTC_YEAR 0x09
#define RTC_CENTURY 0x32 /* current century */
#define RTC_STATUSA 0xA
#define RTCSA_TUP 0x80 /* time update, don't look now */
#define RTC_STATUSB 0xB
#define RTCSB_DST 0x01
#define RTCSB_24HR 0x02
#define RTCSB_BIN 0x04 /* 0 = BCD, 1 = Binary */
#define RTCSB_PINTR 0x40 /* 1 = enable periodic clock interrupt */
#define RTCSB_HALT 0x80 /* stop clock updates */
#define RTC_INTR 0x0c /* status register C (R) interrupt source */
#define RTC_STATUSD 0x0d /* status register D (R) Lost Power */
#define RTCSD_PWR 0x80 /* clock power OK */
#define RTC_DIAG 0x0e
#define RTC_RSTCODE 0x0f
static int addr;
/* XXX initialize these to default values as they would be from BIOS */
static uint8_t status_a, status_b, rstcode;
static u_char const bin2bcd_data[] = {
0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09,
0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19,
0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, 0x28, 0x29,
0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, 0x38, 0x39,
0x40, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, 0x48, 0x49,
0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, 0x58, 0x59,
0x60, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69,
0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, 0x78, 0x79,
0x80, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87, 0x88, 0x89,
0x90, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97, 0x98, 0x99
};
#define bin2bcd(bin) (bin2bcd_data[bin])
#define rtcout(val) ((status_b & RTCSB_BIN) ? (val) : bin2bcd((val)))
/*
 * Bring a timeval into canonical form: 0 <= tv_usec < 1000000.
 */
static void
timevalfix(struct timeval *tv)
{
	if (tv->tv_usec < 0) {
		tv->tv_sec--;
		tv->tv_usec += 1000000;
	}
	if (tv->tv_usec >= 1000000) {
		tv->tv_sec++;
		tv->tv_usec -= 1000000;
	}
}

/*
 * Subtract t2 from t1 in place and renormalize the result.
 */
static void
timevalsub(struct timeval *t1, const struct timeval *t2)
{
	t1->tv_sec -= t2->tv_sec;
	t1->tv_usec -= t2->tv_usec;
	timevalfix(t1);
}
/*
 * Handler for writes to the RTC address port (0x70): validate and
 * remember the register that subsequent data-port accesses will target.
 */
static int
rtc_addr_handler(struct vmctx *ctx, int vcpu, int in, int port, int bytes,
		 uint32_t *eax, void *arg)
{
	assert(in == 0);

	if (bytes != 1)
		return (-1);

	switch (*eax) {
	case RTC_SEC:
	case RTC_MIN:
	case RTC_HRS:
	case RTC_WDAY:
	case RTC_DAY:
	case RTC_MONTH:
	case RTC_YEAR:
	case RTC_CENTURY:
	case RTC_STATUSA:
	case RTC_STATUSB:
	case RTC_INTR:
	case RTC_STATUSD:
	case RTC_DIAG:
	case RTC_RSTCODE:
		/* Supported register: latch the selection. */
		addr = *eax;
		return (0);
	default:
		/* Unsupported register offsets are rejected. */
		return (-1);
	}
}
/*
 * Handler for the RTC data port (0x71): reads return the register
 * selected via the address port, writes update it. Time-of-day values
 * are sourced from the host clock and are read-only; the cached time
 * is refreshed at most once per second so the guest gets a coherent
 * hh:mm:ss view across multiple reads.
 */
static int
rtc_data_handler(struct vmctx *ctx, int vcpu, int in, int port, int bytes,
		 uint32_t *eax, void *arg)
{
	int hour;
	time_t t;
	struct timeval cur, delta;

	static struct timeval last;
	static struct tm tm;

	if (bytes != 1)
		return (-1);

	gettimeofday(&cur, NULL);

	/*
	 * Increment the cached time only once per second so we can guarantee
	 * that the guest has at least one second to read the hour:min:sec
	 * separately and still get a coherent view of the time.
	 */
	delta = cur;
	timevalsub(&delta, &last);
	if (delta.tv_sec >= 1 && (status_b & RTCSB_HALT) == 0) {
		t = cur.tv_sec;
		localtime_r(&t, &tm);
		last = cur;
	}

	if (in) {
		switch (addr) {
		case RTC_SEC:
			*eax = rtcout(tm.tm_sec);
			return (0);
		case RTC_MIN:
			*eax = rtcout(tm.tm_min);
			return (0);
		case RTC_HRS:
			if (status_b & RTCSB_24HR)
				hour = tm.tm_hour;
			else
				hour = (tm.tm_hour % 12) + 1;

			*eax = rtcout(hour);

			/*
			 * If we are representing time in the 12-hour format
			 * then set the MSB to indicate PM.
			 */
			if ((status_b & RTCSB_24HR) == 0 && tm.tm_hour >= 12)
				*eax |= 0x80;
			return (0);
		case RTC_WDAY:
			*eax = rtcout(tm.tm_wday + 1);
			return (0);
		case RTC_DAY:
			*eax = rtcout(tm.tm_mday);
			return (0);
		case RTC_MONTH:
			*eax = rtcout(tm.tm_mon + 1);
			return (0);
		case RTC_YEAR:
			*eax = rtcout(tm.tm_year % 100);
			return (0);
		case RTC_CENTURY:
			/*
			 * tm_year counts years since 1900, so the current
			 * century is (tm_year + 1900) / 100. The previous
			 * tm_year / 100 would have reported '1' in 2011.
			 */
			*eax = rtcout((tm.tm_year + 1900) / 100);
			return (0);
		case RTC_STATUSA:
			*eax = status_a;
			return (0);
		case RTC_INTR:
			*eax = 0;
			return (0);
		case RTC_STATUSD:
			*eax = RTCSD_PWR;
			return (0);
		case RTC_DIAG:
			*eax = 0;
			return (0);
		case RTC_RSTCODE:
			*eax = rstcode;
			return (0);
		default:
			return (-1);
		}
	}

	switch (addr) {
	case RTC_STATUSA:
		/* The time-update-in-progress bit is read-only. */
		status_a = *eax & ~RTCSA_TUP;
		break;
	case RTC_STATUSB:
		/* XXX not implemented yet XXX */
		if (*eax & RTCSB_PINTR)
			return (-1);
		status_b = *eax;
		break;
	case RTC_RSTCODE:
		rstcode = *eax;
		break;
	case RTC_SEC:
	case RTC_MIN:
	case RTC_HRS:
	case RTC_WDAY:
	case RTC_DAY:
	case RTC_MONTH:
	case RTC_YEAR:
	case RTC_CENTURY:
		/*
		 * Ignore writes to the time of day registers
		 */
		break;
	default:
		return (-1);
	}
	return (0);
}
INOUT_PORT(rtc, IO_RTC, IOPORT_F_OUT, rtc_addr_handler);
INOUT_PORT(rtc, IO_RTC + 1, IOPORT_F_INOUT, rtc_data_handler);

60
usr.sbin/bhyve/uart.c Normal file
View File

@ -0,0 +1,60 @@
/*-
* Copyright (c) 2011 NetApp, Inc.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* $FreeBSD$
*/
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
#include <sys/types.h>
#include <assert.h>
#include "inout.h"
#define COM1 0x3F8
#define COM2 0x2F8
#define REG_IIR 2
/*
 * Handler for reads of the (unimplemented) 16550 IIR register on COM1
 * and COM2; only single-byte input accesses are accepted and always
 * return 0xFF.
 */
static int
com_handler(struct vmctx *ctx, int vcpu, int in, int port, int bytes,
	    uint32_t *eax, void *arg)
{
	assert(in);

	if (bytes != 1)
		return (-1);

	/*
	 * COM port is not implemented so we return 0xFF for all registers
	 */
	*eax = 0xFF;

	return (0);
}
INOUT_PORT(uart, COM1 + REG_IIR, IOPORT_F_IN, com_handler);
INOUT_PORT(uart, COM2 + REG_IIR, IOPORT_F_IN, com_handler);

85
usr.sbin/bhyve/virtio.h Normal file
View File

@ -0,0 +1,85 @@
/*-
* Copyright (c) 2011 NetApp, Inc.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* $FreeBSD$
*/
#ifndef _VIRTIO_H_
#define _VIRTIO_H_
#define VRING_ALIGN 4096
#define VRING_DESC_F_NEXT (1 << 0)
#define VRING_DESC_F_WRITE (1 << 1)
#define VRING_DESC_F_INDIRECT (1 << 2)
#define VRING_AVAIL_F_NO_INTERRUPT 1
struct virtio_desc {
uint64_t vd_addr;
uint32_t vd_len;
uint16_t vd_flags;
uint16_t vd_next;
} __packed;
struct virtio_used {
uint32_t vu_idx;
uint32_t vu_tlen;
} __packed;
/*
* PFN register shift amount
*/
#define VRING_PFN 12
/*
* Virtio device types
*/
#define VIRTIO_TYPE_NET 1
#define VIRTIO_TYPE_BLOCK 2
/*
* PCI vendor/device IDs
*/
#define VIRTIO_VENDOR 0x1AF4
#define VIRTIO_DEV_NET 0x1000
#define VIRTIO_DEV_BLOCK 0x1001
/*
* PCI config space constants
*/
#define VTCFG_R_HOSTCAP 0
#define VTCFG_R_GUESTCAP 4
#define VTCFG_R_PFN 8
#define VTCFG_R_QNUM 12
#define VTCFG_R_QSEL 14
#define VTCFG_R_QNOTIFY 16
#define VTCFG_R_STATUS 18
#define VTCFG_R_ISR 19
#define VTCFG_R_CFG0 20 /* No MSI-X */
#define VTCFG_R_CFG1 24 /* With MSI-X */
#define VTCFG_R_MSIX 20
#endif /* _VIRTIO_H_ */

261
usr.sbin/bhyve/xmsr.c Normal file
View File

@ -0,0 +1,261 @@
/*-
* Copyright (c) 2011 NetApp, Inc.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* $FreeBSD$
*/
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
#include <sys/param.h>
#include <machine/apicreg.h>
#include <assert.h>
#include <stdio.h>
#include <stdlib.h>
#include <machine/vmm.h>
#include <vmmapi.h>
#include "fbsdrun.h"
#include "xmsr.h"
/*
* Trampoline for hypervisor direct 64-bit jump.
*
* 0 - signature for guest->host verification
* 8 - kernel virtual address of trampoline
* 16 - instruction virtual address
* 24 - stack pointer virtual address
* 32 - CR3, physical address of kernel page table
* 40 - 24-byte area for null/code/data GDT entries
*/
#define MP_V64T_SIG 0xcafebabecafebabeULL

/* In-guest-memory layout of the 64-bit direct-jump trampoline. */
struct mp_v64tramp {
	uint64_t	mt_sig;		/* must equal MP_V64T_SIG */
	uint64_t	mt_virt;	/* kernel virtual addr of trampoline */
	uint64_t	mt_eip;		/* instruction virtual address */
	uint64_t	mt_rsp;		/* stack pointer virtual address */
	uint64_t	mt_cr3;		/* phys addr of kernel page table */
	uint64_t	mt_gdtr[3];	/* null/code/data GDT entries */
};
/*
* CPU 0 is considered to be the BSP and is set to the RUNNING state.
* All other CPUs are set up in the INIT state.
*/
#define BSP 0
enum cpu_bstate {
CPU_S_INIT,
CPU_S_SIPI,
CPU_S_RUNNING
} static cpu_b[VM_MAXCPU] = { [BSP] = CPU_S_RUNNING };
static void spinup_ap(struct vmctx *, int, int, uint64_t *);
static void spinup_ap_direct64(struct vmctx *, int, uintptr_t, uint64_t *);
/*
 * Emulate a guest WRMSR. Only the x2apic ICR (MSR 0x830) is handled;
 * any other MSR write terminates the process. An ICR write sends an
 * IPI: INIT moves the target cpu into wait-for-SIPI state, and a
 * subsequent SIPI creates the vcpu state and hands it to the
 * sequencer.
 *
 * Returns the vcpu id the caller should continue running: the SIPI
 * target after a successful spinup, otherwise the current vcpu.
 */
int
emulate_wrmsr(struct vmctx *ctx, int vcpu, uint32_t code, uint64_t val)
{
	int dest;
	int mode;
	int thiscpu;	/* XXX currently unused */
	int vec;
	int error, retval;
	uint64_t rip;

	retval = vcpu;
	thiscpu = 1 << vcpu;

	/*
	 * The only MSR value handled is the x2apic CR register
	 */
	if (code != 0x830) {
		printf("Unknown WRMSR code %x, val %lx, cpu %d\n",
		       code, val, vcpu);
		exit(1);
	}

	/*
	 * The value written to the MSR will generate an IPI to
	 * a set of CPUs. If this is a SIPI, create the initial
	 * state for the CPU and switch to it. Otherwise, inject
	 * an interrupt for the destination CPU(s), and request
	 * a switch to the next available one by returning -1
	 */
	dest = val >> 32;	/* x2apic ICR: destination in high 32 bits */
	vec = val & APIC_VECTOR_MASK;
	mode = val & APIC_DELMODE_MASK;

	switch (mode) {
	case APIC_DELMODE_INIT:
		/* Only APs may be INIT'd; never the BSP (cpu 0). */
		assert(dest != 0);
		assert(dest < guest_ncpus);

		/*
		 * Ignore legacy de-assert INITs in x2apic mode
		 */
		if ((val & APIC_LEVEL_MASK) == APIC_LEVEL_DEASSERT) {
			break;
		}
		assert(cpu_b[dest] == CPU_S_INIT);

		/*
		 * Move CPU to wait-for-SIPI state
		 */
		error = vcpu_reset(ctx, dest);
		assert(error == 0);

		cpu_b[dest] = CPU_S_SIPI;
		break;
	case APIC_DELMODE_STARTUP:
		assert(dest != 0);
		assert(dest < guest_ncpus);
		/*
		 * Ignore SIPIs in any state other than wait-for-SIPI
		 */
		if (cpu_b[dest] != CPU_S_SIPI) {
			break;
		}
		/*
		 * Bring up the AP and signal the main loop that it is
		 * available and to switch to it.
		 */
		spinup_ap(ctx, dest, vec, &rip);
		cpu_b[dest] = CPU_S_RUNNING;
		fbsdrun_addcpu(ctx, dest, rip);
		retval = dest;
		break;
	default:
		printf("APIC delivery mode %lx not supported!\n",
		       val & APIC_DELMODE_MASK);
		exit(1);
	}

	return (retval);
}
/*
 * There are 2 startup modes possible here:
 * - if the CPU supports 'unrestricted guest' mode, the spinup can
 *   set up the processor state in power-on 16-bit mode, with the CS:IP
 *   init'd to the specified low-mem 4K page.
 * - if the guest has requested a 64-bit trampoline in the low-mem 4K
 *   page by placing in the specified signature, set up the register
 *   state using register state in the signature. Note that this
 *   requires accessing guest physical memory to read the signature
 *   while 'unrestricted mode' does not.
 */
static void
spinup_ap(struct vmctx *ctx, int newcpu, int vector, uint64_t *rip)
{
	int error;
	uint16_t cs;
	uint64_t desc_base;
	uint32_t desc_limit, desc_access;

	/* Mirror the hlt/pause exit settings of the other vcpus */
	if (fbsdrun_vmexit_on_hlt()) {
		error = vm_set_capability(ctx, newcpu, VM_CAP_HALT_EXIT, 1);
		assert(error == 0);
	}
	if (fbsdrun_vmexit_on_pause()) {
		error = vm_set_capability(ctx, newcpu, VM_CAP_PAUSE_EXIT, 1);
		assert(error == 0);
	}
	error = vm_set_capability(ctx, newcpu, VM_CAP_UNRESTRICTED_GUEST, 1);
	if (error) {
		/*
		 * 'Unrestricted guest' could not be enabled so real-mode
		 * execution is not possible: bring up the AP directly in
		 * 64-bit mode via the guest-supplied trampoline instead.
		 */
		spinup_ap_direct64(ctx, newcpu, vector << PAGE_SHIFT, rip);
	} else {
		/*
		 * Update the %cs and %rip of the guest so that it starts
		 * executing real mode code at 'vector << 12'.
		 */
		*rip = 0;
		error = vm_set_register(ctx, newcpu, VM_REG_GUEST_RIP, *rip);
		assert(error == 0);
		/* Keep the existing CS limit/access, change only the base */
		error = vm_get_desc(ctx, newcpu, VM_REG_GUEST_CS, &desc_base,
		    &desc_limit, &desc_access);
		assert(error == 0);
		desc_base = vector << PAGE_SHIFT;
		error = vm_set_desc(ctx, newcpu, VM_REG_GUEST_CS,
		    desc_base, desc_limit, desc_access);
		assert(error == 0);
		/* Real-mode selector: segment base / 16 */
		cs = (vector << PAGE_SHIFT) >> 4;
		error = vm_set_register(ctx, newcpu, VM_REG_GUEST_CS, cs);
		assert(error == 0);
	}
}
/*
 * Bring up an AP directly in 64-bit mode using the trampoline structure
 * that the guest has placed at guest-physical address 'gaddr'.  On
 * return, *rip holds the guest entry point.  Any failure while setting
 * up the machine state is fatal.
 */
static void
spinup_ap_direct64(struct vmctx *ctx, int newcpu, uintptr_t gaddr,
    uint64_t *rip)
{
	struct mp_v64tramp *mvt;
	const char *errstr;
	int error;
	uint64_t gdtbase;

	mvt = paddr_guest2host(gaddr);
	assert(mvt->mt_sig == MP_V64T_SIG);

	/*
	 * Set up the 3-entry GDT using memory supplied in the
	 * guest's trampoline structure.
	 */
	vm_setup_freebsd_gdt(mvt->mt_gdtr);

/* do/while(0) so the macro behaves as a single statement after 'if' etc. */
#define CHECK_ERROR(msg)			\
	do {					\
		if (error != 0) {		\
			errstr = (msg);		\
			goto err_exit;		\
		}				\
	} while (0)

	/* entry point */
	*rip = mvt->mt_eip;

	/* Get the guest virtual address of the GDT */
	gdtbase = mvt->mt_virt + __offsetof(struct mp_v64tramp, mt_gdtr);

	error = vm_setup_freebsd_registers(ctx, newcpu, mvt->mt_eip,
	    mvt->mt_cr3, gdtbase, mvt->mt_rsp);
	CHECK_ERROR("vm_setup_freebsd_registers");
#undef CHECK_ERROR

	return;

err_exit:
	printf("spinup_ap_direct64: machine state error: %s\n", errstr);
	exit(1);
}

34
usr.sbin/bhyve/xmsr.h Normal file
View File

@ -0,0 +1,34 @@
/*-
* Copyright (c) 2011 NetApp, Inc.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* $FreeBSD$
*/
#ifndef _XMSR_H_
#define _XMSR_H_

/*
 * Emulate a WRMSR of 'val' to MSR 'code' by vcpu 'vcpu'.  Returns the
 * vcpu number that the caller's run loop should switch to next.
 */
int emulate_wrmsr(struct vmctx *ctx, int vcpu, uint32_t code, uint64_t val);

#endif

15
usr.sbin/vmmctl/Makefile Normal file
View File

@ -0,0 +1,15 @@
#
# $FreeBSD$
#
# Build vmmctl, a utility that dumps and manipulates hypervisor
# register state through libvmmapi.
#
PROG= vmmctl
SRCS= vmmctl.c

# No manual page yet.
NO_MAN=

# Link against libvmmapi, the front-end to the vmm.ko chardev interface.
DPADD= ${LIBVMMAPI}
LDADD= -lvmmapi

# Pick up vmm ioctl/register definitions straight from the kernel source.
CFLAGS+= -I${.CURDIR}/../../sys/amd64/vmm

.include <bsd.prog.mk>

75
usr.sbin/vmmctl/sample.sh Executable file
View File

@ -0,0 +1,75 @@
#!/bin/sh
# $FreeBSD$

# Sample script showing how vmmctl is used to create a VM and
# hand-construct the register state for a 64-bit guest, then tear it
# back down.  The XXX-marked values are placeholders that a real boot
# would obtain from the loader.

VMMCTL="sudo ./vmmctl"
VMNAME=sample

# Create the VM and size its memory (128/256 -- presumably MB; verify
# against vmmctl's option parsing).
${VMMCTL} --vm=${VMNAME} --create
${VMMCTL} --vm=${VMNAME} --set-lowmem=128 --set-highmem=256
${VMMCTL} --vm=${VMNAME} --get-lowmem --get-highmem

# %cr0: enable protected mode and paging.
CR0_PE=$((1 << 0))
CR0_PG=$((1 << 31))
CR0=$(($CR0_PE | $CR0_PG))
${VMMCTL} --vm=${VMNAME} --set-cr0=${CR0} --get-cr0

# XXX this is bogus: the value of %cr3 should come from the loader
CR3=0
${VMMCTL} --vm=${VMNAME} --set-cr3=${CR3} --get-cr3

# %cr4: PAE is required for long mode.
CR4_PAE=$((1 << 5))
CR4=$((${CR4_PAE}))
${VMMCTL} --vm=${VMNAME} --set-cr4=${CR4} --get-cr4

DR7=0x00000400 # Table 9-1 from Intel Architecture Manual 3A
${VMMCTL} --vm=${VMNAME} --set-dr7=${DR7} --get-dr7

#
# XXX the values of rsp and rip are bogus and should come from the loader.
#
RSP=0xa5a5a5a5
RIP=0x0000bfbfbfbf0000
RFLAGS=0x2
${VMMCTL} --vm=${VMNAME} --set-rsp=${RSP} --get-rsp
${VMMCTL} --vm=${VMNAME} --set-rip=${RIP} --get-rip
${VMMCTL} --vm=${VMNAME} --set-rflags=${RFLAGS} --get-rflags

# Set "hidden" state of %cs descriptor to indicate long mode code segment.
#
# Note that this should match the contents of the entry pointed to by the
# segment selector in the GDTR.
#
${VMMCTL} --vm=${VMNAME} --set-desc-cs --desc-access=0x00002098 --get-desc-cs

# Set "hidden" state of all data descriptors to indicate a usable segment.
# The only useful fields are the "Present" and "Descriptor Type" bits.
${VMMCTL} --vm=${VMNAME} --set-desc-ds --desc-access=0x00000090 --get-desc-ds
${VMMCTL} --vm=${VMNAME} --set-desc-es --desc-access=0x00000090 --get-desc-es
${VMMCTL} --vm=${VMNAME} --set-desc-fs --desc-access=0x00000090 --get-desc-fs
${VMMCTL} --vm=${VMNAME} --set-desc-gs --desc-access=0x00000090 --get-desc-gs
${VMMCTL} --vm=${VMNAME} --set-desc-ss --desc-access=0x00000090 --get-desc-ss

#
# Set the code segment selector to point to entry at offset 8 in the GDTR.
#
${VMMCTL} --vm=${VMNAME} --set-cs=0x0008 --get-cs

# Set all the remaining data segment selectors to point to entry at offset
# 16 in the GDTR.
${VMMCTL} --vm=${VMNAME} --set-ds=0x0010 --get-ds
${VMMCTL} --vm=${VMNAME} --set-es=0x0010 --get-es
${VMMCTL} --vm=${VMNAME} --set-fs=0x0010 --get-fs
${VMMCTL} --vm=${VMNAME} --set-gs=0x0010 --get-gs
${VMMCTL} --vm=${VMNAME} --set-ss=0x0010 --get-ss

# XXX the value of the GDTR should come from the loader.
# Set the GDTR
GDTR_BASE=0xffff0000
GDTR_LIMIT=0x10
${VMMCTL} --vm=${VMNAME} --set-desc-gdtr --desc-base=${GDTR_BASE} --desc-limit=${GDTR_LIMIT} --get-desc-gdtr

# Exercise vcpu pinning: pin to cpu 0, then set -1 (presumably "unpinned"
# -- verify against vmmctl's pinning option handling).
${VMMCTL} --vm=${VMNAME} --set-pinning=0 --get-pinning
${VMMCTL} --vm=${VMNAME} --set-pinning=-1 --get-pinning

# Destroy the VM.
${VMMCTL} --vm=${VMNAME} --destroy

1485
usr.sbin/vmmctl/vmmctl.c Normal file

File diff suppressed because it is too large Load Diff