First cut to port bhyve, vmmctl, and libvmmapi to HEAD.

This commit is contained in:
John Baldwin 2011-05-15 04:03:11 +00:00
commit b67e81db43
39 changed files with 8929 additions and 0 deletions

View File

@ -110,6 +110,7 @@ SUBDIR= ${SUBDIR_ORDERED} \
${_libusbhid} \
${_libusb} \
${_libvgl} \
${_libvmmapi} \
libwrap \
liby \
libz \
@ -197,6 +198,7 @@ _libsmb= libsmb
.if ${MK_NCP} != "no"
_libncp= libncp
.endif
_libvmmapi= libvmmapi
.endif
.if ${MACHINE_CPUARCH} == "powerpc"

11
lib/libvmmapi/Makefile Normal file
View File

@ -0,0 +1,11 @@
# $FreeBSD$
LIB= vmmapi
SRCS= vmmapi.c vmmapi_freebsd.c mptable.c
INCS= vmmapi.h
WARNS?= 2
CFLAGS+= -I${.CURDIR}
.include <bsd.lib.mk>

338
lib/libvmmapi/mptable.c Normal file
View File

@ -0,0 +1,338 @@
/*-
* Copyright (c) 2011 NetApp, Inc.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* $FreeBSD$
*/
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
#include <sys/types.h>
#include <sys/mman.h>
#include <stdio.h>
#include <string.h>
#include <machine/vmm.h>
#include <machine/vmm_dev.h>
#include "vmmapi.h"
#include "mptable.h"
#define LAPIC_PADDR (0xFEE00000)
#define LAPIC_VERSION (16)
#define IOAPIC_PADDR (0xFEC00000)
#define IOAPIC_VERSION (0x11)
extern int errno;
static uint8_t
mp_compute_checksum(void *base, size_t len)
{
uint8_t *bytes = base;
uint8_t sum = 0;
for(; len > 0; len--) {
sum += *bytes++;
}
return 256 - sum;
}
static void
mp_build_mpfp(struct mp_floating_pointer *mpfp, vm_paddr_t mpfp_gpa)
{
memset(mpfp, 0, sizeof(*mpfp));
memcpy(mpfp->signature, MPFP_SIGNATURE, MPFP_SIGNATURE_LEN);
mpfp->mptable_paddr = mpfp_gpa + sizeof(*mpfp);
mpfp->specrev = MP_SPECREV;
mpfp->feature2 = 0;
mpfp->checksum = mp_compute_checksum(mpfp, sizeof(*mpfp));
}
static void
mp_build_mpch(struct mp_config_hdr *mpch)
{
memset(mpch, 0, sizeof(*mpch));
mpch->specrev = MP_SPECREV;
memcpy(mpch->signature, MPCH_SIGNATURE, MPCH_SIGNATURE_LEN);
memcpy(mpch->oemid, MPCH_OEMID, MPCH_OEMID_LEN);
memcpy(mpch->prodid, MPCH_PRODID, MPCH_PRODID_LEN);
mpch->lapic_paddr = LAPIC_PADDR;
}
static void
mp_build_proc_entries(struct mpe_proc *mpep, int num_proc)
{
int i;
for (i = 0; i < num_proc; i++) {
memset(mpep, 0, sizeof(*mpep));
mpep->entry_type = MP_ENTRY_PROC;
mpep->lapic_id = i; // XXX
mpep->lapic_version = LAPIC_VERSION;
mpep->proc_flags = (i == 0)?MPEP_FLAGS_BSP:0;
mpep->proc_flags |= MPEP_FLAGS_EN;
mpep->proc_signature = MPEP_SIGNATURE;
mpep->feature_flags = MPEP_FEATURES;
mpep++;
}
}
static void
mp_build_bus_entries(struct mpe_bus *mpeb)
{
memset(mpeb, 0, sizeof(*mpeb));
mpeb->entry_type = MP_ENTRY_BUS;
mpeb->busid = MPE_BUSID_ISA;
memcpy(mpeb->busname, MPE_BUSNAME_ISA, MPE_BUSNAME_LEN);
mpeb++;
memset(mpeb, 0, sizeof(*mpeb));
mpeb->entry_type = MP_ENTRY_BUS;
mpeb->busid = MPE_BUSID_PCI;
memcpy(mpeb->busname, MPE_BUSNAME_PCI, MPE_BUSNAME_LEN);
}
#ifdef notyet
static void
mp_build_ioapic_entries(struct mpe_ioapic *mpei)
{
memset(mpei, 0, sizeof(*mpei));
mpei->entry_type = MP_ENTRY_IOAPIC;
mpei->ioapic_id = MPE_IOAPIC_ID;
mpei->ioapic_version = IOAPIC_VERSION;
mpei->ioapic_flags = MPE_IOAPIC_FLAG_EN;
mpei->ioapic_paddr = IOAPIC_PADDR;
}
static void
mp_build_ioint_entries(struct mpe_ioint *mpeii, int num_pins)
{
int pin;
/*
* The following config is taken from kernel mptable.c
* mptable_parse_default_config_ints(...), for now
* just use the default config, tweek later if needed.
*/
/* Run through all 16 pins. */
for (pin = 0; pin < num_pins; pin++) {
memset(mpeii, 0, sizeof(*mpeii));
mpeii->entry_type = MP_ENTRY_IOINT;
mpeii->src_bus_id = MPE_BUSID_ISA;
mpeii->dst_apic_id = MPE_IOAPIC_ID;
/*
* All default configs route IRQs from bus 0 to the first 16 pins
* of the first I/O APIC with an APIC ID of 2.
*/
mpeii->dst_apic_intin = pin;
switch (pin) {
case 0:
/* Pin 0 is an ExtINT pin. */
mpeii->intr_type = MPEII_INTR_EXTINT;
break;
case 2:
/* IRQ 0 is routed to pin 2. */
mpeii->intr_type = MPEII_INTR_INT;
mpeii->src_bus_irq = 0;
break;
case 5:
case 10:
case 11:
/*
* PCI Irqs set to level triggered.
*/
mpeii->intr_flags = MPEII_FLAGS_TRIGMODE_LEVEL;
mpeii->src_bus_id = MPE_BUSID_PCI;
default:
/* All other pins are identity mapped. */
mpeii->intr_type = MPEII_INTR_INT;
mpeii->src_bus_irq = pin;
break;
}
mpeii++;
}
}
#define COPYSTR(dest, src, bytes) \
memcpy(dest, src, bytes); \
str[bytes] = 0;
static void
mptable_dump(struct mp_floating_pointer *mpfp, struct mp_config_hdr *mpch)
{
static char str[16];
int i;
char *cur;
union mpe {
struct mpe_proc *proc;
struct mpe_bus *bus;
struct mpe_ioapic *ioapic;
struct mpe_ioint *ioint;
struct mpe_lint *lnit;
char *p;
};
union mpe mpe;
printf(" MP Floating Pointer :\n");
COPYSTR(str, mpfp->signature, 4);
printf(" signature: %s\n", str);
printf(" mpch paddr: %x\n", mpfp->mptable_paddr);
printf(" length: %x\n", mpfp->length);
printf(" specrec: %x\n", mpfp->specrev);
printf(" checksum: %x\n", mpfp->checksum);
printf(" feature1: %x\n", mpfp->feature1);
printf(" feature2: %x\n", mpfp->feature2);
printf(" feature3: %x\n", mpfp->feature3);
printf(" feature4: %x\n", mpfp->feature4);
printf(" MP Configuration Header :\n");
COPYSTR(str, mpch->signature, 4);
printf(" signature: %s\n", str);
printf(" length: %x\n", mpch->length);
printf(" specrec: %x\n", mpch->specrev);
printf(" checksum: %x\n", mpch->checksum);
COPYSTR(str, mpch->oemid, MPCH_OEMID_LEN);
printf(" oemid: %s\n", str);
COPYSTR(str, mpch->prodid, MPCH_PRODID_LEN);
printf(" prodid: %s\n", str);
printf(" oem_ptr: %x\n", mpch->oem_ptr);
printf(" oem_sz: %x\n", mpch->oem_sz);
printf(" nr_entries: %x\n", mpch->nr_entries);
printf(" apic paddr: %x\n", mpch->lapic_paddr);
printf(" ext_length: %x\n", mpch->ext_length);
printf(" ext_checksum: %x\n", mpch->ext_checksum);
cur = (char *)mpch + sizeof(*mpch);
for (i = 0; i < mpch->nr_entries; i++) {
mpe.p = cur;
switch(*mpe.p) {
case MP_ENTRY_PROC:
printf(" MP Processor Entry :\n");
printf(" lapic_id: %x\n", mpe.proc->lapic_id);
printf(" lapic_version: %x\n", mpe.proc->lapic_version);
printf(" proc_flags: %x\n", mpe.proc->proc_flags);
printf(" proc_signature: %x\n", mpe.proc->proc_signature);
printf(" feature_flags: %x\n", mpe.proc->feature_flags);
cur += sizeof(struct mpe_proc);
break;
case MP_ENTRY_BUS:
printf(" MP Bus Entry :\n");
printf(" busid: %x\n", mpe.bus->busid);
COPYSTR(str, mpe.bus->busname, MPE_BUSNAME_LEN);
printf(" busname: %s\n", str);
cur += sizeof(struct mpe_bus);
break;
case MP_ENTRY_IOAPIC:
printf(" MP IOAPIC Entry :\n");
printf(" ioapi_id: %x\n", mpe.ioapic->ioapic_id);
printf(" ioapi_version: %x\n", mpe.ioapic->ioapic_version);
printf(" ioapi_flags: %x\n", mpe.ioapic->ioapic_flags);
printf(" ioapi_paddr: %x\n", mpe.ioapic->ioapic_paddr);
cur += sizeof(struct mpe_ioapic);
break;
case MP_ENTRY_IOINT:
printf(" MP IO Interrupt Entry :\n");
printf(" intr_type: %x\n", mpe.ioint->intr_type);
printf(" intr_flags: %x\n", mpe.ioint->intr_flags);
printf(" src_bus_id: %x\n", mpe.ioint->src_bus_id);
printf(" src_bus_irq: %x\n", mpe.ioint->src_bus_irq);
printf(" dst_apic_id: %x\n", mpe.ioint->dst_apic_id);
printf(" dst_apic_intin: %x\n", mpe.ioint->dst_apic_intin);
cur += sizeof(struct mpe_ioint);
break;
case MP_ENTRY_LINT:
printf(" MP Local Interrupt Entry :\n");
cur += sizeof(struct mpe_lint);
break;
}
}
}
#endif
int
vm_build_mptable(struct vmctx *ctx, vm_paddr_t gpa, int len, int ncpu,
void *oemp, int oemsz)
{
struct mp_config_hdr *mpch;
char *mapaddr;
char *startaddr;
int error;
mapaddr = vm_map_memory(ctx, gpa, len);
if (mapaddr == MAP_FAILED) {
printf("%s\n", strerror(errno));
goto err;
}
startaddr = mapaddr;
mp_build_mpfp((struct mp_floating_pointer*) mapaddr, gpa);
mapaddr += sizeof(struct mp_floating_pointer);
mpch = (struct mp_config_hdr*)mapaddr;
mp_build_mpch(mpch);
mapaddr += sizeof(struct mp_config_hdr);
mp_build_proc_entries((struct mpe_proc*) mapaddr, ncpu);
mapaddr += (sizeof(struct mpe_proc)*ncpu);
mpch->nr_entries += ncpu;
mp_build_bus_entries((struct mpe_bus*)mapaddr);
mapaddr += (sizeof(struct mpe_bus)*MPE_NUM_BUSES);
mpch->nr_entries += MPE_NUM_BUSES;
#if 0
mp_build_ioapic_entries((struct mpe_ioapic*)mapaddr);
mapaddr += sizeof(struct mpe_ioapic);
mpch->nr_entries++;
mp_build_ioint_entries((struct mpe_ioint*)mapaddr, MPEII_MAX_IRQ);
mapaddr += sizeof(struct mpe_ioint)*MPEII_MAX_IRQ;
mpch->nr_entries += MPEII_MAX_IRQ;
#endif
if (oemp) {
mpch->oem_ptr = mapaddr - startaddr + gpa;
mpch->oem_sz = oemsz;
memcpy(mapaddr, oemp, oemsz);
}
mpch->length = (mapaddr) - ((char*) mpch);
mpch->checksum = mp_compute_checksum(mpch, sizeof(*mpch));
// mptable_dump((struct mp_floating_pointer*)startaddr, mpch);
err:
return (error);
}

171
lib/libvmmapi/mptable.h Normal file
View File

@ -0,0 +1,171 @@
/*-
* Copyright (c) 2011 NetApp, Inc.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* $FreeBSD$
*/
#ifndef _MPTABLE_h_
#define _MPTABLE_h_
#define MP_SPECREV (4) // MP spec revision 1.1
/*
* MP Floating Pointer Structure
*/
#define MPFP_SIGNATURE "_MP_"
#define MPFP_SIGNATURE_LEN (4)
#define MPFP_FEATURE2 (0x80) // IMCR is present
struct mp_floating_pointer {
uint8_t signature[MPFP_SIGNATURE_LEN];
uint32_t mptable_paddr;
uint8_t length;
uint8_t specrev;
uint8_t checksum;
uint8_t feature1;
uint8_t feature2;
uint8_t feature3;
uint8_t feature4;
uint8_t feature5;
};
/*
* MP Configuration Table Header
*/
#define MPCH_SIGNATURE "PCMP"
#define MPCH_SIGNATURE_LEN (4)
#define MPCH_OEMID "NETAPP "
#define MPCH_OEMID_LEN (8)
#define MPCH_PRODID "vFiler "
#define MPCH_PRODID_LEN (12)
struct mp_config_hdr {
uint8_t signature[MPCH_SIGNATURE_LEN];
uint16_t length;
uint8_t specrev;
uint8_t checksum;
uint8_t oemid[MPCH_OEMID_LEN];
uint8_t prodid[MPCH_PRODID_LEN];
uint32_t oem_ptr;
uint16_t oem_sz;
uint16_t nr_entries;
uint32_t lapic_paddr;
uint16_t ext_length;
uint8_t ext_checksum;
uint8_t reserved;
};
#define MP_ENTRY_PROC (0)
#define MP_ENTRY_BUS (1)
#define MP_ENTRY_IOAPIC (2)
#define MP_ENTRY_IOINT (3)
#define MP_ENTRY_LINT (4)
/*
* MP Processor Entry
*/
#define MPEP_FLAGS_EN (0x1)
#define MPEP_FLAGS_BSP (0x2)
#define MPEP_SIG_FAMILY (6)
#define MPEP_SIG_MODEL (26)
#define MPEP_SIG_STEPPING (5)
#define MPEP_SIGNATURE ((MPEP_SIG_FAMILY << 8) | (MPEP_SIG_MODEL << 4) \
| (MPEP_SIG_STEPPING))
#define MPEP_FEATURES (0xBFEBFBFF) // Value from Intel i7 CPUID
struct mpe_proc {
uint8_t entry_type;
uint8_t lapic_id;
uint8_t lapic_version;
uint8_t proc_flags;
uint32_t proc_signature;
uint32_t feature_flags;
uint8_t reserved[8];
};
/*
* MP Bus Entry
*/
#define MPE_NUM_BUSES (2)
#define MPE_BUSNAME_LEN (6)
#define MPE_BUSID_ISA (0)
#define MPE_BUSID_PCI (1)
#define MPE_BUSNAME_ISA "ISA "
#define MPE_BUSNAME_PCI "PCI "
struct mpe_bus {
uint8_t entry_type;
uint8_t busid;
uint8_t busname[MPE_BUSNAME_LEN];
};
/*
* MP IO APIC Entry
*/
#define MPE_IOAPIC_ID (2)
#define MPE_IOAPIC_FLAG_EN (1)
struct mpe_ioapic {
uint8_t entry_type;
uint8_t ioapic_id;
uint8_t ioapic_version;
uint8_t ioapic_flags;
uint32_t ioapic_paddr;
};
/*
* MP IO Interrupt Assignment Entry
*/
#define MPEII_INTR_INT (0)
#define MPEII_INTR_NMI (1)
#define MPEII_INTR_SMI (2)
#define MPEII_INTR_EXTINT (3)
#define MPEII_PCI_IRQ_MASK (0x0c20U) /* IRQ 5,10,11 are PCI connected */
#define MPEII_MAX_IRQ (16)
#define MPEII_FLAGS_TRIGMODE_LEVEL (0x3)
struct mpe_ioint {
uint8_t entry_type;
uint8_t intr_type;
uint16_t intr_flags;
uint8_t src_bus_id;
uint8_t src_bus_irq;
uint8_t dst_apic_id;
uint8_t dst_apic_intin;
};
/*
* MP Local Interrupt Assignment Entry
*/
struct mpe_lint {
uint8_t entry_type;
};
int vm_build_mptable(struct vmctx *ctxt, vm_paddr_t gpa, int len,
int ncpu, void *oemp, int oemsz);
#endif /* _MPTABLE_h_ */

645
lib/libvmmapi/vmmapi.c Normal file
View File

@ -0,0 +1,645 @@
/*-
* Copyright (c) 2011 NetApp, Inc.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* $FreeBSD$
*/
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
#include <sys/types.h>
#include <sys/sysctl.h>
#include <sys/ioctl.h>
#include <sys/mman.h>
#include <machine/specialreg.h>
#include <stdio.h>
#include <stdlib.h>
#include <assert.h>
#include <string.h>
#include <fcntl.h>
#include <unistd.h>
#include <machine/vmm.h>
#include <machine/vmm_dev.h>
#include "vmmapi.h"
#include "mptable.h"
#ifndef CR4_VMXE
#define CR4_VMXE (1UL << 13)
#endif
#define BIOS_ROM_BASE (0xf0000)
#define BIOS_ROM_SIZE (0x10000)
struct vmctx {
int fd;
char *name;
};
#define CREATE(x) sysctlbyname("hw.vmm.create", NULL, NULL, (x), strlen((x)))
#define DESTROY(x) sysctlbyname("hw.vmm.destroy", NULL, NULL, (x), strlen((x)))
static int
vm_device_open(const char *name)
{
int fd, len;
char *vmfile;
len = strlen("/dev/vmm/") + strlen(name) + 1;
vmfile = malloc(len);
assert(vmfile != NULL);
snprintf(vmfile, len, "/dev/vmm/%s", name);
/* Open the device file */
fd = open(vmfile, O_RDWR, 0);
free(vmfile);
return (fd);
}
int
vm_create(const char *name)
{
return (CREATE((char *)name));
}
struct vmctx *
vm_open(const char *name)
{
struct vmctx *vm;
vm = malloc(sizeof(struct vmctx) + strlen(name) + 1);
assert(vm != NULL);
vm->fd = -1;
vm->name = (char *)(vm + 1);
strcpy(vm->name, name);
if ((vm->fd = vm_device_open(vm->name)) < 0)
goto err;
return (vm);
err:
vm_destroy(vm);
return (NULL);
}
void
vm_destroy(struct vmctx *vm)
{
assert(vm != NULL);
DESTROY(vm->name);
if (vm->fd >= 0)
close(vm->fd);
free(vm);
}
int
vm_get_memory_seg(struct vmctx *ctx, vm_paddr_t gpa,
vm_paddr_t *ret_hpa, size_t *ret_len)
{
int error;
struct vm_memory_segment seg;
bzero(&seg, sizeof(seg));
seg.gpa = gpa;
error = ioctl(ctx->fd, VM_GET_MEMORY_SEG, &seg);
*ret_hpa = seg.hpa;
*ret_len = seg.len;
return (error);
}
int
vm_setup_memory(struct vmctx *ctx, vm_paddr_t gpa, size_t len, char **mapaddr)
{
int error;
struct vm_memory_segment seg;
/*
* Create and optionally map 'len' bytes of memory at guest
* physical address 'gpa'
*/
bzero(&seg, sizeof(seg));
seg.gpa = gpa;
seg.len = len;
error = ioctl(ctx->fd, VM_MAP_MEMORY, &seg);
if (error == 0 && mapaddr != NULL) {
*mapaddr = mmap(NULL, len, PROT_READ | PROT_WRITE, MAP_SHARED,
ctx->fd, gpa);
}
return (error);
}
char *
vm_map_memory(struct vmctx *ctx, vm_paddr_t gpa, size_t len)
{
/* Map 'len' bytes of memory at guest physical address 'gpa' */
return ((char *)mmap(NULL, len, PROT_READ | PROT_WRITE, MAP_SHARED,
ctx->fd, gpa));
}
int
vm_set_desc(struct vmctx *ctx, int vcpu, int reg,
uint64_t base, uint32_t limit, uint32_t access)
{
int error;
struct vm_seg_desc vmsegdesc;
bzero(&vmsegdesc, sizeof(vmsegdesc));
vmsegdesc.cpuid = vcpu;
vmsegdesc.regnum = reg;
vmsegdesc.desc.base = base;
vmsegdesc.desc.limit = limit;
vmsegdesc.desc.access = access;
error = ioctl(ctx->fd, VM_SET_SEGMENT_DESCRIPTOR, &vmsegdesc);
return (error);
}
int
vm_get_desc(struct vmctx *ctx, int vcpu, int reg,
uint64_t *base, uint32_t *limit, uint32_t *access)
{
int error;
struct vm_seg_desc vmsegdesc;
bzero(&vmsegdesc, sizeof(vmsegdesc));
vmsegdesc.cpuid = vcpu;
vmsegdesc.regnum = reg;
error = ioctl(ctx->fd, VM_GET_SEGMENT_DESCRIPTOR, &vmsegdesc);
if (error == 0) {
*base = vmsegdesc.desc.base;
*limit = vmsegdesc.desc.limit;
*access = vmsegdesc.desc.access;
}
return (error);
}
int
vm_set_register(struct vmctx *ctx, int vcpu, int reg, uint64_t val)
{
int error;
struct vm_register vmreg;
bzero(&vmreg, sizeof(vmreg));
vmreg.cpuid = vcpu;
vmreg.regnum = reg;
vmreg.regval = val;
error = ioctl(ctx->fd, VM_SET_REGISTER, &vmreg);
return (error);
}
int
vm_get_register(struct vmctx *ctx, int vcpu, int reg, uint64_t *ret_val)
{
int error;
struct vm_register vmreg;
bzero(&vmreg, sizeof(vmreg));
vmreg.cpuid = vcpu;
vmreg.regnum = reg;
error = ioctl(ctx->fd, VM_GET_REGISTER, &vmreg);
*ret_val = vmreg.regval;
return (error);
}
int
vm_get_pinning(struct vmctx *ctx, int vcpu, int *host_cpuid)
{
int error;
struct vm_pin vmpin;
bzero(&vmpin, sizeof(vmpin));
vmpin.vm_cpuid = vcpu;
error = ioctl(ctx->fd, VM_GET_PINNING, &vmpin);
*host_cpuid = vmpin.host_cpuid;
return (error);
}
int
vm_set_pinning(struct vmctx *ctx, int vcpu, int host_cpuid)
{
int error;
struct vm_pin vmpin;
bzero(&vmpin, sizeof(vmpin));
vmpin.vm_cpuid = vcpu;
vmpin.host_cpuid = host_cpuid;
error = ioctl(ctx->fd, VM_SET_PINNING, &vmpin);
return (error);
}
int
vm_run(struct vmctx *ctx, int vcpu, uint64_t rip, struct vm_exit *vmexit)
{
int error;
struct vm_run vmrun;
bzero(&vmrun, sizeof(vmrun));
vmrun.cpuid = vcpu;
vmrun.rip = rip;
error = ioctl(ctx->fd, VM_RUN, &vmrun);
bcopy(&vmrun.vm_exit, vmexit, sizeof(struct vm_exit));
return (error);
}
static int
vm_inject_event_real(struct vmctx *ctx, int vcpu, enum vm_event_type type,
int vector, int error_code, int error_code_valid)
{
struct vm_event ev;
bzero(&ev, sizeof(ev));
ev.cpuid = vcpu;
ev.type = type;
ev.vector = vector;
ev.error_code = error_code;
ev.error_code_valid = error_code_valid;
return (ioctl(ctx->fd, VM_INJECT_EVENT, &ev));
}
int
vm_inject_event(struct vmctx *ctx, int vcpu, enum vm_event_type type,
int vector)
{
return (vm_inject_event_real(ctx, vcpu, type, vector, 0, 0));
}
int
vm_inject_event2(struct vmctx *ctx, int vcpu, enum vm_event_type type,
int vector, int error_code)
{
return (vm_inject_event_real(ctx, vcpu, type, vector, error_code, 1));
}
int
vm_build_tables(struct vmctx *ctxt, int ncpu, void *oemtbl, int oemtblsz)
{
return (vm_build_mptable(ctxt, BIOS_ROM_BASE, BIOS_ROM_SIZE, ncpu,
oemtbl, oemtblsz));
}
int
vm_lapic_irq(struct vmctx *ctx, int vcpu, int vector)
{
struct vm_lapic_irq vmirq;
bzero(&vmirq, sizeof(vmirq));
vmirq.cpuid = vcpu;
vmirq.vector = vector;
return (ioctl(ctx->fd, VM_LAPIC_IRQ, &vmirq));
}
int
vm_inject_nmi(struct vmctx *ctx, int vcpu)
{
struct vm_nmi vmnmi;
bzero(&vmnmi, sizeof(vmnmi));
vmnmi.cpuid = vcpu;
return (ioctl(ctx->fd, VM_INJECT_NMI, &vmnmi));
}
int
vm_capability_name2type(const char *capname)
{
int i;
static struct {
const char *name;
int type;
} capstrmap[] = {
{ "hlt_exit", VM_CAP_HALT_EXIT },
{ "mtrap_exit", VM_CAP_MTRAP_EXIT },
{ "pause_exit", VM_CAP_PAUSE_EXIT },
{ "unrestricted_guest", VM_CAP_UNRESTRICTED_GUEST },
{ 0 }
};
for (i = 0; capstrmap[i].name != NULL && capname != NULL; i++) {
if (strcmp(capstrmap[i].name, capname) == 0)
return (capstrmap[i].type);
}
return (-1);
}
int
vm_get_capability(struct vmctx *ctx, int vcpu, enum vm_cap_type cap,
int *retval)
{
int error;
struct vm_capability vmcap;
bzero(&vmcap, sizeof(vmcap));
vmcap.cpuid = vcpu;
vmcap.captype = cap;
error = ioctl(ctx->fd, VM_GET_CAPABILITY, &vmcap);
*retval = vmcap.capval;
return (error);
}
int
vm_set_capability(struct vmctx *ctx, int vcpu, enum vm_cap_type cap, int val)
{
struct vm_capability vmcap;
bzero(&vmcap, sizeof(vmcap));
vmcap.cpuid = vcpu;
vmcap.captype = cap;
vmcap.capval = val;
return (ioctl(ctx->fd, VM_SET_CAPABILITY, &vmcap));
}
int
vm_assign_pptdev(struct vmctx *ctx, int bus, int slot, int func)
{
struct vm_pptdev pptdev;
bzero(&pptdev, sizeof(pptdev));
pptdev.bus = bus;
pptdev.slot = slot;
pptdev.func = func;
return (ioctl(ctx->fd, VM_BIND_PPTDEV, &pptdev));
}
int
vm_unassign_pptdev(struct vmctx *ctx, int bus, int slot, int func)
{
struct vm_pptdev pptdev;
bzero(&pptdev, sizeof(pptdev));
pptdev.bus = bus;
pptdev.slot = slot;
pptdev.func = func;
return (ioctl(ctx->fd, VM_UNBIND_PPTDEV, &pptdev));
}
int
vm_map_pptdev_mmio(struct vmctx *ctx, int bus, int slot, int func,
vm_paddr_t gpa, size_t len, vm_paddr_t hpa)
{
struct vm_pptdev_mmio pptmmio;
bzero(&pptmmio, sizeof(pptmmio));
pptmmio.bus = bus;
pptmmio.slot = slot;
pptmmio.func = func;
pptmmio.gpa = gpa;
pptmmio.len = len;
pptmmio.hpa = hpa;
return (ioctl(ctx->fd, VM_MAP_PPTDEV_MMIO, &pptmmio));
}
int
vm_setup_msi(struct vmctx *ctx, int vcpu, int bus, int slot, int func,
int destcpu, int vector, int numvec)
{
struct vm_pptdev_msi pptmsi;
bzero(&pptmsi, sizeof(pptmsi));
pptmsi.vcpu = vcpu;
pptmsi.bus = bus;
pptmsi.slot = slot;
pptmsi.func = func;
pptmsi.destcpu = destcpu;
pptmsi.vector = vector;
pptmsi.numvec = numvec;
return (ioctl(ctx->fd, VM_PPTDEV_MSI, &pptmsi));
}
uint64_t *
vm_get_stats(struct vmctx *ctx, int vcpu, struct timeval *ret_tv,
int *ret_entries)
{
int error;
static struct vm_stats vmstats;
vmstats.cpuid = vcpu;
error = ioctl(ctx->fd, VM_STATS, &vmstats);
if (error == 0) {
if (ret_entries)
*ret_entries = vmstats.num_entries;
if (ret_tv)
*ret_tv = vmstats.tv;
return (vmstats.statbuf);
} else
return (NULL);
}
const char *
vm_get_stat_desc(struct vmctx *ctx, int index)
{
static struct vm_stat_desc statdesc;
statdesc.index = index;
if (ioctl(ctx->fd, VM_STAT_DESC, &statdesc) == 0)
return (statdesc.desc);
else
return (NULL);
}
/*
* From Intel Vol 3a:
* Table 9-1. IA-32 Processor States Following Power-up, Reset or INIT
*/
int
vcpu_reset(struct vmctx *vmctx, int vcpu)
{
int error;
uint64_t rflags, rip, cr0, cr4, zero, desc_base, rdx;
uint32_t desc_access, desc_limit;
uint16_t sel;
zero = 0;
rflags = 0x2;
error = vm_set_register(vmctx, vcpu, VM_REG_GUEST_RFLAGS, rflags);
if (error)
goto done;
rip = 0xfff0;
if ((error = vm_set_register(vmctx, vcpu, VM_REG_GUEST_RIP, rip)) != 0)
goto done;
cr0 = CR0_NE;
if ((error = vm_set_register(vmctx, vcpu, VM_REG_GUEST_CR0, cr0)) != 0)
goto done;
if ((error = vm_set_register(vmctx, vcpu, VM_REG_GUEST_CR3, zero)) != 0)
goto done;
cr4 = CR4_VMXE;
if ((error = vm_set_register(vmctx, vcpu, VM_REG_GUEST_CR4, cr4)) != 0)
goto done;
/*
* CS: present, r/w, accessed, 16-bit, byte granularity, usable
*/
desc_base = 0xffff0000;
desc_limit = 0xffff;
desc_access = 0x0093;
error = vm_set_desc(vmctx, vcpu, VM_REG_GUEST_CS,
desc_base, desc_limit, desc_access);
if (error)
goto done;
sel = 0xf000;
if ((error = vm_set_register(vmctx, vcpu, VM_REG_GUEST_CS, sel)) != 0)
goto done;
/*
* SS,DS,ES,FS,GS: present, r/w, accessed, 16-bit, byte granularity
*/
desc_base = 0;
desc_limit = 0xffff;
desc_access = 0x0093;
error = vm_set_desc(vmctx, vcpu, VM_REG_GUEST_SS,
desc_base, desc_limit, desc_access);
if (error)
goto done;
error = vm_set_desc(vmctx, vcpu, VM_REG_GUEST_DS,
desc_base, desc_limit, desc_access);
if (error)
goto done;
error = vm_set_desc(vmctx, vcpu, VM_REG_GUEST_ES,
desc_base, desc_limit, desc_access);
if (error)
goto done;
error = vm_set_desc(vmctx, vcpu, VM_REG_GUEST_FS,
desc_base, desc_limit, desc_access);
if (error)
goto done;
error = vm_set_desc(vmctx, vcpu, VM_REG_GUEST_GS,
desc_base, desc_limit, desc_access);
if (error)
goto done;
sel = 0;
if ((error = vm_set_register(vmctx, vcpu, VM_REG_GUEST_SS, sel)) != 0)
goto done;
if ((error = vm_set_register(vmctx, vcpu, VM_REG_GUEST_DS, sel)) != 0)
goto done;
if ((error = vm_set_register(vmctx, vcpu, VM_REG_GUEST_ES, sel)) != 0)
goto done;
if ((error = vm_set_register(vmctx, vcpu, VM_REG_GUEST_FS, sel)) != 0)
goto done;
if ((error = vm_set_register(vmctx, vcpu, VM_REG_GUEST_GS, sel)) != 0)
goto done;
/* General purpose registers */
rdx = 0xf00;
if ((error = vm_set_register(vmctx, vcpu, VM_REG_GUEST_RAX, zero)) != 0)
goto done;
if ((error = vm_set_register(vmctx, vcpu, VM_REG_GUEST_RBX, zero)) != 0)
goto done;
if ((error = vm_set_register(vmctx, vcpu, VM_REG_GUEST_RCX, zero)) != 0)
goto done;
if ((error = vm_set_register(vmctx, vcpu, VM_REG_GUEST_RDX, rdx)) != 0)
goto done;
if ((error = vm_set_register(vmctx, vcpu, VM_REG_GUEST_RSI, zero)) != 0)
goto done;
if ((error = vm_set_register(vmctx, vcpu, VM_REG_GUEST_RDI, zero)) != 0)
goto done;
if ((error = vm_set_register(vmctx, vcpu, VM_REG_GUEST_RBP, zero)) != 0)
goto done;
if ((error = vm_set_register(vmctx, vcpu, VM_REG_GUEST_RSP, zero)) != 0)
goto done;
/* GDTR, IDTR */
desc_base = 0;
desc_limit = 0xffff;
desc_access = 0;
error = vm_set_desc(vmctx, vcpu, VM_REG_GUEST_GDTR,
desc_base, desc_limit, desc_access);
if (error != 0)
goto done;
error = vm_set_desc(vmctx, vcpu, VM_REG_GUEST_IDTR,
desc_base, desc_limit, desc_access);
if (error != 0)
goto done;
/* TR */
desc_base = 0;
desc_limit = 0xffff;
desc_access = 0x0000008b;
error = vm_set_desc(vmctx, vcpu, VM_REG_GUEST_TR, 0, 0, desc_access);
if (error)
goto done;
sel = 0;
if ((error = vm_set_register(vmctx, vcpu, VM_REG_GUEST_TR, sel)) != 0)
goto done;
/* LDTR */
desc_base = 0;
desc_limit = 0xffff;
desc_access = 0x00000082;
error = vm_set_desc(vmctx, vcpu, VM_REG_GUEST_LDTR, desc_base,
desc_limit, desc_access);
if (error)
goto done;
sel = 0;
if ((error = vm_set_register(vmctx, vcpu, VM_REG_GUEST_LDTR, 0)) != 0)
goto done;
/* XXX cr2, debug registers */
error = 0;
done:
return (error);
}

98
lib/libvmmapi/vmmapi.h Normal file
View File

@ -0,0 +1,98 @@
/*-
* Copyright (c) 2011 NetApp, Inc.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* $FreeBSD$
*/
#ifndef _VMMAPI_H_
#define _VMMAPI_H_
struct vmctx;
int vm_create(const char *name);
struct vmctx *vm_open(const char *name);
void vm_destroy(struct vmctx *ctx);
int vm_get_memory_seg(struct vmctx *ctx, vm_paddr_t gpa,
vm_paddr_t *ret_hpa, size_t *ret_len);
/*
* Create a memory segment of 'len' bytes in the guest physical address space
* at offset 'gpa'.
*
* If 'mapaddr' is not NULL then this region is mmap'ed into the address
* space of the calling process. If there is an mmap error then *mapaddr
* will be set to MAP_FAILED.
*/
int vm_setup_memory(struct vmctx *ctx, vm_paddr_t gpa, size_t len,
char **mapaddr);
char * vm_map_memory(struct vmctx *ctx, vm_paddr_t gpa, size_t len);
int vm_set_desc(struct vmctx *ctx, int vcpu, int reg,
uint64_t base, uint32_t limit, uint32_t access);
int vm_get_desc(struct vmctx *ctx, int vcpu, int reg,
uint64_t *base, uint32_t *limit, uint32_t *access);
int vm_set_register(struct vmctx *ctx, int vcpu, int reg, uint64_t val);
int vm_get_register(struct vmctx *ctx, int vcpu, int reg, uint64_t *retval);
int vm_get_pinning(struct vmctx *ctx, int vcpu, int *host_cpuid);
int vm_set_pinning(struct vmctx *ctx, int vcpu, int host_cpuid);
int vm_run(struct vmctx *ctx, int vcpu, uint64_t rip,
struct vm_exit *ret_vmexit);
int vm_build_tables(struct vmctx *ctxt, int ncpus, void *oemtbl,
int oemtblsz);
int vm_inject_event(struct vmctx *ctx, int vcpu, enum vm_event_type type,
int vector);
int vm_inject_event2(struct vmctx *ctx, int vcpu, enum vm_event_type type,
int vector, int error_code);
int vm_lapic_irq(struct vmctx *ctx, int vcpu, int vector);
int vm_inject_nmi(struct vmctx *ctx, int vcpu);
int vm_capability_name2type(const char *capname);
int vm_get_capability(struct vmctx *ctx, int vcpu, enum vm_cap_type cap,
int *retval);
int vm_set_capability(struct vmctx *ctx, int vcpu, enum vm_cap_type cap,
int val);
int vm_assign_pptdev(struct vmctx *ctx, int bus, int slot, int func);
int vm_unassign_pptdev(struct vmctx *ctx, int bus, int slot, int func);
int vm_map_pptdev_mmio(struct vmctx *ctx, int bus, int slot, int func,
vm_paddr_t gpa, size_t len, vm_paddr_t hpa);
int vm_setup_msi(struct vmctx *ctx, int vcpu, int bus, int slot, int func,
int dest, int vector, int numvec);
/*
* Return a pointer to the statistics buffer. Note that this is not MT-safe.
*/
uint64_t *vm_get_stats(struct vmctx *ctx, int vcpu, struct timeval *ret_tv,
int *ret_entries);
const char *vm_get_stat_desc(struct vmctx *ctx, int index);
/* Reset vcpu register state */
int vcpu_reset(struct vmctx *ctx, int vcpu);
/*
* FreeBSD specific APIs
*/
int vm_setup_freebsd_registers(struct vmctx *ctx, int vcpu,
uint64_t rip, uint64_t cr3, uint64_t gdtbase,
uint64_t rsp);
void vm_setup_freebsd_gdt(uint64_t *gdtr);
#endif /* _VMMAPI_H_ */

View File

@ -0,0 +1,187 @@
/*-
* Copyright (c) 2011 NetApp, Inc.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* $FreeBSD$
*/
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
#include <sys/types.h>
#include <machine/specialreg.h>
#include <machine/segments.h>
#include <machine/vmm.h>
#include "vmmapi.h"
#ifndef CR4_VMXE
#define CR4_VMXE (1UL << 13)
#endif
#define DESC_UNUSABLE 0x00010000
#define GUEST_NULL_SEL 0
#define GUEST_CODE_SEL 1
#define GUEST_DATA_SEL 2
#define GUEST_GDTR_LIMIT (3 * 8 - 1)
void
vm_setup_freebsd_gdt(uint64_t *gdtr)
{
gdtr[GUEST_NULL_SEL] = 0;
gdtr[GUEST_CODE_SEL] = 0x0020980000000000;
gdtr[GUEST_DATA_SEL] = 0x0000900000000000;
}
/*
* Setup the 'vcpu' register set such that it will begin execution at
* 'rip' in long mode.
*/
int
vm_setup_freebsd_registers(struct vmctx *vmctx, int vcpu,
uint64_t rip, uint64_t cr3, uint64_t gdtbase,
uint64_t rsp)
{
int error;
uint64_t cr0, cr4, efer, rflags, desc_base;
uint32_t desc_access, desc_limit;
uint16_t gsel;
cr0 = CR0_PE | CR0_PG | CR0_NE;
if ((error = vm_set_register(vmctx, vcpu, VM_REG_GUEST_CR0, cr0)) != 0)
goto done;
cr4 = CR4_PAE | CR4_VMXE;
if ((error = vm_set_register(vmctx, vcpu, VM_REG_GUEST_CR4, cr4)) != 0)
goto done;
efer = EFER_LME | EFER_LMA;
if ((error = vm_set_register(vmctx, vcpu, VM_REG_GUEST_EFER, efer)))
goto done;
rflags = 0x2;
error = vm_set_register(vmctx, vcpu, VM_REG_GUEST_RFLAGS, rflags);
if (error)
goto done;
desc_base = 0;
desc_limit = 0;
desc_access = 0x0000209B;
error = vm_set_desc(vmctx, vcpu, VM_REG_GUEST_CS,
desc_base, desc_limit, desc_access);
if (error)
goto done;
desc_access = 0x00000093;
error = vm_set_desc(vmctx, vcpu, VM_REG_GUEST_DS,
desc_base, desc_limit, desc_access);
if (error)
goto done;
error = vm_set_desc(vmctx, vcpu, VM_REG_GUEST_ES,
desc_base, desc_limit, desc_access);
if (error)
goto done;
error = vm_set_desc(vmctx, vcpu, VM_REG_GUEST_FS,
desc_base, desc_limit, desc_access);
if (error)
goto done;
error = vm_set_desc(vmctx, vcpu, VM_REG_GUEST_GS,
desc_base, desc_limit, desc_access);
if (error)
goto done;
error = vm_set_desc(vmctx, vcpu, VM_REG_GUEST_SS,
desc_base, desc_limit, desc_access);
if (error)
goto done;
/*
* XXX TR is pointing to null selector even though we set the
* TSS segment to be usable with a base address and limit of 0.
*/
desc_access = 0x0000008b;
error = vm_set_desc(vmctx, vcpu, VM_REG_GUEST_TR, 0, 0, desc_access);
if (error)
goto done;
error = vm_set_desc(vmctx, vcpu, VM_REG_GUEST_LDTR, 0, 0,
DESC_UNUSABLE);
if (error)
goto done;
gsel = GSEL(GUEST_CODE_SEL, SEL_KPL);
if ((error = vm_set_register(vmctx, vcpu, VM_REG_GUEST_CS, gsel)) != 0)
goto done;
gsel = GSEL(GUEST_DATA_SEL, SEL_KPL);
if ((error = vm_set_register(vmctx, vcpu, VM_REG_GUEST_DS, gsel)) != 0)
goto done;
if ((error = vm_set_register(vmctx, vcpu, VM_REG_GUEST_ES, gsel)) != 0)
goto done;
if ((error = vm_set_register(vmctx, vcpu, VM_REG_GUEST_FS, gsel)) != 0)
goto done;
if ((error = vm_set_register(vmctx, vcpu, VM_REG_GUEST_GS, gsel)) != 0)
goto done;
if ((error = vm_set_register(vmctx, vcpu, VM_REG_GUEST_SS, gsel)) != 0)
goto done;
/* XXX TR is pointing to the null selector */
if ((error = vm_set_register(vmctx, vcpu, VM_REG_GUEST_TR, 0)) != 0)
goto done;
/* LDTR is pointing to the null selector */
if ((error = vm_set_register(vmctx, vcpu, VM_REG_GUEST_LDTR, 0)) != 0)
goto done;
/* entry point */
if ((error = vm_set_register(vmctx, vcpu, VM_REG_GUEST_RIP, rip)) != 0)
goto done;
/* page table base */
if ((error = vm_set_register(vmctx, vcpu, VM_REG_GUEST_CR3, cr3)) != 0)
goto done;
desc_base = gdtbase;
desc_limit = GUEST_GDTR_LIMIT;
error = vm_set_desc(vmctx, vcpu, VM_REG_GUEST_GDTR,
desc_base, desc_limit, 0);
if (error != 0)
goto done;
if ((error = vm_set_register(vmctx, vcpu, VM_REG_GUEST_RSP, rsp)) != 0)
goto done;
error = 0;
done:
return (error);
}

View File

@ -160,6 +160,7 @@ LIBULOG?= ${DESTDIR}${LIBDIR}/libulog.a
LIBUTIL?= ${DESTDIR}${LIBDIR}/libutil.a
LIBUUTIL?= ${DESTDIR}${LIBDIR}/libuutil.a
LIBVGL?= ${DESTDIR}${LIBDIR}/libvgl.a
LIBVMMAPI?= ${DESTDIR}${LIBDIR}/libvmmapi.a
LIBWRAP?= ${DESTDIR}${LIBDIR}/libwrap.a
LIBXPG4?= ${DESTDIR}${LIBDIR}/libxpg4.a
LIBY?= ${DESTDIR}${LIBDIR}/liby.a

View File

@ -10,6 +10,7 @@ SUBDIR+= acpi
SUBDIR+= apm
.endif
SUBDIR+= asf
SUBDIR+= bhyve
SUBDIR+= boot0cfg
.if ${MK_TOOLCHAIN} != "no"
SUBDIR+= btxld
@ -30,4 +31,5 @@ SUBDIR+= spkrtest
.if ${MK_SYSINSTALL} != "no"
SUBDIR+= sade
.endif
SUBDIR+= vmmctl
SUBDIR+= zzz

20
usr.sbin/bhyve/Makefile Normal file
View File

@ -0,0 +1,20 @@
#
# $FreeBSD$
#
PROG= bhyve
SRCS= atpic.c consport.c dbgport.c elcr.c fbsdrun.c inout.c mevent.c
SRCS+= pci_emul.c pci_hostbridge.c pci_passthru.c pci_virtio_block.c
SRCS+= pci_virtio_net.c pit_8254.c post.c rtc.c uart.c xmsr.c
NO_MAN=
DPADD= ${LIBVMMAPI} ${LIBMD} ${LIBPTHREAD}
LDADD= -lvmmapi -lmd -lpthread
WARNS?= 2
CFLAGS+= -I${.CURDIR}/../../sys
.include <bsd.prog.mk>

68
usr.sbin/bhyve/atpic.c Normal file
View File

@ -0,0 +1,68 @@
/*-
* Copyright (c) 2011 NetApp, Inc.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* $FreeBSD$
*/
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
#include <sys/param.h>
#include <stdio.h>
#include <string.h>
#include <assert.h>
#include "inout.h"
/*
* FreeBSD only writes to the 8259 interrupt controllers to put them in a
* shutdown state.
*
* So, we just ignore the writes.
*/
#define IO_ICU1 0x20
#define IO_ICU2 0xA0
#define ICU_IMR_OFFSET 1
static int
atpic_handler(struct vmctx *ctx, int vcpu, int in, int port, int bytes,
uint32_t *eax, void *arg)
{
if (bytes != 1)
return (-1);
if (in)
return (-1);
/* Pretend all writes to the 8259 are alright */
return (0);
}
INOUT_PORT(atpic, IO_ICU1, IOPORT_F_INOUT, atpic_handler);
INOUT_PORT(atpic, IO_ICU1 + ICU_IMR_OFFSET, IOPORT_F_INOUT, atpic_handler);
INOUT_PORT(atpic, IO_ICU2, IOPORT_F_INOUT, atpic_handler);
INOUT_PORT(atpic, IO_ICU2 + ICU_IMR_OFFSET, IOPORT_F_INOUT, atpic_handler);

121
usr.sbin/bhyve/consport.c Normal file
View File

@ -0,0 +1,121 @@
/*-
* Copyright (c) 2011 NetApp, Inc.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* $FreeBSD$
*/
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
#include <sys/types.h>
#include <sys/select.h>
#include <stdio.h>
#include <stdlib.h>
#include <termios.h>
#include <unistd.h>
#include <stdbool.h>
#include "inout.h"
#define BVM_CONSOLE_PORT 0x220
static struct termios tio_orig, tio_new;
static void
ttyclose(void)
{
tcsetattr(STDIN_FILENO, TCSANOW, &tio_orig);
}
static void
ttyopen(void)
{
tcgetattr(STDIN_FILENO, &tio_orig);
cfmakeraw(&tio_new);
tcsetattr(STDIN_FILENO, TCSANOW, &tio_new);
atexit(ttyclose);
}
static bool
tty_char_available(void)
{
fd_set rfds;
struct timeval tv;
FD_ZERO(&rfds);
FD_SET(STDIN_FILENO, &rfds);
tv.tv_sec = 0;
tv.tv_usec = 0;
if (select(STDIN_FILENO + 1, &rfds, NULL, NULL, &tv) > 0) {
return (true);
} else {
return (false);
}
}
static int
ttyread(void)
{
char rb;
if (tty_char_available()) {
read(STDIN_FILENO, &rb, 1);
return (rb & 0xff);
} else {
return (-1);
}
}
static void
ttywrite(unsigned char wb)
{
(void) write(STDOUT_FILENO, &wb, 1);
}
static int
console_handler(struct vmctx *ctx, int vcpu, int in, int port, int bytes,
uint32_t *eax, void *arg)
{
static int opened;
if (bytes != 4)
return (-1);
if (!opened) {
ttyopen();
opened = 1;
}
if (in)
*eax = ttyread();
else
ttywrite(*eax);
return (0);
}
INOUT_PORT(console, BVM_CONSOLE_PORT, IOPORT_F_INOUT, console_handler);

125
usr.sbin/bhyve/dbgport.c Normal file
View File

@ -0,0 +1,125 @@
/*-
* Copyright (c) 2011 NetApp, Inc.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* $FreeBSD$
*/
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
#include <sys/types.h>
#include <sys/socket.h>
#include <netinet/in.h>
#include <sys/uio.h>
#include <stdio.h>
#include <stdlib.h>
#include <fcntl.h>
#include <unistd.h>
#include <errno.h>
#include "inout.h"
#include "dbgport.h"
#define BVM_DBG_PORT 0x224
static int listen_fd, conn_fd;
static struct sockaddr_in sin;
void
init_dbgport(int sport)
{
conn_fd = -1;
if ((listen_fd = socket(AF_INET, SOCK_STREAM, 0)) < 0) {
perror("socket");
exit(1);
}
sin.sin_len = sizeof(sin);
sin.sin_family = AF_INET;
sin.sin_addr.s_addr = htonl(INADDR_ANY);
sin.sin_port = htons(sport);
if (bind(listen_fd, (struct sockaddr *)&sin, sizeof(sin)) < 0) {
perror("bind");
exit(1);
}
if (listen(listen_fd, 1) < 0) {
perror("listen");
exit(1);
}
}
static int
dbg_handler(struct vmctx *ctx, int vcpu, int in, int port, int bytes,
uint32_t *eax, void *arg)
{
char ch;
int nwritten, nread, printonce;
if (bytes != 4)
return (-1);
again:
printonce = 0;
while (conn_fd < 0) {
if (!printonce) {
printf("Waiting for connection from gdb\r\n");
printonce = 1;
}
conn_fd = accept(listen_fd, NULL, NULL);
if (conn_fd >= 0)
fcntl(conn_fd, F_SETFL, O_NONBLOCK);
else if (errno != EINTR)
perror("accept");
}
if (in) {
nread = read(conn_fd, &ch, 1);
if (nread == -1 && errno == EAGAIN)
*eax = -1;
else if (nread == 1)
*eax = ch;
else {
close(conn_fd);
conn_fd = -1;
goto again;
}
} else {
ch = *eax;
nwritten = write(conn_fd, &ch, 1);
if (nwritten != 1) {
close(conn_fd);
conn_fd = -1;
goto again;
}
}
return (0);
}
INOUT_PORT(dbg, BVM_DBG_PORT, IOPORT_F_INOUT, dbg_handler);

36
usr.sbin/bhyve/dbgport.h Normal file
View File

@ -0,0 +1,36 @@
/*-
* Copyright (c) 2011 NetApp, Inc.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* $FreeBSD$
*/
#ifndef _DBGPORT_H_
#define _DBGPORT_H_
#define DEFAULT_GDB_PORT 6466
void init_dbgport(int port);
#endif

65
usr.sbin/bhyve/elcr.c Normal file
View File

@ -0,0 +1,65 @@
/*-
* Copyright (c) 2011 NetApp, Inc.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* $FreeBSD$
*/
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
#include <sys/types.h>
#include "inout.h"
/*
* EISA interrupt Level Control Register.
*
* This is a 16-bit register with one bit for each of the IRQ0 through IRQ15.
* A level triggered irq is indicated by setting the corresponding bit to '1'.
*/
#define ELCR_PORT 0x4d0
static uint8_t elcr[2] = { 0x00, 0x00 };
static int
elcr_handler(struct vmctx *ctx, int vcpu, int in, int port, int bytes,
uint32_t *eax, void *arg)
{
int idx;
if (bytes != 1)
return (-1);
idx = port - ELCR_PORT;
if (in)
*eax = elcr[idx];
else
elcr[idx] = *eax;
return (0);
}
INOUT_PORT(elcr, ELCR_PORT + 0, IOPORT_F_INOUT, elcr_handler);
INOUT_PORT(elcr, ELCR_PORT + 1, IOPORT_F_INOUT, elcr_handler);

650
usr.sbin/bhyve/fbsdrun.c Normal file
View File

@ -0,0 +1,650 @@
/*-
* Copyright (c) 2011 NetApp, Inc.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* $FreeBSD$
*/
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
#include <sys/types.h>
#include <sys/mman.h>
#include <sys/time.h>
#include <machine/segments.h>
#include <stdio.h>
#include <stdlib.h>
#include <libgen.h>
#include <unistd.h>
#include <assert.h>
#include <errno.h>
#include <signal.h>
#include <pthread.h>
#include <machine/vmm.h>
#include <vmmapi.h>
#include "fbsdrun.h"
#include "inout.h"
#include "dbgport.h"
#include "mevent.h"
#include "pci_emul.h"
#include "xmsr.h"
#define DEFAULT_GUEST_HZ 100
#define DEFAULT_GUEST_TSLICE 200
#define GUEST_NIO_PORT 0x488 /* guest upcalls via i/o port */
#define VMEXIT_SWITCH 0 /* force vcpu switch in mux mode */
#define VMEXIT_CONTINUE 1 /* continue from next instruction */
#define VMEXIT_RESTART 2 /* restart current instruction */
#define VMEXIT_ABORT 3 /* abort the vm run loop */
#define VMEXIT_RESET 4 /* guest machine has reset */
#define MB (1024UL * 1024)
#define GB (1024UL * MB)
typedef int (*vmexit_handler_t)(struct vmctx *, struct vm_exit *, int *vcpu);
int guest_tslice = DEFAULT_GUEST_TSLICE;
int guest_hz = DEFAULT_GUEST_HZ;
char *vmname;
u_long lomem_sz;
u_long himem_sz;
int guest_ncpus;
static int pincpu = -1;
static int guest_vcpu_mux;
static int guest_vmexit_on_hlt, guest_vmexit_on_pause;
static int foundcpus;
static char *lomem_addr;
static char *himem_addr;
static char *progname;
static const int BSP = 0;
static int cpumask;
static void *oem_tbl_start;
static int oem_tbl_size;
static void vm_loop(struct vmctx *ctx, int vcpu, uint64_t rip);
struct vm_exit vmexit[VM_MAXCPU];
struct fbsdstats {
uint64_t vmexit_bogus;
uint64_t vmexit_bogus_switch;
uint64_t vmexit_hlt;
uint64_t vmexit_pause;
uint64_t vmexit_mtrap;
uint64_t cpu_switch_rotate;
uint64_t cpu_switch_direct;
int io_reset;
} stats;
struct mt_vmm_info {
pthread_t mt_thr;
struct vmctx *mt_ctx;
int mt_vcpu;
} mt_vmm_info[VM_MAXCPU];
static void
usage(int code)
{
fprintf(stderr,
"Usage: %s [-hBHP][-g <gdb port>][-z <hz>][-s <pci>][-p pincpu]"
"[-n <pci>][-m lowmem][-M highmem] <vm>\n"
" -g: gdb port (default is %d and 0 means don't open)\n"
" -c: # cpus (default 1)\n"
" -p: pin vcpu 'n' to host cpu 'pincpu + n'\n"
" -B: inject breakpoint exception on vm entry\n"
" -H: vmexit from the guest on hlt\n"
" -P: vmexit from the guest on pause\n"
" -h: help\n"
" -z: guest hz (default is %d)\n"
" -s: <slot,driver,configinfo> PCI slot config\n"
" -n: <slot,name> PCI slot naming\n"
" -m: lowmem in MB\n"
" -M: highmem in MB\n"
" -x: mux vcpus to 1 hcpu\n"
" -t: mux vcpu timeslice hz (default %d)\n",
progname, DEFAULT_GDB_PORT, DEFAULT_GUEST_HZ,
DEFAULT_GUEST_TSLICE);
exit(code);
}
void *
paddr_guest2host(uintptr_t gaddr)
{
if (lomem_sz == 0)
return (NULL);
if (gaddr < lomem_sz) {
return ((void *)(lomem_addr + gaddr));
} else if (gaddr >= 4*GB && gaddr < (4*GB + himem_sz)) {
return ((void *)(himem_addr + gaddr - 4*GB));
} else
return (NULL);
}
void
fbsdrun_add_oemtbl(void *tbl, int tblsz)
{
oem_tbl_start = tbl;
oem_tbl_size = tblsz;
}
int
fbsdrun_vmexit_on_pause(void)
{
return (guest_vmexit_on_pause);
}
int
fbsdrun_vmexit_on_hlt(void)
{
return (guest_vmexit_on_hlt);
}
int
fbsdrun_muxed(void)
{
return (guest_vcpu_mux);
}
static void *
fbsdrun_start_thread(void *param)
{
int vcpu;
struct mt_vmm_info *mtp = param;
vcpu = mtp->mt_vcpu;
vm_loop(mtp->mt_ctx, vcpu, vmexit[vcpu].rip);
/* not reached */
exit(1);
return (NULL);
}
void
fbsdrun_addcpu(struct vmctx *ctx, int vcpu, uint64_t rip)
{
int error;
if (cpumask & (1 << vcpu)) {
printf("addcpu: attempting to add existing cpu %d\n", vcpu);
exit(1);
}
cpumask |= 1 << vcpu;
foundcpus++;
/*
* Set up the vmexit struct to allow execution to start
* at the given RIP
*/
vmexit[vcpu].rip = rip;
vmexit[vcpu].inst_length = 0;
if (vcpu == BSP || !guest_vcpu_mux){
mt_vmm_info[vcpu].mt_ctx = ctx;
mt_vmm_info[vcpu].mt_vcpu = vcpu;
error = pthread_create(&mt_vmm_info[vcpu].mt_thr, NULL,
fbsdrun_start_thread, &mt_vmm_info[vcpu]);
assert(error == 0);
}
}
static int
fbsdrun_get_next_cpu(int curcpu)
{
/*
* Get the next available CPU. Assumes they arrive
* in ascending order with no gaps.
*/
return ((curcpu + 1) % foundcpus);
}
static int
vmexit_catch_reset(void)
{
stats.io_reset++;
return (VMEXIT_RESET);
}
static int
vmexit_catch_inout(void)
{
return (VMEXIT_ABORT);
}
static int
vmexit_handle_notify(struct vmctx *ctx, struct vm_exit *vme, int *pvcpu,
uint32_t eax)
{
#if PG_DEBUG /* put all types of debug here */
if (eax == 0) {
pause_noswitch = 1;
} else if (eax == 1) {
pause_noswitch = 0;
} else {
pause_noswitch = 0;
if (eax == 5) {
vm_set_capability(ctx, *pvcpu, VM_CAP_MTRAP_EXIT, 1);
}
}
#endif
return (VMEXIT_CONTINUE);
}
static int
vmexit_inout(struct vmctx *ctx, struct vm_exit *vme, int *pvcpu)
{
int error;
int bytes, port, in, out;
uint32_t eax;
int vcpu;
vcpu = *pvcpu;
port = vme->u.inout.port;
bytes = vme->u.inout.bytes;
eax = vme->u.inout.eax;
in = vme->u.inout.in;
out = !in;
/* We don't deal with these */
if (vme->u.inout.string || vme->u.inout.rep)
return (VMEXIT_ABORT);
/* Special case of guest reset */
if (out && port == 0x64 && (uint8_t)eax == 0xFE)
return (vmexit_catch_reset());
/* Extra-special case of host notifications */
if (out && port == GUEST_NIO_PORT)
return (vmexit_handle_notify(ctx, vme, pvcpu, eax));
error = emulate_inout(ctx, vcpu, in, port, bytes, &eax);
if (error == 0 && in)
error = vm_set_register(ctx, vcpu, VM_REG_GUEST_RAX, eax);
if (error == 0)
return (VMEXIT_CONTINUE);
else {
fprintf(stderr, "Unhandled %s%c 0x%04x\n",
in ? "in" : "out",
bytes == 1 ? 'b' : (bytes == 2 ? 'w' : 'l'), port);
return (vmexit_catch_inout());
}
}
static int
vmexit_rdmsr(struct vmctx *ctx, struct vm_exit *vme, int *pvcpu)
{
printf("vm exit rdmsr 0x%x, cpu %d\n", vme->u.msr.code, *pvcpu);
return (VMEXIT_ABORT);
}
static int
vmexit_wrmsr(struct vmctx *ctx, struct vm_exit *vme, int *pvcpu)
{
int newcpu;
int retval = VMEXIT_CONTINUE;
newcpu = emulate_wrmsr(ctx, *pvcpu, vme->u.msr.code,vme->u.msr.wval);
if (guest_vcpu_mux && *pvcpu != newcpu) {
retval = VMEXIT_SWITCH;
*pvcpu = newcpu;
}
return (retval);
}
static int
vmexit_vmx(struct vmctx *ctx, struct vm_exit *vmexit, int *pvcpu)
{
printf("vm exit[%d]\n", *pvcpu);
printf("\treason\t\tVMX\n");
printf("\trip\t\t0x%016lx\n", vmexit->rip);
printf("\tinst_length\t%d\n", vmexit->inst_length);
printf("\terror\t\t%d\n", vmexit->u.vmx.error);
printf("\texit_reason\t%u\n", vmexit->u.vmx.exit_reason);
printf("\tqualification\t0x%016lx\n", vmexit->u.vmx.exit_qualification);
return (VMEXIT_ABORT);
}
static int bogus_noswitch = 1;
static int
vmexit_bogus(struct vmctx *ctx, struct vm_exit *vmexit, int *pvcpu)
{
stats.vmexit_bogus++;
if (!guest_vcpu_mux || guest_ncpus == 1 || bogus_noswitch) {
return (VMEXIT_RESTART);
} else {
stats.vmexit_bogus_switch++;
vmexit->inst_length = 0;
*pvcpu = -1;
return (VMEXIT_SWITCH);
}
}
static int
vmexit_hlt(struct vmctx *ctx, struct vm_exit *vmexit, int *pvcpu)
{
stats.vmexit_hlt++;
if (fbsdrun_muxed()) {
*pvcpu = -1;
return (VMEXIT_SWITCH);
} else {
/*
* Just continue execution with the next instruction. We use
* the HLT VM exit as a way to be friendly with the host
* scheduler.
*/
return (VMEXIT_CONTINUE);
}
}
static int pause_noswitch;
static int
vmexit_pause(struct vmctx *ctx, struct vm_exit *vmexit, int *pvcpu)
{
stats.vmexit_pause++;
if (fbsdrun_muxed() && !pause_noswitch) {
*pvcpu = -1;
return (VMEXIT_SWITCH);
} else {
return (VMEXIT_CONTINUE);
}
}
static int
vmexit_mtrap(struct vmctx *ctx, struct vm_exit *vmexit, int *pvcpu)
{
stats.vmexit_mtrap++;
return (VMEXIT_RESTART);
}
static void
sigalrm(int sig)
{
return;
}
static void
setup_timeslice(void)
{
struct sigaction sa;
struct itimerval itv;
int error;
/*
* Setup a realtime timer to generate a SIGALRM at a
* frequency of 'guest_tslice' ticks per second.
*/
sigemptyset(&sa.sa_mask);
sa.sa_flags = 0;
sa.sa_handler = sigalrm;
error = sigaction(SIGALRM, &sa, NULL);
assert(error == 0);
itv.it_interval.tv_sec = 0;
itv.it_interval.tv_usec = 1000000 / guest_tslice;
itv.it_value.tv_sec = 0;
itv.it_value.tv_usec = 1000000 / guest_tslice;
error = setitimer(ITIMER_REAL, &itv, NULL);
assert(error == 0);
}
static vmexit_handler_t handler[VM_EXITCODE_MAX] = {
[VM_EXITCODE_INOUT] = vmexit_inout,
[VM_EXITCODE_VMX] = vmexit_vmx,
[VM_EXITCODE_BOGUS] = vmexit_bogus,
[VM_EXITCODE_RDMSR] = vmexit_rdmsr,
[VM_EXITCODE_WRMSR] = vmexit_wrmsr,
[VM_EXITCODE_MTRAP] = vmexit_mtrap,
};
static void
vm_loop(struct vmctx *ctx, int vcpu, uint64_t rip)
{
int error, rc, prevcpu;
if (guest_vcpu_mux)
setup_timeslice();
if (pincpu >= 0) {
error = vm_set_pinning(ctx, vcpu, pincpu + vcpu);
assert(error == 0);
}
while (1) {
error = vm_run(ctx, vcpu, rip, &vmexit[vcpu]);
if (error != 0)
break;
prevcpu = vcpu;
rc = (*handler[vmexit[vcpu].exitcode])(ctx, &vmexit[vcpu],
&vcpu);
switch (rc) {
case VMEXIT_SWITCH:
assert(guest_vcpu_mux);
if (vcpu == -1) {
stats.cpu_switch_rotate++;
vcpu = fbsdrun_get_next_cpu(prevcpu);
} else {
stats.cpu_switch_direct++;
}
/* fall through */
case VMEXIT_CONTINUE:
rip = vmexit[vcpu].rip + vmexit[vcpu].inst_length;
break;
case VMEXIT_RESTART:
rip = vmexit[vcpu].rip;
break;
case VMEXIT_RESET:
exit(0);
default:
exit(1);
}
}
fprintf(stderr, "vm_run error %d, errno %d\n", error, errno);
}
int
main(int argc, char *argv[])
{
int c, error, gdb_port, inject_bkpt, tmp, err;
struct vmctx *ctx;
uint64_t rip;
inject_bkpt = 0;
progname = basename(argv[0]);
gdb_port = DEFAULT_GDB_PORT;
guest_ncpus = 1;
while ((c = getopt(argc, argv, "hBHPxp:g:c:z:s:n:m:M:")) != -1) {
switch (c) {
case 'B':
inject_bkpt = 1;
break;
case 'x':
guest_vcpu_mux = 1;
break;
case 'p':
pincpu = atoi(optarg);
break;
case 'c':
guest_ncpus = atoi(optarg);
break;
case 'g':
gdb_port = atoi(optarg);
break;
case 'z':
guest_hz = atoi(optarg);
break;
case 't':
guest_tslice = atoi(optarg);
break;
case 's':
pci_parse_slot(optarg);
break;
case 'n':
pci_parse_name(optarg);
break;
case 'm':
lomem_sz = strtoul(optarg, NULL, 0) * MB;
break;
case 'M':
himem_sz = strtoul(optarg, NULL, 0) * MB;
break;
case 'H':
guest_vmexit_on_hlt = 1;
break;
case 'P':
guest_vmexit_on_pause = 1;
break;
case 'h':
usage(0);
default:
usage(1);
}
}
argc -= optind;
argv += optind;
if (argc != 1)
usage(1);
/* No need to mux if guest is uni-processor */
if (guest_ncpus <= 1)
guest_vcpu_mux = 0;
/* vmexit on hlt if guest is muxed */
if (guest_vcpu_mux) {
guest_vmexit_on_hlt = 1;
guest_vmexit_on_pause = 1;
}
vmname = argv[0];
ctx = vm_open(vmname);
if (ctx == NULL) {
perror("vm_open");
exit(1);
}
if (fbsdrun_vmexit_on_hlt()) {
err = vm_get_capability(ctx, BSP, VM_CAP_HALT_EXIT, &tmp);
if (err < 0) {
printf("VM exit on HLT not supported\n");
exit(1);
}
vm_set_capability(ctx, BSP, VM_CAP_HALT_EXIT, 1);
handler[VM_EXITCODE_HLT] = vmexit_hlt;
}
if (fbsdrun_vmexit_on_pause()) {
/*
* pause exit support required for this mode
*/
err = vm_get_capability(ctx, BSP, VM_CAP_PAUSE_EXIT, &tmp);
if (err < 0) {
printf("SMP mux requested, no pause support\n");
exit(1);
}
vm_set_capability(ctx, BSP, VM_CAP_PAUSE_EXIT, 1);
handler[VM_EXITCODE_PAUSE] = vmexit_pause;
}
if (lomem_sz != 0) {
lomem_addr = vm_map_memory(ctx, 0, lomem_sz);
if (lomem_addr == (char *) MAP_FAILED) {
lomem_sz = 0;
} else if (himem_sz != 0) {
himem_addr = vm_map_memory(ctx, 4*GB, himem_sz);
if (himem_addr == (char *) MAP_FAILED) {
lomem_sz = 0;
himem_sz = 0;
}
}
}
init_inout();
init_pci(ctx);
if (gdb_port != 0)
init_dbgport(gdb_port);
error = vm_get_register(ctx, BSP, VM_REG_GUEST_RIP, &rip);
assert(error == 0);
if (inject_bkpt) {
error = vm_inject_event(ctx, BSP, VM_HW_EXCEPTION, IDT_BP);
assert(error == 0);
}
/*
* build the guest tables, MP etc.
*/
vm_build_tables(ctx, guest_ncpus, oem_tbl_start, oem_tbl_size);
/*
* Add CPU 0
*/
fbsdrun_addcpu(ctx, BSP, rip);
/*
* Head off to the main event dispatch loop
*/
mevent_dispatch();
exit(1);
}

53
usr.sbin/bhyve/fbsdrun.h Normal file
View File

@ -0,0 +1,53 @@
/*-
* Copyright (c) 2011 NetApp, Inc.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* $FreeBSD$
*/
#ifndef _FBSDRUN_H_
#define _FBSDRUN_H_
#ifndef CTASSERT /* Allow lint to override */
#define CTASSERT(x) _CTASSERT(x, __LINE__)
#define _CTASSERT(x, y) __CTASSERT(x, y)
#define __CTASSERT(x, y) typedef char __assert ## y[(x) ? 1 : -1]
#endif
struct vmctx;
extern int guest_hz;
extern int guest_tslice;
extern int guest_ncpus;
extern char *vmname;
extern u_long lomem_sz, himem_sz;
void *paddr_guest2host(uintptr_t);
void fbsdrun_addcpu(struct vmctx *ctx, int cpu, uint64_t rip);
void fbsdrun_add_oemtbl(void *tbl, int tblsz);
int fbsdrun_muxed(void);
int fbsdrun_vmexit_on_hlt(void);
int fbsdrun_vmexit_on_pause(void);
#endif

98
usr.sbin/bhyve/inout.c Normal file
View File

@ -0,0 +1,98 @@
/*-
* Copyright (c) 2011 NetApp, Inc.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* $FreeBSD$
*/
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
#include <sys/param.h>
#include <sys/linker_set.h>
#include <stdio.h>
#include <assert.h>
#include "inout.h"
SET_DECLARE(inout_port_set, struct inout_port);
#define MAX_IOPORTS (1 << 16)
static struct {
const char *name;
int flags;
inout_func_t handler;
void *arg;
} inout_handlers[MAX_IOPORTS];
int
emulate_inout(struct vmctx *ctx, int vcpu, int in, int port, int bytes,
uint32_t *eax)
{
int flags;
inout_func_t handler;
void *arg;
assert(port < MAX_IOPORTS);
if ((handler = inout_handlers[port].handler) == NULL)
return (-1);
flags = inout_handlers[port].flags;
arg = inout_handlers[port].arg;
if ((in && (flags & IOPORT_F_IN)) || (!in && (flags & IOPORT_F_OUT)))
return ((*handler)(ctx, vcpu, in, port, bytes, eax, arg));
else
return (-1);
}
void
init_inout(void)
{
struct inout_port **iopp, *iop;
SET_FOREACH(iopp, inout_port_set) {
iop = *iopp;
assert(iop->port < MAX_IOPORTS);
inout_handlers[iop->port].name = iop->name;
inout_handlers[iop->port].flags = iop->flags;
inout_handlers[iop->port].handler = iop->handler;
inout_handlers[iop->port].arg = NULL;
}
}
int
register_inout(struct inout_port *iop)
{
assert(iop->port < MAX_IOPORTS);
inout_handlers[iop->port].name = iop->name;
inout_handlers[iop->port].flags = iop->flags;
inout_handlers[iop->port].handler = iop->handler;
inout_handlers[iop->port].arg = iop->arg;
return (0);
}

65
usr.sbin/bhyve/inout.h Normal file
View File

@ -0,0 +1,65 @@
/*-
* Copyright (c) 2011 NetApp, Inc.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* $FreeBSD$
*/
#ifndef _INOUT_H_
#define _INOUT_H_
#include <sys/linker_set.h>
struct vmctx;
typedef int (*inout_func_t)(struct vmctx *ctx, int vcpu, int in, int port,
int bytes, uint32_t *eax, void *arg);
struct inout_port {
const char *name;
int port;
int flags;
inout_func_t handler;
void *arg;
};
#define IOPORT_F_IN 0x1
#define IOPORT_F_OUT 0x2
#define IOPORT_F_INOUT 0x3
#define INOUT_PORT(name, port, flags, handler) \
static struct inout_port __CONCAT(__inout_port, __LINE__) = { \
#name, \
(port), \
(flags), \
(handler), \
0 \
}; \
DATA_SET(inout_port_set, __CONCAT(__inout_port, __LINE__))
void init_inout(void);
int emulate_inout(struct vmctx *, int vcpu, int in, int port, int bytes,
uint32_t *eax);
int register_inout(struct inout_port *iop);
#endif /* _INOUT_H_ */

419
usr.sbin/bhyve/mevent.c Normal file
View File

@ -0,0 +1,419 @@
/*-
* Copyright (c) 2011 NetApp, Inc.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* $FreeBSD$
*/
/*
* Micro event library for FreeBSD, designed for a single i/o thread
* using kqueue, and having events be persistent by default.
*/
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
#include <assert.h>
#include <errno.h>
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include <sys/types.h>
#include <sys/event.h>
#include <sys/time.h>
#include <pthread.h>
#include "mevent.h"
#define MEVENT_MAX 64
#define MEV_ENABLE 1
#define MEV_DISABLE 2
#define MEV_DEL_PENDING 3
static pthread_t mevent_tid;
static int mevent_pipefd[2];
static pthread_mutex_t mevent_lmutex = PTHREAD_MUTEX_INITIALIZER;
struct mevent {
void (*me_func)(int, enum ev_type, void *);
int me_fd;
enum ev_type me_type;
void *me_param;
int me_cq;
int me_state;
int me_closefd;
LIST_ENTRY(mevent) me_list;
};
static LIST_HEAD(listhead, mevent) global_head, change_head;
static void
mevent_qlock(void)
{
pthread_mutex_lock(&mevent_lmutex);
}
static void
mevent_qunlock(void)
{
pthread_mutex_unlock(&mevent_lmutex);
}
static void
mevent_pipe_read(int fd, enum ev_type type, void *param)
{
char buf[MEVENT_MAX];
int status;
/*
* Drain the pipe read side. The fd is non-blocking so this is
* safe to do.
*/
do {
status = read(fd, buf, sizeof(buf));
} while (status == MEVENT_MAX);
}
static void
mevent_notify(void)
{
char c;
/*
* If calling from outside the i/o thread, write a byte on the
* pipe to force the i/o thread to exit the blocking kevent call.
*/
if (mevent_pipefd[1] != 0 && pthread_self() != mevent_tid) {
write(mevent_pipefd[1], &c, 1);
}
}
static int
mevent_kq_filter(struct mevent *mevp)
{
int retval;
retval = 0;
if (mevp->me_type == EVF_READ)
retval = EVFILT_READ;
if (mevp->me_type == EVF_WRITE)
retval = EVFILT_WRITE;
return (retval);
}
static int
mevent_kq_flags(struct mevent *mevp)
{
int ret;
switch (mevp->me_state) {
case MEV_ENABLE:
ret = EV_ADD;
break;
case MEV_DISABLE:
ret = EV_DISABLE;
break;
case MEV_DEL_PENDING:
ret = EV_DELETE;
break;
}
return (ret);
}
static int
mevent_kq_fflags(struct mevent *mevp)
{
/* XXX nothing yet, perhaps EV_EOF for reads ? */
return (0);
}
static int
mevent_build(int mfd, struct kevent *kev)
{
struct mevent *mevp, *tmpp;
int i;
i = 0;
mevent_qlock();
LIST_FOREACH_SAFE(mevp, &change_head, me_list, tmpp) {
if (mevp->me_closefd) {
/*
* A close of the file descriptor will remove the
* event
*/
close(mevp->me_fd);
} else {
kev[i].ident = mevp->me_fd;
kev[i].filter = mevent_kq_filter(mevp);
kev[i].flags = mevent_kq_flags(mevp);
kev[i].fflags = mevent_kq_fflags(mevp);
kev[i].data = 0;
kev[i].udata = mevp;
i++;
}
mevp->me_cq = 0;
LIST_REMOVE(mevp, me_list);
if (mevp->me_state == MEV_DEL_PENDING) {
free(mevp);
} else {
LIST_INSERT_HEAD(&global_head, mevp, me_list);
}
assert(i < MEVENT_MAX);
}
mevent_qunlock();
return (i);
}
static void
mevent_handle(struct kevent *kev, int numev)
{
struct mevent *mevp;
int i;
for (i = 0; i < numev; i++) {
mevp = kev[i].udata;
/* XXX check for EV_ERROR ? */
(*mevp->me_func)(mevp->me_fd, mevp->me_type, mevp->me_param);
}
}
struct mevent *
mevent_add(int fd, enum ev_type type,
void (*func)(int, enum ev_type, void *), void *param)
{
struct mevent *lp, *mevp;
if (fd < 0 || func == NULL) {
return (NULL);
}
mevp = NULL;
mevent_qlock();
/*
* Verify that the fd/type tuple is not present in any list
*/
LIST_FOREACH(lp, &global_head, me_list) {
if (lp->me_fd == fd && lp->me_type == type) {
goto exit;
}
}
LIST_FOREACH(lp, &change_head, me_list) {
if (lp->me_fd == fd && lp->me_type == type) {
goto exit;
}
}
/*
* Allocate an entry, populate it, and add it to the change list.
*/
mevp = malloc(sizeof(struct mevent));
if (mevp == NULL) {
goto exit;
}
memset(mevp, 0, sizeof(struct mevent));
mevp->me_fd = fd;
mevp->me_type = type;
mevp->me_func = func;
mevp->me_param = param;
LIST_INSERT_HEAD(&change_head, mevp, me_list);
mevp->me_cq = 1;
mevp->me_state = MEV_ENABLE;
mevent_notify();
exit:
mevent_qunlock();
return (mevp);
}
static int
mevent_update(struct mevent *evp, int newstate)
{
/*
* It's not possible to enable/disable a deleted event
*/
if (evp->me_state == MEV_DEL_PENDING)
return (EINVAL);
/*
* No update needed if state isn't changing
*/
if (evp->me_state == newstate)
return (0);
mevent_qlock();
evp->me_state = newstate;
/*
* Place the entry onto the changed list if not already there.
*/
if (evp->me_cq == 0) {
evp->me_cq = 1;
LIST_REMOVE(evp, me_list);
LIST_INSERT_HEAD(&change_head, evp, me_list);
mevent_notify();
}
mevent_qunlock();
return (0);
}
int
mevent_enable(struct mevent *evp)
{
return (mevent_update(evp, MEV_ENABLE));
}
int
mevent_disable(struct mevent *evp)
{
return (mevent_update(evp, MEV_DISABLE));
}
static int
mevent_delete_event(struct mevent *evp, int closefd)
{
mevent_qlock();
/*
* Place the entry onto the changed list if not already there, and
* mark as to be deleted.
*/
if (evp->me_cq == 0) {
evp->me_cq = 1;
LIST_REMOVE(evp, me_list);
LIST_INSERT_HEAD(&change_head, evp, me_list);
mevent_notify();
}
evp->me_state = MEV_DEL_PENDING;
if (closefd)
evp->me_closefd = 1;
mevent_qunlock();
return (0);
}
int
mevent_delete(struct mevent *evp)
{
return (mevent_delete_event(evp, 0));
}
int
mevent_delete_close(struct mevent *evp)
{
return (mevent_delete_event(evp, 1));
}
void
mevent_dispatch(void)
{
struct kevent changelist[MEVENT_MAX];
struct kevent eventlist[MEVENT_MAX];
struct mevent *pipev;
int mfd;
int numev;
int ret;
mevent_tid = pthread_self();
mfd = kqueue();
assert(mfd > 0);
/*
* Open the pipe that will be used for other threads to force
* the blocking kqueue call to exit by writing to it. Set the
* descriptor to non-blocking.
*/
ret = pipe(mevent_pipefd);
if (ret < 0) {
perror("pipe");
exit(0);
}
/*
* Add internal event handler for the pipe write fd
*/
pipev = mevent_add(mevent_pipefd[0], EVF_READ, mevent_pipe_read, NULL);
assert(pipev != NULL);
for (;;) {
/*
* Build changelist if required.
* XXX the changelist can be put into the blocking call
* to eliminate the extra syscall. Currently better for
* debug.
*/
numev = mevent_build(mfd, changelist);
if (numev) {
ret = kevent(mfd, changelist, numev, NULL, 0, NULL);
if (ret == -1) {
perror("Error return from kevent change");
}
}
/*
* Block awaiting events
*/
ret = kevent(mfd, NULL, 0, eventlist, MEVENT_MAX, NULL);
if (ret == -1) {
perror("Error return from kevent monitor");
}
/*
* Handle reported events
*/
mevent_handle(eventlist, ret);
}
}

49
usr.sbin/bhyve/mevent.h Normal file
View File

@ -0,0 +1,49 @@
/*-
* Copyright (c) 2011 NetApp, Inc.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* $FreeBSD$
*/
#ifndef _MEVENT_H_
#define _MEVENT_H_
enum ev_type {
EVF_READ,
EVF_WRITE
};
struct mevent;
struct mevent *mevent_add(int fd, enum ev_type type,
void (*func)(int, enum ev_type, void *),
void *param);
int mevent_enable(struct mevent *evp);
int mevent_disable(struct mevent *evp);
int mevent_delete(struct mevent *evp);
int mevent_delete_close(struct mevent *evp);
void mevent_dispatch(void);
#endif /* _MEVENT_H_ */

View File

@ -0,0 +1,180 @@
/*-
* Copyright (c) 2011 NetApp, Inc.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* $FreeBSD$
*/
/*
* Test program for the micro event library. Set up a simple TCP echo
* service.
*
* cc mevent_test.c mevent.c -lpthread
*/
#include <sys/types.h>
#include <sys/socket.h>
#include <netinet/in.h>
#include <stdio.h>
#include <stdlib.h>
#include <pthread.h>
#include "mevent.h"
#define TEST_PORT 4321
static pthread_mutex_t accept_mutex = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t accept_condvar = PTHREAD_COND_INITIALIZER;
#define MEVENT_ECHO
#ifdef MEVENT_ECHO
struct esync {
pthread_mutex_t e_mt;
pthread_cond_t e_cond;
};
static void
echoer_callback(int fd, enum ev_type type, void *param)
{
struct esync *sync = param;
pthread_mutex_lock(&sync->e_mt);
pthread_cond_signal(&sync->e_cond);
pthread_mutex_unlock(&sync->e_mt);
}
static void *
echoer(void *param)
{
struct esync sync;
struct mevent *mev;
char buf[128];
int fd = (int)(uintptr_t) param;
int len;
pthread_mutex_init(&sync.e_mt, NULL);
pthread_cond_init(&sync.e_cond, NULL);
pthread_mutex_lock(&sync.e_mt);
mev = mevent_add(fd, EVF_READ, echoer_callback, &sync);
if (mev == NULL) {
printf("Could not allocate echoer event\n");
exit(1);
}
while (!pthread_cond_wait(&sync.e_cond, &sync.e_mt)) {
len = read(fd, buf, sizeof(buf));
if (len > 0) {
write(fd, buf, len);
write(0, buf, len);
} else {
break;
}
}
mevent_delete_close(mev);
pthread_mutex_unlock(&sync.e_mt);
pthread_mutex_destroy(&sync.e_mt);
pthread_cond_destroy(&sync.e_cond);
}
#else
static void *
echoer(void *param)
{
char buf[128];
int fd = (int)(uintptr_t) param;
int len;
while ((len = read(fd, buf, sizeof(buf))) > 0) {
write(1, buf, len);
}
}
#endif /* MEVENT_ECHO */
static void
acceptor_callback(int fd, enum ev_type type, void *param)
{
pthread_mutex_lock(&accept_mutex);
pthread_cond_signal(&accept_condvar);
pthread_mutex_unlock(&accept_mutex);
}
static void *
acceptor(void *param)
{
struct sockaddr_in sin;
pthread_t tid;
int news;
int s;
if ((s = socket(AF_INET, SOCK_STREAM, 0)) < 0) {
perror("socket");
exit(1);
}
sin.sin_len = sizeof(sin);
sin.sin_family = AF_INET;
sin.sin_addr.s_addr = htonl(INADDR_ANY);
sin.sin_port = htons(TEST_PORT);
if (bind(s, (struct sockaddr *)&sin, sizeof(sin)) < 0) {
perror("bind");
exit(1);
}
if (listen(s, 1) < 0) {
perror("listen");
exit(1);
}
(void) mevent_add(s, EVF_READ, acceptor_callback, NULL);
pthread_mutex_lock(&accept_mutex);
while (!pthread_cond_wait(&accept_condvar, &accept_mutex)) {
news = accept(s, NULL, NULL);
if (news < 0) {
perror("accept error");
} else {
printf("incoming connection, spawning thread\n");
pthread_create(&tid, NULL, echoer,
(void *)(uintptr_t)news);
}
}
}
main()
{
pthread_t tid;
pthread_create(&tid, NULL, acceptor, NULL);
mevent_dispatch();
}

976
usr.sbin/bhyve/pci_emul.c Normal file
View File

@ -0,0 +1,976 @@
/*-
* Copyright (c) 2011 NetApp, Inc.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* $FreeBSD$
*/
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
#include <sys/param.h>
#include <sys/linker_set.h>
#include <ctype.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <strings.h>
#include <assert.h>
#include <machine/vmm.h>
#include <vmmapi.h>
#include "fbsdrun.h"
#include "inout.h"
#include "pci_emul.h"
#define CONF1_ADDR_PORT 0x0cf8
#define CONF1_DATA_PORT 0x0cfc
#define CFGWRITE(pi,off,val,b) \
do { \
if ((b) == 1) { \
pci_set_cfgdata8((pi),(off),(val)); \
} else if ((b) == 2) { \
pci_set_cfgdata16((pi),(off),(val)); \
} else { \
pci_set_cfgdata32((pi),(off),(val)); \
} \
} while (0)
#define MAXSLOTS 32
static struct slotinfo {
char *si_name;
char *si_param;
struct pci_devinst *si_devi;
int si_titled;
int si_pslot;
char si_prefix;
char si_suffix;
} pci_slotinfo[MAXSLOTS];
/*
* NetApp specific:
* struct used to build an in-core OEM table to supply device names
* to driver instances
*/
static struct mptable_pci_devnames {
#define MPT_HDR_BASE 0
#define MPT_HDR_NAME 2
uint16_t md_hdrtype;
uint16_t md_entries;
uint16_t md_cksum;
uint16_t md_pad;
#define MPT_NTAP_SIG \
((uint32_t)(('P' << 24) | ('A' << 16) | ('T' << 8) | 'N'))
uint32_t md_sig;
uint32_t md_rsvd;
struct mptable_pci_slotinfo {
uint16_t mds_type;
uint16_t mds_phys_slot;
uint8_t mds_bus;
uint8_t mds_slot;
uint8_t mds_func;
uint8_t mds_pad;
uint16_t mds_vid;
uint16_t mds_did;
uint8_t mds_suffix[4];
uint8_t mds_prefix[4];
uint32_t mds_rsvd[3];
} md_slotinfo[MAXSLOTS];
} pci_devnames;
SET_DECLARE(pci_devemu_set, struct pci_devemu);
static uint64_t pci_emul_iobase;
static uint64_t pci_emul_membase32;
static uint64_t pci_emul_membase64;
#define PCI_EMUL_IOBASE 0x2000
#define PCI_EMUL_IOLIMIT 0x10000
#define PCI_EMUL_MEMBASE32 (lomem_sz)
#define PCI_EMUL_MEMLIMIT32 0xE0000000 /* 3.5GB */
#define PCI_EMUL_MEMBASE64 0xD000000000UL
#define PCI_EMUL_MEMLIMIT64 0xFD00000000UL
static int pci_emul_devices;
static int devname_elems;
/*
* I/O access
*/
/*
* Slot options are in the form:
*
* <slot>,<emul>[,<config>]
*
* slot is 0..31
* emul is a string describing the type of PCI device e.g. virtio-net
* config is an optional string, depending on the device, that can be
* used for configuration.
* Examples are:
* 1,virtio-net,tap0
* 3,dummy
*/
static void
pci_parse_slot_usage(char *aopt)
{
printf("Invalid PCI slot info field \"%s\"\n", aopt);
free(aopt);
}
void
pci_parse_slot(char *opt)
{
char *slot, *emul, *config;
char *str, *cpy;
int snum;
str = cpy = strdup(opt);
config = NULL;
slot = strsep(&str, ",");
emul = strsep(&str, ",");
if (str != NULL) {
config = strsep(&str, ",");
}
if (emul == NULL) {
pci_parse_slot_usage(cpy);
return;
}
snum = 255;
snum = atoi(slot);
if (snum < 0 || snum >= MAXSLOTS) {
pci_parse_slot_usage(cpy);
} else {
pci_slotinfo[snum].si_name = emul;
pci_slotinfo[snum].si_param = config;
}
}
/*
*
* PCI MPTable names are of the form:
*
* <slot>,[prefix]<digit><suffix>
*
* .. with <prefix> an alphabetic char, <digit> a 1 or 2-digit string,
* and <suffix> a single char.
*
* Examples:
* 1,e0c
* 4,e0P
* 6,43a
* 7,0f
* 10,1
* 12,e0M
* 2,12a
*
* Note that this is NetApp-specific, but is ignored on other o/s's.
*/
static void
pci_parse_name_usage(char *aopt)
{
printf("Invalid PCI slot name field \"%s\"\n", aopt);
}
void
pci_parse_name(char *opt)
{
char csnum[4];
char *namestr;
char *slotend;
char prefix, suffix;
int i;
int pslot;
int snum;
pslot = -1;
prefix = suffix = 0;
slotend = strchr(opt, ',');
/*
* A comma must be present, and can't be the first character
* or no slot would be present. Also, the slot number can't be
* more than 2 characters.
*/
if (slotend == NULL || slotend == opt || (slotend - opt > 2)) {
pci_parse_name_usage(opt);
return;
}
for (i = 0; i < (slotend - opt); i++) {
csnum[i] = opt[i];
}
csnum[i] = '\0';
snum = 255;
snum = atoi(csnum);
if (snum < 0 || snum >= MAXSLOTS) {
pci_parse_name_usage(opt);
return;
}
namestr = slotend + 1;
if (strlen(namestr) > 3) {
pci_parse_name_usage(opt);
return;
}
if (isalpha(*namestr)) {
prefix = *namestr++;
}
if (!isdigit(*namestr)) {
pci_parse_name_usage(opt);
} else {
pslot = *namestr++ - '0';
if (isnumber(*namestr)) {
pslot = 10*pslot + *namestr++ - '0';
}
if (isalpha(*namestr) && *(namestr + 1) == 0) {
suffix = *namestr;
pci_slotinfo[snum].si_titled = 1;
pci_slotinfo[snum].si_pslot = pslot;
pci_slotinfo[snum].si_prefix = prefix;
pci_slotinfo[snum].si_suffix = suffix;
} else {
pci_parse_name_usage(opt);
}
}
}
static void
pci_add_mptable_name(struct slotinfo *si)
{
struct mptable_pci_slotinfo *ms;
/*
* If naming information has been supplied for this slot, populate
* the next available mptable OEM entry
*/
if (si->si_titled) {
ms = &pci_devnames.md_slotinfo[devname_elems];
ms->mds_type = MPT_HDR_NAME;
ms->mds_phys_slot = si->si_pslot;
ms->mds_bus = si->si_devi->pi_bus;
ms->mds_slot = si->si_devi->pi_slot;
ms->mds_func = si->si_devi->pi_func;
ms->mds_vid = pci_get_cfgdata16(si->si_devi, PCIR_VENDOR);
ms->mds_did = pci_get_cfgdata16(si->si_devi, PCIR_DEVICE);
ms->mds_suffix[0] = si->si_suffix;
ms->mds_prefix[0] = si->si_prefix;
devname_elems++;
}
}
static void
pci_finish_mptable_names(void)
{
int size;
if (devname_elems) {
pci_devnames.md_hdrtype = MPT_HDR_BASE;
pci_devnames.md_entries = devname_elems;
pci_devnames.md_cksum = 0; /* XXX */
pci_devnames.md_sig = MPT_NTAP_SIG;
size = (uintptr_t)&pci_devnames.md_slotinfo[devname_elems] -
(uintptr_t)&pci_devnames;
fbsdrun_add_oemtbl(&pci_devnames, size);
}
}
static int
pci_emul_handler(struct vmctx *ctx, int vcpu, int in, int port, int bytes,
uint32_t *eax, void *arg)
{
struct pci_devinst *pdi = arg;
struct pci_devemu *pe = pdi->pi_d;
int offset, i;
for (i = 0; i <= PCI_BARMAX; i++) {
if (pdi->pi_bar[i].type == PCIBAR_IO &&
port >= pdi->pi_bar[i].addr &&
port + bytes <= pdi->pi_bar[i].addr + pdi->pi_bar[i].size) {
offset = port - pdi->pi_bar[i].addr;
if (in)
*eax = (*pe->pe_ior)(pdi, i, offset, bytes);
else
(*pe->pe_iow)(pdi, i, offset, bytes, *eax);
return (0);
}
}
return (-1);
}
static int
pci_emul_alloc_resource(uint64_t *baseptr, uint64_t limit, uint64_t size,
uint64_t *addr)
{
uint64_t base;
assert((size & (size - 1)) == 0); /* must be a power of 2 */
base = roundup2(*baseptr, size);
if (base + size <= limit) {
*addr = base;
*baseptr = base + size;
return (0);
} else
return (-1);
}
int
pci_emul_alloc_bar(struct pci_devinst *pdi, int idx, uint64_t hostbase,
enum pcibar_type type, uint64_t size)
{
int i, error;
uint64_t *baseptr, limit, addr, mask, lobits, bar;
struct inout_port iop;
assert(idx >= 0 && idx <= PCI_BARMAX);
if ((size & (size - 1)) != 0)
size = 1UL << flsl(size); /* round up to a power of 2 */
switch (type) {
case PCIBAR_NONE:
baseptr = NULL;
addr = mask = lobits = 0;
break;
case PCIBAR_IO:
baseptr = &pci_emul_iobase;
limit = PCI_EMUL_IOLIMIT;
mask = PCIM_BAR_IO_BASE;
lobits = PCIM_BAR_IO_SPACE;
break;
case PCIBAR_MEM64:
/*
* XXX
* Some drivers do not work well if the 64-bit BAR is allocated
* above 4GB. Allow for this by allocating small requests under
* 4GB unless then allocation size is larger than some arbitrary
* number (32MB currently).
*/
if (size > 32 * 1024 * 1024) {
/*
* XXX special case for device requiring peer-peer DMA
*/
if (size == 0x100000000UL)
baseptr = &hostbase;
else
baseptr = &pci_emul_membase64;
limit = PCI_EMUL_MEMLIMIT64;
mask = PCIM_BAR_MEM_BASE;
lobits = PCIM_BAR_MEM_SPACE | PCIM_BAR_MEM_64 |
PCIM_BAR_MEM_PREFETCH;
break;
}
/* fallthrough */
case PCIBAR_MEM32:
baseptr = &pci_emul_membase32;
limit = PCI_EMUL_MEMLIMIT32;
mask = PCIM_BAR_MEM_BASE;
lobits = PCIM_BAR_MEM_SPACE | PCIM_BAR_MEM_32;
break;
default:
printf("pci_emul_alloc_base: invalid bar type %d\n", type);
assert(0);
}
if (baseptr != NULL) {
error = pci_emul_alloc_resource(baseptr, limit, size, &addr);
if (error != 0)
return (error);
}
pdi->pi_bar[idx].type = type;
pdi->pi_bar[idx].addr = addr;
pdi->pi_bar[idx].size = size;
/* Initialize the BAR register in config space */
bar = (addr & mask) | lobits;
pci_set_cfgdata32(pdi, PCIR_BAR(idx), bar);
if (type == PCIBAR_MEM64) {
assert(idx + 1 <= PCI_BARMAX);
pdi->pi_bar[idx + 1].type = PCIBAR_MEMHI64;
pci_set_cfgdata32(pdi, PCIR_BAR(idx + 1), bar >> 32);
}
/* add a handler to intercept accesses to the I/O bar */
if (type == PCIBAR_IO) {
iop.name = pdi->pi_name;
iop.flags = IOPORT_F_INOUT;
iop.handler = pci_emul_handler;
iop.arg = pdi;
for (i = 0; i < size; i++) {
iop.port = addr + i;
register_inout(&iop);
}
}
return (0);
}
#define CAP_START_OFFSET 0x40
static int
pci_emul_add_capability(struct pci_devinst *pi, u_char *capdata, int caplen)
{
int i, capoff, capid, reallen;
uint16_t sts;
static u_char endofcap[4] = {
PCIY_RESERVED, 0, 0, 0
};
assert(caplen > 0 && capdata[0] != PCIY_RESERVED);
reallen = roundup2(caplen, 4); /* dword aligned */
sts = pci_get_cfgdata16(pi, PCIR_STATUS);
if ((sts & PCIM_STATUS_CAPPRESENT) == 0) {
capoff = CAP_START_OFFSET;
pci_set_cfgdata8(pi, PCIR_CAP_PTR, capoff);
pci_set_cfgdata16(pi, PCIR_STATUS, sts|PCIM_STATUS_CAPPRESENT);
} else {
capoff = pci_get_cfgdata8(pi, PCIR_CAP_PTR);
while (1) {
assert((capoff & 0x3) == 0);
capid = pci_get_cfgdata8(pi, capoff);
if (capid == PCIY_RESERVED)
break;
capoff = pci_get_cfgdata8(pi, capoff + 1);
}
}
/* Check if we have enough space */
if (capoff + reallen + sizeof(endofcap) > PCI_REGMAX + 1)
return (-1);
/* Copy the capability */
for (i = 0; i < caplen; i++)
pci_set_cfgdata8(pi, capoff + i, capdata[i]);
/* Set the next capability pointer */
pci_set_cfgdata8(pi, capoff + 1, capoff + reallen);
/* Copy of the reserved capability which serves as the end marker */
for (i = 0; i < sizeof(endofcap); i++)
pci_set_cfgdata8(pi, capoff + reallen + i, endofcap[i]);
return (0);
}
static struct pci_devemu *
pci_emul_finddev(char *name)
{
struct pci_devemu **pdpp, *pdp;
SET_FOREACH(pdpp, pci_devemu_set) {
pdp = *pdpp;
if (!strcmp(pdp->pe_emu, name)) {
return (pdp);
}
}
return (NULL);
}
static void
pci_emul_init(struct vmctx *ctx, struct pci_devemu *pde, int slot, char *params)
{
struct pci_devinst *pdi;
pdi = malloc(sizeof(struct pci_devinst));
bzero(pdi, sizeof(*pdi));
pdi->pi_vmctx = ctx;
pdi->pi_bus = 0;
pdi->pi_slot = slot;
pdi->pi_func = 0;
pdi->pi_d = pde;
snprintf(pdi->pi_name, PI_NAMESZ, "%s-pci-%d", pde->pe_emu, slot);
/* Disable legacy interrupts */
pci_set_cfgdata8(pdi, PCIR_INTLINE, 255);
pci_set_cfgdata8(pdi, PCIR_INTPIN, 0);
pci_set_cfgdata8(pdi, PCIR_COMMAND,
PCIM_CMD_PORTEN | PCIM_CMD_MEMEN | PCIM_CMD_BUSMASTEREN);
if ((*pde->pe_init)(ctx, pdi, params) != 0) {
free(pdi);
} else {
pci_emul_devices++;
pci_slotinfo[slot].si_devi = pdi;
}
}
void
pci_populate_msicap(struct msicap *msicap, int msgnum, int nextptr)
{
int mmc;
CTASSERT(sizeof(struct msicap) == 14);
/* Number of msi messages must be a power of 2 between 1 and 32 */
assert((msgnum & (msgnum - 1)) == 0 && msgnum >= 1 && msgnum <= 32);
mmc = ffs(msgnum) - 1;
bzero(msicap, sizeof(struct msicap));
msicap->capid = PCIY_MSI;
msicap->nextptr = nextptr;
msicap->msgctrl = PCIM_MSICTRL_64BIT | (mmc << 1);
}
int
pci_emul_add_msicap(struct pci_devinst *pi, int msgnum)
{
struct msicap msicap;
pci_populate_msicap(&msicap, msgnum, 0);
return (pci_emul_add_capability(pi, (u_char *)&msicap, sizeof(msicap)));
}
void
msicap_cfgwrite(struct pci_devinst *pi, int capoff, int offset,
int bytes, uint32_t val)
{
uint16_t msgctrl, rwmask, msgdata, mme;
uint32_t addrlo;
/*
* If guest is writing to the message control register make sure
* we do not overwrite read-only fields.
*/
if ((offset - capoff) == 2 && bytes == 2) {
rwmask = PCIM_MSICTRL_MME_MASK | PCIM_MSICTRL_MSI_ENABLE;
msgctrl = pci_get_cfgdata16(pi, offset);
msgctrl &= ~rwmask;
msgctrl |= val & rwmask;
val = msgctrl;
addrlo = pci_get_cfgdata32(pi, capoff + 4);
if (msgctrl & PCIM_MSICTRL_64BIT)
msgdata = pci_get_cfgdata16(pi, capoff + 12);
else
msgdata = pci_get_cfgdata16(pi, capoff + 8);
/*
* XXX check delivery mode, destination mode etc
*/
mme = msgctrl & PCIM_MSICTRL_MME_MASK;
pi->pi_msi.enabled = msgctrl & PCIM_MSICTRL_MSI_ENABLE ? 1 : 0;
if (pi->pi_msi.enabled) {
pi->pi_msi.cpu = (addrlo >> 12) & 0xff;
pi->pi_msi.vector = msgdata & 0xff;
pi->pi_msi.msgnum = 1 << (mme >> 4);
} else {
pi->pi_msi.cpu = 0;
pi->pi_msi.vector = 0;
pi->pi_msi.msgnum = 0;
}
}
CFGWRITE(pi, offset, val, bytes);
}
/*
* This function assumes that 'coff' is in the capabilities region of the
* config space.
*/
static void
pci_emul_capwrite(struct pci_devinst *pi, int offset, int bytes, uint32_t val)
{
int capid;
uint8_t capoff, nextoff;
/* Do not allow un-aligned writes */
if ((offset & (bytes - 1)) != 0)
return;
/* Find the capability that we want to update */
capoff = CAP_START_OFFSET;
while (1) {
capid = pci_get_cfgdata8(pi, capoff);
if (capid == PCIY_RESERVED)
break;
nextoff = pci_get_cfgdata8(pi, capoff + 1);
if (offset >= capoff && offset < nextoff)
break;
capoff = nextoff;
}
assert(offset >= capoff);
/*
* Capability ID and Next Capability Pointer are readonly
*/
if (offset == capoff || offset == capoff + 1)
return;
switch (capid) {
case PCIY_MSI:
msicap_cfgwrite(pi, capoff, offset, bytes, val);
break;
default:
break;
}
}
static int
pci_emul_iscap(struct pci_devinst *pi, int offset)
{
int found;
uint16_t sts;
uint8_t capid, lastoff;
found = 0;
sts = pci_get_cfgdata16(pi, PCIR_STATUS);
if ((sts & PCIM_STATUS_CAPPRESENT) != 0) {
lastoff = pci_get_cfgdata8(pi, PCIR_CAP_PTR);
while (1) {
assert((lastoff & 0x3) == 0);
capid = pci_get_cfgdata8(pi, lastoff);
if (capid == PCIY_RESERVED)
break;
lastoff = pci_get_cfgdata8(pi, lastoff + 1);
}
if (offset >= CAP_START_OFFSET && offset <= lastoff)
found = 1;
}
return (found);
}
void
init_pci(struct vmctx *ctx)
{
struct pci_devemu *pde;
struct slotinfo *si;
int i;
pci_emul_iobase = PCI_EMUL_IOBASE;
pci_emul_membase32 = PCI_EMUL_MEMBASE32;
pci_emul_membase64 = PCI_EMUL_MEMBASE64;
si = pci_slotinfo;
for (i = 0; i < MAXSLOTS; i++, si++) {
if (si->si_name != NULL) {
pde = pci_emul_finddev(si->si_name);
if (pde != NULL) {
pci_emul_init(ctx, pde, i, si->si_param);
pci_add_mptable_name(si);
}
}
}
pci_finish_mptable_names();
}
int
pci_msi_enabled(struct pci_devinst *pi)
{
return (pi->pi_msi.enabled);
}
int
pci_msi_msgnum(struct pci_devinst *pi)
{
if (pi->pi_msi.enabled)
return (pi->pi_msi.msgnum);
else
return (0);
}
void
pci_generate_msi(struct pci_devinst *pi, int msg)
{
if (pci_msi_enabled(pi) && msg < pci_msi_msgnum(pi)) {
vm_lapic_irq(pi->pi_vmctx,
pi->pi_msi.cpu,
pi->pi_msi.vector + msg);
}
}
static int cfgbus, cfgslot, cfgfunc, cfgoff;
static int
pci_emul_cfgaddr(struct vmctx *ctx, int vcpu, int in, int port, int bytes,
uint32_t *eax, void *arg)
{
uint32_t x;
assert(!in);
if (bytes != 4)
return (-1);
x = *eax;
cfgoff = x & PCI_REGMAX;
cfgfunc = (x >> 8) & PCI_FUNCMAX;
cfgslot = (x >> 11) & PCI_SLOTMAX;
cfgbus = (x >> 16) & PCI_BUSMAX;
return (0);
}
INOUT_PORT(pci_cfgaddr, CONF1_ADDR_PORT, IOPORT_F_OUT, pci_emul_cfgaddr);
static int
pci_emul_cfgdata(struct vmctx *ctx, int vcpu, int in, int port, int bytes,
uint32_t *eax, void *arg)
{
struct pci_devinst *pi;
struct pci_devemu *pe;
int coff, idx;
uint64_t mask, bar;
assert(bytes == 1 || bytes == 2 || bytes == 4);
pi = pci_slotinfo[cfgslot].si_devi;
coff = cfgoff + (port - CONF1_DATA_PORT);
#if 0
printf("pcicfg-%s from 0x%0x of %d bytes (%d/%d/%d)\n\r",
in ? "read" : "write", coff, bytes, cfgbus, cfgslot, cfgfunc);
#endif
if (pi == NULL || cfgfunc != 0) {
if (in)
*eax = 0xffffffff;
return (0);
}
pe = pi->pi_d;
/*
* Config read
*/
if (in) {
/* Let the device emulation override the default handler */
if (pe->pe_cfgread != NULL &&
(*pe->pe_cfgread)(ctx, vcpu, pi, coff, bytes, eax) == 0)
return (0);
if (bytes == 1)
*eax = pci_get_cfgdata8(pi, coff);
else if (bytes == 2)
*eax = pci_get_cfgdata16(pi, coff);
else
*eax = pci_get_cfgdata32(pi, coff);
} else {
/* Let the device emulation override the default handler */
if (pe->pe_cfgwrite != NULL &&
(*pe->pe_cfgwrite)(ctx, vcpu, pi, coff, bytes, *eax) == 0)
return (0);
/*
* Special handling for write to BAR registers
*/
if (coff >= PCIR_BAR(0) && coff < PCIR_BAR(PCI_BARMAX + 1)) {
/*
* Ignore writes to BAR registers that are not
* 4-byte aligned.
*/
if (bytes != 4 || (coff & 0x3) != 0)
return (0);
idx = (coff - PCIR_BAR(0)) / 4;
switch (pi->pi_bar[idx].type) {
case PCIBAR_NONE:
bar = 0;
break;
case PCIBAR_IO:
mask = ~(pi->pi_bar[idx].size - 1);
mask &= PCIM_BAR_IO_BASE;
bar = (*eax & mask) | PCIM_BAR_IO_SPACE;
break;
case PCIBAR_MEM32:
mask = ~(pi->pi_bar[idx].size - 1);
mask &= PCIM_BAR_MEM_BASE;
bar = *eax & mask;
bar |= PCIM_BAR_MEM_SPACE | PCIM_BAR_MEM_32;
break;
case PCIBAR_MEM64:
mask = ~(pi->pi_bar[idx].size - 1);
mask &= PCIM_BAR_MEM_BASE;
bar = *eax & mask;
bar |= PCIM_BAR_MEM_SPACE | PCIM_BAR_MEM_64 |
PCIM_BAR_MEM_PREFETCH;
break;
case PCIBAR_MEMHI64:
mask = ~(pi->pi_bar[idx - 1].size - 1);
mask &= PCIM_BAR_MEM_BASE;
bar = ((uint64_t)*eax << 32) & mask;
bar = bar >> 32;
break;
default:
assert(0);
}
pci_set_cfgdata32(pi, coff, bar);
} else if (pci_emul_iscap(pi, coff)) {
pci_emul_capwrite(pi, coff, bytes, *eax);
} else {
CFGWRITE(pi, coff, *eax, bytes);
}
}
return (0);
}
INOUT_PORT(pci_cfgdata, CONF1_DATA_PORT+0, IOPORT_F_INOUT, pci_emul_cfgdata);
INOUT_PORT(pci_cfgdata, CONF1_DATA_PORT+1, IOPORT_F_INOUT, pci_emul_cfgdata);
INOUT_PORT(pci_cfgdata, CONF1_DATA_PORT+2, IOPORT_F_INOUT, pci_emul_cfgdata);
INOUT_PORT(pci_cfgdata, CONF1_DATA_PORT+3, IOPORT_F_INOUT, pci_emul_cfgdata);
/*
* I/O ports to configure PCI IRQ routing. We ignore all writes to it.
*/
static int
pci_irq_port_handler(struct vmctx *ctx, int vcpu, int in, int port, int bytes,
uint32_t *eax, void *arg)
{
assert(in == 0);
return (0);
}
INOUT_PORT(pci_irq, 0xC00, IOPORT_F_OUT, pci_irq_port_handler);
INOUT_PORT(pci_irq, 0xC01, IOPORT_F_OUT, pci_irq_port_handler);
#define PCI_EMUL_TEST
#ifdef PCI_EMUL_TEST
/*
* Define a dummy test device
*/
#define DREGSZ 20
struct pci_emul_dsoftc {
uint8_t regs[DREGSZ];
};
#define PCI_EMUL_MSGS 4
static int
pci_emul_dinit(struct vmctx *ctx, struct pci_devinst *pi, char *opts)
{
int error;
struct pci_emul_dsoftc *sc;
sc = malloc(sizeof(struct pci_emul_dsoftc));
memset(sc, 0, sizeof(struct pci_emul_dsoftc));
pi->pi_arg = sc;
pci_set_cfgdata16(pi, PCIR_DEVICE, 0x0001);
pci_set_cfgdata16(pi, PCIR_VENDOR, 0x10DD);
pci_set_cfgdata8(pi, PCIR_CLASS, 0x02);
error = pci_emul_alloc_bar(pi, 0, 0, PCIBAR_IO, DREGSZ);
assert(error == 0);
error = pci_emul_add_msicap(pi, PCI_EMUL_MSGS);
assert(error == 0);
return (0);
}
static void
pci_emul_diow(struct pci_devinst *pi, int baridx, int offset, int size,
uint32_t value)
{
int i;
struct pci_emul_dsoftc *sc = pi->pi_arg;
if (offset + size > DREGSZ) {
printf("diow: too large, offset %d size %d\n", offset, size);
return;
}
if (size == 1) {
sc->regs[offset] = value & 0xff;
} else if (size == 2) {
*(uint16_t *)&sc->regs[offset] = value & 0xffff;
} else {
*(uint32_t *)&sc->regs[offset] = value;
}
/*
* Special magic value to generate an interrupt
*/
if (offset == 4 && size == 4 && pci_msi_enabled(pi))
pci_generate_msi(pi, value % pci_msi_msgnum(pi));
if (value == 0xabcdef) {
for (i = 0; i < pci_msi_msgnum(pi); i++)
pci_generate_msi(pi, i);
}
}
static uint32_t
pci_emul_dior(struct pci_devinst *pi, int baridx, int offset, int size)
{
struct pci_emul_dsoftc *sc = pi->pi_arg;
uint32_t value;
if (offset + size > DREGSZ) {
printf("dior: too large, offset %d size %d\n", offset, size);
return (0);
}
if (size == 1) {
value = sc->regs[offset];
} else if (size == 2) {
value = *(uint16_t *) &sc->regs[offset];
} else {
value = *(uint32_t *) &sc->regs[offset];
}
return (value);
}
struct pci_devemu pci_dummy = {
.pe_emu = "dummy",
.pe_init = pci_emul_dinit,
.pe_iow = pci_emul_diow,
.pe_ior = pci_emul_dior
};
PCI_EMUL_SET(pci_dummy);
#endif /* PCI_EMUL_TEST */

171
usr.sbin/bhyve/pci_emul.h Normal file
View File

@ -0,0 +1,171 @@
/*-
* Copyright (c) 2011 NetApp, Inc.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* $FreeBSD$
*/
#ifndef _PCI_EMUL_H_
#define _PCI_EMUL_H_
#include <sys/types.h>
#include <sys/queue.h>
#include <sys/kernel.h>
#include <dev/pci/pcireg.h>
#include <assert.h>
#define PCI_BARMAX PCIR_MAX_BAR_0 /* BAR registers in a Type 0 header */
#define PCIY_RESERVED 0x00
struct vmctx;
struct pci_devinst;
struct pci_devemu {
char *pe_emu; /* Name of device emulation */
/* instance creation */
int (*pe_init)(struct vmctx *, struct pci_devinst *, char *opts);
/* config space read/write callbacks */
int (*pe_cfgwrite)(struct vmctx *ctx, int vcpu,
struct pci_devinst *pi, int offset,
int bytes, uint32_t val);
int (*pe_cfgread)(struct vmctx *ctx, int vcpu,
struct pci_devinst *pi, int offset,
int bytes, uint32_t *retval);
/* I/O space read/write callbacks */
void (*pe_iow)(struct pci_devinst *pi, int baridx,
int offset, int size, uint32_t value);
uint32_t (*pe_ior)(struct pci_devinst *pi, int baridx,
int offset, int size);
};
#define PCI_EMUL_SET(x) DATA_SET(pci_devemu_set, x);
enum pcibar_type {
PCIBAR_NONE,
PCIBAR_IO,
PCIBAR_MEM32,
PCIBAR_MEM64,
PCIBAR_MEMHI64
};
struct pcibar {
enum pcibar_type type; /* io or memory */
uint64_t size;
uint64_t addr;
};
#define PI_NAMESZ 40
struct pci_devinst {
struct pci_devemu *pi_d;
struct vmctx *pi_vmctx;
uint8_t pi_bus, pi_slot, pi_func;
char pi_name[PI_NAMESZ];
uint16_t pi_iobase;
int pi_bar_getsize;
struct {
int enabled;
int cpu;
int vector;
int msgnum;
} pi_msi;
void *pi_arg; /* devemu-private data */
u_char pi_cfgdata[PCI_REGMAX + 1];
struct pcibar pi_bar[PCI_BARMAX + 1];
};
struct msicap {
uint8_t capid;
uint8_t nextptr;
uint16_t msgctrl;
uint32_t addrlo;
uint32_t addrhi;
uint16_t msgdata;
} __packed;
void init_pci(struct vmctx *ctx);
void pci_parse_slot(char *opt);
void pci_parse_name(char *opt);
void pci_callback(void);
int pci_emul_alloc_bar(struct pci_devinst *pdi, int idx, uint64_t hostbase,
enum pcibar_type type, uint64_t size);
int pci_emul_add_msicap(struct pci_devinst *pi, int msgnum);
void msicap_cfgwrite(struct pci_devinst *pi, int capoff, int offset,
int bytes, uint32_t val);
void pci_generate_msi(struct pci_devinst *pi, int msgnum);
int pci_msi_enabled(struct pci_devinst *pi);
int pci_msi_msgnum(struct pci_devinst *pi);
void pci_populate_msicap(struct msicap *cap, int msgs, int nextptr);
static __inline void
pci_set_cfgdata8(struct pci_devinst *pi, int offset, uint8_t val)
{
assert(offset <= PCI_REGMAX);
*(uint8_t *)(pi->pi_cfgdata + offset) = val;
}
static __inline void
pci_set_cfgdata16(struct pci_devinst *pi, int offset, uint16_t val)
{
assert(offset <= (PCI_REGMAX - 1) && (offset & 1) == 0);
*(uint16_t *)(pi->pi_cfgdata + offset) = val;
}
static __inline void
pci_set_cfgdata32(struct pci_devinst *pi, int offset, uint32_t val)
{
assert(offset <= (PCI_REGMAX - 3) && (offset & 3) == 0);
*(uint32_t *)(pi->pi_cfgdata + offset) = val;
}
static __inline uint8_t
pci_get_cfgdata8(struct pci_devinst *pi, int offset)
{
assert(offset <= PCI_REGMAX);
return (*(uint8_t *)(pi->pi_cfgdata + offset));
}
static __inline uint16_t
pci_get_cfgdata16(struct pci_devinst *pi, int offset)
{
assert(offset <= (PCI_REGMAX - 1) && (offset & 1) == 0);
return (*(uint16_t *)(pi->pi_cfgdata + offset));
}
static __inline uint32_t
pci_get_cfgdata32(struct pci_devinst *pi, int offset)
{
assert(offset <= (PCI_REGMAX - 3) && (offset & 3) == 0);
return (*(uint32_t *)(pi->pi_cfgdata + offset));
}
#endif /* _PCI_EMUL_H_ */

View File

@ -0,0 +1,52 @@
/*-
* Copyright (c) 2011 NetApp, Inc.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* $FreeBSD$
*/
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
#include "pci_emul.h"
static int
pci_hostbridge_init(struct vmctx *ctx, struct pci_devinst *pi, char *opts)
{
/* config space */
pci_set_cfgdata16(pi, PCIR_VENDOR, 0x1275); /* NetApp */
pci_set_cfgdata16(pi, PCIR_DEVICE, 0x1275); /* NetApp */
pci_set_cfgdata8(pi, PCIR_HDRTYPE, PCIM_HDRTYPE_BRIDGE);
pci_set_cfgdata8(pi, PCIR_CLASS, PCIC_BRIDGE);
pci_set_cfgdata8(pi, PCIR_SUBCLASS, PCIS_BRIDGE_HOST);
return (0);
}
struct pci_devemu pci_de_hostbridge = {
.pe_emu = "hostbridge",
.pe_init = pci_hostbridge_init,
};
PCI_EMUL_SET(pci_de_hostbridge);

View File

@ -0,0 +1,508 @@
/*-
* Copyright (c) 2011 NetApp, Inc.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* $FreeBSD$
*/
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
#include <sys/param.h>
#include <sys/types.h>
#include <sys/pciio.h>
#include <sys/ioctl.h>
#include <dev/io/iodev.h>
#include <machine/iodev.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <errno.h>
#include <fcntl.h>
#include <unistd.h>
#include <machine/vmm.h>
#include <vmmapi.h>
#include "pci_emul.h"
#ifndef _PATH_DEVPCI
#define _PATH_DEVPCI "/dev/pci"
#endif
#ifndef _PATH_DEVIO
#define _PATH_DEVIO "/dev/io"
#endif
#define LEGACY_SUPPORT 1
static int pcifd = -1;
static int iofd = -1;
struct passthru_softc {
struct pci_devinst *psc_pi;
struct pcibar psc_bar[PCI_BARMAX + 1];
struct {
int capoff;
int msgctrl;
int emulated;
} psc_msi;
struct pcisel psc_sel;
};
static int
msi_caplen(int msgctrl)
{
int len;
len = 10; /* minimum length of msi capability */
if (msgctrl & PCIM_MSICTRL_64BIT)
len += 4;
#if 0
/*
* Ignore the 'mask' and 'pending' bits in the MSI capability.
* We'll let the guest manipulate them directly.
*/
if (msgctrl & PCIM_MSICTRL_VECTOR)
len += 10;
#endif
return (len);
}
static uint32_t
read_config(const struct pcisel *sel, long reg, int width)
{
struct pci_io pi;
bzero(&pi, sizeof(pi));
pi.pi_sel = *sel;
pi.pi_reg = reg;
pi.pi_width = width;
if (ioctl(pcifd, PCIOCREAD, &pi) < 0)
return (0); /* XXX */
else
return (pi.pi_data);
}
static void
write_config(const struct pcisel *sel, long reg, int width, uint32_t data)
{
struct pci_io pi;
bzero(&pi, sizeof(pi));
pi.pi_sel = *sel;
pi.pi_reg = reg;
pi.pi_width = width;
pi.pi_data = data;
(void)ioctl(pcifd, PCIOCWRITE, &pi); /* XXX */
}
#ifdef LEGACY_SUPPORT
static int
passthru_add_msicap(struct pci_devinst *pi, int msgnum, int nextptr)
{
int capoff, i;
struct msicap msicap;
u_char *capdata;
pci_populate_msicap(&msicap, msgnum, nextptr);
/*
* XXX
* Copy the msi capability structure in the last 16 bytes of the
* config space. This is wrong because it could shadow something
* useful to the device.
*/
capoff = 256 - roundup(sizeof(msicap), 4);
capdata = (u_char *)&msicap;
for (i = 0; i < sizeof(msicap); i++)
pci_set_cfgdata8(pi, capoff + i, capdata[i]);
return (capoff);
}
#endif /* LEGACY_SUPPORT */
static int
cfginitmsi(struct passthru_softc *sc)
{
int ptr, cap, sts, caplen;
uint32_t u32;
struct pcisel sel;
struct pci_devinst *pi;
pi = sc->psc_pi;
sel = sc->psc_sel;
/*
* Parse the capabilities and cache the location of the MSI
* capability.
*/
sts = read_config(&sel, PCIR_STATUS, 2);
if (sts & PCIM_STATUS_CAPPRESENT) {
ptr = read_config(&sel, PCIR_CAP_PTR, 1);
while (ptr != 0 && ptr != 0xff) {
cap = read_config(&sel, ptr + PCICAP_ID, 1);
if (cap == PCIY_MSI) {
/*
* Copy the MSI capability into the config
* space of the emulated pci device
*/
sc->psc_msi.capoff = ptr;
sc->psc_msi.msgctrl = read_config(&sel,
ptr + 2, 2);
sc->psc_msi.emulated = 0;
caplen = msi_caplen(sc->psc_msi.msgctrl);
while (caplen > 0) {
u32 = read_config(&sel, ptr, 4);
pci_set_cfgdata32(pi, ptr, u32);
caplen -= 4;
ptr += 4;
}
break;
}
ptr = read_config(&sel, ptr + PCICAP_NEXTPTR, 1);
}
}
#ifdef LEGACY_SUPPORT
/*
* If the passthrough device does not support MSI then craft a
* MSI capability for it. We link the new MSI capability at the
* head of the list of capabilities.
*/
if ((sts & PCIM_STATUS_CAPPRESENT) != 0 && sc->psc_msi.capoff == 0) {
int origptr, msiptr;
origptr = read_config(&sel, PCIR_CAP_PTR, 1);
msiptr = passthru_add_msicap(pi, 1, origptr);
sc->psc_msi.capoff = msiptr;
sc->psc_msi.msgctrl = pci_get_cfgdata16(pi, msiptr + 2);
sc->psc_msi.emulated = 1;
pci_set_cfgdata8(pi, PCIR_CAP_PTR, msiptr);
}
#endif
if (sc->psc_msi.capoff == 0) /* MSI or bust */
return (-1);
else
return (0);
}
static int
cfginitbar(struct vmctx *ctx, struct passthru_softc *sc)
{
int i, error;
struct pci_devinst *pi;
struct pci_bar_io bar;
enum pcibar_type bartype;
uint64_t base;
pi = sc->psc_pi;
/*
* Initialize BAR registers
*/
for (i = 0; i <= PCI_BARMAX; i++) {
bzero(&bar, sizeof(bar));
bar.pbi_sel = sc->psc_sel;
bar.pbi_reg = PCIR_BAR(i);
if (ioctl(pcifd, PCIOCGETBAR, &bar) < 0)
continue;
if (PCI_BAR_IO(bar.pbi_base)) {
bartype = PCIBAR_IO;
base = bar.pbi_base & PCIM_BAR_IO_BASE;
} else {
switch (bar.pbi_base & PCIM_BAR_MEM_TYPE) {
case PCIM_BAR_MEM_64:
bartype = PCIBAR_MEM64;
break;
default:
bartype = PCIBAR_MEM32;
break;
}
base = bar.pbi_base & PCIM_BAR_MEM_BASE;
}
/* Cache information about the "real" BAR */
sc->psc_bar[i].type = bartype;
sc->psc_bar[i].size = bar.pbi_length;
sc->psc_bar[i].addr = base;
/* Allocate the BAR in the guest I/O or MMIO space */
error = pci_emul_alloc_bar(pi, i, base, bartype,
bar.pbi_length);
if (error)
return (-1);
/*
* Map the physical MMIO space in the guest MMIO space
*/
if (bartype != PCIBAR_IO) {
error = vm_map_pptdev_mmio(ctx, sc->psc_sel.pc_bus,
sc->psc_sel.pc_dev, sc->psc_sel.pc_func,
pi->pi_bar[i].addr, pi->pi_bar[i].size, base);
if (error)
return (-1);
}
/*
* 64-bit BAR takes up two slots so skip the next one.
*/
if (bartype == PCIBAR_MEM64) {
i++;
assert(i <= PCI_BARMAX);
sc->psc_bar[i].type = PCIBAR_MEMHI64;
}
}
return (0);
}
static int
cfginit(struct vmctx *ctx, struct pci_devinst *pi, int bus, int slot, int func)
{
int error;
struct passthru_softc *sc;
error = 1;
sc = pi->pi_arg;
bzero(&sc->psc_sel, sizeof(struct pcisel));
sc->psc_sel.pc_bus = bus;
sc->psc_sel.pc_dev = slot;
sc->psc_sel.pc_func = func;
if (cfginitbar(ctx, sc) != 0)
goto done;
if (cfginitmsi(sc) != 0)
goto done;
error = 0; /* success */
done:
return (error);
}
static int
passthru_init(struct vmctx *ctx, struct pci_devinst *pi, char *opts)
{
int bus, slot, func, error;
struct passthru_softc *sc;
sc = NULL;
error = 1;
if (pcifd < 0) {
pcifd = open(_PATH_DEVPCI, O_RDWR, 0);
if (pcifd < 0)
goto done;
}
if (iofd < 0) {
iofd = open(_PATH_DEVIO, O_RDWR, 0);
if (iofd < 0)
goto done;
}
if (opts == NULL || sscanf(opts, "%d/%d/%d", &bus, &slot, &func) != 3)
goto done;
if (vm_assign_pptdev(ctx, bus, slot, func) != 0)
goto done;
sc = malloc(sizeof(struct passthru_softc));
memset(sc, 0, sizeof(struct passthru_softc));
pi->pi_arg = sc;
sc->psc_pi = pi;
/* initialize config space */
if (cfginit(ctx, pi, bus, slot, func) != 0)
goto done;
error = 0; /* success */
done:
if (error) {
free(sc);
vm_unassign_pptdev(ctx, bus, slot, func);
}
return (error);
}
static int
bar_access(int coff)
{
if (coff >= PCIR_BAR(0) && coff < PCIR_BAR(PCI_BARMAX + 1))
return (1);
else
return (0);
}
static int
msicap_access(struct passthru_softc *sc, int coff)
{
int caplen;
if (sc->psc_msi.capoff == 0)
return (0);
caplen = msi_caplen(sc->psc_msi.msgctrl);
if (coff >= sc->psc_msi.capoff && coff < sc->psc_msi.capoff + caplen)
return (1);
else
return (0);
}
static int
passthru_cfgread(struct vmctx *ctx, int vcpu, struct pci_devinst *pi, int coff,
int bytes, uint32_t *rv)
{
struct passthru_softc *sc;
sc = pi->pi_arg;
/*
* PCI BARs and MSI capability is emulated.
*/
if (bar_access(coff) || msicap_access(sc, coff))
return (-1);
#ifdef LEGACY_SUPPORT
/*
* Emulate PCIR_CAP_PTR if this device does not support MSI capability
* natively.
*/
if (sc->psc_msi.emulated) {
if (coff >= PCIR_CAP_PTR && coff < PCIR_CAP_PTR + 4)
return (-1);
}
#endif
/* Everything else just read from the device's config space */
*rv = read_config(&sc->psc_sel, coff, bytes);
return (0);
}
static int
passthru_cfgwrite(struct vmctx *ctx, int vcpu, struct pci_devinst *pi, int coff,
int bytes, uint32_t val)
{
int error;
struct passthru_softc *sc;
sc = pi->pi_arg;
/*
* PCI BARs are emulated
*/
if (bar_access(coff))
return (-1);
/*
* MSI capability is emulated
*/
if (msicap_access(sc, coff)) {
msicap_cfgwrite(pi, sc->psc_msi.capoff, coff, bytes, val);
error = vm_setup_msi(ctx, vcpu, sc->psc_sel.pc_bus,
sc->psc_sel.pc_dev, sc->psc_sel.pc_func, pi->pi_msi.cpu,
pi->pi_msi.vector, pi->pi_msi.msgnum);
if (error != 0) {
printf("vm_setup_msi returned error %d\r\n", errno);
exit(1);
}
return (0);
}
#ifdef LEGACY_SUPPORT
/*
* If this device does not support MSI natively then we cannot let
* the guest disable legacy interrupts from the device. It is the
* legacy interrupt that is triggering the virtual MSI to the guest.
*/
if (sc->psc_msi.emulated && pci_msi_enabled(pi)) {
if (coff == PCIR_COMMAND && bytes == 2)
val &= ~PCIM_CMD_INTxDIS;
}
#endif
write_config(&sc->psc_sel, coff, bytes, val);
return (0);
}
static void
passthru_iow(struct pci_devinst *pi, int baridx, int offset, int size,
uint32_t value)
{
struct passthru_softc *sc;
struct iodev_pio_req pio;
sc = pi->pi_arg;
bzero(&pio, sizeof(struct iodev_pio_req));
pio.access = IODEV_PIO_WRITE;
pio.port = sc->psc_bar[baridx].addr + offset;
pio.width = size;
pio.val = value;
(void)ioctl(iofd, IODEV_PIO, &pio);
}
static uint32_t
passthru_ior(struct pci_devinst *pi, int baridx, int offset, int size)
{
struct passthru_softc *sc;
struct iodev_pio_req pio;
sc = pi->pi_arg;
bzero(&pio, sizeof(struct iodev_pio_req));
pio.access = IODEV_PIO_READ;
pio.port = sc->psc_bar[baridx].addr + offset;
pio.width = size;
pio.val = 0;
(void)ioctl(iofd, IODEV_PIO, &pio);
return (pio.val);
}
struct pci_devemu passthru = {
.pe_emu = "passthru",
.pe_init = passthru_init,
.pe_cfgwrite = passthru_cfgwrite,
.pe_cfgread = passthru_cfgread,
.pe_iow = passthru_iow,
.pe_ior = passthru_ior,
};
PCI_EMUL_SET(passthru);

View File

@ -0,0 +1,502 @@
/*-
* Copyright (c) 2011 NetApp, Inc.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* $FreeBSD$
*/
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
#include <sys/param.h>
#include <sys/linker_set.h>
#include <sys/stat.h>
#include <sys/uio.h>
#include <sys/ioctl.h>
#include <errno.h>
#include <fcntl.h>
#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>
#include <string.h>
#include <strings.h>
#include <unistd.h>
#include <assert.h>
#include <pthread.h>
#include "fbsdrun.h"
#include "pci_emul.h"
#include "virtio.h"
#define VTBLK_RINGSZ 64
#define VTBLK_CFGSZ 28
#define VTBLK_R_CFG VTCFG_R_CFG0
#define VTBLK_R_CFG_END VTBLK_R_CFG + VTBLK_CFGSZ -1
#define VTBLK_R_MAX VTBLK_R_CFG_END
#define VTBLK_REGSZ VTBLK_R_MAX+1
#define VTBLK_MAXSEGS 32
#define VTBLK_S_OK 0
#define VTBLK_S_IOERR 1
/*
* Host capabilities
*/
#define VTBLK_S_HOSTCAPS \
( 0x00000004 | /* host maximum request segments */ \
0x10000000 ) /* supports indirect descriptors */
struct vring_hqueue {
/* Internal state */
uint16_t hq_size;
uint16_t hq_cur_aidx; /* trails behind 'avail_idx' */
/* Host-context pointers to the queue */
struct virtio_desc *hq_dtable;
uint16_t *hq_avail_flags;
uint16_t *hq_avail_idx; /* monotonically increasing */
uint16_t *hq_avail_ring;
uint16_t *hq_used_flags;
uint16_t *hq_used_idx; /* monotonically increasing */
struct virtio_used *hq_used_ring;
};
/*
* Config space
*/
struct vtblk_config {
uint64_t vbc_capacity;
uint32_t vbc_size_max;
uint32_t vbc_seg_max;
uint16_t vbc_geom_c;
uint8_t vbc_geom_h;
uint8_t vbc_geom_s;
uint32_t vbc_blk_size;
uint32_t vbc_sectors_max;
} __packed;
CTASSERT(sizeof(struct vtblk_config) == VTBLK_CFGSZ);
/*
* Fixed-size block header
*/
struct virtio_blk_hdr {
#define VBH_OP_READ 0
#define VBH_OP_WRITE 1
uint32_t vbh_type;
uint32_t vbh_ioprio;
uint64_t vbh_sector;
} __packed;
/*
* Debug printf
*/
static int pci_vtblk_debug;
#define DPRINTF(params) if (pci_vtblk_debug) printf params
#define WPRINTF(params) printf params
/*
* Per-device softc
*/
struct pci_vtblk_softc {
struct pci_devinst *vbsc_pi;
int vbsc_fd;
int vbsc_status;
int vbsc_isr;
int vbsc_lastq;
uint32_t vbsc_features;
uint64_t vbsc_pfn;
struct vring_hqueue vbsc_q;
struct vtblk_config vbsc_cfg;
};
/*
* Return the number of available descriptors in the vring taking care
* of the 16-bit index wraparound.
*/
static int
hq_num_avail(struct vring_hqueue *hq)
{
int ndesc;
if (*hq->hq_avail_idx >= hq->hq_cur_aidx)
ndesc = *hq->hq_avail_idx - hq->hq_cur_aidx;
else
ndesc = UINT16_MAX - hq->hq_cur_aidx + *hq->hq_avail_idx + 1;
assert(ndesc >= 0 && ndesc <= hq->hq_size);
return (ndesc);
}
static void
pci_vtblk_update_status(struct pci_vtblk_softc *sc, uint32_t value)
{
if (value == 0) {
DPRINTF(("vtblk: device reset requested !\n"));
}
sc->vbsc_status = value;
}
static void
pci_vtblk_proc(struct pci_vtblk_softc *sc, struct vring_hqueue *hq)
{
struct iovec iov[VTBLK_MAXSEGS];
struct virtio_blk_hdr *vbh;
struct virtio_desc *vd, *vid;
struct virtio_used *vu;
uint8_t *status;
int i;
int err;
int iolen;
int nsegs;
int uidx, aidx, didx;
int writeop;
off_t offset;
uidx = *hq->hq_used_idx;
aidx = hq->hq_cur_aidx;
didx = hq->hq_avail_ring[aidx % hq->hq_size];
assert(didx >= 0 && didx < hq->hq_size);
vd = &hq->hq_dtable[didx];
/*
* Verify that the descriptor is indirect, and obtain
* the pointer to the indirect descriptor.
* There has to be space for at least 3 descriptors
* in the indirect descriptor array: the block header,
* 1 or more data descriptors, and a status byte.
*/
assert(vd->vd_flags & VRING_DESC_F_INDIRECT);
nsegs = vd->vd_len / sizeof(struct virtio_desc);
assert(nsegs >= 3);
assert(nsegs < VTBLK_MAXSEGS + 2);
vid = paddr_guest2host(vd->vd_addr);
assert((vid->vd_flags & VRING_DESC_F_INDIRECT) == 0);
/*
* The first descriptor will be the read-only fixed header
*/
vbh = paddr_guest2host(vid[0].vd_addr);
assert(vid[0].vd_len == sizeof(struct virtio_blk_hdr));
assert(vid[0].vd_flags & VRING_DESC_F_NEXT);
assert((vid[0].vd_flags & VRING_DESC_F_WRITE) == 0);
writeop = (vbh->vbh_type == VBH_OP_WRITE);
offset = vbh->vbh_sector * DEV_BSIZE;
/*
* Build up the iovec based on the guest's data descriptors
*/
for (i = 1, iolen = 0; i < nsegs - 1; i++) {
iov[i-1].iov_base = paddr_guest2host(vid[i].vd_addr);
iov[i-1].iov_len = vid[i].vd_len;
iolen += vid[i].vd_len;
assert(vid[i].vd_flags & VRING_DESC_F_NEXT);
assert((vid[i].vd_flags & VRING_DESC_F_INDIRECT) == 0);
/*
* - write op implies read-only descriptor,
* - read op implies write-only descriptor,
* therefore test the inverse of the descriptor bit
* to the op.
*/
assert(((vid[i].vd_flags & VRING_DESC_F_WRITE) == 0) ==
writeop);
}
/* Lastly, get the address of the status byte */
status = paddr_guest2host(vid[nsegs - 1].vd_addr);
assert(vid[nsegs - 1].vd_len == 1);
assert((vid[nsegs - 1].vd_flags & VRING_DESC_F_NEXT) == 0);
assert(vid[nsegs - 1].vd_flags & VRING_DESC_F_WRITE);
DPRINTF(("virtio-block: %s op, %d bytes, %d segs, offset %ld\n\r",
writeop ? "write" : "read", iolen, nsegs - 2, offset));
if (writeop){
err = pwritev(sc->vbsc_fd, iov, nsegs - 2, offset);
} else {
err = preadv(sc->vbsc_fd, iov, nsegs - 2, offset);
}
*status = err < 0 ? VTBLK_S_IOERR : VTBLK_S_OK;
/*
* Return the single indirect descriptor back to the host
*/
vu = &hq->hq_used_ring[uidx % hq->hq_size];
vu->vu_idx = didx;
vu->vu_tlen = 1;
hq->hq_cur_aidx++;
*hq->hq_used_idx += 1;
}
static void
pci_vtblk_qnotify(struct pci_vtblk_softc *sc)
{
struct vring_hqueue *hq = &sc->vbsc_q;
int i;
int ndescs;
/*
* Calculate number of ring entries to process
*/
ndescs = hq_num_avail(hq);
if (ndescs == 0)
return;
/*
* Run through all the entries, placing them into iovecs and
* sending when an end-of-packet is found
*/
for (i = 0; i < ndescs; i++)
pci_vtblk_proc(sc, hq);
/*
* Generate an interrupt if able
*/
if ((*hq->hq_avail_flags & VRING_AVAIL_F_NO_INTERRUPT) == 0 &&
sc->vbsc_isr == 0) {
sc->vbsc_isr = 1;
pci_generate_msi(sc->vbsc_pi, 0);
}
}
static void
pci_vtblk_ring_init(struct pci_vtblk_softc *sc, uint64_t pfn)
{
struct vring_hqueue *hq;
sc->vbsc_pfn = pfn << VRING_PFN;
/*
* Set up host pointers to the various parts of the
* queue
*/
hq = &sc->vbsc_q;
hq->hq_size = VTBLK_RINGSZ;
hq->hq_dtable = paddr_guest2host(pfn << VRING_PFN);
hq->hq_avail_flags = (uint16_t *)(hq->hq_dtable + hq->hq_size);
hq->hq_avail_idx = hq->hq_avail_flags + 1;
hq->hq_avail_ring = hq->hq_avail_flags + 2;
hq->hq_used_flags = (uint16_t *)roundup2((uintptr_t)hq->hq_avail_ring,
VRING_ALIGN);
hq->hq_used_idx = hq->hq_used_flags + 1;
hq->hq_used_ring = (struct virtio_used *)(hq->hq_used_flags + 2);
/*
* Initialize queue indexes
*/
hq->hq_cur_aidx = 0;
}
static int
pci_vtblk_init(struct vmctx *ctx, struct pci_devinst *pi, char *opts)
{
struct stat sbuf;
struct pci_vtblk_softc *sc;
int fd;
if (opts == NULL) {
printf("virtio-block: backing device required\n");
return (1);
}
/*
* Access to guest memory is required. Fail if
* memory not mapped
*/
if (paddr_guest2host(0) == NULL)
return (1);
/*
* The supplied backing file has to exist
*/
fd = open(opts, O_RDWR);
if (fd < 0) {
perror("Could not open backing file");
return (1);
}
if (fstat(fd, &sbuf) < 0) {
perror("Could not stat backing file");
close(fd);
return (1);
}
sc = malloc(sizeof(struct pci_vtblk_softc));
memset(sc, 0, sizeof(struct pci_vtblk_softc));
pi->pi_arg = sc;
sc->vbsc_pi = pi;
sc->vbsc_fd = fd;
/* setup virtio block config space */
sc->vbsc_cfg.vbc_capacity = sbuf.st_size / DEV_BSIZE;
sc->vbsc_cfg.vbc_seg_max = VTBLK_MAXSEGS;
sc->vbsc_cfg.vbc_blk_size = DEV_BSIZE;
sc->vbsc_cfg.vbc_size_max = 0; /* not negotiated */
sc->vbsc_cfg.vbc_geom_c = 0; /* no geometry */
sc->vbsc_cfg.vbc_geom_h = 0;
sc->vbsc_cfg.vbc_geom_s = 0;
sc->vbsc_cfg.vbc_sectors_max = 0;
/* initialize config space */
pci_set_cfgdata16(pi, PCIR_DEVICE, VIRTIO_DEV_BLOCK);
pci_set_cfgdata16(pi, PCIR_VENDOR, VIRTIO_VENDOR);
pci_set_cfgdata8(pi, PCIR_CLASS, PCIC_STORAGE);
pci_set_cfgdata16(pi, PCIR_SUBDEV_0, VIRTIO_TYPE_BLOCK);
pci_emul_alloc_bar(pi, 0, 0, PCIBAR_IO, VTBLK_REGSZ);
pci_emul_add_msicap(pi, 1);
return (0);
}
static void
pci_vtblk_write(struct pci_devinst *pi, int baridx, int offset, int size,
uint32_t value)
{
struct pci_vtblk_softc *sc = pi->pi_arg;
if (offset + size > VTBLK_REGSZ) {
DPRINTF(("vtblk_write: 2big, offset %d size %d\n",
offset, size));
return;
}
switch (offset) {
case VTCFG_R_GUESTCAP:
assert(size == 4);
sc->vbsc_features = value & VTBLK_S_HOSTCAPS;
break;
case VTCFG_R_PFN:
assert(size == 4);
pci_vtblk_ring_init(sc, value);
break;
case VTCFG_R_QSEL:
assert(size == 2);
sc->vbsc_lastq = value;
break;
case VTCFG_R_QNOTIFY:
assert(size == 2);
assert(value == 0);
pci_vtblk_qnotify(sc);
break;
case VTCFG_R_STATUS:
assert(size == 1);
pci_vtblk_update_status(sc, value);
break;
case VTCFG_R_HOSTCAP:
case VTCFG_R_QNUM:
case VTCFG_R_ISR:
case VTBLK_R_CFG ... VTBLK_R_CFG_END:
DPRINTF(("vtblk: write to readonly reg %d\n\r", offset));
break;
default:
DPRINTF(("vtblk: unknown i/o write offset %d\n\r", offset));
value = 0;
break;
}
}
uint32_t
pci_vtblk_read(struct pci_devinst *pi, int baridx, int offset, int size)
{
struct pci_vtblk_softc *sc = pi->pi_arg;
uint32_t value;
if (offset + size > VTBLK_REGSZ) {
DPRINTF(("vtblk_read: 2big, offset %d size %d\n",
offset, size));
return (0);
}
switch (offset) {
case VTCFG_R_HOSTCAP:
assert(size == 4);
value = VTBLK_S_HOSTCAPS;
break;
case VTCFG_R_GUESTCAP:
assert(size == 4);
value = sc->vbsc_features; /* XXX never read ? */
break;
case VTCFG_R_PFN:
assert(size == 4);
value = sc->vbsc_pfn >> VRING_PFN;
break;
case VTCFG_R_QNUM:
value = (sc->vbsc_lastq == 0) ? VTBLK_RINGSZ: 0;
break;
case VTCFG_R_QSEL:
assert(size == 2);
value = sc->vbsc_lastq; /* XXX never read ? */
break;
case VTCFG_R_QNOTIFY:
assert(size == 2);
value = 0; /* XXX never read ? */
break;
case VTCFG_R_STATUS:
assert(size == 1);
value = sc->vbsc_status;
break;
case VTCFG_R_ISR:
assert(size == 1);
value = sc->vbsc_isr;
sc->vbsc_isr = 0; /* a read clears this flag */
break;
case VTBLK_R_CFG ... VTBLK_R_CFG_END:
assert(size == 1);
value = *((uint8_t *)&sc->vbsc_cfg + offset - VTBLK_R_CFG);
break;
default:
DPRINTF(("vtblk: unknown i/o read offset %d\n\r", offset));
value = 0;
break;
}
return (value);
}
struct pci_devemu pci_de_vblk = {
.pe_emu = "virtio-blk",
.pe_init = pci_vtblk_init,
.pe_iow = pci_vtblk_write,
.pe_ior = pci_vtblk_read,
};
PCI_EMUL_SET(pci_de_vblk);

View File

@ -0,0 +1,739 @@
/*-
* Copyright (c) 2011 NetApp, Inc.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* $FreeBSD$
*/
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
#include <sys/param.h>
#include <sys/linker_set.h>
#include <sys/select.h>
#include <sys/uio.h>
#include <sys/ioctl.h>
#include <errno.h>
#include <fcntl.h>
#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>
#include <string.h>
#include <strings.h>
#include <unistd.h>
#include <assert.h>
#include <md5.h>
#include <pthread.h>
#include "fbsdrun.h"
#include "pci_emul.h"
#include "mevent.h"
#include "virtio.h"
#define VTNET_RINGSZ 256
#define VTNET_MAXSEGS 32
/*
* PCI config-space register offsets
*/
#define VTNET_R_CFG0 20
#define VTNET_R_CFG1 21
#define VTNET_R_CFG2 22
#define VTNET_R_CFG3 23
#define VTNET_R_CFG4 24
#define VTNET_R_CFG5 25
#define VTNET_R_CFG6 26
#define VTNET_R_CFG7 27
#define VTNET_R_MAX 27
#define VTNET_REGSZ VTNET_R_MAX+1
/*
* Host capabilities
*/
#define VTNET_S_HOSTCAPS \
( 0x00000020 | /* host supplies MAC */ \
0x00008000 | /* host can merge Rx buffers */ \
0x00010000 ) /* config status available */
/*
* Queue definitions.
*/
#define VTNET_RXQ 0
#define VTNET_TXQ 1
#define VTNET_CTLQ 2
#define VTNET_MAXQ 3
struct vring_hqueue {
/* Internal state */
uint16_t hq_size;
uint16_t hq_cur_aidx; /* trails behind 'avail_idx' */
/* Host-context pointers to the queue */
struct virtio_desc *hq_dtable;
uint16_t *hq_avail_flags;
uint16_t *hq_avail_idx; /* monotonically increasing */
uint16_t *hq_avail_ring;
uint16_t *hq_used_flags;
uint16_t *hq_used_idx; /* monotonically increasing */
struct virtio_used *hq_used_ring;
};
/*
* Fixed network header size
*/
struct virtio_net_rxhdr {
uint8_t vrh_flags;
uint8_t vrh_gso_type;
uint16_t vrh_hdr_len;
uint16_t vrh_gso_size;
uint16_t vrh_csum_start;
uint16_t vrh_csum_offset;
uint16_t vrh_bufs;
} __packed;
/*
* Debug printf
*/
static int pci_vtnet_debug;
#define DPRINTF(params) if (pci_vtnet_debug) printf params
#define WPRINTF(params) printf params
/*
* Per-device softc
*/
struct pci_vtnet_softc {
struct pci_devinst *vsc_pi;
pthread_mutex_t vsc_mtx;
struct mevent *vsc_mevp;
int vsc_curq;
int vsc_status;
int vsc_isr;
int vsc_tapfd;
int vsc_rx_ready;
int vsc_rxpend;
uint32_t vsc_features;
uint8_t vsc_macaddr[6];
uint64_t vsc_pfn[VTNET_MAXQ];
struct vring_hqueue vsc_hq[VTNET_MAXQ];
};
/*
* Return the number of available descriptors in the vring taking care
* of the 16-bit index wraparound.
*/
static int
hq_num_avail(struct vring_hqueue *hq)
{
int ndesc;
if (*hq->hq_avail_idx >= hq->hq_cur_aidx)
ndesc = *hq->hq_avail_idx - hq->hq_cur_aidx;
else
ndesc = UINT16_MAX - hq->hq_cur_aidx + *hq->hq_avail_idx + 1;
assert(ndesc >= 0 && ndesc <= hq->hq_size);
return (ndesc);
}
static uint16_t
pci_vtnet_qsize(int qnum)
{
/* XXX no ctl queue currently */
if (qnum == VTNET_CTLQ) {
return (0);
}
/* XXX fixed currently. Maybe different for tx/rx/ctl */
return (VTNET_RINGSZ);
}
static void
pci_vtnet_update_status(struct pci_vtnet_softc *sc, uint32_t value)
{
if (value == 0) {
DPRINTF(("vtnet: device reset requested !\n"));
}
sc->vsc_status = value;
}
/*
* Called to send a buffer chain out to the tap device
*/
static void
pci_vtnet_tap_tx(struct pci_vtnet_softc *sc, struct iovec *iov, int iovcnt,
int len)
{
char pad[60];
if (sc->vsc_tapfd == -1)
return;
/*
* If the length is < 60, pad out to that and add the
* extra zero'd segment to the iov. It is guaranteed that
* there is always an extra iov available by the caller.
*/
if (len < 60) {
memset(pad, 0, 60 - len);
iov[iovcnt].iov_base = pad;
iov[iovcnt].iov_len = 60 - len;
iovcnt++;
}
(void) writev(sc->vsc_tapfd, iov, iovcnt);
}
/*
* Called when there is read activity on the tap file descriptor.
* Each buffer posted by the guest is assumed to be able to contain
* an entire ethernet frame + rx header.
* MP note: the dummybuf is only used for discarding frames, so there
* is no need for it to be per-vtnet or locked.
*/
static uint8_t dummybuf[2048];
static void
pci_vtnet_tap_rx(struct pci_vtnet_softc *sc)
{
struct virtio_desc *vd;
struct virtio_used *vu;
struct vring_hqueue *hq;
struct virtio_net_rxhdr *vrx;
uint8_t *buf;
int i;
int len;
int ndescs;
int didx, uidx, aidx; /* descriptor, avail and used index */
/*
* Should never be called without a valid tap fd
*/
assert(sc->vsc_tapfd != -1);
/*
* But, will be called when the rx ring hasn't yet
* been set up.
*/
if (sc->vsc_rx_ready == 0) {
/*
* Drop the packet and try later.
*/
(void) read(sc->vsc_tapfd, dummybuf, sizeof(dummybuf));
return;
}
/*
* Calculate the number of available rx buffers
*/
hq = &sc->vsc_hq[VTNET_RXQ];
ndescs = hq_num_avail(hq);
if (ndescs == 0) {
/*
* Need to wait for host notification to read
*/
if (sc->vsc_rxpend == 0) {
WPRINTF(("vtnet: no rx descriptors !\n"));
sc->vsc_rxpend = 1;
}
/*
* Drop the packet and try later
*/
(void) read(sc->vsc_tapfd, dummybuf, sizeof(dummybuf));
return;
}
aidx = hq->hq_cur_aidx;
uidx = *hq->hq_used_idx;
for (i = 0; i < ndescs; i++) {
/*
* 'aidx' indexes into the an array of descriptor indexes
*/
didx = hq->hq_avail_ring[aidx % hq->hq_size];
assert(didx >= 0 && didx < hq->hq_size);
vd = &hq->hq_dtable[didx];
/*
* Get a pointer to the rx header, and use the
* data immediately following it for the packet buffer.
*/
vrx = (struct virtio_net_rxhdr *)paddr_guest2host(vd->vd_addr);
buf = (uint8_t *)(vrx + 1);
len = read(sc->vsc_tapfd, buf,
vd->vd_len - sizeof(struct virtio_net_rxhdr));
if (len < 0 && errno == EWOULDBLOCK) {
break;
}
/*
* The only valid field in the rx packet header is the
* number of buffers, which is always 1 without TSO
* support.
*/
memset(vrx, 0, sizeof(struct virtio_net_rxhdr));
vrx->vrh_bufs = 1;
/*
* Write this descriptor into the used ring
*/
vu = &hq->hq_used_ring[uidx % hq->hq_size];
vu->vu_idx = didx;
vu->vu_tlen = len + sizeof(struct virtio_net_rxhdr);
uidx++;
aidx++;
}
/*
* Update the used pointer, and signal an interrupt if allowed
*/
*hq->hq_used_idx = uidx;
hq->hq_cur_aidx = aidx;
if ((*hq->hq_avail_flags & VRING_AVAIL_F_NO_INTERRUPT) == 0) {
sc->vsc_isr |= 1;
pci_generate_msi(sc->vsc_pi, 0);
}
}
static void
pci_vtnet_tap_callback(int fd, enum ev_type type, void *param)
{
struct pci_vtnet_softc *sc = param;
pthread_mutex_lock(&sc->vsc_mtx);
pci_vtnet_tap_rx(sc);
pthread_mutex_unlock(&sc->vsc_mtx);
}
static void
pci_vtnet_ping_rxq(struct pci_vtnet_softc *sc)
{
/*
* A qnotify means that the rx process can now begin
*/
if (sc->vsc_rx_ready == 0) {
sc->vsc_rx_ready = 1;
}
/*
* If the rx queue was empty, attempt to receive a
* packet that was previously blocked due to no rx bufs
* available
*/
if (sc->vsc_rxpend) {
WPRINTF(("vtnet: rx resumed\n\r"));
sc->vsc_rxpend = 0;
pci_vtnet_tap_rx(sc);
}
}
static void
pci_vtnet_proctx(struct pci_vtnet_softc *sc, struct vring_hqueue *hq)
{
struct iovec iov[VTNET_MAXSEGS + 1];
struct virtio_desc *vd;
struct virtio_used *vu;
int i;
int plen;
int tlen;
int uidx, aidx, didx;
uidx = *hq->hq_used_idx;
aidx = hq->hq_cur_aidx;
didx = hq->hq_avail_ring[aidx % hq->hq_size];
assert(didx >= 0 && didx < hq->hq_size);
vd = &hq->hq_dtable[didx];
/*
* Run through the chain of descriptors, ignoring the
* first header descriptor. However, include the header
* length in the total length that will be put into the
* used queue.
*/
tlen = vd->vd_len;
vd = &hq->hq_dtable[vd->vd_next];
for (i = 0, plen = 0;
i < VTNET_MAXSEGS;
i++, vd = &hq->hq_dtable[vd->vd_next]) {
iov[i].iov_base = paddr_guest2host(vd->vd_addr);
iov[i].iov_len = vd->vd_len;
plen += vd->vd_len;
tlen += vd->vd_len;
if ((vd->vd_flags & VRING_DESC_F_NEXT) == 0)
break;
}
assert(i < VTNET_MAXSEGS);
DPRINTF(("virtio: packet send, %d bytes, %d segs\n\r", plen, i + 1));
pci_vtnet_tap_tx(sc, iov, i + 1, plen);
/*
* Return this chain back to the host
*/
vu = &hq->hq_used_ring[uidx % hq->hq_size];
vu->vu_idx = didx;
vu->vu_tlen = tlen;
hq->hq_cur_aidx = aidx + 1;
*hq->hq_used_idx = uidx + 1;
/*
* Generate an interrupt if able
*/
if ((*hq->hq_avail_flags & VRING_AVAIL_F_NO_INTERRUPT) == 0) {
sc->vsc_isr |= 1;
pci_generate_msi(sc->vsc_pi, 0);
}
}
static void
pci_vtnet_ping_txq(struct pci_vtnet_softc *sc)
{
struct vring_hqueue *hq = &sc->vsc_hq[VTNET_TXQ];
int i;
int ndescs;
/*
* Calculate number of ring entries to process
*/
ndescs = hq_num_avail(hq);
if (ndescs == 0)
return;
/*
* Run through all the entries, placing them into iovecs and
* sending when an end-of-packet is found
*/
for (i = 0; i < ndescs; i++)
pci_vtnet_proctx(sc, hq);
}
static void
pci_vtnet_ping_ctlq(struct pci_vtnet_softc *sc)
{
DPRINTF(("vtnet: control qnotify!\n\r"));
}
static void
pci_vtnet_ring_init(struct pci_vtnet_softc *sc, uint64_t pfn)
{
struct vring_hqueue *hq;
int qnum = sc->vsc_curq;
assert(qnum < VTNET_MAXQ);
sc->vsc_pfn[qnum] = pfn << VRING_PFN;
/*
* Set up host pointers to the various parts of the
* queue
*/
hq = &sc->vsc_hq[qnum];
hq->hq_size = pci_vtnet_qsize(qnum);
hq->hq_dtable = paddr_guest2host(pfn << VRING_PFN);
hq->hq_avail_flags = (uint16_t *)(hq->hq_dtable + hq->hq_size);
hq->hq_avail_idx = hq->hq_avail_flags + 1;
hq->hq_avail_ring = hq->hq_avail_flags + 2;
hq->hq_used_flags = (uint16_t *)roundup2((uintptr_t)hq->hq_avail_ring,
VRING_ALIGN);
hq->hq_used_idx = hq->hq_used_flags + 1;
hq->hq_used_ring = (struct virtio_used *)(hq->hq_used_flags + 2);
/*
* Initialize queue indexes
*/
hq->hq_cur_aidx = 0;
}
static int
pci_vtnet_init(struct vmctx *ctx, struct pci_devinst *pi, char *opts)
{
MD5_CTX mdctx;
unsigned char digest[16];
char nstr[80];
struct pci_vtnet_softc *sc;
/*
* Access to guest memory is required. Fail if
* memory not mapped
*/
if (paddr_guest2host(0) == NULL)
return (1);
sc = malloc(sizeof(struct pci_vtnet_softc));
memset(sc, 0, sizeof(struct pci_vtnet_softc));
pi->pi_arg = sc;
sc->vsc_pi = pi;
pthread_mutex_init(&sc->vsc_mtx, NULL);
/*
* Attempt to open the tap device
*/
sc->vsc_tapfd = -1;
if (opts != NULL) {
char tbuf[80];
strcpy(tbuf, "/dev/");
strncat(tbuf, opts, sizeof(tbuf) - strlen(tbuf));
sc->vsc_tapfd = open(tbuf, O_RDWR);
if (sc->vsc_tapfd == -1) {
WPRINTF(("open of tap device %s failed\n", tbuf));
} else {
/*
* Set non-blocking and register for read
* notifications with the event loop
*/
int opt = 1;
if (ioctl(sc->vsc_tapfd, FIONBIO, &opt) < 0) {
WPRINTF(("tap device O_NONBLOCK failed\n"));
close(sc->vsc_tapfd);
sc->vsc_tapfd = -1;
}
sc->vsc_mevp = mevent_add(sc->vsc_tapfd,
EVF_READ,
pci_vtnet_tap_callback,
sc);
if (sc->vsc_mevp == NULL) {
WPRINTF(("Could not register event\n"));
close(sc->vsc_tapfd);
sc->vsc_tapfd = -1;
}
}
}
/*
* The MAC address is the standard NetApp OUI of 00-a0-98,
* followed by an MD5 of the vm name. The slot number is
* prepended to this for slots other than 1, so that
* CFE can netboot from the equivalent of slot 1.
*/
if (pi->pi_slot == 1) {
strncpy(nstr, vmname, sizeof(nstr));
} else {
snprintf(nstr, sizeof(nstr), "%d-%s", pi->pi_slot, vmname);
}
MD5Init(&mdctx);
MD5Update(&mdctx, nstr, strlen(nstr));
MD5Final(digest, &mdctx);
sc->vsc_macaddr[0] = 0x00;
sc->vsc_macaddr[1] = 0xa0;
sc->vsc_macaddr[2] = 0x98;
sc->vsc_macaddr[3] = digest[0];
sc->vsc_macaddr[4] = digest[1];
sc->vsc_macaddr[5] = digest[2];
/* initialize config space */
pci_set_cfgdata16(pi, PCIR_DEVICE, VIRTIO_DEV_NET);
pci_set_cfgdata16(pi, PCIR_VENDOR, VIRTIO_VENDOR);
pci_set_cfgdata8(pi, PCIR_CLASS, PCIC_NETWORK);
pci_set_cfgdata16(pi, PCIR_SUBDEV_0, VIRTIO_TYPE_NET);
pci_emul_alloc_bar(pi, 0, 0, PCIBAR_IO, VTNET_REGSZ);
pci_emul_add_msicap(pi, 1);
return (0);
}
/*
* Function pointer array to handle queue notifications
*/
static void (*pci_vtnet_qnotify[VTNET_MAXQ])(struct pci_vtnet_softc *) = {
pci_vtnet_ping_rxq,
pci_vtnet_ping_txq,
pci_vtnet_ping_ctlq
};
static void
pci_vtnet_write(struct pci_devinst *pi, int baridx, int offset, int size,
uint32_t value)
{
struct pci_vtnet_softc *sc = pi->pi_arg;
if (offset + size > VTNET_REGSZ) {
DPRINTF(("vtnet_write: 2big, offset %d size %d\n",
offset, size));
return;
}
pthread_mutex_lock(&sc->vsc_mtx);
switch (offset) {
case VTCFG_R_GUESTCAP:
assert(size == 4);
sc->vsc_features = value & VTNET_S_HOSTCAPS;
break;
case VTCFG_R_PFN:
assert(size == 4);
pci_vtnet_ring_init(sc, value);
break;
case VTCFG_R_QSEL:
assert(size == 2);
assert(value < VTNET_MAXQ);
sc->vsc_curq = value;
break;
case VTCFG_R_QNOTIFY:
assert(size == 2);
assert(value < VTNET_MAXQ);
(*pci_vtnet_qnotify[value])(sc);
break;
case VTCFG_R_STATUS:
assert(size == 1);
pci_vtnet_update_status(sc, value);
break;
case VTNET_R_CFG0:
case VTNET_R_CFG1:
case VTNET_R_CFG2:
case VTNET_R_CFG3:
case VTNET_R_CFG4:
case VTNET_R_CFG5:
/*
* The driver is allowed to change the MAC address
*/
assert(size == 1);
sc->vsc_macaddr[offset - VTNET_R_CFG0] = value;
break;
case VTCFG_R_HOSTCAP:
case VTCFG_R_QNUM:
case VTCFG_R_ISR:
case VTNET_R_CFG6:
case VTNET_R_CFG7:
DPRINTF(("vtnet: write to readonly reg %d\n\r", offset));
break;
default:
DPRINTF(("vtnet: unknown i/o write offset %d\n\r", offset));
value = 0;
break;
}
pthread_mutex_unlock(&sc->vsc_mtx);
}
uint32_t
pci_vtnet_read(struct pci_devinst *pi, int baridx, int offset, int size)
{
struct pci_vtnet_softc *sc = pi->pi_arg;
uint32_t value;
if (offset + size > VTNET_REGSZ) {
DPRINTF(("vtnet_read: 2big, offset %d size %d\n",
offset, size));
return (0);
}
pthread_mutex_lock(&sc->vsc_mtx);
switch (offset) {
case VTCFG_R_HOSTCAP:
assert(size == 4);
value = VTNET_S_HOSTCAPS;
break;
case VTCFG_R_GUESTCAP:
assert(size == 4);
value = sc->vsc_features; /* XXX never read ? */
break;
case VTCFG_R_PFN:
assert(size == 4);
value = sc->vsc_pfn[sc->vsc_curq] >> VRING_PFN;
break;
case VTCFG_R_QNUM:
assert(size == 2);
value = pci_vtnet_qsize(sc->vsc_curq);
break;
case VTCFG_R_QSEL:
assert(size == 2);
value = sc->vsc_curq; /* XXX never read ? */
break;
case VTCFG_R_QNOTIFY:
assert(size == 2);
value = sc->vsc_curq; /* XXX never read ? */
break;
case VTCFG_R_STATUS:
assert(size == 1);
value = sc->vsc_status;
break;
case VTCFG_R_ISR:
assert(size == 1);
value = sc->vsc_isr;
sc->vsc_isr = 0; /* a read clears this flag */
break;
case VTNET_R_CFG0:
case VTNET_R_CFG1:
case VTNET_R_CFG2:
case VTNET_R_CFG3:
case VTNET_R_CFG4:
case VTNET_R_CFG5:
assert(size == 1);
value = sc->vsc_macaddr[offset - VTNET_R_CFG0];
break;
case VTNET_R_CFG6:
assert(size == 1);
value = 0x01; /* XXX link always up */
break;
case VTNET_R_CFG7:
assert(size == 1);
value = 0; /* link status is in the LSB */
break;
default:
DPRINTF(("vtnet: unknown i/o read offset %d\n\r", offset));
value = 0;
break;
}
pthread_mutex_unlock(&sc->vsc_mtx);
return (value);
}
struct pci_devemu pci_de_vnet = {
.pe_emu = "virtio-net",
.pe_init = pci_vtnet_init,
.pe_iow = pci_vtnet_write,
.pe_ior = pci_vtnet_read,
};
PCI_EMUL_SET(pci_de_vnet);

196
usr.sbin/bhyve/pit_8254.c Normal file
View File

@ -0,0 +1,196 @@
/*-
* Copyright (c) 2011 NetApp, Inc.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* $FreeBSD$
*/
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
#include <sys/time.h>
#include <machine/clock.h>
#include <stdio.h>
#include <assert.h>
#include "fbsdrun.h"
#include "inout.h"
#include "pit_8254.h"
#define TIMER_SEL_MASK 0xc0
#define TIMER_RW_MASK 0x30
#define TIMER_MODE_MASK 0x0f
#define TIMER_SEL_READBACK 0xc0
#define TIMER_DIV(freq, hz) (((freq) + (hz) / 2) / (hz))
#define PIT_8254_FREQ 1193182
static const int nsecs_per_tick = 1000000000 / PIT_8254_FREQ;
struct counter {
struct timeval tv; /* uptime when counter was loaded */
uint16_t initial; /* initial counter value */
uint8_t cr[2];
uint8_t ol[2];
int crbyte;
int olbyte;
};
static void
timevalfix(struct timeval *t1)
{
if (t1->tv_usec < 0) {
t1->tv_sec--;
t1->tv_usec += 1000000;
}
if (t1->tv_usec >= 1000000) {
t1->tv_sec++;
t1->tv_usec -= 1000000;
}
}
static void
timevalsub(struct timeval *t1, const struct timeval *t2)
{
t1->tv_sec -= t2->tv_sec;
t1->tv_usec -= t2->tv_usec;
timevalfix(t1);
}
static void
latch(struct counter *c)
{
struct timeval tv2;
uint16_t lval;
uint64_t delta_nsecs, delta_ticks;
/* cannot latch a new value until the old one has been consumed */
if (c->olbyte != 0)
return;
if (c->initial == 0 || c->initial == 1) {
/*
* XXX the program that runs the VM can be stopped and
* restarted at any time. This means that state that was
* created by the guest is destroyed between invocations
* of the program.
*
* If the counter's initial value is not programmed we
* assume a value that would be set to generate 'guest_hz'
* interrupts per second.
*/
c->initial = TIMER_DIV(PIT_8254_FREQ, guest_hz);
gettimeofday(&c->tv, NULL);
}
(void)gettimeofday(&tv2, NULL);
timevalsub(&tv2, &c->tv);
delta_nsecs = tv2.tv_sec * 1000000000 + tv2.tv_usec * 1000;
delta_ticks = delta_nsecs / nsecs_per_tick;
lval = c->initial - delta_ticks % c->initial;
c->olbyte = 2;
c->ol[1] = lval; /* LSB */
c->ol[0] = lval >> 8; /* MSB */
}
static int
pit_8254_handler(struct vmctx *ctx, int vcpu, int in, int port, int bytes,
uint32_t *eax, void *arg)
{
int sel, rw, mode;
uint8_t val;
struct counter *c;
static struct counter counter[3];
if (bytes != 1)
return (-1);
val = *eax;
if (port == TIMER_MODE) {
assert(in == 0);
sel = val & TIMER_SEL_MASK;
rw = val & TIMER_RW_MASK;
mode = val & TIMER_MODE_MASK;
if (sel == TIMER_SEL_READBACK)
return (-1);
if (rw != TIMER_LATCH && rw != TIMER_16BIT)
return (-1);
if (rw != TIMER_LATCH) {
/*
* Counter mode is not affected when issuing a
* latch command.
*/
if (mode != TIMER_RATEGEN && mode != TIMER_SQWAVE)
return (-1);
}
c = &counter[sel >> 6];
if (rw == TIMER_LATCH)
latch(c);
else
c->olbyte = 0; /* reset latch after reprogramming */
return (0);
}
/* counter ports */
assert(port >= TIMER_CNTR0 && port <= TIMER_CNTR2);
c = &counter[port - TIMER_CNTR0];
if (in) {
/*
* XXX
* The spec says that once the output latch is completely
* read it should revert to "following" the counter. We don't
* do this because it is hard and any reasonable OS should
* always latch the counter before trying to read it.
*/
if (c->olbyte == 0)
c->olbyte = 2;
*eax = c->ol[--c->olbyte];
} else {
c->cr[c->crbyte++] = *eax;
if (c->crbyte == 2) {
c->crbyte = 0;
c->initial = c->cr[0] | (uint16_t)c->cr[1] << 8;
gettimeofday(&c->tv, NULL);
}
}
return (0);
}
INOUT_PORT(8254, TIMER_MODE, IOPORT_F_OUT, pit_8254_handler);
INOUT_PORT(8254, TIMER_CNTR0, IOPORT_F_INOUT, pit_8254_handler);
INOUT_PORT(8254, TIMER_CNTR1, IOPORT_F_INOUT, pit_8254_handler);
INOUT_PORT(8254, TIMER_CNTR2, IOPORT_F_INOUT, pit_8254_handler);

45
usr.sbin/bhyve/pit_8254.h Normal file
View File

@ -0,0 +1,45 @@
/*-
* Copyright (c) 2011 NetApp, Inc.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* $FreeBSD$
*/
#ifndef _PIT_8254_H_
#define _PIT_8254_H_
/*
* Borrowed from amd64/include/timerreg.h because in that file it is
* conditionally compiled for #ifdef _KERNEL only.
*/
#include <dev/ic/i8253reg.h>
#define IO_TIMER1 0x40 /* 8253 Timer #1 */
#define TIMER_CNTR0 (IO_TIMER1 + TIMER_REG_CNTR0)
#define TIMER_CNTR1 (IO_TIMER1 + TIMER_REG_CNTR1)
#define TIMER_CNTR2 (IO_TIMER1 + TIMER_REG_CNTR2)
#define TIMER_MODE (IO_TIMER1 + TIMER_REG_MODE)
#endif /* _PIT_8254_H_ */

51
usr.sbin/bhyve/post.c Normal file
View File

@ -0,0 +1,51 @@
/*-
* Copyright (c) 2011 NetApp, Inc.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* $FreeBSD$
*/
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
#include <sys/types.h>
#include <assert.h>
#include "inout.h"
static int
post_data_handler(struct vmctx *ctx, int vcpu, int in, int port, int bytes,
uint32_t *eax, void *arg)
{
assert(in == 1);
if (bytes != 1)
return (-1);
*eax = 0xff; /* return some garbage */
return (0);
}
INOUT_PORT(post, 0x84, IOPORT_F_IN, post_data_handler);

268
usr.sbin/bhyve/rtc.c Normal file
View File

@ -0,0 +1,268 @@
/*-
* Copyright (c) 2011 NetApp, Inc.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* $FreeBSD$
*/
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
#include <sys/types.h>
#include <sys/time.h>
#include <stdio.h>
#include <time.h>
#include <assert.h>
#include "inout.h"
#define IO_RTC 0x70
#define RTC_SEC 0x00 /* seconds */
#define RTC_MIN 0x02
#define RTC_HRS 0x04
#define RTC_WDAY 0x06
#define RTC_DAY 0x07
#define RTC_MONTH 0x08
#define RTC_YEAR 0x09
#define RTC_CENTURY 0x32 /* current century */
#define RTC_STATUSA 0xA
#define RTCSA_TUP 0x80 /* time update, don't look now */
#define RTC_STATUSB 0xB
#define RTCSB_DST 0x01
#define RTCSB_24HR 0x02
#define RTCSB_BIN 0x04 /* 0 = BCD, 1 = Binary */
#define RTCSB_PINTR 0x40 /* 1 = enable periodic clock interrupt */
#define RTCSB_HALT 0x80 /* stop clock updates */
#define RTC_INTR 0x0c /* status register C (R) interrupt source */
#define RTC_STATUSD 0x0d /* status register D (R) Lost Power */
#define RTCSD_PWR 0x80 /* clock power OK */
#define RTC_DIAG 0x0e
#define RTC_RSTCODE 0x0f
static int addr;
/* XXX initialize these to default values as they would be from BIOS */
static uint8_t status_a, status_b, rstcode;
static u_char const bin2bcd_data[] = {
0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09,
0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19,
0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, 0x28, 0x29,
0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, 0x38, 0x39,
0x40, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, 0x48, 0x49,
0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, 0x58, 0x59,
0x60, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69,
0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, 0x78, 0x79,
0x80, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87, 0x88, 0x89,
0x90, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97, 0x98, 0x99
};
#define bin2bcd(bin) (bin2bcd_data[bin])
#define rtcout(val) ((status_b & RTCSB_BIN) ? (val) : bin2bcd((val)))
static void
timevalfix(struct timeval *t1)
{
if (t1->tv_usec < 0) {
t1->tv_sec--;
t1->tv_usec += 1000000;
}
if (t1->tv_usec >= 1000000) {
t1->tv_sec++;
t1->tv_usec -= 1000000;
}
}
static void
timevalsub(struct timeval *t1, const struct timeval *t2)
{
t1->tv_sec -= t2->tv_sec;
t1->tv_usec -= t2->tv_usec;
timevalfix(t1);
}
static int
rtc_addr_handler(struct vmctx *ctx, int vcpu, int in, int port, int bytes,
uint32_t *eax, void *arg)
{
assert(in == 0);
if (bytes != 1)
return (-1);
switch (*eax) {
case RTC_SEC:
case RTC_MIN:
case RTC_HRS:
case RTC_WDAY:
case RTC_DAY:
case RTC_MONTH:
case RTC_YEAR:
case RTC_CENTURY:
case RTC_STATUSA:
case RTC_STATUSB:
case RTC_INTR:
case RTC_STATUSD:
case RTC_DIAG:
case RTC_RSTCODE:
break;
default:
return (-1);
}
addr = *eax;
return (0);
}
static int
rtc_data_handler(struct vmctx *ctx, int vcpu, int in, int port, int bytes,
uint32_t *eax, void *arg)
{
int hour;
time_t t;
struct timeval cur, delta;
static struct timeval last;
static struct tm tm;
if (bytes != 1)
return (-1);
gettimeofday(&cur, NULL);
/*
* Increment the cached time only once per second so we can guarantee
* that the guest has at least one second to read the hour:min:sec
* separately and still get a coherent view of the time.
*/
delta = cur;
timevalsub(&delta, &last);
if (delta.tv_sec >= 1 && (status_b & RTCSB_HALT) == 0) {
t = cur.tv_sec;
localtime_r(&t, &tm);
last = cur;
}
if (in) {
switch (addr) {
case RTC_SEC:
*eax = rtcout(tm.tm_sec);
return (0);
case RTC_MIN:
*eax = rtcout(tm.tm_min);
return (0);
case RTC_HRS:
if (status_b & RTCSB_24HR)
hour = tm.tm_hour;
else
hour = (tm.tm_hour % 12) + 1;
*eax = rtcout(hour);
/*
* If we are representing time in the 12-hour format
* then set the MSB to indicate PM.
*/
if ((status_b & RTCSB_24HR) == 0 && tm.tm_hour >= 12)
*eax |= 0x80;
return (0);
case RTC_WDAY:
*eax = rtcout(tm.tm_wday + 1);
return (0);
case RTC_DAY:
*eax = rtcout(tm.tm_mday);
return (0);
case RTC_MONTH:
*eax = rtcout(tm.tm_mon + 1);
return (0);
case RTC_YEAR:
*eax = rtcout(tm.tm_year % 100);
return (0);
case RTC_CENTURY:
*eax = rtcout(tm.tm_year / 100);
break;
case RTC_STATUSA:
*eax = status_a;
return (0);
case RTC_INTR:
*eax = 0;
return (0);
case RTC_STATUSD:
*eax = RTCSD_PWR;
return (0);
case RTC_DIAG:
*eax = 0;
return (0);
case RTC_RSTCODE:
*eax = rstcode;
return (0);
default:
return (-1);
}
}
switch (addr) {
case RTC_STATUSA:
status_a = *eax & ~RTCSA_TUP;
break;
case RTC_STATUSB:
/* XXX not implemented yet XXX */
if (*eax & RTCSB_PINTR)
return (-1);
status_b = *eax;
break;
case RTC_RSTCODE:
rstcode = *eax;
break;
case RTC_SEC:
case RTC_MIN:
case RTC_HRS:
case RTC_WDAY:
case RTC_DAY:
case RTC_MONTH:
case RTC_YEAR:
case RTC_CENTURY:
/*
* Ignore writes to the time of day registers
*/
break;
default:
return (-1);
}
return (0);
}
INOUT_PORT(rtc, IO_RTC, IOPORT_F_OUT, rtc_addr_handler);
INOUT_PORT(rtc, IO_RTC + 1, IOPORT_F_INOUT, rtc_data_handler);

60
usr.sbin/bhyve/uart.c Normal file
View File

@ -0,0 +1,60 @@
/*-
* Copyright (c) 2011 NetApp, Inc.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* $FreeBSD$
*/
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
#include <sys/types.h>
#include <assert.h>
#include "inout.h"
#define COM1 0x3F8
#define COM2 0x2F8
#define REG_IIR 2
static int
com_handler(struct vmctx *ctx, int vcpu, int in, int port, int bytes,
uint32_t *eax, void *arg)
{
assert(in);
if (bytes != 1)
return (-1);
/*
* COM port is not implemented so we return 0xFF for all registers
*/
*eax = 0xFF;
return (0);
}
INOUT_PORT(uart, COM1 + REG_IIR, IOPORT_F_IN, com_handler);
INOUT_PORT(uart, COM2 + REG_IIR, IOPORT_F_IN, com_handler);

85
usr.sbin/bhyve/virtio.h Normal file
View File

@ -0,0 +1,85 @@
/*-
* Copyright (c) 2011 NetApp, Inc.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* $FreeBSD$
*/
#ifndef _VIRTIO_H_
#define _VIRTIO_H_
#define VRING_ALIGN 4096
#define VRING_DESC_F_NEXT (1 << 0)
#define VRING_DESC_F_WRITE (1 << 1)
#define VRING_DESC_F_INDIRECT (1 << 2)
#define VRING_AVAIL_F_NO_INTERRUPT 1
struct virtio_desc {
uint64_t vd_addr;
uint32_t vd_len;
uint16_t vd_flags;
uint16_t vd_next;
} __packed;
struct virtio_used {
uint32_t vu_idx;
uint32_t vu_tlen;
} __packed;
/*
* PFN register shift amount
*/
#define VRING_PFN 12
/*
* Virtio device types
*/
#define VIRTIO_TYPE_NET 1
#define VIRTIO_TYPE_BLOCK 2
/*
* PCI vendor/device IDs
*/
#define VIRTIO_VENDOR 0x1AF4
#define VIRTIO_DEV_NET 0x1000
#define VIRTIO_DEV_BLOCK 0x1001
/*
* PCI config space constants
*/
#define VTCFG_R_HOSTCAP 0
#define VTCFG_R_GUESTCAP 4
#define VTCFG_R_PFN 8
#define VTCFG_R_QNUM 12
#define VTCFG_R_QSEL 14
#define VTCFG_R_QNOTIFY 16
#define VTCFG_R_STATUS 18
#define VTCFG_R_ISR 19
#define VTCFG_R_CFG0 20 /* No MSI-X */
#define VTCFG_R_CFG1 24 /* With MSI-X */
#define VTCFG_R_MSIX 20
#endif /* _VIRTIO_H_ */

261
usr.sbin/bhyve/xmsr.c Normal file
View File

@ -0,0 +1,261 @@
/*-
* Copyright (c) 2011 NetApp, Inc.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* $FreeBSD$
*/
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
#include <sys/param.h>
#include <x86/apicreg.h>
#include <assert.h>
#include <stdio.h>
#include <stdlib.h>
#include <machine/vmm.h>
#include <vmmapi.h>
#include "fbsdrun.h"
#include "xmsr.h"
/*
* Trampoline for hypervisor direct 64-bit jump.
*
* 0 - signature for guest->host verification
* 8 - kernel virtual address of trampoline
* 16 - instruction virtual address
* 24 - stack pointer virtual address
* 32 - CR3, physical address of kernel page table
* 40 - 24-byte area for null/code/data GDT entries
*/
#define MP_V64T_SIG 0xcafebabecafebabeULL
struct mp_v64tramp {
uint64_t mt_sig;
uint64_t mt_virt;
uint64_t mt_eip;
uint64_t mt_rsp;
uint64_t mt_cr3;
uint64_t mt_gdtr[3];
};
/*
* CPU 0 is considered to be the BSP and is set to the RUNNING state.
* All other CPUs are set up in the INIT state.
*/
#define BSP 0
enum cpu_bstate {
CPU_S_INIT,
CPU_S_SIPI,
CPU_S_RUNNING
} static cpu_b[VM_MAXCPU] = { [BSP] = CPU_S_RUNNING };
static void spinup_ap(struct vmctx *, int, int, uint64_t *);
static void spinup_ap_direct64(struct vmctx *, int, uintptr_t, uint64_t *);
int
emulate_wrmsr(struct vmctx *ctx, int vcpu, uint32_t code, uint64_t val)
{
int dest;
int mode;
int thiscpu;
int vec;
int error, retval;
uint64_t rip;
retval = vcpu;
thiscpu = 1 << vcpu;
/*
* The only MSR value handled is the x2apic CR register
*/
if (code != 0x830) {
printf("Unknown WRMSR code %x, val %lx, cpu %d\n",
code, val, vcpu);
exit(1);
}
/*
* The value written to the MSR will generate an IPI to
* a set of CPUs. If this is a SIPI, create the initial
* state for the CPU and switch to it. Otherwise, inject
* an interrupt for the destination CPU(s), and request
* a switch to the next available one by returning -1
*/
dest = val >> 32;
vec = val & APIC_VECTOR_MASK;
mode = val & APIC_DELMODE_MASK;
switch (mode) {
case APIC_DELMODE_INIT:
assert(dest != 0);
assert(dest < guest_ncpus);
/*
* Ignore legacy de-assert INITs in x2apic mode
*/
if ((val & APIC_LEVEL_MASK) == APIC_LEVEL_DEASSERT) {
break;
}
assert(cpu_b[dest] == CPU_S_INIT);
/*
* Move CPU to wait-for-SIPI state
*/
error = vcpu_reset(ctx, dest);
assert(error == 0);
cpu_b[dest] = CPU_S_SIPI;
break;
case APIC_DELMODE_STARTUP:
assert(dest != 0);
assert(dest < guest_ncpus);
/*
* Ignore SIPIs in any state other than wait-for-SIPI
*/
if (cpu_b[dest] != CPU_S_SIPI) {
break;
}
/*
* Bring up the AP and signal the main loop that it is
* available and to switch to it.
*/
spinup_ap(ctx, dest, vec, &rip);
cpu_b[dest] = CPU_S_RUNNING;
fbsdrun_addcpu(ctx, dest, rip);
retval = dest;
break;
default:
printf("APIC delivery mode %lx not supported!\n",
val & APIC_DELMODE_MASK);
exit(1);
}
return (retval);
}
/*
* There are 2 startup modes possible here:
* - if the CPU supports 'unrestricted guest' mode, the spinup can
* set up the processor state in power-on 16-bit mode, with the CS:IP
* init'd to the specified low-mem 4K page.
* - if the guest has requested a 64-bit trampoline in the low-mem 4K
* page by placing in the specified signature, set up the register
* state using register state in the signature. Note that this
* requires accessing guest physical memory to read the signature
* while 'unrestricted mode' does not.
*/
static void
spinup_ap(struct vmctx *ctx, int newcpu, int vector, uint64_t *rip)
{
int error;
uint16_t cs;
uint64_t desc_base;
uint32_t desc_limit, desc_access;
if (fbsdrun_vmexit_on_hlt()) {
error = vm_set_capability(ctx, newcpu, VM_CAP_HALT_EXIT, 1);
assert(error == 0);
}
if (fbsdrun_vmexit_on_pause()) {
error = vm_set_capability(ctx, newcpu, VM_CAP_PAUSE_EXIT, 1);
assert(error == 0);
}
error = vm_set_capability(ctx, newcpu, VM_CAP_UNRESTRICTED_GUEST, 1);
if (error) {
/*
* If the guest does not support real-mode execution then
* we will bring up the AP directly in 64-bit mode.
*/
spinup_ap_direct64(ctx, newcpu, vector << PAGE_SHIFT, rip);
} else {
/*
* Update the %cs and %rip of the guest so that it starts
* executing real mode code at at 'vector << 12'.
*/
*rip = 0;
error = vm_set_register(ctx, newcpu, VM_REG_GUEST_RIP, *rip);
assert(error == 0);
error = vm_get_desc(ctx, newcpu, VM_REG_GUEST_CS, &desc_base,
&desc_limit, &desc_access);
assert(error == 0);
desc_base = vector << PAGE_SHIFT;
error = vm_set_desc(ctx, newcpu, VM_REG_GUEST_CS,
desc_base, desc_limit, desc_access);
assert(error == 0);
cs = (vector << PAGE_SHIFT) >> 4;
error = vm_set_register(ctx, newcpu, VM_REG_GUEST_CS, cs);
assert(error == 0);
}
}
static void
spinup_ap_direct64(struct vmctx *ctx, int newcpu, uintptr_t gaddr,
uint64_t *rip)
{
struct mp_v64tramp *mvt;
char *errstr;
int error;
uint64_t gdtbase;
mvt = paddr_guest2host(gaddr);
assert(mvt->mt_sig == MP_V64T_SIG);
/*
* Set up the 3-entry GDT using memory supplied in the
* guest's trampoline structure.
*/
vm_setup_freebsd_gdt(mvt->mt_gdtr);
#define CHECK_ERROR(msg) \
if (error != 0) { \
errstr = msg; \
goto err_exit; \
}
/* entry point */
*rip = mvt->mt_eip;
/* Get the guest virtual address of the GDT */
gdtbase = mvt->mt_virt + __offsetof(struct mp_v64tramp, mt_gdtr);
error = vm_setup_freebsd_registers(ctx, newcpu, mvt->mt_eip,
mvt->mt_cr3, gdtbase, mvt->mt_rsp);
CHECK_ERROR("vm_setup_freebsd_registers");
return;
err_exit:
printf("spinup_ap_direct64: machine state error: %s", errstr);
exit(1);
}

34
usr.sbin/bhyve/xmsr.h Normal file
View File

@ -0,0 +1,34 @@
/*-
* Copyright (c) 2011 NetApp, Inc.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* $FreeBSD$
*/
#ifndef _XMSR_H_
#define _XMSR_H_
int emulate_wrmsr(struct vmctx *ctx, int vcpu, uint32_t code, uint64_t val);
#endif

17
usr.sbin/vmmctl/Makefile Normal file
View File

@ -0,0 +1,17 @@
#
# $FreeBSD$
#
PROG= vmmctl
SRCS= vmmctl.c
NO_MAN=
DPADD= ${LIBVMMAPI}
LDADD= -lvmmapi
WARNS?= 3
CFLAGS+= -I${.CURDIR}/../../sys/amd64/vmm
.include <bsd.prog.mk>

75
usr.sbin/vmmctl/sample.sh Executable file
View File

@ -0,0 +1,75 @@
#!/bin/sh
# $FreeBSD$
VMMCTL="sudo ./vmmctl"
VMNAME=sample
${VMMCTL} --vm=${VMNAME} --create
${VMMCTL} --vm=${VMNAME} --set-lowmem=128 --set-highmem=256
${VMMCTL} --vm=${VMNAME} --get-lowmem --get-highmem
CR0_PE=$((1 << 0))
CR0_PG=$((1 << 31))
CR0=$(($CR0_PE | $CR0_PG))
${VMMCTL} --vm=${VMNAME} --set-cr0=${CR0} --get-cr0
# XXX this is bogus the value of %cr3 should come from the loader
CR3=0
${VMMCTL} --vm=${VMNAME} --set-cr3=${CR3} --get-cr3
CR4_PAE=$((1 << 5))
CR4=$((${CR4_PAE}))
${VMMCTL} --vm=${VMNAME} --set-cr4=${CR4} --get-cr4
DR7=0x00000400 # Table 9-1 from Intel Architecture Manual 3A
${VMMCTL} --vm=${VMNAME} --set-dr7=${DR7} --get-dr7
#
# XXX the values of rsp and rip are bogus and should come from the loader.
#
RSP=0xa5a5a5a5
RIP=0x0000bfbfbfbf0000
RFLAGS=0x2
${VMMCTL} --vm=${VMNAME} --set-rsp=${RSP} --get-rsp
${VMMCTL} --vm=${VMNAME} --set-rip=${RIP} --get-rip
${VMMCTL} --vm=${VMNAME} --set-rflags=${RFLAGS} --get-rflags
# Set "hidden" state of %cs descriptor to indicate long mode code segment.
#
# Note that this should match the contents of the entry pointed to by the
# segment selector in the GDTR.
#
${VMMCTL} --vm=${VMNAME} --set-desc-cs --desc-access=0x00002098 --get-desc-cs
# Set "hidden" state of all data descriptors to indicate a usable segment.
# The only useful fields are the "Present" and "Descriptor Type" bits.
${VMMCTL} --vm=${VMNAME} --set-desc-ds --desc-access=0x00000090 --get-desc-ds
${VMMCTL} --vm=${VMNAME} --set-desc-es --desc-access=0x00000090 --get-desc-es
${VMMCTL} --vm=${VMNAME} --set-desc-fs --desc-access=0x00000090 --get-desc-fs
${VMMCTL} --vm=${VMNAME} --set-desc-gs --desc-access=0x00000090 --get-desc-gs
${VMMCTL} --vm=${VMNAME} --set-desc-ss --desc-access=0x00000090 --get-desc-ss
#
# Set the code segment selector to point to entry at offset 8 in the GDTR.
#
${VMMCTL} --vm=${VMNAME} --set-cs=0x0008 --get-cs
# Set all the remaining data segment selectors to point to entry at offset
# 16 in the GDTR.
${VMMCTL} --vm=${VMNAME} --set-ds=0x0010 --get-ds
${VMMCTL} --vm=${VMNAME} --set-es=0x0010 --get-es
${VMMCTL} --vm=${VMNAME} --set-fs=0x0010 --get-fs
${VMMCTL} --vm=${VMNAME} --set-gs=0x0010 --get-gs
${VMMCTL} --vm=${VMNAME} --set-ss=0x0010 --get-ss
# XXX the value of the GDTR should come from the loader.
# Set the GDTR
GDTR_BASE=0xffff0000
GDTR_LIMIT=0x10
${VMMCTL} --vm=${VMNAME} --set-desc-gdtr --desc-base=${GDTR_BASE} --desc-limit=${GDTR_LIMIT} --get-desc-gdtr
${VMMCTL} --vm=${VMNAME} --set-pinning=0 --get-pinning
${VMMCTL} --vm=${VMNAME} --set-pinning=-1 --get-pinning
${VMMCTL} --vm=${VMNAME} --destroy

1485
usr.sbin/vmmctl/vmmctl.c Normal file

File diff suppressed because it is too large Load Diff