diff --git a/lib/Makefile b/lib/Makefile index 7be186fca21c..8e48f4c8b1aa 100644 --- a/lib/Makefile +++ b/lib/Makefile @@ -110,6 +110,7 @@ SUBDIR= ${SUBDIR_ORDERED} \ ${_libusbhid} \ ${_libusb} \ ${_libvgl} \ + ${_libvmmapi} \ libwrap \ liby \ libz \ @@ -197,6 +198,7 @@ _libsmb= libsmb .if ${MK_NCP} != "no" _libncp= libncp .endif +_libvmmapi= libvmmapi .endif .if ${MACHINE_CPUARCH} == "powerpc" diff --git a/lib/libvmmapi/Makefile b/lib/libvmmapi/Makefile new file mode 100644 index 000000000000..82dde083d0b7 --- /dev/null +++ b/lib/libvmmapi/Makefile @@ -0,0 +1,11 @@ +# $FreeBSD$ + +LIB= vmmapi +SRCS= vmmapi.c vmmapi_freebsd.c mptable.c +INCS= vmmapi.h + +WARNS?= 2 + +CFLAGS+= -I${.CURDIR} + +.include diff --git a/lib/libvmmapi/mptable.c b/lib/libvmmapi/mptable.c new file mode 100644 index 000000000000..fe84f35408b0 --- /dev/null +++ b/lib/libvmmapi/mptable.c @@ -0,0 +1,338 @@ +/*- + * Copyright (c) 2011 NetApp, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#include +__FBSDID("$FreeBSD$"); + +#include +#include + +#include +#include +#include +#include + +#include "vmmapi.h" +#include "mptable.h" + +#define LAPIC_PADDR (0xFEE00000) +#define LAPIC_VERSION (16) + +#define IOAPIC_PADDR (0xFEC00000) +#define IOAPIC_VERSION (0x11) + +extern int errno; + +static uint8_t +mp_compute_checksum(void *base, size_t len) +{ + uint8_t *bytes = base; + uint8_t sum = 0; + for(; len > 0; len--) { + sum += *bytes++; + } + return 256 - sum; +} + +static void +mp_build_mpfp(struct mp_floating_pointer *mpfp, vm_paddr_t mpfp_gpa) +{ + memset(mpfp, 0, sizeof(*mpfp)); + memcpy(mpfp->signature, MPFP_SIGNATURE, MPFP_SIGNATURE_LEN); + mpfp->mptable_paddr = mpfp_gpa + sizeof(*mpfp); + mpfp->specrev = MP_SPECREV; + mpfp->feature2 = 0; + mpfp->checksum = mp_compute_checksum(mpfp, sizeof(*mpfp)); +} + +static void +mp_build_mpch(struct mp_config_hdr *mpch) +{ + memset(mpch, 0, sizeof(*mpch)); + mpch->specrev = MP_SPECREV; + memcpy(mpch->signature, MPCH_SIGNATURE, MPCH_SIGNATURE_LEN); + memcpy(mpch->oemid, MPCH_OEMID, MPCH_OEMID_LEN); + memcpy(mpch->prodid, MPCH_PRODID, MPCH_PRODID_LEN); + mpch->lapic_paddr = LAPIC_PADDR; + + +} + +static void +mp_build_proc_entries(struct mpe_proc *mpep, int num_proc) +{ + int i; + + for (i = 0; i < num_proc; i++) { + memset(mpep, 0, sizeof(*mpep)); + mpep->entry_type = MP_ENTRY_PROC; + mpep->lapic_id = i; // XXX + mpep->lapic_version = LAPIC_VERSION; + mpep->proc_flags = (i == 0)?MPEP_FLAGS_BSP:0; + mpep->proc_flags |= MPEP_FLAGS_EN; + mpep->proc_signature = MPEP_SIGNATURE; + mpep->feature_flags = MPEP_FEATURES; + mpep++; + } + +} + +static void +mp_build_bus_entries(struct mpe_bus *mpeb) +{ + memset(mpeb, 0, sizeof(*mpeb)); + mpeb->entry_type = MP_ENTRY_BUS; + mpeb->busid = MPE_BUSID_ISA; + memcpy(mpeb->busname, MPE_BUSNAME_ISA, MPE_BUSNAME_LEN); + mpeb++; + + memset(mpeb, 0, sizeof(*mpeb)); + mpeb->entry_type = MP_ENTRY_BUS; + mpeb->busid = MPE_BUSID_PCI; + memcpy(mpeb->busname, MPE_BUSNAME_PCI, MPE_BUSNAME_LEN); + +} + +#ifdef notyet +static void +mp_build_ioapic_entries(struct mpe_ioapic *mpei) +{ + memset(mpei, 0, sizeof(*mpei)); + mpei->entry_type = MP_ENTRY_IOAPIC; + mpei->ioapic_id = MPE_IOAPIC_ID; + mpei->ioapic_version = IOAPIC_VERSION; + mpei->ioapic_flags = MPE_IOAPIC_FLAG_EN; + mpei->ioapic_paddr = IOAPIC_PADDR; +} + +static void +mp_build_ioint_entries(struct mpe_ioint *mpeii, int num_pins) +{ + int pin; + + /* + * The following config is taken from kernel mptable.c + * mptable_parse_default_config_ints(...), for now + * just use the default config, tweek later if needed. + */ + + + /* Run through all 16 pins. */ + for (pin = 0; pin < num_pins; pin++) { + memset(mpeii, 0, sizeof(*mpeii)); + mpeii->entry_type = MP_ENTRY_IOINT; + mpeii->src_bus_id = MPE_BUSID_ISA; + mpeii->dst_apic_id = MPE_IOAPIC_ID; + + /* + * All default configs route IRQs from bus 0 to the first 16 pins + * of the first I/O APIC with an APIC ID of 2. + */ + mpeii->dst_apic_intin = pin; + switch (pin) { + case 0: + /* Pin 0 is an ExtINT pin. */ + mpeii->intr_type = MPEII_INTR_EXTINT; + break; + case 2: + /* IRQ 0 is routed to pin 2. */ + mpeii->intr_type = MPEII_INTR_INT; + mpeii->src_bus_irq = 0; + break; + case 5: + case 10: + case 11: + /* + * PCI Irqs set to level triggered. + */ + mpeii->intr_flags = MPEII_FLAGS_TRIGMODE_LEVEL; + mpeii->src_bus_id = MPE_BUSID_PCI; + default: + /* All other pins are identity mapped. */ + mpeii->intr_type = MPEII_INTR_INT; + mpeii->src_bus_irq = pin; + break; + } + mpeii++; + } + +} + +#define COPYSTR(dest, src, bytes) \ + memcpy(dest, src, bytes); \ + str[bytes] = 0; + + +static void +mptable_dump(struct mp_floating_pointer *mpfp, struct mp_config_hdr *mpch) +{ + static char str[16]; + int i; + char *cur; + + union mpe { + struct mpe_proc *proc; + struct mpe_bus *bus; + struct mpe_ioapic *ioapic; + struct mpe_ioint *ioint; + struct mpe_lint *lnit; + char *p; + }; + + union mpe mpe; + + printf(" MP Floating Pointer :\n"); + COPYSTR(str, mpfp->signature, 4); + printf(" signature: %s\n", str); + printf(" mpch paddr: %x\n", mpfp->mptable_paddr); + printf(" length: %x\n", mpfp->length); + printf(" specrec: %x\n", mpfp->specrev); + printf(" checksum: %x\n", mpfp->checksum); + printf(" feature1: %x\n", mpfp->feature1); + printf(" feature2: %x\n", mpfp->feature2); + printf(" feature3: %x\n", mpfp->feature3); + printf(" feature4: %x\n", mpfp->feature4); + + printf(" MP Configuration Header :\n"); + COPYSTR(str, mpch->signature, 4); + printf(" signature: %s\n", str); + printf(" length: %x\n", mpch->length); + printf(" specrec: %x\n", mpch->specrev); + printf(" checksum: %x\n", mpch->checksum); + COPYSTR(str, mpch->oemid, MPCH_OEMID_LEN); + printf(" oemid: %s\n", str); + COPYSTR(str, mpch->prodid, MPCH_PRODID_LEN); + printf(" prodid: %s\n", str); + printf(" oem_ptr: %x\n", mpch->oem_ptr); + printf(" oem_sz: %x\n", mpch->oem_sz); + printf(" nr_entries: %x\n", mpch->nr_entries); + printf(" apic paddr: %x\n", mpch->lapic_paddr); + printf(" ext_length: %x\n", mpch->ext_length); + printf(" ext_checksum: %x\n", mpch->ext_checksum); + + cur = (char *)mpch + sizeof(*mpch); + for (i = 0; i < mpch->nr_entries; i++) { + mpe.p = cur; + switch(*mpe.p) { + case MP_ENTRY_PROC: + printf(" MP Processor Entry :\n"); + printf(" lapic_id: %x\n", mpe.proc->lapic_id); + printf(" lapic_version: %x\n", mpe.proc->lapic_version); + printf(" proc_flags: %x\n", mpe.proc->proc_flags); + printf(" proc_signature: %x\n", mpe.proc->proc_signature); + printf(" feature_flags: %x\n", mpe.proc->feature_flags); + cur += sizeof(struct mpe_proc); + break; + case MP_ENTRY_BUS: + printf(" MP Bus Entry :\n"); + printf(" busid: %x\n", mpe.bus->busid); + COPYSTR(str, mpe.bus->busname, MPE_BUSNAME_LEN); + printf(" busname: %s\n", str); + cur += sizeof(struct mpe_bus); + break; + case MP_ENTRY_IOAPIC: + printf(" MP IOAPIC Entry :\n"); + printf(" ioapi_id: %x\n", mpe.ioapic->ioapic_id); + printf(" ioapi_version: %x\n", mpe.ioapic->ioapic_version); + printf(" ioapi_flags: %x\n", mpe.ioapic->ioapic_flags); + printf(" ioapi_paddr: %x\n", mpe.ioapic->ioapic_paddr); + cur += sizeof(struct mpe_ioapic); + break; + case MP_ENTRY_IOINT: + printf(" MP IO Interrupt Entry :\n"); + printf(" intr_type: %x\n", mpe.ioint->intr_type); + printf(" intr_flags: %x\n", mpe.ioint->intr_flags); + printf(" src_bus_id: %x\n", mpe.ioint->src_bus_id); + printf(" src_bus_irq: %x\n", mpe.ioint->src_bus_irq); + printf(" dst_apic_id: %x\n", mpe.ioint->dst_apic_id); + printf(" dst_apic_intin: %x\n", mpe.ioint->dst_apic_intin); + cur += sizeof(struct mpe_ioint); + break; + case MP_ENTRY_LINT: + printf(" MP Local Interrupt Entry :\n"); + cur += sizeof(struct mpe_lint); + break; + } + + } +} +#endif + +int +vm_build_mptable(struct vmctx *ctx, vm_paddr_t gpa, int len, int ncpu, + void *oemp, int oemsz) +{ + struct mp_config_hdr *mpch; + char *mapaddr; + char *startaddr; + int error; + + mapaddr = vm_map_memory(ctx, gpa, len); + if (mapaddr == MAP_FAILED) { + printf("%s\n", strerror(errno)); + goto err; + } + startaddr = mapaddr; + + mp_build_mpfp((struct mp_floating_pointer*) mapaddr, gpa); + mapaddr += sizeof(struct mp_floating_pointer); + + mpch = (struct mp_config_hdr*)mapaddr; + mp_build_mpch(mpch); + mapaddr += sizeof(struct mp_config_hdr); + + mp_build_proc_entries((struct mpe_proc*) mapaddr, ncpu); + mapaddr += (sizeof(struct mpe_proc)*ncpu); + mpch->nr_entries += ncpu; + + mp_build_bus_entries((struct mpe_bus*)mapaddr); + mapaddr += (sizeof(struct mpe_bus)*MPE_NUM_BUSES); + mpch->nr_entries += MPE_NUM_BUSES; +#if 0 + mp_build_ioapic_entries((struct mpe_ioapic*)mapaddr); + mapaddr += sizeof(struct mpe_ioapic); + mpch->nr_entries++; + + mp_build_ioint_entries((struct mpe_ioint*)mapaddr, MPEII_MAX_IRQ); + mapaddr += sizeof(struct mpe_ioint)*MPEII_MAX_IRQ; + mpch->nr_entries += MPEII_MAX_IRQ; + +#endif + if (oemp) { + mpch->oem_ptr = mapaddr - startaddr + gpa; + mpch->oem_sz = oemsz; + memcpy(mapaddr, oemp, oemsz); + } + mpch->length = (mapaddr) - ((char*) mpch); + mpch->checksum = mp_compute_checksum(mpch, sizeof(*mpch)); + + + // mptable_dump((struct mp_floating_pointer*)startaddr, mpch); +err: + return (error); +} diff --git a/lib/libvmmapi/mptable.h b/lib/libvmmapi/mptable.h new file mode 100644 index 000000000000..cad8834a47b2 --- /dev/null +++ b/lib/libvmmapi/mptable.h @@ -0,0 +1,171 @@ +/*- + * Copyright (c) 2011 NetApp, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#ifndef _MPTABLE_h_ +#define _MPTABLE_h_ + +#define MP_SPECREV (4) // MP spec revision 1.1 + +/* + * MP Floating Pointer Structure + */ +#define MPFP_SIGNATURE "_MP_" +#define MPFP_SIGNATURE_LEN (4) +#define MPFP_FEATURE2 (0x80) // IMCR is present +struct mp_floating_pointer { + uint8_t signature[MPFP_SIGNATURE_LEN]; + uint32_t mptable_paddr; + uint8_t length; + uint8_t specrev; + uint8_t checksum; + uint8_t feature1; + uint8_t feature2; + uint8_t feature3; + uint8_t feature4; + uint8_t feature5; +}; + + +/* + * MP Configuration Table Header + */ +#define MPCH_SIGNATURE "PCMP" +#define MPCH_SIGNATURE_LEN (4) + +#define MPCH_OEMID "NETAPP " +#define MPCH_OEMID_LEN (8) +#define MPCH_PRODID "vFiler " +#define MPCH_PRODID_LEN (12) + +struct mp_config_hdr { + uint8_t signature[MPCH_SIGNATURE_LEN]; + uint16_t length; + uint8_t specrev; + uint8_t checksum; + uint8_t oemid[MPCH_OEMID_LEN]; + uint8_t prodid[MPCH_PRODID_LEN]; + uint32_t oem_ptr; + uint16_t oem_sz; + uint16_t nr_entries; + uint32_t lapic_paddr; + uint16_t ext_length; + uint8_t ext_checksum; + uint8_t reserved; +}; + +#define MP_ENTRY_PROC (0) +#define MP_ENTRY_BUS (1) +#define MP_ENTRY_IOAPIC (2) +#define MP_ENTRY_IOINT (3) +#define MP_ENTRY_LINT (4) + +/* + * MP Processor Entry + */ + +#define MPEP_FLAGS_EN (0x1) +#define MPEP_FLAGS_BSP (0x2) + +#define MPEP_SIG_FAMILY (6) +#define MPEP_SIG_MODEL (26) +#define MPEP_SIG_STEPPING (5) +#define MPEP_SIGNATURE ((MPEP_SIG_FAMILY << 8) | (MPEP_SIG_MODEL << 4) \ + | (MPEP_SIG_STEPPING)) + +#define MPEP_FEATURES (0xBFEBFBFF) // Value from Intel i7 CPUID + +struct mpe_proc { + uint8_t entry_type; + uint8_t lapic_id; + uint8_t lapic_version; + uint8_t proc_flags; + uint32_t proc_signature; + uint32_t feature_flags; + uint8_t reserved[8]; +}; + +/* + * MP Bus Entry + */ + +#define MPE_NUM_BUSES (2) +#define MPE_BUSNAME_LEN (6) +#define MPE_BUSID_ISA (0) +#define MPE_BUSID_PCI (1) +#define MPE_BUSNAME_ISA "ISA " +#define MPE_BUSNAME_PCI "PCI " +struct mpe_bus { + uint8_t entry_type; + uint8_t busid; + uint8_t busname[MPE_BUSNAME_LEN]; +}; + +/* + * MP IO APIC Entry + */ +#define MPE_IOAPIC_ID (2) +#define MPE_IOAPIC_FLAG_EN (1) +struct mpe_ioapic { + uint8_t entry_type; + uint8_t ioapic_id; + uint8_t ioapic_version; + uint8_t ioapic_flags; + uint32_t ioapic_paddr; + +}; + +/* + * MP IO Interrupt Assignment Entry + */ +#define MPEII_INTR_INT (0) +#define MPEII_INTR_NMI (1) +#define MPEII_INTR_SMI (2) +#define MPEII_INTR_EXTINT (3) +#define MPEII_PCI_IRQ_MASK (0x0c20U) /* IRQ 5,10,11 are PCI connected */ +#define MPEII_MAX_IRQ (16) +#define MPEII_FLAGS_TRIGMODE_LEVEL (0x3) +struct mpe_ioint { + uint8_t entry_type; + uint8_t intr_type; + uint16_t intr_flags; + uint8_t src_bus_id; + uint8_t src_bus_irq; + uint8_t dst_apic_id; + uint8_t dst_apic_intin; +}; + +/* + * MP Local Interrupt Assignment Entry + */ +struct mpe_lint { + uint8_t entry_type; +}; + +int vm_build_mptable(struct vmctx *ctxt, vm_paddr_t gpa, int len, + int ncpu, void *oemp, int oemsz); +#endif /* _MPTABLE_h_ */ diff --git a/lib/libvmmapi/vmmapi.c b/lib/libvmmapi/vmmapi.c new file mode 100644 index 000000000000..17f2063f432a --- /dev/null +++ b/lib/libvmmapi/vmmapi.c @@ -0,0 +1,645 @@ +/*- + * Copyright (c) 2011 NetApp, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#include +__FBSDID("$FreeBSD$"); + +#include +#include +#include +#include + +#include + +#include +#include +#include +#include +#include +#include + +#include +#include + +#include "vmmapi.h" +#include "mptable.h" + +#ifndef CR4_VMXE +#define CR4_VMXE (1UL << 13) +#endif + +#define BIOS_ROM_BASE (0xf0000) +#define BIOS_ROM_SIZE (0x10000) + +struct vmctx { + int fd; + char *name; +}; + +#define CREATE(x) sysctlbyname("hw.vmm.create", NULL, NULL, (x), strlen((x))) +#define DESTROY(x) sysctlbyname("hw.vmm.destroy", NULL, NULL, (x), strlen((x))) + +static int +vm_device_open(const char *name) +{ + int fd, len; + char *vmfile; + + len = strlen("/dev/vmm/") + strlen(name) + 1; + vmfile = malloc(len); + assert(vmfile != NULL); + snprintf(vmfile, len, "/dev/vmm/%s", name); + + /* Open the device file */ + fd = open(vmfile, O_RDWR, 0); + + free(vmfile); + return (fd); +} + +int +vm_create(const char *name) +{ + + return (CREATE((char *)name)); +} + +struct vmctx * +vm_open(const char *name) +{ + struct vmctx *vm; + + vm = malloc(sizeof(struct vmctx) + strlen(name) + 1); + assert(vm != NULL); + + vm->fd = -1; + vm->name = (char *)(vm + 1); + strcpy(vm->name, name); + + if ((vm->fd = vm_device_open(vm->name)) < 0) + goto err; + + return (vm); +err: + vm_destroy(vm); + return (NULL); +} + +void +vm_destroy(struct vmctx *vm) +{ + assert(vm != NULL); + + DESTROY(vm->name); + if (vm->fd >= 0) + close(vm->fd); + free(vm); +} + +int +vm_get_memory_seg(struct vmctx *ctx, vm_paddr_t gpa, + vm_paddr_t *ret_hpa, size_t *ret_len) +{ + int error; + struct vm_memory_segment seg; + + bzero(&seg, sizeof(seg)); + seg.gpa = gpa; + error = ioctl(ctx->fd, VM_GET_MEMORY_SEG, &seg); + *ret_hpa = seg.hpa; + *ret_len = seg.len; + return (error); +} + +int +vm_setup_memory(struct vmctx *ctx, vm_paddr_t gpa, size_t len, char **mapaddr) +{ + int error; + struct vm_memory_segment seg; + + /* + * Create and optionally map 'len' bytes of memory at guest + * physical address 'gpa' + */ + bzero(&seg, sizeof(seg)); + seg.gpa = gpa; + seg.len = len; + error = ioctl(ctx->fd, VM_MAP_MEMORY, &seg); + if (error == 0 && mapaddr != NULL) { + *mapaddr = mmap(NULL, len, PROT_READ | PROT_WRITE, MAP_SHARED, + ctx->fd, gpa); + } + return (error); +} + +char * +vm_map_memory(struct vmctx *ctx, vm_paddr_t gpa, size_t len) +{ + + /* Map 'len' bytes of memory at guest physical address 'gpa' */ + return ((char *)mmap(NULL, len, PROT_READ | PROT_WRITE, MAP_SHARED, + ctx->fd, gpa)); +} + +int +vm_set_desc(struct vmctx *ctx, int vcpu, int reg, + uint64_t base, uint32_t limit, uint32_t access) +{ + int error; + struct vm_seg_desc vmsegdesc; + + bzero(&vmsegdesc, sizeof(vmsegdesc)); + vmsegdesc.cpuid = vcpu; + vmsegdesc.regnum = reg; + vmsegdesc.desc.base = base; + vmsegdesc.desc.limit = limit; + vmsegdesc.desc.access = access; + + error = ioctl(ctx->fd, VM_SET_SEGMENT_DESCRIPTOR, &vmsegdesc); + return (error); +} + +int +vm_get_desc(struct vmctx *ctx, int vcpu, int reg, + uint64_t *base, uint32_t *limit, uint32_t *access) +{ + int error; + struct vm_seg_desc vmsegdesc; + + bzero(&vmsegdesc, sizeof(vmsegdesc)); + vmsegdesc.cpuid = vcpu; + vmsegdesc.regnum = reg; + + error = ioctl(ctx->fd, VM_GET_SEGMENT_DESCRIPTOR, &vmsegdesc); + if (error == 0) { + *base = vmsegdesc.desc.base; + *limit = vmsegdesc.desc.limit; + *access = vmsegdesc.desc.access; + } + return (error); +} + +int +vm_set_register(struct vmctx *ctx, int vcpu, int reg, uint64_t val) +{ + int error; + struct vm_register vmreg; + + bzero(&vmreg, sizeof(vmreg)); + vmreg.cpuid = vcpu; + vmreg.regnum = reg; + vmreg.regval = val; + + error = ioctl(ctx->fd, VM_SET_REGISTER, &vmreg); + return (error); +} + +int +vm_get_register(struct vmctx *ctx, int vcpu, int reg, uint64_t *ret_val) +{ + int error; + struct vm_register vmreg; + + bzero(&vmreg, sizeof(vmreg)); + vmreg.cpuid = vcpu; + vmreg.regnum = reg; + + error = ioctl(ctx->fd, VM_GET_REGISTER, &vmreg); + *ret_val = vmreg.regval; + return (error); +} + +int +vm_get_pinning(struct vmctx *ctx, int vcpu, int *host_cpuid) +{ + int error; + struct vm_pin vmpin; + + bzero(&vmpin, sizeof(vmpin)); + vmpin.vm_cpuid = vcpu; + + error = ioctl(ctx->fd, VM_GET_PINNING, &vmpin); + *host_cpuid = vmpin.host_cpuid; + return (error); +} + +int +vm_set_pinning(struct vmctx *ctx, int vcpu, int host_cpuid) +{ + int error; + struct vm_pin vmpin; + + bzero(&vmpin, sizeof(vmpin)); + vmpin.vm_cpuid = vcpu; + vmpin.host_cpuid = host_cpuid; + + error = ioctl(ctx->fd, VM_SET_PINNING, &vmpin); + return (error); +} + +int +vm_run(struct vmctx *ctx, int vcpu, uint64_t rip, struct vm_exit *vmexit) +{ + int error; + struct vm_run vmrun; + + bzero(&vmrun, sizeof(vmrun)); + vmrun.cpuid = vcpu; + vmrun.rip = rip; + + error = ioctl(ctx->fd, VM_RUN, &vmrun); + bcopy(&vmrun.vm_exit, vmexit, sizeof(struct vm_exit)); + return (error); +} + +static int +vm_inject_event_real(struct vmctx *ctx, int vcpu, enum vm_event_type type, + int vector, int error_code, int error_code_valid) +{ + struct vm_event ev; + + bzero(&ev, sizeof(ev)); + ev.cpuid = vcpu; + ev.type = type; + ev.vector = vector; + ev.error_code = error_code; + ev.error_code_valid = error_code_valid; + + return (ioctl(ctx->fd, VM_INJECT_EVENT, &ev)); +} + +int +vm_inject_event(struct vmctx *ctx, int vcpu, enum vm_event_type type, + int vector) +{ + + return (vm_inject_event_real(ctx, vcpu, type, vector, 0, 0)); +} + +int +vm_inject_event2(struct vmctx *ctx, int vcpu, enum vm_event_type type, + int vector, int error_code) +{ + + return (vm_inject_event_real(ctx, vcpu, type, vector, error_code, 1)); +} + +int +vm_build_tables(struct vmctx *ctxt, int ncpu, void *oemtbl, int oemtblsz) +{ + + return (vm_build_mptable(ctxt, BIOS_ROM_BASE, BIOS_ROM_SIZE, ncpu, + oemtbl, oemtblsz)); +} + +int +vm_lapic_irq(struct vmctx *ctx, int vcpu, int vector) +{ + struct vm_lapic_irq vmirq; + + bzero(&vmirq, sizeof(vmirq)); + vmirq.cpuid = vcpu; + vmirq.vector = vector; + + return (ioctl(ctx->fd, VM_LAPIC_IRQ, &vmirq)); +} + +int +vm_inject_nmi(struct vmctx *ctx, int vcpu) +{ + struct vm_nmi vmnmi; + + bzero(&vmnmi, sizeof(vmnmi)); + vmnmi.cpuid = vcpu; + + return (ioctl(ctx->fd, VM_INJECT_NMI, &vmnmi)); +} + +int +vm_capability_name2type(const char *capname) +{ + int i; + + static struct { + const char *name; + int type; + } capstrmap[] = { + { "hlt_exit", VM_CAP_HALT_EXIT }, + { "mtrap_exit", VM_CAP_MTRAP_EXIT }, + { "pause_exit", VM_CAP_PAUSE_EXIT }, + { "unrestricted_guest", VM_CAP_UNRESTRICTED_GUEST }, + { 0 } + }; + + for (i = 0; capstrmap[i].name != NULL && capname != NULL; i++) { + if (strcmp(capstrmap[i].name, capname) == 0) + return (capstrmap[i].type); + } + + return (-1); +} + +int +vm_get_capability(struct vmctx *ctx, int vcpu, enum vm_cap_type cap, + int *retval) +{ + int error; + struct vm_capability vmcap; + + bzero(&vmcap, sizeof(vmcap)); + vmcap.cpuid = vcpu; + vmcap.captype = cap; + + error = ioctl(ctx->fd, VM_GET_CAPABILITY, &vmcap); + *retval = vmcap.capval; + return (error); +} + +int +vm_set_capability(struct vmctx *ctx, int vcpu, enum vm_cap_type cap, int val) +{ + struct vm_capability vmcap; + + bzero(&vmcap, sizeof(vmcap)); + vmcap.cpuid = vcpu; + vmcap.captype = cap; + vmcap.capval = val; + + return (ioctl(ctx->fd, VM_SET_CAPABILITY, &vmcap)); +} + +int +vm_assign_pptdev(struct vmctx *ctx, int bus, int slot, int func) +{ + struct vm_pptdev pptdev; + + bzero(&pptdev, sizeof(pptdev)); + pptdev.bus = bus; + pptdev.slot = slot; + pptdev.func = func; + + return (ioctl(ctx->fd, VM_BIND_PPTDEV, &pptdev)); +} + +int +vm_unassign_pptdev(struct vmctx *ctx, int bus, int slot, int func) +{ + struct vm_pptdev pptdev; + + bzero(&pptdev, sizeof(pptdev)); + pptdev.bus = bus; + pptdev.slot = slot; + pptdev.func = func; + + return (ioctl(ctx->fd, VM_UNBIND_PPTDEV, &pptdev)); +} + +int +vm_map_pptdev_mmio(struct vmctx *ctx, int bus, int slot, int func, + vm_paddr_t gpa, size_t len, vm_paddr_t hpa) +{ + struct vm_pptdev_mmio pptmmio; + + bzero(&pptmmio, sizeof(pptmmio)); + pptmmio.bus = bus; + pptmmio.slot = slot; + pptmmio.func = func; + pptmmio.gpa = gpa; + pptmmio.len = len; + pptmmio.hpa = hpa; + + return (ioctl(ctx->fd, VM_MAP_PPTDEV_MMIO, &pptmmio)); +} + +int +vm_setup_msi(struct vmctx *ctx, int vcpu, int bus, int slot, int func, + int destcpu, int vector, int numvec) +{ + struct vm_pptdev_msi pptmsi; + + bzero(&pptmsi, sizeof(pptmsi)); + pptmsi.vcpu = vcpu; + pptmsi.bus = bus; + pptmsi.slot = slot; + pptmsi.func = func; + pptmsi.destcpu = destcpu; + pptmsi.vector = vector; + pptmsi.numvec = numvec; + + return (ioctl(ctx->fd, VM_PPTDEV_MSI, &pptmsi)); +} + +uint64_t * +vm_get_stats(struct vmctx *ctx, int vcpu, struct timeval *ret_tv, + int *ret_entries) +{ + int error; + + static struct vm_stats vmstats; + + vmstats.cpuid = vcpu; + + error = ioctl(ctx->fd, VM_STATS, &vmstats); + if (error == 0) { + if (ret_entries) + *ret_entries = vmstats.num_entries; + if (ret_tv) + *ret_tv = vmstats.tv; + return (vmstats.statbuf); + } else + return (NULL); +} + +const char * +vm_get_stat_desc(struct vmctx *ctx, int index) +{ + static struct vm_stat_desc statdesc; + + statdesc.index = index; + if (ioctl(ctx->fd, VM_STAT_DESC, &statdesc) == 0) + return (statdesc.desc); + else + return (NULL); +} + +/* + * From Intel Vol 3a: + * Table 9-1. IA-32 Processor States Following Power-up, Reset or INIT + */ +int +vcpu_reset(struct vmctx *vmctx, int vcpu) +{ + int error; + uint64_t rflags, rip, cr0, cr4, zero, desc_base, rdx; + uint32_t desc_access, desc_limit; + uint16_t sel; + + zero = 0; + + rflags = 0x2; + error = vm_set_register(vmctx, vcpu, VM_REG_GUEST_RFLAGS, rflags); + if (error) + goto done; + + rip = 0xfff0; + if ((error = vm_set_register(vmctx, vcpu, VM_REG_GUEST_RIP, rip)) != 0) + goto done; + + cr0 = CR0_NE; + if ((error = vm_set_register(vmctx, vcpu, VM_REG_GUEST_CR0, cr0)) != 0) + goto done; + + if ((error = vm_set_register(vmctx, vcpu, VM_REG_GUEST_CR3, zero)) != 0) + goto done; + + cr4 = CR4_VMXE; + if ((error = vm_set_register(vmctx, vcpu, VM_REG_GUEST_CR4, cr4)) != 0) + goto done; + + /* + * CS: present, r/w, accessed, 16-bit, byte granularity, usable + */ + desc_base = 0xffff0000; + desc_limit = 0xffff; + desc_access = 0x0093; + error = vm_set_desc(vmctx, vcpu, VM_REG_GUEST_CS, + desc_base, desc_limit, desc_access); + if (error) + goto done; + + sel = 0xf000; + if ((error = vm_set_register(vmctx, vcpu, VM_REG_GUEST_CS, sel)) != 0) + goto done; + + /* + * SS,DS,ES,FS,GS: present, r/w, accessed, 16-bit, byte granularity + */ + desc_base = 0; + desc_limit = 0xffff; + desc_access = 0x0093; + error = vm_set_desc(vmctx, vcpu, VM_REG_GUEST_SS, + desc_base, desc_limit, desc_access); + if (error) + goto done; + + error = vm_set_desc(vmctx, vcpu, VM_REG_GUEST_DS, + desc_base, desc_limit, desc_access); + if (error) + goto done; + + error = vm_set_desc(vmctx, vcpu, VM_REG_GUEST_ES, + desc_base, desc_limit, desc_access); + if (error) + goto done; + + error = vm_set_desc(vmctx, vcpu, VM_REG_GUEST_FS, + desc_base, desc_limit, desc_access); + if (error) + goto done; + + error = vm_set_desc(vmctx, vcpu, VM_REG_GUEST_GS, + desc_base, desc_limit, desc_access); + if (error) + goto done; + + sel = 0; + if ((error = vm_set_register(vmctx, vcpu, VM_REG_GUEST_SS, sel)) != 0) + goto done; + if ((error = vm_set_register(vmctx, vcpu, VM_REG_GUEST_DS, sel)) != 0) + goto done; + if ((error = vm_set_register(vmctx, vcpu, VM_REG_GUEST_ES, sel)) != 0) + goto done; + if ((error = vm_set_register(vmctx, vcpu, VM_REG_GUEST_FS, sel)) != 0) + goto done; + if ((error = vm_set_register(vmctx, vcpu, VM_REG_GUEST_GS, sel)) != 0) + goto done; + + /* General purpose registers */ + rdx = 0xf00; + if ((error = vm_set_register(vmctx, vcpu, VM_REG_GUEST_RAX, zero)) != 0) + goto done; + if ((error = vm_set_register(vmctx, vcpu, VM_REG_GUEST_RBX, zero)) != 0) + goto done; + if ((error = vm_set_register(vmctx, vcpu, VM_REG_GUEST_RCX, zero)) != 0) + goto done; + if ((error = vm_set_register(vmctx, vcpu, VM_REG_GUEST_RDX, rdx)) != 0) + goto done; + if ((error = vm_set_register(vmctx, vcpu, VM_REG_GUEST_RSI, zero)) != 0) + goto done; + if ((error = vm_set_register(vmctx, vcpu, VM_REG_GUEST_RDI, zero)) != 0) + goto done; + if ((error = vm_set_register(vmctx, vcpu, VM_REG_GUEST_RBP, zero)) != 0) + goto done; + if ((error = vm_set_register(vmctx, vcpu, VM_REG_GUEST_RSP, zero)) != 0) + goto done; + + /* GDTR, IDTR */ + desc_base = 0; + desc_limit = 0xffff; + desc_access = 0; + error = vm_set_desc(vmctx, vcpu, VM_REG_GUEST_GDTR, + desc_base, desc_limit, desc_access); + if (error != 0) + goto done; + + error = vm_set_desc(vmctx, vcpu, VM_REG_GUEST_IDTR, + desc_base, desc_limit, desc_access); + if (error != 0) + goto done; + + /* TR */ + desc_base = 0; + desc_limit = 0xffff; + desc_access = 0x0000008b; + error = vm_set_desc(vmctx, vcpu, VM_REG_GUEST_TR, 0, 0, desc_access); + if (error) + goto done; + + sel = 0; + if ((error = vm_set_register(vmctx, vcpu, VM_REG_GUEST_TR, sel)) != 0) + goto done; + + /* LDTR */ + desc_base = 0; + desc_limit = 0xffff; + desc_access = 0x00000082; + error = vm_set_desc(vmctx, vcpu, VM_REG_GUEST_LDTR, desc_base, + desc_limit, desc_access); + if (error) + goto done; + + sel = 0; + if ((error = vm_set_register(vmctx, vcpu, VM_REG_GUEST_LDTR, 0)) != 0) + goto done; + + /* XXX cr2, debug registers */ + + error = 0; +done: + return (error); +} diff --git a/lib/libvmmapi/vmmapi.h b/lib/libvmmapi/vmmapi.h new file mode 100644 index 000000000000..38533a894b17 --- /dev/null +++ b/lib/libvmmapi/vmmapi.h @@ -0,0 +1,98 @@ +/*- + * Copyright (c) 2011 NetApp, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#ifndef _VMMAPI_H_ +#define _VMMAPI_H_ + +struct vmctx; + +int vm_create(const char *name); +struct vmctx *vm_open(const char *name); +void vm_destroy(struct vmctx *ctx); +int vm_get_memory_seg(struct vmctx *ctx, vm_paddr_t gpa, + vm_paddr_t *ret_hpa, size_t *ret_len); +/* + * Create a memory segment of 'len' bytes in the guest physical address space + * at offset 'gpa'. + * + * If 'mapaddr' is not NULL then this region is mmap'ed into the address + * space of the calling process. If there is an mmap error then *mapaddr + * will be set to MAP_FAILED. + */ + +int vm_setup_memory(struct vmctx *ctx, vm_paddr_t gpa, size_t len, + char **mapaddr); +char * vm_map_memory(struct vmctx *ctx, vm_paddr_t gpa, size_t len); +int vm_set_desc(struct vmctx *ctx, int vcpu, int reg, + uint64_t base, uint32_t limit, uint32_t access); +int vm_get_desc(struct vmctx *ctx, int vcpu, int reg, + uint64_t *base, uint32_t *limit, uint32_t *access); +int vm_set_register(struct vmctx *ctx, int vcpu, int reg, uint64_t val); +int vm_get_register(struct vmctx *ctx, int vcpu, int reg, uint64_t *retval); +int vm_get_pinning(struct vmctx *ctx, int vcpu, int *host_cpuid); +int vm_set_pinning(struct vmctx *ctx, int vcpu, int host_cpuid); +int vm_run(struct vmctx *ctx, int vcpu, uint64_t rip, + struct vm_exit *ret_vmexit); +int vm_build_tables(struct vmctx *ctxt, int ncpus, void *oemtbl, + int oemtblsz); +int vm_inject_event(struct vmctx *ctx, int vcpu, enum vm_event_type type, + int vector); +int vm_inject_event2(struct vmctx *ctx, int vcpu, enum vm_event_type type, + int vector, int error_code); +int vm_lapic_irq(struct vmctx *ctx, int vcpu, int vector); +int vm_inject_nmi(struct vmctx *ctx, int vcpu); +int vm_capability_name2type(const char *capname); +int vm_get_capability(struct vmctx *ctx, int vcpu, enum vm_cap_type cap, + int *retval); +int vm_set_capability(struct vmctx *ctx, int vcpu, enum vm_cap_type cap, + int val); +int vm_assign_pptdev(struct vmctx *ctx, int bus, int slot, int func); +int vm_unassign_pptdev(struct vmctx *ctx, int bus, int slot, int func); +int vm_map_pptdev_mmio(struct vmctx *ctx, int bus, int slot, int func, + vm_paddr_t gpa, size_t len, vm_paddr_t hpa); +int vm_setup_msi(struct vmctx *ctx, int vcpu, int bus, int slot, int func, + int dest, int vector, int numvec); + +/* + * Return a pointer to the statistics buffer. Note that this is not MT-safe. + */ +uint64_t *vm_get_stats(struct vmctx *ctx, int vcpu, struct timeval *ret_tv, + int *ret_entries); +const char *vm_get_stat_desc(struct vmctx *ctx, int index); + +/* Reset vcpu register state */ +int vcpu_reset(struct vmctx *ctx, int vcpu); + +/* + * FreeBSD specific APIs + */ +int vm_setup_freebsd_registers(struct vmctx *ctx, int vcpu, + uint64_t rip, uint64_t cr3, uint64_t gdtbase, + uint64_t rsp); +void vm_setup_freebsd_gdt(uint64_t *gdtr); +#endif /* _VMMAPI_H_ */ diff --git a/lib/libvmmapi/vmmapi_freebsd.c b/lib/libvmmapi/vmmapi_freebsd.c new file mode 100644 index 000000000000..c4ad9898ede1 --- /dev/null +++ b/lib/libvmmapi/vmmapi_freebsd.c @@ -0,0 +1,187 @@ +/*- + * Copyright (c) 2011 NetApp, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#include +__FBSDID("$FreeBSD$"); + +#include + +#include +#include +#include + +#include "vmmapi.h" + +#ifndef CR4_VMXE +#define CR4_VMXE (1UL << 13) +#endif + +#define DESC_UNUSABLE 0x00010000 + +#define GUEST_NULL_SEL 0 +#define GUEST_CODE_SEL 1 +#define GUEST_DATA_SEL 2 +#define GUEST_GDTR_LIMIT (3 * 8 - 1) + +void +vm_setup_freebsd_gdt(uint64_t *gdtr) +{ + gdtr[GUEST_NULL_SEL] = 0; + gdtr[GUEST_CODE_SEL] = 0x0020980000000000; + gdtr[GUEST_DATA_SEL] = 0x0000900000000000; +} + +/* + * Setup the 'vcpu' register set such that it will begin execution at + * 'rip' in long mode. + */ +int +vm_setup_freebsd_registers(struct vmctx *vmctx, int vcpu, + uint64_t rip, uint64_t cr3, uint64_t gdtbase, + uint64_t rsp) +{ + int error; + uint64_t cr0, cr4, efer, rflags, desc_base; + uint32_t desc_access, desc_limit; + uint16_t gsel; + + cr0 = CR0_PE | CR0_PG | CR0_NE; + if ((error = vm_set_register(vmctx, vcpu, VM_REG_GUEST_CR0, cr0)) != 0) + goto done; + + cr4 = CR4_PAE | CR4_VMXE; + if ((error = vm_set_register(vmctx, vcpu, VM_REG_GUEST_CR4, cr4)) != 0) + goto done; + + efer = EFER_LME | EFER_LMA; + if ((error = vm_set_register(vmctx, vcpu, VM_REG_GUEST_EFER, efer))) + goto done; + + rflags = 0x2; + error = vm_set_register(vmctx, vcpu, VM_REG_GUEST_RFLAGS, rflags); + if (error) + goto done; + + desc_base = 0; + desc_limit = 0; + desc_access = 0x0000209B; + error = vm_set_desc(vmctx, vcpu, VM_REG_GUEST_CS, + desc_base, desc_limit, desc_access); + if (error) + goto done; + + desc_access = 0x00000093; + error = vm_set_desc(vmctx, vcpu, VM_REG_GUEST_DS, + desc_base, desc_limit, desc_access); + if (error) + goto done; + + error = vm_set_desc(vmctx, vcpu, VM_REG_GUEST_ES, + desc_base, desc_limit, desc_access); + if (error) + goto done; + + error = vm_set_desc(vmctx, vcpu, VM_REG_GUEST_FS, + desc_base, desc_limit, desc_access); + if (error) + goto done; + + error = vm_set_desc(vmctx, vcpu, VM_REG_GUEST_GS, + desc_base, desc_limit, desc_access); + if (error) + goto done; + + error = vm_set_desc(vmctx, vcpu, VM_REG_GUEST_SS, + desc_base, desc_limit, desc_access); + if (error) + goto done; + + /* + * XXX TR is pointing to null selector even though we set the + * TSS segment to be usable with a base address and limit of 0. + */ + desc_access = 0x0000008b; + error = vm_set_desc(vmctx, vcpu, VM_REG_GUEST_TR, 0, 0, desc_access); + if (error) + goto done; + + error = vm_set_desc(vmctx, vcpu, VM_REG_GUEST_LDTR, 0, 0, + DESC_UNUSABLE); + if (error) + goto done; + + gsel = GSEL(GUEST_CODE_SEL, SEL_KPL); + if ((error = vm_set_register(vmctx, vcpu, VM_REG_GUEST_CS, gsel)) != 0) + goto done; + + gsel = GSEL(GUEST_DATA_SEL, SEL_KPL); + if ((error = vm_set_register(vmctx, vcpu, VM_REG_GUEST_DS, gsel)) != 0) + goto done; + + if ((error = vm_set_register(vmctx, vcpu, VM_REG_GUEST_ES, gsel)) != 0) + goto done; + + if ((error = vm_set_register(vmctx, vcpu, VM_REG_GUEST_FS, gsel)) != 0) + goto done; + + if ((error = vm_set_register(vmctx, vcpu, VM_REG_GUEST_GS, gsel)) != 0) + goto done; + + if ((error = vm_set_register(vmctx, vcpu, VM_REG_GUEST_SS, gsel)) != 0) + goto done; + + /* XXX TR is pointing to the null selector */ + if ((error = vm_set_register(vmctx, vcpu, VM_REG_GUEST_TR, 0)) != 0) + goto done; + + /* LDTR is pointing to the null selector */ + if ((error = vm_set_register(vmctx, vcpu, VM_REG_GUEST_LDTR, 0)) != 0) + goto done; + + /* entry point */ + if ((error = vm_set_register(vmctx, vcpu, VM_REG_GUEST_RIP, rip)) != 0) + goto done; + + /* page table base */ + if ((error = vm_set_register(vmctx, vcpu, VM_REG_GUEST_CR3, cr3)) != 0) + goto done; + + desc_base = gdtbase; + desc_limit = GUEST_GDTR_LIMIT; + error = vm_set_desc(vmctx, vcpu, VM_REG_GUEST_GDTR, + desc_base, desc_limit, 0); + if (error != 0) + goto done; + + if ((error = vm_set_register(vmctx, vcpu, VM_REG_GUEST_RSP, rsp)) != 0) + goto done; + + error = 0; +done: + return (error); +} diff --git a/share/mk/bsd.libnames.mk b/share/mk/bsd.libnames.mk index 9a948ff4bbef..f3388930c01d 100644 --- a/share/mk/bsd.libnames.mk +++ b/share/mk/bsd.libnames.mk @@ -160,6 +160,7 @@ LIBULOG?= ${DESTDIR}${LIBDIR}/libulog.a LIBUTIL?= ${DESTDIR}${LIBDIR}/libutil.a LIBUUTIL?= ${DESTDIR}${LIBDIR}/libuutil.a LIBVGL?= ${DESTDIR}${LIBDIR}/libvgl.a +LIBVMMAPI?= ${DESTDIR}${LIBDIR}/libvmmapi.a LIBWRAP?= ${DESTDIR}${LIBDIR}/libwrap.a LIBXPG4?= ${DESTDIR}${LIBDIR}/libxpg4.a LIBY?= ${DESTDIR}${LIBDIR}/liby.a diff --git a/usr.sbin/Makefile.amd64 b/usr.sbin/Makefile.amd64 index 232831b35c8c..811202b72d95 100644 --- a/usr.sbin/Makefile.amd64 +++ b/usr.sbin/Makefile.amd64 @@ -10,6 +10,7 @@ SUBDIR+= acpi SUBDIR+= apm .endif SUBDIR+= asf +SUBDIR+= bhyve SUBDIR+= boot0cfg .if ${MK_TOOLCHAIN} != "no" SUBDIR+= btxld @@ -30,4 +31,5 @@ SUBDIR+= spkrtest .if ${MK_SYSINSTALL} != "no" SUBDIR+= sade .endif +SUBDIR+= vmmctl SUBDIR+= zzz diff --git a/usr.sbin/bhyve/Makefile b/usr.sbin/bhyve/Makefile new file mode 100644 index 000000000000..b0398ed66d4a --- /dev/null +++ b/usr.sbin/bhyve/Makefile @@ -0,0 +1,20 @@ +# +# $FreeBSD$ +# + +PROG= bhyve + +SRCS= atpic.c consport.c dbgport.c elcr.c fbsdrun.c inout.c mevent.c +SRCS+= pci_emul.c pci_hostbridge.c pci_passthru.c pci_virtio_block.c +SRCS+= pci_virtio_net.c pit_8254.c post.c rtc.c uart.c xmsr.c + +NO_MAN= + +DPADD= ${LIBVMMAPI} ${LIBMD} ${LIBPTHREAD} +LDADD= -lvmmapi -lmd -lpthread + +WARNS?= 2 + +CFLAGS+= -I${.CURDIR}/../../sys + +.include diff --git a/usr.sbin/bhyve/atpic.c b/usr.sbin/bhyve/atpic.c new file mode 100644 index 000000000000..a9fb0842c358 --- /dev/null +++ b/usr.sbin/bhyve/atpic.c @@ -0,0 +1,68 @@ +/*- + * Copyright (c) 2011 NetApp, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#include +__FBSDID("$FreeBSD$"); + +#include + +#include +#include +#include + +#include "inout.h" + +/* + * FreeBSD only writes to the 8259 interrupt controllers to put them in a + * shutdown state. + * + * So, we just ignore the writes. + */ + +#define IO_ICU1 0x20 +#define IO_ICU2 0xA0 +#define ICU_IMR_OFFSET 1 + +static int +atpic_handler(struct vmctx *ctx, int vcpu, int in, int port, int bytes, + uint32_t *eax, void *arg) +{ + if (bytes != 1) + return (-1); + + if (in) + return (-1); + + /* Pretend all writes to the 8259 are alright */ + return (0); +} + +INOUT_PORT(atpic, IO_ICU1, IOPORT_F_INOUT, atpic_handler); +INOUT_PORT(atpic, IO_ICU1 + ICU_IMR_OFFSET, IOPORT_F_INOUT, atpic_handler); +INOUT_PORT(atpic, IO_ICU2, IOPORT_F_INOUT, atpic_handler); +INOUT_PORT(atpic, IO_ICU2 + ICU_IMR_OFFSET, IOPORT_F_INOUT, atpic_handler); diff --git a/usr.sbin/bhyve/consport.c b/usr.sbin/bhyve/consport.c new file mode 100644 index 000000000000..34f94a6cc775 --- /dev/null +++ b/usr.sbin/bhyve/consport.c @@ -0,0 +1,121 @@ +/*- + * Copyright (c) 2011 NetApp, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#include +__FBSDID("$FreeBSD$"); + +#include +#include + +#include +#include +#include +#include +#include + +#include "inout.h" + +#define BVM_CONSOLE_PORT 0x220 + +static struct termios tio_orig, tio_new; + +static void +ttyclose(void) +{ + tcsetattr(STDIN_FILENO, TCSANOW, &tio_orig); +} + +static void +ttyopen(void) +{ + tcgetattr(STDIN_FILENO, &tio_orig); + + cfmakeraw(&tio_new); + tcsetattr(STDIN_FILENO, TCSANOW, &tio_new); + + atexit(ttyclose); +} + +static bool +tty_char_available(void) +{ + fd_set rfds; + struct timeval tv; + + FD_ZERO(&rfds); + FD_SET(STDIN_FILENO, &rfds); + tv.tv_sec = 0; + tv.tv_usec = 0; + if (select(STDIN_FILENO + 1, &rfds, NULL, NULL, &tv) > 0) { + return (true); + } else { + return (false); + } +} + +static int +ttyread(void) +{ + char rb; + + if (tty_char_available()) { + read(STDIN_FILENO, &rb, 1); + return (rb & 0xff); + } else { + return (-1); + } +} + +static void +ttywrite(unsigned char wb) +{ + (void) write(STDOUT_FILENO, &wb, 1); +} + +static int +console_handler(struct vmctx *ctx, int vcpu, int in, int port, int bytes, + uint32_t *eax, void *arg) +{ + static int opened; + + if (bytes != 4) + return (-1); + + if (!opened) { + ttyopen(); + opened = 1; + } + + if (in) + *eax = ttyread(); + else + ttywrite(*eax); + + return (0); +} +INOUT_PORT(console, BVM_CONSOLE_PORT, IOPORT_F_INOUT, console_handler); diff --git a/usr.sbin/bhyve/dbgport.c b/usr.sbin/bhyve/dbgport.c new file mode 100644 index 000000000000..55efdadf9150 --- /dev/null +++ b/usr.sbin/bhyve/dbgport.c @@ -0,0 +1,125 @@ +/*- + * Copyright (c) 2011 NetApp, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#include +__FBSDID("$FreeBSD$"); + +#include +#include +#include +#include + +#include +#include +#include +#include +#include + +#include "inout.h" +#include "dbgport.h" + +#define BVM_DBG_PORT 0x224 + +static int listen_fd, conn_fd; + +static struct sockaddr_in sin; + +void +init_dbgport(int sport) +{ + conn_fd = -1; + + if ((listen_fd = socket(AF_INET, SOCK_STREAM, 0)) < 0) { + perror("socket"); + exit(1); + } + + sin.sin_len = sizeof(sin); + sin.sin_family = AF_INET; + sin.sin_addr.s_addr = htonl(INADDR_ANY); + sin.sin_port = htons(sport); + + if (bind(listen_fd, (struct sockaddr *)&sin, sizeof(sin)) < 0) { + perror("bind"); + exit(1); + } + + if (listen(listen_fd, 1) < 0) { + perror("listen"); + exit(1); + } +} + +static int +dbg_handler(struct vmctx *ctx, int vcpu, int in, int port, int bytes, + uint32_t *eax, void *arg) +{ + char ch; + int nwritten, nread, printonce; + + if (bytes != 4) + return (-1); + +again: + printonce = 0; + while (conn_fd < 0) { + if (!printonce) { + printf("Waiting for connection from gdb\r\n"); + printonce = 1; + } + conn_fd = accept(listen_fd, NULL, NULL); + if (conn_fd >= 0) + fcntl(conn_fd, F_SETFL, O_NONBLOCK); + else if (errno != EINTR) + perror("accept"); + } + + if (in) { + nread = read(conn_fd, &ch, 1); + if (nread == -1 && errno == EAGAIN) + *eax = -1; + else if (nread == 1) + *eax = ch; + else { + close(conn_fd); + conn_fd = -1; + goto again; + } + } else { + ch = *eax; + nwritten = write(conn_fd, &ch, 1); + if (nwritten != 1) { + close(conn_fd); + conn_fd = -1; + goto again; + } + } + return (0); +} + +INOUT_PORT(dbg, BVM_DBG_PORT, IOPORT_F_INOUT, dbg_handler); diff --git a/usr.sbin/bhyve/dbgport.h b/usr.sbin/bhyve/dbgport.h new file mode 100644 index 000000000000..8c7dab7f83cc --- /dev/null +++ b/usr.sbin/bhyve/dbgport.h @@ -0,0 +1,36 @@ +/*- + * Copyright (c) 2011 NetApp, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#ifndef _DBGPORT_H_ +#define _DBGPORT_H_ + +#define DEFAULT_GDB_PORT 6466 + +void init_dbgport(int port); + +#endif diff --git a/usr.sbin/bhyve/elcr.c b/usr.sbin/bhyve/elcr.c new file mode 100644 index 000000000000..2417ae1bb276 --- /dev/null +++ b/usr.sbin/bhyve/elcr.c @@ -0,0 +1,65 @@ +/*- + * Copyright (c) 2011 NetApp, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#include +__FBSDID("$FreeBSD$"); + +#include + +#include "inout.h" + +/* + * EISA interrupt Level Control Register. + * + * This is a 16-bit register with one bit for each of the IRQ0 through IRQ15. + * A level triggered irq is indicated by setting the corresponding bit to '1'. + */ +#define ELCR_PORT 0x4d0 + +static uint8_t elcr[2] = { 0x00, 0x00 }; + +static int +elcr_handler(struct vmctx *ctx, int vcpu, int in, int port, int bytes, + uint32_t *eax, void *arg) +{ + int idx; + + if (bytes != 1) + return (-1); + + idx = port - ELCR_PORT; + + if (in) + *eax = elcr[idx]; + else + elcr[idx] = *eax; + + return (0); +} +INOUT_PORT(elcr, ELCR_PORT + 0, IOPORT_F_INOUT, elcr_handler); +INOUT_PORT(elcr, ELCR_PORT + 1, IOPORT_F_INOUT, elcr_handler); diff --git a/usr.sbin/bhyve/fbsdrun.c b/usr.sbin/bhyve/fbsdrun.c new file mode 100644 index 000000000000..62349583f686 --- /dev/null +++ b/usr.sbin/bhyve/fbsdrun.c @@ -0,0 +1,650 @@ +/*- + * Copyright (c) 2011 NetApp, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#include +__FBSDID("$FreeBSD$"); + +#include +#include +#include + +#include + +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#include "fbsdrun.h" +#include "inout.h" +#include "dbgport.h" +#include "mevent.h" +#include "pci_emul.h" +#include "xmsr.h" + +#define DEFAULT_GUEST_HZ 100 +#define DEFAULT_GUEST_TSLICE 200 + +#define GUEST_NIO_PORT 0x488 /* guest upcalls via i/o port */ + +#define VMEXIT_SWITCH 0 /* force vcpu switch in mux mode */ +#define VMEXIT_CONTINUE 1 /* continue from next instruction */ +#define VMEXIT_RESTART 2 /* restart current instruction */ +#define VMEXIT_ABORT 3 /* abort the vm run loop */ +#define VMEXIT_RESET 4 /* guest machine has reset */ + +#define MB (1024UL * 1024) +#define GB (1024UL * MB) + +typedef int (*vmexit_handler_t)(struct vmctx *, struct vm_exit *, int *vcpu); + +int guest_tslice = DEFAULT_GUEST_TSLICE; +int guest_hz = DEFAULT_GUEST_HZ; +char *vmname; + +u_long lomem_sz; +u_long himem_sz; + +int guest_ncpus; + +static int pincpu = -1; +static int guest_vcpu_mux; +static int guest_vmexit_on_hlt, guest_vmexit_on_pause; + +static int foundcpus; + +static char *lomem_addr; +static char *himem_addr; + +static char *progname; +static const int BSP = 0; + +static int cpumask; + +static void *oem_tbl_start; +static int oem_tbl_size; + +static void vm_loop(struct vmctx *ctx, int vcpu, uint64_t rip); + +struct vm_exit vmexit[VM_MAXCPU]; + +struct fbsdstats { + uint64_t vmexit_bogus; + uint64_t vmexit_bogus_switch; + uint64_t vmexit_hlt; + uint64_t vmexit_pause; + uint64_t vmexit_mtrap; + uint64_t cpu_switch_rotate; + uint64_t cpu_switch_direct; + int io_reset; +} stats; + +struct mt_vmm_info { + pthread_t mt_thr; + struct vmctx *mt_ctx; + int mt_vcpu; +} mt_vmm_info[VM_MAXCPU]; + +static void +usage(int code) +{ + + fprintf(stderr, + "Usage: %s [-hBHP][-g ][-z ][-s ][-p pincpu]" + "[-n ][-m lowmem][-M highmem] \n" + " -g: gdb port (default is %d and 0 means don't open)\n" + " -c: # cpus (default 1)\n" + " -p: pin vcpu 'n' to host cpu 'pincpu + n'\n" + " -B: inject breakpoint exception on vm entry\n" + " -H: vmexit from the guest on hlt\n" + " -P: vmexit from the guest on pause\n" + " -h: help\n" + " -z: guest hz (default is %d)\n" + " -s: PCI slot config\n" + " -n: PCI slot naming\n" + " -m: lowmem in MB\n" + " -M: highmem in MB\n" + " -x: mux vcpus to 1 hcpu\n" + " -t: mux vcpu timeslice hz (default %d)\n", + progname, DEFAULT_GDB_PORT, DEFAULT_GUEST_HZ, + DEFAULT_GUEST_TSLICE); + exit(code); +} + +void * +paddr_guest2host(uintptr_t gaddr) +{ + if (lomem_sz == 0) + return (NULL); + + if (gaddr < lomem_sz) { + return ((void *)(lomem_addr + gaddr)); + } else if (gaddr >= 4*GB && gaddr < (4*GB + himem_sz)) { + return ((void *)(himem_addr + gaddr - 4*GB)); + } else + return (NULL); +} + +void +fbsdrun_add_oemtbl(void *tbl, int tblsz) +{ + oem_tbl_start = tbl; + oem_tbl_size = tblsz; +} + +int +fbsdrun_vmexit_on_pause(void) +{ + + return (guest_vmexit_on_pause); +} + +int +fbsdrun_vmexit_on_hlt(void) +{ + + return (guest_vmexit_on_hlt); +} + +int +fbsdrun_muxed(void) +{ + + return (guest_vcpu_mux); +} + +static void * +fbsdrun_start_thread(void *param) +{ + int vcpu; + struct mt_vmm_info *mtp = param; + + vcpu = mtp->mt_vcpu; + vm_loop(mtp->mt_ctx, vcpu, vmexit[vcpu].rip); + + /* not reached */ + exit(1); + return (NULL); +} + +void +fbsdrun_addcpu(struct vmctx *ctx, int vcpu, uint64_t rip) +{ + int error; + + if (cpumask & (1 << vcpu)) { + printf("addcpu: attempting to add existing cpu %d\n", vcpu); + exit(1); + } + + cpumask |= 1 << vcpu; + foundcpus++; + + /* + * Set up the vmexit struct to allow execution to start + * at the given RIP + */ + vmexit[vcpu].rip = rip; + vmexit[vcpu].inst_length = 0; + + if (vcpu == BSP || !guest_vcpu_mux){ + mt_vmm_info[vcpu].mt_ctx = ctx; + mt_vmm_info[vcpu].mt_vcpu = vcpu; + + error = pthread_create(&mt_vmm_info[vcpu].mt_thr, NULL, + fbsdrun_start_thread, &mt_vmm_info[vcpu]); + assert(error == 0); + } +} + +static int +fbsdrun_get_next_cpu(int curcpu) +{ + + /* + * Get the next available CPU. Assumes they arrive + * in ascending order with no gaps. + */ + return ((curcpu + 1) % foundcpus); +} + +static int +vmexit_catch_reset(void) +{ + stats.io_reset++; + return (VMEXIT_RESET); +} + +static int +vmexit_catch_inout(void) +{ + return (VMEXIT_ABORT); +} + +static int +vmexit_handle_notify(struct vmctx *ctx, struct vm_exit *vme, int *pvcpu, + uint32_t eax) +{ +#if PG_DEBUG /* put all types of debug here */ + if (eax == 0) { + pause_noswitch = 1; + } else if (eax == 1) { + pause_noswitch = 0; + } else { + pause_noswitch = 0; + if (eax == 5) { + vm_set_capability(ctx, *pvcpu, VM_CAP_MTRAP_EXIT, 1); + } + } +#endif + return (VMEXIT_CONTINUE); +} + +static int +vmexit_inout(struct vmctx *ctx, struct vm_exit *vme, int *pvcpu) +{ + int error; + int bytes, port, in, out; + uint32_t eax; + int vcpu; + + vcpu = *pvcpu; + + port = vme->u.inout.port; + bytes = vme->u.inout.bytes; + eax = vme->u.inout.eax; + in = vme->u.inout.in; + out = !in; + + /* We don't deal with these */ + if (vme->u.inout.string || vme->u.inout.rep) + return (VMEXIT_ABORT); + + /* Special case of guest reset */ + if (out && port == 0x64 && (uint8_t)eax == 0xFE) + return (vmexit_catch_reset()); + + /* Extra-special case of host notifications */ + if (out && port == GUEST_NIO_PORT) + return (vmexit_handle_notify(ctx, vme, pvcpu, eax)); + + error = emulate_inout(ctx, vcpu, in, port, bytes, &eax); + if (error == 0 && in) + error = vm_set_register(ctx, vcpu, VM_REG_GUEST_RAX, eax); + + if (error == 0) + return (VMEXIT_CONTINUE); + else { + fprintf(stderr, "Unhandled %s%c 0x%04x\n", + in ? "in" : "out", + bytes == 1 ? 'b' : (bytes == 2 ? 'w' : 'l'), port); + return (vmexit_catch_inout()); + } +} + +static int +vmexit_rdmsr(struct vmctx *ctx, struct vm_exit *vme, int *pvcpu) +{ + printf("vm exit rdmsr 0x%x, cpu %d\n", vme->u.msr.code, *pvcpu); + return (VMEXIT_ABORT); +} + +static int +vmexit_wrmsr(struct vmctx *ctx, struct vm_exit *vme, int *pvcpu) +{ + int newcpu; + int retval = VMEXIT_CONTINUE; + + newcpu = emulate_wrmsr(ctx, *pvcpu, vme->u.msr.code,vme->u.msr.wval); + + if (guest_vcpu_mux && *pvcpu != newcpu) { + retval = VMEXIT_SWITCH; + *pvcpu = newcpu; + } + + return (retval); +} + +static int +vmexit_vmx(struct vmctx *ctx, struct vm_exit *vmexit, int *pvcpu) +{ + + printf("vm exit[%d]\n", *pvcpu); + printf("\treason\t\tVMX\n"); + printf("\trip\t\t0x%016lx\n", vmexit->rip); + printf("\tinst_length\t%d\n", vmexit->inst_length); + printf("\terror\t\t%d\n", vmexit->u.vmx.error); + printf("\texit_reason\t%u\n", vmexit->u.vmx.exit_reason); + printf("\tqualification\t0x%016lx\n", vmexit->u.vmx.exit_qualification); + + return (VMEXIT_ABORT); +} + +static int bogus_noswitch = 1; + +static int +vmexit_bogus(struct vmctx *ctx, struct vm_exit *vmexit, int *pvcpu) +{ + stats.vmexit_bogus++; + + if (!guest_vcpu_mux || guest_ncpus == 1 || bogus_noswitch) { + return (VMEXIT_RESTART); + } else { + stats.vmexit_bogus_switch++; + vmexit->inst_length = 0; + *pvcpu = -1; + return (VMEXIT_SWITCH); + } +} + +static int +vmexit_hlt(struct vmctx *ctx, struct vm_exit *vmexit, int *pvcpu) +{ + stats.vmexit_hlt++; + if (fbsdrun_muxed()) { + *pvcpu = -1; + return (VMEXIT_SWITCH); + } else { + /* + * Just continue execution with the next instruction. We use + * the HLT VM exit as a way to be friendly with the host + * scheduler. + */ + return (VMEXIT_CONTINUE); + } +} + +static int pause_noswitch; + +static int +vmexit_pause(struct vmctx *ctx, struct vm_exit *vmexit, int *pvcpu) +{ + stats.vmexit_pause++; + + if (fbsdrun_muxed() && !pause_noswitch) { + *pvcpu = -1; + return (VMEXIT_SWITCH); + } else { + return (VMEXIT_CONTINUE); + } +} + +static int +vmexit_mtrap(struct vmctx *ctx, struct vm_exit *vmexit, int *pvcpu) +{ + stats.vmexit_mtrap++; + + return (VMEXIT_RESTART); +} + +static void +sigalrm(int sig) +{ + return; +} + +static void +setup_timeslice(void) +{ + struct sigaction sa; + struct itimerval itv; + int error; + + /* + * Setup a realtime timer to generate a SIGALRM at a + * frequency of 'guest_tslice' ticks per second. + */ + sigemptyset(&sa.sa_mask); + sa.sa_flags = 0; + sa.sa_handler = sigalrm; + + error = sigaction(SIGALRM, &sa, NULL); + assert(error == 0); + + itv.it_interval.tv_sec = 0; + itv.it_interval.tv_usec = 1000000 / guest_tslice; + itv.it_value.tv_sec = 0; + itv.it_value.tv_usec = 1000000 / guest_tslice; + + error = setitimer(ITIMER_REAL, &itv, NULL); + assert(error == 0); +} + +static vmexit_handler_t handler[VM_EXITCODE_MAX] = { + [VM_EXITCODE_INOUT] = vmexit_inout, + [VM_EXITCODE_VMX] = vmexit_vmx, + [VM_EXITCODE_BOGUS] = vmexit_bogus, + [VM_EXITCODE_RDMSR] = vmexit_rdmsr, + [VM_EXITCODE_WRMSR] = vmexit_wrmsr, + [VM_EXITCODE_MTRAP] = vmexit_mtrap, +}; + +static void +vm_loop(struct vmctx *ctx, int vcpu, uint64_t rip) +{ + int error, rc, prevcpu; + + if (guest_vcpu_mux) + setup_timeslice(); + + if (pincpu >= 0) { + error = vm_set_pinning(ctx, vcpu, pincpu + vcpu); + assert(error == 0); + } + + while (1) { + error = vm_run(ctx, vcpu, rip, &vmexit[vcpu]); + if (error != 0) + break; + + prevcpu = vcpu; + rc = (*handler[vmexit[vcpu].exitcode])(ctx, &vmexit[vcpu], + &vcpu); + switch (rc) { + case VMEXIT_SWITCH: + assert(guest_vcpu_mux); + if (vcpu == -1) { + stats.cpu_switch_rotate++; + vcpu = fbsdrun_get_next_cpu(prevcpu); + } else { + stats.cpu_switch_direct++; + } + /* fall through */ + case VMEXIT_CONTINUE: + rip = vmexit[vcpu].rip + vmexit[vcpu].inst_length; + break; + case VMEXIT_RESTART: + rip = vmexit[vcpu].rip; + break; + case VMEXIT_RESET: + exit(0); + default: + exit(1); + } + } + fprintf(stderr, "vm_run error %d, errno %d\n", error, errno); +} + + +int +main(int argc, char *argv[]) +{ + int c, error, gdb_port, inject_bkpt, tmp, err; + struct vmctx *ctx; + uint64_t rip; + + inject_bkpt = 0; + progname = basename(argv[0]); + gdb_port = DEFAULT_GDB_PORT; + guest_ncpus = 1; + + while ((c = getopt(argc, argv, "hBHPxp:g:c:z:s:n:m:M:")) != -1) { + switch (c) { + case 'B': + inject_bkpt = 1; + break; + case 'x': + guest_vcpu_mux = 1; + break; + case 'p': + pincpu = atoi(optarg); + break; + case 'c': + guest_ncpus = atoi(optarg); + break; + case 'g': + gdb_port = atoi(optarg); + break; + case 'z': + guest_hz = atoi(optarg); + break; + case 't': + guest_tslice = atoi(optarg); + break; + case 's': + pci_parse_slot(optarg); + break; + case 'n': + pci_parse_name(optarg); + break; + case 'm': + lomem_sz = strtoul(optarg, NULL, 0) * MB; + break; + case 'M': + himem_sz = strtoul(optarg, NULL, 0) * MB; + break; + case 'H': + guest_vmexit_on_hlt = 1; + break; + case 'P': + guest_vmexit_on_pause = 1; + break; + case 'h': + usage(0); + default: + usage(1); + } + } + argc -= optind; + argv += optind; + + if (argc != 1) + usage(1); + + /* No need to mux if guest is uni-processor */ + if (guest_ncpus <= 1) + guest_vcpu_mux = 0; + + /* vmexit on hlt if guest is muxed */ + if (guest_vcpu_mux) { + guest_vmexit_on_hlt = 1; + guest_vmexit_on_pause = 1; + } + + vmname = argv[0]; + + ctx = vm_open(vmname); + if (ctx == NULL) { + perror("vm_open"); + exit(1); + } + + if (fbsdrun_vmexit_on_hlt()) { + err = vm_get_capability(ctx, BSP, VM_CAP_HALT_EXIT, &tmp); + if (err < 0) { + printf("VM exit on HLT not supported\n"); + exit(1); + } + vm_set_capability(ctx, BSP, VM_CAP_HALT_EXIT, 1); + handler[VM_EXITCODE_HLT] = vmexit_hlt; + } + + if (fbsdrun_vmexit_on_pause()) { + /* + * pause exit support required for this mode + */ + err = vm_get_capability(ctx, BSP, VM_CAP_PAUSE_EXIT, &tmp); + if (err < 0) { + printf("SMP mux requested, no pause support\n"); + exit(1); + } + vm_set_capability(ctx, BSP, VM_CAP_PAUSE_EXIT, 1); + handler[VM_EXITCODE_PAUSE] = vmexit_pause; + } + + if (lomem_sz != 0) { + lomem_addr = vm_map_memory(ctx, 0, lomem_sz); + if (lomem_addr == (char *) MAP_FAILED) { + lomem_sz = 0; + } else if (himem_sz != 0) { + himem_addr = vm_map_memory(ctx, 4*GB, himem_sz); + if (himem_addr == (char *) MAP_FAILED) { + lomem_sz = 0; + himem_sz = 0; + } + } + } + + init_inout(); + init_pci(ctx); + + if (gdb_port != 0) + init_dbgport(gdb_port); + + error = vm_get_register(ctx, BSP, VM_REG_GUEST_RIP, &rip); + assert(error == 0); + + if (inject_bkpt) { + error = vm_inject_event(ctx, BSP, VM_HW_EXCEPTION, IDT_BP); + assert(error == 0); + } + + /* + * build the guest tables, MP etc. + */ + vm_build_tables(ctx, guest_ncpus, oem_tbl_start, oem_tbl_size); + + /* + * Add CPU 0 + */ + fbsdrun_addcpu(ctx, BSP, rip); + + /* + * Head off to the main event dispatch loop + */ + mevent_dispatch(); + + exit(1); +} diff --git a/usr.sbin/bhyve/fbsdrun.h b/usr.sbin/bhyve/fbsdrun.h new file mode 100644 index 000000000000..81061222a1a1 --- /dev/null +++ b/usr.sbin/bhyve/fbsdrun.h @@ -0,0 +1,53 @@ +/*- + * Copyright (c) 2011 NetApp, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#ifndef _FBSDRUN_H_ +#define _FBSDRUN_H_ + +#ifndef CTASSERT /* Allow lint to override */ +#define CTASSERT(x) _CTASSERT(x, __LINE__) +#define _CTASSERT(x, y) __CTASSERT(x, y) +#define __CTASSERT(x, y) typedef char __assert ## y[(x) ? 1 : -1] +#endif + +struct vmctx; +extern int guest_hz; +extern int guest_tslice; +extern int guest_ncpus; +extern char *vmname; + +extern u_long lomem_sz, himem_sz; + +void *paddr_guest2host(uintptr_t); + +void fbsdrun_addcpu(struct vmctx *ctx, int cpu, uint64_t rip); +void fbsdrun_add_oemtbl(void *tbl, int tblsz); +int fbsdrun_muxed(void); +int fbsdrun_vmexit_on_hlt(void); +int fbsdrun_vmexit_on_pause(void); +#endif diff --git a/usr.sbin/bhyve/inout.c b/usr.sbin/bhyve/inout.c new file mode 100644 index 000000000000..84445b1f0a99 --- /dev/null +++ b/usr.sbin/bhyve/inout.c @@ -0,0 +1,98 @@ +/*- + * Copyright (c) 2011 NetApp, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#include +__FBSDID("$FreeBSD$"); + +#include +#include + +#include +#include + +#include "inout.h" + +SET_DECLARE(inout_port_set, struct inout_port); + +#define MAX_IOPORTS (1 << 16) + +static struct { + const char *name; + int flags; + inout_func_t handler; + void *arg; +} inout_handlers[MAX_IOPORTS]; + +int +emulate_inout(struct vmctx *ctx, int vcpu, int in, int port, int bytes, + uint32_t *eax) +{ + int flags; + inout_func_t handler; + void *arg; + + assert(port < MAX_IOPORTS); + + if ((handler = inout_handlers[port].handler) == NULL) + return (-1); + + flags = inout_handlers[port].flags; + arg = inout_handlers[port].arg; + + if ((in && (flags & IOPORT_F_IN)) || (!in && (flags & IOPORT_F_OUT))) + return ((*handler)(ctx, vcpu, in, port, bytes, eax, arg)); + else + return (-1); +} + +void +init_inout(void) +{ + struct inout_port **iopp, *iop; + + SET_FOREACH(iopp, inout_port_set) { + iop = *iopp; + assert(iop->port < MAX_IOPORTS); + inout_handlers[iop->port].name = iop->name; + inout_handlers[iop->port].flags = iop->flags; + inout_handlers[iop->port].handler = iop->handler; + inout_handlers[iop->port].arg = NULL; + } +} + +int +register_inout(struct inout_port *iop) +{ + assert(iop->port < MAX_IOPORTS); + inout_handlers[iop->port].name = iop->name; + inout_handlers[iop->port].flags = iop->flags; + inout_handlers[iop->port].handler = iop->handler; + inout_handlers[iop->port].arg = iop->arg; + + return (0); +} diff --git a/usr.sbin/bhyve/inout.h b/usr.sbin/bhyve/inout.h new file mode 100644 index 000000000000..b15011d3f483 --- /dev/null +++ b/usr.sbin/bhyve/inout.h @@ -0,0 +1,65 @@ +/*- + * Copyright (c) 2011 NetApp, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#ifndef _INOUT_H_ +#define _INOUT_H_ + +#include + +struct vmctx; + +typedef int (*inout_func_t)(struct vmctx *ctx, int vcpu, int in, int port, + int bytes, uint32_t *eax, void *arg); + +struct inout_port { + const char *name; + int port; + int flags; + inout_func_t handler; + void *arg; +}; +#define IOPORT_F_IN 0x1 +#define IOPORT_F_OUT 0x2 +#define IOPORT_F_INOUT 0x3 + +#define INOUT_PORT(name, port, flags, handler) \ + static struct inout_port __CONCAT(__inout_port, __LINE__) = { \ + #name, \ + (port), \ + (flags), \ + (handler), \ + 0 \ + }; \ + DATA_SET(inout_port_set, __CONCAT(__inout_port, __LINE__)) + +void init_inout(void); +int emulate_inout(struct vmctx *, int vcpu, int in, int port, int bytes, + uint32_t *eax); +int register_inout(struct inout_port *iop); + +#endif /* _INOUT_H_ */ diff --git a/usr.sbin/bhyve/mevent.c b/usr.sbin/bhyve/mevent.c new file mode 100644 index 000000000000..0d3b2872e1b4 --- /dev/null +++ b/usr.sbin/bhyve/mevent.c @@ -0,0 +1,419 @@ +/*- + * Copyright (c) 2011 NetApp, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +/* + * Micro event library for FreeBSD, designed for a single i/o thread + * using kqueue, and having events be persistent by default. + */ + +#include +__FBSDID("$FreeBSD$"); + +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +#include + +#include "mevent.h" + +#define MEVENT_MAX 64 + +#define MEV_ENABLE 1 +#define MEV_DISABLE 2 +#define MEV_DEL_PENDING 3 + +static pthread_t mevent_tid; +static int mevent_pipefd[2]; +static pthread_mutex_t mevent_lmutex = PTHREAD_MUTEX_INITIALIZER; + +struct mevent { + void (*me_func)(int, enum ev_type, void *); + int me_fd; + enum ev_type me_type; + void *me_param; + int me_cq; + int me_state; + int me_closefd; + LIST_ENTRY(mevent) me_list; +}; + +static LIST_HEAD(listhead, mevent) global_head, change_head; + +static void +mevent_qlock(void) +{ + pthread_mutex_lock(&mevent_lmutex); +} + +static void +mevent_qunlock(void) +{ + pthread_mutex_unlock(&mevent_lmutex); +} + +static void +mevent_pipe_read(int fd, enum ev_type type, void *param) +{ + char buf[MEVENT_MAX]; + int status; + + /* + * Drain the pipe read side. The fd is non-blocking so this is + * safe to do. + */ + do { + status = read(fd, buf, sizeof(buf)); + } while (status == MEVENT_MAX); +} + +static void +mevent_notify(void) +{ + char c; + + /* + * If calling from outside the i/o thread, write a byte on the + * pipe to force the i/o thread to exit the blocking kevent call. + */ + if (mevent_pipefd[1] != 0 && pthread_self() != mevent_tid) { + write(mevent_pipefd[1], &c, 1); + } +} + +static int +mevent_kq_filter(struct mevent *mevp) +{ + int retval; + + retval = 0; + + if (mevp->me_type == EVF_READ) + retval = EVFILT_READ; + + if (mevp->me_type == EVF_WRITE) + retval = EVFILT_WRITE; + + return (retval); +} + +static int +mevent_kq_flags(struct mevent *mevp) +{ + int ret; + + switch (mevp->me_state) { + case MEV_ENABLE: + ret = EV_ADD; + break; + case MEV_DISABLE: + ret = EV_DISABLE; + break; + case MEV_DEL_PENDING: + ret = EV_DELETE; + break; + } + + return (ret); +} + +static int +mevent_kq_fflags(struct mevent *mevp) +{ + /* XXX nothing yet, perhaps EV_EOF for reads ? */ + return (0); +} + +static int +mevent_build(int mfd, struct kevent *kev) +{ + struct mevent *mevp, *tmpp; + int i; + + i = 0; + + mevent_qlock(); + + LIST_FOREACH_SAFE(mevp, &change_head, me_list, tmpp) { + if (mevp->me_closefd) { + /* + * A close of the file descriptor will remove the + * event + */ + close(mevp->me_fd); + } else { + kev[i].ident = mevp->me_fd; + kev[i].filter = mevent_kq_filter(mevp); + kev[i].flags = mevent_kq_flags(mevp); + kev[i].fflags = mevent_kq_fflags(mevp); + kev[i].data = 0; + kev[i].udata = mevp; + i++; + } + + mevp->me_cq = 0; + LIST_REMOVE(mevp, me_list); + + if (mevp->me_state == MEV_DEL_PENDING) { + free(mevp); + } else { + LIST_INSERT_HEAD(&global_head, mevp, me_list); + } + + assert(i < MEVENT_MAX); + } + + mevent_qunlock(); + + return (i); +} + +static void +mevent_handle(struct kevent *kev, int numev) +{ + struct mevent *mevp; + int i; + + for (i = 0; i < numev; i++) { + mevp = kev[i].udata; + + /* XXX check for EV_ERROR ? */ + + (*mevp->me_func)(mevp->me_fd, mevp->me_type, mevp->me_param); + } +} + +struct mevent * +mevent_add(int fd, enum ev_type type, + void (*func)(int, enum ev_type, void *), void *param) +{ + struct mevent *lp, *mevp; + + if (fd < 0 || func == NULL) { + return (NULL); + } + + mevp = NULL; + + mevent_qlock(); + + /* + * Verify that the fd/type tuple is not present in any list + */ + LIST_FOREACH(lp, &global_head, me_list) { + if (lp->me_fd == fd && lp->me_type == type) { + goto exit; + } + } + + LIST_FOREACH(lp, &change_head, me_list) { + if (lp->me_fd == fd && lp->me_type == type) { + goto exit; + } + } + + /* + * Allocate an entry, populate it, and add it to the change list. + */ + mevp = malloc(sizeof(struct mevent)); + if (mevp == NULL) { + goto exit; + } + + memset(mevp, 0, sizeof(struct mevent)); + mevp->me_fd = fd; + mevp->me_type = type; + mevp->me_func = func; + mevp->me_param = param; + + LIST_INSERT_HEAD(&change_head, mevp, me_list); + mevp->me_cq = 1; + mevp->me_state = MEV_ENABLE; + mevent_notify(); + +exit: + mevent_qunlock(); + + return (mevp); +} + +static int +mevent_update(struct mevent *evp, int newstate) +{ + /* + * It's not possible to enable/disable a deleted event + */ + if (evp->me_state == MEV_DEL_PENDING) + return (EINVAL); + + /* + * No update needed if state isn't changing + */ + if (evp->me_state == newstate) + return (0); + + mevent_qlock(); + + evp->me_state = newstate; + + /* + * Place the entry onto the changed list if not already there. + */ + if (evp->me_cq == 0) { + evp->me_cq = 1; + LIST_REMOVE(evp, me_list); + LIST_INSERT_HEAD(&change_head, evp, me_list); + mevent_notify(); + } + + mevent_qunlock(); + + return (0); +} + +int +mevent_enable(struct mevent *evp) +{ + + return (mevent_update(evp, MEV_ENABLE)); +} + +int +mevent_disable(struct mevent *evp) +{ + + return (mevent_update(evp, MEV_DISABLE)); +} + +static int +mevent_delete_event(struct mevent *evp, int closefd) +{ + mevent_qlock(); + + /* + * Place the entry onto the changed list if not already there, and + * mark as to be deleted. + */ + if (evp->me_cq == 0) { + evp->me_cq = 1; + LIST_REMOVE(evp, me_list); + LIST_INSERT_HEAD(&change_head, evp, me_list); + mevent_notify(); + } + evp->me_state = MEV_DEL_PENDING; + + if (closefd) + evp->me_closefd = 1; + + mevent_qunlock(); + + return (0); +} + +int +mevent_delete(struct mevent *evp) +{ + + return (mevent_delete_event(evp, 0)); +} + +int +mevent_delete_close(struct mevent *evp) +{ + + return (mevent_delete_event(evp, 1)); +} + +void +mevent_dispatch(void) +{ + struct kevent changelist[MEVENT_MAX]; + struct kevent eventlist[MEVENT_MAX]; + struct mevent *pipev; + int mfd; + int numev; + int ret; + + mevent_tid = pthread_self(); + + mfd = kqueue(); + assert(mfd > 0); + + /* + * Open the pipe that will be used for other threads to force + * the blocking kqueue call to exit by writing to it. Set the + * descriptor to non-blocking. + */ + ret = pipe(mevent_pipefd); + if (ret < 0) { + perror("pipe"); + exit(0); + } + + /* + * Add internal event handler for the pipe write fd + */ + pipev = mevent_add(mevent_pipefd[0], EVF_READ, mevent_pipe_read, NULL); + assert(pipev != NULL); + + for (;;) { + /* + * Build changelist if required. + * XXX the changelist can be put into the blocking call + * to eliminate the extra syscall. Currently better for + * debug. + */ + numev = mevent_build(mfd, changelist); + if (numev) { + ret = kevent(mfd, changelist, numev, NULL, 0, NULL); + if (ret == -1) { + perror("Error return from kevent change"); + } + } + + /* + * Block awaiting events + */ + ret = kevent(mfd, NULL, 0, eventlist, MEVENT_MAX, NULL); + if (ret == -1) { + perror("Error return from kevent monitor"); + } + + /* + * Handle reported events + */ + mevent_handle(eventlist, ret); + } +} diff --git a/usr.sbin/bhyve/mevent.h b/usr.sbin/bhyve/mevent.h new file mode 100644 index 000000000000..32a9d74ab566 --- /dev/null +++ b/usr.sbin/bhyve/mevent.h @@ -0,0 +1,49 @@ +/*- + * Copyright (c) 2011 NetApp, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#ifndef _MEVENT_H_ +#define _MEVENT_H_ + +enum ev_type { + EVF_READ, + EVF_WRITE +}; + +struct mevent; + +struct mevent *mevent_add(int fd, enum ev_type type, + void (*func)(int, enum ev_type, void *), + void *param); +int mevent_enable(struct mevent *evp); +int mevent_disable(struct mevent *evp); +int mevent_delete(struct mevent *evp); +int mevent_delete_close(struct mevent *evp); + +void mevent_dispatch(void); + +#endif /* _MEVENT_H_ */ diff --git a/usr.sbin/bhyve/mevent_test.c b/usr.sbin/bhyve/mevent_test.c new file mode 100644 index 000000000000..c72a49718188 --- /dev/null +++ b/usr.sbin/bhyve/mevent_test.c @@ -0,0 +1,180 @@ +/*- + * Copyright (c) 2011 NetApp, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +/* + * Test program for the micro event library. Set up a simple TCP echo + * service. + * + * cc mevent_test.c mevent.c -lpthread + */ + +#include +#include +#include + +#include +#include +#include + +#include "mevent.h" + +#define TEST_PORT 4321 + +static pthread_mutex_t accept_mutex = PTHREAD_MUTEX_INITIALIZER; +static pthread_cond_t accept_condvar = PTHREAD_COND_INITIALIZER; + +#define MEVENT_ECHO + +#ifdef MEVENT_ECHO +struct esync { + pthread_mutex_t e_mt; + pthread_cond_t e_cond; +}; + +static void +echoer_callback(int fd, enum ev_type type, void *param) +{ + struct esync *sync = param; + + pthread_mutex_lock(&sync->e_mt); + pthread_cond_signal(&sync->e_cond); + pthread_mutex_unlock(&sync->e_mt); +} + +static void * +echoer(void *param) +{ + struct esync sync; + struct mevent *mev; + char buf[128]; + int fd = (int)(uintptr_t) param; + int len; + + pthread_mutex_init(&sync.e_mt, NULL); + pthread_cond_init(&sync.e_cond, NULL); + + pthread_mutex_lock(&sync.e_mt); + + mev = mevent_add(fd, EVF_READ, echoer_callback, &sync); + if (mev == NULL) { + printf("Could not allocate echoer event\n"); + exit(1); + } + + while (!pthread_cond_wait(&sync.e_cond, &sync.e_mt)) { + len = read(fd, buf, sizeof(buf)); + if (len > 0) { + write(fd, buf, len); + write(0, buf, len); + } else { + break; + } + } + + mevent_delete_close(mev); + + pthread_mutex_unlock(&sync.e_mt); + pthread_mutex_destroy(&sync.e_mt); + pthread_cond_destroy(&sync.e_cond); +} + +#else + +static void * +echoer(void *param) +{ + char buf[128]; + int fd = (int)(uintptr_t) param; + int len; + + while ((len = read(fd, buf, sizeof(buf))) > 0) { + write(1, buf, len); + } +} +#endif /* MEVENT_ECHO */ + +static void +acceptor_callback(int fd, enum ev_type type, void *param) +{ + pthread_mutex_lock(&accept_mutex); + pthread_cond_signal(&accept_condvar); + pthread_mutex_unlock(&accept_mutex); +} + +static void * +acceptor(void *param) +{ + struct sockaddr_in sin; + pthread_t tid; + int news; + int s; + + if ((s = socket(AF_INET, SOCK_STREAM, 0)) < 0) { + perror("socket"); + exit(1); + } + + sin.sin_len = sizeof(sin); + sin.sin_family = AF_INET; + sin.sin_addr.s_addr = htonl(INADDR_ANY); + sin.sin_port = htons(TEST_PORT); + + if (bind(s, (struct sockaddr *)&sin, sizeof(sin)) < 0) { + perror("bind"); + exit(1); + } + + if (listen(s, 1) < 0) { + perror("listen"); + exit(1); + } + + (void) mevent_add(s, EVF_READ, acceptor_callback, NULL); + + pthread_mutex_lock(&accept_mutex); + + while (!pthread_cond_wait(&accept_condvar, &accept_mutex)) { + news = accept(s, NULL, NULL); + if (news < 0) { + perror("accept error"); + } else { + printf("incoming connection, spawning thread\n"); + pthread_create(&tid, NULL, echoer, + (void *)(uintptr_t)news); + } + } +} + +main() +{ + pthread_t tid; + + pthread_create(&tid, NULL, acceptor, NULL); + + mevent_dispatch(); +} diff --git a/usr.sbin/bhyve/pci_emul.c b/usr.sbin/bhyve/pci_emul.c new file mode 100644 index 000000000000..650c4dea478e --- /dev/null +++ b/usr.sbin/bhyve/pci_emul.c @@ -0,0 +1,976 @@ +/*- + * Copyright (c) 2011 NetApp, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#include +__FBSDID("$FreeBSD$"); + +#include +#include + +#include +#include +#include +#include +#include +#include + +#include +#include + +#include "fbsdrun.h" +#include "inout.h" +#include "pci_emul.h" + +#define CONF1_ADDR_PORT 0x0cf8 +#define CONF1_DATA_PORT 0x0cfc + +#define CFGWRITE(pi,off,val,b) \ +do { \ + if ((b) == 1) { \ + pci_set_cfgdata8((pi),(off),(val)); \ + } else if ((b) == 2) { \ + pci_set_cfgdata16((pi),(off),(val)); \ + } else { \ + pci_set_cfgdata32((pi),(off),(val)); \ + } \ +} while (0) + +#define MAXSLOTS 32 + +static struct slotinfo { + char *si_name; + char *si_param; + struct pci_devinst *si_devi; + int si_titled; + int si_pslot; + char si_prefix; + char si_suffix; +} pci_slotinfo[MAXSLOTS]; + +/* + * NetApp specific: + * struct used to build an in-core OEM table to supply device names + * to driver instances + */ +static struct mptable_pci_devnames { +#define MPT_HDR_BASE 0 +#define MPT_HDR_NAME 2 + uint16_t md_hdrtype; + uint16_t md_entries; + uint16_t md_cksum; + uint16_t md_pad; +#define MPT_NTAP_SIG \ + ((uint32_t)(('P' << 24) | ('A' << 16) | ('T' << 8) | 'N')) + uint32_t md_sig; + uint32_t md_rsvd; + struct mptable_pci_slotinfo { + uint16_t mds_type; + uint16_t mds_phys_slot; + uint8_t mds_bus; + uint8_t mds_slot; + uint8_t mds_func; + uint8_t mds_pad; + uint16_t mds_vid; + uint16_t mds_did; + uint8_t mds_suffix[4]; + uint8_t mds_prefix[4]; + uint32_t mds_rsvd[3]; + } md_slotinfo[MAXSLOTS]; +} pci_devnames; + +SET_DECLARE(pci_devemu_set, struct pci_devemu); + +static uint64_t pci_emul_iobase; +static uint64_t pci_emul_membase32; +static uint64_t pci_emul_membase64; + +#define PCI_EMUL_IOBASE 0x2000 +#define PCI_EMUL_IOLIMIT 0x10000 + +#define PCI_EMUL_MEMBASE32 (lomem_sz) +#define PCI_EMUL_MEMLIMIT32 0xE0000000 /* 3.5GB */ + +#define PCI_EMUL_MEMBASE64 0xD000000000UL +#define PCI_EMUL_MEMLIMIT64 0xFD00000000UL + +static int pci_emul_devices; +static int devname_elems; + +/* + * I/O access + */ + +/* + * Slot options are in the form: + * + * ,[,] + * + * slot is 0..31 + * emul is a string describing the type of PCI device e.g. virtio-net + * config is an optional string, depending on the device, that can be + * used for configuration. + * Examples are: + * 1,virtio-net,tap0 + * 3,dummy + */ +static void +pci_parse_slot_usage(char *aopt) +{ + printf("Invalid PCI slot info field \"%s\"\n", aopt); + free(aopt); +} + +void +pci_parse_slot(char *opt) +{ + char *slot, *emul, *config; + char *str, *cpy; + int snum; + + str = cpy = strdup(opt); + config = NULL; + + slot = strsep(&str, ","); + emul = strsep(&str, ","); + if (str != NULL) { + config = strsep(&str, ","); + } + + if (emul == NULL) { + pci_parse_slot_usage(cpy); + return; + } + + snum = 255; + snum = atoi(slot); + if (snum < 0 || snum >= MAXSLOTS) { + pci_parse_slot_usage(cpy); + } else { + pci_slotinfo[snum].si_name = emul; + pci_slotinfo[snum].si_param = config; + } +} + + +/* + * + * PCI MPTable names are of the form: + * + * ,[prefix] + * + * .. with an alphabetic char, a 1 or 2-digit string, + * and a single char. + * + * Examples: + * 1,e0c + * 4,e0P + * 6,43a + * 7,0f + * 10,1 + * 12,e0M + * 2,12a + * + * Note that this is NetApp-specific, but is ignored on other o/s's. + */ +static void +pci_parse_name_usage(char *aopt) +{ + printf("Invalid PCI slot name field \"%s\"\n", aopt); +} + +void +pci_parse_name(char *opt) +{ + char csnum[4]; + char *namestr; + char *slotend; + char prefix, suffix; + int i; + int pslot; + int snum; + + pslot = -1; + prefix = suffix = 0; + slotend = strchr(opt, ','); + + /* + * A comma must be present, and can't be the first character + * or no slot would be present. Also, the slot number can't be + * more than 2 characters. + */ + if (slotend == NULL || slotend == opt || (slotend - opt > 2)) { + pci_parse_name_usage(opt); + return; + } + + for (i = 0; i < (slotend - opt); i++) { + csnum[i] = opt[i]; + } + csnum[i] = '\0'; + + snum = 255; + snum = atoi(csnum); + if (snum < 0 || snum >= MAXSLOTS) { + pci_parse_name_usage(opt); + return; + } + + namestr = slotend + 1; + + if (strlen(namestr) > 3) { + pci_parse_name_usage(opt); + return; + } + + if (isalpha(*namestr)) { + prefix = *namestr++; + } + + if (!isdigit(*namestr)) { + pci_parse_name_usage(opt); + } else { + pslot = *namestr++ - '0'; + if (isnumber(*namestr)) { + pslot = 10*pslot + *namestr++ - '0'; + + } + if (isalpha(*namestr) && *(namestr + 1) == 0) { + suffix = *namestr; + pci_slotinfo[snum].si_titled = 1; + pci_slotinfo[snum].si_pslot = pslot; + pci_slotinfo[snum].si_prefix = prefix; + pci_slotinfo[snum].si_suffix = suffix; + + } else { + pci_parse_name_usage(opt); + } + } +} + +static void +pci_add_mptable_name(struct slotinfo *si) +{ + struct mptable_pci_slotinfo *ms; + + /* + * If naming information has been supplied for this slot, populate + * the next available mptable OEM entry + */ + if (si->si_titled) { + ms = &pci_devnames.md_slotinfo[devname_elems]; + + ms->mds_type = MPT_HDR_NAME; + ms->mds_phys_slot = si->si_pslot; + ms->mds_bus = si->si_devi->pi_bus; + ms->mds_slot = si->si_devi->pi_slot; + ms->mds_func = si->si_devi->pi_func; + ms->mds_vid = pci_get_cfgdata16(si->si_devi, PCIR_VENDOR); + ms->mds_did = pci_get_cfgdata16(si->si_devi, PCIR_DEVICE); + ms->mds_suffix[0] = si->si_suffix; + ms->mds_prefix[0] = si->si_prefix; + + devname_elems++; + } +} + +static void +pci_finish_mptable_names(void) +{ + int size; + + if (devname_elems) { + pci_devnames.md_hdrtype = MPT_HDR_BASE; + pci_devnames.md_entries = devname_elems; + pci_devnames.md_cksum = 0; /* XXX */ + pci_devnames.md_sig = MPT_NTAP_SIG; + + size = (uintptr_t)&pci_devnames.md_slotinfo[devname_elems] - + (uintptr_t)&pci_devnames; + + fbsdrun_add_oemtbl(&pci_devnames, size); + } +} + +static int +pci_emul_handler(struct vmctx *ctx, int vcpu, int in, int port, int bytes, + uint32_t *eax, void *arg) +{ + struct pci_devinst *pdi = arg; + struct pci_devemu *pe = pdi->pi_d; + int offset, i; + + for (i = 0; i <= PCI_BARMAX; i++) { + if (pdi->pi_bar[i].type == PCIBAR_IO && + port >= pdi->pi_bar[i].addr && + port + bytes <= pdi->pi_bar[i].addr + pdi->pi_bar[i].size) { + offset = port - pdi->pi_bar[i].addr; + if (in) + *eax = (*pe->pe_ior)(pdi, i, offset, bytes); + else + (*pe->pe_iow)(pdi, i, offset, bytes, *eax); + return (0); + } + } + return (-1); +} + +static int +pci_emul_alloc_resource(uint64_t *baseptr, uint64_t limit, uint64_t size, + uint64_t *addr) +{ + uint64_t base; + + assert((size & (size - 1)) == 0); /* must be a power of 2 */ + + base = roundup2(*baseptr, size); + + if (base + size <= limit) { + *addr = base; + *baseptr = base + size; + return (0); + } else + return (-1); +} + +int +pci_emul_alloc_bar(struct pci_devinst *pdi, int idx, uint64_t hostbase, + enum pcibar_type type, uint64_t size) +{ + int i, error; + uint64_t *baseptr, limit, addr, mask, lobits, bar; + struct inout_port iop; + + assert(idx >= 0 && idx <= PCI_BARMAX); + + if ((size & (size - 1)) != 0) + size = 1UL << flsl(size); /* round up to a power of 2 */ + + switch (type) { + case PCIBAR_NONE: + baseptr = NULL; + addr = mask = lobits = 0; + break; + case PCIBAR_IO: + baseptr = &pci_emul_iobase; + limit = PCI_EMUL_IOLIMIT; + mask = PCIM_BAR_IO_BASE; + lobits = PCIM_BAR_IO_SPACE; + break; + case PCIBAR_MEM64: + /* + * XXX + * Some drivers do not work well if the 64-bit BAR is allocated + * above 4GB. Allow for this by allocating small requests under + * 4GB unless then allocation size is larger than some arbitrary + * number (32MB currently). + */ + if (size > 32 * 1024 * 1024) { + /* + * XXX special case for device requiring peer-peer DMA + */ + if (size == 0x100000000UL) + baseptr = &hostbase; + else + baseptr = &pci_emul_membase64; + limit = PCI_EMUL_MEMLIMIT64; + mask = PCIM_BAR_MEM_BASE; + lobits = PCIM_BAR_MEM_SPACE | PCIM_BAR_MEM_64 | + PCIM_BAR_MEM_PREFETCH; + break; + } + /* fallthrough */ + case PCIBAR_MEM32: + baseptr = &pci_emul_membase32; + limit = PCI_EMUL_MEMLIMIT32; + mask = PCIM_BAR_MEM_BASE; + lobits = PCIM_BAR_MEM_SPACE | PCIM_BAR_MEM_32; + break; + default: + printf("pci_emul_alloc_base: invalid bar type %d\n", type); + assert(0); + } + + if (baseptr != NULL) { + error = pci_emul_alloc_resource(baseptr, limit, size, &addr); + if (error != 0) + return (error); + } + + pdi->pi_bar[idx].type = type; + pdi->pi_bar[idx].addr = addr; + pdi->pi_bar[idx].size = size; + + /* Initialize the BAR register in config space */ + bar = (addr & mask) | lobits; + pci_set_cfgdata32(pdi, PCIR_BAR(idx), bar); + + if (type == PCIBAR_MEM64) { + assert(idx + 1 <= PCI_BARMAX); + pdi->pi_bar[idx + 1].type = PCIBAR_MEMHI64; + pci_set_cfgdata32(pdi, PCIR_BAR(idx + 1), bar >> 32); + } + + /* add a handler to intercept accesses to the I/O bar */ + if (type == PCIBAR_IO) { + iop.name = pdi->pi_name; + iop.flags = IOPORT_F_INOUT; + iop.handler = pci_emul_handler; + iop.arg = pdi; + + for (i = 0; i < size; i++) { + iop.port = addr + i; + register_inout(&iop); + } + } + + return (0); +} + +#define CAP_START_OFFSET 0x40 +static int +pci_emul_add_capability(struct pci_devinst *pi, u_char *capdata, int caplen) +{ + int i, capoff, capid, reallen; + uint16_t sts; + + static u_char endofcap[4] = { + PCIY_RESERVED, 0, 0, 0 + }; + + assert(caplen > 0 && capdata[0] != PCIY_RESERVED); + + reallen = roundup2(caplen, 4); /* dword aligned */ + + sts = pci_get_cfgdata16(pi, PCIR_STATUS); + if ((sts & PCIM_STATUS_CAPPRESENT) == 0) { + capoff = CAP_START_OFFSET; + pci_set_cfgdata8(pi, PCIR_CAP_PTR, capoff); + pci_set_cfgdata16(pi, PCIR_STATUS, sts|PCIM_STATUS_CAPPRESENT); + } else { + capoff = pci_get_cfgdata8(pi, PCIR_CAP_PTR); + while (1) { + assert((capoff & 0x3) == 0); + capid = pci_get_cfgdata8(pi, capoff); + if (capid == PCIY_RESERVED) + break; + capoff = pci_get_cfgdata8(pi, capoff + 1); + } + } + + /* Check if we have enough space */ + if (capoff + reallen + sizeof(endofcap) > PCI_REGMAX + 1) + return (-1); + + /* Copy the capability */ + for (i = 0; i < caplen; i++) + pci_set_cfgdata8(pi, capoff + i, capdata[i]); + + /* Set the next capability pointer */ + pci_set_cfgdata8(pi, capoff + 1, capoff + reallen); + + /* Copy of the reserved capability which serves as the end marker */ + for (i = 0; i < sizeof(endofcap); i++) + pci_set_cfgdata8(pi, capoff + reallen + i, endofcap[i]); + + return (0); +} + +static struct pci_devemu * +pci_emul_finddev(char *name) +{ + struct pci_devemu **pdpp, *pdp; + + SET_FOREACH(pdpp, pci_devemu_set) { + pdp = *pdpp; + if (!strcmp(pdp->pe_emu, name)) { + return (pdp); + } + } + + return (NULL); +} + +static void +pci_emul_init(struct vmctx *ctx, struct pci_devemu *pde, int slot, char *params) +{ + struct pci_devinst *pdi; + pdi = malloc(sizeof(struct pci_devinst)); + bzero(pdi, sizeof(*pdi)); + + pdi->pi_vmctx = ctx; + pdi->pi_bus = 0; + pdi->pi_slot = slot; + pdi->pi_func = 0; + pdi->pi_d = pde; + snprintf(pdi->pi_name, PI_NAMESZ, "%s-pci-%d", pde->pe_emu, slot); + + /* Disable legacy interrupts */ + pci_set_cfgdata8(pdi, PCIR_INTLINE, 255); + pci_set_cfgdata8(pdi, PCIR_INTPIN, 0); + + pci_set_cfgdata8(pdi, PCIR_COMMAND, + PCIM_CMD_PORTEN | PCIM_CMD_MEMEN | PCIM_CMD_BUSMASTEREN); + + if ((*pde->pe_init)(ctx, pdi, params) != 0) { + free(pdi); + } else { + pci_emul_devices++; + pci_slotinfo[slot].si_devi = pdi; + } +} + +void +pci_populate_msicap(struct msicap *msicap, int msgnum, int nextptr) +{ + int mmc; + + CTASSERT(sizeof(struct msicap) == 14); + + /* Number of msi messages must be a power of 2 between 1 and 32 */ + assert((msgnum & (msgnum - 1)) == 0 && msgnum >= 1 && msgnum <= 32); + mmc = ffs(msgnum) - 1; + + bzero(msicap, sizeof(struct msicap)); + msicap->capid = PCIY_MSI; + msicap->nextptr = nextptr; + msicap->msgctrl = PCIM_MSICTRL_64BIT | (mmc << 1); +} + +int +pci_emul_add_msicap(struct pci_devinst *pi, int msgnum) +{ + struct msicap msicap; + + pci_populate_msicap(&msicap, msgnum, 0); + + return (pci_emul_add_capability(pi, (u_char *)&msicap, sizeof(msicap))); +} + +void +msicap_cfgwrite(struct pci_devinst *pi, int capoff, int offset, + int bytes, uint32_t val) +{ + uint16_t msgctrl, rwmask, msgdata, mme; + uint32_t addrlo; + + /* + * If guest is writing to the message control register make sure + * we do not overwrite read-only fields. + */ + if ((offset - capoff) == 2 && bytes == 2) { + rwmask = PCIM_MSICTRL_MME_MASK | PCIM_MSICTRL_MSI_ENABLE; + msgctrl = pci_get_cfgdata16(pi, offset); + msgctrl &= ~rwmask; + msgctrl |= val & rwmask; + val = msgctrl; + + addrlo = pci_get_cfgdata32(pi, capoff + 4); + if (msgctrl & PCIM_MSICTRL_64BIT) + msgdata = pci_get_cfgdata16(pi, capoff + 12); + else + msgdata = pci_get_cfgdata16(pi, capoff + 8); + + /* + * XXX check delivery mode, destination mode etc + */ + mme = msgctrl & PCIM_MSICTRL_MME_MASK; + pi->pi_msi.enabled = msgctrl & PCIM_MSICTRL_MSI_ENABLE ? 1 : 0; + if (pi->pi_msi.enabled) { + pi->pi_msi.cpu = (addrlo >> 12) & 0xff; + pi->pi_msi.vector = msgdata & 0xff; + pi->pi_msi.msgnum = 1 << (mme >> 4); + } else { + pi->pi_msi.cpu = 0; + pi->pi_msi.vector = 0; + pi->pi_msi.msgnum = 0; + } + } + + CFGWRITE(pi, offset, val, bytes); +} + +/* + * This function assumes that 'coff' is in the capabilities region of the + * config space. + */ +static void +pci_emul_capwrite(struct pci_devinst *pi, int offset, int bytes, uint32_t val) +{ + int capid; + uint8_t capoff, nextoff; + + /* Do not allow un-aligned writes */ + if ((offset & (bytes - 1)) != 0) + return; + + /* Find the capability that we want to update */ + capoff = CAP_START_OFFSET; + while (1) { + capid = pci_get_cfgdata8(pi, capoff); + if (capid == PCIY_RESERVED) + break; + + nextoff = pci_get_cfgdata8(pi, capoff + 1); + if (offset >= capoff && offset < nextoff) + break; + + capoff = nextoff; + } + assert(offset >= capoff); + + /* + * Capability ID and Next Capability Pointer are readonly + */ + if (offset == capoff || offset == capoff + 1) + return; + + switch (capid) { + case PCIY_MSI: + msicap_cfgwrite(pi, capoff, offset, bytes, val); + break; + default: + break; + } +} + +static int +pci_emul_iscap(struct pci_devinst *pi, int offset) +{ + int found; + uint16_t sts; + uint8_t capid, lastoff; + + found = 0; + sts = pci_get_cfgdata16(pi, PCIR_STATUS); + if ((sts & PCIM_STATUS_CAPPRESENT) != 0) { + lastoff = pci_get_cfgdata8(pi, PCIR_CAP_PTR); + while (1) { + assert((lastoff & 0x3) == 0); + capid = pci_get_cfgdata8(pi, lastoff); + if (capid == PCIY_RESERVED) + break; + lastoff = pci_get_cfgdata8(pi, lastoff + 1); + } + if (offset >= CAP_START_OFFSET && offset <= lastoff) + found = 1; + } + return (found); +} + +void +init_pci(struct vmctx *ctx) +{ + struct pci_devemu *pde; + struct slotinfo *si; + int i; + + pci_emul_iobase = PCI_EMUL_IOBASE; + pci_emul_membase32 = PCI_EMUL_MEMBASE32; + pci_emul_membase64 = PCI_EMUL_MEMBASE64; + + si = pci_slotinfo; + + for (i = 0; i < MAXSLOTS; i++, si++) { + if (si->si_name != NULL) { + pde = pci_emul_finddev(si->si_name); + if (pde != NULL) { + pci_emul_init(ctx, pde, i, si->si_param); + pci_add_mptable_name(si); + } + } + } + pci_finish_mptable_names(); +} + +int +pci_msi_enabled(struct pci_devinst *pi) +{ + return (pi->pi_msi.enabled); +} + +int +pci_msi_msgnum(struct pci_devinst *pi) +{ + if (pi->pi_msi.enabled) + return (pi->pi_msi.msgnum); + else + return (0); +} + +void +pci_generate_msi(struct pci_devinst *pi, int msg) +{ + + if (pci_msi_enabled(pi) && msg < pci_msi_msgnum(pi)) { + vm_lapic_irq(pi->pi_vmctx, + pi->pi_msi.cpu, + pi->pi_msi.vector + msg); + } +} + +static int cfgbus, cfgslot, cfgfunc, cfgoff; + +static int +pci_emul_cfgaddr(struct vmctx *ctx, int vcpu, int in, int port, int bytes, + uint32_t *eax, void *arg) +{ + uint32_t x; + + assert(!in); + + if (bytes != 4) + return (-1); + + x = *eax; + cfgoff = x & PCI_REGMAX; + cfgfunc = (x >> 8) & PCI_FUNCMAX; + cfgslot = (x >> 11) & PCI_SLOTMAX; + cfgbus = (x >> 16) & PCI_BUSMAX; + + return (0); +} +INOUT_PORT(pci_cfgaddr, CONF1_ADDR_PORT, IOPORT_F_OUT, pci_emul_cfgaddr); + +static int +pci_emul_cfgdata(struct vmctx *ctx, int vcpu, int in, int port, int bytes, + uint32_t *eax, void *arg) +{ + struct pci_devinst *pi; + struct pci_devemu *pe; + int coff, idx; + uint64_t mask, bar; + + assert(bytes == 1 || bytes == 2 || bytes == 4); + + pi = pci_slotinfo[cfgslot].si_devi; + coff = cfgoff + (port - CONF1_DATA_PORT); + +#if 0 + printf("pcicfg-%s from 0x%0x of %d bytes (%d/%d/%d)\n\r", + in ? "read" : "write", coff, bytes, cfgbus, cfgslot, cfgfunc); +#endif + + if (pi == NULL || cfgfunc != 0) { + if (in) + *eax = 0xffffffff; + return (0); + } + + pe = pi->pi_d; + + /* + * Config read + */ + if (in) { + /* Let the device emulation override the default handler */ + if (pe->pe_cfgread != NULL && + (*pe->pe_cfgread)(ctx, vcpu, pi, coff, bytes, eax) == 0) + return (0); + + if (bytes == 1) + *eax = pci_get_cfgdata8(pi, coff); + else if (bytes == 2) + *eax = pci_get_cfgdata16(pi, coff); + else + *eax = pci_get_cfgdata32(pi, coff); + } else { + /* Let the device emulation override the default handler */ + if (pe->pe_cfgwrite != NULL && + (*pe->pe_cfgwrite)(ctx, vcpu, pi, coff, bytes, *eax) == 0) + return (0); + + /* + * Special handling for write to BAR registers + */ + if (coff >= PCIR_BAR(0) && coff < PCIR_BAR(PCI_BARMAX + 1)) { + /* + * Ignore writes to BAR registers that are not + * 4-byte aligned. + */ + if (bytes != 4 || (coff & 0x3) != 0) + return (0); + idx = (coff - PCIR_BAR(0)) / 4; + switch (pi->pi_bar[idx].type) { + case PCIBAR_NONE: + bar = 0; + break; + case PCIBAR_IO: + mask = ~(pi->pi_bar[idx].size - 1); + mask &= PCIM_BAR_IO_BASE; + bar = (*eax & mask) | PCIM_BAR_IO_SPACE; + break; + case PCIBAR_MEM32: + mask = ~(pi->pi_bar[idx].size - 1); + mask &= PCIM_BAR_MEM_BASE; + bar = *eax & mask; + bar |= PCIM_BAR_MEM_SPACE | PCIM_BAR_MEM_32; + break; + case PCIBAR_MEM64: + mask = ~(pi->pi_bar[idx].size - 1); + mask &= PCIM_BAR_MEM_BASE; + bar = *eax & mask; + bar |= PCIM_BAR_MEM_SPACE | PCIM_BAR_MEM_64 | + PCIM_BAR_MEM_PREFETCH; + break; + case PCIBAR_MEMHI64: + mask = ~(pi->pi_bar[idx - 1].size - 1); + mask &= PCIM_BAR_MEM_BASE; + bar = ((uint64_t)*eax << 32) & mask; + bar = bar >> 32; + break; + default: + assert(0); + } + pci_set_cfgdata32(pi, coff, bar); + } else if (pci_emul_iscap(pi, coff)) { + pci_emul_capwrite(pi, coff, bytes, *eax); + } else { + CFGWRITE(pi, coff, *eax, bytes); + } + } + + return (0); +} + +INOUT_PORT(pci_cfgdata, CONF1_DATA_PORT+0, IOPORT_F_INOUT, pci_emul_cfgdata); +INOUT_PORT(pci_cfgdata, CONF1_DATA_PORT+1, IOPORT_F_INOUT, pci_emul_cfgdata); +INOUT_PORT(pci_cfgdata, CONF1_DATA_PORT+2, IOPORT_F_INOUT, pci_emul_cfgdata); +INOUT_PORT(pci_cfgdata, CONF1_DATA_PORT+3, IOPORT_F_INOUT, pci_emul_cfgdata); + +/* + * I/O ports to configure PCI IRQ routing. We ignore all writes to it. + */ +static int +pci_irq_port_handler(struct vmctx *ctx, int vcpu, int in, int port, int bytes, + uint32_t *eax, void *arg) +{ + assert(in == 0); + return (0); +} +INOUT_PORT(pci_irq, 0xC00, IOPORT_F_OUT, pci_irq_port_handler); +INOUT_PORT(pci_irq, 0xC01, IOPORT_F_OUT, pci_irq_port_handler); + +#define PCI_EMUL_TEST +#ifdef PCI_EMUL_TEST +/* + * Define a dummy test device + */ +#define DREGSZ 20 +struct pci_emul_dsoftc { + uint8_t regs[DREGSZ]; +}; + +#define PCI_EMUL_MSGS 4 + +static int +pci_emul_dinit(struct vmctx *ctx, struct pci_devinst *pi, char *opts) +{ + int error; + struct pci_emul_dsoftc *sc; + + sc = malloc(sizeof(struct pci_emul_dsoftc)); + memset(sc, 0, sizeof(struct pci_emul_dsoftc)); + + pi->pi_arg = sc; + + pci_set_cfgdata16(pi, PCIR_DEVICE, 0x0001); + pci_set_cfgdata16(pi, PCIR_VENDOR, 0x10DD); + pci_set_cfgdata8(pi, PCIR_CLASS, 0x02); + + error = pci_emul_alloc_bar(pi, 0, 0, PCIBAR_IO, DREGSZ); + assert(error == 0); + + error = pci_emul_add_msicap(pi, PCI_EMUL_MSGS); + assert(error == 0); + + return (0); +} + +static void +pci_emul_diow(struct pci_devinst *pi, int baridx, int offset, int size, + uint32_t value) +{ + int i; + struct pci_emul_dsoftc *sc = pi->pi_arg; + + if (offset + size > DREGSZ) { + printf("diow: too large, offset %d size %d\n", offset, size); + return; + } + + if (size == 1) { + sc->regs[offset] = value & 0xff; + } else if (size == 2) { + *(uint16_t *)&sc->regs[offset] = value & 0xffff; + } else { + *(uint32_t *)&sc->regs[offset] = value; + } + + /* + * Special magic value to generate an interrupt + */ + if (offset == 4 && size == 4 && pci_msi_enabled(pi)) + pci_generate_msi(pi, value % pci_msi_msgnum(pi)); + + if (value == 0xabcdef) { + for (i = 0; i < pci_msi_msgnum(pi); i++) + pci_generate_msi(pi, i); + } +} + +static uint32_t +pci_emul_dior(struct pci_devinst *pi, int baridx, int offset, int size) +{ + struct pci_emul_dsoftc *sc = pi->pi_arg; + uint32_t value; + + if (offset + size > DREGSZ) { + printf("dior: too large, offset %d size %d\n", offset, size); + return (0); + } + + if (size == 1) { + value = sc->regs[offset]; + } else if (size == 2) { + value = *(uint16_t *) &sc->regs[offset]; + } else { + value = *(uint32_t *) &sc->regs[offset]; + } + + return (value); +} + +struct pci_devemu pci_dummy = { + .pe_emu = "dummy", + .pe_init = pci_emul_dinit, + .pe_iow = pci_emul_diow, + .pe_ior = pci_emul_dior +}; +PCI_EMUL_SET(pci_dummy); + +#endif /* PCI_EMUL_TEST */ diff --git a/usr.sbin/bhyve/pci_emul.h b/usr.sbin/bhyve/pci_emul.h new file mode 100644 index 000000000000..f5f8e228c55e --- /dev/null +++ b/usr.sbin/bhyve/pci_emul.h @@ -0,0 +1,171 @@ +/*- + * Copyright (c) 2011 NetApp, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#ifndef _PCI_EMUL_H_ +#define _PCI_EMUL_H_ + +#include +#include +#include + +#include + +#include + +#define PCI_BARMAX PCIR_MAX_BAR_0 /* BAR registers in a Type 0 header */ +#define PCIY_RESERVED 0x00 + +struct vmctx; +struct pci_devinst; + +struct pci_devemu { + char *pe_emu; /* Name of device emulation */ + + /* instance creation */ + int (*pe_init)(struct vmctx *, struct pci_devinst *, char *opts); + + /* config space read/write callbacks */ + int (*pe_cfgwrite)(struct vmctx *ctx, int vcpu, + struct pci_devinst *pi, int offset, + int bytes, uint32_t val); + int (*pe_cfgread)(struct vmctx *ctx, int vcpu, + struct pci_devinst *pi, int offset, + int bytes, uint32_t *retval); + + /* I/O space read/write callbacks */ + void (*pe_iow)(struct pci_devinst *pi, int baridx, + int offset, int size, uint32_t value); + uint32_t (*pe_ior)(struct pci_devinst *pi, int baridx, + int offset, int size); +}; +#define PCI_EMUL_SET(x) DATA_SET(pci_devemu_set, x); + +enum pcibar_type { + PCIBAR_NONE, + PCIBAR_IO, + PCIBAR_MEM32, + PCIBAR_MEM64, + PCIBAR_MEMHI64 +}; + +struct pcibar { + enum pcibar_type type; /* io or memory */ + uint64_t size; + uint64_t addr; +}; + +#define PI_NAMESZ 40 + +struct pci_devinst { + struct pci_devemu *pi_d; + struct vmctx *pi_vmctx; + uint8_t pi_bus, pi_slot, pi_func; + char pi_name[PI_NAMESZ]; + uint16_t pi_iobase; + int pi_bar_getsize; + + struct { + int enabled; + int cpu; + int vector; + int msgnum; + } pi_msi; + + void *pi_arg; /* devemu-private data */ + + u_char pi_cfgdata[PCI_REGMAX + 1]; + struct pcibar pi_bar[PCI_BARMAX + 1]; +}; + +struct msicap { + uint8_t capid; + uint8_t nextptr; + uint16_t msgctrl; + uint32_t addrlo; + uint32_t addrhi; + uint16_t msgdata; +} __packed; + +void init_pci(struct vmctx *ctx); +void pci_parse_slot(char *opt); +void pci_parse_name(char *opt); +void pci_callback(void); +int pci_emul_alloc_bar(struct pci_devinst *pdi, int idx, uint64_t hostbase, + enum pcibar_type type, uint64_t size); +int pci_emul_add_msicap(struct pci_devinst *pi, int msgnum); +void msicap_cfgwrite(struct pci_devinst *pi, int capoff, int offset, + int bytes, uint32_t val); + +void pci_generate_msi(struct pci_devinst *pi, int msgnum); +int pci_msi_enabled(struct pci_devinst *pi); +int pci_msi_msgnum(struct pci_devinst *pi); +void pci_populate_msicap(struct msicap *cap, int msgs, int nextptr); + +static __inline void +pci_set_cfgdata8(struct pci_devinst *pi, int offset, uint8_t val) +{ + assert(offset <= PCI_REGMAX); + *(uint8_t *)(pi->pi_cfgdata + offset) = val; +} + +static __inline void +pci_set_cfgdata16(struct pci_devinst *pi, int offset, uint16_t val) +{ + assert(offset <= (PCI_REGMAX - 1) && (offset & 1) == 0); + *(uint16_t *)(pi->pi_cfgdata + offset) = val; +} + +static __inline void +pci_set_cfgdata32(struct pci_devinst *pi, int offset, uint32_t val) +{ + assert(offset <= (PCI_REGMAX - 3) && (offset & 3) == 0); + *(uint32_t *)(pi->pi_cfgdata + offset) = val; +} + +static __inline uint8_t +pci_get_cfgdata8(struct pci_devinst *pi, int offset) +{ + assert(offset <= PCI_REGMAX); + return (*(uint8_t *)(pi->pi_cfgdata + offset)); +} + +static __inline uint16_t +pci_get_cfgdata16(struct pci_devinst *pi, int offset) +{ + assert(offset <= (PCI_REGMAX - 1) && (offset & 1) == 0); + return (*(uint16_t *)(pi->pi_cfgdata + offset)); +} + +static __inline uint32_t +pci_get_cfgdata32(struct pci_devinst *pi, int offset) +{ + assert(offset <= (PCI_REGMAX - 3) && (offset & 3) == 0); + return (*(uint32_t *)(pi->pi_cfgdata + offset)); +} + +#endif /* _PCI_EMUL_H_ */ diff --git a/usr.sbin/bhyve/pci_hostbridge.c b/usr.sbin/bhyve/pci_hostbridge.c new file mode 100644 index 000000000000..c77762d8f921 --- /dev/null +++ b/usr.sbin/bhyve/pci_hostbridge.c @@ -0,0 +1,52 @@ +/*- + * Copyright (c) 2011 NetApp, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#include +__FBSDID("$FreeBSD$"); + +#include "pci_emul.h" + +static int +pci_hostbridge_init(struct vmctx *ctx, struct pci_devinst *pi, char *opts) +{ + + /* config space */ + pci_set_cfgdata16(pi, PCIR_VENDOR, 0x1275); /* NetApp */ + pci_set_cfgdata16(pi, PCIR_DEVICE, 0x1275); /* NetApp */ + pci_set_cfgdata8(pi, PCIR_HDRTYPE, PCIM_HDRTYPE_BRIDGE); + pci_set_cfgdata8(pi, PCIR_CLASS, PCIC_BRIDGE); + pci_set_cfgdata8(pi, PCIR_SUBCLASS, PCIS_BRIDGE_HOST); + + return (0); +} + +struct pci_devemu pci_de_hostbridge = { + .pe_emu = "hostbridge", + .pe_init = pci_hostbridge_init, +}; +PCI_EMUL_SET(pci_de_hostbridge); diff --git a/usr.sbin/bhyve/pci_passthru.c b/usr.sbin/bhyve/pci_passthru.c new file mode 100644 index 000000000000..1c417fd58725 --- /dev/null +++ b/usr.sbin/bhyve/pci_passthru.c @@ -0,0 +1,508 @@ +/*- + * Copyright (c) 2011 NetApp, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#include +__FBSDID("$FreeBSD$"); + +#include +#include +#include +#include + +#include +#include + +#include +#include +#include +#include +#include +#include + +#include +#include +#include "pci_emul.h" + +#ifndef _PATH_DEVPCI +#define _PATH_DEVPCI "/dev/pci" +#endif + +#ifndef _PATH_DEVIO +#define _PATH_DEVIO "/dev/io" +#endif + +#define LEGACY_SUPPORT 1 + +static int pcifd = -1; +static int iofd = -1; + +struct passthru_softc { + struct pci_devinst *psc_pi; + struct pcibar psc_bar[PCI_BARMAX + 1]; + struct { + int capoff; + int msgctrl; + int emulated; + } psc_msi; + struct pcisel psc_sel; +}; + +static int +msi_caplen(int msgctrl) +{ + int len; + + len = 10; /* minimum length of msi capability */ + + if (msgctrl & PCIM_MSICTRL_64BIT) + len += 4; + +#if 0 + /* + * Ignore the 'mask' and 'pending' bits in the MSI capability. + * We'll let the guest manipulate them directly. + */ + if (msgctrl & PCIM_MSICTRL_VECTOR) + len += 10; +#endif + + return (len); +} + +static uint32_t +read_config(const struct pcisel *sel, long reg, int width) +{ + struct pci_io pi; + + bzero(&pi, sizeof(pi)); + pi.pi_sel = *sel; + pi.pi_reg = reg; + pi.pi_width = width; + + if (ioctl(pcifd, PCIOCREAD, &pi) < 0) + return (0); /* XXX */ + else + return (pi.pi_data); +} + +static void +write_config(const struct pcisel *sel, long reg, int width, uint32_t data) +{ + struct pci_io pi; + + bzero(&pi, sizeof(pi)); + pi.pi_sel = *sel; + pi.pi_reg = reg; + pi.pi_width = width; + pi.pi_data = data; + + (void)ioctl(pcifd, PCIOCWRITE, &pi); /* XXX */ +} + +#ifdef LEGACY_SUPPORT +static int +passthru_add_msicap(struct pci_devinst *pi, int msgnum, int nextptr) +{ + int capoff, i; + struct msicap msicap; + u_char *capdata; + + pci_populate_msicap(&msicap, msgnum, nextptr); + + /* + * XXX + * Copy the msi capability structure in the last 16 bytes of the + * config space. This is wrong because it could shadow something + * useful to the device. + */ + capoff = 256 - roundup(sizeof(msicap), 4); + capdata = (u_char *)&msicap; + for (i = 0; i < sizeof(msicap); i++) + pci_set_cfgdata8(pi, capoff + i, capdata[i]); + + return (capoff); +} +#endif /* LEGACY_SUPPORT */ + +static int +cfginitmsi(struct passthru_softc *sc) +{ + int ptr, cap, sts, caplen; + uint32_t u32; + struct pcisel sel; + struct pci_devinst *pi; + + pi = sc->psc_pi; + sel = sc->psc_sel; + + /* + * Parse the capabilities and cache the location of the MSI + * capability. + */ + sts = read_config(&sel, PCIR_STATUS, 2); + if (sts & PCIM_STATUS_CAPPRESENT) { + ptr = read_config(&sel, PCIR_CAP_PTR, 1); + while (ptr != 0 && ptr != 0xff) { + cap = read_config(&sel, ptr + PCICAP_ID, 1); + if (cap == PCIY_MSI) { + /* + * Copy the MSI capability into the config + * space of the emulated pci device + */ + sc->psc_msi.capoff = ptr; + sc->psc_msi.msgctrl = read_config(&sel, + ptr + 2, 2); + sc->psc_msi.emulated = 0; + caplen = msi_caplen(sc->psc_msi.msgctrl); + while (caplen > 0) { + u32 = read_config(&sel, ptr, 4); + pci_set_cfgdata32(pi, ptr, u32); + caplen -= 4; + ptr += 4; + } + break; + } + ptr = read_config(&sel, ptr + PCICAP_NEXTPTR, 1); + } + } + +#ifdef LEGACY_SUPPORT + /* + * If the passthrough device does not support MSI then craft a + * MSI capability for it. We link the new MSI capability at the + * head of the list of capabilities. + */ + if ((sts & PCIM_STATUS_CAPPRESENT) != 0 && sc->psc_msi.capoff == 0) { + int origptr, msiptr; + origptr = read_config(&sel, PCIR_CAP_PTR, 1); + msiptr = passthru_add_msicap(pi, 1, origptr); + sc->psc_msi.capoff = msiptr; + sc->psc_msi.msgctrl = pci_get_cfgdata16(pi, msiptr + 2); + sc->psc_msi.emulated = 1; + pci_set_cfgdata8(pi, PCIR_CAP_PTR, msiptr); + } +#endif + + if (sc->psc_msi.capoff == 0) /* MSI or bust */ + return (-1); + else + return (0); +} + +static int +cfginitbar(struct vmctx *ctx, struct passthru_softc *sc) +{ + int i, error; + struct pci_devinst *pi; + struct pci_bar_io bar; + enum pcibar_type bartype; + uint64_t base; + + pi = sc->psc_pi; + + /* + * Initialize BAR registers + */ + for (i = 0; i <= PCI_BARMAX; i++) { + bzero(&bar, sizeof(bar)); + bar.pbi_sel = sc->psc_sel; + bar.pbi_reg = PCIR_BAR(i); + + if (ioctl(pcifd, PCIOCGETBAR, &bar) < 0) + continue; + + if (PCI_BAR_IO(bar.pbi_base)) { + bartype = PCIBAR_IO; + base = bar.pbi_base & PCIM_BAR_IO_BASE; + } else { + switch (bar.pbi_base & PCIM_BAR_MEM_TYPE) { + case PCIM_BAR_MEM_64: + bartype = PCIBAR_MEM64; + break; + default: + bartype = PCIBAR_MEM32; + break; + } + base = bar.pbi_base & PCIM_BAR_MEM_BASE; + } + + /* Cache information about the "real" BAR */ + sc->psc_bar[i].type = bartype; + sc->psc_bar[i].size = bar.pbi_length; + sc->psc_bar[i].addr = base; + + /* Allocate the BAR in the guest I/O or MMIO space */ + error = pci_emul_alloc_bar(pi, i, base, bartype, + bar.pbi_length); + if (error) + return (-1); + + /* + * Map the physical MMIO space in the guest MMIO space + */ + if (bartype != PCIBAR_IO) { + error = vm_map_pptdev_mmio(ctx, sc->psc_sel.pc_bus, + sc->psc_sel.pc_dev, sc->psc_sel.pc_func, + pi->pi_bar[i].addr, pi->pi_bar[i].size, base); + if (error) + return (-1); + } + + /* + * 64-bit BAR takes up two slots so skip the next one. + */ + if (bartype == PCIBAR_MEM64) { + i++; + assert(i <= PCI_BARMAX); + sc->psc_bar[i].type = PCIBAR_MEMHI64; + } + } + return (0); +} + +static int +cfginit(struct vmctx *ctx, struct pci_devinst *pi, int bus, int slot, int func) +{ + int error; + struct passthru_softc *sc; + + error = 1; + sc = pi->pi_arg; + + bzero(&sc->psc_sel, sizeof(struct pcisel)); + sc->psc_sel.pc_bus = bus; + sc->psc_sel.pc_dev = slot; + sc->psc_sel.pc_func = func; + + if (cfginitbar(ctx, sc) != 0) + goto done; + + if (cfginitmsi(sc) != 0) + goto done; + + error = 0; /* success */ +done: + return (error); +} + +static int +passthru_init(struct vmctx *ctx, struct pci_devinst *pi, char *opts) +{ + int bus, slot, func, error; + struct passthru_softc *sc; + + sc = NULL; + error = 1; + + if (pcifd < 0) { + pcifd = open(_PATH_DEVPCI, O_RDWR, 0); + if (pcifd < 0) + goto done; + } + + if (iofd < 0) { + iofd = open(_PATH_DEVIO, O_RDWR, 0); + if (iofd < 0) + goto done; + } + + if (opts == NULL || sscanf(opts, "%d/%d/%d", &bus, &slot, &func) != 3) + goto done; + + if (vm_assign_pptdev(ctx, bus, slot, func) != 0) + goto done; + + sc = malloc(sizeof(struct passthru_softc)); + memset(sc, 0, sizeof(struct passthru_softc)); + + pi->pi_arg = sc; + sc->psc_pi = pi; + + /* initialize config space */ + if (cfginit(ctx, pi, bus, slot, func) != 0) + goto done; + + error = 0; /* success */ +done: + if (error) { + free(sc); + vm_unassign_pptdev(ctx, bus, slot, func); + } + return (error); +} + +static int +bar_access(int coff) +{ + if (coff >= PCIR_BAR(0) && coff < PCIR_BAR(PCI_BARMAX + 1)) + return (1); + else + return (0); +} + +static int +msicap_access(struct passthru_softc *sc, int coff) +{ + int caplen; + + if (sc->psc_msi.capoff == 0) + return (0); + + caplen = msi_caplen(sc->psc_msi.msgctrl); + + if (coff >= sc->psc_msi.capoff && coff < sc->psc_msi.capoff + caplen) + return (1); + else + return (0); +} + +static int +passthru_cfgread(struct vmctx *ctx, int vcpu, struct pci_devinst *pi, int coff, + int bytes, uint32_t *rv) +{ + struct passthru_softc *sc; + + sc = pi->pi_arg; + + /* + * PCI BARs and MSI capability is emulated. + */ + if (bar_access(coff) || msicap_access(sc, coff)) + return (-1); + +#ifdef LEGACY_SUPPORT + /* + * Emulate PCIR_CAP_PTR if this device does not support MSI capability + * natively. + */ + if (sc->psc_msi.emulated) { + if (coff >= PCIR_CAP_PTR && coff < PCIR_CAP_PTR + 4) + return (-1); + } +#endif + + /* Everything else just read from the device's config space */ + *rv = read_config(&sc->psc_sel, coff, bytes); + + return (0); +} + +static int +passthru_cfgwrite(struct vmctx *ctx, int vcpu, struct pci_devinst *pi, int coff, + int bytes, uint32_t val) +{ + int error; + struct passthru_softc *sc; + + sc = pi->pi_arg; + + /* + * PCI BARs are emulated + */ + if (bar_access(coff)) + return (-1); + + /* + * MSI capability is emulated + */ + if (msicap_access(sc, coff)) { + msicap_cfgwrite(pi, sc->psc_msi.capoff, coff, bytes, val); + + error = vm_setup_msi(ctx, vcpu, sc->psc_sel.pc_bus, + sc->psc_sel.pc_dev, sc->psc_sel.pc_func, pi->pi_msi.cpu, + pi->pi_msi.vector, pi->pi_msi.msgnum); + if (error != 0) { + printf("vm_setup_msi returned error %d\r\n", errno); + exit(1); + } + return (0); + } + +#ifdef LEGACY_SUPPORT + /* + * If this device does not support MSI natively then we cannot let + * the guest disable legacy interrupts from the device. It is the + * legacy interrupt that is triggering the virtual MSI to the guest. + */ + if (sc->psc_msi.emulated && pci_msi_enabled(pi)) { + if (coff == PCIR_COMMAND && bytes == 2) + val &= ~PCIM_CMD_INTxDIS; + } +#endif + + write_config(&sc->psc_sel, coff, bytes, val); + + return (0); +} + +static void +passthru_iow(struct pci_devinst *pi, int baridx, int offset, int size, + uint32_t value) +{ + struct passthru_softc *sc; + struct iodev_pio_req pio; + + sc = pi->pi_arg; + + bzero(&pio, sizeof(struct iodev_pio_req)); + pio.access = IODEV_PIO_WRITE; + pio.port = sc->psc_bar[baridx].addr + offset; + pio.width = size; + pio.val = value; + + (void)ioctl(iofd, IODEV_PIO, &pio); +} + +static uint32_t +passthru_ior(struct pci_devinst *pi, int baridx, int offset, int size) +{ + struct passthru_softc *sc; + struct iodev_pio_req pio; + + sc = pi->pi_arg; + + bzero(&pio, sizeof(struct iodev_pio_req)); + pio.access = IODEV_PIO_READ; + pio.port = sc->psc_bar[baridx].addr + offset; + pio.width = size; + pio.val = 0; + + (void)ioctl(iofd, IODEV_PIO, &pio); + + return (pio.val); +} + +struct pci_devemu passthru = { + .pe_emu = "passthru", + .pe_init = passthru_init, + .pe_cfgwrite = passthru_cfgwrite, + .pe_cfgread = passthru_cfgread, + .pe_iow = passthru_iow, + .pe_ior = passthru_ior, +}; +PCI_EMUL_SET(passthru); diff --git a/usr.sbin/bhyve/pci_virtio_block.c b/usr.sbin/bhyve/pci_virtio_block.c new file mode 100644 index 000000000000..b86e21dff64e --- /dev/null +++ b/usr.sbin/bhyve/pci_virtio_block.c @@ -0,0 +1,502 @@ +/*- + * Copyright (c) 2011 NetApp, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#include +__FBSDID("$FreeBSD$"); + +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "fbsdrun.h" +#include "pci_emul.h" +#include "virtio.h" + +#define VTBLK_RINGSZ 64 + +#define VTBLK_CFGSZ 28 + +#define VTBLK_R_CFG VTCFG_R_CFG0 +#define VTBLK_R_CFG_END VTBLK_R_CFG + VTBLK_CFGSZ -1 +#define VTBLK_R_MAX VTBLK_R_CFG_END + +#define VTBLK_REGSZ VTBLK_R_MAX+1 + +#define VTBLK_MAXSEGS 32 + +#define VTBLK_S_OK 0 +#define VTBLK_S_IOERR 1 + +/* + * Host capabilities + */ +#define VTBLK_S_HOSTCAPS \ + ( 0x00000004 | /* host maximum request segments */ \ + 0x10000000 ) /* supports indirect descriptors */ + +struct vring_hqueue { + /* Internal state */ + uint16_t hq_size; + uint16_t hq_cur_aidx; /* trails behind 'avail_idx' */ + + /* Host-context pointers to the queue */ + struct virtio_desc *hq_dtable; + uint16_t *hq_avail_flags; + uint16_t *hq_avail_idx; /* monotonically increasing */ + uint16_t *hq_avail_ring; + + uint16_t *hq_used_flags; + uint16_t *hq_used_idx; /* monotonically increasing */ + struct virtio_used *hq_used_ring; +}; + +/* + * Config space + */ +struct vtblk_config { + uint64_t vbc_capacity; + uint32_t vbc_size_max; + uint32_t vbc_seg_max; + uint16_t vbc_geom_c; + uint8_t vbc_geom_h; + uint8_t vbc_geom_s; + uint32_t vbc_blk_size; + uint32_t vbc_sectors_max; +} __packed; +CTASSERT(sizeof(struct vtblk_config) == VTBLK_CFGSZ); + +/* + * Fixed-size block header + */ +struct virtio_blk_hdr { +#define VBH_OP_READ 0 +#define VBH_OP_WRITE 1 + uint32_t vbh_type; + uint32_t vbh_ioprio; + uint64_t vbh_sector; +} __packed; + +/* + * Debug printf + */ +static int pci_vtblk_debug; +#define DPRINTF(params) if (pci_vtblk_debug) printf params +#define WPRINTF(params) printf params + +/* + * Per-device softc + */ +struct pci_vtblk_softc { + struct pci_devinst *vbsc_pi; + int vbsc_fd; + int vbsc_status; + int vbsc_isr; + int vbsc_lastq; + uint32_t vbsc_features; + uint64_t vbsc_pfn; + struct vring_hqueue vbsc_q; + struct vtblk_config vbsc_cfg; +}; + +/* + * Return the number of available descriptors in the vring taking care + * of the 16-bit index wraparound. + */ +static int +hq_num_avail(struct vring_hqueue *hq) +{ + int ndesc; + + if (*hq->hq_avail_idx >= hq->hq_cur_aidx) + ndesc = *hq->hq_avail_idx - hq->hq_cur_aidx; + else + ndesc = UINT16_MAX - hq->hq_cur_aidx + *hq->hq_avail_idx + 1; + + assert(ndesc >= 0 && ndesc <= hq->hq_size); + + return (ndesc); +} + +static void +pci_vtblk_update_status(struct pci_vtblk_softc *sc, uint32_t value) +{ + if (value == 0) { + DPRINTF(("vtblk: device reset requested !\n")); + } + + sc->vbsc_status = value; +} + +static void +pci_vtblk_proc(struct pci_vtblk_softc *sc, struct vring_hqueue *hq) +{ + struct iovec iov[VTBLK_MAXSEGS]; + struct virtio_blk_hdr *vbh; + struct virtio_desc *vd, *vid; + struct virtio_used *vu; + uint8_t *status; + int i; + int err; + int iolen; + int nsegs; + int uidx, aidx, didx; + int writeop; + off_t offset; + + uidx = *hq->hq_used_idx; + aidx = hq->hq_cur_aidx; + didx = hq->hq_avail_ring[aidx % hq->hq_size]; + assert(didx >= 0 && didx < hq->hq_size); + + vd = &hq->hq_dtable[didx]; + + /* + * Verify that the descriptor is indirect, and obtain + * the pointer to the indirect descriptor. + * There has to be space for at least 3 descriptors + * in the indirect descriptor array: the block header, + * 1 or more data descriptors, and a status byte. + */ + assert(vd->vd_flags & VRING_DESC_F_INDIRECT); + + nsegs = vd->vd_len / sizeof(struct virtio_desc); + assert(nsegs >= 3); + assert(nsegs < VTBLK_MAXSEGS + 2); + + vid = paddr_guest2host(vd->vd_addr); + assert((vid->vd_flags & VRING_DESC_F_INDIRECT) == 0); + + /* + * The first descriptor will be the read-only fixed header + */ + vbh = paddr_guest2host(vid[0].vd_addr); + assert(vid[0].vd_len == sizeof(struct virtio_blk_hdr)); + assert(vid[0].vd_flags & VRING_DESC_F_NEXT); + assert((vid[0].vd_flags & VRING_DESC_F_WRITE) == 0); + + writeop = (vbh->vbh_type == VBH_OP_WRITE); + + offset = vbh->vbh_sector * DEV_BSIZE; + + /* + * Build up the iovec based on the guest's data descriptors + */ + for (i = 1, iolen = 0; i < nsegs - 1; i++) { + iov[i-1].iov_base = paddr_guest2host(vid[i].vd_addr); + iov[i-1].iov_len = vid[i].vd_len; + iolen += vid[i].vd_len; + + assert(vid[i].vd_flags & VRING_DESC_F_NEXT); + assert((vid[i].vd_flags & VRING_DESC_F_INDIRECT) == 0); + + /* + * - write op implies read-only descriptor, + * - read op implies write-only descriptor, + * therefore test the inverse of the descriptor bit + * to the op. + */ + assert(((vid[i].vd_flags & VRING_DESC_F_WRITE) == 0) == + writeop); + } + + /* Lastly, get the address of the status byte */ + status = paddr_guest2host(vid[nsegs - 1].vd_addr); + assert(vid[nsegs - 1].vd_len == 1); + assert((vid[nsegs - 1].vd_flags & VRING_DESC_F_NEXT) == 0); + assert(vid[nsegs - 1].vd_flags & VRING_DESC_F_WRITE); + + DPRINTF(("virtio-block: %s op, %d bytes, %d segs, offset %ld\n\r", + writeop ? "write" : "read", iolen, nsegs - 2, offset)); + + if (writeop){ + err = pwritev(sc->vbsc_fd, iov, nsegs - 2, offset); + } else { + err = preadv(sc->vbsc_fd, iov, nsegs - 2, offset); + } + + *status = err < 0 ? VTBLK_S_IOERR : VTBLK_S_OK; + + /* + * Return the single indirect descriptor back to the host + */ + vu = &hq->hq_used_ring[uidx % hq->hq_size]; + vu->vu_idx = didx; + vu->vu_tlen = 1; + hq->hq_cur_aidx++; + *hq->hq_used_idx += 1; +} + +static void +pci_vtblk_qnotify(struct pci_vtblk_softc *sc) +{ + struct vring_hqueue *hq = &sc->vbsc_q; + int i; + int ndescs; + + /* + * Calculate number of ring entries to process + */ + ndescs = hq_num_avail(hq); + + if (ndescs == 0) + return; + + /* + * Run through all the entries, placing them into iovecs and + * sending when an end-of-packet is found + */ + for (i = 0; i < ndescs; i++) + pci_vtblk_proc(sc, hq); + + /* + * Generate an interrupt if able + */ + if ((*hq->hq_avail_flags & VRING_AVAIL_F_NO_INTERRUPT) == 0 && + sc->vbsc_isr == 0) { + sc->vbsc_isr = 1; + pci_generate_msi(sc->vbsc_pi, 0); + } + +} + +static void +pci_vtblk_ring_init(struct pci_vtblk_softc *sc, uint64_t pfn) +{ + struct vring_hqueue *hq; + + sc->vbsc_pfn = pfn << VRING_PFN; + + /* + * Set up host pointers to the various parts of the + * queue + */ + hq = &sc->vbsc_q; + hq->hq_size = VTBLK_RINGSZ; + + hq->hq_dtable = paddr_guest2host(pfn << VRING_PFN); + hq->hq_avail_flags = (uint16_t *)(hq->hq_dtable + hq->hq_size); + hq->hq_avail_idx = hq->hq_avail_flags + 1; + hq->hq_avail_ring = hq->hq_avail_flags + 2; + hq->hq_used_flags = (uint16_t *)roundup2((uintptr_t)hq->hq_avail_ring, + VRING_ALIGN); + hq->hq_used_idx = hq->hq_used_flags + 1; + hq->hq_used_ring = (struct virtio_used *)(hq->hq_used_flags + 2); + + /* + * Initialize queue indexes + */ + hq->hq_cur_aidx = 0; +} + +static int +pci_vtblk_init(struct vmctx *ctx, struct pci_devinst *pi, char *opts) +{ + struct stat sbuf; + struct pci_vtblk_softc *sc; + int fd; + + if (opts == NULL) { + printf("virtio-block: backing device required\n"); + return (1); + } + + /* + * Access to guest memory is required. Fail if + * memory not mapped + */ + if (paddr_guest2host(0) == NULL) + return (1); + + /* + * The supplied backing file has to exist + */ + fd = open(opts, O_RDWR); + if (fd < 0) { + perror("Could not open backing file"); + return (1); + } + + if (fstat(fd, &sbuf) < 0) { + perror("Could not stat backing file"); + close(fd); + return (1); + } + + sc = malloc(sizeof(struct pci_vtblk_softc)); + memset(sc, 0, sizeof(struct pci_vtblk_softc)); + + pi->pi_arg = sc; + sc->vbsc_pi = pi; + sc->vbsc_fd = fd; + + /* setup virtio block config space */ + sc->vbsc_cfg.vbc_capacity = sbuf.st_size / DEV_BSIZE; + sc->vbsc_cfg.vbc_seg_max = VTBLK_MAXSEGS; + sc->vbsc_cfg.vbc_blk_size = DEV_BSIZE; + sc->vbsc_cfg.vbc_size_max = 0; /* not negotiated */ + sc->vbsc_cfg.vbc_geom_c = 0; /* no geometry */ + sc->vbsc_cfg.vbc_geom_h = 0; + sc->vbsc_cfg.vbc_geom_s = 0; + sc->vbsc_cfg.vbc_sectors_max = 0; + + /* initialize config space */ + pci_set_cfgdata16(pi, PCIR_DEVICE, VIRTIO_DEV_BLOCK); + pci_set_cfgdata16(pi, PCIR_VENDOR, VIRTIO_VENDOR); + pci_set_cfgdata8(pi, PCIR_CLASS, PCIC_STORAGE); + pci_set_cfgdata16(pi, PCIR_SUBDEV_0, VIRTIO_TYPE_BLOCK); + pci_emul_alloc_bar(pi, 0, 0, PCIBAR_IO, VTBLK_REGSZ); + pci_emul_add_msicap(pi, 1); + + return (0); +} + +static void +pci_vtblk_write(struct pci_devinst *pi, int baridx, int offset, int size, + uint32_t value) +{ + struct pci_vtblk_softc *sc = pi->pi_arg; + + if (offset + size > VTBLK_REGSZ) { + DPRINTF(("vtblk_write: 2big, offset %d size %d\n", + offset, size)); + return; + } + + switch (offset) { + case VTCFG_R_GUESTCAP: + assert(size == 4); + sc->vbsc_features = value & VTBLK_S_HOSTCAPS; + break; + case VTCFG_R_PFN: + assert(size == 4); + pci_vtblk_ring_init(sc, value); + break; + case VTCFG_R_QSEL: + assert(size == 2); + sc->vbsc_lastq = value; + break; + case VTCFG_R_QNOTIFY: + assert(size == 2); + assert(value == 0); + pci_vtblk_qnotify(sc); + break; + case VTCFG_R_STATUS: + assert(size == 1); + pci_vtblk_update_status(sc, value); + break; + case VTCFG_R_HOSTCAP: + case VTCFG_R_QNUM: + case VTCFG_R_ISR: + case VTBLK_R_CFG ... VTBLK_R_CFG_END: + DPRINTF(("vtblk: write to readonly reg %d\n\r", offset)); + break; + default: + DPRINTF(("vtblk: unknown i/o write offset %d\n\r", offset)); + value = 0; + break; + } +} + +uint32_t +pci_vtblk_read(struct pci_devinst *pi, int baridx, int offset, int size) +{ + struct pci_vtblk_softc *sc = pi->pi_arg; + uint32_t value; + + if (offset + size > VTBLK_REGSZ) { + DPRINTF(("vtblk_read: 2big, offset %d size %d\n", + offset, size)); + return (0); + } + + switch (offset) { + case VTCFG_R_HOSTCAP: + assert(size == 4); + value = VTBLK_S_HOSTCAPS; + break; + case VTCFG_R_GUESTCAP: + assert(size == 4); + value = sc->vbsc_features; /* XXX never read ? */ + break; + case VTCFG_R_PFN: + assert(size == 4); + value = sc->vbsc_pfn >> VRING_PFN; + break; + case VTCFG_R_QNUM: + value = (sc->vbsc_lastq == 0) ? VTBLK_RINGSZ: 0; + break; + case VTCFG_R_QSEL: + assert(size == 2); + value = sc->vbsc_lastq; /* XXX never read ? */ + break; + case VTCFG_R_QNOTIFY: + assert(size == 2); + value = 0; /* XXX never read ? */ + break; + case VTCFG_R_STATUS: + assert(size == 1); + value = sc->vbsc_status; + break; + case VTCFG_R_ISR: + assert(size == 1); + value = sc->vbsc_isr; + sc->vbsc_isr = 0; /* a read clears this flag */ + break; + case VTBLK_R_CFG ... VTBLK_R_CFG_END: + assert(size == 1); + value = *((uint8_t *)&sc->vbsc_cfg + offset - VTBLK_R_CFG); + break; + default: + DPRINTF(("vtblk: unknown i/o read offset %d\n\r", offset)); + value = 0; + break; + } + + return (value); +} + +struct pci_devemu pci_de_vblk = { + .pe_emu = "virtio-blk", + .pe_init = pci_vtblk_init, + .pe_iow = pci_vtblk_write, + .pe_ior = pci_vtblk_read, +}; +PCI_EMUL_SET(pci_de_vblk); diff --git a/usr.sbin/bhyve/pci_virtio_net.c b/usr.sbin/bhyve/pci_virtio_net.c new file mode 100644 index 000000000000..5db1eb7f5a73 --- /dev/null +++ b/usr.sbin/bhyve/pci_virtio_net.c @@ -0,0 +1,739 @@ +/*- + * Copyright (c) 2011 NetApp, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#include +__FBSDID("$FreeBSD$"); + +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "fbsdrun.h" +#include "pci_emul.h" +#include "mevent.h" +#include "virtio.h" + +#define VTNET_RINGSZ 256 + +#define VTNET_MAXSEGS 32 + +/* + * PCI config-space register offsets + */ +#define VTNET_R_CFG0 20 +#define VTNET_R_CFG1 21 +#define VTNET_R_CFG2 22 +#define VTNET_R_CFG3 23 +#define VTNET_R_CFG4 24 +#define VTNET_R_CFG5 25 +#define VTNET_R_CFG6 26 +#define VTNET_R_CFG7 27 +#define VTNET_R_MAX 27 + +#define VTNET_REGSZ VTNET_R_MAX+1 + +/* + * Host capabilities + */ +#define VTNET_S_HOSTCAPS \ + ( 0x00000020 | /* host supplies MAC */ \ + 0x00008000 | /* host can merge Rx buffers */ \ + 0x00010000 ) /* config status available */ + +/* + * Queue definitions. + */ +#define VTNET_RXQ 0 +#define VTNET_TXQ 1 +#define VTNET_CTLQ 2 + +#define VTNET_MAXQ 3 + +struct vring_hqueue { + /* Internal state */ + uint16_t hq_size; + uint16_t hq_cur_aidx; /* trails behind 'avail_idx' */ + + /* Host-context pointers to the queue */ + struct virtio_desc *hq_dtable; + uint16_t *hq_avail_flags; + uint16_t *hq_avail_idx; /* monotonically increasing */ + uint16_t *hq_avail_ring; + + uint16_t *hq_used_flags; + uint16_t *hq_used_idx; /* monotonically increasing */ + struct virtio_used *hq_used_ring; +}; + +/* + * Fixed network header size + */ +struct virtio_net_rxhdr { + uint8_t vrh_flags; + uint8_t vrh_gso_type; + uint16_t vrh_hdr_len; + uint16_t vrh_gso_size; + uint16_t vrh_csum_start; + uint16_t vrh_csum_offset; + uint16_t vrh_bufs; +} __packed; + +/* + * Debug printf + */ +static int pci_vtnet_debug; +#define DPRINTF(params) if (pci_vtnet_debug) printf params +#define WPRINTF(params) printf params + +/* + * Per-device softc + */ +struct pci_vtnet_softc { + struct pci_devinst *vsc_pi; + pthread_mutex_t vsc_mtx; + struct mevent *vsc_mevp; + + int vsc_curq; + int vsc_status; + int vsc_isr; + int vsc_tapfd; + int vsc_rx_ready; + int vsc_rxpend; + + uint32_t vsc_features; + uint8_t vsc_macaddr[6]; + + uint64_t vsc_pfn[VTNET_MAXQ]; + struct vring_hqueue vsc_hq[VTNET_MAXQ]; +}; + +/* + * Return the number of available descriptors in the vring taking care + * of the 16-bit index wraparound. + */ +static int +hq_num_avail(struct vring_hqueue *hq) +{ + int ndesc; + + if (*hq->hq_avail_idx >= hq->hq_cur_aidx) + ndesc = *hq->hq_avail_idx - hq->hq_cur_aidx; + else + ndesc = UINT16_MAX - hq->hq_cur_aidx + *hq->hq_avail_idx + 1; + + assert(ndesc >= 0 && ndesc <= hq->hq_size); + + return (ndesc); +} + +static uint16_t +pci_vtnet_qsize(int qnum) +{ + /* XXX no ctl queue currently */ + if (qnum == VTNET_CTLQ) { + return (0); + } + + /* XXX fixed currently. Maybe different for tx/rx/ctl */ + return (VTNET_RINGSZ); +} + +static void +pci_vtnet_update_status(struct pci_vtnet_softc *sc, uint32_t value) +{ + if (value == 0) { + DPRINTF(("vtnet: device reset requested !\n")); + } + + sc->vsc_status = value; +} + +/* + * Called to send a buffer chain out to the tap device + */ +static void +pci_vtnet_tap_tx(struct pci_vtnet_softc *sc, struct iovec *iov, int iovcnt, + int len) +{ + char pad[60]; + + if (sc->vsc_tapfd == -1) + return; + + /* + * If the length is < 60, pad out to that and add the + * extra zero'd segment to the iov. It is guaranteed that + * there is always an extra iov available by the caller. + */ + if (len < 60) { + memset(pad, 0, 60 - len); + iov[iovcnt].iov_base = pad; + iov[iovcnt].iov_len = 60 - len; + iovcnt++; + } + (void) writev(sc->vsc_tapfd, iov, iovcnt); +} + +/* + * Called when there is read activity on the tap file descriptor. + * Each buffer posted by the guest is assumed to be able to contain + * an entire ethernet frame + rx header. + * MP note: the dummybuf is only used for discarding frames, so there + * is no need for it to be per-vtnet or locked. + */ +static uint8_t dummybuf[2048]; + +static void +pci_vtnet_tap_rx(struct pci_vtnet_softc *sc) +{ + struct virtio_desc *vd; + struct virtio_used *vu; + struct vring_hqueue *hq; + struct virtio_net_rxhdr *vrx; + uint8_t *buf; + int i; + int len; + int ndescs; + int didx, uidx, aidx; /* descriptor, avail and used index */ + + /* + * Should never be called without a valid tap fd + */ + assert(sc->vsc_tapfd != -1); + + /* + * But, will be called when the rx ring hasn't yet + * been set up. + */ + if (sc->vsc_rx_ready == 0) { + /* + * Drop the packet and try later. + */ + (void) read(sc->vsc_tapfd, dummybuf, sizeof(dummybuf)); + return; + } + + /* + * Calculate the number of available rx buffers + */ + hq = &sc->vsc_hq[VTNET_RXQ]; + + ndescs = hq_num_avail(hq); + + if (ndescs == 0) { + /* + * Need to wait for host notification to read + */ + if (sc->vsc_rxpend == 0) { + WPRINTF(("vtnet: no rx descriptors !\n")); + sc->vsc_rxpend = 1; + } + + /* + * Drop the packet and try later + */ + (void) read(sc->vsc_tapfd, dummybuf, sizeof(dummybuf)); + return; + } + + aidx = hq->hq_cur_aidx; + uidx = *hq->hq_used_idx; + for (i = 0; i < ndescs; i++) { + /* + * 'aidx' indexes into the an array of descriptor indexes + */ + didx = hq->hq_avail_ring[aidx % hq->hq_size]; + assert(didx >= 0 && didx < hq->hq_size); + + vd = &hq->hq_dtable[didx]; + + /* + * Get a pointer to the rx header, and use the + * data immediately following it for the packet buffer. + */ + vrx = (struct virtio_net_rxhdr *)paddr_guest2host(vd->vd_addr); + buf = (uint8_t *)(vrx + 1); + + len = read(sc->vsc_tapfd, buf, + vd->vd_len - sizeof(struct virtio_net_rxhdr)); + + if (len < 0 && errno == EWOULDBLOCK) { + break; + } + + /* + * The only valid field in the rx packet header is the + * number of buffers, which is always 1 without TSO + * support. + */ + memset(vrx, 0, sizeof(struct virtio_net_rxhdr)); + vrx->vrh_bufs = 1; + + /* + * Write this descriptor into the used ring + */ + vu = &hq->hq_used_ring[uidx % hq->hq_size]; + vu->vu_idx = didx; + vu->vu_tlen = len + sizeof(struct virtio_net_rxhdr); + uidx++; + aidx++; + } + + /* + * Update the used pointer, and signal an interrupt if allowed + */ + *hq->hq_used_idx = uidx; + hq->hq_cur_aidx = aidx; + + if ((*hq->hq_avail_flags & VRING_AVAIL_F_NO_INTERRUPT) == 0) { + sc->vsc_isr |= 1; + pci_generate_msi(sc->vsc_pi, 0); + } +} + +static void +pci_vtnet_tap_callback(int fd, enum ev_type type, void *param) +{ + struct pci_vtnet_softc *sc = param; + + pthread_mutex_lock(&sc->vsc_mtx); + pci_vtnet_tap_rx(sc); + pthread_mutex_unlock(&sc->vsc_mtx); + +} + +static void +pci_vtnet_ping_rxq(struct pci_vtnet_softc *sc) +{ + /* + * A qnotify means that the rx process can now begin + */ + if (sc->vsc_rx_ready == 0) { + sc->vsc_rx_ready = 1; + } + + /* + * If the rx queue was empty, attempt to receive a + * packet that was previously blocked due to no rx bufs + * available + */ + if (sc->vsc_rxpend) { + WPRINTF(("vtnet: rx resumed\n\r")); + sc->vsc_rxpend = 0; + pci_vtnet_tap_rx(sc); + } +} + +static void +pci_vtnet_proctx(struct pci_vtnet_softc *sc, struct vring_hqueue *hq) +{ + struct iovec iov[VTNET_MAXSEGS + 1]; + struct virtio_desc *vd; + struct virtio_used *vu; + int i; + int plen; + int tlen; + int uidx, aidx, didx; + + uidx = *hq->hq_used_idx; + aidx = hq->hq_cur_aidx; + didx = hq->hq_avail_ring[aidx % hq->hq_size]; + assert(didx >= 0 && didx < hq->hq_size); + + vd = &hq->hq_dtable[didx]; + + /* + * Run through the chain of descriptors, ignoring the + * first header descriptor. However, include the header + * length in the total length that will be put into the + * used queue. + */ + tlen = vd->vd_len; + vd = &hq->hq_dtable[vd->vd_next]; + + for (i = 0, plen = 0; + i < VTNET_MAXSEGS; + i++, vd = &hq->hq_dtable[vd->vd_next]) { + iov[i].iov_base = paddr_guest2host(vd->vd_addr); + iov[i].iov_len = vd->vd_len; + plen += vd->vd_len; + tlen += vd->vd_len; + + if ((vd->vd_flags & VRING_DESC_F_NEXT) == 0) + break; + } + assert(i < VTNET_MAXSEGS); + + DPRINTF(("virtio: packet send, %d bytes, %d segs\n\r", plen, i + 1)); + pci_vtnet_tap_tx(sc, iov, i + 1, plen); + + /* + * Return this chain back to the host + */ + vu = &hq->hq_used_ring[uidx % hq->hq_size]; + vu->vu_idx = didx; + vu->vu_tlen = tlen; + hq->hq_cur_aidx = aidx + 1; + *hq->hq_used_idx = uidx + 1; + + /* + * Generate an interrupt if able + */ + if ((*hq->hq_avail_flags & VRING_AVAIL_F_NO_INTERRUPT) == 0) { + sc->vsc_isr |= 1; + pci_generate_msi(sc->vsc_pi, 0); + } +} + +static void +pci_vtnet_ping_txq(struct pci_vtnet_softc *sc) +{ + struct vring_hqueue *hq = &sc->vsc_hq[VTNET_TXQ]; + int i; + int ndescs; + + /* + * Calculate number of ring entries to process + */ + ndescs = hq_num_avail(hq); + + if (ndescs == 0) + return; + + /* + * Run through all the entries, placing them into iovecs and + * sending when an end-of-packet is found + */ + for (i = 0; i < ndescs; i++) + pci_vtnet_proctx(sc, hq); +} + +static void +pci_vtnet_ping_ctlq(struct pci_vtnet_softc *sc) +{ + + DPRINTF(("vtnet: control qnotify!\n\r")); +} + +static void +pci_vtnet_ring_init(struct pci_vtnet_softc *sc, uint64_t pfn) +{ + struct vring_hqueue *hq; + int qnum = sc->vsc_curq; + + assert(qnum < VTNET_MAXQ); + + sc->vsc_pfn[qnum] = pfn << VRING_PFN; + + /* + * Set up host pointers to the various parts of the + * queue + */ + hq = &sc->vsc_hq[qnum]; + hq->hq_size = pci_vtnet_qsize(qnum); + + hq->hq_dtable = paddr_guest2host(pfn << VRING_PFN); + hq->hq_avail_flags = (uint16_t *)(hq->hq_dtable + hq->hq_size); + hq->hq_avail_idx = hq->hq_avail_flags + 1; + hq->hq_avail_ring = hq->hq_avail_flags + 2; + hq->hq_used_flags = (uint16_t *)roundup2((uintptr_t)hq->hq_avail_ring, + VRING_ALIGN); + hq->hq_used_idx = hq->hq_used_flags + 1; + hq->hq_used_ring = (struct virtio_used *)(hq->hq_used_flags + 2); + + /* + * Initialize queue indexes + */ + hq->hq_cur_aidx = 0; +} + +static int +pci_vtnet_init(struct vmctx *ctx, struct pci_devinst *pi, char *opts) +{ + MD5_CTX mdctx; + unsigned char digest[16]; + char nstr[80]; + struct pci_vtnet_softc *sc; + + /* + * Access to guest memory is required. Fail if + * memory not mapped + */ + if (paddr_guest2host(0) == NULL) + return (1); + + sc = malloc(sizeof(struct pci_vtnet_softc)); + memset(sc, 0, sizeof(struct pci_vtnet_softc)); + + pi->pi_arg = sc; + sc->vsc_pi = pi; + + pthread_mutex_init(&sc->vsc_mtx, NULL); + + /* + * Attempt to open the tap device + */ + sc->vsc_tapfd = -1; + if (opts != NULL) { + char tbuf[80]; + + strcpy(tbuf, "/dev/"); + strncat(tbuf, opts, sizeof(tbuf) - strlen(tbuf)); + + sc->vsc_tapfd = open(tbuf, O_RDWR); + if (sc->vsc_tapfd == -1) { + WPRINTF(("open of tap device %s failed\n", tbuf)); + } else { + /* + * Set non-blocking and register for read + * notifications with the event loop + */ + int opt = 1; + if (ioctl(sc->vsc_tapfd, FIONBIO, &opt) < 0) { + WPRINTF(("tap device O_NONBLOCK failed\n")); + close(sc->vsc_tapfd); + sc->vsc_tapfd = -1; + } + + sc->vsc_mevp = mevent_add(sc->vsc_tapfd, + EVF_READ, + pci_vtnet_tap_callback, + sc); + if (sc->vsc_mevp == NULL) { + WPRINTF(("Could not register event\n")); + close(sc->vsc_tapfd); + sc->vsc_tapfd = -1; + } + } + } + + /* + * The MAC address is the standard NetApp OUI of 00-a0-98, + * followed by an MD5 of the vm name. The slot number is + * prepended to this for slots other than 1, so that + * CFE can netboot from the equivalent of slot 1. + */ + if (pi->pi_slot == 1) { + strncpy(nstr, vmname, sizeof(nstr)); + } else { + snprintf(nstr, sizeof(nstr), "%d-%s", pi->pi_slot, vmname); + } + + MD5Init(&mdctx); + MD5Update(&mdctx, nstr, strlen(nstr)); + MD5Final(digest, &mdctx); + + sc->vsc_macaddr[0] = 0x00; + sc->vsc_macaddr[1] = 0xa0; + sc->vsc_macaddr[2] = 0x98; + sc->vsc_macaddr[3] = digest[0]; + sc->vsc_macaddr[4] = digest[1]; + sc->vsc_macaddr[5] = digest[2]; + + /* initialize config space */ + pci_set_cfgdata16(pi, PCIR_DEVICE, VIRTIO_DEV_NET); + pci_set_cfgdata16(pi, PCIR_VENDOR, VIRTIO_VENDOR); + pci_set_cfgdata8(pi, PCIR_CLASS, PCIC_NETWORK); + pci_set_cfgdata16(pi, PCIR_SUBDEV_0, VIRTIO_TYPE_NET); + pci_emul_alloc_bar(pi, 0, 0, PCIBAR_IO, VTNET_REGSZ); + pci_emul_add_msicap(pi, 1); + + return (0); +} + +/* + * Function pointer array to handle queue notifications + */ +static void (*pci_vtnet_qnotify[VTNET_MAXQ])(struct pci_vtnet_softc *) = { + pci_vtnet_ping_rxq, + pci_vtnet_ping_txq, + pci_vtnet_ping_ctlq +}; + +static void +pci_vtnet_write(struct pci_devinst *pi, int baridx, int offset, int size, + uint32_t value) +{ + struct pci_vtnet_softc *sc = pi->pi_arg; + + if (offset + size > VTNET_REGSZ) { + DPRINTF(("vtnet_write: 2big, offset %d size %d\n", + offset, size)); + return; + } + + pthread_mutex_lock(&sc->vsc_mtx); + + switch (offset) { + case VTCFG_R_GUESTCAP: + assert(size == 4); + sc->vsc_features = value & VTNET_S_HOSTCAPS; + break; + case VTCFG_R_PFN: + assert(size == 4); + pci_vtnet_ring_init(sc, value); + break; + case VTCFG_R_QSEL: + assert(size == 2); + assert(value < VTNET_MAXQ); + sc->vsc_curq = value; + break; + case VTCFG_R_QNOTIFY: + assert(size == 2); + assert(value < VTNET_MAXQ); + (*pci_vtnet_qnotify[value])(sc); + break; + case VTCFG_R_STATUS: + assert(size == 1); + pci_vtnet_update_status(sc, value); + break; + case VTNET_R_CFG0: + case VTNET_R_CFG1: + case VTNET_R_CFG2: + case VTNET_R_CFG3: + case VTNET_R_CFG4: + case VTNET_R_CFG5: + /* + * The driver is allowed to change the MAC address + */ + assert(size == 1); + sc->vsc_macaddr[offset - VTNET_R_CFG0] = value; + break; + case VTCFG_R_HOSTCAP: + case VTCFG_R_QNUM: + case VTCFG_R_ISR: + case VTNET_R_CFG6: + case VTNET_R_CFG7: + DPRINTF(("vtnet: write to readonly reg %d\n\r", offset)); + break; + default: + DPRINTF(("vtnet: unknown i/o write offset %d\n\r", offset)); + value = 0; + break; + } + + pthread_mutex_unlock(&sc->vsc_mtx); +} + +uint32_t +pci_vtnet_read(struct pci_devinst *pi, int baridx, int offset, int size) +{ + struct pci_vtnet_softc *sc = pi->pi_arg; + uint32_t value; + + if (offset + size > VTNET_REGSZ) { + DPRINTF(("vtnet_read: 2big, offset %d size %d\n", + offset, size)); + return (0); + } + + pthread_mutex_lock(&sc->vsc_mtx); + + switch (offset) { + case VTCFG_R_HOSTCAP: + assert(size == 4); + value = VTNET_S_HOSTCAPS; + break; + case VTCFG_R_GUESTCAP: + assert(size == 4); + value = sc->vsc_features; /* XXX never read ? */ + break; + case VTCFG_R_PFN: + assert(size == 4); + value = sc->vsc_pfn[sc->vsc_curq] >> VRING_PFN; + break; + case VTCFG_R_QNUM: + assert(size == 2); + value = pci_vtnet_qsize(sc->vsc_curq); + break; + case VTCFG_R_QSEL: + assert(size == 2); + value = sc->vsc_curq; /* XXX never read ? */ + break; + case VTCFG_R_QNOTIFY: + assert(size == 2); + value = sc->vsc_curq; /* XXX never read ? */ + break; + case VTCFG_R_STATUS: + assert(size == 1); + value = sc->vsc_status; + break; + case VTCFG_R_ISR: + assert(size == 1); + value = sc->vsc_isr; + sc->vsc_isr = 0; /* a read clears this flag */ + break; + case VTNET_R_CFG0: + case VTNET_R_CFG1: + case VTNET_R_CFG2: + case VTNET_R_CFG3: + case VTNET_R_CFG4: + case VTNET_R_CFG5: + assert(size == 1); + value = sc->vsc_macaddr[offset - VTNET_R_CFG0]; + break; + case VTNET_R_CFG6: + assert(size == 1); + value = 0x01; /* XXX link always up */ + break; + case VTNET_R_CFG7: + assert(size == 1); + value = 0; /* link status is in the LSB */ + break; + default: + DPRINTF(("vtnet: unknown i/o read offset %d\n\r", offset)); + value = 0; + break; + } + + pthread_mutex_unlock(&sc->vsc_mtx); + + return (value); +} + +struct pci_devemu pci_de_vnet = { + .pe_emu = "virtio-net", + .pe_init = pci_vtnet_init, + .pe_iow = pci_vtnet_write, + .pe_ior = pci_vtnet_read, +}; +PCI_EMUL_SET(pci_de_vnet); diff --git a/usr.sbin/bhyve/pit_8254.c b/usr.sbin/bhyve/pit_8254.c new file mode 100644 index 000000000000..b5101616b035 --- /dev/null +++ b/usr.sbin/bhyve/pit_8254.c @@ -0,0 +1,196 @@ +/*- + * Copyright (c) 2011 NetApp, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#include +__FBSDID("$FreeBSD$"); + +#include + +#include + +#include +#include + +#include "fbsdrun.h" +#include "inout.h" +#include "pit_8254.h" + +#define TIMER_SEL_MASK 0xc0 +#define TIMER_RW_MASK 0x30 +#define TIMER_MODE_MASK 0x0f +#define TIMER_SEL_READBACK 0xc0 + +#define TIMER_DIV(freq, hz) (((freq) + (hz) / 2) / (hz)) + +#define PIT_8254_FREQ 1193182 +static const int nsecs_per_tick = 1000000000 / PIT_8254_FREQ; + +struct counter { + struct timeval tv; /* uptime when counter was loaded */ + uint16_t initial; /* initial counter value */ + uint8_t cr[2]; + uint8_t ol[2]; + int crbyte; + int olbyte; +}; + +static void +timevalfix(struct timeval *t1) +{ + + if (t1->tv_usec < 0) { + t1->tv_sec--; + t1->tv_usec += 1000000; + } + if (t1->tv_usec >= 1000000) { + t1->tv_sec++; + t1->tv_usec -= 1000000; + } +} + +static void +timevalsub(struct timeval *t1, const struct timeval *t2) +{ + + t1->tv_sec -= t2->tv_sec; + t1->tv_usec -= t2->tv_usec; + timevalfix(t1); +} + +static void +latch(struct counter *c) +{ + struct timeval tv2; + uint16_t lval; + uint64_t delta_nsecs, delta_ticks; + + /* cannot latch a new value until the old one has been consumed */ + if (c->olbyte != 0) + return; + + if (c->initial == 0 || c->initial == 1) { + /* + * XXX the program that runs the VM can be stopped and + * restarted at any time. This means that state that was + * created by the guest is destroyed between invocations + * of the program. + * + * If the counter's initial value is not programmed we + * assume a value that would be set to generate 'guest_hz' + * interrupts per second. + */ + c->initial = TIMER_DIV(PIT_8254_FREQ, guest_hz); + gettimeofday(&c->tv, NULL); + } + + (void)gettimeofday(&tv2, NULL); + timevalsub(&tv2, &c->tv); + delta_nsecs = tv2.tv_sec * 1000000000 + tv2.tv_usec * 1000; + delta_ticks = delta_nsecs / nsecs_per_tick; + + lval = c->initial - delta_ticks % c->initial; + c->olbyte = 2; + c->ol[1] = lval; /* LSB */ + c->ol[0] = lval >> 8; /* MSB */ +} + +static int +pit_8254_handler(struct vmctx *ctx, int vcpu, int in, int port, int bytes, + uint32_t *eax, void *arg) +{ + int sel, rw, mode; + uint8_t val; + struct counter *c; + + static struct counter counter[3]; + + if (bytes != 1) + return (-1); + + val = *eax; + + if (port == TIMER_MODE) { + assert(in == 0); + sel = val & TIMER_SEL_MASK; + rw = val & TIMER_RW_MASK; + mode = val & TIMER_MODE_MASK; + + if (sel == TIMER_SEL_READBACK) + return (-1); + if (rw != TIMER_LATCH && rw != TIMER_16BIT) + return (-1); + + if (rw != TIMER_LATCH) { + /* + * Counter mode is not affected when issuing a + * latch command. + */ + if (mode != TIMER_RATEGEN && mode != TIMER_SQWAVE) + return (-1); + } + + c = &counter[sel >> 6]; + if (rw == TIMER_LATCH) + latch(c); + else + c->olbyte = 0; /* reset latch after reprogramming */ + + return (0); + } + + /* counter ports */ + assert(port >= TIMER_CNTR0 && port <= TIMER_CNTR2); + c = &counter[port - TIMER_CNTR0]; + + if (in) { + /* + * XXX + * The spec says that once the output latch is completely + * read it should revert to "following" the counter. We don't + * do this because it is hard and any reasonable OS should + * always latch the counter before trying to read it. + */ + if (c->olbyte == 0) + c->olbyte = 2; + *eax = c->ol[--c->olbyte]; + } else { + c->cr[c->crbyte++] = *eax; + if (c->crbyte == 2) { + c->crbyte = 0; + c->initial = c->cr[0] | (uint16_t)c->cr[1] << 8; + gettimeofday(&c->tv, NULL); + } + } + + return (0); +} + +INOUT_PORT(8254, TIMER_MODE, IOPORT_F_OUT, pit_8254_handler); +INOUT_PORT(8254, TIMER_CNTR0, IOPORT_F_INOUT, pit_8254_handler); +INOUT_PORT(8254, TIMER_CNTR1, IOPORT_F_INOUT, pit_8254_handler); +INOUT_PORT(8254, TIMER_CNTR2, IOPORT_F_INOUT, pit_8254_handler); diff --git a/usr.sbin/bhyve/pit_8254.h b/usr.sbin/bhyve/pit_8254.h new file mode 100644 index 000000000000..61bd15d13b1c --- /dev/null +++ b/usr.sbin/bhyve/pit_8254.h @@ -0,0 +1,45 @@ +/*- + * Copyright (c) 2011 NetApp, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#ifndef _PIT_8254_H_ +#define _PIT_8254_H_ + +/* + * Borrowed from amd64/include/timerreg.h because in that file it is + * conditionally compiled for #ifdef _KERNEL only. + */ + +#include + +#define IO_TIMER1 0x40 /* 8253 Timer #1 */ +#define TIMER_CNTR0 (IO_TIMER1 + TIMER_REG_CNTR0) +#define TIMER_CNTR1 (IO_TIMER1 + TIMER_REG_CNTR1) +#define TIMER_CNTR2 (IO_TIMER1 + TIMER_REG_CNTR2) +#define TIMER_MODE (IO_TIMER1 + TIMER_REG_MODE) + +#endif /* _PIT_8254_H_ */ diff --git a/usr.sbin/bhyve/post.c b/usr.sbin/bhyve/post.c new file mode 100644 index 000000000000..092a551d87b3 --- /dev/null +++ b/usr.sbin/bhyve/post.c @@ -0,0 +1,51 @@ +/*- + * Copyright (c) 2011 NetApp, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#include +__FBSDID("$FreeBSD$"); + +#include + +#include + +#include "inout.h" + +static int +post_data_handler(struct vmctx *ctx, int vcpu, int in, int port, int bytes, + uint32_t *eax, void *arg) +{ + assert(in == 1); + + if (bytes != 1) + return (-1); + + *eax = 0xff; /* return some garbage */ + return (0); +} + +INOUT_PORT(post, 0x84, IOPORT_F_IN, post_data_handler); diff --git a/usr.sbin/bhyve/rtc.c b/usr.sbin/bhyve/rtc.c new file mode 100644 index 000000000000..a6f44e00bcd8 --- /dev/null +++ b/usr.sbin/bhyve/rtc.c @@ -0,0 +1,268 @@ +/*- + * Copyright (c) 2011 NetApp, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#include +__FBSDID("$FreeBSD$"); + +#include +#include + +#include +#include +#include + +#include "inout.h" + +#define IO_RTC 0x70 + +#define RTC_SEC 0x00 /* seconds */ +#define RTC_MIN 0x02 +#define RTC_HRS 0x04 +#define RTC_WDAY 0x06 +#define RTC_DAY 0x07 +#define RTC_MONTH 0x08 +#define RTC_YEAR 0x09 +#define RTC_CENTURY 0x32 /* current century */ + +#define RTC_STATUSA 0xA +#define RTCSA_TUP 0x80 /* time update, don't look now */ + +#define RTC_STATUSB 0xB +#define RTCSB_DST 0x01 +#define RTCSB_24HR 0x02 +#define RTCSB_BIN 0x04 /* 0 = BCD, 1 = Binary */ +#define RTCSB_PINTR 0x40 /* 1 = enable periodic clock interrupt */ +#define RTCSB_HALT 0x80 /* stop clock updates */ + +#define RTC_INTR 0x0c /* status register C (R) interrupt source */ + +#define RTC_STATUSD 0x0d /* status register D (R) Lost Power */ +#define RTCSD_PWR 0x80 /* clock power OK */ + +#define RTC_DIAG 0x0e + +#define RTC_RSTCODE 0x0f + +static int addr; + +/* XXX initialize these to default values as they would be from BIOS */ +static uint8_t status_a, status_b, rstcode; + +static u_char const bin2bcd_data[] = { + 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, + 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, + 0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, 0x28, 0x29, + 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, 0x38, 0x39, + 0x40, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, 0x48, 0x49, + 0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, 0x58, 0x59, + 0x60, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, + 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, 0x78, 0x79, + 0x80, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87, 0x88, 0x89, + 0x90, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97, 0x98, 0x99 +}; +#define bin2bcd(bin) (bin2bcd_data[bin]) + +#define rtcout(val) ((status_b & RTCSB_BIN) ? (val) : bin2bcd((val))) + +static void +timevalfix(struct timeval *t1) +{ + + if (t1->tv_usec < 0) { + t1->tv_sec--; + t1->tv_usec += 1000000; + } + if (t1->tv_usec >= 1000000) { + t1->tv_sec++; + t1->tv_usec -= 1000000; + } +} + +static void +timevalsub(struct timeval *t1, const struct timeval *t2) +{ + + t1->tv_sec -= t2->tv_sec; + t1->tv_usec -= t2->tv_usec; + timevalfix(t1); +} + +static int +rtc_addr_handler(struct vmctx *ctx, int vcpu, int in, int port, int bytes, + uint32_t *eax, void *arg) +{ + assert(in == 0); + + if (bytes != 1) + return (-1); + + switch (*eax) { + case RTC_SEC: + case RTC_MIN: + case RTC_HRS: + case RTC_WDAY: + case RTC_DAY: + case RTC_MONTH: + case RTC_YEAR: + case RTC_CENTURY: + case RTC_STATUSA: + case RTC_STATUSB: + case RTC_INTR: + case RTC_STATUSD: + case RTC_DIAG: + case RTC_RSTCODE: + break; + default: + return (-1); + } + + addr = *eax; + return (0); +} + +static int +rtc_data_handler(struct vmctx *ctx, int vcpu, int in, int port, int bytes, + uint32_t *eax, void *arg) +{ + int hour; + time_t t; + struct timeval cur, delta; + + static struct timeval last; + static struct tm tm; + + if (bytes != 1) + return (-1); + + gettimeofday(&cur, NULL); + + /* + * Increment the cached time only once per second so we can guarantee + * that the guest has at least one second to read the hour:min:sec + * separately and still get a coherent view of the time. + */ + delta = cur; + timevalsub(&delta, &last); + if (delta.tv_sec >= 1 && (status_b & RTCSB_HALT) == 0) { + t = cur.tv_sec; + localtime_r(&t, &tm); + last = cur; + } + + if (in) { + switch (addr) { + case RTC_SEC: + *eax = rtcout(tm.tm_sec); + return (0); + case RTC_MIN: + *eax = rtcout(tm.tm_min); + return (0); + case RTC_HRS: + if (status_b & RTCSB_24HR) + hour = tm.tm_hour; + else + hour = (tm.tm_hour % 12) + 1; + + *eax = rtcout(hour); + + /* + * If we are representing time in the 12-hour format + * then set the MSB to indicate PM. + */ + if ((status_b & RTCSB_24HR) == 0 && tm.tm_hour >= 12) + *eax |= 0x80; + + return (0); + case RTC_WDAY: + *eax = rtcout(tm.tm_wday + 1); + return (0); + case RTC_DAY: + *eax = rtcout(tm.tm_mday); + return (0); + case RTC_MONTH: + *eax = rtcout(tm.tm_mon + 1); + return (0); + case RTC_YEAR: + *eax = rtcout(tm.tm_year % 100); + return (0); + case RTC_CENTURY: + *eax = rtcout(tm.tm_year / 100); + break; + case RTC_STATUSA: + *eax = status_a; + return (0); + case RTC_INTR: + *eax = 0; + return (0); + case RTC_STATUSD: + *eax = RTCSD_PWR; + return (0); + case RTC_DIAG: + *eax = 0; + return (0); + case RTC_RSTCODE: + *eax = rstcode; + return (0); + default: + return (-1); + } + } + + switch (addr) { + case RTC_STATUSA: + status_a = *eax & ~RTCSA_TUP; + break; + case RTC_STATUSB: + /* XXX not implemented yet XXX */ + if (*eax & RTCSB_PINTR) + return (-1); + status_b = *eax; + break; + case RTC_RSTCODE: + rstcode = *eax; + break; + case RTC_SEC: + case RTC_MIN: + case RTC_HRS: + case RTC_WDAY: + case RTC_DAY: + case RTC_MONTH: + case RTC_YEAR: + case RTC_CENTURY: + /* + * Ignore writes to the time of day registers + */ + break; + default: + return (-1); + } + return (0); +} + +INOUT_PORT(rtc, IO_RTC, IOPORT_F_OUT, rtc_addr_handler); +INOUT_PORT(rtc, IO_RTC + 1, IOPORT_F_INOUT, rtc_data_handler); diff --git a/usr.sbin/bhyve/uart.c b/usr.sbin/bhyve/uart.c new file mode 100644 index 000000000000..640f3bf0f70d --- /dev/null +++ b/usr.sbin/bhyve/uart.c @@ -0,0 +1,60 @@ +/*- + * Copyright (c) 2011 NetApp, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#include +__FBSDID("$FreeBSD$"); + +#include +#include + +#include "inout.h" + +#define COM1 0x3F8 +#define COM2 0x2F8 + +#define REG_IIR 2 + +static int +com_handler(struct vmctx *ctx, int vcpu, int in, int port, int bytes, + uint32_t *eax, void *arg) +{ + assert(in); + + if (bytes != 1) + return (-1); + + /* + * COM port is not implemented so we return 0xFF for all registers + */ + *eax = 0xFF; + + return (0); +} + +INOUT_PORT(uart, COM1 + REG_IIR, IOPORT_F_IN, com_handler); +INOUT_PORT(uart, COM2 + REG_IIR, IOPORT_F_IN, com_handler); diff --git a/usr.sbin/bhyve/virtio.h b/usr.sbin/bhyve/virtio.h new file mode 100644 index 000000000000..474e244c5965 --- /dev/null +++ b/usr.sbin/bhyve/virtio.h @@ -0,0 +1,85 @@ +/*- + * Copyright (c) 2011 NetApp, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#ifndef _VIRTIO_H_ +#define _VIRTIO_H_ + +#define VRING_ALIGN 4096 + +#define VRING_DESC_F_NEXT (1 << 0) +#define VRING_DESC_F_WRITE (1 << 1) +#define VRING_DESC_F_INDIRECT (1 << 2) + +#define VRING_AVAIL_F_NO_INTERRUPT 1 + +struct virtio_desc { + uint64_t vd_addr; + uint32_t vd_len; + uint16_t vd_flags; + uint16_t vd_next; +} __packed; + +struct virtio_used { + uint32_t vu_idx; + uint32_t vu_tlen; +} __packed; + +/* + * PFN register shift amount + */ +#define VRING_PFN 12 + +/* + * Virtio device types + */ +#define VIRTIO_TYPE_NET 1 +#define VIRTIO_TYPE_BLOCK 2 + +/* + * PCI vendor/device IDs + */ +#define VIRTIO_VENDOR 0x1AF4 +#define VIRTIO_DEV_NET 0x1000 +#define VIRTIO_DEV_BLOCK 0x1001 + +/* + * PCI config space constants + */ +#define VTCFG_R_HOSTCAP 0 +#define VTCFG_R_GUESTCAP 4 +#define VTCFG_R_PFN 8 +#define VTCFG_R_QNUM 12 +#define VTCFG_R_QSEL 14 +#define VTCFG_R_QNOTIFY 16 +#define VTCFG_R_STATUS 18 +#define VTCFG_R_ISR 19 +#define VTCFG_R_CFG0 20 /* No MSI-X */ +#define VTCFG_R_CFG1 24 /* With MSI-X */ +#define VTCFG_R_MSIX 20 + +#endif /* _VIRTIO_H_ */ diff --git a/usr.sbin/bhyve/xmsr.c b/usr.sbin/bhyve/xmsr.c new file mode 100644 index 000000000000..676b5df84d6b --- /dev/null +++ b/usr.sbin/bhyve/xmsr.c @@ -0,0 +1,261 @@ +/*- + * Copyright (c) 2011 NetApp, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#include +__FBSDID("$FreeBSD$"); + +#include +#include + +#include +#include +#include +#include +#include + +#include "fbsdrun.h" +#include "xmsr.h" + +/* + * Trampoline for hypervisor direct 64-bit jump. + * + * 0 - signature for guest->host verification + * 8 - kernel virtual address of trampoline + * 16 - instruction virtual address + * 24 - stack pointer virtual address + * 32 - CR3, physical address of kernel page table + * 40 - 24-byte area for null/code/data GDT entries + */ +#define MP_V64T_SIG 0xcafebabecafebabeULL +struct mp_v64tramp { + uint64_t mt_sig; + uint64_t mt_virt; + uint64_t mt_eip; + uint64_t mt_rsp; + uint64_t mt_cr3; + uint64_t mt_gdtr[3]; +}; + +/* + * CPU 0 is considered to be the BSP and is set to the RUNNING state. + * All other CPUs are set up in the INIT state. + */ +#define BSP 0 +enum cpu_bstate { + CPU_S_INIT, + CPU_S_SIPI, + CPU_S_RUNNING +} static cpu_b[VM_MAXCPU] = { [BSP] = CPU_S_RUNNING }; + +static void spinup_ap(struct vmctx *, int, int, uint64_t *); +static void spinup_ap_direct64(struct vmctx *, int, uintptr_t, uint64_t *); + +int +emulate_wrmsr(struct vmctx *ctx, int vcpu, uint32_t code, uint64_t val) +{ + int dest; + int mode; + int thiscpu; + int vec; + int error, retval; + uint64_t rip; + + retval = vcpu; + thiscpu = 1 << vcpu; + + /* + * The only MSR value handled is the x2apic CR register + */ + if (code != 0x830) { + printf("Unknown WRMSR code %x, val %lx, cpu %d\n", + code, val, vcpu); + exit(1); + } + + /* + * The value written to the MSR will generate an IPI to + * a set of CPUs. If this is a SIPI, create the initial + * state for the CPU and switch to it. Otherwise, inject + * an interrupt for the destination CPU(s), and request + * a switch to the next available one by returning -1 + */ + dest = val >> 32; + vec = val & APIC_VECTOR_MASK; + mode = val & APIC_DELMODE_MASK; + + switch (mode) { + case APIC_DELMODE_INIT: + assert(dest != 0); + assert(dest < guest_ncpus); + + /* + * Ignore legacy de-assert INITs in x2apic mode + */ + if ((val & APIC_LEVEL_MASK) == APIC_LEVEL_DEASSERT) { + break; + } + assert(cpu_b[dest] == CPU_S_INIT); + + /* + * Move CPU to wait-for-SIPI state + */ + error = vcpu_reset(ctx, dest); + assert(error == 0); + + cpu_b[dest] = CPU_S_SIPI; + break; + + case APIC_DELMODE_STARTUP: + assert(dest != 0); + assert(dest < guest_ncpus); + /* + * Ignore SIPIs in any state other than wait-for-SIPI + */ + if (cpu_b[dest] != CPU_S_SIPI) { + break; + } + + /* + * Bring up the AP and signal the main loop that it is + * available and to switch to it. + */ + spinup_ap(ctx, dest, vec, &rip); + cpu_b[dest] = CPU_S_RUNNING; + fbsdrun_addcpu(ctx, dest, rip); + retval = dest; + break; + + default: + printf("APIC delivery mode %lx not supported!\n", + val & APIC_DELMODE_MASK); + exit(1); + } + + return (retval); +} + +/* + * There are 2 startup modes possible here: + * - if the CPU supports 'unrestricted guest' mode, the spinup can + * set up the processor state in power-on 16-bit mode, with the CS:IP + * init'd to the specified low-mem 4K page. + * - if the guest has requested a 64-bit trampoline in the low-mem 4K + * page by placing in the specified signature, set up the register + * state using register state in the signature. Note that this + * requires accessing guest physical memory to read the signature + * while 'unrestricted mode' does not. + */ +static void +spinup_ap(struct vmctx *ctx, int newcpu, int vector, uint64_t *rip) +{ + int error; + uint16_t cs; + uint64_t desc_base; + uint32_t desc_limit, desc_access; + + if (fbsdrun_vmexit_on_hlt()) { + error = vm_set_capability(ctx, newcpu, VM_CAP_HALT_EXIT, 1); + assert(error == 0); + } + + if (fbsdrun_vmexit_on_pause()) { + error = vm_set_capability(ctx, newcpu, VM_CAP_PAUSE_EXIT, 1); + assert(error == 0); + } + + error = vm_set_capability(ctx, newcpu, VM_CAP_UNRESTRICTED_GUEST, 1); + if (error) { + /* + * If the guest does not support real-mode execution then + * we will bring up the AP directly in 64-bit mode. + */ + spinup_ap_direct64(ctx, newcpu, vector << PAGE_SHIFT, rip); + } else { + /* + * Update the %cs and %rip of the guest so that it starts + * executing real mode code at at 'vector << 12'. + */ + *rip = 0; + error = vm_set_register(ctx, newcpu, VM_REG_GUEST_RIP, *rip); + assert(error == 0); + + error = vm_get_desc(ctx, newcpu, VM_REG_GUEST_CS, &desc_base, + &desc_limit, &desc_access); + assert(error == 0); + + desc_base = vector << PAGE_SHIFT; + error = vm_set_desc(ctx, newcpu, VM_REG_GUEST_CS, + desc_base, desc_limit, desc_access); + assert(error == 0); + + cs = (vector << PAGE_SHIFT) >> 4; + error = vm_set_register(ctx, newcpu, VM_REG_GUEST_CS, cs); + assert(error == 0); + } +} + +static void +spinup_ap_direct64(struct vmctx *ctx, int newcpu, uintptr_t gaddr, + uint64_t *rip) +{ + struct mp_v64tramp *mvt; + char *errstr; + int error; + uint64_t gdtbase; + + mvt = paddr_guest2host(gaddr); + + assert(mvt->mt_sig == MP_V64T_SIG); + + /* + * Set up the 3-entry GDT using memory supplied in the + * guest's trampoline structure. + */ + vm_setup_freebsd_gdt(mvt->mt_gdtr); + +#define CHECK_ERROR(msg) \ + if (error != 0) { \ + errstr = msg; \ + goto err_exit; \ + } + + /* entry point */ + *rip = mvt->mt_eip; + + /* Get the guest virtual address of the GDT */ + gdtbase = mvt->mt_virt + __offsetof(struct mp_v64tramp, mt_gdtr); + + error = vm_setup_freebsd_registers(ctx, newcpu, mvt->mt_eip, + mvt->mt_cr3, gdtbase, mvt->mt_rsp); + CHECK_ERROR("vm_setup_freebsd_registers"); + + return; +err_exit: + printf("spinup_ap_direct64: machine state error: %s", errstr); + exit(1); +} diff --git a/usr.sbin/bhyve/xmsr.h b/usr.sbin/bhyve/xmsr.h new file mode 100644 index 000000000000..8cebcea0cd55 --- /dev/null +++ b/usr.sbin/bhyve/xmsr.h @@ -0,0 +1,34 @@ +/*- + * Copyright (c) 2011 NetApp, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#ifndef _XMSR_H_ +#define _XMSR_H_ + +int emulate_wrmsr(struct vmctx *ctx, int vcpu, uint32_t code, uint64_t val); + +#endif diff --git a/usr.sbin/vmmctl/Makefile b/usr.sbin/vmmctl/Makefile new file mode 100644 index 000000000000..a1654dfb999b --- /dev/null +++ b/usr.sbin/vmmctl/Makefile @@ -0,0 +1,17 @@ +# +# $FreeBSD$ +# + +PROG= vmmctl +SRCS= vmmctl.c + +NO_MAN= + +DPADD= ${LIBVMMAPI} +LDADD= -lvmmapi + +WARNS?= 3 + +CFLAGS+= -I${.CURDIR}/../../sys/amd64/vmm + +.include diff --git a/usr.sbin/vmmctl/sample.sh b/usr.sbin/vmmctl/sample.sh new file mode 100755 index 000000000000..f38d0dadb882 --- /dev/null +++ b/usr.sbin/vmmctl/sample.sh @@ -0,0 +1,75 @@ +#!/bin/sh + +# $FreeBSD$ + +VMMCTL="sudo ./vmmctl" +VMNAME=sample + +${VMMCTL} --vm=${VMNAME} --create +${VMMCTL} --vm=${VMNAME} --set-lowmem=128 --set-highmem=256 +${VMMCTL} --vm=${VMNAME} --get-lowmem --get-highmem + +CR0_PE=$((1 << 0)) +CR0_PG=$((1 << 31)) +CR0=$(($CR0_PE | $CR0_PG)) +${VMMCTL} --vm=${VMNAME} --set-cr0=${CR0} --get-cr0 + +# XXX this is bogus the value of %cr3 should come from the loader +CR3=0 +${VMMCTL} --vm=${VMNAME} --set-cr3=${CR3} --get-cr3 + +CR4_PAE=$((1 << 5)) +CR4=$((${CR4_PAE})) +${VMMCTL} --vm=${VMNAME} --set-cr4=${CR4} --get-cr4 + +DR7=0x00000400 # Table 9-1 from Intel Architecture Manual 3A +${VMMCTL} --vm=${VMNAME} --set-dr7=${DR7} --get-dr7 + +# +# XXX the values of rsp and rip are bogus and should come from the loader. +# +RSP=0xa5a5a5a5 +RIP=0x0000bfbfbfbf0000 +RFLAGS=0x2 +${VMMCTL} --vm=${VMNAME} --set-rsp=${RSP} --get-rsp +${VMMCTL} --vm=${VMNAME} --set-rip=${RIP} --get-rip +${VMMCTL} --vm=${VMNAME} --set-rflags=${RFLAGS} --get-rflags + +# Set "hidden" state of %cs descriptor to indicate long mode code segment. +# +# Note that this should match the contents of the entry pointed to by the +# segment selector in the GDTR. +# +${VMMCTL} --vm=${VMNAME} --set-desc-cs --desc-access=0x00002098 --get-desc-cs + +# Set "hidden" state of all data descriptors to indicate a usable segment. +# The only useful fields are the "Present" and "Descriptor Type" bits. +${VMMCTL} --vm=${VMNAME} --set-desc-ds --desc-access=0x00000090 --get-desc-ds +${VMMCTL} --vm=${VMNAME} --set-desc-es --desc-access=0x00000090 --get-desc-es +${VMMCTL} --vm=${VMNAME} --set-desc-fs --desc-access=0x00000090 --get-desc-fs +${VMMCTL} --vm=${VMNAME} --set-desc-gs --desc-access=0x00000090 --get-desc-gs +${VMMCTL} --vm=${VMNAME} --set-desc-ss --desc-access=0x00000090 --get-desc-ss + +# +# Set the code segment selector to point to entry at offset 8 in the GDTR. +# +${VMMCTL} --vm=${VMNAME} --set-cs=0x0008 --get-cs + +# Set all the remaining data segment selectors to point to entry at offset +# 16 in the GDTR. +${VMMCTL} --vm=${VMNAME} --set-ds=0x0010 --get-ds +${VMMCTL} --vm=${VMNAME} --set-es=0x0010 --get-es +${VMMCTL} --vm=${VMNAME} --set-fs=0x0010 --get-fs +${VMMCTL} --vm=${VMNAME} --set-gs=0x0010 --get-gs +${VMMCTL} --vm=${VMNAME} --set-ss=0x0010 --get-ss + +# XXX the value of the GDTR should come from the loader. +# Set the GDTR +GDTR_BASE=0xffff0000 +GDTR_LIMIT=0x10 +${VMMCTL} --vm=${VMNAME} --set-desc-gdtr --desc-base=${GDTR_BASE} --desc-limit=${GDTR_LIMIT} --get-desc-gdtr + +${VMMCTL} --vm=${VMNAME} --set-pinning=0 --get-pinning +${VMMCTL} --vm=${VMNAME} --set-pinning=-1 --get-pinning + +${VMMCTL} --vm=${VMNAME} --destroy diff --git a/usr.sbin/vmmctl/vmmctl.c b/usr.sbin/vmmctl/vmmctl.c new file mode 100644 index 000000000000..346cde15a366 --- /dev/null +++ b/usr.sbin/vmmctl/vmmctl.c @@ -0,0 +1,1485 @@ +/*- + * Copyright (c) 2011 NetApp, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#include +__FBSDID("$FreeBSD$"); + +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#include "intel/vmcs.h" + +#define MB (1UL << 20) +#define GB (1UL << 30) + +#define REQ_ARG required_argument +#define NO_ARG no_argument +#define OPT_ARG optional_argument + +static const char *progname; + +static void +usage(void) +{ + + (void)fprintf(stderr, + "Usage: %s --vm=\n" + " [--cpu=]\n" + " [--create]\n" + " [--destroy]\n" + " [--get-stats]\n" + " [--set-desc-ds]\n" + " [--get-desc-ds]\n" + " [--set-desc-es]\n" + " [--get-desc-es]\n" + " [--set-desc-gs]\n" + " [--get-desc-gs]\n" + " [--set-desc-fs]\n" + " [--get-desc-fs]\n" + " [--set-desc-cs]\n" + " [--get-desc-cs]\n" + " [--set-desc-ss]\n" + " [--get-desc-ss]\n" + " [--set-desc-tr]\n" + " [--get-desc-tr]\n" + " [--set-desc-ldtr]\n" + " [--get-desc-ldtr]\n" + " [--set-desc-gdtr]\n" + " [--get-desc-gdtr]\n" + " [--set-desc-idtr]\n" + " [--get-desc-idtr]\n" + " [--run]\n" + " [--capname=]\n" + " [--getcap]\n" + " [--setcap=<0|1>]\n" + " [--desc-base=]\n" + " [--desc-limit=]\n" + " [--desc-access=]\n" + " [--set-cr0=]\n" + " [--get-cr0]\n" + " [--set-cr3=]\n" + " [--get-cr3]\n" + " [--set-cr4=]\n" + " [--get-cr4]\n" + " [--set-dr7=]\n" + " [--get-dr7]\n" + " [--set-rsp=]\n" + " [--get-rsp]\n" + " [--set-rip=]\n" + " [--get-rip]\n" + " [--get-rax]\n" + " [--set-rax=]\n" + " [--get-rbx]\n" + " [--get-rcx]\n" + " [--get-rdx]\n" + " [--get-rsi]\n" + " [--get-rdi]\n" + " [--get-rbp]\n" + " [--get-r8]\n" + " [--get-r9]\n" + " [--get-r10]\n" + " [--get-r11]\n" + " [--get-r12]\n" + " [--get-r13]\n" + " [--get-r14]\n" + " [--get-r15]\n" + " [--set-rflags=]\n" + " [--get-rflags]\n" + " [--set-cs]\n" + " [--get-cs]\n" + " [--set-ds]\n" + " [--get-ds]\n" + " [--set-es]\n" + " [--get-es]\n" + " [--set-fs]\n" + " [--get-fs]\n" + " [--set-gs]\n" + " [--get-gs]\n" + " [--set-ss]\n" + " [--get-ss]\n" + " [--get-tr]\n" + " [--get-ldtr]\n" + " [--get-vmcs-pinbased-ctls]\n" + " [--get-vmcs-procbased-ctls]\n" + " [--get-vmcs-procbased-ctls2]\n" + " [--get-vmcs-entry-interruption-info]\n" + " [--set-vmcs-entry-interruption-info=]\n" + " [--get-vmcs-eptp]\n" + " [--get-vmcs-guest-physical-address\n" + " [--get-vmcs-guest-linear-address\n" + " [--set-vmcs-exception-bitmap]\n" + " [--get-vmcs-exception-bitmap]\n" + " [--get-vmcs-io-bitmap-address]\n" + " [--get-vmcs-tsc-offset]\n" + " [--get-vmcs-guest-pat]\n" + " [--get-vmcs-host-pat]\n" + " [--get-vmcs-host-cr0]\n" + " [--get-vmcs-host-cr3]\n" + " [--get-vmcs-host-cr4]\n" + " [--get-vmcs-host-rip]\n" + " [--get-vmcs-host-rsp]\n" + " [--get-vmcs-cr0-mask]\n" + " [--get-vmcs-cr0-shadow]\n" + " [--get-vmcs-cr4-mask]\n" + " [--get-vmcs-cr4-shadow]\n" + " [--get-vmcs-cr3-targets]\n" + " [--get-vmcs-apic-access-address]\n" + " [--get-vmcs-virtual-apic-address]\n" + " [--get-vmcs-tpr-threshold]\n" + " [--get-vmcs-msr-bitmap]\n" + " [--get-vmcs-msr-bitmap-address]\n" + " [--get-vmcs-vpid]\n" + " [--get-vmcs-ple-gap]\n" + " [--get-vmcs-ple-window]\n" + " [--get-vmcs-instruction-error]\n" + " [--get-vmcs-exit-ctls]\n" + " [--get-vmcs-entry-ctls]\n" + " [--get-vmcs-guest-sysenter]\n" + " [--get-vmcs-link]\n" + " [--get-vmcs-exit-reason]\n" + " [--get-vmcs-exit-qualification]\n" + " [--get-vmcs-exit-interruption-info]\n" + " [--get-vmcs-exit-interruption-error]\n" + " [--get-vmcs-interruptibility]\n" + " [--set-pinning=]\n" + " [--get-pinning]\n" + " [--set-lowmem=]\n" + " [--get-lowmem]\n" + " [--set-highmem=]\n" + " [--get-highmem]\n", + progname); + exit(1); +} + +static int get_stats, getcap, setcap, capval; +static const char *capname; +static int create, destroy, get_lowmem, get_highmem; +static uint64_t lowmem, highmem; +static int set_cr0, get_cr0, set_cr3, get_cr3, set_cr4, get_cr4; +static int set_efer, get_efer; +static int set_dr7, get_dr7; +static int set_rsp, get_rsp, set_rip, get_rip, set_rflags, get_rflags; +static int set_rax, get_rax; +static int get_rbx, get_rcx, get_rdx, get_rsi, get_rdi, get_rbp; +static int get_r8, get_r9, get_r10, get_r11, get_r12, get_r13, get_r14, get_r15; +static int set_desc_ds, get_desc_ds; +static int set_desc_es, get_desc_es; +static int set_desc_fs, get_desc_fs; +static int set_desc_gs, get_desc_gs; +static int set_desc_cs, get_desc_cs; +static int set_desc_ss, get_desc_ss; +static int set_desc_gdtr, get_desc_gdtr; +static int set_desc_idtr, get_desc_idtr; +static int set_desc_tr, get_desc_tr; +static int set_desc_ldtr, get_desc_ldtr; +static int set_cs, set_ds, set_es, set_fs, set_gs, set_ss, set_tr, set_ldtr; +static int get_cs, get_ds, get_es, get_fs, get_gs, get_ss, get_tr, get_ldtr; +static int set_pinning, get_pinning, pincpu; +static int run; + +/* + * VMCS-specific fields + */ +static int get_pinbased_ctls, get_procbased_ctls, get_procbased_ctls2; +static int get_eptp, get_io_bitmap, get_tsc_offset; +static int get_vmcs_entry_interruption_info, set_vmcs_entry_interruption_info; +static int get_vmcs_interruptibility; +uint32_t vmcs_entry_interruption_info; +static int get_vmcs_gpa, get_vmcs_gla; +static int get_exception_bitmap, set_exception_bitmap, exception_bitmap; +static int get_cr0_mask, get_cr0_shadow; +static int get_cr4_mask, get_cr4_shadow; +static int get_cr3_targets; +static int get_apic_access_addr, get_virtual_apic_addr, get_tpr_threshold; +static int get_msr_bitmap, get_msr_bitmap_address; +static int get_vpid, get_ple_gap, get_ple_window; +static int get_inst_err, get_exit_ctls, get_entry_ctls; +static int get_host_cr0, get_host_cr3, get_host_cr4; +static int get_host_rip, get_host_rsp; +static int get_guest_pat, get_host_pat; +static int get_guest_sysenter, get_vmcs_link; +static int get_vmcs_exit_reason, get_vmcs_exit_qualification; +static int get_vmcs_exit_interruption_info, get_vmcs_exit_interruption_error; + +static uint64_t desc_base; +static uint32_t desc_limit, desc_access; + +static void +dump_vm_run_exitcode(struct vm_exit *vmexit, int vcpu) +{ + printf("vm exit[%d]\n", vcpu); + printf("\trip\t\t0x%016lx\n", vmexit->rip); + printf("\tinst_length\t%d\n", vmexit->inst_length); + switch (vmexit->exitcode) { + case VM_EXITCODE_INOUT: + printf("\treason\t\tINOUT\n"); + printf("\tdirection\t%s\n", vmexit->u.inout.in ? "IN" : "OUT"); + printf("\tbytes\t\t%d\n", vmexit->u.inout.bytes); + printf("\tflags\t\t%s%s\n", + vmexit->u.inout.string ? "STRING " : "", + vmexit->u.inout.rep ? "REP " : ""); + printf("\tport\t\t0x%04x\n", vmexit->u.inout.port); + printf("\teax\t\t0x%08x\n", vmexit->u.inout.eax); + break; + case VM_EXITCODE_VMX: + printf("\treason\t\tVMX\n"); + printf("\terror\t\t%d\n", vmexit->u.vmx.error); + printf("\texit_reason\t0x%08x (%u)\n", + vmexit->u.vmx.exit_reason, vmexit->u.vmx.exit_reason); + printf("\tqualification\t0x%016lx\n", + vmexit->u.vmx.exit_qualification); + break; + default: + printf("*** unknown vm run exitcode %d\n", vmexit->exitcode); + break; + } +} + +static int +dump_vmcs_msr_bitmap(int vcpu, u_long addr) +{ + int error, fd, byte, bit, readable, writeable; + u_int msr; + const char *bitmap; + + error = -1; + bitmap = MAP_FAILED; + + fd = open("/dev/mem", O_RDONLY, 0); + if (fd < 0) + goto done; + + bitmap = mmap(NULL, PAGE_SIZE, PROT_READ, 0, fd, addr); + if (bitmap == MAP_FAILED) + goto done; + + for (msr = 0; msr < 0x2000; msr++) { + byte = msr / 8; + bit = msr & 0x7; + + /* Look at MSRs in the range 0x00000000 to 0x00001FFF */ + readable = (bitmap[byte] & (1 << bit)) ? 0 : 1; + writeable = (bitmap[2048 + byte] & (1 << bit)) ? 0 : 1; + if (readable || writeable) { + printf("msr 0x%08x[%d]\t\t%c%c\n", msr, vcpu, + readable ? 'R' : '-', + writeable ? 'W' : '-'); + } + + /* Look at MSRs in the range 0xC0000000 to 0xC0001FFF */ + byte += 1024; + readable = (bitmap[byte] & (1 << bit)) ? 0 : 1; + writeable = (bitmap[2048 + byte] & (1 << bit)) ? 0 : 1; + if (readable || writeable) { + printf("msr 0x%08x[%d]\t\t%c%c\n", + 0xc0000000 + msr, vcpu, + readable ? 'R' : '-', + writeable ? 'W' : '-'); + } + } + + error = 0; +done: + if (bitmap != MAP_FAILED) + munmap((void *)bitmap, PAGE_SIZE); + if (fd >= 0) + close(fd); + return (error); +} + +static int +vm_get_vmcs_field(struct vmctx *ctx, int vcpu, int field, uint64_t *ret_val) +{ + + return (vm_get_register(ctx, vcpu, VMCS_IDENT(field), ret_val)); +} + +static int +vm_set_vmcs_field(struct vmctx *ctx, int vcpu, int field, uint64_t val) +{ + + return (vm_set_register(ctx, vcpu, VMCS_IDENT(field), val)); +} + +enum { + VMNAME = 1000, /* avoid collision with return values from getopt */ + VCPU, + SET_LOWMEM, + SET_HIGHMEM, + SET_EFER, + SET_CR0, + SET_CR3, + SET_CR4, + SET_DR7, + SET_RSP, + SET_RIP, + SET_RAX, + SET_RFLAGS, + DESC_BASE, + DESC_LIMIT, + DESC_ACCESS, + SET_CS, + SET_DS, + SET_ES, + SET_FS, + SET_GS, + SET_SS, + SET_TR, + SET_LDTR, + SET_PINNING, + SET_VMCS_EXCEPTION_BITMAP, + SET_VMCS_ENTRY_INTERRUPTION_INFO, + SET_CAP, + CAPNAME, +}; + +int +main(int argc, char *argv[]) +{ + char *vmname; + int error, ch, vcpu; + vm_paddr_t hpa; + size_t len; + struct vm_exit vmexit; + uint64_t ctl, eptp, bm, addr, u64; + struct vmctx *ctx; + + uint64_t cr0, cr3, cr4, dr7, rsp, rip, rflags, efer, pat; + uint64_t rax, rbx, rcx, rdx, rsi, rdi, rbp; + uint64_t r8, r9, r10, r11, r12, r13, r14, r15; + uint64_t cs, ds, es, fs, gs, ss, tr, ldtr; + + struct option opts[] = { + { "vm", REQ_ARG, 0, VMNAME }, + { "cpu", REQ_ARG, 0, VCPU }, + { "set-lowmem", REQ_ARG, 0, SET_LOWMEM }, + { "set-highmem",REQ_ARG, 0, SET_HIGHMEM }, + { "set-efer", REQ_ARG, 0, SET_EFER }, + { "set-cr0", REQ_ARG, 0, SET_CR0 }, + { "set-cr3", REQ_ARG, 0, SET_CR3 }, + { "set-cr4", REQ_ARG, 0, SET_CR4 }, + { "set-dr7", REQ_ARG, 0, SET_DR7 }, + { "set-rsp", REQ_ARG, 0, SET_RSP }, + { "set-rip", REQ_ARG, 0, SET_RIP }, + { "set-rax", REQ_ARG, 0, SET_RAX }, + { "set-rflags", REQ_ARG, 0, SET_RFLAGS }, + { "desc-base", REQ_ARG, 0, DESC_BASE }, + { "desc-limit", REQ_ARG, 0, DESC_LIMIT }, + { "desc-access",REQ_ARG, 0, DESC_ACCESS }, + { "set-cs", REQ_ARG, 0, SET_CS }, + { "set-ds", REQ_ARG, 0, SET_DS }, + { "set-es", REQ_ARG, 0, SET_ES }, + { "set-fs", REQ_ARG, 0, SET_FS }, + { "set-gs", REQ_ARG, 0, SET_GS }, + { "set-ss", REQ_ARG, 0, SET_SS }, + { "set-tr", REQ_ARG, 0, SET_TR }, + { "set-ldtr", REQ_ARG, 0, SET_LDTR }, + { "set-pinning",REQ_ARG, 0, SET_PINNING }, + { "set-vmcs-exception-bitmap", + REQ_ARG, 0, SET_VMCS_EXCEPTION_BITMAP }, + { "set-vmcs-entry-interruption-info", + REQ_ARG, 0, SET_VMCS_ENTRY_INTERRUPTION_INFO }, + { "capname", REQ_ARG, 0, CAPNAME }, + { "setcap", REQ_ARG, 0, SET_CAP }, + { "getcap", NO_ARG, &getcap, 1 }, + { "get-stats", NO_ARG, &get_stats, 1 }, + { "get-desc-ds",NO_ARG, &get_desc_ds, 1 }, + { "set-desc-ds",NO_ARG, &set_desc_ds, 1 }, + { "get-desc-es",NO_ARG, &get_desc_es, 1 }, + { "set-desc-es",NO_ARG, &set_desc_es, 1 }, + { "get-desc-ss",NO_ARG, &get_desc_ss, 1 }, + { "set-desc-ss",NO_ARG, &set_desc_ss, 1 }, + { "get-desc-cs",NO_ARG, &get_desc_cs, 1 }, + { "set-desc-cs",NO_ARG, &set_desc_cs, 1 }, + { "get-desc-fs",NO_ARG, &get_desc_fs, 1 }, + { "set-desc-fs",NO_ARG, &set_desc_fs, 1 }, + { "get-desc-gs",NO_ARG, &get_desc_gs, 1 }, + { "set-desc-gs",NO_ARG, &set_desc_gs, 1 }, + { "get-desc-tr",NO_ARG, &get_desc_tr, 1 }, + { "set-desc-tr",NO_ARG, &set_desc_tr, 1 }, + { "set-desc-ldtr", NO_ARG, &set_desc_ldtr, 1 }, + { "get-desc-ldtr", NO_ARG, &get_desc_ldtr, 1 }, + { "set-desc-gdtr", NO_ARG, &set_desc_gdtr, 1 }, + { "get-desc-gdtr", NO_ARG, &get_desc_gdtr, 1 }, + { "set-desc-idtr", NO_ARG, &set_desc_idtr, 1 }, + { "get-desc-idtr", NO_ARG, &get_desc_idtr, 1 }, + { "get-lowmem", NO_ARG, &get_lowmem, 1 }, + { "get-highmem",NO_ARG, &get_highmem, 1 }, + { "get-efer", NO_ARG, &get_efer, 1 }, + { "get-cr0", NO_ARG, &get_cr0, 1 }, + { "get-cr3", NO_ARG, &get_cr3, 1 }, + { "get-cr4", NO_ARG, &get_cr4, 1 }, + { "get-dr7", NO_ARG, &get_dr7, 1 }, + { "get-rsp", NO_ARG, &get_rsp, 1 }, + { "get-rip", NO_ARG, &get_rip, 1 }, + { "get-rax", NO_ARG, &get_rax, 1 }, + { "get-rbx", NO_ARG, &get_rbx, 1 }, + { "get-rcx", NO_ARG, &get_rcx, 1 }, + { "get-rdx", NO_ARG, &get_rdx, 1 }, + { "get-rsi", NO_ARG, &get_rsi, 1 }, + { "get-rdi", NO_ARG, &get_rdi, 1 }, + { "get-rbp", NO_ARG, &get_rbp, 1 }, + { "get-r8", NO_ARG, &get_r8, 1 }, + { "get-r9", NO_ARG, &get_r9, 1 }, + { "get-r10", NO_ARG, &get_r10, 1 }, + { "get-r11", NO_ARG, &get_r11, 1 }, + { "get-r12", NO_ARG, &get_r12, 1 }, + { "get-r13", NO_ARG, &get_r13, 1 }, + { "get-r14", NO_ARG, &get_r14, 1 }, + { "get-r15", NO_ARG, &get_r15, 1 }, + { "get-rflags", NO_ARG, &get_rflags, 1 }, + { "get-cs", NO_ARG, &get_cs, 1 }, + { "get-ds", NO_ARG, &get_ds, 1 }, + { "get-es", NO_ARG, &get_es, 1 }, + { "get-fs", NO_ARG, &get_fs, 1 }, + { "get-gs", NO_ARG, &get_gs, 1 }, + { "get-ss", NO_ARG, &get_ss, 1 }, + { "get-tr", NO_ARG, &get_tr, 1 }, + { "get-ldtr", NO_ARG, &get_ldtr, 1 }, + { "get-vmcs-pinbased-ctls", + NO_ARG, &get_pinbased_ctls, 1 }, + { "get-vmcs-procbased-ctls", + NO_ARG, &get_procbased_ctls, 1 }, + { "get-vmcs-procbased-ctls2", + NO_ARG, &get_procbased_ctls2, 1 }, + { "get-vmcs-guest-linear-address", + NO_ARG, &get_vmcs_gla, 1 }, + { "get-vmcs-guest-physical-address", + NO_ARG, &get_vmcs_gpa, 1 }, + { "get-vmcs-entry-interruption-info", + NO_ARG, &get_vmcs_entry_interruption_info, 1}, + { "get-vmcs-eptp", NO_ARG, &get_eptp, 1 }, + { "get-vmcs-exception-bitmap", + NO_ARG, &get_exception_bitmap, 1 }, + { "get-vmcs-io-bitmap-address", + NO_ARG, &get_io_bitmap, 1 }, + { "get-vmcs-tsc-offset", NO_ARG,&get_tsc_offset, 1 }, + { "get-vmcs-cr0-mask", NO_ARG, &get_cr0_mask, 1 }, + { "get-vmcs-cr0-shadow", NO_ARG,&get_cr0_shadow, 1 }, + { "get-vmcs-cr4-mask", NO_ARG, &get_cr4_mask, 1 }, + { "get-vmcs-cr4-shadow", NO_ARG,&get_cr4_shadow, 1 }, + { "get-vmcs-cr3-targets", NO_ARG, &get_cr3_targets, 1}, + { "get-vmcs-apic-access-address", + NO_ARG, &get_apic_access_addr, 1}, + { "get-vmcs-virtual-apic-address", + NO_ARG, &get_virtual_apic_addr, 1}, + { "get-vmcs-tpr-threshold", + NO_ARG, &get_tpr_threshold, 1 }, + { "get-vmcs-msr-bitmap", + NO_ARG, &get_msr_bitmap, 1 }, + { "get-vmcs-msr-bitmap-address", + NO_ARG, &get_msr_bitmap_address, 1 }, + { "get-vmcs-vpid", NO_ARG, &get_vpid, 1 }, + { "get-vmcs-ple-gap", NO_ARG, &get_ple_gap, 1 }, + { "get-vmcs-ple-window", NO_ARG,&get_ple_window,1 }, + { "get-vmcs-instruction-error", + NO_ARG, &get_inst_err, 1 }, + { "get-vmcs-exit-ctls", NO_ARG, &get_exit_ctls, 1 }, + { "get-vmcs-entry-ctls", + NO_ARG, &get_entry_ctls, 1 }, + { "get-vmcs-guest-pat", NO_ARG, &get_guest_pat, 1 }, + { "get-vmcs-host-pat", NO_ARG, &get_host_pat, 1 }, + { "get-vmcs-host-cr0", + NO_ARG, &get_host_cr0, 1 }, + { "get-vmcs-host-cr3", + NO_ARG, &get_host_cr3, 1 }, + { "get-vmcs-host-cr4", + NO_ARG, &get_host_cr4, 1 }, + { "get-vmcs-host-rip", + NO_ARG, &get_host_rip, 1 }, + { "get-vmcs-host-rsp", + NO_ARG, &get_host_rsp, 1 }, + { "get-vmcs-guest-sysenter", + NO_ARG, &get_guest_sysenter, 1 }, + { "get-vmcs-link", NO_ARG, &get_vmcs_link, 1 }, + { "get-vmcs-exit-reason", + NO_ARG, &get_vmcs_exit_reason, 1 }, + { "get-vmcs-exit-qualification", + NO_ARG, &get_vmcs_exit_qualification, 1 }, + { "get-vmcs-exit-interruption-info", + NO_ARG, &get_vmcs_exit_interruption_info, 1}, + { "get-vmcs-exit-interruption-error", + NO_ARG, &get_vmcs_exit_interruption_error, 1}, + { "get-vmcs-interruptibility", + NO_ARG, &get_vmcs_interruptibility, 1 }, + { "get-pinning",NO_ARG, &get_pinning, 1 }, + { "run", NO_ARG, &run, 1 }, + { "create", NO_ARG, &create, 1 }, + { "destroy", NO_ARG, &destroy, 1 }, + { NULL, 0, NULL, 0 } + }; + + vcpu = 0; + progname = basename(argv[0]); + + while ((ch = getopt_long(argc, argv, "", opts, NULL)) != -1) { + switch (ch) { + case 0: + break; + case VMNAME: + vmname = optarg; + break; + case VCPU: + vcpu = atoi(optarg); + break; + case SET_LOWMEM: + lowmem = atoi(optarg) * MB; + lowmem = roundup(lowmem, 2 * MB); + break; + case SET_HIGHMEM: + highmem = atoi(optarg) * MB; + highmem = roundup(highmem, 2 * MB); + break; + case SET_EFER: + efer = strtoul(optarg, NULL, 0); + set_efer = 1; + break; + case SET_CR0: + cr0 = strtoul(optarg, NULL, 0); + set_cr0 = 1; + break; + case SET_CR3: + cr3 = strtoul(optarg, NULL, 0); + set_cr3 = 1; + break; + case SET_CR4: + cr4 = strtoul(optarg, NULL, 0); + set_cr4 = 1; + break; + case SET_DR7: + dr7 = strtoul(optarg, NULL, 0); + set_dr7 = 1; + break; + case SET_RSP: + rsp = strtoul(optarg, NULL, 0); + set_rsp = 1; + break; + case SET_RIP: + rip = strtoul(optarg, NULL, 0); + set_rip = 1; + break; + case SET_RAX: + rax = strtoul(optarg, NULL, 0); + set_rax = 1; + break; + case SET_RFLAGS: + rflags = strtoul(optarg, NULL, 0); + set_rflags = 1; + break; + case DESC_BASE: + desc_base = strtoul(optarg, NULL, 0); + break; + case DESC_LIMIT: + desc_limit = strtoul(optarg, NULL, 0); + break; + case DESC_ACCESS: + desc_access = strtoul(optarg, NULL, 0); + break; + case SET_CS: + cs = strtoul(optarg, NULL, 0); + set_cs = 1; + break; + case SET_DS: + ds = strtoul(optarg, NULL, 0); + set_ds = 1; + break; + case SET_ES: + es = strtoul(optarg, NULL, 0); + set_es = 1; + break; + case SET_FS: + fs = strtoul(optarg, NULL, 0); + set_fs = 1; + break; + case SET_GS: + gs = strtoul(optarg, NULL, 0); + set_gs = 1; + break; + case SET_SS: + ss = strtoul(optarg, NULL, 0); + set_ss = 1; + break; + case SET_TR: + tr = strtoul(optarg, NULL, 0); + set_tr = 1; + break; + case SET_LDTR: + ldtr = strtoul(optarg, NULL, 0); + set_ldtr = 1; + break; + case SET_PINNING: + pincpu = strtol(optarg, NULL, 0); + set_pinning = 1; + break; + case SET_VMCS_EXCEPTION_BITMAP: + exception_bitmap = strtoul(optarg, NULL, 0); + set_exception_bitmap = 1; + break; + case SET_VMCS_ENTRY_INTERRUPTION_INFO: + vmcs_entry_interruption_info = strtoul(optarg, NULL, 0); + set_vmcs_entry_interruption_info = 1; + break; + case SET_CAP: + capval = strtoul(optarg, NULL, 0); + setcap = 1; + break; + case CAPNAME: + capname = optarg; + break; + default: + usage(); + } + } + argc -= optind; + argv += optind; + + if (vmname == NULL) + usage(); + + error = 0; + + if (!error && create) + error = vm_create(vmname); + + if (!error) { + ctx = vm_open(vmname); + if (ctx == NULL) + error = -1; + } + + if (!error && lowmem) + error = vm_setup_memory(ctx, 0, lowmem, NULL); + + if (!error && highmem) + error = vm_setup_memory(ctx, 4 * GB, highmem, NULL); + + if (!error && set_efer) + error = vm_set_register(ctx, vcpu, VM_REG_GUEST_EFER, efer); + + if (!error && set_cr0) + error = vm_set_register(ctx, vcpu, VM_REG_GUEST_CR0, cr0); + + if (!error && set_cr3) + error = vm_set_register(ctx, vcpu, VM_REG_GUEST_CR3, cr3); + + if (!error && set_cr4) + error = vm_set_register(ctx, vcpu, VM_REG_GUEST_CR4, cr4); + + if (!error && set_dr7) + error = vm_set_register(ctx, vcpu, VM_REG_GUEST_DR7, dr7); + + if (!error && set_rsp) + error = vm_set_register(ctx, vcpu, VM_REG_GUEST_RSP, rsp); + + if (!error && set_rip) + error = vm_set_register(ctx, vcpu, VM_REG_GUEST_RIP, rip); + + if (!error && set_rax) + error = vm_set_register(ctx, vcpu, VM_REG_GUEST_RAX, rax); + + if (!error && set_rflags) { + error = vm_set_register(ctx, vcpu, VM_REG_GUEST_RFLAGS, + rflags); + } + + if (!error && set_desc_ds) { + error = vm_set_desc(ctx, vcpu, VM_REG_GUEST_DS, + desc_base, desc_limit, desc_access); + } + + if (!error && set_desc_es) { + error = vm_set_desc(ctx, vcpu, VM_REG_GUEST_ES, + desc_base, desc_limit, desc_access); + } + + if (!error && set_desc_ss) { + error = vm_set_desc(ctx, vcpu, VM_REG_GUEST_SS, + desc_base, desc_limit, desc_access); + } + + if (!error && set_desc_cs) { + error = vm_set_desc(ctx, vcpu, VM_REG_GUEST_CS, + desc_base, desc_limit, desc_access); + } + + if (!error && set_desc_fs) { + error = vm_set_desc(ctx, vcpu, VM_REG_GUEST_FS, + desc_base, desc_limit, desc_access); + } + + if (!error && set_desc_gs) { + error = vm_set_desc(ctx, vcpu, VM_REG_GUEST_GS, + desc_base, desc_limit, desc_access); + } + + if (!error && set_desc_tr) { + error = vm_set_desc(ctx, vcpu, VM_REG_GUEST_TR, + desc_base, desc_limit, desc_access); + } + + if (!error && set_desc_ldtr) { + error = vm_set_desc(ctx, vcpu, VM_REG_GUEST_LDTR, + desc_base, desc_limit, desc_access); + } + + if (!error && set_desc_gdtr) { + error = vm_set_desc(ctx, vcpu, VM_REG_GUEST_GDTR, + desc_base, desc_limit, 0); + } + + if (!error && set_desc_idtr) { + error = vm_set_desc(ctx, vcpu, VM_REG_GUEST_IDTR, + desc_base, desc_limit, 0); + } + + if (!error && set_cs) + error = vm_set_register(ctx, vcpu, VM_REG_GUEST_CS, cs); + + if (!error && set_ds) + error = vm_set_register(ctx, vcpu, VM_REG_GUEST_DS, ds); + + if (!error && set_es) + error = vm_set_register(ctx, vcpu, VM_REG_GUEST_ES, es); + + if (!error && set_fs) + error = vm_set_register(ctx, vcpu, VM_REG_GUEST_FS, fs); + + if (!error && set_gs) + error = vm_set_register(ctx, vcpu, VM_REG_GUEST_GS, gs); + + if (!error && set_ss) + error = vm_set_register(ctx, vcpu, VM_REG_GUEST_SS, ss); + + if (!error && set_tr) + error = vm_set_register(ctx, vcpu, VM_REG_GUEST_TR, tr); + + if (!error && set_ldtr) + error = vm_set_register(ctx, vcpu, VM_REG_GUEST_LDTR, ldtr); + + if (!error && set_pinning) + error = vm_set_pinning(ctx, vcpu, pincpu); + + if (!error && set_exception_bitmap) { + error = vm_set_vmcs_field(ctx, vcpu, VMCS_EXCEPTION_BITMAP, + exception_bitmap); + } + + if (!error && set_vmcs_entry_interruption_info) { + error = vm_set_vmcs_field(ctx, vcpu, VMCS_ENTRY_INTR_INFO, + vmcs_entry_interruption_info); + } + + if (!error && get_lowmem) { + error = vm_get_memory_seg(ctx, 0, &hpa, &len); + if (error == 0) + printf("lowmem\t\t0x%016lx/%ld\n", hpa, len); + } + + if (!error && get_highmem) { + error = vm_get_memory_seg(ctx, 4 * GB, &hpa, &len); + if (error == 0) + printf("highmem\t\t0x%016lx/%ld\n", hpa, len); + } + + if (!error && get_efer) { + error = vm_get_register(ctx, vcpu, VM_REG_GUEST_EFER, &efer); + if (error == 0) + printf("efer[%d]\t\t0x%016lx\n", vcpu, efer); + } + + if (!error && get_cr0) { + error = vm_get_register(ctx, vcpu, VM_REG_GUEST_CR0, &cr0); + if (error == 0) + printf("cr0[%d]\t\t0x%016lx\n", vcpu, cr0); + } + + if (!error && get_cr3) { + error = vm_get_register(ctx, vcpu, VM_REG_GUEST_CR3, &cr3); + if (error == 0) + printf("cr3[%d]\t\t0x%016lx\n", vcpu, cr3); + } + + if (!error && get_cr4) { + error = vm_get_register(ctx, vcpu, VM_REG_GUEST_CR4, &cr4); + if (error == 0) + printf("cr4[%d]\t\t0x%016lx\n", vcpu, cr4); + } + + if (!error && get_dr7) { + error = vm_get_register(ctx, vcpu, VM_REG_GUEST_DR7, &dr7); + if (error == 0) + printf("dr7[%d]\t\t0x%016lx\n", vcpu, dr7); + } + + if (!error && get_rsp) { + error = vm_get_register(ctx, vcpu, VM_REG_GUEST_RSP, &rsp); + if (error == 0) + printf("rsp[%d]\t\t0x%016lx\n", vcpu, rsp); + } + + if (!error && get_rip) { + error = vm_get_register(ctx, vcpu, VM_REG_GUEST_RIP, &rip); + if (error == 0) + printf("rip[%d]\t\t0x%016lx\n", vcpu, rip); + } + + if (!error && get_rax) { + error = vm_get_register(ctx, vcpu, VM_REG_GUEST_RAX, &rax); + if (error == 0) + printf("rax[%d]\t\t0x%016lx\n", vcpu, rax); + } + + if (!error && get_rbx) { + error = vm_get_register(ctx, vcpu, VM_REG_GUEST_RBX, &rbx); + if (error == 0) + printf("rbx[%d]\t\t0x%016lx\n", vcpu, rbx); + } + + if (!error && get_rcx) { + error = vm_get_register(ctx, vcpu, VM_REG_GUEST_RCX, &rcx); + if (error == 0) + printf("rcx[%d]\t\t0x%016lx\n", vcpu, rcx); + } + + if (!error && get_rdx) { + error = vm_get_register(ctx, vcpu, VM_REG_GUEST_RDX, &rdx); + if (error == 0) + printf("rdx[%d]\t\t0x%016lx\n", vcpu, rdx); + } + + if (!error && get_rsi) { + error = vm_get_register(ctx, vcpu, VM_REG_GUEST_RSI, &rsi); + if (error == 0) + printf("rsi[%d]\t\t0x%016lx\n", vcpu, rsi); + } + + if (!error && get_rdi) { + error = vm_get_register(ctx, vcpu, VM_REG_GUEST_RDI, &rdi); + if (error == 0) + printf("rdi[%d]\t\t0x%016lx\n", vcpu, rdi); + } + + if (!error && get_rbp) { + error = vm_get_register(ctx, vcpu, VM_REG_GUEST_RBP, &rbp); + if (error == 0) + printf("rbp[%d]\t\t0x%016lx\n", vcpu, rbp); + } + + if (!error && get_r8) { + error = vm_get_register(ctx, vcpu, VM_REG_GUEST_R8, &r8); + if (error == 0) + printf("r8[%d]\t\t0x%016lx\n", vcpu, r8); + } + + if (!error && get_r9) { + error = vm_get_register(ctx, vcpu, VM_REG_GUEST_R9, &r9); + if (error == 0) + printf("r9[%d]\t\t0x%016lx\n", vcpu, r9); + } + + if (!error && get_r10) { + error = vm_get_register(ctx, vcpu, VM_REG_GUEST_R10, &r10); + if (error == 0) + printf("r10[%d]\t\t0x%016lx\n", vcpu, r10); + } + + if (!error && get_r11) { + error = vm_get_register(ctx, vcpu, VM_REG_GUEST_R11, &r11); + if (error == 0) + printf("r11[%d]\t\t0x%016lx\n", vcpu, r11); + } + + if (!error && get_r12) { + error = vm_get_register(ctx, vcpu, VM_REG_GUEST_R12, &r12); + if (error == 0) + printf("r12[%d]\t\t0x%016lx\n", vcpu, r12); + } + + if (!error && get_r13) { + error = vm_get_register(ctx, vcpu, VM_REG_GUEST_R13, &r13); + if (error == 0) + printf("r13[%d]\t\t0x%016lx\n", vcpu, r13); + } + + if (!error && get_r14) { + error = vm_get_register(ctx, vcpu, VM_REG_GUEST_R14, &r14); + if (error == 0) + printf("r14[%d]\t\t0x%016lx\n", vcpu, r14); + } + + if (!error && get_r15) { + error = vm_get_register(ctx, vcpu, VM_REG_GUEST_R15, &r15); + if (error == 0) + printf("r15[%d]\t\t0x%016lx\n", vcpu, r15); + } + + if (!error && get_rflags) { + error = vm_get_register(ctx, vcpu, VM_REG_GUEST_RFLAGS, + &rflags); + if (error == 0) + printf("rflags[%d]\t0x%016lx\n", vcpu, rflags); + } + + if (!error && get_stats) { + int i, num_stats; + uint64_t *stats; + struct timeval tv; + const char *desc; + + stats = vm_get_stats(ctx, vcpu, &tv, &num_stats); + if (stats != NULL) { + printf("vcpu%d\n", vcpu); + for (i = 0; i < num_stats; i++) { + desc = vm_get_stat_desc(ctx, i); + printf("%-32s\t%ld\n", desc, stats[i]); + } + } + } + + if (!error && get_desc_ds) { + error = vm_get_desc(ctx, vcpu, VM_REG_GUEST_DS, + &desc_base, &desc_limit, &desc_access); + if (error == 0) { + printf("ds desc[%d]\t0x%016lx/0x%08x/0x%08x\n", + vcpu, desc_base, desc_limit, desc_access); + } + } + + if (!error && get_desc_es) { + error = vm_get_desc(ctx, vcpu, VM_REG_GUEST_ES, + &desc_base, &desc_limit, &desc_access); + if (error == 0) { + printf("es desc[%d]\t0x%016lx/0x%08x/0x%08x\n", + vcpu, desc_base, desc_limit, desc_access); + } + } + + if (!error && get_desc_fs) { + error = vm_get_desc(ctx, vcpu, VM_REG_GUEST_FS, + &desc_base, &desc_limit, &desc_access); + if (error == 0) { + printf("fs desc[%d]\t0x%016lx/0x%08x/0x%08x\n", + vcpu, desc_base, desc_limit, desc_access); + } + } + + if (!error && get_desc_gs) { + error = vm_get_desc(ctx, vcpu, VM_REG_GUEST_GS, + &desc_base, &desc_limit, &desc_access); + if (error == 0) { + printf("gs desc[%d]\t0x%016lx/0x%08x/0x%08x\n", + vcpu, desc_base, desc_limit, desc_access); + } + } + + if (!error && get_desc_ss) { + error = vm_get_desc(ctx, vcpu, VM_REG_GUEST_SS, + &desc_base, &desc_limit, &desc_access); + if (error == 0) { + printf("ss desc[%d]\t0x%016lx/0x%08x/0x%08x\n", + vcpu, desc_base, desc_limit, desc_access); + } + } + + if (!error && get_desc_cs) { + error = vm_get_desc(ctx, vcpu, VM_REG_GUEST_CS, + &desc_base, &desc_limit, &desc_access); + if (error == 0) { + printf("cs desc[%d]\t0x%016lx/0x%08x/0x%08x\n", + vcpu, desc_base, desc_limit, desc_access); + } + } + + if (!error && get_desc_tr) { + error = vm_get_desc(ctx, vcpu, VM_REG_GUEST_TR, + &desc_base, &desc_limit, &desc_access); + if (error == 0) { + printf("tr desc[%d]\t0x%016lx/0x%08x/0x%08x\n", + vcpu, desc_base, desc_limit, desc_access); + } + } + + if (!error && get_desc_ldtr) { + error = vm_get_desc(ctx, vcpu, VM_REG_GUEST_LDTR, + &desc_base, &desc_limit, &desc_access); + if (error == 0) { + printf("ldtr desc[%d]\t0x%016lx/0x%08x/0x%08x\n", + vcpu, desc_base, desc_limit, desc_access); + } + } + + if (!error && get_desc_gdtr) { + error = vm_get_desc(ctx, vcpu, VM_REG_GUEST_GDTR, + &desc_base, &desc_limit, &desc_access); + if (error == 0) { + printf("gdtr[%d]\t\t0x%016lx/0x%08x\n", + vcpu, desc_base, desc_limit); + } + } + + if (!error && get_desc_idtr) { + error = vm_get_desc(ctx, vcpu, VM_REG_GUEST_IDTR, + &desc_base, &desc_limit, &desc_access); + if (error == 0) { + printf("idtr[%d]\t\t0x%016lx/0x%08x\n", + vcpu, desc_base, desc_limit); + } + } + + if (!error && get_cs) { + error = vm_get_register(ctx, vcpu, VM_REG_GUEST_CS, &cs); + if (error == 0) + printf("cs[%d]\t\t0x%04lx\n", vcpu, cs); + } + + if (!error && get_ds) { + error = vm_get_register(ctx, vcpu, VM_REG_GUEST_DS, &ds); + if (error == 0) + printf("ds[%d]\t\t0x%04lx\n", vcpu, ds); + } + + if (!error && get_es) { + error = vm_get_register(ctx, vcpu, VM_REG_GUEST_ES, &es); + if (error == 0) + printf("es[%d]\t\t0x%04lx\n", vcpu, es); + } + + if (!error && get_fs) { + error = vm_get_register(ctx, vcpu, VM_REG_GUEST_FS, &fs); + if (error == 0) + printf("fs[%d]\t\t0x%04lx\n", vcpu, fs); + } + + if (!error && get_gs) { + error = vm_get_register(ctx, vcpu, VM_REG_GUEST_GS, &gs); + if (error == 0) + printf("gs[%d]\t\t0x%04lx\n", vcpu, gs); + } + + if (!error && get_ss) { + error = vm_get_register(ctx, vcpu, VM_REG_GUEST_SS, &ss); + if (error == 0) + printf("ss[%d]\t\t0x%04lx\n", vcpu, ss); + } + + if (!error && get_tr) { + error = vm_get_register(ctx, vcpu, VM_REG_GUEST_TR, &tr); + if (error == 0) + printf("tr[%d]\t\t0x%04lx\n", vcpu, tr); + } + + if (!error && get_ldtr) { + error = vm_get_register(ctx, vcpu, VM_REG_GUEST_LDTR, &ldtr); + if (error == 0) + printf("ldtr[%d]\t\t0x%04lx\n", vcpu, ldtr); + } + + if (!error && get_pinning) { + error = vm_get_pinning(ctx, vcpu, &pincpu); + if (error == 0) { + if (pincpu < 0) + printf("pincpu[%d]\tunpinned\n", vcpu); + else + printf("pincpu[%d]\t%d\n", vcpu, pincpu); + } + } + + if (!error && get_pinbased_ctls) { + error = vm_get_vmcs_field(ctx, vcpu, VMCS_PIN_BASED_CTLS, &ctl); + if (error == 0) + printf("pinbased_ctls[%d]\t0x%08lx\n", vcpu, ctl); + } + + if (!error && get_procbased_ctls) { + error = vm_get_vmcs_field(ctx, vcpu, + VMCS_PRI_PROC_BASED_CTLS, &ctl); + if (error == 0) + printf("procbased_ctls[%d]\t0x%08lx\n", vcpu, ctl); + } + + if (!error && get_procbased_ctls2) { + error = vm_get_vmcs_field(ctx, vcpu, + VMCS_SEC_PROC_BASED_CTLS, &ctl); + if (error == 0) + printf("procbased_ctls2[%d]\t0x%08lx\n", vcpu, ctl); + } + + if (!error && get_vmcs_gla) { + error = vm_get_vmcs_field(ctx, vcpu, + VMCS_GUEST_LINEAR_ADDRESS, &u64); + if (error == 0) + printf("gla[%d]\t\t0x%016lx\n", vcpu, u64); + } + + if (!error && get_vmcs_gpa) { + error = vm_get_vmcs_field(ctx, vcpu, + VMCS_GUEST_PHYSICAL_ADDRESS, &u64); + if (error == 0) + printf("gpa[%d]\t\t0x%016lx\n", vcpu, u64); + } + + if (!error && get_vmcs_entry_interruption_info) { + error = vm_get_vmcs_field(ctx, vcpu, VMCS_ENTRY_INTR_INFO,&u64); + if (error == 0) { + printf("entry_interruption_info[%d]\t0x%08lx\n", + vcpu, u64); + } + } + + if (!error && get_eptp) { + error = vm_get_vmcs_field(ctx, vcpu, VMCS_EPTP, &eptp); + if (error == 0) + printf("eptp[%d]\t\t0x%016lx\n", vcpu, eptp); + } + + if (!error && get_exception_bitmap) { + error = vm_get_vmcs_field(ctx, vcpu, VMCS_EXCEPTION_BITMAP, + &bm); + if (error == 0) + printf("exception_bitmap[%d]\t0x%08lx\n", vcpu, bm); + } + + if (!error && get_io_bitmap) { + error = vm_get_vmcs_field(ctx, vcpu, VMCS_IO_BITMAP_A, &bm); + if (error == 0) + printf("io_bitmap_a[%d]\t0x%08lx\n", vcpu, bm); + error = vm_get_vmcs_field(ctx, vcpu, VMCS_IO_BITMAP_B, &bm); + if (error == 0) + printf("io_bitmap_b[%d]\t0x%08lx\n", vcpu, bm); + } + + if (!error && get_tsc_offset) { + uint64_t tscoff; + error = vm_get_vmcs_field(ctx, vcpu, VMCS_TSC_OFFSET, &tscoff); + if (error == 0) + printf("tsc_offset[%d]\t0x%016lx\n", vcpu, tscoff); + } + + if (!error && get_cr0_mask) { + uint64_t cr0mask; + error = vm_get_vmcs_field(ctx, vcpu, VMCS_CR0_MASK, &cr0mask); + if (error == 0) + printf("cr0_mask[%d]\t\t0x%016lx\n", vcpu, cr0mask); + } + + if (!error && get_cr0_shadow) { + uint64_t cr0shadow; + error = vm_get_vmcs_field(ctx, vcpu, VMCS_CR0_SHADOW, + &cr0shadow); + if (error == 0) + printf("cr0_shadow[%d]\t\t0x%016lx\n", vcpu, cr0shadow); + } + + if (!error && get_cr4_mask) { + uint64_t cr4mask; + error = vm_get_vmcs_field(ctx, vcpu, VMCS_CR4_MASK, &cr4mask); + if (error == 0) + printf("cr4_mask[%d]\t\t0x%016lx\n", vcpu, cr4mask); + } + + if (!error && get_cr4_shadow) { + uint64_t cr4shadow; + error = vm_get_vmcs_field(ctx, vcpu, VMCS_CR4_SHADOW, + &cr4shadow); + if (error == 0) + printf("cr4_shadow[%d]\t\t0x%016lx\n", vcpu, cr4shadow); + } + + if (!error && get_cr3_targets) { + uint64_t target_count, target_addr; + error = vm_get_vmcs_field(ctx, vcpu, VMCS_CR3_TARGET_COUNT, + &target_count); + if (error == 0) { + printf("cr3_target_count[%d]\t0x%08lx\n", + vcpu, target_count); + } + + error = vm_get_vmcs_field(ctx, vcpu, VMCS_CR3_TARGET0, + &target_addr); + if (error == 0) { + printf("cr3_target0[%d]\t\t0x%016lx\n", + vcpu, target_addr); + } + + error = vm_get_vmcs_field(ctx, vcpu, VMCS_CR3_TARGET1, + &target_addr); + if (error == 0) { + printf("cr3_target1[%d]\t\t0x%016lx\n", + vcpu, target_addr); + } + + error = vm_get_vmcs_field(ctx, vcpu, VMCS_CR3_TARGET2, + &target_addr); + if (error == 0) { + printf("cr3_target2[%d]\t\t0x%016lx\n", + vcpu, target_addr); + } + + error = vm_get_vmcs_field(ctx, vcpu, VMCS_CR3_TARGET3, + &target_addr); + if (error == 0) { + printf("cr3_target3[%d]\t\t0x%016lx\n", + vcpu, target_addr); + } + } + + if (!error && get_apic_access_addr) { + error = vm_get_vmcs_field(ctx, vcpu, VMCS_APIC_ACCESS, &addr); + if (error == 0) + printf("apic_access_addr[%d]\t0x%016lx\n", vcpu, addr); + } + + if (!error && get_virtual_apic_addr) { + error = vm_get_vmcs_field(ctx, vcpu, VMCS_VIRTUAL_APIC, &addr); + if (error == 0) + printf("virtual_apic_addr[%d]\t0x%016lx\n", vcpu, addr); + } + + if (!error && get_tpr_threshold) { + uint64_t threshold; + error = vm_get_vmcs_field(ctx, vcpu, VMCS_TPR_THRESHOLD, + &threshold); + if (error == 0) + printf("tpr_threshold[%d]\t0x%08lx\n", vcpu, threshold); + } + + if (!error && get_msr_bitmap_address) { + error = vm_get_vmcs_field(ctx, vcpu, VMCS_MSR_BITMAP, &addr); + if (error == 0) + printf("msr_bitmap[%d]\t\t0x%016lx\n", vcpu, addr); + } + + if (!error && get_msr_bitmap) { + error = vm_get_vmcs_field(ctx, vcpu, VMCS_MSR_BITMAP, &addr); + if (error == 0) + error = dump_vmcs_msr_bitmap(vcpu, addr); + } + + if (!error && get_vpid) { + uint64_t vpid; + error = vm_get_vmcs_field(ctx, vcpu, VMCS_VPID, &vpid); + if (error == 0) + printf("vpid[%d]\t\t0x%04lx\n", vcpu, vpid); + } + + if (!error && get_ple_window) { + uint64_t window; + error = vm_get_vmcs_field(ctx, vcpu, VMCS_PLE_WINDOW, &window); + if (error == 0) + printf("ple_window[%d]\t\t0x%08lx\n", vcpu, window); + } + + if (!error && get_ple_gap) { + uint64_t gap; + error = vm_get_vmcs_field(ctx, vcpu, VMCS_PLE_GAP, &gap); + if (error == 0) + printf("ple_gap[%d]\t\t0x%08lx\n", vcpu, gap); + } + + if (!error && get_inst_err) { + uint64_t insterr; + error = vm_get_vmcs_field(ctx, vcpu, VMCS_INSTRUCTION_ERROR, + &insterr); + if (error == 0) { + printf("instruction_error[%d]\t0x%08lx\n", + vcpu, insterr); + } + } + + if (!error && get_exit_ctls) { + error = vm_get_vmcs_field(ctx, vcpu, VMCS_EXIT_CTLS, &ctl); + if (error == 0) + printf("exit_ctls[%d]\t\t0x%08lx\n", vcpu, ctl); + } + + if (!error && get_entry_ctls) { + error = vm_get_vmcs_field(ctx, vcpu, VMCS_ENTRY_CTLS, &ctl); + if (error == 0) + printf("entry_ctls[%d]\t\t0x%08lx\n", vcpu, ctl); + } + + if (!error && get_host_pat) { + error = vm_get_vmcs_field(ctx, vcpu, VMCS_HOST_IA32_PAT, &pat); + if (error == 0) + printf("host_pat[%d]\t\t0x%016lx\n", vcpu, pat); + } + + if (!error && get_guest_pat) { + error = vm_get_vmcs_field(ctx, vcpu, VMCS_GUEST_IA32_PAT, &pat); + if (error == 0) + printf("guest_pat[%d]\t\t0x%016lx\n", vcpu, pat); + } + + if (!error && get_host_cr0) { + error = vm_get_vmcs_field(ctx, vcpu, VMCS_HOST_CR0, &cr0); + if (error == 0) + printf("host_cr0[%d]\t\t0x%016lx\n", vcpu, cr0); + } + + if (!error && get_host_cr3) { + error = vm_get_vmcs_field(ctx, vcpu, VMCS_HOST_CR3, &cr3); + if (error == 0) + printf("host_cr3[%d]\t\t0x%016lx\n", vcpu, cr3); + } + + if (!error && get_host_cr4) { + error = vm_get_vmcs_field(ctx, vcpu, VMCS_HOST_CR4, &cr4); + if (error == 0) + printf("host_cr4[%d]\t\t0x%016lx\n", vcpu, cr4); + } + + if (!error && get_host_rip) { + error = vm_get_vmcs_field(ctx, vcpu, VMCS_HOST_RIP, &rip); + if (error == 0) + printf("host_rip[%d]\t\t0x%016lx\n", vcpu, rip); + } + + if (!error && get_host_rsp) { + error = vm_get_vmcs_field(ctx, vcpu, VMCS_HOST_RSP, &rsp); + if (error == 0) + printf("host_rip[%d]\t\t0x%016lx\n", vcpu, rsp); + } + + if (!error && get_guest_sysenter) { + error = vm_get_vmcs_field(ctx, vcpu, + VMCS_GUEST_IA32_SYSENTER_CS, &cs); + if (error == 0) + printf("guest_sysenter_cs[%d]\t0x%08lx\n", vcpu, cs); + + error = vm_get_vmcs_field(ctx, vcpu, + VMCS_GUEST_IA32_SYSENTER_ESP, &rsp); + if (error == 0) + printf("guest_sysenter_sp[%d]\t0x%016lx\n", vcpu, rsp); + error = vm_get_vmcs_field(ctx, vcpu, + VMCS_GUEST_IA32_SYSENTER_EIP, &rip); + if (error == 0) + printf("guest_sysenter_ip[%d]\t0x%016lx\n", vcpu, rip); + } + + if (!error && get_vmcs_link) { + error = vm_get_vmcs_field(ctx, vcpu, VMCS_LINK_POINTER, &addr); + if (error == 0) + printf("vmcs_pointer[%d]\t0x%016lx\n", vcpu, addr); + } + + if (!error && get_vmcs_exit_reason) { + error = vm_get_vmcs_field(ctx, vcpu, VMCS_EXIT_REASON, &u64); + if (error == 0) + printf("vmcs_exit_reason[%d]\t0x%016lx\n", vcpu, u64); + } + + if (!error && get_vmcs_exit_qualification) { + error = vm_get_vmcs_field(ctx, vcpu, VMCS_EXIT_QUALIFICATION, + &u64); + if (error == 0) + printf("vmcs_exit_qualification[%d]\t0x%016lx\n", + vcpu, u64); + } + + if (!error && get_vmcs_exit_interruption_info) { + error = vm_get_vmcs_field(ctx, vcpu, + VMCS_EXIT_INTERRUPTION_INFO, &u64); + if (error == 0) { + printf("vmcs_exit_interruption_info[%d]\t0x%08lx\n", + vcpu, u64); + } + } + + if (!error && get_vmcs_exit_interruption_error) { + error = vm_get_vmcs_field(ctx, vcpu, + VMCS_EXIT_INTERRUPTION_ERROR, &u64); + if (error == 0) { + printf("vmcs_exit_interruption_error[%d]\t0x%08lx\n", + vcpu, u64); + } + } + + if (!error && get_vmcs_interruptibility) { + error = vm_get_vmcs_field(ctx, vcpu, + VMCS_GUEST_INTERRUPTIBILITY, &u64); + if (error == 0) { + printf("vmcs_guest_interruptibility[%d]\t0x%08lx\n", + vcpu, u64); + } + } + + if (!error && setcap) { + int captype; + captype = vm_capability_name2type(capname); + error = vm_set_capability(ctx, vcpu, captype, capval); + if (error != 0 && errno == ENOENT) + printf("Capability \"%s\" is not available\n", capname); + } + + if (!error && getcap) { + int captype, val; + captype = vm_capability_name2type(capname); + error = vm_get_capability(ctx, vcpu, captype, &val); + if (error == 0) { + printf("Capability \"%s\" is %s on vcpu %d\n", capname, + val ? "set" : "not set", vcpu); + } else if (errno == ENOENT) { + printf("Capability \"%s\" is not available\n", capname); + } + } + + if (!error && run) { + error = vm_get_register(ctx, vcpu, VM_REG_GUEST_RIP, &rip); + assert(error == 0); + + error = vm_run(ctx, vcpu, rip, &vmexit); + if (error == 0) + dump_vm_run_exitcode(&vmexit, vcpu); + else + printf("vm_run error %d\n", error); + } + + if (error) + printf("errno = %d\n", errno); + + if (!error && destroy) + vm_destroy(ctx); + + exit(error); +}