MSI-x interrupt support for PCI pass-thru devices.
Includes instruction emulation for memory r/w access. This opens the door for io-apic, local apic, hpet timer, and legacy device emulation. Submitted by: ryan dot berryhill at sandvine dot com Reviewed by: grehan Obtained from: Sandvine
This commit is contained in:
parent
38f1b189cd
commit
cd942e0f25
@ -454,6 +454,25 @@ vm_setup_msi(struct vmctx *ctx, int vcpu, int bus, int slot, int func,
|
||||
return (ioctl(ctx->fd, VM_PPTDEV_MSI, &pptmsi));
|
||||
}
|
||||
|
||||
int
|
||||
vm_setup_msix(struct vmctx *ctx, int vcpu, int bus, int slot, int func,
|
||||
int idx, uint32_t msg, uint32_t vector_control, uint64_t addr)
|
||||
{
|
||||
struct vm_pptdev_msix pptmsix;
|
||||
|
||||
bzero(&pptmsix, sizeof(pptmsix));
|
||||
pptmsix.vcpu = vcpu;
|
||||
pptmsix.bus = bus;
|
||||
pptmsix.slot = slot;
|
||||
pptmsix.func = func;
|
||||
pptmsix.idx = idx;
|
||||
pptmsix.msg = msg;
|
||||
pptmsix.addr = addr;
|
||||
pptmsix.vector_control = vector_control;
|
||||
|
||||
return ioctl(ctx->fd, VM_PPTDEV_MSIX, &pptmsix);
|
||||
}
|
||||
|
||||
uint64_t *
|
||||
vm_get_stats(struct vmctx *ctx, int vcpu, struct timeval *ret_tv,
|
||||
int *ret_entries)
|
||||
|
@ -77,6 +77,8 @@ int vm_map_pptdev_mmio(struct vmctx *ctx, int bus, int slot, int func,
|
||||
vm_paddr_t gpa, size_t len, vm_paddr_t hpa);
|
||||
int vm_setup_msi(struct vmctx *ctx, int vcpu, int bus, int slot, int func,
|
||||
int dest, int vector, int numvec);
|
||||
int vm_setup_msix(struct vmctx *ctx, int vcpu, int bus, int slot, int func,
|
||||
int idx, uint32_t msg, uint32_t vector_control, uint64_t addr);
|
||||
|
||||
/*
|
||||
* Return a pointer to the statistics buffer. Note that this is not MT-safe.
|
||||
|
@ -227,7 +227,8 @@ enum vm_exitcode {
|
||||
VM_EXITCODE_HLT,
|
||||
VM_EXITCODE_MTRAP,
|
||||
VM_EXITCODE_PAUSE,
|
||||
VM_EXITCODE_MAX,
|
||||
VM_EXITCODE_PAGING,
|
||||
VM_EXITCODE_MAX
|
||||
};
|
||||
|
||||
struct vm_exit {
|
||||
@ -243,6 +244,9 @@ struct vm_exit {
|
||||
uint16_t port;
|
||||
uint32_t eax; /* valid for out */
|
||||
} inout;
|
||||
struct {
|
||||
uint64_t cr3;
|
||||
} paging;
|
||||
/*
|
||||
* VMX specific payload. Used when there is no "better"
|
||||
* exitcode to represent the VM-exit.
|
||||
|
@ -108,6 +108,17 @@ struct vm_pptdev_msi {
|
||||
int destcpu;
|
||||
};
|
||||
|
||||
/*
 * Argument of the VM_PPTDEV_MSIX ioctl.  Each request describes a single
 * MSI-X table entry of a passthru PCI device; the ioctl handler forwards
 * every field to ppt_setup_msix().
 */
struct vm_pptdev_msix {
|
||||
int vcpu; /* forwarded to ppt_setup_msix(); not consulted by the code shown */
|
||||
int bus; /* PCI bus of the passthru device (used for the device lookup) */
|
||||
int slot; /* PCI slot of the passthru device */
|
||||
int func; /* PCI function of the passthru device */
|
||||
int idx; /* index of the MSI-X table entry being programmed */
|
||||
uint32_t msg; /* message data; stored as the vector to inject */
|
||||
uint32_t vector_control; /* tested against PCIM_MSIX_VCTRL_MASK */
|
||||
uint64_t addr; /* message address; bits 19:12 select the target vcpu */
|
||||
};
|
||||
|
||||
struct vm_nmi {
|
||||
int cpuid;
|
||||
};
|
||||
@ -143,6 +154,7 @@ enum {
|
||||
IOCNUM_UNBIND_PPTDEV,
|
||||
IOCNUM_MAP_PPTDEV_MMIO,
|
||||
IOCNUM_PPTDEV_MSI,
|
||||
IOCNUM_PPTDEV_MSIX,
|
||||
IOCNUM_INJECT_NMI,
|
||||
IOCNUM_VM_STATS,
|
||||
IOCNUM_VM_STAT_DESC,
|
||||
@ -182,6 +194,8 @@ enum {
|
||||
_IOW('v', IOCNUM_MAP_PPTDEV_MMIO, struct vm_pptdev_mmio)
|
||||
#define VM_PPTDEV_MSI \
|
||||
_IOW('v', IOCNUM_PPTDEV_MSI, struct vm_pptdev_msi)
|
||||
#define VM_PPTDEV_MSIX \
|
||||
_IOW('v', IOCNUM_PPTDEV_MSIX, struct vm_pptdev_msix)
|
||||
#define VM_INJECT_NMI \
|
||||
_IOW('v', IOCNUM_INJECT_NMI, struct vm_nmi)
|
||||
#define VM_STATS \
|
||||
|
@ -65,6 +65,7 @@ uint64_t vmcs_read(uint32_t encoding);
|
||||
#define vmcs_instruction_error() vmcs_read(VMCS_INSTRUCTION_ERROR)
|
||||
#define vmcs_exit_reason() (vmcs_read(VMCS_EXIT_REASON) & 0xffff)
|
||||
#define vmcs_exit_qualification() vmcs_read(VMCS_EXIT_QUALIFICATION)
|
||||
#define vmcs_guest_cr3() vmcs_read(VMCS_GUEST_CR3)
|
||||
|
||||
#endif /* _KERNEL */
|
||||
|
||||
|
@ -1185,6 +1185,10 @@ vmx_exit_process(struct vmx *vmx, int vcpu, struct vm_exit *vmexit)
|
||||
case EXIT_REASON_CPUID:
|
||||
handled = vmx_handle_cpuid(vcpu, vmxctx);
|
||||
break;
|
||||
case EXIT_REASON_EPT_FAULT:
|
||||
vmexit->exitcode = VM_EXITCODE_PAGING;
|
||||
vmexit->u.paging.cr3 = vmcs_guest_cr3();
|
||||
break;
|
||||
default:
|
||||
break;
|
||||
}
|
||||
|
@ -32,6 +32,7 @@ __FBSDID("$FreeBSD$");
|
||||
#include <sys/param.h>
|
||||
#include <sys/systm.h>
|
||||
#include <sys/kernel.h>
|
||||
#include <sys/malloc.h>
|
||||
#include <sys/module.h>
|
||||
#include <sys/bus.h>
|
||||
#include <sys/pciio.h>
|
||||
@ -56,9 +57,12 @@ __FBSDID("$FreeBSD$");
|
||||
#define MAX_MMIOSEGS (PCIR_MAX_BAR_0 + 1)
|
||||
#define MAX_MSIMSGS 32
|
||||
|
||||
MALLOC_DEFINE(M_PPTMSIX, "pptmsix", "Passthru MSI-X resources");
|
||||
|
||||
/*
 * Per-interrupt context handed to pptintr() as its handler argument.  One
 * instance exists for every MSI message / MSI-X table entry that has a
 * handler installed.
 */
struct pptintr_arg { /* pptintr(pptintr_arg) */
|
||||
struct pptdev *pptdev; /* passthru device that owns this interrupt */
|
||||
int msg; /* message index; the MSI setup path stores its loop index here */
|
||||
int vec; /* vector that pptintr() passes to lapic_set_intr() */
|
||||
int vcpu; /* vcpu that pptintr() passes to lapic_set_intr() */
|
||||
};
|
||||
|
||||
static struct pptdev {
|
||||
@ -75,6 +79,16 @@ static struct pptdev {
|
||||
void *cookie[MAX_MSIMSGS];
|
||||
struct pptintr_arg arg[MAX_MSIMSGS];
|
||||
} msi;
|
||||
|
||||
struct {
|
||||
int num_msgs;
|
||||
int startrid;
|
||||
int msix_table_rid;
|
||||
struct resource *msix_table_res;
|
||||
struct resource **res;
|
||||
void **cookie;
|
||||
struct pptintr_arg *arg;
|
||||
} msix;
|
||||
} pptdevs[32];
|
||||
|
||||
static int num_pptdevs;
|
||||
@ -209,6 +223,57 @@ ppt_teardown_msi(struct pptdev *ppt)
|
||||
ppt->msi.num_msgs = 0;
|
||||
}
|
||||
|
||||
static void
|
||||
ppt_teardown_msix_intr(struct pptdev *ppt, int idx)
|
||||
{
|
||||
int rid;
|
||||
struct resource *res;
|
||||
void *cookie;
|
||||
|
||||
rid = ppt->msix.startrid + idx;
|
||||
res = ppt->msix.res[idx];
|
||||
cookie = ppt->msix.cookie[idx];
|
||||
|
||||
if (cookie != NULL)
|
||||
bus_teardown_intr(ppt->dev, res, cookie);
|
||||
|
||||
if (res != NULL)
|
||||
bus_release_resource(ppt->dev, SYS_RES_IRQ, rid, res);
|
||||
|
||||
ppt->msix.res[idx] = NULL;
|
||||
ppt->msix.cookie[idx] = NULL;
|
||||
}
|
||||
|
||||
static void
|
||||
ppt_teardown_msix(struct pptdev *ppt)
|
||||
{
|
||||
int i, error;
|
||||
|
||||
if (ppt->msix.num_msgs == 0)
|
||||
return;
|
||||
|
||||
for (i = 0; i < ppt->msix.num_msgs; i++)
|
||||
ppt_teardown_msix_intr(ppt, i);
|
||||
|
||||
if (ppt->msix.msix_table_res) {
|
||||
bus_release_resource(ppt->dev, SYS_RES_MEMORY,
|
||||
ppt->msix.msix_table_rid,
|
||||
ppt->msix.msix_table_res);
|
||||
ppt->msix.msix_table_res = NULL;
|
||||
ppt->msix.msix_table_rid = 0;
|
||||
}
|
||||
|
||||
free(ppt->msix.res, M_PPTMSIX);
|
||||
free(ppt->msix.cookie, M_PPTMSIX);
|
||||
free(ppt->msix.arg, M_PPTMSIX);
|
||||
|
||||
error = pci_release_msi(ppt->dev);
|
||||
if (error)
|
||||
printf("ppt_teardown_msix: Failed to release MSI-X resources (error %i)\n", error);
|
||||
|
||||
ppt->msix.num_msgs = 0;
|
||||
}
|
||||
|
||||
int
|
||||
ppt_assign_device(struct vm *vm, int bus, int slot, int func)
|
||||
{
|
||||
@ -244,6 +309,7 @@ ppt_unassign_device(struct vm *vm, int bus, int slot, int func)
|
||||
return (EBUSY);
|
||||
ppt_unmap_mmio(vm, ppt);
|
||||
ppt_teardown_msi(ppt);
|
||||
ppt_teardown_msix(ppt);
|
||||
iommu_remove_device(vm_iommu_domain(vm), bus, slot, func);
|
||||
ppt->vm = NULL;
|
||||
return (0);
|
||||
@ -309,10 +375,10 @@ pptintr(void *arg)
|
||||
|
||||
pptarg = arg;
|
||||
ppt = pptarg->pptdev;
|
||||
vec = ppt->msi.vector + pptarg->msg;
|
||||
vec = pptarg->vec;
|
||||
|
||||
if (ppt->vm != NULL)
|
||||
(void) lapic_set_intr(ppt->vm, ppt->msi.vcpu, vec);
|
||||
(void) lapic_set_intr(ppt->vm, pptarg->vcpu, vec);
|
||||
else {
|
||||
/*
|
||||
* XXX
|
||||
@ -431,7 +497,7 @@ ppt_setup_msi(struct vm *vm, int vcpu, int bus, int slot, int func,
|
||||
break;
|
||||
|
||||
ppt->msi.arg[i].pptdev = ppt;
|
||||
ppt->msi.arg[i].msg = i;
|
||||
ppt->msi.arg[i].vec = vector + i;
|
||||
|
||||
error = bus_setup_intr(ppt->dev, ppt->msi.res[i],
|
||||
INTR_TYPE_NET | INTR_MPSAFE,
|
||||
@ -448,3 +514,110 @@ ppt_setup_msi(struct vm *vm, int vcpu, int bus, int slot, int func,
|
||||
|
||||
return (0);
|
||||
}
|
||||
|
||||
int
|
||||
ppt_setup_msix(struct vm *vm, int vcpu, int bus, int slot, int func,
|
||||
int idx, uint32_t msg, uint32_t vector_control, uint64_t addr)
|
||||
{
|
||||
struct pptdev *ppt;
|
||||
struct pci_devinfo *dinfo;
|
||||
int numvec, vector_count, rid, error;
|
||||
size_t res_size, cookie_size, arg_size;
|
||||
|
||||
ppt = ppt_find(bus, slot, func);
|
||||
if (ppt == NULL)
|
||||
return (ENOENT);
|
||||
if (ppt->vm != vm) /* Make sure we own this device */
|
||||
return (EBUSY);
|
||||
|
||||
dinfo = device_get_ivars(ppt->dev);
|
||||
if (!dinfo)
|
||||
return (ENXIO);
|
||||
|
||||
/*
|
||||
* First-time configuration:
|
||||
* Allocate the MSI-X table
|
||||
* Allocate the IRQ resources
|
||||
* Set up some variables in ppt->msix
|
||||
*/
|
||||
if (!ppt->msix.msix_table_res) {
|
||||
ppt->msix.res = NULL;
|
||||
ppt->msix.cookie = NULL;
|
||||
ppt->msix.arg = NULL;
|
||||
|
||||
rid = dinfo->cfg.msix.msix_table_bar;
|
||||
ppt->msix.msix_table_res = bus_alloc_resource_any(ppt->dev, SYS_RES_MEMORY,
|
||||
&rid, RF_ACTIVE);
|
||||
if (ppt->msix.msix_table_res == NULL)
|
||||
return (ENOSPC);
|
||||
|
||||
ppt->msix.msix_table_rid = rid;
|
||||
|
||||
vector_count = numvec = pci_msix_count(ppt->dev);
|
||||
|
||||
error = pci_alloc_msix(ppt->dev, &numvec);
|
||||
if (error)
|
||||
return (error);
|
||||
else if (vector_count != numvec) {
|
||||
pci_release_msi(ppt->dev);
|
||||
return (ENOSPC);
|
||||
}
|
||||
|
||||
ppt->msix.num_msgs = numvec;
|
||||
|
||||
ppt->msix.startrid = 1;
|
||||
|
||||
res_size = numvec * sizeof(ppt->msix.res[0]);
|
||||
cookie_size = numvec * sizeof(ppt->msix.cookie[0]);
|
||||
arg_size = numvec * sizeof(ppt->msix.arg[0]);
|
||||
|
||||
ppt->msix.res = malloc(res_size, M_PPTMSIX, M_WAITOK);
|
||||
ppt->msix.cookie = malloc(cookie_size, M_PPTMSIX, M_WAITOK);
|
||||
ppt->msix.arg = malloc(arg_size, M_PPTMSIX, M_WAITOK);
|
||||
if (ppt->msix.res == NULL || ppt->msix.cookie == NULL ||
|
||||
ppt->msix.arg == NULL) {
|
||||
ppt_teardown_msix(ppt);
|
||||
return (ENOSPC);
|
||||
}
|
||||
bzero(ppt->msix.res, res_size);
|
||||
bzero(ppt->msix.cookie, cookie_size);
|
||||
bzero(ppt->msix.arg, arg_size);
|
||||
}
|
||||
|
||||
if ((vector_control & PCIM_MSIX_VCTRL_MASK) == 0) {
|
||||
/* Tear down the IRQ if it's already set up */
|
||||
ppt_teardown_msix_intr(ppt, idx);
|
||||
|
||||
/* Allocate the IRQ resource */
|
||||
ppt->msix.cookie[idx] = NULL;
|
||||
rid = ppt->msix.startrid + idx;
|
||||
ppt->msix.res[idx] = bus_alloc_resource_any(ppt->dev, SYS_RES_IRQ,
|
||||
&rid, RF_ACTIVE);
|
||||
if (ppt->msix.res[idx] == NULL)
|
||||
return (ENXIO);
|
||||
|
||||
ppt->msix.arg[idx].pptdev = ppt;
|
||||
ppt->msix.arg[idx].vec = msg;
|
||||
ppt->msix.arg[idx].vcpu = (addr >> 12) & 0xFF;
|
||||
|
||||
/* Setup the MSI-X interrupt */
|
||||
error = bus_setup_intr(ppt->dev, ppt->msix.res[idx],
|
||||
INTR_TYPE_NET | INTR_MPSAFE,
|
||||
pptintr, NULL, &ppt->msix.arg[idx],
|
||||
&ppt->msix.cookie[idx]);
|
||||
|
||||
if (error != 0) {
|
||||
bus_teardown_intr(ppt->dev, ppt->msix.res[idx], ppt->msix.cookie[idx]);
|
||||
bus_release_resource(ppt->dev, SYS_RES_IRQ, rid, ppt->msix.res[idx]);
|
||||
ppt->msix.cookie[idx] = NULL;
|
||||
ppt->msix.res[idx] = NULL;
|
||||
return (ENXIO);
|
||||
}
|
||||
} else {
|
||||
/* Masked, tear it down if it's already been set up */
|
||||
ppt_teardown_msix_intr(ppt, idx);
|
||||
}
|
||||
|
||||
return (0);
|
||||
}
|
||||
|
||||
|
@ -36,5 +36,6 @@ int ppt_map_mmio(struct vm *vm, int bus, int slot, int func,
|
||||
vm_paddr_t gpa, size_t len, vm_paddr_t hpa);
|
||||
int ppt_setup_msi(struct vm *vm, int vcpu, int bus, int slot, int func,
|
||||
int destcpu, int vector, int numvec);
|
||||
|
||||
int ppt_setup_msix(struct vm *vm, int vcpu, int bus, int slot, int func,
|
||||
int idx, uint32_t msg, uint32_t vector_control, uint64_t addr);
|
||||
#endif
|
||||
|
@ -778,6 +778,7 @@ vlapic_init(struct vm *vm, int vcpuid)
|
||||
/*
 * Destroy a vlapic: calls vlapic_op_halt() and vdev_unregister() on it and
 * then releases its M_VLAPIC allocation.  The pointer must not be used
 * after this returns.
 */
void
|
||||
vlapic_cleanup(struct vlapic *vlapic)
|
||||
{
|
||||
vlapic_op_halt(vlapic);
|
||||
vdev_unregister(vlapic);
|
||||
free(vlapic, M_VLAPIC);
|
||||
}
|
||||
|
@ -158,6 +158,7 @@ vmmdev_ioctl(struct cdev *cdev, u_long cmd, caddr_t data, int fflag,
|
||||
struct vm_pptdev *pptdev;
|
||||
struct vm_pptdev_mmio *pptmmio;
|
||||
struct vm_pptdev_msi *pptmsi;
|
||||
struct vm_pptdev_msix *pptmsix;
|
||||
struct vm_nmi *vmnmi;
|
||||
struct vm_stats *vmstats;
|
||||
struct vm_stat_desc *statdesc;
|
||||
@ -240,6 +241,14 @@ vmmdev_ioctl(struct cdev *cdev, u_long cmd, caddr_t data, int fflag,
|
||||
pptmsi->destcpu, pptmsi->vector,
|
||||
pptmsi->numvec);
|
||||
break;
|
||||
case VM_PPTDEV_MSIX:
|
||||
pptmsix = (struct vm_pptdev_msix *)data;
|
||||
error = ppt_setup_msix(sc->vm, pptmsix->vcpu,
|
||||
pptmsix->bus, pptmsix->slot,
|
||||
pptmsix->func, pptmsix->idx,
|
||||
pptmsix->msg, pptmsix->vector_control,
|
||||
pptmsix->addr);
|
||||
break;
|
||||
case VM_MAP_PPTDEV_MMIO:
|
||||
pptmmio = (struct vm_pptdev_mmio *)data;
|
||||
error = ppt_map_mmio(sc->vm, pptmmio->bus, pptmmio->slot,
|
||||
|
@ -4,7 +4,8 @@
|
||||
|
||||
PROG= bhyve
|
||||
|
||||
SRCS= atpic.c consport.c dbgport.c elcr.c fbsdrun.c inout.c mevent.c
|
||||
SRCS= atpic.c consport.c dbgport.c elcr.c fbsdrun.c inout.c
|
||||
SRCS+= instruction_emul.c mevent.c
|
||||
SRCS+= pci_emul.c pci_hostbridge.c pci_passthru.c pci_virtio_block.c
|
||||
SRCS+= pci_virtio_net.c pit_8254.c post.c rtc.c uart.c xmsr.c
|
||||
|
||||
|
@ -53,6 +53,7 @@ __FBSDID("$FreeBSD$");
|
||||
#include "mevent.h"
|
||||
#include "pci_emul.h"
|
||||
#include "xmsr.h"
|
||||
#include "instruction_emul.h"
|
||||
|
||||
#define DEFAULT_GUEST_HZ 100
|
||||
#define DEFAULT_GUEST_TSLICE 200
|
||||
@ -108,6 +109,7 @@ struct fbsdstats {
|
||||
uint64_t vmexit_hlt;
|
||||
uint64_t vmexit_pause;
|
||||
uint64_t vmexit_mtrap;
|
||||
uint64_t vmexit_paging;
|
||||
uint64_t cpu_switch_rotate;
|
||||
uint64_t cpu_switch_direct;
|
||||
int io_reset;
|
||||
@ -412,6 +414,20 @@ vmexit_mtrap(struct vmctx *ctx, struct vm_exit *vmexit, int *pvcpu)
|
||||
return (VMEXIT_RESTART);
|
||||
}
|
||||
|
||||
static int
|
||||
vmexit_paging(struct vmctx *ctx, struct vm_exit *vmexit, int *pvcpu)
|
||||
{
|
||||
|
||||
stats.vmexit_paging++;
|
||||
|
||||
if (emulate_instruction(ctx, *pvcpu, vmexit->rip, vmexit->u.paging.cr3) != 0) {
|
||||
printf("Failed to emulate instruction at 0x%lx\n", vmexit->rip);
|
||||
return (VMEXIT_ABORT);
|
||||
}
|
||||
|
||||
return (VMEXIT_CONTINUE);
|
||||
}
|
||||
|
||||
static void
|
||||
sigalrm(int sig)
|
||||
{
|
||||
@ -446,12 +462,13 @@ setup_timeslice(void)
|
||||
}
|
||||
|
||||
static vmexit_handler_t handler[VM_EXITCODE_MAX] = {
|
||||
[VM_EXITCODE_INOUT] = vmexit_inout,
|
||||
[VM_EXITCODE_VMX] = vmexit_vmx,
|
||||
[VM_EXITCODE_BOGUS] = vmexit_bogus,
|
||||
[VM_EXITCODE_RDMSR] = vmexit_rdmsr,
|
||||
[VM_EXITCODE_WRMSR] = vmexit_wrmsr,
|
||||
[VM_EXITCODE_MTRAP] = vmexit_mtrap,
|
||||
[VM_EXITCODE_INOUT] = vmexit_inout,
|
||||
[VM_EXITCODE_VMX] = vmexit_vmx,
|
||||
[VM_EXITCODE_BOGUS] = vmexit_bogus,
|
||||
[VM_EXITCODE_RDMSR] = vmexit_rdmsr,
|
||||
[VM_EXITCODE_WRMSR] = vmexit_wrmsr,
|
||||
[VM_EXITCODE_MTRAP] = vmexit_mtrap,
|
||||
[VM_EXITCODE_PAGING] = vmexit_paging
|
||||
};
|
||||
|
||||
static void
|
||||
|
555
usr.sbin/bhyve/instruction_emul.c
Normal file
555
usr.sbin/bhyve/instruction_emul.c
Normal file
@ -0,0 +1,555 @@
|
||||
/*-
|
||||
* Copyright (c) 2012 Sandvine, Inc.
|
||||
* All rights reserved.
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions
|
||||
* are met:
|
||||
* 1. Redistributions of source code must retain the above copyright
|
||||
* notice, this list of conditions and the following disclaimer.
|
||||
* 2. Redistributions in binary form must reproduce the above copyright
|
||||
* notice, this list of conditions and the following disclaimer in the
|
||||
* documentation and/or other materials provided with the distribution.
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
|
||||
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
* ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
|
||||
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
|
||||
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
|
||||
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
|
||||
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
|
||||
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
|
||||
* SUCH DAMAGE.
|
||||
*
|
||||
* $FreeBSD$
|
||||
*/
|
||||
|
||||
#include <strings.h>
|
||||
#include <unistd.h>
|
||||
#include <machine/vmm.h>
|
||||
#include <vmmapi.h>
|
||||
|
||||
#include "fbsdrun.h"
|
||||
#include "instruction_emul.h"
|
||||
|
||||
#define PREFIX_LOCK 0xF0
|
||||
#define PREFIX_REPNE 0xF2
|
||||
#define PREFIX_REPE 0xF3
|
||||
#define PREFIX_CS_OVERRIDE 0x2E
|
||||
#define PREFIX_SS_OVERRIDE 0x36
|
||||
#define PREFIX_DS_OVERRIDE 0x3E
|
||||
#define PREFIX_ES_OVERRIDE 0x26
|
||||
#define PREFIX_FS_OVERRIDE 0x64
|
||||
#define PREFIX_GS_OVERRIDE 0x65
|
||||
#define PREFIX_BRANCH_NOT_TAKEN 0x2E
|
||||
#define PREFIX_BRANCH_TAKEN 0x3E
|
||||
#define PREFIX_OPSIZE 0x66
|
||||
#define PREFIX_ADDRSIZE 0x67
|
||||
|
||||
#define OPCODE_2BYTE_ESCAPE 0x0F
|
||||
#define OPCODE_3BYTE_ESCAPE 0x38
|
||||
|
||||
#define MODRM_MOD_MASK 0xC0
|
||||
#define MODRM_MOD_SHIFT 6
|
||||
#define MODRM_RM_MASK 0x07
|
||||
#define MODRM_RM_SHIFT 0
|
||||
#define MODRM_REG_MASK 0x38
|
||||
#define MODRM_REG_SHIFT 3
|
||||
|
||||
#define MOD_INDIRECT 0x0
|
||||
#define MOD_INDIRECT_DISP8 0x1
|
||||
#define MOD_INDIRECT_DISP32 0x2
|
||||
#define MOD_DIRECT 0x3
|
||||
|
||||
#define RM_EAX 0x0
|
||||
#define RM_ECX 0x1
|
||||
#define RM_EDX 0x2
|
||||
#define RM_EBX 0x3
|
||||
#define RM_SIB 0x4
|
||||
#define RM_DISP32 0x5
|
||||
#define RM_EBP RM_DISP32
|
||||
#define RM_ESI 0x6
|
||||
#define RM_EDI 0x7
|
||||
|
||||
#define REG_EAX 0x0
|
||||
#define REG_ECX 0x1
|
||||
#define REG_EDX 0x2
|
||||
#define REG_EBX 0x3
|
||||
#define REG_ESP 0x4
|
||||
#define REG_EBP 0x5
|
||||
#define REG_ESI 0x6
|
||||
#define REG_EDI 0x7
|
||||
#define REG_R8 0x8
|
||||
#define REG_R9 0x9
|
||||
#define REG_R10 0xA
|
||||
#define REG_R11 0xB
|
||||
#define REG_R12 0xC
|
||||
#define REG_R13 0xD
|
||||
#define REG_R14 0xE
|
||||
#define REG_R15 0xF
|
||||
|
||||
#define HAS_MODRM 1
|
||||
#define FROM_RM (1<<1)
|
||||
#define FROM_REG (1<<2)
|
||||
#define TO_RM (1<<3)
|
||||
#define TO_REG (1<<4)
|
||||
|
||||
#define REX_MASK 0xF0
|
||||
#define REX_PREFIX 0x40
|
||||
#define is_rex_prefix(x) ( ((x) & REX_MASK) == REX_PREFIX )
|
||||
#define REX_W_MASK 0x8
|
||||
#define REX_R_MASK 0x4
|
||||
#define REX_X_MASK 0x2
|
||||
#define REX_B_MASK 0x1
|
||||
|
||||
#define is_prefix(x) ((x) == PREFIX_LOCK || (x) == PREFIX_REPNE || \
|
||||
(x) == PREFIX_REPE || (x) == PREFIX_CS_OVERRIDE || \
|
||||
(x) == PREFIX_SS_OVERRIDE || (x) == PREFIX_DS_OVERRIDE || \
|
||||
(x) == PREFIX_ES_OVERRIDE || (x) == PREFIX_FS_OVERRIDE || \
|
||||
(x) == PREFIX_GS_OVERRIDE || (x) == PREFIX_BRANCH_NOT_TAKEN || \
|
||||
(x) == PREFIX_BRANCH_TAKEN || (x) == PREFIX_OPSIZE || \
|
||||
(x) == PREFIX_ADDRSIZE || is_rex_prefix((x)))
|
||||
|
||||
#define PAGE_FRAME_MASK 0x80
|
||||
#define PAGE_OFFSET_MASK 0xFFF
|
||||
#define PAGE_TABLE_ENTRY_MASK (~PAGE_OFFSET_MASK)
|
||||
#define PML4E_OFFSET_MASK 0x0000FF8000000000
|
||||
#define PML4E_SHIFT 39
|
||||
|
||||
#define MAX_EMULATED_REGIONS 8
|
||||
/* Number of entries of emulated_regions[] currently in use. */
int registered_regions = 0;
|
||||
/*
 * One range of guest-physical addresses whose accesses are emulated by
 * callbacks rather than backed by memory.  Entries are filled in by
 * register_emulated_memory() and looked up by address in find_region().
 */
struct memory_region
|
||||
{
|
||||
uintptr_t start; /* first address of the range */
|
||||
uintptr_t end; /* stored as start + len by register_emulated_memory() */
|
||||
emulated_read_func_t memread; /* called to satisfy a guest read */
|
||||
emulated_write_func_t memwrite; /* called to carry out a guest write */
|
||||
void *arg; /* opaque pointer passed back to both callbacks */
|
||||
} emulated_regions[MAX_EMULATED_REGIONS];
|
||||
|
||||
/*
 * Result of decoding one guest instruction.  The pointer members point into
 * the instruction bytes themselves; a NULL pointer means the corresponding
 * field is absent.  decode_instruction() zeroes the whole structure before
 * the individual decode stages fill it in.
 */
struct decoded_instruction
|
||||
{
|
||||
void *instruction; /* first byte of the instruction (prefixes included) */
|
||||
uint8_t *opcode; /* opcode byte, just past any accepted REX prefix */
|
||||
uint8_t *modrm; /* ModR/M byte, set when the opcode has HAS_MODRM */
|
||||
uint8_t *sib; /* SIB byte, if the ModR/M byte calls for one */
|
||||
uint8_t *displacement; /* displacement bytes for the disp8/disp32 modes */
|
||||
uint8_t *immediate; /* never set by the decoder shown; non-NULL is rejected */
|
||||
|
||||
uint8_t opcode_flags; /* HAS_MODRM / FROM_* / TO_* bits from one_byte_opcodes[] */
|
||||
|
||||
uint8_t addressing_mode; /* ModR/M "mod" field, one of the MOD_* values */
|
||||
uint8_t rm; /* ModR/M "rm" field, bit 3 set when REX.B is present */
|
||||
uint8_t reg; /* ModR/M "reg" field, bit 3 set when REX.R is present */
|
||||
uint8_t rex_r; /* non-zero when the REX prefix has R set */
|
||||
uint8_t rex_w; /* non-zero when the REX prefix has W set */
|
||||
uint8_t rex_b; /* non-zero when the REX prefix has B set */
|
||||
uint8_t rex_x; /* non-zero when the REX prefix has X set */
|
||||
|
||||
int32_t disp; /* decoded displacement value, 0 when there is none */
|
||||
};
|
||||
|
||||
static enum vm_reg_name vm_reg_name_mappings[] = {
|
||||
[REG_EAX] = VM_REG_GUEST_RAX,
|
||||
[REG_EBX] = VM_REG_GUEST_RBX,
|
||||
[REG_ECX] = VM_REG_GUEST_RCX,
|
||||
[REG_EDX] = VM_REG_GUEST_RDX,
|
||||
[REG_ESP] = VM_REG_GUEST_RSP,
|
||||
[REG_EBP] = VM_REG_GUEST_RBP,
|
||||
[REG_ESI] = VM_REG_GUEST_RSI,
|
||||
[REG_EDI] = VM_REG_GUEST_RDI,
|
||||
[REG_R8] = VM_REG_GUEST_R8,
|
||||
[REG_R9] = VM_REG_GUEST_R9,
|
||||
[REG_R10] = VM_REG_GUEST_R10,
|
||||
[REG_R11] = VM_REG_GUEST_R11,
|
||||
[REG_R12] = VM_REG_GUEST_R12,
|
||||
[REG_R13] = VM_REG_GUEST_R13,
|
||||
[REG_R14] = VM_REG_GUEST_R14,
|
||||
[REG_R15] = VM_REG_GUEST_R15
|
||||
};
|
||||
|
||||
/*
 * Decode table indexed by a one-byte opcode.  A zero entry marks an opcode
 * the emulator does not support (decode_opcode() rejects it); a non-zero
 * entry is a combination of HAS_MODRM and the FROM_* / TO_* operand flags.
 */
uint8_t one_byte_opcodes[256] = {
|
||||
[0x89] = HAS_MODRM | FROM_REG | TO_RM, /* source is the reg field, destination is r/m */
|
||||
[0x8B] = HAS_MODRM | FROM_RM | TO_REG, /* source is r/m, destination is the reg field */
|
||||
};
|
||||
|
||||
static uintptr_t
|
||||
gla2gpa(uint64_t gla, uint64_t guest_cr3)
|
||||
{
|
||||
uint64_t *table;
|
||||
uint64_t mask, entry;
|
||||
int level, shift;
|
||||
uintptr_t page_frame;
|
||||
|
||||
table = paddr_guest2host(guest_cr3 & PAGE_TABLE_ENTRY_MASK);
|
||||
mask = PML4E_OFFSET_MASK;
|
||||
shift = PML4E_SHIFT;
|
||||
for (level = 0; level < 4; ++level)
|
||||
{
|
||||
entry = table[(gla & mask) >> shift];
|
||||
table = (uint64_t*)(entry & PAGE_TABLE_ENTRY_MASK);
|
||||
|
||||
/* This entry does not point to another page table */
|
||||
if (entry & PAGE_FRAME_MASK || level >= 3)
|
||||
break;
|
||||
|
||||
table = paddr_guest2host((uintptr_t)table);
|
||||
mask >>= 9;
|
||||
shift -= 9;
|
||||
}
|
||||
|
||||
mask = (1 << shift) - 1;
|
||||
page_frame = ((uintptr_t)table & ~mask);
|
||||
return (page_frame | (gla & mask));
|
||||
}
|
||||
|
||||
/*
 * Translate a guest linear address into a pointer usable by this process:
 * first to a guest physical address via the guest page tables, then to a
 * host address via paddr_guest2host().
 */
static void *
gla2hla(uint64_t gla, uint64_t guest_cr3)
{

	return (paddr_guest2host(gla2gpa(gla, guest_cr3)));
}
|
||||
|
||||
/*
|
||||
* Decodes all of the prefixes of the instruction. Only a subset of REX
|
||||
* prefixes are currently supported. If any unsupported prefix is
|
||||
* encountered, returns -1.
|
||||
*/
|
||||
static int
|
||||
decode_prefixes(struct decoded_instruction *decoded)
|
||||
{
|
||||
uint8_t *current_prefix;
|
||||
|
||||
current_prefix = decoded->instruction;
|
||||
|
||||
if (is_rex_prefix(*current_prefix)) {
|
||||
decoded->rex_w = *current_prefix & REX_W_MASK;
|
||||
decoded->rex_r = *current_prefix & REX_R_MASK;
|
||||
decoded->rex_x = *current_prefix & REX_X_MASK;
|
||||
decoded->rex_b = *current_prefix & REX_B_MASK;
|
||||
current_prefix++;
|
||||
} else if (is_prefix(*current_prefix)) {
|
||||
return (-1);
|
||||
}
|
||||
|
||||
decoded->opcode = current_prefix;
|
||||
return (0);
|
||||
}
|
||||
|
||||
/*
|
||||
* Decodes the instruction's opcode. If the opcode is not understood, returns
|
||||
* -1 indicating an error. Sets the instruction's mod_rm pointer to the
|
||||
* location of the ModR/M field.
|
||||
*/
|
||||
static int
|
||||
decode_opcode(struct decoded_instruction *decoded)
|
||||
{
|
||||
uint8_t opcode, flags;
|
||||
|
||||
opcode = *decoded->opcode;
|
||||
flags = one_byte_opcodes[opcode];
|
||||
|
||||
if (!flags)
|
||||
return (-1);
|
||||
|
||||
if (flags & HAS_MODRM) {
|
||||
decoded->modrm = decoded->opcode + 1;
|
||||
}
|
||||
|
||||
decoded->opcode_flags = flags;
|
||||
|
||||
return (0);
|
||||
}
|
||||
|
||||
/*
|
||||
* Decodes the instruction's ModR/M field. Sets the instruction's sib pointer
|
||||
* to the location of the SIB if one is expected to be present, or 0 if not.
|
||||
*/
|
||||
static int
|
||||
decode_mod_rm(struct decoded_instruction *decoded)
|
||||
{
|
||||
uint8_t modrm;
|
||||
uint8_t *extension_operands;
|
||||
|
||||
if (decoded->modrm) {
|
||||
modrm = *decoded->modrm;
|
||||
|
||||
decoded->addressing_mode = (modrm & MODRM_MOD_MASK) >> MODRM_MOD_SHIFT;
|
||||
decoded->rm = (modrm & MODRM_RM_MASK) >> MODRM_RM_SHIFT;
|
||||
decoded->reg = (modrm & MODRM_REG_MASK) >> MODRM_REG_SHIFT;
|
||||
|
||||
if (decoded->rex_b)
|
||||
decoded->rm |= (1<<3);
|
||||
|
||||
if (decoded->rex_r)
|
||||
decoded->reg |= (1<<3);
|
||||
|
||||
extension_operands = decoded->modrm + 1;
|
||||
|
||||
if (decoded->rm == RM_SIB) {
|
||||
decoded->sib = decoded->modrm + 1;
|
||||
extension_operands = decoded->sib + 1;
|
||||
}
|
||||
|
||||
switch (decoded->addressing_mode) {
|
||||
case MOD_INDIRECT:
|
||||
case MOD_DIRECT:
|
||||
decoded->displacement = 0;
|
||||
break;
|
||||
case MOD_INDIRECT_DISP8:
|
||||
decoded->displacement = extension_operands;
|
||||
break;
|
||||
case MOD_INDIRECT_DISP32:
|
||||
decoded->displacement = extension_operands;
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
return (0);
|
||||
}
|
||||
|
||||
/*
|
||||
* Decodes the instruction's SIB field. No such instructions are currently
|
||||
* supported, so do nothing and return -1 if there is a SIB field, 0 otherwise.
|
||||
*/
|
||||
static int
|
||||
decode_sib(struct decoded_instruction *decoded)
|
||||
{
|
||||
|
||||
if (decoded->sib)
|
||||
return (-1);
|
||||
|
||||
return (0);
|
||||
}
|
||||
|
||||
/*
|
||||
* Grabs and saves the instruction's immediate operand and displacement if
|
||||
* they are present. Immediates are not currently supported, so if an
|
||||
* immediate is present it will return -1 indicating an error.
|
||||
*/
|
||||
static int
|
||||
decode_extension_operands(struct decoded_instruction *decoded)
|
||||
{
|
||||
|
||||
if (decoded->displacement) {
|
||||
if (decoded->addressing_mode == MOD_INDIRECT_DISP8) {
|
||||
decoded->disp = (int32_t)*decoded->displacement;
|
||||
} else if (decoded->addressing_mode == MOD_INDIRECT_DISP32) {
|
||||
decoded->disp = *((int32_t*)decoded->displacement);
|
||||
}
|
||||
}
|
||||
|
||||
if (decoded->immediate) {
|
||||
return (-1);
|
||||
}
|
||||
|
||||
return (0);
|
||||
}
|
||||
|
||||
static int
|
||||
decode_instruction(void *instr, struct decoded_instruction *decoded)
|
||||
{
|
||||
int error;
|
||||
|
||||
bzero(decoded, sizeof(*decoded));
|
||||
decoded->instruction = instr;
|
||||
|
||||
error = decode_prefixes(decoded);
|
||||
if (error)
|
||||
return (error);
|
||||
|
||||
error = decode_opcode(decoded);
|
||||
if (error)
|
||||
return (error);
|
||||
|
||||
error = decode_mod_rm(decoded);
|
||||
if (error)
|
||||
return (error);
|
||||
|
||||
error = decode_sib(decoded);
|
||||
if (error)
|
||||
return (error);
|
||||
|
||||
error = decode_extension_operands(decoded);
|
||||
if (error)
|
||||
return (error);
|
||||
|
||||
return (0);
|
||||
}
|
||||
|
||||
static struct memory_region *
|
||||
find_region(uintptr_t addr)
|
||||
{
|
||||
int i;
|
||||
|
||||
for (i = 0; i < registered_regions; ++i) {
|
||||
if (emulated_regions[i].start <= addr &&
|
||||
emulated_regions[i].end >= addr) {
|
||||
return &emulated_regions[i];
|
||||
}
|
||||
}
|
||||
|
||||
return (0);
|
||||
}
|
||||
|
||||
static enum vm_reg_name
|
||||
get_vm_reg_name(uint8_t reg)
|
||||
{
|
||||
return vm_reg_name_mappings[reg];
|
||||
}
|
||||
|
||||
static int
|
||||
get_operand(struct vmctx *vm, int vcpu, uint64_t guest_cr3,
|
||||
const struct decoded_instruction *instruction, uint64_t *operand)
|
||||
{
|
||||
enum vm_reg_name regname;
|
||||
uint64_t reg;
|
||||
uintptr_t target;
|
||||
int error;
|
||||
uint8_t rm, addressing_mode;
|
||||
struct memory_region *emulated_memory;
|
||||
|
||||
if (instruction->opcode_flags & FROM_RM) {
|
||||
rm = instruction->rm;
|
||||
addressing_mode = instruction->addressing_mode;
|
||||
} else if (instruction->opcode_flags & FROM_REG) {
|
||||
rm = instruction->reg;
|
||||
addressing_mode = MOD_DIRECT;
|
||||
} else
|
||||
return (-1);
|
||||
|
||||
regname = get_vm_reg_name(rm);
|
||||
error = vm_get_register(vm, vcpu, regname, ®);
|
||||
if (error)
|
||||
return (error);
|
||||
|
||||
switch (addressing_mode) {
|
||||
case MOD_DIRECT:
|
||||
*operand = reg;
|
||||
return (0);
|
||||
case MOD_INDIRECT:
|
||||
target = gla2gpa(reg, guest_cr3);
|
||||
emulated_memory = find_region(target);
|
||||
if (emulated_memory) {
|
||||
return emulated_memory->memread(vm, vcpu, target,
|
||||
4, operand,
|
||||
emulated_memory->arg);
|
||||
}
|
||||
return (-1);
|
||||
case MOD_INDIRECT_DISP8:
|
||||
case MOD_INDIRECT_DISP32:
|
||||
target = gla2gpa(reg, guest_cr3);
|
||||
target += instruction->disp;
|
||||
emulated_memory = find_region(target);
|
||||
if (emulated_memory) {
|
||||
return emulated_memory->memread(vm, vcpu, target,
|
||||
4, operand,
|
||||
emulated_memory->arg);
|
||||
}
|
||||
return (-1);
|
||||
default:
|
||||
return (-1);
|
||||
}
|
||||
}
|
||||
|
||||
static int
|
||||
perform_write(struct vmctx *vm, int vcpu, uint64_t guest_cr3,
|
||||
const struct decoded_instruction *instruction, uint64_t operand)
|
||||
{
|
||||
enum vm_reg_name regname;
|
||||
uintptr_t target;
|
||||
int error;
|
||||
uint64_t reg;
|
||||
struct memory_region *emulated_memory;
|
||||
uint8_t addressing_mode;
|
||||
|
||||
if (instruction->opcode_flags & TO_RM) {
|
||||
reg = instruction->rm;
|
||||
addressing_mode = instruction->addressing_mode;
|
||||
} else if (instruction->opcode_flags & TO_REG) {
|
||||
reg = instruction->reg;
|
||||
addressing_mode = MOD_DIRECT;
|
||||
} else
|
||||
return (-1);
|
||||
|
||||
regname = get_vm_reg_name(reg);
|
||||
error = vm_get_register(vm, vcpu, regname, ®);
|
||||
if (error)
|
||||
return (error);
|
||||
|
||||
switch(addressing_mode) {
|
||||
case MOD_DIRECT:
|
||||
return vm_set_register(vm, vcpu, regname, operand);
|
||||
case MOD_INDIRECT:
|
||||
target = gla2gpa(reg, guest_cr3);
|
||||
emulated_memory = find_region(target);
|
||||
if (emulated_memory) {
|
||||
return emulated_memory->memwrite(vm, vcpu, target,
|
||||
4, operand,
|
||||
emulated_memory->arg);
|
||||
}
|
||||
return (-1);
|
||||
default:
|
||||
return (-1);
|
||||
}
|
||||
}
|
||||
|
||||
/*
 * Execute an already-decoded instruction: read its source operand, then
 * write that value to its destination.  A failure while fetching the source
 * is returned immediately; otherwise the result of the write is returned.
 */
static int
emulate_decoded_instruction(struct vmctx *vm, int vcpu, uint64_t cr3,
    const struct decoded_instruction *instruction)
{
	uint64_t value;
	int rc;

	rc = get_operand(vm, vcpu, cr3, instruction, &value);
	if (rc != 0)
		return (rc);

	return (perform_write(vm, vcpu, cr3, instruction, value));
}
|
||||
|
||||
int
|
||||
emulate_instruction(struct vmctx *vm, int vcpu, uint64_t rip, uint64_t cr3)
|
||||
{
|
||||
struct decoded_instruction instr;
|
||||
int error;
|
||||
void *instruction = gla2hla(rip, cr3);
|
||||
|
||||
if ((error = decode_instruction(instruction, &instr)) != 0)
|
||||
return (error);
|
||||
|
||||
return emulate_decoded_instruction(vm, vcpu, cr3, &instr);
|
||||
}
|
||||
|
||||
struct memory_region *
|
||||
register_emulated_memory(uintptr_t start, size_t len, emulated_read_func_t memread,
|
||||
emulated_write_func_t memwrite, void *arg)
|
||||
{
|
||||
if (registered_regions > MAX_EMULATED_REGIONS)
|
||||
return (NULL);
|
||||
|
||||
struct memory_region *region = &emulated_regions[registered_regions];
|
||||
region->start = start;
|
||||
region->end = start + len;
|
||||
region->memread = memread;
|
||||
region->memwrite = memwrite;
|
||||
region->arg = arg;
|
||||
|
||||
registered_regions++;
|
||||
return (region);
|
||||
}
|
||||
|
||||
void
|
||||
move_memory_region(struct memory_region *region, uintptr_t start)
|
||||
{
|
||||
size_t len;
|
||||
|
||||
len = region->end - region->start;
|
||||
region->start = start;
|
||||
region->end = start + len;
|
||||
}
|
||||
|
47
usr.sbin/bhyve/instruction_emul.h
Normal file
47
usr.sbin/bhyve/instruction_emul.h
Normal file
@ -0,0 +1,47 @@
|
||||
/*-
|
||||
* Copyright (c) 2012 Sandvine, Inc.
|
||||
* All rights reserved.
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions
|
||||
* are met:
|
||||
* 1. Redistributions of source code must retain the above copyright
|
||||
* notice, this list of conditions and the following disclaimer.
|
||||
* 2. Redistributions in binary form must reproduce the above copyright
|
||||
* notice, this list of conditions and the following disclaimer in the
|
||||
* documentation and/or other materials provided with the distribution.
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
|
||||
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
* ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
|
||||
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
|
||||
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
|
||||
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
|
||||
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
|
||||
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
|
||||
* SUCH DAMAGE.
|
||||
*
|
||||
* $FreeBSD$
|
||||
*/
|
||||
|
||||
#ifndef _INSTRUCTION_EMUL_H_
|
||||
#define _INSTRUCTION_EMUL_H_
|
||||
|
||||
struct memory_region;
|
||||
|
||||
typedef int (*emulated_read_func_t)(struct vmctx *vm, int vcpu, uintptr_t addr,
|
||||
int size, uint64_t *data, void *arg);
|
||||
typedef int (*emulated_write_func_t)(struct vmctx *vm, int vcpu, uintptr_t addr,
|
||||
int size, uint64_t data, void *arg);
|
||||
|
||||
int emulate_instruction(struct vmctx *vm, int vcpu, uint64_t rip,
|
||||
uint64_t cr3);
|
||||
struct memory_region *register_emulated_memory(uintptr_t start, size_t len,
|
||||
emulated_read_func_t memread,
|
||||
emulated_write_func_t memwrite,
|
||||
void *arg);
|
||||
void move_memory_region(struct memory_region *memory_region, uintptr_t start);
|
||||
|
||||
#endif
|
@ -45,6 +45,7 @@ __FBSDID("$FreeBSD$");
|
||||
#include "fbsdrun.h"
|
||||
#include "inout.h"
|
||||
#include "pci_emul.h"
|
||||
#include "instruction_emul.h"
|
||||
|
||||
#define CONF1_ADDR_PORT 0x0cf8
|
||||
#define CONF1_DATA_PORT 0x0cfc
|
||||
@ -571,6 +572,29 @@ pci_emul_add_msicap(struct pci_devinst *pi, int msgnum)
|
||||
return (pci_emul_add_capability(pi, (u_char *)&msicap, sizeof(msicap)));
|
||||
}
|
||||
|
||||
void
|
||||
msixcap_cfgwrite(struct pci_devinst *pi, int capoff, int offset,
|
||||
int bytes, uint32_t val)
|
||||
{
|
||||
uint16_t msgctrl, rwmask;
|
||||
int off, table_bar;
|
||||
|
||||
off = offset - capoff;
|
||||
table_bar = pi->pi_msix.table_bar;
|
||||
/* Message Control Register */
|
||||
if (off == 2 && bytes == 2) {
|
||||
rwmask = PCIM_MSIXCTRL_MSIX_ENABLE | PCIM_MSIXCTRL_FUNCTION_MASK;
|
||||
msgctrl = pci_get_cfgdata16(pi, offset);
|
||||
msgctrl &= ~rwmask;
|
||||
msgctrl |= val & rwmask;
|
||||
val = msgctrl;
|
||||
|
||||
pi->pi_msix.enabled = val & PCIM_MSIXCTRL_MSIX_ENABLE;
|
||||
}
|
||||
|
||||
CFGWRITE(pi, offset, val, bytes);
|
||||
}
|
||||
|
||||
void
|
||||
msicap_cfgwrite(struct pci_devinst *pi, int capoff, int offset,
|
||||
int bytes, uint32_t val)
|
||||
@ -847,6 +871,11 @@ pci_emul_cfgdata(struct vmctx *ctx, int vcpu, int in, int port, int bytes,
|
||||
assert(0);
|
||||
}
|
||||
pci_set_cfgdata32(pi, coff, bar);
|
||||
|
||||
if (pi->pi_bar[idx].handler) {
|
||||
pi->pi_bar[idx].handler(pi, idx, bar);
|
||||
}
|
||||
|
||||
} else if (pci_emul_iscap(pi, coff)) {
|
||||
pci_emul_capwrite(pi, coff, bytes, *eax);
|
||||
} else {
|
||||
|
@ -42,6 +42,7 @@
|
||||
|
||||
struct vmctx;
|
||||
struct pci_devinst;
|
||||
struct memory_region;
|
||||
|
||||
struct pci_devemu {
|
||||
char *pe_emu; /* Name of device emulation */
|
||||
@ -73,14 +74,30 @@ enum pcibar_type {
|
||||
PCIBAR_MEMHI64
|
||||
};
|
||||
|
||||
typedef int (*bar_write_func_t)(struct pci_devinst *pdi, int idx, uint64_t bar);
|
||||
|
||||
/*
 * Per-BAR state kept for an emulated PCI device.
 */
struct pcibar {
|
||||
enum pcibar_type type; /* io or memory */
|
||||
uint64_t size; /* size of the BAR */
|
||||
uint64_t addr; /* address currently assigned to the BAR */
|
||||
bar_write_func_t handler; /* optional hook; pci_emul_cfgdata calls it, when set, with the new value after a BAR write */
|
||||
};
|
||||
|
||||
#define PI_NAMESZ 40
|
||||
|
||||
/*
 * One entry of the emulated MSI-X table.  The three fields are the values
 * that are handed to vm_setup_msix() for the entry.
 */
struct msix_table_entry {
|
||||
uint64_t addr; /* message address */
|
||||
uint32_t msg_data; /* message data */
|
||||
uint32_t vector_control; /* vector control word; tested against PCIM_MSIX_VCTRL_MASK */
|
||||
} __packed;
|
||||
|
||||
/*
|
||||
* In case the structure is modified to hold extra information, use a define
|
||||
* for the size that should be emulated.
|
||||
*/
|
||||
#define MSIX_TABLE_ENTRY_SIZE 16
|
||||
#define MAX_MSIX_TABLE_SIZE 2048
|
||||
|
||||
struct pci_devinst {
|
||||
struct pci_devemu *pi_d;
|
||||
struct vmctx *pi_vmctx;
|
||||
@ -96,6 +113,19 @@ struct pci_devinst {
|
||||
int msgnum;
|
||||
} pi_msi;
|
||||
|
||||
struct {
|
||||
int enabled;
|
||||
int table_bar;
|
||||
int pba_bar;
|
||||
size_t table_offset;
|
||||
uintptr_t table_gpa;
|
||||
size_t table_size;
|
||||
int table_count;
|
||||
size_t pba_offset;
|
||||
struct memory_region *table_bar_region;
|
||||
struct msix_table_entry table[MAX_MSIX_TABLE_SIZE];
|
||||
} pi_msix;
|
||||
|
||||
void *pi_arg; /* devemu-private data */
|
||||
|
||||
u_char pi_cfgdata[PCI_REGMAX + 1];
|
||||
@ -111,6 +141,14 @@ struct msicap {
|
||||
uint16_t msgdata;
|
||||
} __packed;
|
||||
|
||||
/*
 * Image of the 12-byte MSI-X capability; the passthru code fills it with
 * three 32-bit config-space reads.
 */
struct msixcap {
|
||||
uint8_t capid; /* capability id */
|
||||
uint8_t nextptr; /* offset of the next capability */
|
||||
uint16_t msgctrl; /* message control; low 11 bits hold the table entry count minus one */
|
||||
uint32_t table_offset; /* low 3 bits select the BAR, remaining bits are the table offset */
|
||||
uint32_t pba_offset; /* low 3 bits select the BAR, remaining bits are the PBA offset */
|
||||
} __packed;
|
||||
|
||||
void init_pci(struct vmctx *ctx);
|
||||
void pci_parse_slot(char *opt);
|
||||
void pci_parse_name(char *opt);
|
||||
@ -120,6 +158,8 @@ int pci_emul_alloc_bar(struct pci_devinst *pdi, int idx, uint64_t hostbase,
|
||||
int pci_emul_add_msicap(struct pci_devinst *pi, int msgnum);
|
||||
void msicap_cfgwrite(struct pci_devinst *pi, int capoff, int offset,
|
||||
int bytes, uint32_t val);
|
||||
void msixcap_cfgwrite(struct pci_devinst *pi, int capoff, int offset,
|
||||
int bytes, uint32_t val);
|
||||
|
||||
void pci_generate_msi(struct pci_devinst *pi, int msgnum);
|
||||
int pci_msi_enabled(struct pci_devinst *pi);
|
||||
|
@ -47,6 +47,7 @@ __FBSDID("$FreeBSD$");
|
||||
#include <machine/vmm.h>
|
||||
#include <vmmapi.h>
|
||||
#include "pci_emul.h"
|
||||
#include "instruction_emul.h"
|
||||
|
||||
#ifndef _PATH_DEVPCI
|
||||
#define _PATH_DEVPCI "/dev/pci"
|
||||
@ -58,6 +59,11 @@ __FBSDID("$FreeBSD$");
|
||||
|
||||
#define LEGACY_SUPPORT 1
|
||||
|
||||
/* Low three bits of the table/PBA offset register select the BAR */
#define MSIX_TABLE_BIR_MASK 7
/*
 * Remaining bits are the offset within that BAR.  No trailing semicolon:
 * the macro has to be usable inside a larger expression.
 */
#define MSIX_TABLE_OFFSET_MASK (~MSIX_TABLE_BIR_MASK)
/* Number of table entries encoded in the message control register */
#define MSIX_TABLE_COUNT(x) (((x) & 0x7FF) + 1)
/* Length in bytes of the MSI-X capability */
#define MSIX_CAPLEN 12
|
||||
|
||||
static int pcifd = -1;
|
||||
static int iofd = -1;
|
||||
|
||||
@ -69,6 +75,9 @@ struct passthru_softc {
|
||||
int msgctrl;
|
||||
int emulated;
|
||||
} psc_msi;
|
||||
struct {
|
||||
int capoff;
|
||||
} psc_msix;
|
||||
struct pcisel psc_sel;
|
||||
};
|
||||
|
||||
@ -152,17 +161,19 @@ passthru_add_msicap(struct pci_devinst *pi, int msgnum, int nextptr)
|
||||
static int
|
||||
cfginitmsi(struct passthru_softc *sc)
|
||||
{
|
||||
int ptr, cap, sts, caplen;
|
||||
int ptr, capptr, cap, sts, caplen;
|
||||
uint32_t u32;
|
||||
struct pcisel sel;
|
||||
struct pci_devinst *pi;
|
||||
struct msixcap msixcap;
|
||||
uint32_t *msixcap_ptr;
|
||||
|
||||
pi = sc->psc_pi;
|
||||
sel = sc->psc_sel;
|
||||
|
||||
/*
|
||||
* Parse the capabilities and cache the location of the MSI
|
||||
* capability.
|
||||
* and MSI-X capabilities.
|
||||
*/
|
||||
sts = read_config(&sel, PCIR_STATUS, 2);
|
||||
if (sts & PCIM_STATUS_CAPPRESENT) {
|
||||
@ -179,18 +190,44 @@ cfginitmsi(struct passthru_softc *sc)
|
||||
ptr + 2, 2);
|
||||
sc->psc_msi.emulated = 0;
|
||||
caplen = msi_caplen(sc->psc_msi.msgctrl);
|
||||
capptr = ptr;
|
||||
while (caplen > 0) {
|
||||
u32 = read_config(&sel, ptr, 4);
|
||||
pci_set_cfgdata32(pi, ptr, u32);
|
||||
u32 = read_config(&sel, capptr, 4);
|
||||
pci_set_cfgdata32(pi, capptr, u32);
|
||||
caplen -= 4;
|
||||
ptr += 4;
|
||||
capptr += 4;
|
||||
}
|
||||
} else if (cap == PCIY_MSIX) {
|
||||
/*
|
||||
* Copy the MSI-X capability
|
||||
*/
|
||||
sc->psc_msix.capoff = ptr;
|
||||
caplen = 12;
|
||||
msixcap_ptr = (uint32_t*) &msixcap;
|
||||
capptr = ptr;
|
||||
while (caplen > 0) {
|
||||
u32 = read_config(&sel, capptr, 4);
|
||||
*msixcap_ptr = u32;
|
||||
pci_set_cfgdata32(pi, capptr, u32);
|
||||
caplen -= 4;
|
||||
capptr += 4;
|
||||
msixcap_ptr++;
|
||||
}
|
||||
break;
|
||||
}
|
||||
ptr = read_config(&sel, ptr + PCICAP_NEXTPTR, 1);
|
||||
}
|
||||
}
|
||||
|
||||
if (sc->psc_msix.capoff == 0)
|
||||
return (-1);
|
||||
|
||||
pi->pi_msix.pba_bar = msixcap.pba_offset & MSIX_TABLE_BIR_MASK;
|
||||
pi->pi_msix.pba_offset = msixcap.pba_offset & MSIX_TABLE_OFFSET_MASK;
|
||||
pi->pi_msix.table_bar = msixcap.table_offset & MSIX_TABLE_BIR_MASK;
|
||||
pi->pi_msix.table_offset = msixcap.table_offset & MSIX_TABLE_OFFSET_MASK;
|
||||
|
||||
pi->pi_msix.table_count = MSIX_TABLE_COUNT(msixcap.msgctrl);
|
||||
|
||||
#ifdef LEGACY_SUPPORT
|
||||
/*
|
||||
* If the passthrough device does not support MSI then craft a
|
||||
@ -208,12 +245,182 @@ cfginitmsi(struct passthru_softc *sc)
|
||||
}
|
||||
#endif
|
||||
|
||||
if (sc->psc_msi.capoff == 0) /* MSI or bust */
|
||||
/* Make sure one of the capabilities is present */
|
||||
if (sc->psc_msi.capoff == 0 && sc->psc_msix.capoff == 0)
|
||||
return (-1);
|
||||
else
|
||||
return (0);
|
||||
}
|
||||
|
||||
static int
|
||||
msix_table_read(struct vmctx *vm, int vcpu, uintptr_t addr,
|
||||
int size, uint64_t *data, void *arg)
|
||||
{
|
||||
struct passthru_softc *sc;
|
||||
struct pci_devinst *pi;
|
||||
int index;
|
||||
size_t offset, entry_offset;
|
||||
uint8_t *src8;
|
||||
uint16_t *src16;
|
||||
uint32_t *src32;
|
||||
uint64_t *src64;
|
||||
struct msix_table_entry *entry;
|
||||
|
||||
sc = arg;
|
||||
pi = sc->psc_pi;
|
||||
offset = addr - pi->pi_msix.table_gpa;
|
||||
entry_offset = addr % MSIX_TABLE_ENTRY_SIZE;
|
||||
index = offset / MSIX_TABLE_ENTRY_SIZE;
|
||||
entry = &pi->pi_msix.table[index];
|
||||
|
||||
switch(size) {
|
||||
case 1:
|
||||
src8 = (uint8_t*)((void*)entry + entry_offset);
|
||||
*data = *src8;
|
||||
break;
|
||||
case 2:
|
||||
src16 = (uint16_t*)((void*)entry + entry_offset);
|
||||
*data = *src16;
|
||||
break;
|
||||
case 4:
|
||||
src32 = (uint32_t*)((void*)entry + entry_offset);
|
||||
*data = *src32;
|
||||
break;
|
||||
case 8:
|
||||
src64 = (uint64_t*)((void*)entry + entry_offset);
|
||||
*data = *src64;
|
||||
break;
|
||||
default:
|
||||
return (-1);
|
||||
}
|
||||
|
||||
return (0);
|
||||
}
|
||||
|
||||
static int
|
||||
msix_table_write(struct vmctx *vm, int vcpu, uintptr_t addr,
|
||||
int size, uint64_t data, void *arg)
|
||||
{
|
||||
struct passthru_softc *sc;
|
||||
struct pci_devinst *pi;
|
||||
int error, index;
|
||||
size_t offset, entry_offset;
|
||||
uint32_t *dest;
|
||||
struct msix_table_entry *entry;
|
||||
uint32_t vector_control;
|
||||
|
||||
sc = arg;
|
||||
pi = sc->psc_pi;
|
||||
offset = addr - pi->pi_msix.table_gpa;
|
||||
entry_offset = addr % MSIX_TABLE_ENTRY_SIZE;
|
||||
index = offset / MSIX_TABLE_ENTRY_SIZE;
|
||||
entry = &pi->pi_msix.table[index];
|
||||
|
||||
/* Only 4 byte naturally-aligned writes are supported */
|
||||
if (size == 4 && entry_offset % 4 == 0) {
|
||||
vector_control = entry->vector_control;
|
||||
dest = (uint32_t*)((void*)entry + entry_offset);
|
||||
*dest = data;
|
||||
/* If MSI-X hasn't been enabled, do nothing */
|
||||
if (pi->pi_msix.enabled) {
|
||||
/* If the entry is masked, don't set it up */
|
||||
if ((entry->vector_control & PCIM_MSIX_VCTRL_MASK) == 0 ||
|
||||
(vector_control & PCIM_MSIX_VCTRL_MASK) == 0) {
|
||||
error = vm_setup_msix(vm, vcpu, sc->psc_sel.pc_bus,
|
||||
sc->psc_sel.pc_dev,
|
||||
sc->psc_sel.pc_func,
|
||||
index, entry->msg_data,
|
||||
entry->vector_control,
|
||||
entry->addr);
|
||||
if (error)
|
||||
return (-1);
|
||||
}
|
||||
}
|
||||
} else {
|
||||
printf("Unsupported unaligned or non-4-byte write to MSI-X table\n");
|
||||
return (-1);
|
||||
}
|
||||
return (0);
|
||||
}
|
||||
|
||||
static int
|
||||
msix_bar_handler(struct pci_devinst *pdi, int idx, uint64_t bar)
|
||||
{
|
||||
uintptr_t start;
|
||||
|
||||
start = (bar & PCIM_BAR_MEM_BASE) + pdi->pi_msix.table_offset;
|
||||
move_memory_region(pdi->pi_msix.table_bar_region, start);
|
||||
pdi->pi_msix.table_gpa = start;
|
||||
return (0);
|
||||
}
|
||||
|
||||
static int
|
||||
init_msix_table(struct vmctx *ctx, struct passthru_softc *sc, uint64_t base)
|
||||
{
|
||||
int idx;
|
||||
size_t table_size;
|
||||
vm_paddr_t start;
|
||||
size_t len;
|
||||
struct pci_devinst *pi = sc->psc_pi;
|
||||
|
||||
/*
|
||||
* If the MSI-X table BAR maps memory intended for
|
||||
* other uses, it is at least assured that the table
|
||||
* either resides in its own page within the region,
|
||||
* or it resides in a page shared with only the PBA.
|
||||
*/
|
||||
if (pi->pi_msix.pba_bar == pi->pi_msix.table_bar &&
|
||||
((pi->pi_msix.pba_offset - pi->pi_msix.table_offset) < 4096)) {
|
||||
/* Need to also emulate the PBA, not supported yet */
|
||||
printf("Unsupported MSI-X table and PBA in same page\n");
|
||||
return (-1);
|
||||
}
|
||||
/*
|
||||
* May need to split the BAR into 3 regions:
|
||||
* Before the MSI-X table, the MSI-X table, and after it
|
||||
* XXX for now, assume that the table is not in the middle
|
||||
*/
|
||||
table_size = pi->pi_msix.table_count * MSIX_TABLE_ENTRY_SIZE;
|
||||
pi->pi_msix.table_size = table_size;
|
||||
idx = pi->pi_msix.table_bar;
|
||||
|
||||
/* Round up to page size */
|
||||
table_size = (table_size + 0x1000) & ~0xFFF;
|
||||
if (pi->pi_msix.table_offset == 0) {
|
||||
/* Map everything after the MSI-X table */
|
||||
start = pi->pi_bar[idx].addr + table_size;
|
||||
len = pi->pi_bar[idx].size - table_size;
|
||||
} else {
|
||||
/* Map everything before the MSI-X table */
|
||||
start = pi->pi_bar[idx].addr;
|
||||
len = pi->pi_msix.table_offset;
|
||||
}
|
||||
return vm_map_pptdev_mmio(ctx, sc->psc_sel.pc_bus,
|
||||
sc->psc_sel.pc_dev, sc->psc_sel.pc_func,
|
||||
start, len, base + table_size);
|
||||
}
|
||||
|
||||
static int
|
||||
cfginitmsix(struct passthru_softc *sc)
|
||||
{
|
||||
int table_bar;
|
||||
struct pci_devinst *pi;
|
||||
|
||||
pi = sc->psc_pi;
|
||||
table_bar = pi->pi_msix.table_bar;
|
||||
pi->pi_msix.table_gpa = sc->psc_bar[table_bar].addr + pi->pi_msix.table_offset;
|
||||
pi->pi_msix.table_bar_region = register_emulated_memory(pi->pi_msix.table_gpa,
|
||||
pi->pi_msix.table_size,
|
||||
msix_table_read,
|
||||
msix_table_write, sc);
|
||||
if (!pi->pi_msix.table_bar_region)
|
||||
return (-1);
|
||||
|
||||
pi->pi_bar[table_bar].handler = msix_bar_handler;
|
||||
|
||||
return (0);
|
||||
}
|
||||
|
||||
static int
|
||||
cfginitbar(struct vmctx *ctx, struct passthru_softc *sc)
|
||||
{
|
||||
@ -262,10 +469,13 @@ cfginitbar(struct vmctx *ctx, struct passthru_softc *sc)
|
||||
if (error)
|
||||
return (-1);
|
||||
|
||||
/*
|
||||
* Map the physical MMIO space in the guest MMIO space
|
||||
*/
|
||||
if (bartype != PCIBAR_IO) {
|
||||
/* The MSI-X table needs special handling */
|
||||
if (i == pi->pi_msix.table_bar) {
|
||||
error = init_msix_table(ctx, sc, base);
|
||||
if (error)
|
||||
return (-1);
|
||||
} else if (bartype != PCIBAR_IO) {
|
||||
/* Map the physical MMIO space in the guest MMIO space */
|
||||
error = vm_map_pptdev_mmio(ctx, sc->psc_sel.pc_bus,
|
||||
sc->psc_sel.pc_dev, sc->psc_sel.pc_func,
|
||||
pi->pi_bar[i].addr, pi->pi_bar[i].size, base);
|
||||
@ -299,10 +509,13 @@ cfginit(struct vmctx *ctx, struct pci_devinst *pi, int bus, int slot, int func)
|
||||
sc->psc_sel.pc_dev = slot;
|
||||
sc->psc_sel.pc_func = func;
|
||||
|
||||
if (cfginitmsi(sc) != 0)
|
||||
goto done;
|
||||
|
||||
if (cfginitbar(ctx, sc) != 0)
|
||||
goto done;
|
||||
|
||||
if (cfginitmsi(sc) != 0)
|
||||
if (cfginitmsix(sc) != 0)
|
||||
goto done;
|
||||
|
||||
error = 0; /* success */
|
||||
@ -381,6 +594,16 @@ msicap_access(struct passthru_softc *sc, int coff)
|
||||
return (0);
|
||||
}
|
||||
|
||||
static int
|
||||
msixcap_access(struct passthru_softc *sc, int coff)
|
||||
{
|
||||
if (sc->psc_msix.capoff == 0)
|
||||
return (0);
|
||||
|
||||
return (coff >= sc->psc_msix.capoff &&
|
||||
coff < sc->psc_msix.capoff + MSIX_CAPLEN);
|
||||
}
|
||||
|
||||
static int
|
||||
passthru_cfgread(struct vmctx *ctx, int vcpu, struct pci_devinst *pi, int coff,
|
||||
int bytes, uint32_t *rv)
|
||||
@ -416,7 +639,7 @@ static int
|
||||
passthru_cfgwrite(struct vmctx *ctx, int vcpu, struct pci_devinst *pi, int coff,
|
||||
int bytes, uint32_t val)
|
||||
{
|
||||
int error;
|
||||
int error, msix_table_entries, i;
|
||||
struct passthru_softc *sc;
|
||||
|
||||
sc = pi->pi_arg;
|
||||
@ -443,6 +666,27 @@ passthru_cfgwrite(struct vmctx *ctx, int vcpu, struct pci_devinst *pi, int coff,
|
||||
return (0);
|
||||
}
|
||||
|
||||
if (msixcap_access(sc, coff)) {
|
||||
msixcap_cfgwrite(pi, sc->psc_msix.capoff, coff, bytes, val);
|
||||
if (pi->pi_msix.enabled) {
|
||||
msix_table_entries = pi->pi_msix.table_count;
|
||||
for (i = 0; i < msix_table_entries; i++) {
|
||||
error = vm_setup_msix(ctx, vcpu, sc->psc_sel.pc_bus,
|
||||
sc->psc_sel.pc_dev,
|
||||
sc->psc_sel.pc_func, i,
|
||||
pi->pi_msix.table[i].msg_data,
|
||||
pi->pi_msix.table[i].vector_control,
|
||||
pi->pi_msix.table[i].addr);
|
||||
|
||||
if (error) {
|
||||
printf("vm_setup_msix returned error %d\r\n", errno);
|
||||
exit(1);
|
||||
}
|
||||
}
|
||||
}
|
||||
return (0);
|
||||
}
|
||||
|
||||
#ifdef LEGACY_SUPPORT
|
||||
/*
|
||||
* If this device does not support MSI natively then we cannot let
|
||||
|
Loading…
x
Reference in New Issue
Block a user