freebsd-skq/sys/amd64/vmm/intel/vtd.c
grehan 5d455a50f5 MFC r267921, r267934, r267949, r267959, r267966, r268202, r268276,
r268427, r268428, r268521, r268638,	r268639, r268701, r268777,
    r268889, r268922, r269008, r269042,	r269043, r269080, r269094,
    r269108, r269109, r269281, r269317,	r269700, r269896, r269962,
    r269989.

Catch bhyve up to CURRENT.

Lightly tested with FreeBSD i386/amd64,	Linux i386/amd64, and
OpenBSD/amd64. Still resolving an	issue with OpenBSD/i386.

Many thanks to jhb@ for	all the	hard work on the prior MFCs !

r267921 - support the "mov r/m8, imm8" instruction
r267934 - document options
r267949 - set DMI vers/date to fixed values
r267959 - doc: sort cmd flags
r267966 - EPT misconf post-mortem info
r268202 - use correct flag for event index
r268276 - 64-bit virtio capability api
r268427 - invalidate guest TLB when cr3 is updated, needed for TSS
r268428 - identify vcpu's operating mode
r268521 - use correct offset in guest logical-to-linear translation
r268638 - chs value
r268639 - chs fake values
r268701 - instr emul operand/address size override prefix support
r268777 - emulation for legacy x86 task switching
r268889 - nested exception support
r268922 - fix INVARIANTS build
r269008 - emulate instructions found in the OpenBSD/i386 5.5 kernel
r269042 - fix fault injection
r269043 - Reduce VMEXIT_RESTARTs in task_switch.c
r269080 - fix issues in PUSH emulation
r269094 - simplify return values from the inout handlers
r269108 - don't return -1 from the push emulation handler
r269109 - avoid permanent sleep in vm_handle_hlt()
r269281 - list VT-x features in base kernel dmesg
r269317 - Mark AHCI fatal errors as not completed
r269700 - Support PCI extended config space in bhyve
r269896 - Minor cleanup
r269962 - use max guest memory when creating IOMMU domain
r269989 - fix interrupt mode names
2014-08-19 01:20:24 +00:00

693 lines
16 KiB
C

/*-
* Copyright (c) 2011 NetApp, Inc.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* $FreeBSD$
*/
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
#include <sys/param.h>
#include <sys/kernel.h>
#include <sys/systm.h>
#include <sys/malloc.h>
#include <vm/vm.h>
#include <vm/pmap.h>
#include <dev/pci/pcireg.h>
#include <machine/vmparam.h>
#include <contrib/dev/acpica/include/acpi.h>
#include "io/iommu.h"
/*
* Documented in the "Intel Virtualization Technology for Directed I/O",
* Architecture Spec, September 2008.
*/
/* Section 10.4 "Register Descriptions" */
struct vtdmap {
volatile uint32_t version;
volatile uint32_t res0;
volatile uint64_t cap;
volatile uint64_t ext_cap;
volatile uint32_t gcr;
volatile uint32_t gsr;
volatile uint64_t rta;
volatile uint64_t ccr;
};
#define VTD_CAP_SAGAW(cap) (((cap) >> 8) & 0x1F)
#define VTD_CAP_ND(cap) ((cap) & 0x7)
#define VTD_CAP_CM(cap) (((cap) >> 7) & 0x1)
#define VTD_CAP_SPS(cap) (((cap) >> 34) & 0xF)
#define VTD_CAP_RWBF(cap) (((cap) >> 4) & 0x1)
#define VTD_ECAP_DI(ecap) (((ecap) >> 2) & 0x1)
#define VTD_ECAP_COHERENCY(ecap) ((ecap) & 0x1)
#define VTD_ECAP_IRO(ecap) (((ecap) >> 8) & 0x3FF)
#define VTD_GCR_WBF (1 << 27)
#define VTD_GCR_SRTP (1 << 30)
#define VTD_GCR_TE (1U << 31)
#define VTD_GSR_WBFS (1 << 27)
#define VTD_GSR_RTPS (1 << 30)
#define VTD_GSR_TES (1U << 31)
#define VTD_CCR_ICC (1UL << 63) /* invalidate context cache */
#define VTD_CCR_CIRG_GLOBAL (1UL << 61) /* global invalidation */
#define VTD_IIR_IVT (1UL << 63) /* invalidation IOTLB */
#define VTD_IIR_IIRG_GLOBAL (1ULL << 60) /* global IOTLB invalidation */
#define VTD_IIR_IIRG_DOMAIN (2ULL << 60) /* domain IOTLB invalidation */
#define VTD_IIR_IIRG_PAGE (3ULL << 60) /* page IOTLB invalidation */
#define VTD_IIR_DRAIN_READS (1ULL << 49) /* drain pending DMA reads */
#define VTD_IIR_DRAIN_WRITES (1ULL << 48) /* drain pending DMA writes */
#define VTD_IIR_DOMAIN_P 32
#define VTD_ROOT_PRESENT 0x1
#define VTD_CTX_PRESENT 0x1
#define VTD_CTX_TT_ALL (1UL << 2)
#define VTD_PTE_RD (1UL << 0)
#define VTD_PTE_WR (1UL << 1)
#define VTD_PTE_SUPERPAGE (1UL << 7)
#define VTD_PTE_ADDR_M (0x000FFFFFFFFFF000UL)
struct domain {
uint64_t *ptp; /* first level page table page */
int pt_levels; /* number of page table levels */
int addrwidth; /* 'AW' field in context entry */
int spsmask; /* supported super page sizes */
u_int id; /* domain id */
vm_paddr_t maxaddr; /* highest address to be mapped */
SLIST_ENTRY(domain) next;
};
static SLIST_HEAD(, domain) domhead;
#define DRHD_MAX_UNITS 8
static int drhd_num;
static struct vtdmap *vtdmaps[DRHD_MAX_UNITS];
static int max_domains;
typedef int (*drhd_ident_func_t)(void);
static uint64_t root_table[PAGE_SIZE / sizeof(uint64_t)] __aligned(4096);
static uint64_t ctx_tables[256][PAGE_SIZE / sizeof(uint64_t)] __aligned(4096);
static MALLOC_DEFINE(M_VTD, "vtd", "vtd");
static int
vtd_max_domains(struct vtdmap *vtdmap)
{
int nd;
nd = VTD_CAP_ND(vtdmap->cap);
switch (nd) {
case 0:
return (16);
case 1:
return (64);
case 2:
return (256);
case 3:
return (1024);
case 4:
return (4 * 1024);
case 5:
return (16 * 1024);
case 6:
return (64 * 1024);
default:
panic("vtd_max_domains: invalid value of nd (0x%0x)", nd);
}
}
static u_int
domain_id(void)
{
u_int id;
struct domain *dom;
/* Skip domain id 0 - it is reserved when Caching Mode field is set */
for (id = 1; id < max_domains; id++) {
SLIST_FOREACH(dom, &domhead, next) {
if (dom->id == id)
break;
}
if (dom == NULL)
break; /* found it */
}
if (id >= max_domains)
panic("domain ids exhausted");
return (id);
}
static void
vtd_wbflush(struct vtdmap *vtdmap)
{
if (VTD_ECAP_COHERENCY(vtdmap->ext_cap) == 0)
pmap_invalidate_cache();
if (VTD_CAP_RWBF(vtdmap->cap)) {
vtdmap->gcr = VTD_GCR_WBF;
while ((vtdmap->gsr & VTD_GSR_WBFS) != 0)
;
}
}
static void
vtd_ctx_global_invalidate(struct vtdmap *vtdmap)
{
vtdmap->ccr = VTD_CCR_ICC | VTD_CCR_CIRG_GLOBAL;
while ((vtdmap->ccr & VTD_CCR_ICC) != 0)
;
}
static void
vtd_iotlb_global_invalidate(struct vtdmap *vtdmap)
{
int offset;
volatile uint64_t *iotlb_reg, val;
vtd_wbflush(vtdmap);
offset = VTD_ECAP_IRO(vtdmap->ext_cap) * 16;
iotlb_reg = (volatile uint64_t *)((caddr_t)vtdmap + offset + 8);
*iotlb_reg = VTD_IIR_IVT | VTD_IIR_IIRG_GLOBAL |
VTD_IIR_DRAIN_READS | VTD_IIR_DRAIN_WRITES;
while (1) {
val = *iotlb_reg;
if ((val & VTD_IIR_IVT) == 0)
break;
}
}
static void
vtd_translation_enable(struct vtdmap *vtdmap)
{
vtdmap->gcr = VTD_GCR_TE;
while ((vtdmap->gsr & VTD_GSR_TES) == 0)
;
}
static void
vtd_translation_disable(struct vtdmap *vtdmap)
{
vtdmap->gcr = 0;
while ((vtdmap->gsr & VTD_GSR_TES) != 0)
;
}
static int
vtd_init(void)
{
int i, units, remaining;
struct vtdmap *vtdmap;
vm_paddr_t ctx_paddr;
char *end, envname[32];
unsigned long mapaddr;
ACPI_STATUS status;
ACPI_TABLE_DMAR *dmar;
ACPI_DMAR_HEADER *hdr;
ACPI_DMAR_HARDWARE_UNIT *drhd;
/*
* Allow the user to override the ACPI DMAR table by specifying the
* physical address of each remapping unit.
*
* The following example specifies two remapping units at
* physical addresses 0xfed90000 and 0xfeda0000 respectively.
* set vtd.regmap.0.addr=0xfed90000
* set vtd.regmap.1.addr=0xfeda0000
*/
for (units = 0; units < DRHD_MAX_UNITS; units++) {
snprintf(envname, sizeof(envname), "vtd.regmap.%d.addr", units);
if (getenv_ulong(envname, &mapaddr) == 0)
break;
vtdmaps[units] = (struct vtdmap *)PHYS_TO_DMAP(mapaddr);
}
if (units > 0)
goto skip_dmar;
/* Search for DMAR table. */
status = AcpiGetTable(ACPI_SIG_DMAR, 0, (ACPI_TABLE_HEADER **)&dmar);
if (ACPI_FAILURE(status))
return (ENXIO);
end = (char *)dmar + dmar->Header.Length;
remaining = dmar->Header.Length - sizeof(ACPI_TABLE_DMAR);
while (remaining > sizeof(ACPI_DMAR_HEADER)) {
hdr = (ACPI_DMAR_HEADER *)(end - remaining);
if (hdr->Length > remaining)
break;
/*
* From Intel VT-d arch spec, version 1.3:
* BIOS implementations must report mapping structures
* in numerical order, i.e. All remapping structures of
* type 0 (DRHD) enumerated before remapping structures of
* type 1 (RMRR) and so forth.
*/
if (hdr->Type != ACPI_DMAR_TYPE_HARDWARE_UNIT)
break;
drhd = (ACPI_DMAR_HARDWARE_UNIT *)hdr;
vtdmaps[units++] = (struct vtdmap *)PHYS_TO_DMAP(drhd->Address);
if (units >= DRHD_MAX_UNITS)
break;
remaining -= hdr->Length;
}
if (units <= 0)
return (ENXIO);
skip_dmar:
drhd_num = units;
vtdmap = vtdmaps[0];
if (VTD_CAP_CM(vtdmap->cap) != 0)
panic("vtd_init: invalid caching mode");
max_domains = vtd_max_domains(vtdmap);
/*
* Set up the root-table to point to the context-entry tables
*/
for (i = 0; i < 256; i++) {
ctx_paddr = vtophys(ctx_tables[i]);
if (ctx_paddr & PAGE_MASK)
panic("ctx table (0x%0lx) not page aligned", ctx_paddr);
root_table[i * 2] = ctx_paddr | VTD_ROOT_PRESENT;
}
return (0);
}
static void
vtd_cleanup(void)
{
}
static void
vtd_enable(void)
{
int i;
struct vtdmap *vtdmap;
for (i = 0; i < drhd_num; i++) {
vtdmap = vtdmaps[i];
vtd_wbflush(vtdmap);
/* Update the root table address */
vtdmap->rta = vtophys(root_table);
vtdmap->gcr = VTD_GCR_SRTP;
while ((vtdmap->gsr & VTD_GSR_RTPS) == 0)
;
vtd_ctx_global_invalidate(vtdmap);
vtd_iotlb_global_invalidate(vtdmap);
vtd_translation_enable(vtdmap);
}
}
static void
vtd_disable(void)
{
int i;
struct vtdmap *vtdmap;
for (i = 0; i < drhd_num; i++) {
vtdmap = vtdmaps[i];
vtd_translation_disable(vtdmap);
}
}
static void
vtd_add_device(void *arg, int bus, int slot, int func)
{
int idx;
uint64_t *ctxp;
struct domain *dom = arg;
vm_paddr_t pt_paddr;
struct vtdmap *vtdmap;
if (bus < 0 || bus > PCI_BUSMAX ||
slot < 0 || slot > PCI_SLOTMAX ||
func < 0 || func > PCI_FUNCMAX)
panic("vtd_add_device: invalid bsf %d/%d/%d", bus, slot, func);
vtdmap = vtdmaps[0];
ctxp = ctx_tables[bus];
pt_paddr = vtophys(dom->ptp);
idx = (slot << 3 | func) * 2;
if (ctxp[idx] & VTD_CTX_PRESENT) {
panic("vtd_add_device: device %d/%d/%d is already owned by "
"domain %d", bus, slot, func,
(uint16_t)(ctxp[idx + 1] >> 8));
}
/*
* Order is important. The 'present' bit is set only after all fields
* of the context pointer are initialized.
*/
ctxp[idx + 1] = dom->addrwidth | (dom->id << 8);
if (VTD_ECAP_DI(vtdmap->ext_cap))
ctxp[idx] = VTD_CTX_TT_ALL;
else
ctxp[idx] = 0;
ctxp[idx] |= pt_paddr | VTD_CTX_PRESENT;
/*
* 'Not Present' entries are not cached in either the Context Cache
* or in the IOTLB, so there is no need to invalidate either of them.
*/
}
static void
vtd_remove_device(void *arg, int bus, int slot, int func)
{
int i, idx;
uint64_t *ctxp;
struct vtdmap *vtdmap;
if (bus < 0 || bus > PCI_BUSMAX ||
slot < 0 || slot > PCI_SLOTMAX ||
func < 0 || func > PCI_FUNCMAX)
panic("vtd_add_device: invalid bsf %d/%d/%d", bus, slot, func);
ctxp = ctx_tables[bus];
idx = (slot << 3 | func) * 2;
/*
* Order is important. The 'present' bit is must be cleared first.
*/
ctxp[idx] = 0;
ctxp[idx + 1] = 0;
/*
* Invalidate the Context Cache and the IOTLB.
*
* XXX use device-selective invalidation for Context Cache
* XXX use domain-selective invalidation for IOTLB
*/
for (i = 0; i < drhd_num; i++) {
vtdmap = vtdmaps[i];
vtd_ctx_global_invalidate(vtdmap);
vtd_iotlb_global_invalidate(vtdmap);
}
}
#define CREATE_MAPPING 0
#define REMOVE_MAPPING 1
static uint64_t
vtd_update_mapping(void *arg, vm_paddr_t gpa, vm_paddr_t hpa, uint64_t len,
int remove)
{
struct domain *dom;
int i, spshift, ptpshift, ptpindex, nlevels;
uint64_t spsize, *ptp;
dom = arg;
ptpindex = 0;
ptpshift = 0;
KASSERT(gpa + len > gpa, ("%s: invalid gpa range %#lx/%#lx", __func__,
gpa, len));
KASSERT(gpa + len <= dom->maxaddr, ("%s: gpa range %#lx/%#lx beyond "
"domain maxaddr %#lx", __func__, gpa, len, dom->maxaddr));
if (gpa & PAGE_MASK)
panic("vtd_create_mapping: unaligned gpa 0x%0lx", gpa);
if (hpa & PAGE_MASK)
panic("vtd_create_mapping: unaligned hpa 0x%0lx", hpa);
if (len & PAGE_MASK)
panic("vtd_create_mapping: unaligned len 0x%0lx", len);
/*
* Compute the size of the mapping that we can accomodate.
*
* This is based on three factors:
* - supported super page size
* - alignment of the region starting at 'gpa' and 'hpa'
* - length of the region 'len'
*/
spshift = 48;
for (i = 3; i >= 0; i--) {
spsize = 1UL << spshift;
if ((dom->spsmask & (1 << i)) != 0 &&
(gpa & (spsize - 1)) == 0 &&
(hpa & (spsize - 1)) == 0 &&
(len >= spsize)) {
break;
}
spshift -= 9;
}
ptp = dom->ptp;
nlevels = dom->pt_levels;
while (--nlevels >= 0) {
ptpshift = 12 + nlevels * 9;
ptpindex = (gpa >> ptpshift) & 0x1FF;
/* We have reached the leaf mapping */
if (spshift >= ptpshift) {
break;
}
/*
* We are working on a non-leaf page table page.
*
* Create a downstream page table page if necessary and point
* to it from the current page table.
*/
if (ptp[ptpindex] == 0) {
void *nlp = malloc(PAGE_SIZE, M_VTD, M_WAITOK | M_ZERO);
ptp[ptpindex] = vtophys(nlp)| VTD_PTE_RD | VTD_PTE_WR;
}
ptp = (uint64_t *)PHYS_TO_DMAP(ptp[ptpindex] & VTD_PTE_ADDR_M);
}
if ((gpa & ((1UL << ptpshift) - 1)) != 0)
panic("gpa 0x%lx and ptpshift %d mismatch", gpa, ptpshift);
/*
* Update the 'gpa' -> 'hpa' mapping
*/
if (remove) {
ptp[ptpindex] = 0;
} else {
ptp[ptpindex] = hpa | VTD_PTE_RD | VTD_PTE_WR;
if (nlevels > 0)
ptp[ptpindex] |= VTD_PTE_SUPERPAGE;
}
return (1UL << ptpshift);
}
static uint64_t
vtd_create_mapping(void *arg, vm_paddr_t gpa, vm_paddr_t hpa, uint64_t len)
{
return (vtd_update_mapping(arg, gpa, hpa, len, CREATE_MAPPING));
}
static uint64_t
vtd_remove_mapping(void *arg, vm_paddr_t gpa, uint64_t len)
{
return (vtd_update_mapping(arg, gpa, 0, len, REMOVE_MAPPING));
}
static void
vtd_invalidate_tlb(void *dom)
{
int i;
struct vtdmap *vtdmap;
/*
* Invalidate the IOTLB.
* XXX use domain-selective invalidation for IOTLB
*/
for (i = 0; i < drhd_num; i++) {
vtdmap = vtdmaps[i];
vtd_iotlb_global_invalidate(vtdmap);
}
}
static void *
vtd_create_domain(vm_paddr_t maxaddr)
{
struct domain *dom;
vm_paddr_t addr;
int tmp, i, gaw, agaw, sagaw, res, pt_levels, addrwidth;
struct vtdmap *vtdmap;
if (drhd_num <= 0)
panic("vtd_create_domain: no dma remapping hardware available");
vtdmap = vtdmaps[0];
/*
* Calculate AGAW.
* Section 3.4.2 "Adjusted Guest Address Width", Architecture Spec.
*/
addr = 0;
for (gaw = 0; addr < maxaddr; gaw++)
addr = 1ULL << gaw;
res = (gaw - 12) % 9;
if (res == 0)
agaw = gaw;
else
agaw = gaw + 9 - res;
if (agaw > 64)
agaw = 64;
/*
* Select the smallest Supported AGAW and the corresponding number
* of page table levels.
*/
pt_levels = 2;
sagaw = 30;
addrwidth = 0;
tmp = VTD_CAP_SAGAW(vtdmap->cap);
for (i = 0; i < 5; i++) {
if ((tmp & (1 << i)) != 0 && sagaw >= agaw)
break;
pt_levels++;
addrwidth++;
sagaw += 9;
if (sagaw > 64)
sagaw = 64;
}
if (i >= 5) {
panic("vtd_create_domain: SAGAW 0x%lx does not support AGAW %d",
VTD_CAP_SAGAW(vtdmap->cap), agaw);
}
dom = malloc(sizeof(struct domain), M_VTD, M_ZERO | M_WAITOK);
dom->pt_levels = pt_levels;
dom->addrwidth = addrwidth;
dom->id = domain_id();
dom->maxaddr = maxaddr;
dom->ptp = malloc(PAGE_SIZE, M_VTD, M_ZERO | M_WAITOK);
if ((uintptr_t)dom->ptp & PAGE_MASK)
panic("vtd_create_domain: ptp (%p) not page aligned", dom->ptp);
#ifdef notyet
/*
* XXX superpage mappings for the iommu do not work correctly.
*
* By default all physical memory is mapped into the host_domain.
* When a VM is allocated wired memory the pages belonging to it
* are removed from the host_domain and added to the vm's domain.
*
* If the page being removed was mapped using a superpage mapping
* in the host_domain then we need to demote the mapping before
* removing the page.
*
* There is not any code to deal with the demotion at the moment
* so we disable superpage mappings altogether.
*/
dom->spsmask = VTD_CAP_SPS(vtdmap->cap);
#endif
SLIST_INSERT_HEAD(&domhead, dom, next);
return (dom);
}
static void
vtd_free_ptp(uint64_t *ptp, int level)
{
int i;
uint64_t *nlp;
if (level > 1) {
for (i = 0; i < 512; i++) {
if ((ptp[i] & (VTD_PTE_RD | VTD_PTE_WR)) == 0)
continue;
if ((ptp[i] & VTD_PTE_SUPERPAGE) != 0)
continue;
nlp = (uint64_t *)PHYS_TO_DMAP(ptp[i] & VTD_PTE_ADDR_M);
vtd_free_ptp(nlp, level - 1);
}
}
bzero(ptp, PAGE_SIZE);
free(ptp, M_VTD);
}
static void
vtd_destroy_domain(void *arg)
{
struct domain *dom;
dom = arg;
SLIST_REMOVE(&domhead, dom, domain, next);
vtd_free_ptp(dom->ptp, dom->pt_levels);
free(dom, M_VTD);
}
struct iommu_ops iommu_ops_intel = {
vtd_init,
vtd_cleanup,
vtd_enable,
vtd_disable,
vtd_create_domain,
vtd_destroy_domain,
vtd_create_mapping,
vtd_remove_mapping,
vtd_add_device,
vtd_remove_device,
vtd_invalidate_tlb,
};