Support PCI extended config space in bhyve.

Add the ACPI MCFG table to advertise the extended config memory window.

Introduce a new flag MEM_F_IMMUTABLE for memory ranges that cannot be deleted
or moved in the guest's address space. The PCI extended config space is an
example of an immutable memory range.

Add emulation for the "movzw" instruction. This instruction is used by FreeBSD
to read a 16-bit extended config space register.

CR:		https://phabric.freebsd.org/D505
Reviewed by:	jhb, grehan
Requested by:	tychon
This commit is contained in:
Neel Natu 2014-08-08 03:49:01 +00:00
parent 8f5a8818f5
commit 12a6eb99a1
Notes: svn2git 2020-12-20 02:59:44 +00:00
svn path=/head/; revision=269700
6 changed files with 232 additions and 82 deletions

View File

@ -82,6 +82,10 @@ static const struct vie_op two_byte_opcodes[256] = {
.op_byte = 0xB6,
.op_type = VIE_OP_TYPE_MOVZX,
},
[0xB7] = {
.op_byte = 0xB7,
.op_type = VIE_OP_TYPE_MOVZX,
},
[0xBE] = {
.op_byte = 0xBE,
.op_type = VIE_OP_TYPE_MOVSX,
@ -503,6 +507,25 @@ emulate_movx(void *vm, int vcpuid, uint64_t gpa, struct vie *vie,
val = (uint8_t)val;
/* write the result */
error = vie_update_register(vm, vcpuid, reg, val, size);
break;
case 0xB7:
/*
* MOV and zero extend word from mem (ModRM:r/m) to
* reg (ModRM:reg).
*
* 0F B7/r movzx r32, r/m16
* REX.W + 0F B7/r movzx r64, r/m16
*/
error = memread(vm, vcpuid, gpa, &val, 2, arg);
if (error)
return (error);
reg = gpr_map[vie->reg];
/* zero-extend word */
val = (uint16_t)val;
error = vie_update_register(vm, vcpuid, reg, val, size);
break;
case 0xBE:

View File

@ -40,12 +40,13 @@
* Layout
* ------
* RSDP -> 0xf2400 (36 bytes fixed)
* RSDT -> 0xf2440 (36 bytes + 4*N table addrs, 2 used)
* XSDT -> 0xf2480 (36 bytes + 8*N table addrs, 2 used)
* RSDT -> 0xf2440 (36 bytes + 4*7 table addrs, 4 used)
* XSDT -> 0xf2480 (36 bytes + 8*7 table addrs, 4 used)
* MADT -> 0xf2500 (depends on #CPUs)
* FADT -> 0xf2600 (268 bytes)
* HPET -> 0xf2740 (56 bytes)
* FACS -> 0xf2780 (64 bytes)
* MCFG -> 0xf2780 (60 bytes)
* FACS -> 0xf27C0 (64 bytes)
* DSDT -> 0xf2800 (variable - can go up to 0x100000)
*/
@ -80,7 +81,8 @@ __FBSDID("$FreeBSD$");
#define MADT_OFFSET 0x100
#define FADT_OFFSET 0x200
#define HPET_OFFSET 0x340
#define FACS_OFFSET 0x380
#define MCFG_OFFSET 0x380
#define FACS_OFFSET 0x3C0
#define DSDT_OFFSET 0x400
#define BHYVE_ASL_TEMPLATE "bhyve.XXXXXXX"
@ -178,6 +180,8 @@ basl_fwrite_rsdt(FILE *fp)
basl_acpi_base + FADT_OFFSET);
EFPRINTF(fp, "[0004]\t\tACPI Table Address 2 : %08X\n",
basl_acpi_base + HPET_OFFSET);
EFPRINTF(fp, "[0004]\t\tACPI Table Address 3 : %08X\n",
basl_acpi_base + MCFG_OFFSET);
EFFLUSH(fp);
@ -216,6 +220,8 @@ basl_fwrite_xsdt(FILE *fp)
basl_acpi_base + FADT_OFFSET);
EFPRINTF(fp, "[0004]\t\tACPI Table Address 2 : 00000000%08X\n",
basl_acpi_base + HPET_OFFSET);
EFPRINTF(fp, "[0004]\t\tACPI Table Address 3 : 00000000%08X\n",
basl_acpi_base + MCFG_OFFSET);
EFFLUSH(fp);
@ -582,6 +588,39 @@ basl_fwrite_hpet(FILE *fp)
return (errno);
}
static int
basl_fwrite_mcfg(FILE *fp)
{
int err = 0;
EFPRINTF(fp, "/*\n");
EFPRINTF(fp, " * bhyve MCFG template\n");
EFPRINTF(fp, " */\n");
EFPRINTF(fp, "[0004]\t\tSignature : \"MCFG\"\n");
EFPRINTF(fp, "[0004]\t\tTable Length : 00000000\n");
EFPRINTF(fp, "[0001]\t\tRevision : 01\n");
EFPRINTF(fp, "[0001]\t\tChecksum : 00\n");
EFPRINTF(fp, "[0006]\t\tOem ID : \"BHYVE \"\n");
EFPRINTF(fp, "[0008]\t\tOem Table ID : \"BVMCFG \"\n");
EFPRINTF(fp, "[0004]\t\tOem Revision : 00000001\n");
/* iasl will fill in the compiler ID/revision fields */
EFPRINTF(fp, "[0004]\t\tAsl Compiler ID : \"xxxx\"\n");
EFPRINTF(fp, "[0004]\t\tAsl Compiler Revision : 00000000\n");
EFPRINTF(fp, "[0008]\t\tReserved : 0\n");
EFPRINTF(fp, "\n");
EFPRINTF(fp, "[0008]\t\tBase Address : %016lX\n", pci_ecfg_base());
EFPRINTF(fp, "[0002]\t\tSegment Group: 0000\n");
EFPRINTF(fp, "[0001]\t\tStart Bus: 00\n");
EFPRINTF(fp, "[0001]\t\tEnd Bus: FF\n");
EFPRINTF(fp, "[0004]\t\tReserved : 0\n");
EFFLUSH(fp);
return (0);
err_exit:
return (errno);
}
static int
basl_fwrite_facs(FILE *fp)
{
@ -921,6 +960,7 @@ static struct {
{ basl_fwrite_madt, MADT_OFFSET },
{ basl_fwrite_fadt, FADT_OFFSET },
{ basl_fwrite_hpet, HPET_OFFSET },
{ basl_fwrite_mcfg, MCFG_OFFSET },
{ basl_fwrite_facs, FACS_OFFSET },
{ basl_fwrite_dsdt, DSDT_OFFSET },
{ NULL }

View File

@ -162,7 +162,7 @@ emulate_mem(struct vmctx *ctx, int vcpu, uint64_t paddr, struct vie *vie,
{
struct mmio_rb_range *entry;
int err;
int err, immutable;
pthread_rwlock_rdlock(&mmio_rwlock);
/*
@ -186,9 +186,27 @@ emulate_mem(struct vmctx *ctx, int vcpu, uint64_t paddr, struct vie *vie,
}
assert(entry != NULL);
/*
* An 'immutable' memory range is guaranteed to be never removed
* so there is no need to hold 'mmio_rwlock' while calling the
* handler.
*
* XXX writes to the PCIR_COMMAND register can cause register_mem()
* to be called. If the guest is using PCI extended config space
* to modify the PCIR_COMMAND register then register_mem() can
* deadlock on 'mmio_rwlock'. However by registering the extended
* config space window as 'immutable' the deadlock can be avoided.
*/
immutable = (entry->mr_param.flags & MEM_F_IMMUTABLE);
if (immutable)
pthread_rwlock_unlock(&mmio_rwlock);
err = vmm_emulate_instruction(ctx, vcpu, paddr, vie, paging,
mem_read, mem_write, &entry->mr_param);
pthread_rwlock_unlock(&mmio_rwlock);
if (!immutable)
pthread_rwlock_unlock(&mmio_rwlock);
return (err);
}
@ -246,6 +264,7 @@ unregister_mem(struct mem_range *memp)
mr = &entry->mr_param;
assert(mr->name == memp->name);
assert(mr->base == memp->base && mr->size == memp->size);
assert((mr->flags & MEM_F_IMMUTABLE) == 0);
RB_REMOVE(mmio_rb_tree, &mmio_rb_root, entry);
/* flush Per-vCPU cache */

View File

@ -48,6 +48,7 @@ struct mem_range {
#define MEM_F_READ 0x1
#define MEM_F_WRITE 0x2
#define MEM_F_RW 0x3
#define MEM_F_IMMUTABLE 0x4 /* mem_range cannot be unregistered */
void init_mem(void);
int emulate_mem(struct vmctx *, int vcpu, uint64_t paddr, struct vie *vie,

View File

@ -109,16 +109,20 @@ static uint64_t pci_emul_membase64;
#define PCI_EMUL_IOBASE 0x2000
#define PCI_EMUL_IOLIMIT 0x10000
#define PCI_EMUL_MEMLIMIT32 0xE0000000 /* 3.5GB */
#define PCI_EMUL_ECFG_BASE 0xE0000000 /* 3.5GB */
#define PCI_EMUL_ECFG_SIZE (MAXBUSES * 1024 * 1024) /* 1MB per bus */
SYSRES_MEM(PCI_EMUL_ECFG_BASE, PCI_EMUL_ECFG_SIZE);
#define PCI_EMUL_MEMLIMIT32 PCI_EMUL_ECFG_BASE
#define PCI_EMUL_MEMBASE64 0xD000000000UL
#define PCI_EMUL_MEMLIMIT64 0xFD00000000UL
static struct pci_devemu *pci_emul_finddev(char *name);
static void pci_lintr_route(struct pci_devinst *pi);
static void pci_lintr_update(struct pci_devinst *pi);
static struct mem_range pci_mem_hole;
static void pci_lintr_route(struct pci_devinst *pi);
static void pci_lintr_update(struct pci_devinst *pi);
static void pci_cfgrw(struct vmctx *ctx, int vcpu, int in, int bus, int slot,
int func, int coff, int bytes, uint32_t *val);
/*
* I/O access
@ -1023,12 +1027,37 @@ pci_emul_fallback_handler(struct vmctx *ctx, int vcpu, int dir, uint64_t addr,
return (0);
}
static int
pci_emul_ecfg_handler(struct vmctx *ctx, int vcpu, int dir, uint64_t addr,
int bytes, uint64_t *val, void *arg1, long arg2)
{
int bus, slot, func, coff, in;
coff = addr & 0xfff;
func = (addr >> 12) & 0x7;
slot = (addr >> 15) & 0x1f;
bus = (addr >> 20) & 0xff;
in = (dir == MEM_F_READ);
if (in)
*val = ~0UL;
pci_cfgrw(ctx, vcpu, in, bus, slot, func, coff, bytes, (uint32_t *)val);
return (0);
}
uint64_t
pci_ecfg_base(void)
{
return (PCI_EMUL_ECFG_BASE);
}
#define BUSIO_ROUNDUP 32
#define BUSMEM_ROUNDUP (1024 * 1024)
int
init_pci(struct vmctx *ctx)
{
struct mem_range mr;
struct pci_devemu *pde;
struct businfo *bi;
struct slotinfo *si;
@ -1112,22 +1141,34 @@ init_pci(struct vmctx *ctx)
* The guest physical memory map looks like the following:
* [0, lowmem) guest system memory
* [lowmem, lowmem_limit) memory hole (may be absent)
* [lowmem_limit, 4GB) PCI hole (32-bit BAR allocation)
* [lowmem_limit, 0xE0000000) PCI hole (32-bit BAR allocation)
* [0xE0000000, 0xF0000000) PCI extended config window
* [0xF0000000, 4GB) LAPIC, IOAPIC, HPET, firmware
* [4GB, 4GB + highmem)
*
*/
/*
* Accesses to memory addresses that are not allocated to system
* memory or PCI devices return 0xff's.
*/
lowmem = vm_get_lowmem_size(ctx);
bzero(&mr, sizeof(struct mem_range));
mr.name = "PCI hole";
mr.flags = MEM_F_RW | MEM_F_IMMUTABLE;
mr.base = lowmem;
mr.size = (4ULL * 1024 * 1024 * 1024) - lowmem;
mr.handler = pci_emul_fallback_handler;
error = register_mem_fallback(&mr);
assert(error == 0);
memset(&pci_mem_hole, 0, sizeof(struct mem_range));
pci_mem_hole.name = "PCI hole";
pci_mem_hole.flags = MEM_F_RW;
pci_mem_hole.base = lowmem;
pci_mem_hole.size = (4ULL * 1024 * 1024 * 1024) - lowmem;
pci_mem_hole.handler = pci_emul_fallback_handler;
error = register_mem_fallback(&pci_mem_hole);
/* PCI extended config space */
bzero(&mr, sizeof(struct mem_range));
mr.name = "PCI ECFG";
mr.flags = MEM_F_RW | MEM_F_IMMUTABLE;
mr.base = PCI_EMUL_ECFG_BASE;
mr.size = PCI_EMUL_ECFG_SIZE;
mr.handler = pci_emul_ecfg_handler;
error = register_mem(&mr);
assert(error == 0);
return (0);
@ -1612,41 +1653,6 @@ pci_emul_hdrtype_fixup(int bus, int slot, int off, int bytes, uint32_t *rv)
}
}
static int cfgenable, cfgbus, cfgslot, cfgfunc, cfgoff;
static int
pci_emul_cfgaddr(struct vmctx *ctx, int vcpu, int in, int port, int bytes,
uint32_t *eax, void *arg)
{
uint32_t x;
if (bytes != 4) {
if (in)
*eax = (bytes == 2) ? 0xffff : 0xff;
return (0);
}
if (in) {
x = (cfgbus << 16) |
(cfgslot << 11) |
(cfgfunc << 8) |
cfgoff;
if (cfgenable)
x |= CONF1_ENABLE;
*eax = x;
} else {
x = *eax;
cfgenable = (x & CONF1_ENABLE) == CONF1_ENABLE;
cfgoff = x & PCI_REGMAX;
cfgfunc = (x >> 8) & PCI_FUNCMAX;
cfgslot = (x >> 11) & PCI_SLOTMAX;
cfgbus = (x >> 16) & PCI_BUSMAX;
}
return (0);
}
INOUT_PORT(pci_cfgaddr, CONF1_ADDR_PORT, IOPORT_F_INOUT, pci_emul_cfgaddr);
static uint32_t
bits_changed(uint32_t old, uint32_t new, uint32_t mask)
{
@ -1709,41 +1715,51 @@ pci_emul_cmdwrite(struct pci_devinst *pi, uint32_t new, int bytes)
pci_lintr_update(pi);
}
static int
pci_emul_cfgdata(struct vmctx *ctx, int vcpu, int in, int port, int bytes,
uint32_t *eax, void *arg)
static void
pci_cfgrw(struct vmctx *ctx, int vcpu, int in, int bus, int slot, int func,
int coff, int bytes, uint32_t *eax)
{
struct businfo *bi;
struct slotinfo *si;
struct pci_devinst *pi;
struct pci_devemu *pe;
int coff, idx, needcfg;
int idx, needcfg;
uint64_t addr, bar, mask;
assert(bytes == 1 || bytes == 2 || bytes == 4);
if ((bi = pci_businfo[cfgbus]) != NULL) {
si = &bi->slotinfo[cfgslot];
pi = si->si_funcs[cfgfunc].fi_devi;
if ((bi = pci_businfo[bus]) != NULL) {
si = &bi->slotinfo[slot];
pi = si->si_funcs[func].fi_devi;
} else
pi = NULL;
coff = cfgoff + (port - CONF1_DATA_PORT);
#if 0
printf("pcicfg-%s from 0x%0x of %d bytes (%d/%d/%d)\n\r",
in ? "read" : "write", coff, bytes, cfgbus, cfgslot, cfgfunc);
#endif
/*
* Just return if there is no device at this cfgslot:cfgfunc,
* if the guest is doing an un-aligned access, or if the config
* address word isn't enabled.
* Just return if there is no device at this slot:func or if the
* the guest is doing an un-aligned access.
*/
if (!cfgenable || pi == NULL || (coff & (bytes - 1)) != 0) {
if (pi == NULL || (bytes != 1 && bytes != 2 && bytes != 4) ||
(coff & (bytes - 1)) != 0) {
if (in)
*eax = 0xffffffff;
return (0);
return;
}
/*
* Ignore all writes beyond the standard config space and return all
* ones on reads.
*/
if (coff >= PCI_REGMAX + 1) {
if (in) {
*eax = 0xffffffff;
/*
* Extended capabilities begin at offset 256 in config
* space. Absence of extended capabilities is signaled
* with all 0s in the extended capability header at
* offset 256.
*/
if (coff <= PCI_REGMAX + 4)
*eax = 0x00000000;
}
return;
}
pe = pi->pi_d;
@ -1754,8 +1770,8 @@ pci_emul_cfgdata(struct vmctx *ctx, int vcpu, int in, int port, int bytes,
if (in) {
/* Let the device emulation override the default handler */
if (pe->pe_cfgread != NULL) {
needcfg = pe->pe_cfgread(ctx, vcpu, pi,
coff, bytes, eax);
needcfg = pe->pe_cfgread(ctx, vcpu, pi, coff, bytes,
eax);
} else {
needcfg = 1;
}
@ -1769,12 +1785,12 @@ pci_emul_cfgdata(struct vmctx *ctx, int vcpu, int in, int port, int bytes,
*eax = pci_get_cfgdata32(pi, coff);
}
pci_emul_hdrtype_fixup(cfgbus, cfgslot, coff, bytes, eax);
pci_emul_hdrtype_fixup(bus, slot, coff, bytes, eax);
} else {
/* Let the device emulation override the default handler */
if (pe->pe_cfgwrite != NULL &&
(*pe->pe_cfgwrite)(ctx, vcpu, pi, coff, bytes, *eax) == 0)
return (0);
return;
/*
* Special handling for write to BAR registers
@ -1785,7 +1801,7 @@ pci_emul_cfgdata(struct vmctx *ctx, int vcpu, int in, int port, int bytes,
* 4-byte aligned.
*/
if (bytes != 4 || (coff & 0x3) != 0)
return (0);
return;
idx = (coff - PCIR_BAR(0)) / 4;
mask = ~(pi->pi_bar[idx].size - 1);
switch (pi->pi_bar[idx].type) {
@ -1843,7 +1859,57 @@ pci_emul_cfgdata(struct vmctx *ctx, int vcpu, int in, int port, int bytes,
CFGWRITE(pi, coff, *eax, bytes);
}
}
}
static int cfgenable, cfgbus, cfgslot, cfgfunc, cfgoff;
static int
pci_emul_cfgaddr(struct vmctx *ctx, int vcpu, int in, int port, int bytes,
uint32_t *eax, void *arg)
{
uint32_t x;
if (bytes != 4) {
if (in)
*eax = (bytes == 2) ? 0xffff : 0xff;
return (0);
}
if (in) {
x = (cfgbus << 16) | (cfgslot << 11) | (cfgfunc << 8) | cfgoff;
if (cfgenable)
x |= CONF1_ENABLE;
*eax = x;
} else {
x = *eax;
cfgenable = (x & CONF1_ENABLE) == CONF1_ENABLE;
cfgoff = x & PCI_REGMAX;
cfgfunc = (x >> 8) & PCI_FUNCMAX;
cfgslot = (x >> 11) & PCI_SLOTMAX;
cfgbus = (x >> 16) & PCI_BUSMAX;
}
return (0);
}
INOUT_PORT(pci_cfgaddr, CONF1_ADDR_PORT, IOPORT_F_INOUT, pci_emul_cfgaddr);
static int
pci_emul_cfgdata(struct vmctx *ctx, int vcpu, int in, int port, int bytes,
uint32_t *eax, void *arg)
{
int coff;
assert(bytes == 1 || bytes == 2 || bytes == 4);
coff = cfgoff + (port - CONF1_DATA_PORT);
if (cfgenable) {
pci_cfgrw(ctx, vcpu, in, cfgbus, cfgslot, cfgfunc, coff, bytes,
eax);
} else {
/* Ignore accesses to cfgdata if not enabled by cfgaddr */
if (in)
*eax = 0xffffffff;
}
return (0);
}

View File

@ -235,6 +235,7 @@ uint64_t pci_emul_msix_tread(struct pci_devinst *pi, uint64_t offset, int size);
int pci_count_lintr(int bus);
void pci_walk_lintr(int bus, pci_lintr_cb cb, void *arg);
void pci_write_dsdt(void);
uint64_t pci_ecfg_base(void);
int pci_bus_configured(int bus);
static __inline void