d45b7f14ae
vmm.ko - kernel module for VT-x, VT-d and hypervisor control
bhyve - user-space sequencer and i/o emulation
vmmctl - dump of hypervisor register state
libvmm - front-end to vmm.ko chardev interface

bhyve was designed and implemented by Neel Natu.

Thanks to the following folk from NetApp who helped to make this available: Joe CaraDonna, Peter Snyder, Jeff Heller, Sandeep Mann, Steve Miller, Brian Pawlowski
414 lines
10 KiB
C
414 lines
10 KiB
C
/*-
|
|
* Copyright (c) 2011 NetApp, Inc.
|
|
* All rights reserved.
|
|
*
|
|
* Redistribution and use in source and binary forms, with or without
|
|
* modification, are permitted provided that the following conditions
|
|
* are met:
|
|
* 1. Redistributions of source code must retain the above copyright
|
|
* notice, this list of conditions and the following disclaimer.
|
|
* 2. Redistributions in binary form must reproduce the above copyright
|
|
* notice, this list of conditions and the following disclaimer in the
|
|
* documentation and/or other materials provided with the distribution.
|
|
*
|
|
* THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
|
|
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
|
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
|
* ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
|
|
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
|
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
|
|
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
|
|
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
|
|
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
|
|
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
|
|
* SUCH DAMAGE.
|
|
*
|
|
* $FreeBSD$
|
|
*/
|
|
|
|
#include <sys/cdefs.h>
|
|
__FBSDID("$FreeBSD$");
|
|
|
|
#include <sys/param.h>
|
|
#include <sys/lock.h>
|
|
#include <sys/mutex.h>
|
|
#include <sys/linker.h>
|
|
#include <sys/systm.h>
|
|
#include <sys/malloc.h>
|
|
#include <sys/kernel.h>
|
|
|
|
#include <vm/vm.h>
|
|
#include <vm/pmap.h>
|
|
|
|
#include <machine/md_var.h>
|
|
#include <machine/metadata.h>
|
|
#include <machine/pc/bios.h>
|
|
#include <machine/vmparam.h>
|
|
#include <machine/pmap.h>
|
|
|
|
#include "vmm_util.h"
|
|
#include "vmm_mem.h"
|
|
|
|
static MALLOC_DEFINE(M_VMM_MEM, "vmm memory", "vmm memory");
|
|
|
|
#define MB (1024 * 1024)
|
|
#define GB (1024 * MB)
|
|
|
|
#define VMM_MEM_MAXSEGS 64
|
|
|
|
/* protected by vmm_mem_mtx */
|
|
static struct {
|
|
vm_paddr_t base;
|
|
vm_size_t length;
|
|
} vmm_mem_avail[VMM_MEM_MAXSEGS];
|
|
|
|
static int vmm_mem_nsegs;
|
|
|
|
static vm_paddr_t maxaddr;
|
|
|
|
static struct mtx vmm_mem_mtx;
|
|
|
|
/*
|
|
* Steal any memory that was deliberately hidden from FreeBSD either by
|
|
* the use of MAXMEM kernel config option or the hw.physmem loader tunable.
|
|
*/
|
|
static int
|
|
vmm_mem_steal_memory(void)
|
|
{
|
|
int nsegs;
|
|
caddr_t kmdp;
|
|
uint32_t smapsize;
|
|
uint64_t base, length;
|
|
struct bios_smap *smapbase, *smap, *smapend;
|
|
|
|
/*
|
|
* Borrowed from hammer_time() and getmemsize() in machdep.c
|
|
*/
|
|
kmdp = preload_search_by_type("elf kernel");
|
|
if (kmdp == NULL)
|
|
kmdp = preload_search_by_type("elf64 kernel");
|
|
|
|
smapbase = (struct bios_smap *)preload_search_info(kmdp,
|
|
MODINFO_METADATA | MODINFOMD_SMAP);
|
|
if (smapbase == NULL)
|
|
panic("No BIOS smap info from loader!");
|
|
|
|
smapsize = *((uint32_t *)smapbase - 1);
|
|
smapend = (struct bios_smap *)((uintptr_t)smapbase + smapsize);
|
|
|
|
nsegs = 0;
|
|
for (smap = smapbase; smap < smapend; smap++) {
|
|
/*
|
|
* XXX
|
|
* Assuming non-overlapping, monotonically increasing
|
|
* memory segments.
|
|
*/
|
|
if (smap->type != SMAP_TYPE_MEMORY)
|
|
continue;
|
|
if (smap->length == 0)
|
|
break;
|
|
|
|
base = roundup(smap->base, NBPDR);
|
|
length = rounddown(smap->length, NBPDR);
|
|
|
|
/* Skip this segment if FreeBSD is using all of it. */
|
|
if (base + length <= ptoa(Maxmem))
|
|
continue;
|
|
|
|
/*
|
|
* If FreeBSD is using part of this segment then adjust
|
|
* 'base' and 'length' accordingly.
|
|
*/
|
|
if (base < ptoa(Maxmem)) {
|
|
uint64_t used;
|
|
used = roundup(ptoa(Maxmem), NBPDR) - base;
|
|
base += used;
|
|
length -= used;
|
|
}
|
|
|
|
if (length == 0)
|
|
continue;
|
|
|
|
vmm_mem_avail[nsegs].base = base;
|
|
vmm_mem_avail[nsegs].length = length;
|
|
|
|
if (base + length > maxaddr)
|
|
maxaddr = base + length;
|
|
|
|
if (0 && bootverbose) {
|
|
printf("vmm_mem_populate: index %d, base 0x%0lx, "
|
|
"length %ld\n",
|
|
nsegs, vmm_mem_avail[nsegs].base,
|
|
vmm_mem_avail[nsegs].length);
|
|
}
|
|
|
|
nsegs++;
|
|
if (nsegs >= VMM_MEM_MAXSEGS) {
|
|
printf("vmm_mem_populate: maximum number of vmm memory "
|
|
"segments reached!\n");
|
|
return (ENOSPC);
|
|
}
|
|
}
|
|
|
|
vmm_mem_nsegs = nsegs;
|
|
|
|
return (0);
|
|
}
|
|
|
|
static void
|
|
vmm_mem_direct_map(vm_paddr_t start, vm_paddr_t end)
|
|
{
|
|
vm_paddr_t addr, remaining;
|
|
int pdpi, pdi, superpage_size;
|
|
pml4_entry_t *pml4p;
|
|
pdp_entry_t *pdp;
|
|
pd_entry_t *pd;
|
|
uint64_t page_attr_bits;
|
|
|
|
if (end >= NBPML4)
|
|
panic("Cannot map memory beyond %ldGB", NBPML4 / GB);
|
|
|
|
/* XXX FreeBSD 8.1 does not use 1G superpages in the direct map */
|
|
if (0 && vmm_supports_1G_pages())
|
|
superpage_size = NBPDP;
|
|
else
|
|
superpage_size = NBPDR;
|
|
|
|
/*
|
|
* Get the page directory pointer page that contains the direct
|
|
* map address mappings.
|
|
*/
|
|
pml4p = kernel_pmap->pm_pml4;
|
|
pdp = (pdp_entry_t *)PHYS_TO_DMAP(pml4p[DMPML4I] & ~PAGE_MASK);
|
|
|
|
page_attr_bits = PG_RW | PG_V | PG_PS | PG_G;
|
|
addr = start;
|
|
while (addr < end) {
|
|
remaining = end - addr;
|
|
pdpi = addr / NBPDP;
|
|
if (superpage_size == NBPDP &&
|
|
remaining >= NBPDP &&
|
|
addr % NBPDP == 0) {
|
|
/*
|
|
* If there isn't a mapping for this address then
|
|
* create one but if there is one already make sure
|
|
* it matches what we expect it to be.
|
|
*/
|
|
if (pdp[pdpi] == 0) {
|
|
pdp[pdpi] = addr | page_attr_bits;
|
|
if (0 && bootverbose) {
|
|
printf("vmm_mem_populate: mapping "
|
|
"0x%lx with 1GB page at "
|
|
"pdpi %d\n", addr, pdpi);
|
|
}
|
|
} else {
|
|
pdp_entry_t pdpe = pdp[pdpi];
|
|
if ((pdpe & ~PAGE_MASK) != addr ||
|
|
(pdpe & page_attr_bits) != page_attr_bits) {
|
|
panic("An invalid mapping 0x%016lx "
|
|
"already exists for 0x%016lx\n",
|
|
pdpe, addr);
|
|
}
|
|
}
|
|
addr += NBPDP;
|
|
} else {
|
|
if (remaining < NBPDR) {
|
|
panic("vmm_mem_populate: remaining (%ld) must "
|
|
"be greater than NBPDR (%d)\n",
|
|
remaining, NBPDR);
|
|
}
|
|
if (pdp[pdpi] == 0) {
|
|
/*
|
|
* XXX we lose this memory forever because
|
|
* we do not keep track of the virtual address
|
|
* that would be required to free this page.
|
|
*/
|
|
pd = malloc(PAGE_SIZE, M_VMM_MEM,
|
|
M_WAITOK | M_ZERO);
|
|
if ((uintptr_t)pd & PAGE_MASK) {
|
|
panic("vmm_mem_populate: page directory"
|
|
"page not aligned on %d "
|
|
"boundary\n", PAGE_SIZE);
|
|
}
|
|
pdp[pdpi] = vtophys(pd);
|
|
pdp[pdpi] |= PG_RW | PG_V | PG_U;
|
|
if (0 && bootverbose) {
|
|
printf("Creating page directory "
|
|
"at pdp index %d for 0x%016lx\n",
|
|
pdpi, addr);
|
|
}
|
|
}
|
|
pdi = (addr % NBPDP) / NBPDR;
|
|
pd = (pd_entry_t *)PHYS_TO_DMAP(pdp[pdpi] & ~PAGE_MASK);
|
|
|
|
/*
|
|
* Create a new mapping if one doesn't already exist
|
|
* or validate it if it does.
|
|
*/
|
|
if (pd[pdi] == 0) {
|
|
pd[pdi] = addr | page_attr_bits;
|
|
if (0 && bootverbose) {
|
|
printf("vmm_mem_populate: mapping "
|
|
"0x%lx with 2MB page at "
|
|
"pdpi %d, pdi %d\n",
|
|
addr, pdpi, pdi);
|
|
}
|
|
} else {
|
|
pd_entry_t pde = pd[pdi];
|
|
if ((pde & ~PAGE_MASK) != addr ||
|
|
(pde & page_attr_bits) != page_attr_bits) {
|
|
panic("An invalid mapping 0x%016lx "
|
|
"already exists for 0x%016lx\n",
|
|
pde, addr);
|
|
}
|
|
}
|
|
addr += NBPDR;
|
|
}
|
|
}
|
|
}
|
|
|
|
static int
|
|
vmm_mem_populate(void)
|
|
{
|
|
int seg, error;
|
|
vm_paddr_t start, end;
|
|
|
|
/* populate the vmm_mem_avail[] array */
|
|
error = vmm_mem_steal_memory();
|
|
if (error)
|
|
return (error);
|
|
|
|
/*
|
|
* Now map the memory that was hidden from FreeBSD in
|
|
* the direct map VA space.
|
|
*/
|
|
for (seg = 0; seg < vmm_mem_nsegs; seg++) {
|
|
start = vmm_mem_avail[seg].base;
|
|
end = start + vmm_mem_avail[seg].length;
|
|
if ((start & PDRMASK) != 0 || (end & PDRMASK) != 0) {
|
|
panic("start (0x%016lx) and end (0x%016lx) must be "
|
|
"aligned on a %dMB boundary\n",
|
|
start, end, NBPDR / MB);
|
|
}
|
|
vmm_mem_direct_map(start, end);
|
|
}
|
|
|
|
return (0);
|
|
}
|
|
|
|
int
|
|
vmm_mem_init(void)
|
|
{
|
|
int error;
|
|
|
|
mtx_init(&vmm_mem_mtx, "vmm_mem_mtx", NULL, MTX_DEF);
|
|
|
|
error = vmm_mem_populate();
|
|
if (error)
|
|
return (error);
|
|
|
|
return (0);
|
|
}
|
|
|
|
vm_paddr_t
|
|
vmm_mem_alloc(size_t size)
|
|
{
|
|
int i;
|
|
vm_paddr_t addr;
|
|
|
|
if ((size & PDRMASK) != 0) {
|
|
panic("vmm_mem_alloc: size 0x%0lx must be "
|
|
"aligned on a 0x%0x boundary\n", size, NBPDR);
|
|
}
|
|
|
|
addr = 0;
|
|
|
|
mtx_lock(&vmm_mem_mtx);
|
|
for (i = 0; i < vmm_mem_nsegs; i++) {
|
|
if (vmm_mem_avail[i].length >= size) {
|
|
addr = vmm_mem_avail[i].base;
|
|
vmm_mem_avail[i].base += size;
|
|
vmm_mem_avail[i].length -= size;
|
|
/* remove a zero length segment */
|
|
if (vmm_mem_avail[i].length == 0) {
|
|
memmove(&vmm_mem_avail[i],
|
|
&vmm_mem_avail[i + 1],
|
|
(vmm_mem_nsegs - (i + 1)) *
|
|
sizeof(vmm_mem_avail[0]));
|
|
vmm_mem_nsegs--;
|
|
}
|
|
break;
|
|
}
|
|
}
|
|
mtx_unlock(&vmm_mem_mtx);
|
|
|
|
return (addr);
|
|
}
|
|
|
|
void
|
|
vmm_mem_free(vm_paddr_t base, size_t length)
|
|
{
|
|
int i;
|
|
|
|
if ((base & PDRMASK) != 0 || (length & PDRMASK) != 0) {
|
|
panic("vmm_mem_free: base 0x%0lx and length 0x%0lx must be "
|
|
"aligned on a 0x%0x boundary\n", base, length, NBPDR);
|
|
}
|
|
|
|
mtx_lock(&vmm_mem_mtx);
|
|
|
|
for (i = 0; i < vmm_mem_nsegs; i++) {
|
|
if (vmm_mem_avail[i].base > base)
|
|
break;
|
|
}
|
|
|
|
if (vmm_mem_nsegs >= VMM_MEM_MAXSEGS)
|
|
panic("vmm_mem_free: cannot free any more segments");
|
|
|
|
/* Create a new segment at index 'i' */
|
|
memmove(&vmm_mem_avail[i + 1], &vmm_mem_avail[i],
|
|
(vmm_mem_nsegs - i) * sizeof(vmm_mem_avail[0]));
|
|
|
|
vmm_mem_avail[i].base = base;
|
|
vmm_mem_avail[i].length = length;
|
|
|
|
vmm_mem_nsegs++;
|
|
|
|
coalesce_some_more:
|
|
for (i = 0; i < vmm_mem_nsegs - 1; i++) {
|
|
if (vmm_mem_avail[i].base + vmm_mem_avail[i].length ==
|
|
vmm_mem_avail[i + 1].base) {
|
|
vmm_mem_avail[i].length += vmm_mem_avail[i + 1].length;
|
|
memmove(&vmm_mem_avail[i + 1], &vmm_mem_avail[i + 2],
|
|
(vmm_mem_nsegs - (i + 2)) * sizeof(vmm_mem_avail[0]));
|
|
vmm_mem_nsegs--;
|
|
goto coalesce_some_more;
|
|
}
|
|
}
|
|
|
|
mtx_unlock(&vmm_mem_mtx);
|
|
}
|
|
|
|
vm_paddr_t
|
|
vmm_mem_maxaddr(void)
|
|
{
|
|
|
|
return (maxaddr);
|
|
}
|
|
|
|
void
|
|
vmm_mem_dump(void)
|
|
{
|
|
int i;
|
|
vm_paddr_t base;
|
|
vm_size_t length;
|
|
|
|
mtx_lock(&vmm_mem_mtx);
|
|
for (i = 0; i < vmm_mem_nsegs; i++) {
|
|
base = vmm_mem_avail[i].base;
|
|
length = vmm_mem_avail[i].length;
|
|
printf("%-4d0x%016lx 0x%016lx\n", i, base, base + length);
|
|
}
|
|
mtx_unlock(&vmm_mem_mtx);
|
|
}
|