freebsd-skq/sys/amd64/vmm/intel/ept.c
Neel Natu, commit bda273f21e, 2012-10-03 00:46:30 +00:00

Get rid of assumptions in the hypervisor that the host physical memory
associated with guest physical memory is contiguous.

Rewrite vm_gpa2hpa() to get the GPA to HPA mapping by querying the nested
page tables.

/*-
* Copyright (c) 2011 NetApp, Inc.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* $FreeBSD$
*/
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");

#include <sys/types.h>
#include <sys/errno.h>
#include <sys/systm.h>
#include <sys/malloc.h>
#include <sys/smp.h>

#include <vm/vm.h>
#include <vm/pmap.h>

#include <machine/param.h>
#include <machine/cpufunc.h>
#include <machine/pmap.h>
#include <machine/vmparam.h>
#include <machine/vmm.h>

#include "vmx_cpufunc.h"
#include "vmx_msr.h"
#include "vmx.h"
#include "ept.h"

#define	EPT_PWL4(cap)			((cap) & (1UL << 6))
#define	EPT_MEMORY_TYPE_WB(cap)		((cap) & (1UL << 14))
#define	EPT_PDE_SUPERPAGE(cap)		((cap) & (1UL << 16))	/* 2MB pages */
#define	EPT_PDPTE_SUPERPAGE(cap)	((cap) & (1UL << 17))	/* 1GB pages */
#define	INVVPID_SUPPORTED(cap)		((cap) & (1UL << 32))
#define	INVEPT_SUPPORTED(cap)		((cap) & (1UL << 20))

#define	INVVPID_ALL_TYPES_MASK		0xF0000000000UL
#define	INVVPID_ALL_TYPES_SUPPORTED(cap)	\
	(((cap) & INVVPID_ALL_TYPES_MASK) == INVVPID_ALL_TYPES_MASK)

#define	INVEPT_ALL_TYPES_MASK		0x6000000UL
#define	INVEPT_ALL_TYPES_SUPPORTED(cap)		\
	(((cap) & INVEPT_ALL_TYPES_MASK) == INVEPT_ALL_TYPES_MASK)

#define	EPT_PG_RD			(1 << 0)
#define	EPT_PG_WR			(1 << 1)
#define	EPT_PG_EX			(1 << 2)
#define	EPT_PG_MEMORY_TYPE(x)		((x) << 3)
#define	EPT_PG_IGNORE_PAT		(1 << 6)
#define	EPT_PG_SUPERPAGE		(1 << 7)

#define	EPT_ADDR_MASK			((uint64_t)-1 << 12)

MALLOC_DECLARE(M_VMX);

static uint64_t page_sizes_mask;
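
/*
 * Check that the processor supports all the EPT and VPID capabilities
 * that the hypervisor relies on, and record the supported page sizes
 * in 'page_sizes_mask'.  Returns EINVAL if a capability is missing.
 */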
int
ept_init(void)
{
	int page_shift;
	uint64_t cap;

	cap = rdmsr(MSR_VMX_EPT_VPID_CAP);

	/*
	 * Verify that:
	 * - page walk length is 4 steps
	 * - extended page tables can be laid out in write-back memory
	 * - invvpid instruction with all possible types is supported
	 * - invept instruction with all possible types is supported
	 */
	if (!EPT_PWL4(cap) ||
	    !EPT_MEMORY_TYPE_WB(cap) ||
	    !INVVPID_SUPPORTED(cap) ||
	    !INVVPID_ALL_TYPES_SUPPORTED(cap) ||
	    !INVEPT_SUPPORTED(cap) ||
	    !INVEPT_ALL_TYPES_SUPPORTED(cap))
		return (EINVAL);

	/* Set bits in 'page_sizes_mask' for each valid page size */
	page_shift = PAGE_SHIFT;
	page_sizes_mask = 1UL << page_shift;		/* 4KB page */

	page_shift += 9;
	if (EPT_PDE_SUPERPAGE(cap))
		page_sizes_mask |= 1UL << page_shift;	/* 2MB superpage */

	page_shift += 9;
	if (EPT_PDPTE_SUPERPAGE(cap))
		page_sizes_mask |= 1UL << page_shift;	/* 1GB superpage */

	return (0);
}

#if 0
static void
ept_dump(uint64_t *ptp, int nlevels)
{
int i, t, tabs;
	uint64_t *ptpnext, ptpval;

	if (--nlevels < 0)
		return;

	tabs = 3 - nlevels;
for (t = 0; t < tabs; t++)
printf("\t");
printf("PTP = %p\n", ptp);
for (i = 0; i < 512; i++) {
ptpval = ptp[i];
if (ptpval == 0)
continue;
for (t = 0; t < tabs; t++)
printf("\t");
printf("%3d 0x%016lx\n", i, ptpval);
if (nlevels != 0 && (ptpval & EPT_PG_SUPERPAGE) == 0) {
ptpnext = (uint64_t *)
PHYS_TO_DMAP(ptpval & EPT_ADDR_MASK);
ept_dump(ptpnext, nlevels);
}
}
}
#endif
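
/*
 * Map as large a chunk of the region starting at 'gpa' as can be
 * described by a single page table entry, and return the number of
 * bytes actually mapped.  The caller loops until the entire range is
 * covered.
 */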
static size_t
ept_create_mapping(uint64_t *ptp, vm_paddr_t gpa, vm_paddr_t hpa, size_t length,
vm_memattr_t attr, vm_prot_t prot, boolean_t spok)
{
	int spshift, ptpshift, ptpindex, nlevels;

	/*
	 * Compute the size of the mapping that we can accommodate.
	 *
	 * This is based on three factors:
	 * - super page sizes supported by the processor
	 * - alignment of the region starting at 'gpa' and 'hpa'
	 * - length of the region 'length'
*/
spshift = PAGE_SHIFT;
if (spok)
spshift += (EPT_PWLEVELS - 1) * 9;
while (spshift >= PAGE_SHIFT) {
uint64_t spsize = 1UL << spshift;
if ((page_sizes_mask & spsize) != 0 &&
(gpa & (spsize - 1)) == 0 &&
(hpa & (spsize - 1)) == 0 &&
length >= spsize) {
break;
}
spshift -= 9;
	}

	if (spshift < PAGE_SHIFT) {
		panic("Invalid spshift for gpa 0x%016lx, hpa 0x%016lx, "
		    "length 0x%016lx, page_sizes_mask 0x%016lx",
		    gpa, hpa, length, page_sizes_mask);
	}

	nlevels = EPT_PWLEVELS;
while (--nlevels >= 0) {
ptpshift = PAGE_SHIFT + nlevels * 9;
ptpindex = (gpa >> ptpshift) & 0x1FF;
/* We have reached the leaf mapping */
if (spshift >= ptpshift)
			break;

		/*
* We are working on a non-leaf page table page.
*
* Create the next level page table page if necessary and point
* to it from the current page table.
*/
if (ptp[ptpindex] == 0) {
void *nlp = malloc(PAGE_SIZE, M_VMX, M_WAITOK | M_ZERO);
ptp[ptpindex] = vtophys(nlp);
ptp[ptpindex] |= EPT_PG_RD | EPT_PG_WR | EPT_PG_EX;
		}

		/* Work our way down to the next level page table page */
ptp = (uint64_t *)PHYS_TO_DMAP(ptp[ptpindex] & EPT_ADDR_MASK);
	}

	if ((gpa & ((1UL << ptpshift) - 1)) != 0) {
		panic("ept_create_mapping: gpa 0x%016lx and ptpshift %d "
		    "mismatch\n", gpa, ptpshift);
	}

	if (prot != VM_PROT_NONE) {
/* Do the mapping */
ptp[ptpindex] = hpa;
/* Apply the access controls */
if (prot & VM_PROT_READ)
ptp[ptpindex] |= EPT_PG_RD;
if (prot & VM_PROT_WRITE)
ptp[ptpindex] |= EPT_PG_WR;
if (prot & VM_PROT_EXECUTE)
ptp[ptpindex] |= EPT_PG_EX;
/*
* XXX should we enforce this memory type by setting the
		 * ignore PAT bit to 1?
*/
ptp[ptpindex] |= EPT_PG_MEMORY_TYPE(attr);
if (nlevels > 0)
ptp[ptpindex] |= EPT_PG_SUPERPAGE;
} else {
/* Remove the mapping */
ptp[ptpindex] = 0;
	}

	return (1UL << ptpshift);
}
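
/*
 * Walk the nested page tables to translate 'gpa' into the host physical
 * address that backs it.  Returns (vm_paddr_t)-1 if 'gpa' is not mapped.
 */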
static vm_paddr_t
ept_lookup_mapping(uint64_t *ptp, vm_paddr_t gpa)
{
int nlevels, ptpshift, ptpindex;
	uint64_t ptpval, hpabase, pgmask;

	nlevels = EPT_PWLEVELS;
	while (--nlevels >= 0) {
		ptpshift = PAGE_SHIFT + nlevels * 9;
		ptpindex = (gpa >> ptpshift) & 0x1FF;

		ptpval = ptp[ptpindex];

		/* Cannot make progress beyond this point */
		if ((ptpval & (EPT_PG_RD | EPT_PG_WR | EPT_PG_EX)) == 0)
			break;

		if (nlevels == 0 || (ptpval & EPT_PG_SUPERPAGE)) {
			pgmask = (1UL << ptpshift) - 1;
			hpabase = ptpval & ~pgmask;
			return (hpabase | (gpa & pgmask));
		}

		/* Work our way down to the next level page table page */
		ptp = (uint64_t *)PHYS_TO_DMAP(ptpval & EPT_ADDR_MASK);
	}

	return ((vm_paddr_t)-1);
}
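
/*
 * The ept_free_*_entry() functions below recursively free the page table
 * pages of the EPT hierarchy.  Leaf and superpage entries do not point to
 * a lower-level page table page, so there is nothing to free for them.
 */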
static void
ept_free_pt_entry(pt_entry_t pte)
{
if (pte == 0)
		return;

	/* sanity check */
if ((pte & EPT_PG_SUPERPAGE) != 0)
panic("ept_free_pt_entry: pte cannot have superpage bit");
return;
}

static void
ept_free_pd_entry(pd_entry_t pde)
{
pt_entry_t *pt;
	int i;

	if (pde == 0)
		return;

	if ((pde & EPT_PG_SUPERPAGE) == 0) {
pt = (pt_entry_t *)PHYS_TO_DMAP(pde & EPT_ADDR_MASK);
for (i = 0; i < NPTEPG; i++)
ept_free_pt_entry(pt[i]);
free(pt, M_VMX); /* free the page table page */
}
}

static void
ept_free_pdp_entry(pdp_entry_t pdpe)
{
pd_entry_t *pd;
	int i;

	if (pdpe == 0)
		return;

	if ((pdpe & EPT_PG_SUPERPAGE) == 0) {
pd = (pd_entry_t *)PHYS_TO_DMAP(pdpe & EPT_ADDR_MASK);
for (i = 0; i < NPDEPG; i++)
ept_free_pd_entry(pd[i]);
free(pd, M_VMX); /* free the page directory page */
}
}

static void
ept_free_pml4_entry(pml4_entry_t pml4e)
{
pdp_entry_t *pdp;
	int i;

	if (pml4e == 0)
		return;

	if ((pml4e & EPT_PG_SUPERPAGE) == 0) {
pdp = (pdp_entry_t *)PHYS_TO_DMAP(pml4e & EPT_ADDR_MASK);
for (i = 0; i < NPDPEPG; i++)
ept_free_pdp_entry(pdp[i]);
free(pdp, M_VMX); /* free the page directory ptr page */
}
}
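
/*
 * Free all page table pages that make up the guest's EPT hierarchy.
 */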
void
ept_vmcleanup(struct vmx *vmx)
{
	int i;

	for (i = 0; i < NPML4EPG; i++)
ept_free_pml4_entry(vmx->pml4ept[i]);
}
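
/*
 * Create mappings for the guest physical region [gpa, gpa + len), or
 * remove them when 'prot' is VM_PROT_NONE.  Each iteration maps as much
 * of the region as a single page table entry can describe.
 */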
int
ept_vmmmap_set(void *arg, vm_paddr_t gpa, vm_paddr_t hpa, size_t len,
vm_memattr_t attr, int prot, boolean_t spok)
{
	size_t n;
	struct vmx *vmx = arg;

	while (len > 0) {
n = ept_create_mapping(vmx->pml4ept, gpa, hpa, len, attr,
prot, spok);
len -= n;
gpa += n;
hpa += n;
	}

	return (0);
}
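
/*
 * Return the host physical address backing guest physical address 'gpa',
 * or (vm_paddr_t)-1 if there is no mapping.
 */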
vm_paddr_t
ept_vmmmap_get(void *arg, vm_paddr_t gpa)
{
	vm_paddr_t hpa;
	struct vmx *vmx;

	vmx = arg;
	hpa = ept_lookup_mapping(vmx->pml4ept, gpa);
return (hpa);
}
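
/*
 * smp_rendezvous() callback: invalidate cached translations for a single
 * EPT context on the current cpu.
 */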
static void
invept_single_context(void *arg)
{
	struct invept_desc desc = *(struct invept_desc *)arg;

	invept(INVEPT_TYPE_SINGLE_CONTEXT, desc);
}
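
/*
 * Invalidate guest mappings derived from the EPT rooted at 'pml4ept' on
 * all host cpus.
 */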
void
ept_invalidate_mappings(u_long pml4ept)
{
	struct invept_desc invept_desc = { 0 };

	invept_desc.eptp = EPTP(pml4ept);

	smp_rendezvous(NULL, invept_single_context, NULL, &invept_desc);
}
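
#if 0
/*
 * Illustrative sketch only, kept under #if 0 like ept_dump() above so it
 * is never compiled: one plausible way a caller could drive the routines
 * in this file.  'host_page' is a hypothetical page of wired host memory,
 * and the attribute/protection values are merely examples.
 */
static void
ept_example(struct vmx *vmx, void *host_page)
{
	vm_paddr_t hpa;

	/* Map 4KB of host memory at guest physical address 0 as RWX */
	ept_vmmmap_set(vmx, 0, vtophys(host_page), PAGE_SIZE,
	    VM_MEMATTR_WRITE_BACK, VM_PROT_ALL, FALSE);

	/* Query the nested page tables, much as vm_gpa2hpa() now does */
	hpa = ept_vmmmap_get(vmx, 0);

	/* Flush translations cached from the old mappings */
	ept_invalidate_mappings(vtophys(vmx->pml4ept));
}
#endif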