Merge projects/bhyve_npt_pmap into head.

Make the amd64/pmap code aware of nested page table mappings used by bhyve
guests. This allows bhyve to associate each guest with its own vmspace and
to handle nested page faults in the context of that vmspace. It also enables
features like accessed/dirty bit tracking, swapping to disk, and transparent
superpage promotion of guest memory.

Guest vmspace:
Each bhyve guest has a unique vmspace that represents the physical memory
allocated to it. Each memory segment allocated for the guest is mapped into
the guest's address space via 'vmspace->vm_map' and is backed by an object
of type OBJT_DEFAULT.
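
As a rough illustration (a minimal sketch, not the actual vmm_mem.c code from
this change; the helper name and error handling are hypothetical and the
vm_map_find() argument list is approximate for this era):

    /*
     * Hypothetical sketch: back a guest memory segment with an OBJT_DEFAULT
     * object and map it at guest physical address 'gpa' in the guest vmspace.
     */
    static vm_object_t
    guest_seg_alloc(struct vmspace *vmspace, vm_paddr_t gpa, size_t len)
    {
            vm_object_t obj;
            vm_offset_t addr = gpa;

            obj = vm_object_allocate(OBJT_DEFAULT, OFF_TO_IDX(len));
            if (obj == NULL)
                    return (NULL);

            /* Map the object at the fixed address 'gpa' in vmspace->vm_map. */
            if (vm_map_find(&vmspace->vm_map, obj, 0, &addr, len,
                VMFS_NO_SPACE, VM_PROT_ALL, VM_PROT_ALL, 0) != KERN_SUCCESS) {
                    vm_object_deallocate(obj);
                    return (NULL);
            }
            return (obj);
    }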

pmap types:
The amd64/pmap code now understands two types of pmaps: PT_X86 and PT_EPT.

The PT_X86 pmap type is used by the vmspace associated with the host kernel
as well as user processes executing on the host. The PT_EPT pmap is used by
the vmspace associated with a bhyve guest.

Page Table Entries:
EPT page table entries are mostly similar in functionality to regular page
table entries, although there are some differences in which bits are used to
express that functionality. For example, the dirty bit is represented by
bit 9 in a nested PTE as opposed to bit 6 in a regular x86 PTE. Therefore
the bitmask representing the dirty bit is now computed at runtime based on
the type of the pmap. Thus PG_M, previously a macro, becomes a local
variable that is initialized at runtime using 'pmap_modified_bit(pmap)'.
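
A sketch of how that helper looks (the actual routine is in the amd64/pmap.c
diff, which is suppressed below because of its size; the exact shape here is
an approximation):

    static __inline pt_entry_t
    pmap_modified_bit(pmap_t pmap)
    {
            pt_entry_t mask;

            switch (pmap->pm_type) {
            case PT_X86:
                    mask = X86_PG_M;        /* bit 6 in a regular x86 PTE */
                    break;
            case PT_EPT:
                    if (pmap->pm_flags & PMAP_EMULATE_AD_BITS)
                            mask = EPT_PG_EMUL_RW;  /* software dirty bit */
                    else
                            mask = EPT_PG_M;        /* bit 9 in a nested PTE */
                    break;
            default:
                    panic("pmap_modified_bit: invalid pm_type %d",
                        pmap->pm_type);
            }
            return (mask);
    }

Callers then do 'pt_entry_t PG_M = pmap_modified_bit(pmap);' near the top of
any function that needs to test or set the dirty bit.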

An additional wrinkle associated with EPT mappings is that older Intel
processors don't have hardware support for tracking accessed/dirty bits in
the PTE. This means that the amd64/pmap code needs to emulate these bits to
provide proper accounting to the VM subsystem. This is achieved by using
the following mapping for EPT entries that need emulation of A/D bits:
               Bit Position           Interpreted By
PG_V               52                 software (accessed bit emulation handler)
PG_RW              53                 software (dirty bit emulation handler)
PG_A               0                  hardware (aka EPT_PG_RD)
PG_M               1                  hardware (aka EPT_PG_WR)

The idea to use the mapping listed above for A/D bit emulation came from
Alan Cox (alc@).
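
The corresponding definitions appear in the amd64/include/pmap.h hunk in this
diff. The effect of the layout above is that EPT's hardware read and write
permission bits double as the emulated accessed and dirty bits: a page not
yet marked accessed is mapped without EPT read permission, so the first guest
access faults into the accessed-bit handler, which sets PG_A (EPT_PG_READ);
likewise the first write faults into the dirty-bit handler, which sets PG_M
(EPT_PG_WRITE).

    #define EPT_PG_READ     0x001                   /* R    Read  */
    #define EPT_PG_WRITE    0x002                   /* W    Write */
    #define X86_PG_AVAIL(x) (1ul << (x))
    #define EPT_PG_EMUL_V   X86_PG_AVAIL(52)        /* software PG_V  */
    #define EPT_PG_EMUL_RW  X86_PG_AVAIL(53)        /* software PG_RW */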

The final difference with respect to x86 PTEs is that some EPT implementations
do not support superpage mappings. This is recorded in the 'pm_flags' field
of the pmap.
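
For reference, the ept_init() hunk later in this diff derives these pmap
flags from the EPT capability MSR, with tunables to force the fallback
behavior:

    use_superpages = 1;
    TUNABLE_INT_FETCH("hw.vmm.ept.use_superpages", &use_superpages);
    if (use_superpages && EPT_PDE_SUPERPAGE(cap))
            ept_pmap_flags |= PMAP_PDE_SUPERPAGE;   /* 2MB superpages ok */

    use_hw_ad_bits = 1;
    TUNABLE_INT_FETCH("hw.vmm.ept.use_hw_ad_bits", &use_hw_ad_bits);
    if (use_hw_ad_bits && AD_BITS_SUPPORTED(cap))
            ept_enable_ad_bits = 1;
    else
            ept_pmap_flags |= PMAP_EMULATE_AD_BITS; /* emulate A/D bits */

    use_exec_only = 1;
    TUNABLE_INT_FETCH("hw.vmm.ept.use_exec_only", &use_exec_only);
    if (use_exec_only && EPT_SUPPORTS_EXEC_ONLY(cap))
            ept_pmap_flags |= PMAP_SUPPORTS_EXEC_ONLY;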

TLB invalidation:
The amd64/pmap code has a number of ways to invalidate mappings that may be
cached in the TLB: a single page, multiple pages in a range, or the entire
TLB. All of these funnel into a single EPT invalidation routine called
'pmap_invalidate_ept()'. This routine bumps the EPT generation number and
sends an IPI to the host cpus that are executing the guest's vcpus. On a
subsequent entry into the guest, each vcpu detects that the EPT generation
has changed and invalidates the stale mappings from its TLB.
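
A sketch of that routine (the actual implementation is in the suppressed
amd64/pmap.c diff; the IPI used here is an assumption):

    static void
    pmap_invalidate_ept(pmap_t pmap)
    {
            /* Bump the generation number; each vcpu caches its own copy. */
            atomic_add_acq_long(&pmap->pm_eptgen, 1);

            /*
             * Kick the host cpus currently running this guest's vcpus. The
             * forced VM exit causes them to re-enter the guest, at which
             * point VMX_CHECK_EPTGEN (see vmx_support.S below) notices the
             * stale cached generation and issues a single-context 'invept'.
             */
            ipi_selected(pmap->pm_active, IPI_AST);
    }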

Guest memory access:
Since guest memory is no longer wired, we need to hold the host physical
page that backs a guest physical page before we can access it. The helper
functions 'vm_gpa_hold()/vm_gpa_release()' are available for this purpose.
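
A hypothetical caller looks something like this (the buffer and length are
arbitrary; the length must not cross a page boundary since vm_gpa_hold()
operates on a single page):

    void *cookie, *hva;
    uint8_t buf[16];

    hva = vm_gpa_hold(vm, gpa, sizeof(buf), VM_PROT_READ, &cookie);
    if (hva != NULL) {
            bcopy(hva, buf, sizeof(buf));   /* page is held, safe to touch */
            vm_gpa_release(cookie);         /* drop hold; page may be paged out */
    }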

PCI passthru:
Guests with PCI passthru devices will wire the entire guest physical address
space. The MMIO BAR associated with the passthru device is backed by a
vm_object of type OBJT_SG. An IOMMU domain is created only for guests that
have one or more PCI passthru devices attached to them.
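
The relevant logic, excerpted from the vm_assign_pptdev() hunk in vmm.c
below: attaching the first passthru device to a VM triggers the wiring and
the IOMMU domain creation.

    if (ppt_num_devices(vm) == 0) {
            KASSERT(vm->iommu == NULL,
                ("vm_assign_pptdev: iommu must be NULL"));
            maxaddr = vmm_mem_maxaddr();
            vm->iommu = iommu_create_domain(maxaddr);

            error = vm_gpa_wire(vm);        /* wire every guest memory segment */
            if (error)
                    return (error);

            vm_iommu_map(vm);               /* program gpa -> hpa translations */
    }
    error = ppt_assign_device(vm, bus, slot, func);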

Limitations:
There isn't a way to map a guest physical page without execute permission.
This is because the amd64/pmap code interprets guest physical mappings as
user mappings, since they are numerically below VM_MAXUSER_ADDRESS. Since
PG_U shares the same bit position as EPT_PG_EXECUTE, all guest mappings
become automatically executable.
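
The colliding definitions, from the pmap.h hunk in this diff:

    #define X86_PG_U        0x004   /* U/S  User/Supervisor */
    #define EPT_PG_EXECUTE  0x004   /* X    Execute */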

Thanks to Alan Cox and Konstantin Belousov for their rigorous code reviews
as well as their support and encouragement.

Thanks to John Baldwin for reviewing the use of OBJT_SG as the backing
object for PCI passthru MMIO regions.

Special thanks to Peter Holm for testing the patch on short notice.

Approved by:	re
Discussed with:	grehan
Reviewed by:	alc, kib
Tested by:	pho
Author:	Neel Natu
Date:	2013-10-05 21:22:35 +00:00
Commit:	318224bbe6 (parent bf57e9793a)
Notes:	svn2git 2020-12-20 02:59:44 +00:00
	svn path=/head/; revision=256072

30 changed files with 2137 additions and 886 deletions


@ -124,7 +124,8 @@ vm_destroy(struct vmctx *vm)
}
int
vm_get_memory_seg(struct vmctx *ctx, vm_paddr_t gpa, size_t *ret_len)
vm_get_memory_seg(struct vmctx *ctx, vm_paddr_t gpa, size_t *ret_len,
int *wired)
{
int error;
struct vm_memory_segment seg;
@ -133,6 +134,8 @@ vm_get_memory_seg(struct vmctx *ctx, vm_paddr_t gpa, size_t *ret_len)
seg.gpa = gpa;
error = ioctl(ctx->fd, VM_GET_MEMORY_SEG, &seg);
*ret_len = seg.len;
if (wired != NULL)
*wired = seg.wired;
return (error);
}
@ -741,3 +744,23 @@ vcpu_reset(struct vmctx *vmctx, int vcpu)
done:
return (error);
}
int
vm_get_gpa_pmap(struct vmctx *ctx, uint64_t gpa, uint64_t *pte, int *num)
{
int error, i;
struct vm_gpa_pte gpapte;
bzero(&gpapte, sizeof(gpapte));
gpapte.gpa = gpa;
error = ioctl(ctx->fd, VM_GET_GPA_PMAP, &gpapte);
if (error == 0) {
*num = gpapte.ptenum;
for (i = 0; i < gpapte.ptenum; i++)
pte[i] = gpapte.pte[i];
}
return (error);
}


@ -45,9 +45,11 @@ enum vm_mmap_style {
int vm_create(const char *name);
struct vmctx *vm_open(const char *name);
void vm_destroy(struct vmctx *ctx);
int vm_get_memory_seg(struct vmctx *ctx, vm_paddr_t gpa, size_t *ret_len);
int vm_get_memory_seg(struct vmctx *ctx, vm_paddr_t gpa, size_t *ret_len,
int *wired);
int vm_setup_memory(struct vmctx *ctx, size_t len, enum vm_mmap_style s);
void *vm_map_gpa(struct vmctx *ctx, vm_paddr_t gaddr, size_t len);
int vm_get_gpa_pmap(struct vmctx *, uint64_t gpa, uint64_t *pte, int *num);
uint32_t vm_get_lowmem_limit(struct vmctx *ctx);
void vm_set_lowmem_limit(struct vmctx *ctx, uint32_t limit);
int vm_set_desc(struct vmctx *ctx, int vcpu, int reg,


@ -1574,7 +1574,7 @@ getmemsize(caddr_t kmdp, u_int64_t first)
/*
* map page into kernel: valid, read/write,non-cacheable
*/
*pte = pa | PG_V | PG_RW | PG_N;
*pte = pa | PG_V | PG_RW | PG_NC_PWT | PG_NC_PCD;
invltlb();
tmp = *(int *)ptr;

[File diff suppressed because it is too large]


@ -733,6 +733,14 @@ trap_pfault(frame, usermode)
}
}
/*
* If the trap was caused by errant bits in the PTE then panic.
*/
if (frame->tf_err & PGEX_RSV) {
trap_fatal(frame, eva);
return (-1);
}
/*
* PGEX_I is defined only if the execute disable bit capability is
* supported and enabled.
@ -822,10 +830,11 @@ trap_fatal(frame, eva)
#endif
if (type == T_PAGEFLT) {
printf("fault virtual address = 0x%lx\n", eva);
printf("fault code = %s %s %s, %s\n",
printf("fault code = %s %s %s%s, %s\n",
code & PGEX_U ? "user" : "supervisor",
code & PGEX_W ? "write" : "read",
code & PGEX_I ? "instruction" : "data",
code & PGEX_RSV ? " rsv" : "",
code & PGEX_P ? "protection violation" : "page not present");
}
printf("instruction pointer = 0x%lx:0x%lx\n",


@ -50,41 +50,74 @@
* of the fields not present here and there, depending on a lot of things.
*/
/* ---- Intel Nomenclature ---- */
#define PG_V 0x001 /* P Valid */
#define PG_RW 0x002 /* R/W Read/Write */
#define PG_U 0x004 /* U/S User/Supervisor */
#define PG_NC_PWT 0x008 /* PWT Write through */
#define PG_NC_PCD 0x010 /* PCD Cache disable */
#define PG_A 0x020 /* A Accessed */
#define PG_M 0x040 /* D Dirty */
#define PG_PS 0x080 /* PS Page size (0=4k,1=2M) */
#define PG_PTE_PAT 0x080 /* PAT PAT index */
#define PG_G 0x100 /* G Global */
#define PG_AVAIL1 0x200 /* / Available for system */
#define PG_AVAIL2 0x400 /* < programmers use */
#define PG_AVAIL3 0x800 /* \ */
#define PG_PDE_PAT 0x1000 /* PAT PAT index */
#define PG_NX (1ul<<63) /* No-execute */
/* Our various interpretations of the above */
#define PG_W PG_AVAIL1 /* "Wired" pseudoflag */
#define PG_MANAGED PG_AVAIL2
#define PG_FRAME (0x000ffffffffff000ul)
#define PG_PS_FRAME (0x000fffffffe00000ul)
#define PG_PROT (PG_RW|PG_U) /* all protection bits . */
#define PG_N (PG_NC_PWT|PG_NC_PCD) /* Non-cacheable */
#define X86_PG_V 0x001 /* P Valid */
#define X86_PG_RW 0x002 /* R/W Read/Write */
#define X86_PG_U 0x004 /* U/S User/Supervisor */
#define X86_PG_NC_PWT 0x008 /* PWT Write through */
#define X86_PG_NC_PCD 0x010 /* PCD Cache disable */
#define X86_PG_A 0x020 /* A Accessed */
#define X86_PG_M 0x040 /* D Dirty */
#define X86_PG_PS 0x080 /* PS Page size (0=4k,1=2M) */
#define X86_PG_PTE_PAT 0x080 /* PAT PAT index */
#define X86_PG_G 0x100 /* G Global */
#define X86_PG_AVAIL1 0x200 /* / Available for system */
#define X86_PG_AVAIL2 0x400 /* < programmers use */
#define X86_PG_AVAIL3 0x800 /* \ */
#define X86_PG_PDE_PAT 0x1000 /* PAT PAT index */
#define X86_PG_NX (1ul<<63) /* No-execute */
#define X86_PG_AVAIL(x) (1ul << (x))
/* Page level cache control fields used to determine the PAT type */
#define PG_PDE_CACHE (PG_PDE_PAT | PG_NC_PWT | PG_NC_PCD)
#define PG_PTE_CACHE (PG_PTE_PAT | PG_NC_PWT | PG_NC_PCD)
#define X86_PG_PDE_CACHE (X86_PG_PDE_PAT | X86_PG_NC_PWT | X86_PG_NC_PCD)
#define X86_PG_PTE_CACHE (X86_PG_PTE_PAT | X86_PG_NC_PWT | X86_PG_NC_PCD)
/*
* Intel extended page table (EPT) bit definitions.
*/
#define EPT_PG_READ 0x001 /* R Read */
#define EPT_PG_WRITE 0x002 /* W Write */
#define EPT_PG_EXECUTE 0x004 /* X Execute */
#define EPT_PG_IGNORE_PAT 0x040 /* IPAT Ignore PAT */
#define EPT_PG_PS 0x080 /* PS Page size */
#define EPT_PG_A 0x100 /* A Accessed */
#define EPT_PG_M 0x200 /* D Dirty */
#define EPT_PG_MEMORY_TYPE(x) ((x) << 3) /* MT Memory Type */
/*
* Define the PG_xx macros in terms of the bits on x86 PTEs.
*/
#define PG_V X86_PG_V
#define PG_RW X86_PG_RW
#define PG_U X86_PG_U
#define PG_NC_PWT X86_PG_NC_PWT
#define PG_NC_PCD X86_PG_NC_PCD
#define PG_A X86_PG_A
#define PG_M X86_PG_M
#define PG_PS X86_PG_PS
#define PG_PTE_PAT X86_PG_PTE_PAT
#define PG_G X86_PG_G
#define PG_AVAIL1 X86_PG_AVAIL1
#define PG_AVAIL2 X86_PG_AVAIL2
#define PG_AVAIL3 X86_PG_AVAIL3
#define PG_PDE_PAT X86_PG_PDE_PAT
#define PG_NX X86_PG_NX
#define PG_PDE_CACHE X86_PG_PDE_CACHE
#define PG_PTE_CACHE X86_PG_PTE_CACHE
/* Our various interpretations of the above */
#define PG_W X86_PG_AVAIL3 /* "Wired" pseudoflag */
#define PG_MANAGED X86_PG_AVAIL2
#define EPT_PG_EMUL_V X86_PG_AVAIL(52)
#define EPT_PG_EMUL_RW X86_PG_AVAIL(53)
#define PG_FRAME (0x000ffffffffff000ul)
#define PG_PS_FRAME (0x000fffffffe00000ul)
/*
* Promotion to a 2MB (PDE) page mapping requires that the corresponding 4KB
* (PTE) page mappings have identical settings for the following fields:
*/
#define PG_PTE_PROMOTE (PG_NX | PG_MANAGED | PG_W | PG_G | PG_PTE_PAT | \
PG_M | PG_A | PG_NC_PCD | PG_NC_PWT | PG_U | PG_RW | PG_V)
#define PG_PTE_PROMOTE (PG_NX | PG_MANAGED | PG_W | PG_G | PG_PTE_CACHE | \
PG_M | PG_A | PG_U | PG_RW | PG_V)
/*
* Page Protection Exception bits
@ -96,6 +129,28 @@
#define PGEX_RSV 0x08 /* reserved PTE field is non-zero */
#define PGEX_I 0x10 /* during an instruction fetch */
/*
* undef the PG_xx macros that define bits in the regular x86 PTEs that
* have a different position in nested PTEs. This is done when compiling
* code that needs to be aware of the differences between regular x86 and
* nested PTEs.
*
* The appropriate bitmask will be calculated at runtime based on the pmap
* type.
*/
#ifdef AMD64_NPT_AWARE
#undef PG_AVAIL1 /* X86_PG_AVAIL1 aliases with EPT_PG_M */
#undef PG_G
#undef PG_A
#undef PG_M
#undef PG_PDE_PAT
#undef PG_PDE_CACHE
#undef PG_PTE_PAT
#undef PG_PTE_CACHE
#undef PG_RW
#undef PG_V
#endif
/*
* Pte related macros. This is complicated by having to deal with
* the sign extension of the 48th bit.
@ -256,6 +311,11 @@ struct pmap {
int pm_flags;
};
/* flags */
#define PMAP_PDE_SUPERPAGE (1 << 0) /* supports 2MB superpages */
#define PMAP_EMULATE_AD_BITS (1 << 1) /* needs A/D bits emulation */
#define PMAP_SUPPORTS_EXEC_ONLY (1 << 2) /* execute only mappings ok */
typedef struct pmap *pmap_t;
#ifdef _KERNEL
@ -272,6 +332,9 @@ extern struct pmap kernel_pmap_store;
#define PMAP_MTX(pmap) (&(pmap)->pm_mtx)
#define PMAP_TRYLOCK(pmap) mtx_trylock(&(pmap)->pm_mtx)
#define PMAP_UNLOCK(pmap) mtx_unlock(&(pmap)->pm_mtx)
int pmap_pinit_type(pmap_t pmap, enum pmap_type pm_type, int flags);
int pmap_emulate_accessed_dirty(pmap_t pmap, vm_offset_t va, int ftype);
#endif
/*
@ -330,7 +393,7 @@ void pmap_invalidate_all(pmap_t);
void pmap_invalidate_cache(void);
void pmap_invalidate_cache_pages(vm_page_t *pages, int count);
void pmap_invalidate_cache_range(vm_offset_t sva, vm_offset_t eva);
void pmap_get_mapping(pmap_t pmap, vm_offset_t va, uint64_t *ptr, int *num);
#endif /* _KERNEL */
#endif /* !LOCORE */


@ -39,19 +39,18 @@ struct seg_desc;
struct vm_exit;
struct vm_run;
struct vlapic;
struct vmspace;
struct vm_object;
struct pmap;
enum x2apic_state;
typedef int (*vmm_init_func_t)(void);
typedef int (*vmm_cleanup_func_t)(void);
typedef void * (*vmi_init_func_t)(struct vm *vm); /* instance specific apis */
typedef int (*vmi_run_func_t)(void *vmi, int vcpu, register_t rip);
typedef void * (*vmi_init_func_t)(struct vm *vm, struct pmap *pmap);
typedef int (*vmi_run_func_t)(void *vmi, int vcpu, register_t rip,
struct pmap *pmap);
typedef void (*vmi_cleanup_func_t)(void *vmi);
typedef int (*vmi_mmap_set_func_t)(void *vmi, vm_paddr_t gpa,
vm_paddr_t hpa, size_t length,
vm_memattr_t attr, int prot,
boolean_t superpages_ok);
typedef vm_paddr_t (*vmi_mmap_get_func_t)(void *vmi, vm_paddr_t gpa);
typedef int (*vmi_get_register_t)(void *vmi, int vcpu, int num,
uint64_t *retval);
typedef int (*vmi_set_register_t)(void *vmi, int vcpu, int num,
@ -65,6 +64,8 @@ typedef int (*vmi_inject_event_t)(void *vmi, int vcpu,
uint32_t code, int code_valid);
typedef int (*vmi_get_cap_t)(void *vmi, int vcpu, int num, int *retval);
typedef int (*vmi_set_cap_t)(void *vmi, int vcpu, int num, int val);
typedef struct vmspace * (*vmi_vmspace_alloc)(vm_offset_t min, vm_offset_t max);
typedef void (*vmi_vmspace_free)(struct vmspace *vmspace);
struct vmm_ops {
vmm_init_func_t init; /* module wide initialization */
@ -73,8 +74,6 @@ struct vmm_ops {
vmi_init_func_t vminit; /* vm-specific initialization */
vmi_run_func_t vmrun;
vmi_cleanup_func_t vmcleanup;
vmi_mmap_set_func_t vmmmap_set;
vmi_mmap_get_func_t vmmmap_get;
vmi_get_register_t vmgetreg;
vmi_set_register_t vmsetreg;
vmi_get_desc_t vmgetdesc;
@ -82,6 +81,8 @@ struct vmm_ops {
vmi_inject_event_t vminject;
vmi_get_cap_t vmgetcap;
vmi_set_cap_t vmsetcap;
vmi_vmspace_alloc vmspace_alloc;
vmi_vmspace_free vmspace_free;
};
extern struct vmm_ops vmm_ops_intel;
@ -93,9 +94,14 @@ const char *vm_name(struct vm *vm);
int vm_malloc(struct vm *vm, vm_paddr_t gpa, size_t len);
int vm_map_mmio(struct vm *vm, vm_paddr_t gpa, size_t len, vm_paddr_t hpa);
int vm_unmap_mmio(struct vm *vm, vm_paddr_t gpa, size_t len);
vm_paddr_t vm_gpa2hpa(struct vm *vm, vm_paddr_t gpa, size_t size);
void *vm_gpa_hold(struct vm *, vm_paddr_t gpa, size_t len, int prot,
void **cookie);
void vm_gpa_release(void *cookie);
int vm_gpabase2memseg(struct vm *vm, vm_paddr_t gpabase,
struct vm_memory_segment *seg);
int vm_get_memobj(struct vm *vm, vm_paddr_t gpa, size_t len,
vm_offset_t *offset, struct vm_object **object);
boolean_t vm_mem_allocated(struct vm *vm, vm_paddr_t gpa);
int vm_get_register(struct vm *vm, int vcpu, int reg, uint64_t *retval);
int vm_set_register(struct vm *vm, int vcpu, int reg, uint64_t val);
int vm_get_seg_desc(struct vm *vm, int vcpu, int reg,
@ -130,8 +136,9 @@ void *vm_iommu_domain(struct vm *vm);
enum vcpu_state {
VCPU_IDLE,
VCPU_FROZEN,
VCPU_RUNNING,
VCPU_CANNOT_RUN,
VCPU_SLEEPING,
};
int vcpu_set_state(struct vm *vm, int vcpu, enum vcpu_state state);
@ -145,7 +152,9 @@ vcpu_is_running(struct vm *vm, int vcpu, int *hostcpu)
void *vcpu_stats(struct vm *vm, int vcpu);
void vm_interrupt_hostcpu(struct vm *vm, int vcpu);
struct vmspace *vm_get_vmspace(struct vm *vm);
int vm_assign_pptdev(struct vm *vm, int bus, int slot, int func);
int vm_unassign_pptdev(struct vm *vm, int bus, int slot, int func);
#endif /* KERNEL */
#include <machine/vmm_instruction_emul.h>
@ -247,6 +256,7 @@ enum vm_exitcode {
VM_EXITCODE_MTRAP,
VM_EXITCODE_PAUSE,
VM_EXITCODE_PAGING,
VM_EXITCODE_INST_EMUL,
VM_EXITCODE_SPINUP_AP,
VM_EXITCODE_MAX
};
@ -266,8 +276,15 @@ struct vm_exit {
} inout;
struct {
uint64_t gpa;
struct vie vie;
int fault_type;
int protection;
} paging;
struct {
uint64_t gpa;
uint64_t gla;
uint64_t cr3;
struct vie vie;
} inst_emul;
/*
* VMX specific payload. Used when there is no "better"
* exitcode to represent the VM-exit.


@ -36,7 +36,8 @@ int vmmdev_cleanup(void);
struct vm_memory_segment {
vm_paddr_t gpa; /* in */
size_t len; /* in */
size_t len;
int wired;
};
struct vm_register {
@ -135,6 +136,12 @@ struct vm_x2apic {
enum x2apic_state state;
};
struct vm_gpa_pte {
uint64_t gpa; /* in */
uint64_t pte[4]; /* out */
int ptenum;
};
enum {
/* general routines */
IOCNUM_ABIVERS = 0,
@ -145,6 +152,7 @@ enum {
/* memory apis */
IOCNUM_MAP_MEMORY = 10,
IOCNUM_GET_MEMORY_SEG = 11,
IOCNUM_GET_GPA_PMAP = 12,
/* register/state accessors */
IOCNUM_SET_REGISTER = 20,
@ -215,4 +223,6 @@ enum {
_IOW('v', IOCNUM_SET_X2APIC_STATE, struct vm_x2apic)
#define VM_GET_X2APIC_STATE \
_IOWR('v', IOCNUM_GET_X2APIC_STATE, struct vm_x2apic)
#define VM_GET_GPA_PMAP \
_IOWR('v', IOCNUM_GET_GPA_PMAP, struct vm_gpa_pte)
#endif


@ -102,11 +102,15 @@ int vmm_emulate_instruction(void *vm, int cpuid, uint64_t gpa, struct vie *vie,
#ifdef _KERNEL
/*
* APIs to fetch and decode the instruction from nested page fault handler.
*
* 'vie' must be initialized before calling 'vmm_fetch_instruction()'
*/
int vmm_fetch_instruction(struct vm *vm, int cpuid,
uint64_t rip, int inst_length, uint64_t cr3,
struct vie *vie);
void vie_init(struct vie *vie);
/*
* Decode the instruction fetched into 'vie' so it can be emulated.
*


@ -54,7 +54,7 @@ amdv_cleanup(void)
}
static void *
amdv_vminit(struct vm *vm)
amdv_vminit(struct vm *vm, struct pmap *pmap)
{
printf("amdv_vminit: not implemented\n");
@ -62,7 +62,7 @@ amdv_vminit(struct vm *vm)
}
static int
amdv_vmrun(void *arg, int vcpu, register_t rip)
amdv_vmrun(void *arg, int vcpu, register_t rip, struct pmap *pmap)
{
printf("amdv_vmrun: not implemented\n");
@ -77,23 +77,6 @@ amdv_vmcleanup(void *arg)
return;
}
static int
amdv_vmmmap_set(void *arg, vm_paddr_t gpa, vm_paddr_t hpa, size_t length,
vm_memattr_t attr, int prot, boolean_t spok)
{
printf("amdv_vmmmap_set: not implemented\n");
return (EINVAL);
}
static vm_paddr_t
amdv_vmmmap_get(void *arg, vm_paddr_t gpa)
{
printf("amdv_vmmmap_get: not implemented\n");
return (EINVAL);
}
static int
amdv_getreg(void *arg, int vcpu, int regnum, uint64_t *retval)
{
@ -151,21 +134,37 @@ amdv_setcap(void *arg, int vcpu, int type, int val)
return (EINVAL);
}
static struct vmspace *
amdv_vmspace_alloc(vm_offset_t min, vm_offset_t max)
{
printf("amdv_vmspace_alloc: not implemented\n");
return (NULL);
}
static void
amdv_vmspace_free(struct vmspace *vmspace)
{
printf("amdv_vmspace_free: not implemented\n");
return;
}
struct vmm_ops vmm_ops_amd = {
amdv_init,
amdv_cleanup,
amdv_vminit,
amdv_vmrun,
amdv_vmcleanup,
amdv_vmmmap_set,
amdv_vmmmap_get,
amdv_getreg,
amdv_setreg,
amdv_getdesc,
amdv_setdesc,
amdv_inject_event,
amdv_getcap,
amdv_setcap
amdv_setcap,
amdv_vmspace_alloc,
amdv_vmspace_free,
};
static int


@ -29,32 +29,31 @@
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
#include <sys/param.h>
#include <sys/kernel.h>
#include <sys/types.h>
#include <sys/errno.h>
#include <sys/systm.h>
#include <sys/malloc.h>
#include <sys/smp.h>
#include <sys/sysctl.h>
#include <vm/vm.h>
#include <vm/pmap.h>
#include <machine/param.h>
#include <machine/cpufunc.h>
#include <machine/pmap.h>
#include <machine/vmparam.h>
#include <vm/vm_extern.h>
#include <machine/vmm.h>
#include "vmx_cpufunc.h"
#include "vmx_msr.h"
#include "vmx.h"
#include "ept.h"
#define EPT_SUPPORTS_EXEC_ONLY(cap) ((cap) & (1UL << 0))
#define EPT_PWL4(cap) ((cap) & (1UL << 6))
#define EPT_MEMORY_TYPE_WB(cap) ((cap) & (1UL << 14))
#define EPT_PDE_SUPERPAGE(cap) ((cap) & (1UL << 16)) /* 2MB pages */
#define EPT_PDPTE_SUPERPAGE(cap) ((cap) & (1UL << 17)) /* 1GB pages */
#define INVVPID_SUPPORTED(cap) ((cap) & (1UL << 32))
#define INVEPT_SUPPORTED(cap) ((cap) & (1UL << 20))
#define AD_BITS_SUPPORTED(cap) ((cap) & (1UL << 21))
#define INVVPID_SUPPORTED(cap) ((cap) & (1UL << 32))
#define INVVPID_ALL_TYPES_MASK 0xF0000000000UL
#define INVVPID_ALL_TYPES_SUPPORTED(cap) \
@ -64,28 +63,22 @@ __FBSDID("$FreeBSD$");
#define INVEPT_ALL_TYPES_SUPPORTED(cap) \
(((cap) & INVEPT_ALL_TYPES_MASK) == INVEPT_ALL_TYPES_MASK)
#define EPT_PG_RD (1 << 0)
#define EPT_PG_WR (1 << 1)
#define EPT_PG_EX (1 << 2)
#define EPT_PG_MEMORY_TYPE(x) ((x) << 3)
#define EPT_PG_IGNORE_PAT (1 << 6)
#define EPT_PG_SUPERPAGE (1 << 7)
#define EPT_PWLEVELS 4 /* page walk levels */
#define EPT_ENABLE_AD_BITS (1 << 6)
#define EPT_ADDR_MASK ((uint64_t)-1 << 12)
SYSCTL_DECL(_hw_vmm);
SYSCTL_NODE(_hw_vmm, OID_AUTO, ept, CTLFLAG_RW, NULL, NULL);
MALLOC_DECLARE(M_VMX);
static int ept_enable_ad_bits;
static uint64_t page_sizes_mask;
/*
* Set this to 1 to have the EPT tables respect the guest PAT settings
*/
static int ept_pat_passthru;
static int ept_pmap_flags;
SYSCTL_INT(_hw_vmm_ept, OID_AUTO, pmap_flags, CTLFLAG_RD,
&ept_pmap_flags, 0, NULL);
int
ept_init(void)
{
int page_shift;
int use_hw_ad_bits, use_superpages, use_exec_only;
uint64_t cap;
cap = rdmsr(MSR_VMX_EPT_VPID_CAP);
@ -105,17 +98,22 @@ ept_init(void)
!INVEPT_ALL_TYPES_SUPPORTED(cap))
return (EINVAL);
/* Set bits in 'page_sizes_mask' for each valid page size */
page_shift = PAGE_SHIFT;
page_sizes_mask = 1UL << page_shift; /* 4KB page */
use_superpages = 1;
TUNABLE_INT_FETCH("hw.vmm.ept.use_superpages", &use_superpages);
if (use_superpages && EPT_PDE_SUPERPAGE(cap))
ept_pmap_flags |= PMAP_PDE_SUPERPAGE; /* 2MB superpage */
page_shift += 9;
if (EPT_PDE_SUPERPAGE(cap))
page_sizes_mask |= 1UL << page_shift; /* 2MB superpage */
use_hw_ad_bits = 1;
TUNABLE_INT_FETCH("hw.vmm.ept.use_hw_ad_bits", &use_hw_ad_bits);
if (use_hw_ad_bits && AD_BITS_SUPPORTED(cap))
ept_enable_ad_bits = 1;
else
ept_pmap_flags |= PMAP_EMULATE_AD_BITS;
page_shift += 9;
if (EPT_PDPTE_SUPERPAGE(cap))
page_sizes_mask |= 1UL << page_shift; /* 1GB superpage */
use_exec_only = 1;
TUNABLE_INT_FETCH("hw.vmm.ept.use_exec_only", &use_exec_only);
if (use_exec_only && EPT_SUPPORTS_EXEC_ONLY(cap))
ept_pmap_flags |= PMAP_SUPPORTS_EXEC_ONLY;
return (0);
}
@ -154,233 +152,6 @@ ept_dump(uint64_t *ptp, int nlevels)
}
#endif
static size_t
ept_create_mapping(uint64_t *ptp, vm_paddr_t gpa, vm_paddr_t hpa, size_t length,
vm_memattr_t attr, vm_prot_t prot, boolean_t spok)
{
int spshift, ptpshift, ptpindex, nlevels;
/*
* Compute the size of the mapping that we can accomodate.
*
* This is based on three factors:
* - super page sizes supported by the processor
* - alignment of the region starting at 'gpa' and 'hpa'
* - length of the region 'len'
*/
spshift = PAGE_SHIFT;
if (spok)
spshift += (EPT_PWLEVELS - 1) * 9;
while (spshift >= PAGE_SHIFT) {
uint64_t spsize = 1UL << spshift;
if ((page_sizes_mask & spsize) != 0 &&
(gpa & (spsize - 1)) == 0 &&
(hpa & (spsize - 1)) == 0 &&
length >= spsize) {
break;
}
spshift -= 9;
}
if (spshift < PAGE_SHIFT) {
panic("Invalid spshift for gpa 0x%016lx, hpa 0x%016lx, "
"length 0x%016lx, page_sizes_mask 0x%016lx",
gpa, hpa, length, page_sizes_mask);
}
nlevels = EPT_PWLEVELS;
while (--nlevels >= 0) {
ptpshift = PAGE_SHIFT + nlevels * 9;
ptpindex = (gpa >> ptpshift) & 0x1FF;
/* We have reached the leaf mapping */
if (spshift >= ptpshift)
break;
/*
* We are working on a non-leaf page table page.
*
* Create the next level page table page if necessary and point
* to it from the current page table.
*/
if (ptp[ptpindex] == 0) {
void *nlp = malloc(PAGE_SIZE, M_VMX, M_WAITOK | M_ZERO);
ptp[ptpindex] = vtophys(nlp);
ptp[ptpindex] |= EPT_PG_RD | EPT_PG_WR | EPT_PG_EX;
}
/* Work our way down to the next level page table page */
ptp = (uint64_t *)PHYS_TO_DMAP(ptp[ptpindex] & EPT_ADDR_MASK);
}
if ((gpa & ((1UL << ptpshift) - 1)) != 0) {
panic("ept_create_mapping: gpa 0x%016lx and ptpshift %d "
"mismatch\n", gpa, ptpshift);
}
if (prot != VM_PROT_NONE) {
/* Do the mapping */
ptp[ptpindex] = hpa;
/* Apply the access controls */
if (prot & VM_PROT_READ)
ptp[ptpindex] |= EPT_PG_RD;
if (prot & VM_PROT_WRITE)
ptp[ptpindex] |= EPT_PG_WR;
if (prot & VM_PROT_EXECUTE)
ptp[ptpindex] |= EPT_PG_EX;
/*
* By default the PAT type is ignored - this appears to
* be how other hypervisors handle EPT. Allow this to be
* overridden.
*/
ptp[ptpindex] |= EPT_PG_MEMORY_TYPE(attr);
if (!ept_pat_passthru)
ptp[ptpindex] |= EPT_PG_IGNORE_PAT;
if (nlevels > 0)
ptp[ptpindex] |= EPT_PG_SUPERPAGE;
} else {
/* Remove the mapping */
ptp[ptpindex] = 0;
}
return (1UL << ptpshift);
}
static vm_paddr_t
ept_lookup_mapping(uint64_t *ptp, vm_paddr_t gpa)
{
int nlevels, ptpshift, ptpindex;
uint64_t ptpval, hpabase, pgmask;
nlevels = EPT_PWLEVELS;
while (--nlevels >= 0) {
ptpshift = PAGE_SHIFT + nlevels * 9;
ptpindex = (gpa >> ptpshift) & 0x1FF;
ptpval = ptp[ptpindex];
/* Cannot make progress beyond this point */
if ((ptpval & (EPT_PG_RD | EPT_PG_WR | EPT_PG_EX)) == 0)
break;
if (nlevels == 0 || (ptpval & EPT_PG_SUPERPAGE)) {
pgmask = (1UL << ptpshift) - 1;
hpabase = ptpval & ~pgmask;
return (hpabase | (gpa & pgmask));
}
/* Work our way down to the next level page table page */
ptp = (uint64_t *)PHYS_TO_DMAP(ptpval & EPT_ADDR_MASK);
}
return ((vm_paddr_t)-1);
}
static void
ept_free_pt_entry(pt_entry_t pte)
{
if (pte == 0)
return;
/* sanity check */
if ((pte & EPT_PG_SUPERPAGE) != 0)
panic("ept_free_pt_entry: pte cannot have superpage bit");
return;
}
static void
ept_free_pd_entry(pd_entry_t pde)
{
pt_entry_t *pt;
int i;
if (pde == 0)
return;
if ((pde & EPT_PG_SUPERPAGE) == 0) {
pt = (pt_entry_t *)PHYS_TO_DMAP(pde & EPT_ADDR_MASK);
for (i = 0; i < NPTEPG; i++)
ept_free_pt_entry(pt[i]);
free(pt, M_VMX); /* free the page table page */
}
}
static void
ept_free_pdp_entry(pdp_entry_t pdpe)
{
pd_entry_t *pd;
int i;
if (pdpe == 0)
return;
if ((pdpe & EPT_PG_SUPERPAGE) == 0) {
pd = (pd_entry_t *)PHYS_TO_DMAP(pdpe & EPT_ADDR_MASK);
for (i = 0; i < NPDEPG; i++)
ept_free_pd_entry(pd[i]);
free(pd, M_VMX); /* free the page directory page */
}
}
static void
ept_free_pml4_entry(pml4_entry_t pml4e)
{
pdp_entry_t *pdp;
int i;
if (pml4e == 0)
return;
if ((pml4e & EPT_PG_SUPERPAGE) == 0) {
pdp = (pdp_entry_t *)PHYS_TO_DMAP(pml4e & EPT_ADDR_MASK);
for (i = 0; i < NPDPEPG; i++)
ept_free_pdp_entry(pdp[i]);
free(pdp, M_VMX); /* free the page directory ptr page */
}
}
void
ept_vmcleanup(struct vmx *vmx)
{
int i;
for (i = 0; i < NPML4EPG; i++)
ept_free_pml4_entry(vmx->pml4ept[i]);
}
int
ept_vmmmap_set(void *arg, vm_paddr_t gpa, vm_paddr_t hpa, size_t len,
vm_memattr_t attr, int prot, boolean_t spok)
{
size_t n;
struct vmx *vmx = arg;
while (len > 0) {
n = ept_create_mapping(vmx->pml4ept, gpa, hpa, len, attr,
prot, spok);
len -= n;
gpa += n;
hpa += n;
}
return (0);
}
vm_paddr_t
ept_vmmmap_get(void *arg, vm_paddr_t gpa)
{
vm_paddr_t hpa;
struct vmx *vmx;
vmx = arg;
hpa = ept_lookup_mapping(vmx->pml4ept, gpa);
return (hpa);
}
static void
invept_single_context(void *arg)
{
@ -390,11 +161,44 @@ invept_single_context(void *arg)
}
void
ept_invalidate_mappings(u_long pml4ept)
ept_invalidate_mappings(u_long eptp)
{
struct invept_desc invept_desc = { 0 };
invept_desc.eptp = EPTP(pml4ept);
invept_desc.eptp = eptp;
smp_rendezvous(NULL, invept_single_context, NULL, &invept_desc);
}
static int
ept_pinit(pmap_t pmap)
{
return (pmap_pinit_type(pmap, PT_EPT, ept_pmap_flags));
}
struct vmspace *
ept_vmspace_alloc(vm_offset_t min, vm_offset_t max)
{
return (vmspace_alloc(min, max, ept_pinit));
}
void
ept_vmspace_free(struct vmspace *vmspace)
{
vmspace_free(vmspace);
}
uint64_t
eptp(uint64_t pml4)
{
uint64_t eptp_val;
eptp_val = pml4 | (EPT_PWLEVELS - 1) << 3 | PAT_WRITE_BACK;
if (ept_enable_ad_bits)
eptp_val |= EPT_ENABLE_AD_BITS;
return (eptp_val);
}


@ -31,13 +31,9 @@
struct vmx;
#define EPT_PWLEVELS 4 /* page walk levels */
#define EPTP(pml4) ((pml4) | (EPT_PWLEVELS - 1) << 3 | PAT_WRITE_BACK)
int ept_init(void);
int ept_vmmmap_set(void *arg, vm_paddr_t gpa, vm_paddr_t hpa, size_t length,
vm_memattr_t attr, int prot, boolean_t allow_superpage_mappings);
vm_paddr_t ept_vmmmap_get(void *arg, vm_paddr_t gpa);
void ept_invalidate_mappings(u_long ept_pml4);
void ept_vmcleanup(struct vmx *vmx);
void ept_invalidate_mappings(u_long eptp);
struct vmspace *ept_vmspace_alloc(vm_offset_t min, vm_offset_t max);
void ept_vmspace_free(struct vmspace *vmspace);
uint64_t eptp(uint64_t pml4);
#endif


@ -318,14 +318,14 @@ vmcs_set_msr_save(struct vmcs *vmcs, u_long g_area, u_int g_count)
int
vmcs_set_defaults(struct vmcs *vmcs,
u_long host_rip, u_long host_rsp, u_long ept_pml4,
u_long host_rip, u_long host_rsp, uint64_t eptp,
uint32_t pinbased_ctls, uint32_t procbased_ctls,
uint32_t procbased_ctls2, uint32_t exit_ctls,
uint32_t entry_ctls, u_long msr_bitmap, uint16_t vpid)
{
int error, codesel, datasel, tsssel;
u_long cr0, cr4, efer;
uint64_t eptp, pat, fsbase, idtrbase;
uint64_t pat, fsbase, idtrbase;
uint32_t exc_bitmap;
codesel = vmm_get_host_codesel();
@ -432,7 +432,6 @@ vmcs_set_defaults(struct vmcs *vmcs,
goto done;
/* eptp */
eptp = EPTP(ept_pml4);
if ((error = vmwrite(VMCS_EPTP, eptp)) != 0)
goto done;


@ -47,7 +47,7 @@ struct msr_entry {
int vmcs_set_msr_save(struct vmcs *vmcs, u_long g_area, u_int g_count);
int vmcs_set_defaults(struct vmcs *vmcs, u_long host_rip, u_long host_rsp,
u_long ept_pml4,
uint64_t eptp,
uint32_t pinbased_ctls, uint32_t procbased_ctls,
uint32_t procbased_ctls2, uint32_t exit_ctls,
uint32_t entry_ctls, u_long msr_bitmap,
@ -68,6 +68,8 @@ uint64_t vmcs_read(uint32_t encoding);
#define vmcs_guest_cr3() vmcs_read(VMCS_GUEST_CR3)
#define vmcs_gpa() vmcs_read(VMCS_GUEST_PHYSICAL_ADDRESS)
#define vmcs_gla() vmcs_read(VMCS_GUEST_LINEAR_ADDRESS)
#define vmcs_idt_vectoring_info() vmcs_read(VMCS_IDT_VECTORING_INFO)
#define vmcs_idt_vectoring_err() vmcs_read(VMCS_IDT_VECTORING_ERROR)
#endif /* _KERNEL */
@ -313,6 +315,12 @@ uint64_t vmcs_read(uint32_t encoding);
#define VMCS_INTERRUPTION_INFO_HW_INTR (0 << 8)
#define VMCS_INTERRUPTION_INFO_NMI (2 << 8)
/*
* VMCS IDT-Vectoring information fields
*/
#define VMCS_IDT_VEC_VALID (1 << 31)
#define VMCS_IDT_VEC_ERRCODE_VALID (1 << 11)
/*
* VMCS Guest interruptibility field
*/
@ -332,6 +340,9 @@ uint64_t vmcs_read(uint32_t encoding);
#define EPT_VIOLATION_DATA_READ (1UL << 0)
#define EPT_VIOLATION_DATA_WRITE (1UL << 1)
#define EPT_VIOLATION_INST_FETCH (1UL << 2)
#define EPT_VIOLATION_GPA_READABLE (1UL << 3)
#define EPT_VIOLATION_GPA_WRITEABLE (1UL << 4)
#define EPT_VIOLATION_GPA_EXECUTABLE (1UL << 5)
#define EPT_VIOLATION_GLA_VALID (1UL << 7)
#define EPT_VIOLATION_XLAT_VALID (1UL << 8)


@ -49,8 +49,6 @@ __FBSDID("$FreeBSD$");
#include <machine/specialreg.h>
#include <machine/vmparam.h>
#include <x86/apicreg.h>
#include <machine/vmm.h>
#include "vmm_host.h"
#include "vmm_lapic.h"
@ -167,9 +165,6 @@ static int cap_pause_exit;
static int cap_unrestricted_guest;
static int cap_monitor_trap;
/* statistics */
static VMM_STAT_INTEL(VMEXIT_HLT_IGNORED, "number of times hlt was ignored");
static struct unrhdr *vpid_unr;
static u_int vpid_alloc_failed;
SYSCTL_UINT(_hw_vmm_vmx, OID_AUTO, vpid_alloc_failed, CTLFLAG_RD,
@ -740,7 +735,7 @@ vmx_setup_cr_shadow(int which, struct vmcs *vmcs, uint32_t initial)
#define vmx_setup_cr4_shadow(vmcs,init) vmx_setup_cr_shadow(4, (vmcs), (init))
static void *
vmx_vminit(struct vm *vm)
vmx_vminit(struct vm *vm, pmap_t pmap)
{
uint16_t vpid[VM_MAXCPU];
int i, error, guest_msr_count;
@ -753,6 +748,8 @@ vmx_vminit(struct vm *vm)
}
vmx->vm = vm;
vmx->eptp = eptp(vtophys((vm_offset_t)pmap->pm_pml4));
/*
* Clean up EPTP-tagged guest physical and combined mappings
*
@ -762,7 +759,7 @@ vmx_vminit(struct vm *vm)
*
* Combined mappings for this EP4TA are also invalidated for all VPIDs.
*/
ept_invalidate_mappings(vtophys(vmx->pml4ept));
ept_invalidate_mappings(vmx->eptp);
msr_bitmap_initialize(vmx->msr_bitmap);
@ -818,7 +815,7 @@ vmx_vminit(struct vm *vm)
error = vmcs_set_defaults(&vmx->vmcs[i],
(u_long)vmx_longjmp,
(u_long)&vmx->ctx[i],
vtophys(vmx->pml4ept),
vmx->eptp,
pinbased_ctls,
procbased_ctls,
procbased_ctls2,
@ -856,6 +853,9 @@ vmx_vminit(struct vm *vm)
error = vmx_setup_cr4_shadow(&vmx->vmcs[i], 0);
if (error != 0)
panic("vmx_setup_cr4_shadow %d", error);
vmx->ctx[i].pmap = pmap;
vmx->ctx[i].eptp = vmx->eptp;
}
return (vmx);
@ -1281,21 +1281,49 @@ vmx_emulate_cr_access(struct vmx *vmx, int vcpu, uint64_t exitqual)
}
static int
vmx_ept_fault(struct vm *vm, int cpu,
uint64_t gla, uint64_t gpa, uint64_t rip, int inst_length,
uint64_t cr3, uint64_t ept_qual, struct vie *vie)
ept_fault_type(uint64_t ept_qual)
{
int read, write, error;
int fault_type;
/* EPT violation on an instruction fetch doesn't make sense here */
if (ept_qual & EPT_VIOLATION_DATA_WRITE)
fault_type = VM_PROT_WRITE;
else if (ept_qual & EPT_VIOLATION_INST_FETCH)
fault_type = VM_PROT_EXECUTE;
else
fault_type= VM_PROT_READ;
return (fault_type);
}
static int
ept_protection(uint64_t ept_qual)
{
int prot = 0;
if (ept_qual & EPT_VIOLATION_GPA_READABLE)
prot |= VM_PROT_READ;
if (ept_qual & EPT_VIOLATION_GPA_WRITEABLE)
prot |= VM_PROT_WRITE;
if (ept_qual & EPT_VIOLATION_GPA_EXECUTABLE)
prot |= VM_PROT_EXECUTE;
return (prot);
}
static boolean_t
ept_emulation_fault(uint64_t ept_qual)
{
int read, write;
/* EPT fault on an instruction fetch doesn't make sense here */
if (ept_qual & EPT_VIOLATION_INST_FETCH)
return (UNHANDLED);
return (FALSE);
/* EPT violation must be a read fault or a write fault */
/* EPT fault must be a read fault or a write fault */
read = ept_qual & EPT_VIOLATION_DATA_READ ? 1 : 0;
write = ept_qual & EPT_VIOLATION_DATA_WRITE ? 1 : 0;
if ((read | write) == 0)
return (UNHANDLED);
return (FALSE);
/*
* The EPT violation must have been caused by accessing a
@ -1304,26 +1332,10 @@ vmx_ept_fault(struct vm *vm, int cpu,
*/
if ((ept_qual & EPT_VIOLATION_GLA_VALID) == 0 ||
(ept_qual & EPT_VIOLATION_XLAT_VALID) == 0) {
return (UNHANDLED);
return (FALSE);
}
/* Fetch, decode and emulate the faulting instruction */
if (vmm_fetch_instruction(vm, cpu, rip, inst_length, cr3, vie) != 0)
return (UNHANDLED);
if (vmm_decode_instruction(vm, cpu, gla, vie) != 0)
return (UNHANDLED);
/*
* Check if this is a local apic access
*/
if (gpa < DEFAULT_APIC_BASE || gpa >= DEFAULT_APIC_BASE + PAGE_SIZE)
return (UNHANDLED);
error = vmm_emulate_instruction(vm, cpu, gpa, vie,
lapic_mmio_read, lapic_mmio_write, 0);
return (error ? UNHANDLED : HANDLED);
return (TRUE);
}
static int
@ -1332,18 +1344,47 @@ vmx_exit_process(struct vmx *vmx, int vcpu, struct vm_exit *vmexit)
int error, handled;
struct vmcs *vmcs;
struct vmxctx *vmxctx;
uint32_t eax, ecx, edx;
uint64_t qual, gla, gpa, cr3, intr_info;
uint32_t eax, ecx, edx, idtvec_info, idtvec_err, reason;
uint64_t qual, gpa;
handled = 0;
vmcs = &vmx->vmcs[vcpu];
vmxctx = &vmx->ctx[vcpu];
qual = vmexit->u.vmx.exit_qualification;
reason = vmexit->u.vmx.exit_reason;
vmexit->exitcode = VM_EXITCODE_BOGUS;
vmm_stat_incr(vmx->vm, vcpu, VMEXIT_COUNT, 1);
switch (vmexit->u.vmx.exit_reason) {
/*
* VM exits that could be triggered during event injection on the
* previous VM entry need to be handled specially by re-injecting
* the event.
*
* See "Information for VM Exits During Event Delivery" in Intel SDM
* for details.
*/
switch (reason) {
case EXIT_REASON_EPT_FAULT:
case EXIT_REASON_EPT_MISCONFIG:
case EXIT_REASON_APIC:
case EXIT_REASON_TASK_SWITCH:
case EXIT_REASON_EXCEPTION:
idtvec_info = vmcs_idt_vectoring_info();
if (idtvec_info & VMCS_IDT_VEC_VALID) {
idtvec_info &= ~(1 << 12); /* clear undefined bit */
vmwrite(VMCS_ENTRY_INTR_INFO, idtvec_info);
if (idtvec_info & VMCS_IDT_VEC_ERRCODE_VALID) {
idtvec_err = vmcs_idt_vectoring_err();
vmwrite(VMCS_ENTRY_EXCEPTION_ERROR, idtvec_err);
}
vmwrite(VMCS_ENTRY_INST_LENGTH, vmexit->inst_length);
}
default:
break;
}
switch (reason) {
case EXIT_REASON_CR_ACCESS:
vmm_stat_incr(vmx->vm, vcpu, VMEXIT_CR_ACCESS, 1);
handled = vmx_emulate_cr_access(vmx, vcpu, qual);
@ -1374,19 +1415,7 @@ vmx_exit_process(struct vmx *vmx, int vcpu, struct vm_exit *vmexit)
break;
case EXIT_REASON_HLT:
vmm_stat_incr(vmx->vm, vcpu, VMEXIT_HLT, 1);
/*
* If there is an event waiting to be injected then there is
* no need to 'hlt'.
*/
error = vmread(VMCS_ENTRY_INTR_INFO, &intr_info);
if (error)
panic("vmx_exit_process: vmread(intrinfo) %d", error);
if (intr_info & VMCS_INTERRUPTION_INFO_VALID) {
handled = 1;
vmm_stat_incr(vmx->vm, vcpu, VMEXIT_HLT_IGNORED, 1);
} else
vmexit->exitcode = VM_EXITCODE_HLT;
vmexit->exitcode = VM_EXITCODE_HLT;
break;
case EXIT_REASON_MTF:
vmm_stat_incr(vmx->vm, vcpu, VMEXIT_MTRAP, 1);
@ -1440,15 +1469,22 @@ vmx_exit_process(struct vmx *vmx, int vcpu, struct vm_exit *vmexit)
break;
case EXIT_REASON_EPT_FAULT:
vmm_stat_incr(vmx->vm, vcpu, VMEXIT_EPT_FAULT, 1);
gla = vmcs_gla();
/*
* If 'gpa' lies within the address space allocated to
* memory then this must be a nested page fault otherwise
* this must be an instruction that accesses MMIO space.
*/
gpa = vmcs_gpa();
cr3 = vmcs_guest_cr3();
handled = vmx_ept_fault(vmx->vm, vcpu, gla, gpa,
vmexit->rip, vmexit->inst_length,
cr3, qual, &vmexit->u.paging.vie);
if (!handled) {
if (vm_mem_allocated(vmx->vm, gpa)) {
vmexit->exitcode = VM_EXITCODE_PAGING;
vmexit->u.paging.gpa = gpa;
vmexit->u.paging.fault_type = ept_fault_type(qual);
vmexit->u.paging.protection = ept_protection(qual);
} else if (ept_emulation_fault(qual)) {
vmexit->exitcode = VM_EXITCODE_INST_EMUL;
vmexit->u.inst_emul.gpa = gpa;
vmexit->u.inst_emul.gla = vmcs_gla();
vmexit->u.inst_emul.cr3 = vmcs_guest_cr3();
}
break;
default:
@ -1470,14 +1506,6 @@ vmx_exit_process(struct vmx *vmx, int vcpu, struct vm_exit *vmexit)
vm_exit_update_rip(vmexit);
vmexit->rip += vmexit->inst_length;
vmexit->inst_length = 0;
/*
* Special case for spinning up an AP - exit to userspace to
* give the controlling process a chance to intercept and
* spin up a thread for the AP.
*/
if (vmexit->exitcode == VM_EXITCODE_SPINUP_AP)
handled = 0;
} else {
if (vmexit->exitcode == VM_EXITCODE_BOGUS) {
/*
@ -1497,7 +1525,7 @@ vmx_exit_process(struct vmx *vmx, int vcpu, struct vm_exit *vmexit)
}
static int
vmx_run(void *arg, int vcpu, register_t rip)
vmx_run(void *arg, int vcpu, register_t rip, pmap_t pmap)
{
int error, vie, rc, handled, astpending;
uint32_t exit_reason;
@ -1505,7 +1533,7 @@ vmx_run(void *arg, int vcpu, register_t rip)
struct vmxctx *vmxctx;
struct vmcs *vmcs;
struct vm_exit *vmexit;
vmx = arg;
vmcs = &vmx->vmcs[vcpu];
vmxctx = &vmx->ctx[vcpu];
@ -1514,6 +1542,11 @@ vmx_run(void *arg, int vcpu, register_t rip)
astpending = 0;
vmexit = vm_exitinfo(vmx->vm, vcpu);
KASSERT(vmxctx->pmap == pmap,
("pmap %p different than ctx pmap %p", pmap, vmxctx->pmap));
KASSERT(vmxctx->eptp == vmx->eptp,
("eptp %p different than ctx eptp %#lx", eptp, vmxctx->eptp));
/*
* XXX Can we avoid doing this every time we do a vm run?
*/
@ -1576,6 +1609,9 @@ vmx_run(void *arg, int vcpu, register_t rip)
vmxctx->launch_error, vie);
#endif
goto err_exit;
case VMX_RETURN_INVEPT:
panic("vm %s:%d invept error %d",
vm_name(vmx->vm), vcpu, vmxctx->launch_error);
default:
panic("vmx_setjmp returned %d", rc);
}
@ -1654,7 +1690,6 @@ vmx_vmcleanup(void *arg)
if (error != 0)
panic("vmx_vmcleanup: vmclear error %d on vcpu 0", error);
ept_vmcleanup(vmx);
free(vmx, M_VMX);
return;
@ -2000,13 +2035,13 @@ struct vmm_ops vmm_ops_intel = {
vmx_vminit,
vmx_run,
vmx_vmcleanup,
ept_vmmmap_set,
ept_vmmmap_get,
vmx_getreg,
vmx_setreg,
vmx_getdesc,
vmx_setdesc,
vmx_inject,
vmx_getcap,
vmx_setcap
vmx_setcap,
ept_vmspace_alloc,
ept_vmspace_free,
};


@ -31,6 +31,8 @@
#include "vmcs.h"
struct pmap;
#define GUEST_MSR_MAX_ENTRIES 64 /* arbitrary */
struct vmxctx {
@ -68,6 +70,15 @@ struct vmxctx {
int launched; /* vmcs launch state */
int launch_error;
long eptgen[MAXCPU]; /* cached pmap->pm_eptgen */
/*
* The 'eptp' and the 'pmap' do not change during the lifetime of
* the VM so it is safe to keep a copy in each vcpu's vmxctx.
*/
vm_paddr_t eptp;
struct pmap *pmap;
};
struct vmxcap {
@ -82,16 +93,15 @@ struct vmxstate {
/* virtual machine softc */
struct vmx {
pml4_entry_t pml4ept[NPML4EPG];
struct vmcs vmcs[VM_MAXCPU]; /* one vmcs per virtual cpu */
char msr_bitmap[PAGE_SIZE];
struct msr_entry guest_msrs[VM_MAXCPU][GUEST_MSR_MAX_ENTRIES];
struct vmxctx ctx[VM_MAXCPU];
struct vmxcap cap[VM_MAXCPU];
struct vmxstate state[VM_MAXCPU];
uint64_t eptp;
struct vm *vm;
};
CTASSERT((offsetof(struct vmx, pml4ept) & PAGE_MASK) == 0);
CTASSERT((offsetof(struct vmx, vmcs) & PAGE_MASK) == 0);
CTASSERT((offsetof(struct vmx, msr_bitmap) & PAGE_MASK) == 0);
CTASSERT((offsetof(struct vmx, guest_msrs) & 15) == 0);
@ -101,6 +111,7 @@ CTASSERT((offsetof(struct vmx, guest_msrs) & 15) == 0);
#define VMX_RETURN_VMRESUME 2
#define VMX_RETURN_VMLAUNCH 3
#define VMX_RETURN_AST 4
#define VMX_RETURN_INVEPT 5
/*
* vmx_setjmp() returns:
* - 0 when it returns directly
@ -108,6 +119,7 @@ CTASSERT((offsetof(struct vmx, guest_msrs) & 15) == 0);
* - 2 when it returns from vmx_resume (which would only be in the error case)
* - 3 when it returns from vmx_launch (which would only be in the error case)
* - 4 when it returns from vmx_resume or vmx_launch because of AST pending
* - 5 when it returns from vmx_launch/vmx_resume because of invept error
*/
int vmx_setjmp(struct vmxctx *ctx);
void vmx_longjmp(void); /* returns via vmx_setjmp */


@ -72,6 +72,10 @@ ASSYM(VMXCTX_HOST_RBX, offsetof(struct vmxctx, host_rbx));
ASSYM(VMXCTX_HOST_RIP, offsetof(struct vmxctx, host_rip));
ASSYM(VMXCTX_LAUNCH_ERROR, offsetof(struct vmxctx, launch_error));
ASSYM(VMXCTX_EPTGEN, offsetof(struct vmxctx, eptgen));
ASSYM(VMXCTX_PMAP, offsetof(struct vmxctx, pmap));
ASSYM(VMXCTX_EPTP, offsetof(struct vmxctx, eptp));
ASSYM(VM_SUCCESS, VM_SUCCESS);
ASSYM(VM_FAIL_INVALID, VM_FAIL_INVALID);
@ -82,8 +86,13 @@ ASSYM(VMX_RETURN_LONGJMP, VMX_RETURN_LONGJMP);
ASSYM(VMX_RETURN_VMRESUME, VMX_RETURN_VMRESUME);
ASSYM(VMX_RETURN_VMLAUNCH, VMX_RETURN_VMLAUNCH);
ASSYM(VMX_RETURN_AST, VMX_RETURN_AST);
ASSYM(VMX_RETURN_INVEPT, VMX_RETURN_INVEPT);
ASSYM(TDF_ASTPENDING, TDF_ASTPENDING);
ASSYM(TDF_NEEDRESCHED, TDF_NEEDRESCHED);
ASSYM(TD_FLAGS, offsetof(struct thread, td_flags));
ASSYM(PC_CURTHREAD, offsetof(struct pcpu, pc_curthread));
ASSYM(PC_CPUID, offsetof(struct pcpu, pc_cpuid));
ASSYM(PM_ACTIVE, offsetof(struct pmap, pm_active));
ASSYM(PM_EPTGEN, offsetof(struct pmap, pm_eptgen));


@ -30,6 +30,12 @@
#include "vmx_assym.s"
#ifdef SMP
#define LK lock ;
#else
#define LK
#endif
/*
* Disable interrupts before updating %rsp in VMX_CHECK_AST or
* VMX_GUEST_RESTORE.
@ -86,15 +92,73 @@
movq VMXCTX_GUEST_R15(%rdi),%r15; \
movq VMXCTX_GUEST_RDI(%rdi),%rdi; /* restore rdi the last */
#define VM_INSTRUCTION_ERROR(reg) \
/*
* Check for an error after executing a VMX instruction.
* 'errreg' will be zero on success and non-zero otherwise.
* 'ctxreg' points to the 'struct vmxctx' associated with the vcpu.
*/
#define VM_INSTRUCTION_ERROR(errreg, ctxreg) \
jnc 1f; \
movl $VM_FAIL_INVALID,reg; /* CF is set */ \
movl $VM_FAIL_INVALID,errreg; /* CF is set */ \
jmp 3f; \
1: jnz 2f; \
movl $VM_FAIL_VALID,reg; /* ZF is set */ \
movl $VM_FAIL_VALID,errreg; /* ZF is set */ \
jmp 3f; \
2: movl $VM_SUCCESS,reg; \
3: movl reg,VMXCTX_LAUNCH_ERROR(%rsp)
2: movl $VM_SUCCESS,errreg; \
3: movl errreg,VMXCTX_LAUNCH_ERROR(ctxreg)
/*
* set or clear the appropriate bit in 'pm_active'
* %rdi = vmxctx
* %rax, %r11 = scratch registers
*/
#define VMX_SET_PM_ACTIVE \
movq VMXCTX_PMAP(%rdi), %r11; \
movl PCPU(CPUID), %eax; \
LK btsl %eax, PM_ACTIVE(%r11)
#define VMX_CLEAR_PM_ACTIVE \
movq VMXCTX_PMAP(%rdi), %r11; \
movl PCPU(CPUID), %eax; \
LK btrl %eax, PM_ACTIVE(%r11)
/*
* If 'vmxctx->eptgen[curcpu]' is not identical to 'pmap->pm_eptgen'
* then we must invalidate all mappings associated with this eptp.
*
* %rdi = vmxctx
* %rax, %rbx, %r11 = scratch registers
*/
#define VMX_CHECK_EPTGEN \
movl PCPU(CPUID), %ebx; \
movq VMXCTX_PMAP(%rdi), %r11; \
movq PM_EPTGEN(%r11), %rax; \
cmpq %rax, VMXCTX_EPTGEN(%rdi, %rbx, 8); \
je 9f; \
\
/* Refresh 'vmxctx->eptgen[curcpu]' */ \
movq %rax, VMXCTX_EPTGEN(%rdi, %rbx, 8); \
\
/* Setup the invept descriptor at the top of tmpstk */ \
mov %rdi, %r11; \
addq $VMXCTX_TMPSTKTOP, %r11; \
movq VMXCTX_EPTP(%rdi), %rax; \
movq %rax, -16(%r11); \
movq $0x0, -8(%r11); \
mov $0x1, %eax; /* Single context invalidate */ \
invept -16(%r11), %rax; \
\
/* Check for invept error */ \
VM_INSTRUCTION_ERROR(%eax, %rdi); \
testl %eax, %eax; \
jz 9f; \
\
/* Return via vmx_setjmp with retval of VMX_RETURN_INVEPT */ \
movq $VMX_RETURN_INVEPT, %rsi; \
movq %rdi,%rsp; \
addq $VMXCTX_TMPSTKTOP, %rsp; \
callq vmx_return; \
9: ;
.text
/*
@ -129,6 +193,9 @@ END(vmx_setjmp)
* Return to vmm context through vmx_setjmp() with a value of 'retval'.
*/
ENTRY(vmx_return)
/* The pmap is no longer active on the host cpu */
VMX_CLEAR_PM_ACTIVE
/* Restore host context. */
movq VMXCTX_HOST_R15(%rdi),%r15
movq VMXCTX_HOST_R14(%rdi),%r14
@ -193,6 +260,10 @@ ENTRY(vmx_resume)
VMX_CHECK_AST
VMX_SET_PM_ACTIVE /* This vcpu is now active on the host cpu */
VMX_CHECK_EPTGEN /* Check if we have to invalidate TLB */
/*
* Restore guest state that is not automatically loaded from the vmcs.
*/
@ -203,7 +274,7 @@ ENTRY(vmx_resume)
/*
* Capture the reason why vmresume failed.
*/
VM_INSTRUCTION_ERROR(%eax)
VM_INSTRUCTION_ERROR(%eax, %rsp)
/* Return via vmx_setjmp with return value of VMX_RETURN_VMRESUME */
movq %rsp,%rdi
@ -225,6 +296,10 @@ ENTRY(vmx_launch)
VMX_CHECK_AST
VMX_SET_PM_ACTIVE /* This vcpu is now active on the host cpu */
VMX_CHECK_EPTGEN /* Check if we have to invalidate TLB */
/*
* Restore guest state that is not automatically loaded from the vmcs.
*/
@ -235,7 +310,7 @@ ENTRY(vmx_launch)
/*
* Capture the reason why vmlaunch failed.
*/
VM_INSTRUCTION_ERROR(%eax)
VM_INSTRUCTION_ERROR(%eax, %rsp)
/* Return via vmx_setjmp with return value of VMX_RETURN_VMLAUNCH */
movq %rsp,%rdi


@ -281,6 +281,43 @@ ppt_teardown_msix(struct pptdev *ppt)
ppt->msix.num_msgs = 0;
}
int
ppt_num_devices(struct vm *vm)
{
int i, num;
num = 0;
for (i = 0; i < num_pptdevs; i++) {
if (pptdevs[i].vm == vm)
num++;
}
return (num);
}
boolean_t
ppt_is_mmio(struct vm *vm, vm_paddr_t gpa)
{
int i, n;
struct pptdev *ppt;
struct vm_memory_segment *seg;
for (n = 0; n < num_pptdevs; n++) {
ppt = &pptdevs[n];
if (ppt->vm != vm)
continue;
for (i = 0; i < MAX_MMIOSEGS; i++) {
seg = &ppt->mmio[i];
if (seg->len == 0)
continue;
if (gpa >= seg->gpa && gpa < seg->gpa + seg->len)
return (TRUE);
}
}
return (FALSE);
}
int
ppt_assign_device(struct vm *vm, int bus, int slot, int func)
{
@ -336,7 +373,7 @@ ppt_unassign_all(struct vm *vm)
bus = pci_get_bus(dev);
slot = pci_get_slot(dev);
func = pci_get_function(dev);
ppt_unassign_device(vm, bus, slot, func);
vm_unassign_pptdev(vm, bus, slot, func);
}
}
@ -591,10 +628,3 @@ ppt_setup_msix(struct vm *vm, int vcpu, int bus, int slot, int func,
return (0);
}
int
ppt_num_devices(void)
{
return (num_pptdevs);
}


@ -29,14 +29,20 @@
#ifndef _IO_PPT_H_
#define _IO_PPT_H_
int ppt_assign_device(struct vm *vm, int bus, int slot, int func);
int ppt_unassign_device(struct vm *vm, int bus, int slot, int func);
int ppt_unassign_all(struct vm *vm);
int ppt_map_mmio(struct vm *vm, int bus, int slot, int func,
vm_paddr_t gpa, size_t len, vm_paddr_t hpa);
int ppt_setup_msi(struct vm *vm, int vcpu, int bus, int slot, int func,
int destcpu, int vector, int numvec);
int ppt_setup_msix(struct vm *vm, int vcpu, int bus, int slot, int func,
int idx, uint32_t msg, uint32_t vector_control, uint64_t addr);
int ppt_num_devices(void);
int idx, uint32_t msg, uint32_t vector_control, uint64_t addr);
int ppt_num_devices(struct vm *vm);
boolean_t ppt_is_mmio(struct vm *vm, vm_paddr_t gpa);
/*
* The following functions should never be called directly.
* Use 'vm_assign_pptdev()' and 'vm_unassign_pptdev()' instead.
*/
int ppt_assign_device(struct vm *vm, int bus, int slot, int func);
int ppt_unassign_device(struct vm *vm, int bus, int slot, int func);
#endif


@ -39,18 +39,28 @@ __FBSDID("$FreeBSD$");
#include <sys/lock.h>
#include <sys/mutex.h>
#include <sys/proc.h>
#include <sys/rwlock.h>
#include <sys/sched.h>
#include <sys/smp.h>
#include <sys/systm.h>
#include <vm/vm.h>
#include <vm/vm_object.h>
#include <vm/vm_page.h>
#include <vm/pmap.h>
#include <vm/vm_map.h>
#include <vm/vm_extern.h>
#include <vm/vm_param.h>
#include <machine/vm.h>
#include <machine/pcb.h>
#include <machine/smp.h>
#include <x86/apicreg.h>
#include <machine/pmap.h>
#include <machine/vmparam.h>
#include <machine/vmm.h>
#include "vmm_ktr.h"
#include "vmm_host.h"
#include "vmm_mem.h"
#include "vmm_util.h"
@ -84,15 +94,23 @@ struct vcpu {
#define vcpu_lock_init(v) mtx_init(&((v)->mtx), "vcpu lock", 0, MTX_SPIN)
#define vcpu_lock(v) mtx_lock_spin(&((v)->mtx))
#define vcpu_unlock(v) mtx_unlock_spin(&((v)->mtx))
#define vcpu_assert_locked(v) mtx_assert(&((v)->mtx), MA_OWNED)
struct mem_seg {
vm_paddr_t gpa;
size_t len;
boolean_t wired;
vm_object_t object;
};
#define VM_MAX_MEMORY_SEGMENTS 2
struct vm {
void *cookie; /* processor-specific data */
void *iommu; /* iommu-specific data */
struct vmspace *vmspace; /* guest's address space */
struct vcpu vcpu[VM_MAXCPU];
int num_mem_segs;
struct vm_memory_segment mem_segs[VM_MAX_MEMORY_SEGMENTS];
struct mem_seg mem_segs[VM_MAX_MEMORY_SEGMENTS];
char name[VM_MAX_NAMELEN];
/*
@ -109,16 +127,14 @@ static struct vmm_ops *ops;
#define VMM_INIT() (ops != NULL ? (*ops->init)() : 0)
#define VMM_CLEANUP() (ops != NULL ? (*ops->cleanup)() : 0)
#define VMINIT(vm) (ops != NULL ? (*ops->vminit)(vm): NULL)
#define VMRUN(vmi, vcpu, rip) \
(ops != NULL ? (*ops->vmrun)(vmi, vcpu, rip) : ENXIO)
#define VMINIT(vm, pmap) (ops != NULL ? (*ops->vminit)(vm, pmap): NULL)
#define VMRUN(vmi, vcpu, rip, pmap) \
(ops != NULL ? (*ops->vmrun)(vmi, vcpu, rip, pmap) : ENXIO)
#define VMCLEANUP(vmi) (ops != NULL ? (*ops->vmcleanup)(vmi) : NULL)
#define VMMMAP_SET(vmi, gpa, hpa, len, attr, prot, spm) \
(ops != NULL ? \
(*ops->vmmmap_set)(vmi, gpa, hpa, len, attr, prot, spm) : \
ENXIO)
#define VMMMAP_GET(vmi, gpa) \
(ops != NULL ? (*ops->vmmmap_get)(vmi, gpa) : ENXIO)
#define VMSPACE_ALLOC(min, max) \
(ops != NULL ? (*ops->vmspace_alloc)(min, max) : NULL)
#define VMSPACE_FREE(vmspace) \
(ops != NULL ? (*ops->vmspace_free)(vmspace) : ENXIO)
#define VMGETREG(vmi, vcpu, num, retval) \
(ops != NULL ? (*ops->vmgetreg)(vmi, vcpu, num, retval) : ENXIO)
#define VMSETREG(vmi, vcpu, num, val) \
@ -213,8 +229,7 @@ vmm_handler(module_t mod, int what, void *arg)
switch (what) {
case MOD_LOAD:
vmmdev_init();
if (ppt_num_devices() > 0)
iommu_init();
iommu_init();
error = vmm_init();
if (error == 0)
vmm_initialized = 1;
@ -265,7 +280,7 @@ vm_create(const char *name, struct vm **retvm)
{
int i;
struct vm *vm;
vm_paddr_t maxaddr;
struct vmspace *vmspace;
const int BSP = 0;
@ -279,59 +294,34 @@ vm_create(const char *name, struct vm **retvm)
if (name == NULL || strlen(name) >= VM_MAX_NAMELEN)
return (EINVAL);
vmspace = VMSPACE_ALLOC(VM_MIN_ADDRESS, VM_MAXUSER_ADDRESS);
if (vmspace == NULL)
return (ENOMEM);
vm = malloc(sizeof(struct vm), M_VM, M_WAITOK | M_ZERO);
strcpy(vm->name, name);
vm->cookie = VMINIT(vm);
vm->cookie = VMINIT(vm, vmspace_pmap(vmspace));
for (i = 0; i < VM_MAXCPU; i++) {
vcpu_init(vm, i);
guest_msrs_init(vm, i);
}
maxaddr = vmm_mem_maxaddr();
vm->iommu = iommu_create_domain(maxaddr);
vm_activate_cpu(vm, BSP);
vm->vmspace = vmspace;
*retvm = vm;
return (0);
}
static void
vm_free_mem_seg(struct vm *vm, struct vm_memory_segment *seg)
vm_free_mem_seg(struct vm *vm, struct mem_seg *seg)
{
size_t len;
vm_paddr_t hpa;
void *host_domain;
host_domain = iommu_host_domain();
if (seg->object != NULL)
vmm_mem_free(vm->vmspace, seg->gpa, seg->len);
len = 0;
while (len < seg->len) {
hpa = vm_gpa2hpa(vm, seg->gpa + len, PAGE_SIZE);
if (hpa == (vm_paddr_t)-1) {
panic("vm_free_mem_segs: cannot free hpa "
"associated with gpa 0x%016lx", seg->gpa + len);
}
/*
* Remove the 'gpa' to 'hpa' mapping in VMs domain.
* And resurrect the 1:1 mapping for 'hpa' in 'host_domain'.
*/
iommu_remove_mapping(vm->iommu, seg->gpa + len, PAGE_SIZE);
iommu_create_mapping(host_domain, hpa, hpa, PAGE_SIZE);
vmm_mem_free(hpa, PAGE_SIZE);
len += PAGE_SIZE;
}
/*
* Invalidate cached translations associated with 'vm->iommu' since
* we have now moved some pages from it.
*/
iommu_invalidate_tlb(vm->iommu);
bzero(seg, sizeof(struct vm_memory_segment));
bzero(seg, sizeof(*seg));
}
void
@ -341,6 +331,9 @@ vm_destroy(struct vm *vm)
ppt_unassign_all(vm);
if (vm->iommu != NULL)
iommu_destroy_domain(vm->iommu);
for (i = 0; i < vm->num_mem_segs; i++)
vm_free_mem_seg(vm, &vm->mem_segs[i]);
@ -349,7 +342,7 @@ vm_destroy(struct vm *vm)
for (i = 0; i < VM_MAXCPU; i++)
vcpu_cleanup(&vm->vcpu[i]);
iommu_destroy_domain(vm->iommu);
VMSPACE_FREE(vm->vmspace);
VMCLEANUP(vm->cookie);
@ -365,52 +358,48 @@ vm_name(struct vm *vm)
int
vm_map_mmio(struct vm *vm, vm_paddr_t gpa, size_t len, vm_paddr_t hpa)
{
const boolean_t spok = TRUE; /* superpage mappings are ok */
vm_object_t obj;
return (VMMMAP_SET(vm->cookie, gpa, hpa, len, VM_MEMATTR_UNCACHEABLE,
VM_PROT_RW, spok));
if ((obj = vmm_mmio_alloc(vm->vmspace, gpa, len, hpa)) == NULL)
return (ENOMEM);
else
return (0);
}
int
vm_unmap_mmio(struct vm *vm, vm_paddr_t gpa, size_t len)
{
const boolean_t spok = TRUE; /* superpage mappings are ok */
return (VMMMAP_SET(vm->cookie, gpa, 0, len, 0,
VM_PROT_NONE, spok));
vmm_mmio_free(vm->vmspace, gpa, len);
return (0);
}
/*
* Returns TRUE if 'gpa' is available for allocation and FALSE otherwise
*/
static boolean_t
vm_gpa_available(struct vm *vm, vm_paddr_t gpa)
boolean_t
vm_mem_allocated(struct vm *vm, vm_paddr_t gpa)
{
int i;
vm_paddr_t gpabase, gpalimit;
if (gpa & PAGE_MASK)
panic("vm_gpa_available: gpa (0x%016lx) not page aligned", gpa);
for (i = 0; i < vm->num_mem_segs; i++) {
gpabase = vm->mem_segs[i].gpa;
gpalimit = gpabase + vm->mem_segs[i].len;
if (gpa >= gpabase && gpa < gpalimit)
return (FALSE);
return (TRUE); /* 'gpa' is regular memory */
}
return (TRUE);
if (ppt_is_mmio(vm, gpa))
return (TRUE); /* 'gpa' is pci passthru mmio */
return (FALSE);
}
int
vm_malloc(struct vm *vm, vm_paddr_t gpa, size_t len)
{
int error, available, allocated;
struct vm_memory_segment *seg;
vm_paddr_t g, hpa;
void *host_domain;
const boolean_t spok = TRUE; /* superpage mappings are ok */
int available, allocated;
struct mem_seg *seg;
vm_object_t object;
vm_paddr_t g;
if ((gpa & PAGE_MASK) || (len & PAGE_MASK) || len == 0)
return (EINVAL);
@ -418,10 +407,10 @@ vm_malloc(struct vm *vm, vm_paddr_t gpa, size_t len)
available = allocated = 0;
g = gpa;
while (g < gpa + len) {
if (vm_gpa_available(vm, g))
available++;
else
if (vm_mem_allocated(vm, g))
allocated++;
else
available++;
g += PAGE_SIZE;
}
@ -443,61 +432,203 @@ vm_malloc(struct vm *vm, vm_paddr_t gpa, size_t len)
if (vm->num_mem_segs >= VM_MAX_MEMORY_SEGMENTS)
return (E2BIG);
host_domain = iommu_host_domain();
seg = &vm->mem_segs[vm->num_mem_segs];
error = 0;
if ((object = vmm_mem_alloc(vm->vmspace, gpa, len)) == NULL)
return (ENOMEM);
seg->gpa = gpa;
seg->len = 0;
while (seg->len < len) {
hpa = vmm_mem_alloc(PAGE_SIZE);
if (hpa == 0) {
error = ENOMEM;
break;
}
error = VMMMAP_SET(vm->cookie, gpa + seg->len, hpa, PAGE_SIZE,
VM_MEMATTR_WRITE_BACK, VM_PROT_ALL, spok);
if (error)
break;
/*
* Remove the 1:1 mapping for 'hpa' from the 'host_domain'.
* Add mapping for 'gpa + seg->len' to 'hpa' in the VMs domain.
*/
iommu_remove_mapping(host_domain, hpa, PAGE_SIZE);
iommu_create_mapping(vm->iommu, gpa + seg->len, hpa, PAGE_SIZE);
seg->len += PAGE_SIZE;
}
if (error) {
vm_free_mem_seg(vm, seg);
return (error);
}
/*
* Invalidate cached translations associated with 'host_domain' since
* we have now moved some pages from it.
*/
iommu_invalidate_tlb(host_domain);
seg->len = len;
seg->object = object;
seg->wired = FALSE;
vm->num_mem_segs++;
return (0);
}
-vm_paddr_t
-vm_gpa2hpa(struct vm *vm, vm_paddr_t gpa, size_t len)
+static void
+vm_gpa_unwire(struct vm *vm)
 {
-    vm_paddr_t nextpage;
+    int i, rv;
+    struct mem_seg *seg;

-    nextpage = rounddown(gpa + PAGE_SIZE, PAGE_SIZE);
-    if (len > nextpage - gpa)
-        panic("vm_gpa2hpa: invalid gpa/len: 0x%016lx/%lu", gpa, len);
-
+    for (i = 0; i < vm->num_mem_segs; i++) {
+        seg = &vm->mem_segs[i];
+        if (!seg->wired)
+            continue;

-    return (VMMMAP_GET(vm->cookie, gpa));
+        rv = vm_map_unwire(&vm->vmspace->vm_map,
+            seg->gpa, seg->gpa + seg->len,
+            VM_MAP_WIRE_USER | VM_MAP_WIRE_NOHOLES);
+        KASSERT(rv == KERN_SUCCESS, ("vm(%s) memory segment "
+            "%#lx/%ld could not be unwired: %d",
+            vm_name(vm), seg->gpa, seg->len, rv));
+
+        seg->wired = FALSE;
+    }
}
static int
vm_gpa_wire(struct vm *vm)
{
int i, rv;
struct mem_seg *seg;
for (i = 0; i < vm->num_mem_segs; i++) {
seg = &vm->mem_segs[i];
if (seg->wired)
continue;
/* XXX rlimits? */
rv = vm_map_wire(&vm->vmspace->vm_map,
seg->gpa, seg->gpa + seg->len,
VM_MAP_WIRE_USER | VM_MAP_WIRE_NOHOLES);
if (rv != KERN_SUCCESS)
break;
seg->wired = TRUE;
}
if (i < vm->num_mem_segs) {
/*
* Undo the wiring before returning an error.
*/
vm_gpa_unwire(vm);
return (EAGAIN);
}
return (0);
}
static void
vm_iommu_modify(struct vm *vm, boolean_t map)
{
int i, sz;
vm_paddr_t gpa, hpa;
struct mem_seg *seg;
void *vp, *cookie, *host_domain;
sz = PAGE_SIZE;
host_domain = iommu_host_domain();
for (i = 0; i < vm->num_mem_segs; i++) {
seg = &vm->mem_segs[i];
KASSERT(seg->wired, ("vm(%s) memory segment %#lx/%ld not wired",
vm_name(vm), seg->gpa, seg->len));
gpa = seg->gpa;
while (gpa < seg->gpa + seg->len) {
vp = vm_gpa_hold(vm, gpa, PAGE_SIZE, VM_PROT_WRITE,
&cookie);
KASSERT(vp != NULL, ("vm(%s) could not map gpa %#lx",
vm_name(vm), gpa));
vm_gpa_release(cookie);
hpa = DMAP_TO_PHYS((uintptr_t)vp);
if (map) {
iommu_create_mapping(vm->iommu, gpa, hpa, sz);
iommu_remove_mapping(host_domain, hpa, sz);
} else {
iommu_remove_mapping(vm->iommu, gpa, sz);
iommu_create_mapping(host_domain, hpa, hpa, sz);
}
gpa += PAGE_SIZE;
}
}
/*
* Invalidate the cached translations associated with the domain
* from which pages were removed.
*/
if (map)
iommu_invalidate_tlb(host_domain);
else
iommu_invalidate_tlb(vm->iommu);
}
#define vm_iommu_unmap(vm) vm_iommu_modify((vm), FALSE)
#define vm_iommu_map(vm) vm_iommu_modify((vm), TRUE)
int
vm_unassign_pptdev(struct vm *vm, int bus, int slot, int func)
{
int error;
error = ppt_unassign_device(vm, bus, slot, func);
if (error)
return (error);
if (ppt_num_devices(vm) == 0) {
vm_iommu_unmap(vm);
vm_gpa_unwire(vm);
}
return (0);
}
int
vm_assign_pptdev(struct vm *vm, int bus, int slot, int func)
{
int error;
vm_paddr_t maxaddr;
/*
* Virtual machines with pci passthru devices get special treatment:
* - the guest physical memory is wired
* - the iommu is programmed to do the 'gpa' to 'hpa' translation
*
* We need to do this before the first pci passthru device is attached.
*/
if (ppt_num_devices(vm) == 0) {
KASSERT(vm->iommu == NULL,
("vm_assign_pptdev: iommu must be NULL"));
maxaddr = vmm_mem_maxaddr();
vm->iommu = iommu_create_domain(maxaddr);
error = vm_gpa_wire(vm);
if (error)
return (error);
vm_iommu_map(vm);
}
error = ppt_assign_device(vm, bus, slot, func);
return (error);
}
void *
vm_gpa_hold(struct vm *vm, vm_paddr_t gpa, size_t len, int reqprot,
void **cookie)
{
int count, pageoff;
vm_page_t m;
pageoff = gpa & PAGE_MASK;
if (len > PAGE_SIZE - pageoff)
panic("vm_gpa_hold: invalid gpa/len: 0x%016lx/%lu", gpa, len);
count = vm_fault_quick_hold_pages(&vm->vmspace->vm_map,
trunc_page(gpa), PAGE_SIZE, reqprot, &m, 1);
if (count == 1) {
*cookie = m;
return ((void *)(PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m)) + pageoff));
} else {
*cookie = NULL;
return (NULL);
}
}
void
vm_gpa_release(void *cookie)
{
vm_page_t m = cookie;
vm_page_lock(m);
vm_page_unhold(m);
vm_page_unlock(m);
}
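Because guest memory is now pageable, any transient access from the host kernel has to bracket the access with these helpers. A minimal sketch follows (not part of the diff; 'copy_from_guest' is a hypothetical wrapper shown only to illustrate the calling convention):

/*
 * Hypothetical helper illustrating the hold/release protocol: hold the
 * backing page, access it through the returned direct-map address,
 * then drop the hold.
 */
static int
copy_from_guest(struct vm *vm, vm_paddr_t gpa, void *dst, size_t len)
{
    void *cookie, *hva;

    /* vm_gpa_hold() panics if the request crosses a page boundary */
    if (len > PAGE_SIZE - (gpa & PAGE_MASK))
        return (EINVAL);

    hva = vm_gpa_hold(vm, gpa, len, VM_PROT_READ, &cookie);
    if (hva == NULL)
        return (EFAULT);    /* 'gpa' is not backed by the guest vmspace */

    bcopy(hva, dst, len);
    vm_gpa_release(cookie);    /* release the hold on the backing page */
    return (0);
}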
int
@@ -508,13 +639,42 @@ vm_gpabase2memseg(struct vm *vm, vm_paddr_t gpabase,
for (i = 0; i < vm->num_mem_segs; i++) {
if (gpabase == vm->mem_segs[i].gpa) {
-            *seg = vm->mem_segs[i];
+            seg->gpa = vm->mem_segs[i].gpa;
+            seg->len = vm->mem_segs[i].len;
+            seg->wired = vm->mem_segs[i].wired;
return (0);
}
}
return (-1);
}
int
vm_get_memobj(struct vm *vm, vm_paddr_t gpa, size_t len,
vm_offset_t *offset, struct vm_object **object)
{
int i;
size_t seg_len;
vm_paddr_t seg_gpa;
vm_object_t seg_obj;
for (i = 0; i < vm->num_mem_segs; i++) {
if ((seg_obj = vm->mem_segs[i].object) == NULL)
continue;
seg_gpa = vm->mem_segs[i].gpa;
seg_len = vm->mem_segs[i].len;
if (gpa >= seg_gpa && gpa < seg_gpa + seg_len) {
*offset = gpa - seg_gpa;
*object = seg_obj;
vm_object_reference(seg_obj);
return (0);
}
}
return (EINVAL);
}
int
vm_get_register(struct vm *vm, int vcpu, int reg, uint64_t *retval)
{
@@ -633,26 +793,215 @@ save_guest_fpustate(struct vcpu *vcpu)
static VMM_STAT(VCPU_IDLE_TICKS, "number of ticks vcpu was idle");
static int
vcpu_set_state_locked(struct vcpu *vcpu, enum vcpu_state newstate)
{
int error;
vcpu_assert_locked(vcpu);
/*
* The following state transitions are allowed:
* IDLE -> FROZEN -> IDLE
* FROZEN -> RUNNING -> FROZEN
* FROZEN -> SLEEPING -> FROZEN
*/
switch (vcpu->state) {
case VCPU_IDLE:
case VCPU_RUNNING:
case VCPU_SLEEPING:
error = (newstate != VCPU_FROZEN);
break;
case VCPU_FROZEN:
error = (newstate == VCPU_FROZEN);
break;
default:
error = 1;
break;
}
if (error == 0)
vcpu->state = newstate;
else
error = EBUSY;
return (error);
}
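As an aside, a minimal sketch (not from the diff) of how an ioctl path is expected to drive this state machine: freeze the vcpu, operate on it, then return it to IDLE. Only the FROZEN transition is visible in the vmm_dev.c hunks below; the release back to IDLE is an assumption here.

    /* Illustrative only: the FROZEN bracket used around per-vcpu work. */
    error = vcpu_set_state(vm, vcpuid, VCPU_FROZEN);
    if (error == 0) {
        /* ... inspect or modify the frozen vcpu ... */
        (void)vcpu_set_state(vm, vcpuid, VCPU_IDLE);
    }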
static void
vcpu_require_state(struct vm *vm, int vcpuid, enum vcpu_state newstate)
{
int error;
if ((error = vcpu_set_state(vm, vcpuid, newstate)) != 0)
panic("Error %d setting state to %d\n", error, newstate);
}
static void
vcpu_require_state_locked(struct vcpu *vcpu, enum vcpu_state newstate)
{
int error;
if ((error = vcpu_set_state_locked(vcpu, newstate)) != 0)
panic("Error %d setting state to %d", error, newstate);
}
/*
* Emulate a guest 'hlt' by sleeping until the vcpu is ready to run.
*/
static int
vm_handle_hlt(struct vm *vm, int vcpuid, boolean_t *retu)
{
struct vcpu *vcpu;
int sleepticks, t;
vcpu = &vm->vcpu[vcpuid];
vcpu_lock(vcpu);
/*
* Figure out the number of host ticks until the next apic
* timer interrupt in the guest.
*/
sleepticks = lapic_timer_tick(vm, vcpuid);
/*
* If the guest local apic timer is disabled then sleep for
* a long time but not forever.
*/
if (sleepticks < 0)
sleepticks = hz;
/*
* Do a final check for pending NMI or interrupts before
* really putting this thread to sleep.
*
* These interrupts could have happened any time after we
* returned from VMRUN() and before we grabbed the vcpu lock.
*/
if (!vm_nmi_pending(vm, vcpuid) && lapic_pending_intr(vm, vcpuid) < 0) {
if (sleepticks <= 0)
panic("invalid sleepticks %d", sleepticks);
t = ticks;
vcpu_require_state_locked(vcpu, VCPU_SLEEPING);
msleep_spin(vcpu, &vcpu->mtx, "vmidle", sleepticks);
vcpu_require_state_locked(vcpu, VCPU_FROZEN);
vmm_stat_incr(vm, vcpuid, VCPU_IDLE_TICKS, ticks - t);
}
vcpu_unlock(vcpu);
return (0);
}
static int
vm_handle_paging(struct vm *vm, int vcpuid, boolean_t *retu)
{
int rv, ftype;
struct vm_map *map;
struct vcpu *vcpu;
struct vm_exit *vme;
vcpu = &vm->vcpu[vcpuid];
vme = &vcpu->exitinfo;
ftype = vme->u.paging.fault_type;
KASSERT(ftype == VM_PROT_READ ||
ftype == VM_PROT_WRITE || ftype == VM_PROT_EXECUTE,
("vm_handle_paging: invalid fault_type %d", ftype));
if (ftype == VM_PROT_READ || ftype == VM_PROT_WRITE) {
rv = pmap_emulate_accessed_dirty(vmspace_pmap(vm->vmspace),
vme->u.paging.gpa, ftype);
if (rv == 0)
goto done;
}
map = &vm->vmspace->vm_map;
rv = vm_fault(map, vme->u.paging.gpa, ftype, VM_FAULT_NORMAL);
VMM_CTR3(vm, vcpuid, "vm_handle_paging rv = %d, gpa = %#lx, ftype = %d",
rv, vme->u.paging.gpa, ftype);
if (rv != KERN_SUCCESS)
return (EFAULT);
done:
/* restart execution at the faulting instruction */
vme->inst_length = 0;
return (0);
}
static int
vm_handle_inst_emul(struct vm *vm, int vcpuid, boolean_t *retu)
{
struct vie *vie;
struct vcpu *vcpu;
struct vm_exit *vme;
int error, inst_length;
uint64_t rip, gla, gpa, cr3;
vcpu = &vm->vcpu[vcpuid];
vme = &vcpu->exitinfo;
rip = vme->rip;
inst_length = vme->inst_length;
gla = vme->u.inst_emul.gla;
gpa = vme->u.inst_emul.gpa;
cr3 = vme->u.inst_emul.cr3;
vie = &vme->u.inst_emul.vie;
vie_init(vie);
/* Fetch, decode and emulate the faulting instruction */
if (vmm_fetch_instruction(vm, vcpuid, rip, inst_length, cr3, vie) != 0)
return (EFAULT);
if (vmm_decode_instruction(vm, vcpuid, gla, vie) != 0)
return (EFAULT);
/* return to userland unless this is a local apic access */
if (gpa < DEFAULT_APIC_BASE || gpa >= DEFAULT_APIC_BASE + PAGE_SIZE) {
*retu = TRUE;
return (0);
}
error = vmm_emulate_instruction(vm, vcpuid, gpa, vie,
lapic_mmio_read, lapic_mmio_write, 0);
/* return to userland to spin up the AP */
if (error == 0 && vme->exitcode == VM_EXITCODE_SPINUP_AP)
*retu = TRUE;
return (error);
}
int
vm_run(struct vm *vm, struct vm_run *vmrun)
{
-    int error, vcpuid, sleepticks, t;
+    int error, vcpuid;
struct vcpu *vcpu;
struct pcb *pcb;
uint64_t tscval, rip;
struct vm_exit *vme;
boolean_t retu;
pmap_t pmap;
vcpuid = vmrun->cpuid;
if (vcpuid < 0 || vcpuid >= VM_MAXCPU)
return (EINVAL);
pmap = vmspace_pmap(vm->vmspace);
vcpu = &vm->vcpu[vcpuid];
-    vme = &vmrun->vm_exit;
+    vme = &vcpu->exitinfo;
rip = vmrun->rip;
restart:
critical_enter();
KASSERT(!CPU_ISSET(curcpu, &pmap->pm_active),
("vm_run: absurd pm_active"));
tscval = rdtsc();
pcb = PCPU_GET(curpcb);
@@ -661,62 +1010,44 @@ vm_run(struct vm *vm, struct vm_run *vmrun)
restore_guest_msrs(vm, vcpuid);
restore_guest_fpustate(vcpu);
vcpu_require_state(vm, vcpuid, VCPU_RUNNING);
vcpu->hostcpu = curcpu;
-    error = VMRUN(vm->cookie, vcpuid, rip);
+    error = VMRUN(vm->cookie, vcpuid, rip, pmap);
vcpu->hostcpu = NOCPU;
vcpu_require_state(vm, vcpuid, VCPU_FROZEN);
save_guest_fpustate(vcpu);
restore_host_msrs(vm, vcpuid);
vmm_stat_incr(vm, vcpuid, VCPU_TOTAL_RUNTIME, rdtsc() - tscval);
-    /* copy the exit information */
-    bcopy(&vcpu->exitinfo, vme, sizeof(struct vm_exit));
-
     critical_exit();

-    /*
-     * Oblige the guest's desire to 'hlt' by sleeping until the vcpu
-     * is ready to run.
-     */
-    if (error == 0 && vme->exitcode == VM_EXITCODE_HLT) {
-        vcpu_lock(vcpu);
-
-        /*
-         * Figure out the number of host ticks until the next apic
-         * timer interrupt in the guest.
-         */
-        sleepticks = lapic_timer_tick(vm, vcpuid);
-
-        /*
-         * If the guest local apic timer is disabled then sleep for
-         * a long time but not forever.
-         */
-        if (sleepticks < 0)
-            sleepticks = hz;
-
-        /*
-         * Do a final check for pending NMI or interrupts before
-         * really putting this thread to sleep.
-         *
-         * These interrupts could have happened any time after we
-         * returned from VMRUN() and before we grabbed the vcpu lock.
-         */
-        if (!vm_nmi_pending(vm, vcpuid) &&
-            lapic_pending_intr(vm, vcpuid) < 0) {
-            if (sleepticks <= 0)
-                panic("invalid sleepticks %d", sleepticks);
-            t = ticks;
-            msleep_spin(vcpu, &vcpu->mtx, "vmidle", sleepticks);
-            vmm_stat_incr(vm, vcpuid, VCPU_IDLE_TICKS, ticks - t);
+    if (error == 0) {
+        retu = FALSE;
+        switch (vme->exitcode) {
+        case VM_EXITCODE_HLT:
+            error = vm_handle_hlt(vm, vcpuid, &retu);
+            break;
+        case VM_EXITCODE_PAGING:
+            error = vm_handle_paging(vm, vcpuid, &retu);
+            break;
+        case VM_EXITCODE_INST_EMUL:
+            error = vm_handle_inst_emul(vm, vcpuid, &retu);
+            break;
+        default:
+            retu = TRUE;    /* handled in userland */
+            break;
+        }
     }
-        vcpu_unlock(vcpu);
+
+    if (error == 0 && retu == FALSE) {
+        rip = vme->rip + vme->inst_length;
+        goto restart;
+    }
+
+    /* copy the exit information */
+    bcopy(vme, &vmrun->vm_exit, sizeof(struct vm_exit));
return (error);
}
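For context, a hedged sketch of the userland half of this loop: when the kernel sets 'retu' the exit is returned to the thread that issued VM_RUN and dispatched by exit code, much as bhyve's vcpu loop does. The vm_run()/struct vm_exit names below are the libvmmapi wrappers referenced elsewhere in this change; treat the loop as an outline under that assumption, not the actual bhyve code.

    /* Outline of a userland vcpu loop built on the VM_RUN ioctl. */
    struct vm_exit vmexit;
    uint64_t rip = entry_rip;    /* assumed to be set up by the caller */

    for (;;) {
        if (vm_run(ctx, vcpu, rip, &vmexit) != 0)
            break;
        switch (vmexit.exitcode) {
        case VM_EXITCODE_INST_EMUL:
            /* MMIO access that the kernel punted to userland */
            break;
        default:
            break;
        }
        rip = vmexit.rip + vmexit.inst_length;
    }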
@@ -869,7 +1200,7 @@ vm_iommu_domain(struct vm *vm)
}
int
-vcpu_set_state(struct vm *vm, int vcpuid, enum vcpu_state state)
+vcpu_set_state(struct vm *vm, int vcpuid, enum vcpu_state newstate)
{
int error;
struct vcpu *vcpu;
@@ -880,20 +1211,7 @@ vcpu_set_state(struct vm *vm, int vcpuid, enum vcpu_state state)
vcpu = &vm->vcpu[vcpuid];
vcpu_lock(vcpu);
-    /*
-     * The following state transitions are allowed:
-     * IDLE -> RUNNING -> IDLE
-     * IDLE -> CANNOT_RUN -> IDLE
-     */
-    if ((vcpu->state == VCPU_IDLE && state != VCPU_IDLE) ||
-        (vcpu->state != VCPU_IDLE && state == VCPU_IDLE)) {
-        error = 0;
-        vcpu->state = state;
-    } else {
-        error = EBUSY;
-    }
+    error = vcpu_set_state_locked(vcpu, newstate);
vcpu_unlock(vcpu);
return (error);
@@ -979,16 +1297,7 @@ vm_interrupt_hostcpu(struct vm *vm, int vcpuid)
vcpu_lock(vcpu);
hostcpu = vcpu->hostcpu;
if (hostcpu == NOCPU) {
-        /*
-         * If the vcpu is 'RUNNING' but without a valid 'hostcpu' then
-         * the host thread must be sleeping waiting for an event to
-         * kick the vcpu out of 'hlt'.
-         *
-         * XXX this is racy because the condition exists right before
-         * and after calling VMRUN() in vm_run(). The wakeup() is
-         * benign in this case.
-         */
-        if (vcpu->state == VCPU_RUNNING)
+        if (vcpu->state == VCPU_SLEEPING)
wakeup_one(vcpu);
} else {
if (vcpu->state != VCPU_RUNNING)
@@ -998,3 +1307,10 @@ vm_interrupt_hostcpu(struct vm *vm, int vcpuid)
}
vcpu_unlock(vcpu);
}
struct vmspace *
vm_get_vmspace(struct vm *vm)
{
return (vm->vmspace);
}


@@ -44,6 +44,7 @@ __FBSDID("$FreeBSD$");
#include <vm/vm.h>
#include <vm/pmap.h>
#include <vm/vm_map.h>
#include <machine/pmap.h>
#include <machine/vmparam.h>
@@ -95,8 +96,9 @@ vmmdev_lookup2(struct cdev *cdev)
static int
vmmdev_rw(struct cdev *cdev, struct uio *uio, int flags)
{
-    int error, off, c;
-    vm_paddr_t hpa, gpa;
+    int error, off, c, prot;
+    vm_paddr_t gpa;
+    void *hpa, *cookie;
struct vmmdev_softc *sc;
static char zerobuf[PAGE_SIZE];
@@ -107,6 +109,7 @@ vmmdev_rw(struct cdev *cdev, struct uio *uio, int flags)
if (sc == NULL)
error = ENXIO;
prot = (uio->uio_rw == UIO_WRITE ? VM_PROT_WRITE : VM_PROT_READ);
while (uio->uio_resid > 0 && error == 0) {
gpa = uio->uio_offset;
off = gpa & PAGE_MASK;
@@ -120,14 +123,16 @@ vmmdev_rw(struct cdev *cdev, struct uio *uio, int flags)
* Since this device does not support lseek(2), dd(1) will
* read(2) blocks of data to simulate the lseek(2).
*/
-        hpa = vm_gpa2hpa(sc->vm, gpa, c);
-        if (hpa == (vm_paddr_t)-1) {
+        hpa = vm_gpa_hold(sc->vm, gpa, c, prot, &cookie);
+        if (hpa == NULL) {
             if (uio->uio_rw == UIO_READ)
                 error = uiomove(zerobuf, c, uio);
             else
                 error = EFAULT;
-        } else
-            error = uiomove((void *)PHYS_TO_DMAP(hpa), c, uio);
+        } else {
+            error = uiomove(hpa, c, uio);
+            vm_gpa_release(cookie);
+        }
}
mtx_unlock(&vmmdev_mtx);
@@ -139,7 +144,6 @@ vmmdev_ioctl(struct cdev *cdev, u_long cmd, caddr_t data, int fflag,
struct thread *td)
{
int error, vcpu, state_changed;
-    enum vcpu_state new_state;
struct vmmdev_softc *sc;
struct vm_memory_segment *seg;
struct vm_register *vmreg;
@@ -156,6 +160,7 @@ vmmdev_ioctl(struct cdev *cdev, u_long cmd, caddr_t data, int fflag,
struct vm_stats *vmstats;
struct vm_stat_desc *statdesc;
struct vm_x2apic *x2apic;
struct vm_gpa_pte *gpapte;
sc = vmmdev_lookup2(cdev);
if (sc == NULL)
@@ -189,12 +194,7 @@ vmmdev_ioctl(struct cdev *cdev, u_long cmd, caddr_t data, int fflag,
goto done;
}
-        if (cmd == VM_RUN)
-            new_state = VCPU_RUNNING;
-        else
-            new_state = VCPU_CANNOT_RUN;
-
-        error = vcpu_set_state(sc->vm, vcpu, new_state);
+        error = vcpu_set_state(sc->vm, vcpu, VCPU_FROZEN);
if (error)
goto done;
@@ -211,7 +211,7 @@ vmmdev_ioctl(struct cdev *cdev, u_long cmd, caddr_t data, int fflag,
*/
error = 0;
for (vcpu = 0; vcpu < VM_MAXCPU; vcpu++) {
-            error = vcpu_set_state(sc->vm, vcpu, VCPU_CANNOT_RUN);
+            error = vcpu_set_state(sc->vm, vcpu, VCPU_FROZEN);
if (error)
break;
}
@@ -271,13 +271,13 @@ vmmdev_ioctl(struct cdev *cdev, u_long cmd, caddr_t data, int fflag,
break;
case VM_BIND_PPTDEV:
pptdev = (struct vm_pptdev *)data;
-        error = ppt_assign_device(sc->vm, pptdev->bus, pptdev->slot,
-            pptdev->func);
+        error = vm_assign_pptdev(sc->vm, pptdev->bus, pptdev->slot,
+            pptdev->func);
break;
case VM_UNBIND_PPTDEV:
pptdev = (struct vm_pptdev *)data;
-        error = ppt_unassign_device(sc->vm, pptdev->bus, pptdev->slot,
-            pptdev->func);
+        error = vm_unassign_pptdev(sc->vm, pptdev->bus, pptdev->slot,
+            pptdev->func);
break;
case VM_INJECT_EVENT:
vmevent = (struct vm_event *)data;
@@ -348,6 +348,12 @@ vmmdev_ioctl(struct cdev *cdev, u_long cmd, caddr_t data, int fflag,
error = vm_get_x2apic_state(sc->vm,
x2apic->cpuid, &x2apic->state);
break;
case VM_GET_GPA_PMAP:
gpapte = (struct vm_gpa_pte *)data;
pmap_get_mapping(vmspace_pmap(vm_get_vmspace(sc->vm)),
gpapte->gpa, gpapte->pte, &gpapte->ptenum);
error = 0;
break;
default:
error = ENOTTY;
break;
@@ -361,25 +367,25 @@ vmmdev_ioctl(struct cdev *cdev, u_long cmd, caddr_t data, int fflag,
}
done:
/* Make sure that no handler returns a bogus value like ERESTART */
KASSERT(error >= 0, ("vmmdev_ioctl: invalid error return %d", error));
return (error);
}
static int
-vmmdev_mmap(struct cdev *cdev, vm_ooffset_t offset, vm_paddr_t *paddr,
-    int nprot, vm_memattr_t *memattr)
+vmmdev_mmap_single(struct cdev *cdev, vm_ooffset_t *offset,
+    vm_size_t size, struct vm_object **object, int nprot)
{
int error;
struct vmmdev_softc *sc;
-    error = -1;
     mtx_lock(&vmmdev_mtx);
     sc = vmmdev_lookup2(cdev);
-    if (sc != NULL && (nprot & PROT_EXEC) == 0) {
-        *paddr = vm_gpa2hpa(sc->vm, (vm_paddr_t)offset, PAGE_SIZE);
-        if (*paddr != (vm_paddr_t)-1)
-            error = 0;
-    }
+    if (sc != NULL && (nprot & PROT_EXEC) == 0)
+        error = vm_get_memobj(sc->vm, *offset, size, offset, object);
+    else
+        error = EINVAL;
mtx_unlock(&vmmdev_mtx);
@@ -446,7 +452,7 @@ static struct cdevsw vmmdevsw = {
.d_name = "vmmdev",
.d_version = D_VERSION,
.d_ioctl = vmmdev_ioctl,
-    .d_mmap        = vmmdev_mmap,
+    .d_mmap_single = vmmdev_mmap_single,
.d_read = vmmdev_rw,
.d_write = vmmdev_rw,
};


@@ -465,7 +465,7 @@ vmm_emulate_instruction(void *vm, int vcpuid, uint64_t gpa, struct vie *vie,
}
#ifdef _KERNEL
-static void
+void
vie_init(struct vie *vie)
{
@@ -479,9 +479,9 @@ static int
gla2gpa(struct vm *vm, uint64_t gla, uint64_t ptpphys,
uint64_t *gpa, uint64_t *gpaend)
{
-    vm_paddr_t hpa;
     int nlevels, ptpshift, ptpindex;
     uint64_t *ptpbase, pte, pgsize;
+    void *cookie;
/*
* XXX assumes 64-bit guest with 4 page walk levels
@@ -491,18 +491,19 @@ gla2gpa(struct vm *vm, uint64_t gla, uint64_t ptpphys,
/* Zero out the lower 12 bits and the upper 12 bits */
ptpphys >>= 12; ptpphys <<= 24; ptpphys >>= 12;
-        hpa = vm_gpa2hpa(vm, ptpphys, PAGE_SIZE);
-        if (hpa == -1)
+        ptpbase = vm_gpa_hold(vm, ptpphys, PAGE_SIZE, VM_PROT_READ,
+            &cookie);
+        if (ptpbase == NULL)
             goto error;
-
-        ptpbase = (uint64_t *)PHYS_TO_DMAP(hpa);
ptpshift = PAGE_SHIFT + nlevels * 9;
ptpindex = (gla >> ptpshift) & 0x1FF;
pgsize = 1UL << ptpshift;
pte = ptpbase[ptpindex];
vm_gpa_release(cookie);
if ((pte & PG_V) == 0)
goto error;
@@ -530,18 +531,18 @@ int
vmm_fetch_instruction(struct vm *vm, int cpuid, uint64_t rip, int inst_length,
uint64_t cr3, struct vie *vie)
{
-    int n, err;
-    uint64_t hpa, gpa, gpaend, off;
+    int n, err, prot;
+    uint64_t gpa, gpaend, off;
+    void *hpa, *cookie;
/*
* XXX cache previously fetched instructions using 'rip' as the tag
*/
prot = VM_PROT_READ | VM_PROT_EXECUTE;
if (inst_length > VIE_INST_SIZE)
panic("vmm_fetch_instruction: invalid length %d", inst_length);
vie_init(vie);
/* Copy the instruction into 'vie' */
while (vie->num_valid < inst_length) {
err = gla2gpa(vm, rip, cr3, &gpa, &gpaend);
@@ -551,11 +552,12 @@ vmm_fetch_instruction(struct vm *vm, int cpuid, uint64_t rip, int inst_length,
off = gpa & PAGE_MASK;
n = min(inst_length - vie->num_valid, PAGE_SIZE - off);
-        hpa = vm_gpa2hpa(vm, gpa, n);
-        if (hpa == -1)
+        if ((hpa = vm_gpa_hold(vm, gpa, n, prot, &cookie)) == NULL)
             break;

-        bcopy((void *)PHYS_TO_DMAP(hpa), &vie->inst[vie->num_valid], n);
+        bcopy(hpa, &vie->inst[vie->num_valid], n);
+        vm_gpa_release(cookie);
rip += n;
vie->num_valid += n;


@@ -30,40 +30,24 @@
__FBSDID("$FreeBSD$");
#include <sys/param.h>
-#include <sys/lock.h>
-#include <sys/mutex.h>
-#include <sys/linker.h>
 #include <sys/systm.h>
 #include <sys/malloc.h>
-#include <sys/kernel.h>
-#include <sys/sysctl.h>
+#include <sys/sglist.h>
+#include <sys/lock.h>
+#include <sys/rwlock.h>

 #include <vm/vm.h>
+#include <vm/vm_param.h>
+#include <vm/pmap.h>
+#include <vm/vm_map.h>
+#include <vm/vm_object.h>
 #include <vm/vm_page.h>
-#include <vm/vm_pageout.h>
+#include <vm/vm_pager.h>
#include <machine/md_var.h>
#include <machine/metadata.h>
#include <machine/pc/bios.h>
#include <machine/vmparam.h>
#include <machine/pmap.h>
#include "vmm_util.h"
#include "vmm_mem.h"
-SYSCTL_DECL(_hw_vmm);
-
-static u_long pages_allocated;
-SYSCTL_ULONG(_hw_vmm, OID_AUTO, pages_allocated, CTLFLAG_RD,
-    &pages_allocated, 0, "4KB pages allocated");
-
-static void
-update_pages_allocated(int howmany)
-{
-    pages_allocated += howmany;    /* XXX locking? */
-}
int
vmm_mem_init(void)
{
@@ -71,60 +55,95 @@ vmm_mem_init(void)
return (0);
}
-vm_paddr_t
-vmm_mem_alloc(size_t size)
+vm_object_t
+vmm_mmio_alloc(struct vmspace *vmspace, vm_paddr_t gpa, size_t len,
+    vm_paddr_t hpa)
 {
-    int flags;
-    vm_page_t m;
-    vm_paddr_t pa;
+    int error;
+    vm_object_t obj;
+    struct sglist *sg;

-    if (size != PAGE_SIZE)
-        panic("vmm_mem_alloc: invalid allocation size %lu", size);
+    sg = sglist_alloc(1, M_WAITOK);
+    error = sglist_append_phys(sg, hpa, len);
+    KASSERT(error == 0, ("error %d appending physaddr to sglist", error));

-    flags = VM_ALLOC_NORMAL | VM_ALLOC_NOOBJ | VM_ALLOC_WIRED |
-        VM_ALLOC_ZERO;
-
-    while (1) {
+    obj = vm_pager_allocate(OBJT_SG, sg, len, VM_PROT_RW, 0, NULL);
+    if (obj != NULL) {
         /*
-         * XXX need policy to determine when to back off the allocation
+         * VT-x ignores the MTRR settings when figuring out the
+         * memory type for translations obtained through EPT.
+         *
+         * Therefore we explicitly force the pages provided by
+         * this object to be mapped as uncacheable.
         */
-        m = vm_page_alloc(NULL, 0, flags);
-        if (m == NULL)
-            VM_WAIT;
-        else
-            break;
+        VM_OBJECT_WLOCK(obj);
+        error = vm_object_set_memattr(obj, VM_MEMATTR_UNCACHEABLE);
+        VM_OBJECT_WUNLOCK(obj);
+        if (error != KERN_SUCCESS) {
+            panic("vmm_mmio_alloc: vm_object_set_memattr error %d",
+                error);
+        }
+        error = vm_map_find(&vmspace->vm_map, obj, 0, &gpa, len, 0,
+            VMFS_NO_SPACE, VM_PROT_RW, VM_PROT_RW, 0);
+        if (error != KERN_SUCCESS) {
+            vm_object_deallocate(obj);
+            obj = NULL;
+        }
     }

-    pa = VM_PAGE_TO_PHYS(m);
-    if ((m->flags & PG_ZERO) == 0)
-        pagezero((void *)PHYS_TO_DMAP(pa));
-    m->valid = VM_PAGE_BITS_ALL;
+    /*
+     * Drop the reference on the sglist.
+     *
+     * If the scatter/gather object was successfully allocated then it
+     * has incremented the reference count on the sglist. Dropping the
+     * initial reference count ensures that the sglist will be freed
+     * when the object is deallocated.
+     *
+     * If the object could not be allocated then we end up freeing the
+     * sglist.
+     */
+    sglist_free(sg);

-    update_pages_allocated(1);
-
-    return (pa);
+    return (obj);
 }
void
-vmm_mem_free(vm_paddr_t base, size_t length)
+vmm_mmio_free(struct vmspace *vmspace, vm_paddr_t gpa, size_t len)
 {
-    vm_page_t m;
-
-    if (base & PAGE_MASK) {
-        panic("vmm_mem_free: base 0x%0lx must be aligned on a "
-            "0x%0x boundary\n", base, PAGE_SIZE);
+    vm_map_remove(&vmspace->vm_map, gpa, gpa + len);
 }
+
+vm_object_t
+vmm_mem_alloc(struct vmspace *vmspace, vm_paddr_t gpa, size_t len)
+{
+    int error;
+    vm_object_t obj;
+
+    if (gpa & PAGE_MASK)
+        panic("vmm_mem_alloc: invalid gpa %#lx", gpa);
+
+    if (len == 0 || (len & PAGE_MASK) != 0)
+        panic("vmm_mem_alloc: invalid allocation size %lu", len);
+
+    obj = vm_object_allocate(OBJT_DEFAULT, len >> PAGE_SHIFT);
+    if (obj != NULL) {
+        error = vm_map_find(&vmspace->vm_map, obj, 0, &gpa, len, 0,
+            VMFS_NO_SPACE, VM_PROT_ALL, VM_PROT_ALL, 0);
+        if (error != KERN_SUCCESS) {
+            vm_object_deallocate(obj);
+            obj = NULL;
+        }
+    }
-
-    if (length != PAGE_SIZE)
-        panic("vmm_mem_free: invalid length %lu", length);
+
+    return (obj);
 }
-
-    m = PHYS_TO_VM_PAGE(base);
-    m->wire_count--;
-    vm_page_free(m);
-    atomic_subtract_int(&cnt.v_wire_count, 1);
+
+void
+vmm_mem_free(struct vmspace *vmspace, vm_paddr_t gpa, size_t len)
+{
-    update_pages_allocated(-1);
+    vm_map_remove(&vmspace->vm_map, gpa, gpa + len);
 }
vm_paddr_t


@@ -29,9 +29,15 @@
#ifndef _VMM_MEM_H_
#define _VMM_MEM_H_
struct vmspace;
struct vm_object;
int vmm_mem_init(void);
-vm_paddr_t vmm_mem_alloc(size_t size);
-void       vmm_mem_free(vm_paddr_t start, size_t size);
+struct vm_object *vmm_mem_alloc(struct vmspace *, vm_paddr_t gpa, size_t size);
+struct vm_object *vmm_mmio_alloc(struct vmspace *, vm_paddr_t gpa, size_t len,
+    vm_paddr_t hpa);
+void       vmm_mem_free(struct vmspace *, vm_paddr_t gpa, size_t size);
+void       vmm_mmio_free(struct vmspace *, vm_paddr_t gpa, size_t size);
vm_paddr_t vmm_mem_maxaddr(void);
#endif


@@ -101,7 +101,7 @@ struct bhyvestats {
uint64_t vmexit_hlt;
uint64_t vmexit_pause;
uint64_t vmexit_mtrap;
-    uint64_t vmexit_paging;
+    uint64_t vmexit_inst_emul;
uint64_t cpu_switch_rotate;
uint64_t cpu_switch_direct;
int io_reset;
@@ -208,14 +208,12 @@ fbsdrun_addcpu(struct vmctx *ctx, int vcpu, uint64_t rip)
vmexit[vcpu].rip = rip;
vmexit[vcpu].inst_length = 0;
-    if (vcpu == BSP) {
-        mt_vmm_info[vcpu].mt_ctx = ctx;
-        mt_vmm_info[vcpu].mt_vcpu = vcpu;
-        error = pthread_create(&mt_vmm_info[vcpu].mt_thr, NULL,
-            fbsdrun_start_thread, &mt_vmm_info[vcpu]);
-        assert(error == 0);
-    }
+    mt_vmm_info[vcpu].mt_ctx = ctx;
+    mt_vmm_info[vcpu].mt_vcpu = vcpu;
+    error = pthread_create(&mt_vmm_info[vcpu].mt_thr, NULL,
+        fbsdrun_start_thread, &mt_vmm_info[vcpu]);
+    assert(error == 0);
}
static int
@@ -385,13 +383,13 @@ vmexit_mtrap(struct vmctx *ctx, struct vm_exit *vmexit, int *pvcpu)
}
static int
-vmexit_paging(struct vmctx *ctx, struct vm_exit *vmexit, int *pvcpu)
+vmexit_inst_emul(struct vmctx *ctx, struct vm_exit *vmexit, int *pvcpu)
 {
     int err;

-    stats.vmexit_paging++;
+    stats.vmexit_inst_emul++;

-    err = emulate_mem(ctx, *pvcpu, vmexit->u.paging.gpa,
-        &vmexit->u.paging.vie);
+    err = emulate_mem(ctx, *pvcpu, vmexit->u.inst_emul.gpa,
+        &vmexit->u.inst_emul.vie);
if (err) {
if (err == EINVAL) {
@@ -400,7 +398,7 @@ vmexit_paging(struct vmctx *ctx, struct vm_exit *vmexit, int *pvcpu)
vmexit->rip);
} else if (err == ESRCH) {
fprintf(stderr, "Unhandled memory access to 0x%lx\n",
-                vmexit->u.paging.gpa);
+                vmexit->u.inst_emul.gpa);
}
return (VMEXIT_ABORT);
@@ -416,7 +414,7 @@ static vmexit_handler_t handler[VM_EXITCODE_MAX] = {
[VM_EXITCODE_RDMSR] = vmexit_rdmsr,
[VM_EXITCODE_WRMSR] = vmexit_wrmsr,
[VM_EXITCODE_MTRAP] = vmexit_mtrap,
-    [VM_EXITCODE_PAGING] = vmexit_paging,
+    [VM_EXITCODE_INST_EMUL] = vmexit_inst_emul,
[VM_EXITCODE_SPINUP_AP] = vmexit_spinup_ap,
};


@@ -1048,7 +1048,7 @@ init_pci(struct vmctx *ctx)
* Accesses to memory addresses that are not allocated to system
* memory or PCI devices return 0xff's.
*/
-    error = vm_get_memory_seg(ctx, 0, &lowmem);
+    error = vm_get_memory_seg(ctx, 0, &lowmem, NULL);
assert(error == 0);
memset(&memp, 0, sizeof(struct mem_range));


@@ -341,14 +341,14 @@ rtc_init(struct vmctx *ctx)
* 0x34/0x35 - 64KB chunks above 16MB, below 4GB
* 0x5b/0x5c/0x5d - 64KB chunks above 4GB
*/
-    err = vm_get_memory_seg(ctx, 0, &lomem);
+    err = vm_get_memory_seg(ctx, 0, &lomem, NULL);
assert(err == 0);
lomem = (lomem - m_16MB) / m_64KB;
rtc_nvram[nvoff(RTC_LMEM_LSB)] = lomem;
rtc_nvram[nvoff(RTC_LMEM_MSB)] = lomem >> 8;
-    if (vm_get_memory_seg(ctx, m_4GB, &himem) == 0) {
+    if (vm_get_memory_seg(ctx, m_4GB, &himem, NULL) == 0) {
himem /= m_64KB;
rtc_nvram[nvoff(RTC_HMEM_LSB)] = himem;
rtc_nvram[nvoff(RTC_HMEM_SB)] = himem >> 8;
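As a worked example of the nvram encoding above (illustrative size, not taken from the diff): a guest with 1 GB below 4 GB stores (1 GB - 16 MB) / 64 KB = 16128 = 0x3f00 chunks, so RTC_LMEM_LSB reads back as 0x00 and RTC_LMEM_MSB as 0x3f.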


@@ -188,12 +188,13 @@ usage(void)
         "       [--unassign-pptdev=<bus/slot/func>]\n"
         "       [--set-mem=<memory in units of MB>]\n"
         "       [--get-lowmem]\n"
-        "       [--get-highmem]\n",
+        "       [--get-highmem]\n"
+        "       [--get-gpa-pmap]\n",
progname);
exit(1);
}
-static int get_stats, getcap, setcap, capval;
+static int get_stats, getcap, setcap, capval, get_gpa_pmap;
static const char *capname;
static int create, destroy, get_lowmem, get_highmem;
static uint64_t memsize;
@@ -377,18 +378,20 @@ enum {
SET_CAP,
CAPNAME,
UNASSIGN_PPTDEV,
GET_GPA_PMAP,
};
int
main(int argc, char *argv[])
{
char *vmname;
-    int error, ch, vcpu;
-    vm_paddr_t gpa;
+    int error, ch, vcpu, ptenum;
+    vm_paddr_t gpa, gpa_pmap;
size_t len;
struct vm_exit vmexit;
-    uint64_t ctl, eptp, bm, addr, u64;
+    uint64_t ctl, eptp, bm, addr, u64, pteval[4], *pte;
struct vmctx *ctx;
int wired;
uint64_t cr0, cr3, cr4, dr7, rsp, rip, rflags, efer, pat;
uint64_t rax, rbx, rcx, rdx, rsi, rdi, rbp;
@@ -427,6 +430,7 @@ main(int argc, char *argv[])
{ "capname", REQ_ARG, 0, CAPNAME },
{ "unassign-pptdev", REQ_ARG, 0, UNASSIGN_PPTDEV },
{ "setcap", REQ_ARG, 0, SET_CAP },
{ "get-gpa-pmap", REQ_ARG, 0, GET_GPA_PMAP },
{ "getcap", NO_ARG, &getcap, 1 },
{ "get-stats", NO_ARG, &get_stats, 1 },
{ "get-desc-ds",NO_ARG, &get_desc_ds, 1 },
@@ -666,6 +670,10 @@ main(int argc, char *argv[])
capval = strtoul(optarg, NULL, 0);
setcap = 1;
break;
case GET_GPA_PMAP:
gpa_pmap = strtoul(optarg, NULL, 0);
get_gpa_pmap = 1;
break;
case CAPNAME:
capname = optarg;
break;
@@ -819,16 +827,18 @@ main(int argc, char *argv[])
if (!error && (get_lowmem || get_all)) {
gpa = 0;
-        error = vm_get_memory_seg(ctx, gpa, &len);
+        error = vm_get_memory_seg(ctx, gpa, &len, &wired);
         if (error == 0)
-            printf("lowmem\t\t0x%016lx/%ld\n", gpa, len);
+            printf("lowmem\t\t0x%016lx/%ld%s\n", gpa, len,
+                wired ? " wired" : "");
}
if (!error && (get_highmem || get_all)) {
gpa = 4 * GB;
-        error = vm_get_memory_seg(ctx, gpa, &len);
+        error = vm_get_memory_seg(ctx, gpa, &len, &wired);
         if (error == 0)
-            printf("highmem\t\t0x%016lx/%ld\n", gpa, len);
+            printf("highmem\t\t0x%016lx/%ld%s\n", gpa, len,
+                wired ? " wired" : "");
}
if (!error && (get_efer || get_all)) {
@@ -1457,6 +1467,17 @@ main(int argc, char *argv[])
printf("Capability \"%s\" is not available\n", capname);
}
if (!error && get_gpa_pmap) {
error = vm_get_gpa_pmap(ctx, gpa_pmap, pteval, &ptenum);
if (error == 0) {
printf("gpa %#lx:", gpa_pmap);
pte = &pteval[0];
while (ptenum-- > 0)
printf(" %#lx", *pte++);
printf("\n");
}
}
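A small usage sketch of the new interface from a libvmmapi client (the VM name and gpa below are made up; vm_get_gpa_pmap() is the wrapper used above and vm_open() is assumed to be the usual context constructor):

    struct vmctx *ctx;
    uint64_t pte[4];
    int i, num;

    ctx = vm_open("testvm");    /* hypothetical VM name */
    if (ctx != NULL && vm_get_gpa_pmap(ctx, 0x100000, pte, &num) == 0) {
        /* print one entry per level of the nested page table walk */
        for (i = 0; i < num; i++)
            printf("level %d pte %#lx\n", i, pte[i]);
    }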
if (!error && (getcap || get_all)) {
int captype, val, getcaptype;


@@ -492,8 +492,8 @@ static void
cb_getmem(void *arg, uint64_t *ret_lowmem, uint64_t *ret_highmem)
{
-    vm_get_memory_seg(ctx, 0, ret_lowmem);
-    vm_get_memory_seg(ctx, 4 * GB, ret_highmem);
+    vm_get_memory_seg(ctx, 0, ret_lowmem, NULL);
+    vm_get_memory_seg(ctx, 4 * GB, ret_highmem, NULL);
}
static const char *