Merge projects/bhyve_npt_pmap into head.

Make the amd64/pmap code aware of nested page table mappings used by bhyve
guests. This allows bhyve to associate each guest with its own vmspace and
to handle nested page faults in the context of that vmspace. It also enables
features like accessed/dirty bit tracking, swapping to disk, and transparent
superpage promotion of guest memory.

Guest vmspace:
Each bhyve guest has a unique vmspace that represents the physical memory
allocated to it. Each memory segment allocated for the guest is mapped into
the guest's address space via 'vmspace->vm_map' and is backed by an object
of type OBJT_DEFAULT.
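
As a rough illustration (a minimal sketch, not the actual vmm_mem.c code from
this change; the helper name and error handling are hypothetical and the
vm_map_find() argument list is approximate for this era):

    /*
     * Hypothetical sketch: back a guest memory segment with an OBJT_DEFAULT
     * object and map it at guest physical address 'gpa' in the guest vmspace.
     */
    static vm_object_t
    guest_seg_alloc(struct vmspace *vmspace, vm_paddr_t gpa, size_t len)
    {
            vm_object_t obj;
            vm_offset_t addr = gpa;

            obj = vm_object_allocate(OBJT_DEFAULT, OFF_TO_IDX(len));
            if (obj == NULL)
                    return (NULL);

            /* Map the object at the fixed address 'gpa' in vmspace->vm_map. */
            if (vm_map_find(&vmspace->vm_map, obj, 0, &addr, len,
                VMFS_NO_SPACE, VM_PROT_ALL, VM_PROT_ALL, 0) != KERN_SUCCESS) {
                    vm_object_deallocate(obj);
                    return (NULL);
            }
            return (obj);
    }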

pmap types:
The amd64/pmap code now understands two types of pmaps: PT_X86 and PT_EPT.

The PT_X86 pmap type is used by the vmspace associated with the host kernel
as well as user processes executing on the host. The PT_EPT pmap is used by
the vmspace associated with a bhyve guest.

Page Table Entries:
EPT page table entries are mostly similar in functionality to regular page
table entries, although there are some differences in which bits are used to
express that functionality. For example, the dirty bit is represented by
bit 9 in a nested PTE as opposed to bit 6 in a regular x86 PTE. Therefore
the bitmask representing the dirty bit is now computed at runtime based on
the type of the pmap. Thus PG_M, previously a macro, becomes a local
variable that is initialized at runtime using 'pmap_modified_bit(pmap)'.
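
A sketch of how that helper looks (the actual routine is in the amd64/pmap.c
diff, which is suppressed below because of its size; the exact shape here is
an approximation):

    static __inline pt_entry_t
    pmap_modified_bit(pmap_t pmap)
    {
            pt_entry_t mask;

            switch (pmap->pm_type) {
            case PT_X86:
                    mask = X86_PG_M;        /* bit 6 in a regular x86 PTE */
                    break;
            case PT_EPT:
                    if (pmap->pm_flags & PMAP_EMULATE_AD_BITS)
                            mask = EPT_PG_EMUL_RW;  /* software dirty bit */
                    else
                            mask = EPT_PG_M;        /* bit 9 in a nested PTE */
                    break;
            default:
                    panic("pmap_modified_bit: invalid pm_type %d",
                        pmap->pm_type);
            }
            return (mask);
    }

Callers then do 'pt_entry_t PG_M = pmap_modified_bit(pmap);' near the top of
any function that needs to test or set the dirty bit.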

An additional wrinkle associated with EPT mappings is that older Intel
processors don't have hardware support for tracking accessed/dirty bits in
the PTE. This means that the amd64/pmap code needs to emulate these bits to
provide proper accounting to the VM subsystem. This is achieved by using
the following mapping for EPT entries that need emulation of A/D bits:
               Bit Position           Interpreted By
PG_V               52                 software (accessed bit emulation handler)
PG_RW              53                 software (dirty bit emulation handler)
PG_A               0                  hardware (aka EPT_PG_RD)
PG_M               1                  hardware (aka EPT_PG_WR)

The idea to use the mapping listed above for A/D bit emulation came from
Alan Cox (alc@).
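
The corresponding definitions appear in the amd64/include/pmap.h hunk in this
diff. The effect of the layout above is that EPT's hardware read and write
permission bits double as the emulated accessed and dirty bits: a page not
yet marked accessed is mapped without EPT read permission, so the first guest
access faults into the accessed-bit handler, which sets PG_A (EPT_PG_READ);
likewise the first write faults into the dirty-bit handler, which sets PG_M
(EPT_PG_WRITE).

    #define EPT_PG_READ     0x001                   /* R    Read  */
    #define EPT_PG_WRITE    0x002                   /* W    Write */
    #define X86_PG_AVAIL(x) (1ul << (x))
    #define EPT_PG_EMUL_V   X86_PG_AVAIL(52)        /* software PG_V  */
    #define EPT_PG_EMUL_RW  X86_PG_AVAIL(53)        /* software PG_RW */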

The final difference with respect to x86 PTEs is that some EPT implementations
do not support superpage mappings. This is recorded in the 'pm_flags' field
of the pmap.
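
For reference, the ept_init() hunk later in this diff derives these pmap
flags from the EPT capability MSR, with tunables to force the fallback
behavior:

    use_superpages = 1;
    TUNABLE_INT_FETCH("hw.vmm.ept.use_superpages", &use_superpages);
    if (use_superpages && EPT_PDE_SUPERPAGE(cap))
            ept_pmap_flags |= PMAP_PDE_SUPERPAGE;   /* 2MB superpages ok */

    use_hw_ad_bits = 1;
    TUNABLE_INT_FETCH("hw.vmm.ept.use_hw_ad_bits", &use_hw_ad_bits);
    if (use_hw_ad_bits && AD_BITS_SUPPORTED(cap))
            ept_enable_ad_bits = 1;
    else
            ept_pmap_flags |= PMAP_EMULATE_AD_BITS; /* emulate A/D bits */

    use_exec_only = 1;
    TUNABLE_INT_FETCH("hw.vmm.ept.use_exec_only", &use_exec_only);
    if (use_exec_only && EPT_SUPPORTS_EXEC_ONLY(cap))
            ept_pmap_flags |= PMAP_SUPPORTS_EXEC_ONLY;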

TLB invalidation:
The amd64/pmap code has a number of ways to invalidate mappings that may be
cached in the TLB: a single page, multiple pages in a range, or the entire
TLB. All of these funnel into a single EPT invalidation routine called
'pmap_invalidate_ept()'. This routine bumps the EPT generation number and
sends an IPI to the host cpus that are executing the guest's vcpus. On a
subsequent entry into the guest, each vcpu detects that the EPT generation
has changed and invalidates the stale mappings from its TLB.
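
A sketch of that routine (the actual implementation is in the suppressed
amd64/pmap.c diff; the IPI used here is an assumption):

    static void
    pmap_invalidate_ept(pmap_t pmap)
    {
            /* Bump the generation number; each vcpu caches its own copy. */
            atomic_add_acq_long(&pmap->pm_eptgen, 1);

            /*
             * Kick the host cpus currently running this guest's vcpus. The
             * forced VM exit causes them to re-enter the guest, at which
             * point VMX_CHECK_EPTGEN (see vmx_support.S below) notices the
             * stale cached generation and issues a single-context 'invept'.
             */
            ipi_selected(pmap->pm_active, IPI_AST);
    }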

Guest memory access:
Since guest memory is no longer wired, we need to hold the host physical
page that backs a guest physical page before we can access it. The helper
functions 'vm_gpa_hold()/vm_gpa_release()' are available for this purpose.
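
A hypothetical caller looks something like this (the buffer and length are
arbitrary; the length must not cross a page boundary since vm_gpa_hold()
operates on a single page):

    void *cookie, *hva;
    uint8_t buf[16];

    hva = vm_gpa_hold(vm, gpa, sizeof(buf), VM_PROT_READ, &cookie);
    if (hva != NULL) {
            bcopy(hva, buf, sizeof(buf));   /* page is held, safe to touch */
            vm_gpa_release(cookie);         /* drop hold; page may be paged out */
    }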

PCI passthru:
Guests with PCI passthru devices will wire the entire guest physical address
space. The MMIO BAR associated with the passthru device is backed by a
vm_object of type OBJT_SG. An IOMMU domain is created only for guests that
have one or more PCI passthru devices attached to them.
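
The relevant logic, excerpted from the vm_assign_pptdev() hunk in vmm.c
below: attaching the first passthru device to a VM triggers the wiring and
the IOMMU domain creation.

    if (ppt_num_devices(vm) == 0) {
            KASSERT(vm->iommu == NULL,
                ("vm_assign_pptdev: iommu must be NULL"));
            maxaddr = vmm_mem_maxaddr();
            vm->iommu = iommu_create_domain(maxaddr);

            error = vm_gpa_wire(vm);        /* wire every guest memory segment */
            if (error)
                    return (error);

            vm_iommu_map(vm);               /* program gpa -> hpa translations */
    }
    error = ppt_assign_device(vm, bus, slot, func);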

Limitations:
There isn't a way to map a guest physical page without execute permission.
This is because the amd64/pmap code interprets guest physical mappings as
user mappings, since they are numerically below VM_MAXUSER_ADDRESS. Since
PG_U shares the same bit position as EPT_PG_EXECUTE, all guest mappings
become automatically executable.
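
The colliding definitions, from the pmap.h hunk in this diff:

    #define X86_PG_U        0x004   /* U/S  User/Supervisor */
    #define EPT_PG_EXECUTE  0x004   /* X    Execute */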

Thanks to Alan Cox and Konstantin Belousov for their rigorous code reviews
as well as their support and encouragement.

Thanks to John Baldwin for reviewing the use of OBJT_SG as the backing
object for PCI passthru MMIO regions.

Special thanks to Peter Holm for testing the patch on short notice.

Approved by:	re
Discussed with:	grehan
Reviewed by:	alc, kib
Tested by:	pho
Author:	Neel Natu
Date:	2013-10-05 21:22:35 +00:00
Commit:	318224bbe6 (parent bf57e9793a)
Notes:	svn2git 2020-12-20 02:59:44 +00:00
	svn path=/head/; revision=256072

30 changed files with 2137 additions and 886 deletions


@ -124,7 +124,8 @@ vm_destroy(struct vmctx *vm)
}
int
vm_get_memory_seg(struct vmctx *ctx, vm_paddr_t gpa, size_t *ret_len)
vm_get_memory_seg(struct vmctx *ctx, vm_paddr_t gpa, size_t *ret_len,
int *wired)
{
int error;
struct vm_memory_segment seg;
@ -133,6 +134,8 @@ vm_get_memory_seg(struct vmctx *ctx, vm_paddr_t gpa, size_t *ret_len)
seg.gpa = gpa;
error = ioctl(ctx->fd, VM_GET_MEMORY_SEG, &seg);
*ret_len = seg.len;
if (wired != NULL)
*wired = seg.wired;
return (error);
}
@ -741,3 +744,23 @@ vcpu_reset(struct vmctx *vmctx, int vcpu)
done:
return (error);
}
int
vm_get_gpa_pmap(struct vmctx *ctx, uint64_t gpa, uint64_t *pte, int *num)
{
int error, i;
struct vm_gpa_pte gpapte;
bzero(&gpapte, sizeof(gpapte));
gpapte.gpa = gpa;
error = ioctl(ctx->fd, VM_GET_GPA_PMAP, &gpapte);
if (error == 0) {
*num = gpapte.ptenum;
for (i = 0; i < gpapte.ptenum; i++)
pte[i] = gpapte.pte[i];
}
return (error);
}


@ -45,9 +45,11 @@ enum vm_mmap_style {
int vm_create(const char *name);
struct vmctx *vm_open(const char *name);
void vm_destroy(struct vmctx *ctx);
int vm_get_memory_seg(struct vmctx *ctx, vm_paddr_t gpa, size_t *ret_len);
int vm_get_memory_seg(struct vmctx *ctx, vm_paddr_t gpa, size_t *ret_len,
int *wired);
int vm_setup_memory(struct vmctx *ctx, size_t len, enum vm_mmap_style s);
void *vm_map_gpa(struct vmctx *ctx, vm_paddr_t gaddr, size_t len);
int vm_get_gpa_pmap(struct vmctx *, uint64_t gpa, uint64_t *pte, int *num);
uint32_t vm_get_lowmem_limit(struct vmctx *ctx);
void vm_set_lowmem_limit(struct vmctx *ctx, uint32_t limit);
int vm_set_desc(struct vmctx *ctx, int vcpu, int reg,


@ -1574,7 +1574,7 @@ getmemsize(caddr_t kmdp, u_int64_t first)
/*
* map page into kernel: valid, read/write,non-cacheable
*/
*pte = pa | PG_V | PG_RW | PG_N;
*pte = pa | PG_V | PG_RW | PG_NC_PWT | PG_NC_PCD;
invltlb();
tmp = *(int *)ptr;

[File diff suppressed because it is too large]


@ -733,6 +733,14 @@ trap_pfault(frame, usermode)
}
}
/*
* If the trap was caused by errant bits in the PTE then panic.
*/
if (frame->tf_err & PGEX_RSV) {
trap_fatal(frame, eva);
return (-1);
}
/*
* PGEX_I is defined only if the execute disable bit capability is
* supported and enabled.
@ -822,10 +830,11 @@ trap_fatal(frame, eva)
#endif
if (type == T_PAGEFLT) {
printf("fault virtual address = 0x%lx\n", eva);
printf("fault code = %s %s %s, %s\n",
printf("fault code = %s %s %s%s, %s\n",
code & PGEX_U ? "user" : "supervisor",
code & PGEX_W ? "write" : "read",
code & PGEX_I ? "instruction" : "data",
code & PGEX_RSV ? " rsv" : "",
code & PGEX_P ? "protection violation" : "page not present");
}
printf("instruction pointer = 0x%lx:0x%lx\n",


@ -50,41 +50,74 @@
* of the fields not present here and there, depending on a lot of things.
*/
/* ---- Intel Nomenclature ---- */
#define PG_V 0x001 /* P Valid */
#define PG_RW 0x002 /* R/W Read/Write */
#define PG_U 0x004 /* U/S User/Supervisor */
#define PG_NC_PWT 0x008 /* PWT Write through */
#define PG_NC_PCD 0x010 /* PCD Cache disable */
#define PG_A 0x020 /* A Accessed */
#define PG_M 0x040 /* D Dirty */
#define PG_PS 0x080 /* PS Page size (0=4k,1=2M) */
#define PG_PTE_PAT 0x080 /* PAT PAT index */
#define PG_G 0x100 /* G Global */
#define PG_AVAIL1 0x200 /* / Available for system */
#define PG_AVAIL2 0x400 /* < programmers use */
#define PG_AVAIL3 0x800 /* \ */
#define PG_PDE_PAT 0x1000 /* PAT PAT index */
#define PG_NX (1ul<<63) /* No-execute */
/* Our various interpretations of the above */
#define PG_W PG_AVAIL1 /* "Wired" pseudoflag */
#define PG_MANAGED PG_AVAIL2
#define PG_FRAME (0x000ffffffffff000ul)
#define PG_PS_FRAME (0x000fffffffe00000ul)
#define PG_PROT (PG_RW|PG_U) /* all protection bits . */
#define PG_N (PG_NC_PWT|PG_NC_PCD) /* Non-cacheable */
#define X86_PG_V 0x001 /* P Valid */
#define X86_PG_RW 0x002 /* R/W Read/Write */
#define X86_PG_U 0x004 /* U/S User/Supervisor */
#define X86_PG_NC_PWT 0x008 /* PWT Write through */
#define X86_PG_NC_PCD 0x010 /* PCD Cache disable */
#define X86_PG_A 0x020 /* A Accessed */
#define X86_PG_M 0x040 /* D Dirty */
#define X86_PG_PS 0x080 /* PS Page size (0=4k,1=2M) */
#define X86_PG_PTE_PAT 0x080 /* PAT PAT index */
#define X86_PG_G 0x100 /* G Global */
#define X86_PG_AVAIL1 0x200 /* / Available for system */
#define X86_PG_AVAIL2 0x400 /* < programmers use */
#define X86_PG_AVAIL3 0x800 /* \ */
#define X86_PG_PDE_PAT 0x1000 /* PAT PAT index */
#define X86_PG_NX (1ul<<63) /* No-execute */
#define X86_PG_AVAIL(x) (1ul << (x))
/* Page level cache control fields used to determine the PAT type */
#define PG_PDE_CACHE (PG_PDE_PAT | PG_NC_PWT | PG_NC_PCD)
#define PG_PTE_CACHE (PG_PTE_PAT | PG_NC_PWT | PG_NC_PCD)
#define X86_PG_PDE_CACHE (X86_PG_PDE_PAT | X86_PG_NC_PWT | X86_PG_NC_PCD)
#define X86_PG_PTE_CACHE (X86_PG_PTE_PAT | X86_PG_NC_PWT | X86_PG_NC_PCD)
/*
* Intel extended page table (EPT) bit definitions.
*/
#define EPT_PG_READ 0x001 /* R Read */
#define EPT_PG_WRITE 0x002 /* W Write */
#define EPT_PG_EXECUTE 0x004 /* X Execute */
#define EPT_PG_IGNORE_PAT 0x040 /* IPAT Ignore PAT */
#define EPT_PG_PS 0x080 /* PS Page size */
#define EPT_PG_A 0x100 /* A Accessed */
#define EPT_PG_M 0x200 /* D Dirty */
#define EPT_PG_MEMORY_TYPE(x) ((x) << 3) /* MT Memory Type */
/*
* Define the PG_xx macros in terms of the bits on x86 PTEs.
*/
#define PG_V X86_PG_V
#define PG_RW X86_PG_RW
#define PG_U X86_PG_U
#define PG_NC_PWT X86_PG_NC_PWT
#define PG_NC_PCD X86_PG_NC_PCD
#define PG_A X86_PG_A
#define PG_M X86_PG_M
#define PG_PS X86_PG_PS
#define PG_PTE_PAT X86_PG_PTE_PAT
#define PG_G X86_PG_G
#define PG_AVAIL1 X86_PG_AVAIL1
#define PG_AVAIL2 X86_PG_AVAIL2
#define PG_AVAIL3 X86_PG_AVAIL3
#define PG_PDE_PAT X86_PG_PDE_PAT
#define PG_NX X86_PG_NX
#define PG_PDE_CACHE X86_PG_PDE_CACHE
#define PG_PTE_CACHE X86_PG_PTE_CACHE
/* Our various interpretations of the above */
#define PG_W X86_PG_AVAIL3 /* "Wired" pseudoflag */
#define PG_MANAGED X86_PG_AVAIL2
#define EPT_PG_EMUL_V X86_PG_AVAIL(52)
#define EPT_PG_EMUL_RW X86_PG_AVAIL(53)
#define PG_FRAME (0x000ffffffffff000ul)
#define PG_PS_FRAME (0x000fffffffe00000ul)
/*
* Promotion to a 2MB (PDE) page mapping requires that the corresponding 4KB
* (PTE) page mappings have identical settings for the following fields:
*/
#define PG_PTE_PROMOTE (PG_NX | PG_MANAGED | PG_W | PG_G | PG_PTE_PAT | \
PG_M | PG_A | PG_NC_PCD | PG_NC_PWT | PG_U | PG_RW | PG_V)
#define PG_PTE_PROMOTE (PG_NX | PG_MANAGED | PG_W | PG_G | PG_PTE_CACHE | \
PG_M | PG_A | PG_U | PG_RW | PG_V)
/*
* Page Protection Exception bits
@ -96,6 +129,28 @@
#define PGEX_RSV 0x08 /* reserved PTE field is non-zero */
#define PGEX_I 0x10 /* during an instruction fetch */
/*
* undef the PG_xx macros that define bits in the regular x86 PTEs that
* have a different position in nested PTEs. This is done when compiling
* code that needs to be aware of the differences between regular x86 and
* nested PTEs.
*
* The appropriate bitmask will be calculated at runtime based on the pmap
* type.
*/
#ifdef AMD64_NPT_AWARE
#undef PG_AVAIL1 /* X86_PG_AVAIL1 aliases with EPT_PG_M */
#undef PG_G
#undef PG_A
#undef PG_M
#undef PG_PDE_PAT
#undef PG_PDE_CACHE
#undef PG_PTE_PAT
#undef PG_PTE_CACHE
#undef PG_RW
#undef PG_V
#endif
/*
* Pte related macros. This is complicated by having to deal with
* the sign extension of the 48th bit.
@ -256,6 +311,11 @@ struct pmap {
int pm_flags;
};
/* flags */
#define PMAP_PDE_SUPERPAGE (1 << 0) /* supports 2MB superpages */
#define PMAP_EMULATE_AD_BITS (1 << 1) /* needs A/D bits emulation */
#define PMAP_SUPPORTS_EXEC_ONLY (1 << 2) /* execute only mappings ok */
typedef struct pmap *pmap_t;
#ifdef _KERNEL
@ -272,6 +332,9 @@ extern struct pmap kernel_pmap_store;
#define PMAP_MTX(pmap) (&(pmap)->pm_mtx)
#define PMAP_TRYLOCK(pmap) mtx_trylock(&(pmap)->pm_mtx)
#define PMAP_UNLOCK(pmap) mtx_unlock(&(pmap)->pm_mtx)
int pmap_pinit_type(pmap_t pmap, enum pmap_type pm_type, int flags);
int pmap_emulate_accessed_dirty(pmap_t pmap, vm_offset_t va, int ftype);
#endif
/*
@ -330,7 +393,7 @@ void pmap_invalidate_all(pmap_t);
void pmap_invalidate_cache(void);
void pmap_invalidate_cache_pages(vm_page_t *pages, int count);
void pmap_invalidate_cache_range(vm_offset_t sva, vm_offset_t eva);
void pmap_get_mapping(pmap_t pmap, vm_offset_t va, uint64_t *ptr, int *num);
#endif /* _KERNEL */
#endif /* !LOCORE */


@ -39,19 +39,18 @@ struct seg_desc;
struct vm_exit;
struct vm_run;
struct vlapic;
struct vmspace;
struct vm_object;
struct pmap;
enum x2apic_state;
typedef int (*vmm_init_func_t)(void);
typedef int (*vmm_cleanup_func_t)(void);
typedef void * (*vmi_init_func_t)(struct vm *vm); /* instance specific apis */
typedef int (*vmi_run_func_t)(void *vmi, int vcpu, register_t rip);
typedef void * (*vmi_init_func_t)(struct vm *vm, struct pmap *pmap);
typedef int (*vmi_run_func_t)(void *vmi, int vcpu, register_t rip,
struct pmap *pmap);
typedef void (*vmi_cleanup_func_t)(void *vmi);
typedef int (*vmi_mmap_set_func_t)(void *vmi, vm_paddr_t gpa,
vm_paddr_t hpa, size_t length,
vm_memattr_t attr, int prot,
boolean_t superpages_ok);
typedef vm_paddr_t (*vmi_mmap_get_func_t)(void *vmi, vm_paddr_t gpa);
typedef int (*vmi_get_register_t)(void *vmi, int vcpu, int num,
uint64_t *retval);
typedef int (*vmi_set_register_t)(void *vmi, int vcpu, int num,
@ -65,6 +64,8 @@ typedef int (*vmi_inject_event_t)(void *vmi, int vcpu,
uint32_t code, int code_valid);
typedef int (*vmi_get_cap_t)(void *vmi, int vcpu, int num, int *retval);
typedef int (*vmi_set_cap_t)(void *vmi, int vcpu, int num, int val);
typedef struct vmspace * (*vmi_vmspace_alloc)(vm_offset_t min, vm_offset_t max);
typedef void (*vmi_vmspace_free)(struct vmspace *vmspace);
struct vmm_ops {
vmm_init_func_t init; /* module wide initialization */
@ -73,8 +74,6 @@ struct vmm_ops {
vmi_init_func_t vminit; /* vm-specific initialization */
vmi_run_func_t vmrun;
vmi_cleanup_func_t vmcleanup;
vmi_mmap_set_func_t vmmmap_set;
vmi_mmap_get_func_t vmmmap_get;
vmi_get_register_t vmgetreg;
vmi_set_register_t vmsetreg;
vmi_get_desc_t vmgetdesc;
@ -82,6 +81,8 @@ struct vmm_ops {
vmi_inject_event_t vminject;
vmi_get_cap_t vmgetcap;
vmi_set_cap_t vmsetcap;
vmi_vmspace_alloc vmspace_alloc;
vmi_vmspace_free vmspace_free;
};
extern struct vmm_ops vmm_ops_intel;
@ -93,9 +94,14 @@ const char *vm_name(struct vm *vm);
int vm_malloc(struct vm *vm, vm_paddr_t gpa, size_t len);
int vm_map_mmio(struct vm *vm, vm_paddr_t gpa, size_t len, vm_paddr_t hpa);
int vm_unmap_mmio(struct vm *vm, vm_paddr_t gpa, size_t len);
vm_paddr_t vm_gpa2hpa(struct vm *vm, vm_paddr_t gpa, size_t size);
void *vm_gpa_hold(struct vm *, vm_paddr_t gpa, size_t len, int prot,
void **cookie);
void vm_gpa_release(void *cookie);
int vm_gpabase2memseg(struct vm *vm, vm_paddr_t gpabase,
struct vm_memory_segment *seg);
int vm_get_memobj(struct vm *vm, vm_paddr_t gpa, size_t len,
vm_offset_t *offset, struct vm_object **object);
boolean_t vm_mem_allocated(struct vm *vm, vm_paddr_t gpa);
int vm_get_register(struct vm *vm, int vcpu, int reg, uint64_t *retval);
int vm_set_register(struct vm *vm, int vcpu, int reg, uint64_t val);
int vm_get_seg_desc(struct vm *vm, int vcpu, int reg,
@ -130,8 +136,9 @@ void *vm_iommu_domain(struct vm *vm);
enum vcpu_state {
VCPU_IDLE,
VCPU_FROZEN,
VCPU_RUNNING,
VCPU_CANNOT_RUN,
VCPU_SLEEPING,
};
int vcpu_set_state(struct vm *vm, int vcpu, enum vcpu_state state);
@ -145,7 +152,9 @@ vcpu_is_running(struct vm *vm, int vcpu, int *hostcpu)
void *vcpu_stats(struct vm *vm, int vcpu);
void vm_interrupt_hostcpu(struct vm *vm, int vcpu);
struct vmspace *vm_get_vmspace(struct vm *vm);
int vm_assign_pptdev(struct vm *vm, int bus, int slot, int func);
int vm_unassign_pptdev(struct vm *vm, int bus, int slot, int func);
#endif /* KERNEL */
#include <machine/vmm_instruction_emul.h>
@ -247,6 +256,7 @@ enum vm_exitcode {
VM_EXITCODE_MTRAP,
VM_EXITCODE_PAUSE,
VM_EXITCODE_PAGING,
VM_EXITCODE_INST_EMUL,
VM_EXITCODE_SPINUP_AP,
VM_EXITCODE_MAX
};
@ -266,8 +276,15 @@ struct vm_exit {
} inout;
struct {
uint64_t gpa;
struct vie vie;
int fault_type;
int protection;
} paging;
struct {
uint64_t gpa;
uint64_t gla;
uint64_t cr3;
struct vie vie;
} inst_emul;
/*
* VMX specific payload. Used when there is no "better"
* exitcode to represent the VM-exit.


@ -36,7 +36,8 @@ int vmmdev_cleanup(void);
struct vm_memory_segment {
vm_paddr_t gpa; /* in */
size_t len; /* in */
size_t len;
int wired;
};
struct vm_register {
@ -135,6 +136,12 @@ struct vm_x2apic {
enum x2apic_state state;
};
struct vm_gpa_pte {
uint64_t gpa; /* in */
uint64_t pte[4]; /* out */
int ptenum;
};
enum {
/* general routines */
IOCNUM_ABIVERS = 0,
@ -145,6 +152,7 @@ enum {
/* memory apis */
IOCNUM_MAP_MEMORY = 10,
IOCNUM_GET_MEMORY_SEG = 11,
IOCNUM_GET_GPA_PMAP = 12,
/* register/state accessors */
IOCNUM_SET_REGISTER = 20,
@ -215,4 +223,6 @@ enum {
_IOW('v', IOCNUM_SET_X2APIC_STATE, struct vm_x2apic)
#define VM_GET_X2APIC_STATE \
_IOWR('v', IOCNUM_GET_X2APIC_STATE, struct vm_x2apic)
#define VM_GET_GPA_PMAP \
_IOWR('v', IOCNUM_GET_GPA_PMAP, struct vm_gpa_pte)
#endif


@ -102,11 +102,15 @@ int vmm_emulate_instruction(void *vm, int cpuid, uint64_t gpa, struct vie *vie,
#ifdef _KERNEL
/*
* APIs to fetch and decode the instruction from nested page fault handler.
*
* 'vie' must be initialized before calling 'vmm_fetch_instruction()'
*/
int vmm_fetch_instruction(struct vm *vm, int cpuid,
uint64_t rip, int inst_length, uint64_t cr3,
struct vie *vie);
void vie_init(struct vie *vie);
/*
* Decode the instruction fetched into 'vie' so it can be emulated.
*


@ -54,7 +54,7 @@ amdv_cleanup(void)
}
static void *
amdv_vminit(struct vm *vm)
amdv_vminit(struct vm *vm, struct pmap *pmap)
{
printf("amdv_vminit: not implemented\n");
@ -62,7 +62,7 @@ amdv_vminit(struct vm *vm)
}
static int
amdv_vmrun(void *arg, int vcpu, register_t rip)
amdv_vmrun(void *arg, int vcpu, register_t rip, struct pmap *pmap)
{
printf("amdv_vmrun: not implemented\n");
@ -77,23 +77,6 @@ amdv_vmcleanup(void *arg)
return;
}
static int
amdv_vmmmap_set(void *arg, vm_paddr_t gpa, vm_paddr_t hpa, size_t length,
vm_memattr_t attr, int prot, boolean_t spok)
{
printf("amdv_vmmmap_set: not implemented\n");
return (EINVAL);
}
static vm_paddr_t
amdv_vmmmap_get(void *arg, vm_paddr_t gpa)
{
printf("amdv_vmmmap_get: not implemented\n");
return (EINVAL);
}
static int
amdv_getreg(void *arg, int vcpu, int regnum, uint64_t *retval)
{
@ -151,21 +134,37 @@ amdv_setcap(void *arg, int vcpu, int type, int val)
return (EINVAL);
}
static struct vmspace *
amdv_vmspace_alloc(vm_offset_t min, vm_offset_t max)
{
printf("amdv_vmspace_alloc: not implemented\n");
return (NULL);
}
static void
amdv_vmspace_free(struct vmspace *vmspace)
{
printf("amdv_vmspace_free: not implemented\n");
return;
}
struct vmm_ops vmm_ops_amd = {
amdv_init,
amdv_cleanup,
amdv_vminit,
amdv_vmrun,
amdv_vmcleanup,
amdv_vmmmap_set,
amdv_vmmmap_get,
amdv_getreg,
amdv_setreg,
amdv_getdesc,
amdv_setdesc,
amdv_inject_event,
amdv_getcap,
amdv_setcap
amdv_setcap,
amdv_vmspace_alloc,
amdv_vmspace_free,
};
static int


@ -29,32 +29,31 @@
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
#include <sys/param.h>
#include <sys/kernel.h>
#include <sys/types.h>
#include <sys/errno.h>
#include <sys/systm.h>
#include <sys/malloc.h>
#include <sys/smp.h>
#include <sys/sysctl.h>
#include <vm/vm.h>
#include <vm/pmap.h>
#include <machine/param.h>
#include <machine/cpufunc.h>
#include <machine/pmap.h>
#include <machine/vmparam.h>
#include <vm/vm_extern.h>
#include <machine/vmm.h>
#include "vmx_cpufunc.h"
#include "vmx_msr.h"
#include "vmx.h"
#include "ept.h"
#define EPT_SUPPORTS_EXEC_ONLY(cap) ((cap) & (1UL << 0))
#define EPT_PWL4(cap) ((cap) & (1UL << 6))
#define EPT_MEMORY_TYPE_WB(cap) ((cap) & (1UL << 14))
#define EPT_PDE_SUPERPAGE(cap) ((cap) & (1UL << 16)) /* 2MB pages */
#define EPT_PDPTE_SUPERPAGE(cap) ((cap) & (1UL << 17)) /* 1GB pages */
#define INVVPID_SUPPORTED(cap) ((cap) & (1UL << 32))
#define INVEPT_SUPPORTED(cap) ((cap) & (1UL << 20))
#define AD_BITS_SUPPORTED(cap) ((cap) & (1UL << 21))
#define INVVPID_SUPPORTED(cap) ((cap) & (1UL << 32))
#define INVVPID_ALL_TYPES_MASK 0xF0000000000UL
#define INVVPID_ALL_TYPES_SUPPORTED(cap) \
@ -64,28 +63,22 @@ __FBSDID("$FreeBSD$");
#define INVEPT_ALL_TYPES_SUPPORTED(cap) \
(((cap) & INVEPT_ALL_TYPES_MASK) == INVEPT_ALL_TYPES_MASK)
#define EPT_PG_RD (1 << 0)
#define EPT_PG_WR (1 << 1)
#define EPT_PG_EX (1 << 2)
#define EPT_PG_MEMORY_TYPE(x) ((x) << 3)
#define EPT_PG_IGNORE_PAT (1 << 6)
#define EPT_PG_SUPERPAGE (1 << 7)
#define EPT_PWLEVELS 4 /* page walk levels */
#define EPT_ENABLE_AD_BITS (1 << 6)
#define EPT_ADDR_MASK ((uint64_t)-1 << 12)
SYSCTL_DECL(_hw_vmm);
SYSCTL_NODE(_hw_vmm, OID_AUTO, ept, CTLFLAG_RW, NULL, NULL);
MALLOC_DECLARE(M_VMX);
static int ept_enable_ad_bits;
static uint64_t page_sizes_mask;
/*
* Set this to 1 to have the EPT tables respect the guest PAT settings
*/
static int ept_pat_passthru;
static int ept_pmap_flags;
SYSCTL_INT(_hw_vmm_ept, OID_AUTO, pmap_flags, CTLFLAG_RD,
&ept_pmap_flags, 0, NULL);
int
ept_init(void)
{
int page_shift;
int use_hw_ad_bits, use_superpages, use_exec_only;
uint64_t cap;
cap = rdmsr(MSR_VMX_EPT_VPID_CAP);
@ -105,17 +98,22 @@ ept_init(void)
!INVEPT_ALL_TYPES_SUPPORTED(cap))
return (EINVAL);
/* Set bits in 'page_sizes_mask' for each valid page size */
page_shift = PAGE_SHIFT;
page_sizes_mask = 1UL << page_shift; /* 4KB page */
use_superpages = 1;
TUNABLE_INT_FETCH("hw.vmm.ept.use_superpages", &use_superpages);
if (use_superpages && EPT_PDE_SUPERPAGE(cap))
ept_pmap_flags |= PMAP_PDE_SUPERPAGE; /* 2MB superpage */
page_shift += 9;
if (EPT_PDE_SUPERPAGE(cap))
page_sizes_mask |= 1UL << page_shift; /* 2MB superpage */
use_hw_ad_bits = 1;
TUNABLE_INT_FETCH("hw.vmm.ept.use_hw_ad_bits", &use_hw_ad_bits);
if (use_hw_ad_bits && AD_BITS_SUPPORTED(cap))
ept_enable_ad_bits = 1;
else
ept_pmap_flags |= PMAP_EMULATE_AD_BITS;
page_shift += 9;
if (EPT_PDPTE_SUPERPAGE(cap))
page_sizes_mask |= 1UL << page_shift; /* 1GB superpage */
use_exec_only = 1;
TUNABLE_INT_FETCH("hw.vmm.ept.use_exec_only", &use_exec_only);
if (use_exec_only && EPT_SUPPORTS_EXEC_ONLY(cap))
ept_pmap_flags |= PMAP_SUPPORTS_EXEC_ONLY;
return (0);
}
@ -154,233 +152,6 @@ ept_dump(uint64_t *ptp, int nlevels)
}
#endif
static size_t
ept_create_mapping(uint64_t *ptp, vm_paddr_t gpa, vm_paddr_t hpa, size_t length,
vm_memattr_t attr, vm_prot_t prot, boolean_t spok)
{
int spshift, ptpshift, ptpindex, nlevels;
/*
* Compute the size of the mapping that we can accomodate.
*
* This is based on three factors:
* - super page sizes supported by the processor
* - alignment of the region starting at 'gpa' and 'hpa'
* - length of the region 'len'
*/
spshift = PAGE_SHIFT;
if (spok)
spshift += (EPT_PWLEVELS - 1) * 9;
while (spshift >= PAGE_SHIFT) {
uint64_t spsize = 1UL << spshift;
if ((page_sizes_mask & spsize) != 0 &&
(gpa & (spsize - 1)) == 0 &&
(hpa & (spsize - 1)) == 0 &&
length >= spsize) {
break;
}
spshift -= 9;
}
if (spshift < PAGE_SHIFT) {
panic("Invalid spshift for gpa 0x%016lx, hpa 0x%016lx, "
"length 0x%016lx, page_sizes_mask 0x%016lx",
gpa, hpa, length, page_sizes_mask);
}
nlevels = EPT_PWLEVELS;
while (--nlevels >= 0) {
ptpshift = PAGE_SHIFT + nlevels * 9;
ptpindex = (gpa >> ptpshift) & 0x1FF;
/* We have reached the leaf mapping */
if (spshift >= ptpshift)
break;
/*
* We are working on a non-leaf page table page.
*
* Create the next level page table page if necessary and point
* to it from the current page table.
*/
if (ptp[ptpindex] == 0) {
void *nlp = malloc(PAGE_SIZE, M_VMX, M_WAITOK | M_ZERO);
ptp[ptpindex] = vtophys(nlp);
ptp[ptpindex] |= EPT_PG_RD | EPT_PG_WR | EPT_PG_EX;
}
/* Work our way down to the next level page table page */
ptp = (uint64_t *)PHYS_TO_DMAP(ptp[ptpindex] & EPT_ADDR_MASK);
}
if ((gpa & ((1UL << ptpshift) - 1)) != 0) {
panic("ept_create_mapping: gpa 0x%016lx and ptpshift %d "
"mismatch\n", gpa, ptpshift);
}
if (prot != VM_PROT_NONE) {
/* Do the mapping */
ptp[ptpindex] = hpa;
/* Apply the access controls */
if (prot & VM_PROT_READ)
ptp[ptpindex] |= EPT_PG_RD;
if (prot & VM_PROT_WRITE)
ptp[ptpindex] |= EPT_PG_WR;
if (prot & VM_PROT_EXECUTE)
ptp[ptpindex] |= EPT_PG_EX;
/*
* By default the PAT type is ignored - this appears to
* be how other hypervisors handle EPT. Allow this to be
* overridden.
*/
ptp[ptpindex] |= EPT_PG_MEMORY_TYPE(attr);
if (!ept_pat_passthru)
ptp[ptpindex] |= EPT_PG_IGNORE_PAT;
if (nlevels > 0)
ptp[ptpindex] |= EPT_PG_SUPERPAGE;
} else {
/* Remove the mapping */
ptp[ptpindex] = 0;
}
return (1UL << ptpshift);
}
static vm_paddr_t
ept_lookup_mapping(uint64_t *ptp, vm_paddr_t gpa)
{
int nlevels, ptpshift, ptpindex;
uint64_t ptpval, hpabase, pgmask;
nlevels = EPT_PWLEVELS;
while (--nlevels >= 0) {
ptpshift = PAGE_SHIFT + nlevels * 9;
ptpindex = (gpa >> ptpshift) & 0x1FF;
ptpval = ptp[ptpindex];
/* Cannot make progress beyond this point */
if ((ptpval & (EPT_PG_RD | EPT_PG_WR | EPT_PG_EX)) == 0)
break;
if (nlevels == 0 || (ptpval & EPT_PG_SUPERPAGE)) {
pgmask = (1UL << ptpshift) - 1;
hpabase = ptpval & ~pgmask;
return (hpabase | (gpa & pgmask));
}
/* Work our way down to the next level page table page */
ptp = (uint64_t *)PHYS_TO_DMAP(ptpval & EPT_ADDR_MASK);
}
return ((vm_paddr_t)-1);
}
static void
ept_free_pt_entry(pt_entry_t pte)
{
if (pte == 0)
return;
/* sanity check */
if ((pte & EPT_PG_SUPERPAGE) != 0)
panic("ept_free_pt_entry: pte cannot have superpage bit");
return;
}
static void
ept_free_pd_entry(pd_entry_t pde)
{
pt_entry_t *pt;
int i;
if (pde == 0)
return;
if ((pde & EPT_PG_SUPERPAGE) == 0) {
pt = (pt_entry_t *)PHYS_TO_DMAP(pde & EPT_ADDR_MASK);
for (i = 0; i < NPTEPG; i++)
ept_free_pt_entry(pt[i]);
free(pt, M_VMX); /* free the page table page */
}
}
static void
ept_free_pdp_entry(pdp_entry_t pdpe)
{
pd_entry_t *pd;
int i;
if (pdpe == 0)
return;
if ((pdpe & EPT_PG_SUPERPAGE) == 0) {
pd = (pd_entry_t *)PHYS_TO_DMAP(pdpe & EPT_ADDR_MASK);
for (i = 0; i < NPDEPG; i++)
ept_free_pd_entry(pd[i]);
free(pd, M_VMX); /* free the page directory page */
}
}
static void
ept_free_pml4_entry(pml4_entry_t pml4e)
{
pdp_entry_t *pdp;
int i;
if (pml4e == 0)
return;
if ((pml4e & EPT_PG_SUPERPAGE) == 0) {
pdp = (pdp_entry_t *)PHYS_TO_DMAP(pml4e & EPT_ADDR_MASK);
for (i = 0; i < NPDPEPG; i++)
ept_free_pdp_entry(pdp[i]);
free(pdp, M_VMX); /* free the page directory ptr page */
}
}
void
ept_vmcleanup(struct vmx *vmx)
{
int i;
for (i = 0; i < NPML4EPG; i++)
ept_free_pml4_entry(vmx->pml4ept[i]);
}
int
ept_vmmmap_set(void *arg, vm_paddr_t gpa, vm_paddr_t hpa, size_t len,
vm_memattr_t attr, int prot, boolean_t spok)
{
size_t n;
struct vmx *vmx = arg;
while (len > 0) {
n = ept_create_mapping(vmx->pml4ept, gpa, hpa, len, attr,
prot, spok);
len -= n;
gpa += n;
hpa += n;
}
return (0);
}
vm_paddr_t
ept_vmmmap_get(void *arg, vm_paddr_t gpa)
{
vm_paddr_t hpa;
struct vmx *vmx;
vmx = arg;
hpa = ept_lookup_mapping(vmx->pml4ept, gpa);
return (hpa);
}
static void
invept_single_context(void *arg)
{
@ -390,11 +161,44 @@ invept_single_context(void *arg)
}
void
ept_invalidate_mappings(u_long pml4ept)
ept_invalidate_mappings(u_long eptp)
{
struct invept_desc invept_desc = { 0 };
invept_desc.eptp = EPTP(pml4ept);
invept_desc.eptp = eptp;
smp_rendezvous(NULL, invept_single_context, NULL, &invept_desc);
}
static int
ept_pinit(pmap_t pmap)
{
return (pmap_pinit_type(pmap, PT_EPT, ept_pmap_flags));
}
struct vmspace *
ept_vmspace_alloc(vm_offset_t min, vm_offset_t max)
{
return (vmspace_alloc(min, max, ept_pinit));
}
void
ept_vmspace_free(struct vmspace *vmspace)
{
vmspace_free(vmspace);
}
uint64_t
eptp(uint64_t pml4)
{
uint64_t eptp_val;
eptp_val = pml4 | (EPT_PWLEVELS - 1) << 3 | PAT_WRITE_BACK;
if (ept_enable_ad_bits)
eptp_val |= EPT_ENABLE_AD_BITS;
return (eptp_val);
}


@ -31,13 +31,9 @@
struct vmx;
#define EPT_PWLEVELS 4 /* page walk levels */
#define EPTP(pml4) ((pml4) | (EPT_PWLEVELS - 1) << 3 | PAT_WRITE_BACK)
int ept_init(void);
int ept_vmmmap_set(void *arg, vm_paddr_t gpa, vm_paddr_t hpa, size_t length,
vm_memattr_t attr, int prot, boolean_t allow_superpage_mappings);
vm_paddr_t ept_vmmmap_get(void *arg, vm_paddr_t gpa);
void ept_invalidate_mappings(u_long ept_pml4);
void ept_vmcleanup(struct vmx *vmx);
void ept_invalidate_mappings(u_long eptp);
struct vmspace *ept_vmspace_alloc(vm_offset_t min, vm_offset_t max);
void ept_vmspace_free(struct vmspace *vmspace);
uint64_t eptp(uint64_t pml4);
#endif


@ -318,14 +318,14 @@ vmcs_set_msr_save(struct vmcs *vmcs, u_long g_area, u_int g_count)
int
vmcs_set_defaults(struct vmcs *vmcs,
u_long host_rip, u_long host_rsp, u_long ept_pml4,
u_long host_rip, u_long host_rsp, uint64_t eptp,
uint32_t pinbased_ctls, uint32_t procbased_ctls,
uint32_t procbased_ctls2, uint32_t exit_ctls,
uint32_t entry_ctls, u_long msr_bitmap, uint16_t vpid)
{
int error, codesel, datasel, tsssel;
u_long cr0, cr4, efer;
uint64_t eptp, pat, fsbase, idtrbase;
uint64_t pat, fsbase, idtrbase;
uint32_t exc_bitmap;
codesel = vmm_get_host_codesel();
@ -432,7 +432,6 @@ vmcs_set_defaults(struct vmcs *vmcs,
goto done;
/* eptp */
eptp = EPTP(ept_pml4);
if ((error = vmwrite(VMCS_EPTP, eptp)) != 0)
goto done;


@ -47,7 +47,7 @@ struct msr_entry {
int vmcs_set_msr_save(struct vmcs *vmcs, u_long g_area, u_int g_count);
int vmcs_set_defaults(struct vmcs *vmcs, u_long host_rip, u_long host_rsp,
u_long ept_pml4,
uint64_t eptp,
uint32_t pinbased_ctls, uint32_t procbased_ctls,
uint32_t procbased_ctls2, uint32_t exit_ctls,
uint32_t entry_ctls, u_long msr_bitmap,
@ -68,6 +68,8 @@ uint64_t vmcs_read(uint32_t encoding);
#define vmcs_guest_cr3() vmcs_read(VMCS_GUEST_CR3)
#define vmcs_gpa() vmcs_read(VMCS_GUEST_PHYSICAL_ADDRESS)
#define vmcs_gla() vmcs_read(VMCS_GUEST_LINEAR_ADDRESS)
#define vmcs_idt_vectoring_info() vmcs_read(VMCS_IDT_VECTORING_INFO)
#define vmcs_idt_vectoring_err() vmcs_read(VMCS_IDT_VECTORING_ERROR)
#endif /* _KERNEL */
@ -313,6 +315,12 @@ uint64_t vmcs_read(uint32_t encoding);
#define VMCS_INTERRUPTION_INFO_HW_INTR (0 << 8)
#define VMCS_INTERRUPTION_INFO_NMI (2 << 8)
/*
* VMCS IDT-Vectoring information fields
*/
#define VMCS_IDT_VEC_VALID (1 << 31)
#define VMCS_IDT_VEC_ERRCODE_VALID (1 << 11)
/*
* VMCS Guest interruptibility field
*/
@ -332,6 +340,9 @@ uint64_t vmcs_read(uint32_t encoding);
#define EPT_VIOLATION_DATA_READ (1UL << 0)
#define EPT_VIOLATION_DATA_WRITE (1UL << 1)
#define EPT_VIOLATION_INST_FETCH (1UL << 2)
#define EPT_VIOLATION_GPA_READABLE (1UL << 3)
#define EPT_VIOLATION_GPA_WRITEABLE (1UL << 4)
#define EPT_VIOLATION_GPA_EXECUTABLE (1UL << 5)
#define EPT_VIOLATION_GLA_VALID (1UL << 7)
#define EPT_VIOLATION_XLAT_VALID (1UL << 8)


@ -49,8 +49,6 @@ __FBSDID("$FreeBSD$");
#include <machine/specialreg.h>
#include <machine/vmparam.h>
#include <x86/apicreg.h>
#include <machine/vmm.h>
#include "vmm_host.h"
#include "vmm_lapic.h"
@ -167,9 +165,6 @@ static int cap_pause_exit;
static int cap_unrestricted_guest;
static int cap_monitor_trap;
/* statistics */
static VMM_STAT_INTEL(VMEXIT_HLT_IGNORED, "number of times hlt was ignored");
static struct unrhdr *vpid_unr;
static u_int vpid_alloc_failed;
SYSCTL_UINT(_hw_vmm_vmx, OID_AUTO, vpid_alloc_failed, CTLFLAG_RD,
@ -740,7 +735,7 @@ vmx_setup_cr_shadow(int which, struct vmcs *vmcs, uint32_t initial)
#define vmx_setup_cr4_shadow(vmcs,init) vmx_setup_cr_shadow(4, (vmcs), (init))
static void *
vmx_vminit(struct vm *vm)
vmx_vminit(struct vm *vm, pmap_t pmap)
{
uint16_t vpid[VM_MAXCPU];
int i, error, guest_msr_count;
@ -753,6 +748,8 @@ vmx_vminit(struct vm *vm)
}
vmx->vm = vm;
vmx->eptp = eptp(vtophys((vm_offset_t)pmap->pm_pml4));
/*
* Clean up EPTP-tagged guest physical and combined mappings
*
@ -762,7 +759,7 @@ vmx_vminit(struct vm *vm)
*
* Combined mappings for this EP4TA are also invalidated for all VPIDs.
*/
ept_invalidate_mappings(vtophys(vmx->pml4ept));
ept_invalidate_mappings(vmx->eptp);
msr_bitmap_initialize(vmx->msr_bitmap);
@ -818,7 +815,7 @@ vmx_vminit(struct vm *vm)
error = vmcs_set_defaults(&vmx->vmcs[i],
(u_long)vmx_longjmp,
(u_long)&vmx->ctx[i],
vtophys(vmx->pml4ept),
vmx->eptp,
pinbased_ctls,
procbased_ctls,
procbased_ctls2,
@ -856,6 +853,9 @@ vmx_vminit(struct vm *vm)
error = vmx_setup_cr4_shadow(&vmx->vmcs[i], 0);
if (error != 0)
panic("vmx_setup_cr4_shadow %d", error);
vmx->ctx[i].pmap = pmap;
vmx->ctx[i].eptp = vmx->eptp;
}
return (vmx);
@ -1281,21 +1281,49 @@ vmx_emulate_cr_access(struct vmx *vmx, int vcpu, uint64_t exitqual)
}
static int
vmx_ept_fault(struct vm *vm, int cpu,
uint64_t gla, uint64_t gpa, uint64_t rip, int inst_length,
uint64_t cr3, uint64_t ept_qual, struct vie *vie)
ept_fault_type(uint64_t ept_qual)
{
int read, write, error;
int fault_type;
/* EPT violation on an instruction fetch doesn't make sense here */
if (ept_qual & EPT_VIOLATION_DATA_WRITE)
fault_type = VM_PROT_WRITE;
else if (ept_qual & EPT_VIOLATION_INST_FETCH)
fault_type = VM_PROT_EXECUTE;
else
fault_type= VM_PROT_READ;
return (fault_type);
}
static int
ept_protection(uint64_t ept_qual)
{
int prot = 0;
if (ept_qual & EPT_VIOLATION_GPA_READABLE)
prot |= VM_PROT_READ;
if (ept_qual & EPT_VIOLATION_GPA_WRITEABLE)
prot |= VM_PROT_WRITE;
if (ept_qual & EPT_VIOLATION_GPA_EXECUTABLE)
prot |= VM_PROT_EXECUTE;
return (prot);
}
static boolean_t
ept_emulation_fault(uint64_t ept_qual)
{
int read, write;
/* EPT fault on an instruction fetch doesn't make sense here */
if (ept_qual & EPT_VIOLATION_INST_FETCH)
return (UNHANDLED);
return (FALSE);
/* EPT violation must be a read fault or a write fault */
/* EPT fault must be a read fault or a write fault */
read = ept_qual & EPT_VIOLATION_DATA_READ ? 1 : 0;
write = ept_qual & EPT_VIOLATION_DATA_WRITE ? 1 : 0;
if ((read | write) == 0)
return (UNHANDLED);
return (FALSE);
/*
* The EPT violation must have been caused by accessing a
@ -1304,26 +1332,10 @@ vmx_ept_fault(struct vm *vm, int cpu,
*/
if ((ept_qual & EPT_VIOLATION_GLA_VALID) == 0 ||
(ept_qual & EPT_VIOLATION_XLAT_VALID) == 0) {
return (UNHANDLED);
return (FALSE);
}
/* Fetch, decode and emulate the faulting instruction */
if (vmm_fetch_instruction(vm, cpu, rip, inst_length, cr3, vie) != 0)
return (UNHANDLED);
if (vmm_decode_instruction(vm, cpu, gla, vie) != 0)
return (UNHANDLED);
/*
* Check if this is a local apic access
*/
if (gpa < DEFAULT_APIC_BASE || gpa >= DEFAULT_APIC_BASE + PAGE_SIZE)
return (UNHANDLED);
error = vmm_emulate_instruction(vm, cpu, gpa, vie,
lapic_mmio_read, lapic_mmio_write, 0);
return (error ? UNHANDLED : HANDLED);
return (TRUE);
}
static int
@ -1332,18 +1344,47 @@ vmx_exit_process(struct vmx *vmx, int vcpu, struct vm_exit *vmexit)
int error, handled;
struct vmcs *vmcs;
struct vmxctx *vmxctx;
uint32_t eax, ecx, edx;
uint64_t qual, gla, gpa, cr3, intr_info;
uint32_t eax, ecx, edx, idtvec_info, idtvec_err, reason;
uint64_t qual, gpa;
handled = 0;
vmcs = &vmx->vmcs[vcpu];
vmxctx = &vmx->ctx[vcpu];
qual = vmexit->u.vmx.exit_qualification;
reason = vmexit->u.vmx.exit_reason;
vmexit->exitcode = VM_EXITCODE_BOGUS;
vmm_stat_incr(vmx->vm, vcpu, VMEXIT_COUNT, 1);
switch (vmexit->u.vmx.exit_reason) {
/*
* VM exits that could be triggered during event injection on the
* previous VM entry need to be handled specially by re-injecting
* the event.
*
* See "Information for VM Exits During Event Delivery" in Intel SDM
* for details.
*/
switch (reason) {
case EXIT_REASON_EPT_FAULT:
case EXIT_REASON_EPT_MISCONFIG:
case EXIT_REASON_APIC:
case EXIT_REASON_TASK_SWITCH:
case EXIT_REASON_EXCEPTION:
idtvec_info = vmcs_idt_vectoring_info();
if (idtvec_info & VMCS_IDT_VEC_VALID) {
idtvec_info &= ~(1 << 12); /* clear undefined bit */
vmwrite(VMCS_ENTRY_INTR_INFO, idtvec_info);
if (idtvec_info & VMCS_IDT_VEC_ERRCODE_VALID) {
idtvec_err = vmcs_idt_vectoring_err();
vmwrite(VMCS_ENTRY_EXCEPTION_ERROR, idtvec_err);
}
vmwrite(VMCS_ENTRY_INST_LENGTH, vmexit->inst_length);
}
default:
break;
}
switch (reason) {
case EXIT_REASON_CR_ACCESS:
vmm_stat_incr(vmx->vm, vcpu, VMEXIT_CR_ACCESS, 1);
handled = vmx_emulate_cr_access(vmx, vcpu, qual);
@ -1374,19 +1415,7 @@ vmx_exit_process(struct vmx *vmx, int vcpu, struct vm_exit *vmexit)
break;
case EXIT_REASON_HLT:
vmm_stat_incr(vmx->vm, vcpu, VMEXIT_HLT, 1);
/*
* If there is an event waiting to be injected then there is
* no need to 'hlt'.
*/
error = vmread(VMCS_ENTRY_INTR_INFO, &intr_info);
if (error)
panic("vmx_exit_process: vmread(intrinfo) %d", error);
if (intr_info & VMCS_INTERRUPTION_INFO_VALID) {
handled = 1;
vmm_stat_incr(vmx->vm, vcpu, VMEXIT_HLT_IGNORED, 1);
} else
vmexit->exitcode = VM_EXITCODE_HLT;
vmexit->exitcode = VM_EXITCODE_HLT;
break;
case EXIT_REASON_MTF:
vmm_stat_incr(vmx->vm, vcpu, VMEXIT_MTRAP, 1);
@ -1440,15 +1469,22 @@ vmx_exit_process(struct vmx *vmx, int vcpu, struct vm_exit *vmexit)
break;
case EXIT_REASON_EPT_FAULT:
vmm_stat_incr(vmx->vm, vcpu, VMEXIT_EPT_FAULT, 1);
gla = vmcs_gla();
/*
* If 'gpa' lies within the address space allocated to
* memory then this must be a nested page fault otherwise
* this must be an instruction that accesses MMIO space.
*/
gpa = vmcs_gpa();
cr3 = vmcs_guest_cr3();
handled = vmx_ept_fault(vmx->vm, vcpu, gla, gpa,
vmexit->rip, vmexit->inst_length,
cr3, qual, &vmexit->u.paging.vie);
if (!handled) {
if (vm_mem_allocated(vmx->vm, gpa)) {
vmexit->exitcode = VM_EXITCODE_PAGING;
vmexit->u.paging.gpa = gpa;
vmexit->u.paging.fault_type = ept_fault_type(qual);
vmexit->u.paging.protection = ept_protection(qual);
} else if (ept_emulation_fault(qual)) {
vmexit->exitcode = VM_EXITCODE_INST_EMUL;
vmexit->u.inst_emul.gpa = gpa;
vmexit->u.inst_emul.gla = vmcs_gla();
vmexit->u.inst_emul.cr3 = vmcs_guest_cr3();
}
break;
default:
@ -1470,14 +1506,6 @@ vmx_exit_process(struct vmx *vmx, int vcpu, struct vm_exit *vmexit)
vm_exit_update_rip(vmexit);
vmexit->rip += vmexit->inst_length;
vmexit->inst_length = 0;
/*
* Special case for spinning up an AP - exit to userspace to
* give the controlling process a chance to intercept and
* spin up a thread for the AP.
*/
if (vmexit->exitcode == VM_EXITCODE_SPINUP_AP)
handled = 0;
} else {
if (vmexit->exitcode == VM_EXITCODE_BOGUS) {
/*
@ -1497,7 +1525,7 @@ vmx_exit_process(struct vmx *vmx, int vcpu, struct vm_exit *vmexit)
}
static int
vmx_run(void *arg, int vcpu, register_t rip)
vmx_run(void *arg, int vcpu, register_t rip, pmap_t pmap)
{
int error, vie, rc, handled, astpending;
uint32_t exit_reason;
@ -1505,7 +1533,7 @@ vmx_run(void *arg, int vcpu, register_t rip)
struct vmxctx *vmxctx;
struct vmcs *vmcs;
struct vm_exit *vmexit;
vmx = arg;
vmcs = &vmx->vmcs[vcpu];
vmxctx = &vmx->ctx[vcpu];
@ -1514,6 +1542,11 @@ vmx_run(void *arg, int vcpu, register_t rip)
astpending = 0;
vmexit = vm_exitinfo(vmx->vm, vcpu);
KASSERT(vmxctx->pmap == pmap,
("pmap %p different than ctx pmap %p", pmap, vmxctx->pmap));
KASSERT(vmxctx->eptp == vmx->eptp,
("eptp %p different than ctx eptp %#lx", eptp, vmxctx->eptp));
/*
* XXX Can we avoid doing this every time we do a vm run?
*/
@ -1576,6 +1609,9 @@ vmx_run(void *arg, int vcpu, register_t rip)
vmxctx->launch_error, vie);
#endif
goto err_exit;
case VMX_RETURN_INVEPT:
panic("vm %s:%d invept error %d",
vm_name(vmx->vm), vcpu, vmxctx->launch_error);
default:
panic("vmx_setjmp returned %d", rc);
}
@ -1654,7 +1690,6 @@ vmx_vmcleanup(void *arg)
if (error != 0)
panic("vmx_vmcleanup: vmclear error %d on vcpu 0", error);
ept_vmcleanup(vmx);
free(vmx, M_VMX);
return;
@ -2000,13 +2035,13 @@ struct vmm_ops vmm_ops_intel = {
vmx_vminit,
vmx_run,
vmx_vmcleanup,
ept_vmmmap_set,
ept_vmmmap_get,
vmx_getreg,
vmx_setreg,
vmx_getdesc,
vmx_setdesc,
vmx_inject,
vmx_getcap,
vmx_setcap
vmx_setcap,
ept_vmspace_alloc,
ept_vmspace_free,
};


@ -31,6 +31,8 @@
#include "vmcs.h"
struct pmap;
#define GUEST_MSR_MAX_ENTRIES 64 /* arbitrary */
struct vmxctx {
@ -68,6 +70,15 @@ struct vmxctx {
int launched; /* vmcs launch state */
int launch_error;
long eptgen[MAXCPU]; /* cached pmap->pm_eptgen */
/*
* The 'eptp' and the 'pmap' do not change during the lifetime of
* the VM so it is safe to keep a copy in each vcpu's vmxctx.
*/
vm_paddr_t eptp;
struct pmap *pmap;
};
struct vmxcap {
@ -82,16 +93,15 @@ struct vmxstate {
/* virtual machine softc */
struct vmx {
pml4_entry_t pml4ept[NPML4EPG];
struct vmcs vmcs[VM_MAXCPU]; /* one vmcs per virtual cpu */
char msr_bitmap[PAGE_SIZE];
struct msr_entry guest_msrs[VM_MAXCPU][GUEST_MSR_MAX_ENTRIES];
struct vmxctx ctx[VM_MAXCPU];
struct vmxcap cap[VM_MAXCPU];
struct vmxstate state[VM_MAXCPU];
uint64_t eptp;
struct vm *vm;
};
CTASSERT((offsetof(struct vmx, pml4ept) & PAGE_MASK) == 0);
CTASSERT((offsetof(struct vmx, vmcs) & PAGE_MASK) == 0);
CTASSERT((offsetof(struct vmx, msr_bitmap) & PAGE_MASK) == 0);
CTASSERT((offsetof(struct vmx, guest_msrs) & 15) == 0);
@ -101,6 +111,7 @@ CTASSERT((offsetof(struct vmx, guest_msrs) & 15) == 0);
#define VMX_RETURN_VMRESUME 2
#define VMX_RETURN_VMLAUNCH 3
#define VMX_RETURN_AST 4
#define VMX_RETURN_INVEPT 5
/*
* vmx_setjmp() returns:
* - 0 when it returns directly
@ -108,6 +119,7 @@ CTASSERT((offsetof(struct vmx, guest_msrs) & 15) == 0);
* - 2 when it returns from vmx_resume (which would only be in the error case)
* - 3 when it returns from vmx_launch (which would only be in the error case)
* - 4 when it returns from vmx_resume or vmx_launch because of AST pending
* - 5 when it returns from vmx_launch/vmx_resume because of invept error
*/
int vmx_setjmp(struct vmxctx *ctx);
void vmx_longjmp(void); /* returns via vmx_setjmp */


@ -72,6 +72,10 @@ ASSYM(VMXCTX_HOST_RBX, offsetof(struct vmxctx, host_rbx));
ASSYM(VMXCTX_HOST_RIP, offsetof(struct vmxctx, host_rip));
ASSYM(VMXCTX_LAUNCH_ERROR, offsetof(struct vmxctx, launch_error));
ASSYM(VMXCTX_EPTGEN, offsetof(struct vmxctx, eptgen));
ASSYM(VMXCTX_PMAP, offsetof(struct vmxctx, pmap));
ASSYM(VMXCTX_EPTP, offsetof(struct vmxctx, eptp));
ASSYM(VM_SUCCESS, VM_SUCCESS);
ASSYM(VM_FAIL_INVALID, VM_FAIL_INVALID);
@ -82,8 +86,13 @@ ASSYM(VMX_RETURN_LONGJMP, VMX_RETURN_LONGJMP);
ASSYM(VMX_RETURN_VMRESUME, VMX_RETURN_VMRESUME);
ASSYM(VMX_RETURN_VMLAUNCH, VMX_RETURN_VMLAUNCH);
ASSYM(VMX_RETURN_AST, VMX_RETURN_AST);
ASSYM(VMX_RETURN_INVEPT, VMX_RETURN_INVEPT);
ASSYM(TDF_ASTPENDING, TDF_ASTPENDING);
ASSYM(TDF_NEEDRESCHED, TDF_NEEDRESCHED);
ASSYM(TD_FLAGS, offsetof(struct thread, td_flags));
ASSYM(PC_CURTHREAD, offsetof(struct pcpu, pc_curthread));
ASSYM(PC_CPUID, offsetof(struct pcpu, pc_cpuid));
ASSYM(PM_ACTIVE, offsetof(struct pmap, pm_active));
ASSYM(PM_EPTGEN, offsetof(struct pmap, pm_eptgen));


@ -30,6 +30,12 @@
#include "vmx_assym.s"
#ifdef SMP
#define LK lock ;
#else
#define LK
#endif
/*
* Disable interrupts before updating %rsp in VMX_CHECK_AST or
* VMX_GUEST_RESTORE.
@ -86,15 +92,73 @@
movq VMXCTX_GUEST_R15(%rdi),%r15; \
movq VMXCTX_GUEST_RDI(%rdi),%rdi; /* restore rdi the last */
#define VM_INSTRUCTION_ERROR(reg) \
/*
* Check for an error after executing a VMX instruction.
* 'errreg' will be zero on success and non-zero otherwise.
* 'ctxreg' points to the 'struct vmxctx' associated with the vcpu.
*/
#define VM_INSTRUCTION_ERROR(errreg, ctxreg) \
jnc 1f; \
movl $VM_FAIL_INVALID,reg; /* CF is set */ \
movl $VM_FAIL_INVALID,errreg; /* CF is set */ \
jmp 3f; \
1: jnz 2f; \
movl $VM_FAIL_VALID,reg; /* ZF is set */ \
movl $VM_FAIL_VALID,errreg; /* ZF is set */ \
jmp 3f; \
2: movl $VM_SUCCESS,reg; \
3: movl reg,VMXCTX_LAUNCH_ERROR(%rsp)
2: movl $VM_SUCCESS,errreg; \
3: movl errreg,VMXCTX_LAUNCH_ERROR(ctxreg)
/*
* set or clear the appropriate bit in 'pm_active'
* %rdi = vmxctx
* %rax, %r11 = scratch registers
*/
#define VMX_SET_PM_ACTIVE \
movq VMXCTX_PMAP(%rdi), %r11; \
movl PCPU(CPUID), %eax; \
LK btsl %eax, PM_ACTIVE(%r11)
#define VMX_CLEAR_PM_ACTIVE \
movq VMXCTX_PMAP(%rdi), %r11; \
movl PCPU(CPUID), %eax; \
LK btrl %eax, PM_ACTIVE(%r11)
/*
* If 'vmxctx->eptgen[curcpu]' is not identical to 'pmap->pm_eptgen'
* then we must invalidate all mappings associated with this eptp.
*
* %rdi = vmxctx
* %rax, %rbx, %r11 = scratch registers
*/
#define VMX_CHECK_EPTGEN \
movl PCPU(CPUID), %ebx; \
movq VMXCTX_PMAP(%rdi), %r11; \
movq PM_EPTGEN(%r11), %rax; \
cmpq %rax, VMXCTX_EPTGEN(%rdi, %rbx, 8); \
je 9f; \
\
/* Refresh 'vmxctx->eptgen[curcpu]' */ \
movq %rax, VMXCTX_EPTGEN(%rdi, %rbx, 8); \
\
/* Setup the invept descriptor at the top of tmpstk */ \
mov %rdi, %r11; \
addq $VMXCTX_TMPSTKTOP, %r11; \
movq VMXCTX_EPTP(%rdi), %rax; \
movq %rax, -16(%r11); \
movq $0x0, -8(%r11); \
mov $0x1, %eax; /* Single context invalidate */ \
invept -16(%r11), %rax; \
\
/* Check for invept error */ \
VM_INSTRUCTION_ERROR(%eax, %rdi); \
testl %eax, %eax; \
jz 9f; \
\
/* Return via vmx_setjmp with retval of VMX_RETURN_INVEPT */ \
movq $VMX_RETURN_INVEPT, %rsi; \
movq %rdi,%rsp; \
addq $VMXCTX_TMPSTKTOP, %rsp; \
callq vmx_return; \
9: ;
.text
/*
@ -129,6 +193,9 @@ END(vmx_setjmp)
* Return to vmm context through vmx_setjmp() with a value of 'retval'.
*/
ENTRY(vmx_return)
/* The pmap is no longer active on the host cpu */
VMX_CLEAR_PM_ACTIVE
/* Restore host context. */
movq VMXCTX_HOST_R15(%rdi),%r15
movq VMXCTX_HOST_R14(%rdi),%r14
@ -193,6 +260,10 @@ ENTRY(vmx_resume)
VMX_CHECK_AST
VMX_SET_PM_ACTIVE /* This vcpu is now active on the host cpu */
VMX_CHECK_EPTGEN /* Check if we have to invalidate TLB */
/*
* Restore guest state that is not automatically loaded from the vmcs.
*/
@ -203,7 +274,7 @@ ENTRY(vmx_resume)
/*
* Capture the reason why vmresume failed.
*/
VM_INSTRUCTION_ERROR(%eax)
VM_INSTRUCTION_ERROR(%eax, %rsp)
/* Return via vmx_setjmp with return value of VMX_RETURN_VMRESUME */
movq %rsp,%rdi
@ -225,6 +296,10 @@ ENTRY(vmx_launch)
VMX_CHECK_AST
VMX_SET_PM_ACTIVE /* This vcpu is now active on the host cpu */
VMX_CHECK_EPTGEN /* Check if we have to invalidate TLB */
/*
* Restore guest state that is not automatically loaded from the vmcs.
*/
@ -235,7 +310,7 @@ ENTRY(vmx_launch)
/*
* Capture the reason why vmlaunch failed.
*/
VM_INSTRUCTION_ERROR(%eax)
VM_INSTRUCTION_ERROR(%eax, %rsp)
/* Return via vmx_setjmp with return value of VMX_RETURN_VMLAUNCH */
movq %rsp,%rdi


@ -281,6 +281,43 @@ ppt_teardown_msix(struct pptdev *ppt)
ppt->msix.num_msgs = 0;
}
int
ppt_num_devices(struct vm *vm)
{
int i, num;
num = 0;
for (i = 0; i < num_pptdevs; i++) {
if (pptdevs[i].vm == vm)
num++;
}
return (num);
}
boolean_t
ppt_is_mmio(struct vm *vm, vm_paddr_t gpa)
{
int i, n;
struct pptdev *ppt;
struct vm_memory_segment *seg;
for (n = 0; n < num_pptdevs; n++) {
ppt = &pptdevs[n];
if (ppt->vm != vm)
continue;
for (i = 0; i < MAX_MMIOSEGS; i++) {
seg = &ppt->mmio[i];
if (seg->len == 0)
continue;
if (gpa >= seg->gpa && gpa < seg->gpa + seg->len)
return (TRUE);
}
}
return (FALSE);
}
int
ppt_assign_device(struct vm *vm, int bus, int slot, int func)
{
@ -336,7 +373,7 @@ ppt_unassign_all(struct vm *vm)
bus = pci_get_bus(dev);
slot = pci_get_slot(dev);
func = pci_get_function(dev);
ppt_unassign_device(vm, bus, slot, func);
vm_unassign_pptdev(vm, bus, slot, func);
}
}
@ -591,10 +628,3 @@ ppt_setup_msix(struct vm *vm, int vcpu, int bus, int slot, int func,
return (0);
}
int
ppt_num_devices(void)
{
return (num_pptdevs);
}


@ -29,14 +29,20 @@
#ifndef _IO_PPT_H_
#define _IO_PPT_H_
int ppt_assign_device(struct vm *vm, int bus, int slot, int func);
int ppt_unassign_device(struct vm *vm, int bus, int slot, int func);
int ppt_unassign_all(struct vm *vm);
int ppt_map_mmio(struct vm *vm, int bus, int slot, int func,
vm_paddr_t gpa, size_t len, vm_paddr_t hpa);
int ppt_setup_msi(struct vm *vm, int vcpu, int bus, int slot, int func,
int destcpu, int vector, int numvec);
int ppt_setup_msix(struct vm *vm, int vcpu, int bus, int slot, int func,
int idx, uint32_t msg, uint32_t vector_control, uint64_t addr);
int ppt_num_devices(void);
int idx, uint32_t msg, uint32_t vector_control, uint64_t addr);
int ppt_num_devices(struct vm *vm);
boolean_t ppt_is_mmio(struct vm *vm, vm_paddr_t gpa);
/*
* The following functions should never be called directly.
* Use 'vm_assign_pptdev()' and 'vm_unassign_pptdev()' instead.
*/
int ppt_assign_device(struct vm *vm, int bus, int slot, int func);
int ppt_unassign_device(struct vm *vm, int bus, int slot, int func);
#endif


@ -39,18 +39,28 @@ __FBSDID("$FreeBSD$");
#include <sys/lock.h>
#include <sys/mutex.h>
#include <sys/proc.h>
#include <sys/rwlock.h>
#include <sys/sched.h>
#include <sys/smp.h>
#include <sys/systm.h>
#include <vm/vm.h>
#include <vm/vm_object.h>
#include <vm/vm_page.h>
#include <vm/pmap.h>
#include <vm/vm_map.h>
#include <vm/vm_extern.h>
#include <vm/vm_param.h>
#include <machine/vm.h>
#include <machine/pcb.h>
#include <machine/smp.h>
#include <x86/apicreg.h>
#include <machine/pmap.h>
#include <machine/vmparam.h>
#include <machine/vmm.h>
#include "vmm_ktr.h"
#include "vmm_host.h"
#include "vmm_mem.h"
#include "vmm_util.h"
@ -84,15 +94,23 @@ struct vcpu {
#define vcpu_lock_init(v) mtx_init(&((v)->mtx), "vcpu lock", 0, MTX_SPIN)
#define vcpu_lock(v) mtx_lock_spin(&((v)->mtx))
#define vcpu_unlock(v) mtx_unlock_spin(&((v)->mtx))
#define vcpu_assert_locked(v) mtx_assert(&((v)->mtx), MA_OWNED)
struct mem_seg {
vm_paddr_t gpa;
size_t len;
boolean_t wired;
vm_object_t object;
};
#define VM_MAX_MEMORY_SEGMENTS 2
struct vm {
void *cookie; /* processor-specific data */
void *iommu; /* iommu-specific data */
struct vmspace *vmspace; /* guest's address space */
struct vcpu vcpu[VM_MAXCPU];
int num_mem_segs;
struct vm_memory_segment mem_segs[VM_MAX_MEMORY_SEGMENTS];
struct mem_seg mem_segs[VM_MAX_MEMORY_SEGMENTS];
char name[VM_MAX_NAMELEN];
/*
@ -109,16 +127,14 @@ static struct vmm_ops *ops;
#define VMM_INIT() (ops != NULL ? (*ops->init)() : 0)
#define VMM_CLEANUP() (ops != NULL ? (*ops->cleanup)() : 0)
#define VMINIT(vm) (ops != NULL ? (*ops->vminit)(vm): NULL)
#define VMRUN(vmi, vcpu, rip) \
(ops != NULL ? (*ops->vmrun)(vmi, vcpu, rip) : ENXIO)
#define VMINIT(vm, pmap) (ops != NULL ? (*ops->vminit)(vm, pmap): NULL)
#define VMRUN(vmi, vcpu, rip, pmap) \
(ops != NULL ? (*ops->vmrun)(vmi, vcpu, rip, pmap) : ENXIO)
#define VMCLEANUP(vmi) (ops != NULL ? (*ops->vmcleanup)(vmi) : NULL)
#define VMMMAP_SET(vmi, gpa, hpa, len, attr, prot, spm) \
(ops != NULL ? \
(*ops->vmmmap_set)(vmi, gpa, hpa, len, attr, prot, spm) : \
ENXIO)
#define VMMMAP_GET(vmi, gpa) \
(ops != NULL ? (*ops->vmmmap_get)(vmi, gpa) : ENXIO)
#define VMSPACE_ALLOC(min, max) \
(ops != NULL ? (*ops->vmspace_alloc)(min, max) : NULL)
#define VMSPACE_FREE(vmspace) \
(ops != NULL ? (*ops->vmspace_free)(vmspace) : ENXIO)
#define VMGETREG(vmi, vcpu, num, retval) \
(ops != NULL ? (*ops->vmgetreg)(vmi, vcpu, num, retval) : ENXIO)
#define VMSETREG(vmi, vcpu, num, val) \
@ -213,8 +229,7 @@ vmm_handler(module_t mod, int what, void *arg)
switch (what) {
case MOD_LOAD:
vmmdev_init();
if (ppt_num_devices() > 0)
iommu_init();
iommu_init();
error = vmm_init();
if (error == 0)
vmm_initialized = 1;
@ -265,7 +280,7 @@ vm_create(const char *name, struct vm **retvm)
{
int i;
struct vm *vm;
vm_paddr_t maxaddr;
struct vmspace *vmspace;
const int BSP = 0;
@ -279,59 +294,34 @@ vm_create(const char *name, struct vm **retvm)
if (name == NULL || strlen(name) >= VM_MAX_NAMELEN)
return (EINVAL);
vmspace = VMSPACE_ALLOC(VM_MIN_ADDRESS, VM_MAXUSER_ADDRESS);
if (vmspace == NULL)
return (ENOMEM);
vm = malloc(sizeof(struct vm), M_VM, M_WAITOK | M_ZERO);
strcpy(vm->name, name);
vm->cookie = VMINIT(vm);
vm->cookie = VMINIT(vm, vmspace_pmap(vmspace));
for (i = 0; i < VM_MAXCPU; i++) {
vcpu_init(vm, i);
guest_msrs_init(vm, i);
}
maxaddr = vmm_mem_maxaddr();
vm->iommu = iommu_create_domain(maxaddr);
vm_activate_cpu(vm, BSP);
vm->vmspace = vmspace;
*retvm = vm;
return (0);
}
static void
vm_free_mem_seg(struct vm *vm, struct vm_memory_segment *seg)
vm_free_mem_seg(struct vm *vm, struct mem_seg *seg)
{
size_t len;
vm_paddr_t hpa;
void *host_domain;
host_domain = iommu_host_domain();
if (seg->object != NULL)
vmm_mem_free(vm->vmspace, seg->gpa, seg->len);
len = 0;
while (len < seg->len) {
hpa = vm_gpa2hpa(vm, seg->gpa + len, PAGE_SIZE);
if (hpa == (vm_paddr_t)-1) {
panic("vm_free_mem_segs: cannot free hpa "
"associated with gpa 0x%016lx", seg->gpa + len);
}
/*
* Remove the 'gpa' to 'hpa' mapping in VMs domain.
* And resurrect the 1:1 mapping for 'hpa' in 'host_domain'.
*/
iommu_remove_mapping(vm->iommu, seg->gpa + len, PAGE_SIZE);
iommu_create_mapping(host_domain, hpa, hpa, PAGE_SIZE);
vmm_mem_free(hpa, PAGE_SIZE);
len += PAGE_SIZE;
}
/*
* Invalidate cached translations associated with 'vm->iommu' since
* we have now moved some pages from it.
*/
iommu_invalidate_tlb(vm->iommu);
bzero(seg, sizeof(struct vm_memory_segment));
bzero(seg, sizeof(*seg));
}
void
@ -341,6 +331,9 @@ vm_destroy(struct vm *vm)
ppt_unassign_all(vm);
if (vm->iommu != NULL)
iommu_destroy_domain(vm->iommu);
for (i = 0; i < vm->num_mem_segs; i++)
vm_free_mem_seg(vm, &vm->mem_segs[i]);
@ -349,7 +342,7 @@ vm_destroy(struct vm *vm)
for (i = 0; i < VM_MAXCPU; i++)
vcpu_cleanup(&vm->vcpu[i]);
iommu_destroy_domain(vm->iommu);
VMSPACE_FREE(vm->vmspace);
VMCLEANUP(vm->cookie);
@ -365,52 +358,48 @@ vm_name(struct vm *vm)
int
vm_map_mmio(struct vm *vm, vm_paddr_t gpa, size_t len, vm_paddr_t hpa)
{
const boolean_t spok = TRUE; /* superpage mappings are ok */
vm_object_t obj;
return (VMMMAP_SET(vm->cookie, gpa, hpa, len, VM_MEMATTR_UNCACHEABLE,
VM_PROT_RW, spok));
if ((obj = vmm_mmio_alloc(vm->vmspace, gpa, len, hpa)) == NULL)
return (ENOMEM);
else
return (0);
}
int
vm_unmap_mmio(struct vm *vm, vm_paddr_t gpa, size_t len)
{
const boolean_t spok = TRUE; /* superpage mappings are ok */
return (VMMMAP_SET(vm->cookie, gpa, 0, len, 0,
VM_PROT_NONE, spok));
vmm_mmio_free(vm->vmspace, gpa, len);
return (0);
}
/*
* Returns TRUE if 'gpa' is available for allocation and FALSE otherwise
*/
static boolean_t
vm_gpa_available(struct vm *vm, vm_paddr_t gpa)
boolean_t
vm_mem_allocated(struct vm *vm, vm_paddr_t gpa)
{
int i;
vm_paddr_t gpabase, gpalimit;
if (gpa & PAGE_MASK)
panic("vm_gpa_available: gpa (0x%016lx) not page aligned", gpa);
for (i = 0; i < vm->num_mem_segs; i++) {
gpabase = vm->mem_segs[i].gpa;
gpalimit = gpabase + vm->mem_segs[i].len;
if (gpa >= gpabase && gpa < gpalimit)
return (FALSE);
return (TRUE); /* 'gpa' is regular memory */
}
return (TRUE);
if (ppt_is_mmio(vm, gpa))
return (TRUE); /* 'gpa' is pci passthru mmio */
return (FALSE);
}
int
vm_malloc(struct vm *vm, vm_paddr_t gpa, size_t len)
{
int error, available, allocated;
struct vm_memory_segment *seg;
vm_paddr_t g, hpa;
void *host_domain;
const boolean_t spok = TRUE; /* superpage mappings are ok */
int available, allocated;
struct mem_seg *seg;
vm_object_t object;
vm_paddr_t g;
if ((gpa & PAGE_MASK) || (len & PAGE_MASK) || len == 0)
return (EINVAL);
@ -418,10 +407,10 @@ vm_malloc(struct vm *vm, vm_paddr_t gpa, size_t len)
available = allocated = 0;
g = gpa;
while (g < gpa + len) {
if (vm_gpa_available(vm, g))
available++;
else
if (vm_mem_allocated(vm, g))
allocated++;
else
available++;
g += PAGE_SIZE;
}
@ -443,61 +432,203 @@ vm_malloc(struct vm *vm, vm_paddr_t gpa, size_t len)
if (vm->num_mem_segs >= VM_MAX_MEMORY_SEGMENTS)
return (E2BIG);
host_domain = iommu_host_domain();
seg = &vm->mem_segs[vm->num_mem_segs];
error = 0;
if ((object = vmm_mem_alloc(vm->vmspace, gpa, len)) == NULL)
return (ENOMEM);
seg->gpa = gpa;
seg->len = 0;
while (seg->len < len) {
hpa = vmm_mem_alloc(PAGE_SIZE);
if (hpa == 0) {
error = ENOMEM;
break;
}
error = VMMMAP_SET(vm->cookie, gpa + seg->len, hpa, PAGE_SIZE,
VM_MEMATTR_WRITE_BACK, VM_PROT_ALL, spok);
if (error)
break;
/*
* Remove the 1:1 mapping for 'hpa' from the 'host_domain'.
* Add mapping for 'gpa + seg->len' to 'hpa' in the VMs domain.
*/
iommu_remove_mapping(host_domain, hpa, PAGE_SIZE);
iommu_create_mapping(vm->iommu, gpa + seg->len, hpa, PAGE_SIZE);
seg->len += PAGE_SIZE;
}
if (error) {
vm_free_mem_seg(vm, seg);
return (error);
}
/*
* Invalidate cached translations associated with 'host_domain' since
* we have now moved some pages from it.
*/
iommu_invalidate_tlb(host_domain);
seg->len = len;
seg->object = object;
seg->wired = FALSE;
vm->num_mem_segs++;
return (0);
}
-vm_paddr_t
-vm_gpa2hpa(struct vm *vm, vm_paddr_t gpa, size_t len)
+static void
+vm_gpa_unwire(struct vm *vm)
 {
-    vm_paddr_t nextpage;
+    int i, rv;
+    struct mem_seg *seg;

-    nextpage = rounddown(gpa + PAGE_SIZE, PAGE_SIZE);
-    if (len > nextpage - gpa)
-        panic("vm_gpa2hpa: invalid gpa/len: 0x%016lx/%lu", gpa, len);
-
+    for (i = 0; i < vm->num_mem_segs; i++) {
+        seg = &vm->mem_segs[i];
+        if (!seg->wired)
+            continue;

-    return (VMMMAP_GET(vm->cookie, gpa));
+        rv = vm_map_unwire(&vm->vmspace->vm_map,
+            seg->gpa, seg->gpa + seg->len,
+            VM_MAP_WIRE_USER | VM_MAP_WIRE_NOHOLES);
+        KASSERT(rv == KERN_SUCCESS, ("vm(%s) memory segment "
+            "%#lx/%ld could not be unwired: %d",
+            vm_name(vm), seg->gpa, seg->len, rv));
+
+        seg->wired = FALSE;
+    }
}
static int
vm_gpa_wire(struct vm *vm)
{
int i, rv;
struct mem_seg *seg;
for (i = 0; i < vm->num_mem_segs; i++) {
seg = &vm->mem_segs[i];
if (seg->wired)
continue;
/* XXX rlimits? */
rv = vm_map_wire(&vm->vmspace->vm_map,
seg->gpa, seg->gpa + seg->len,
VM_MAP_WIRE_USER | VM_MAP_WIRE_NOHOLES);
if (rv != KERN_SUCCESS)
break;
seg->wired = TRUE;
}
if (i < vm->num_mem_segs) {
/*
* Undo the wiring before returning an error.
*/
vm_gpa_unwire(vm);
return (EAGAIN);
}
return (0);
}
static void
vm_iommu_modify(struct vm *vm, boolean_t map)
{
int i, sz;
vm_paddr_t gpa, hpa;
struct mem_seg *seg;
void *vp, *cookie, *host_domain;
sz = PAGE_SIZE;
host_domain = iommu_host_domain();
for (i = 0; i < vm->num_mem_segs; i++) {
seg = &vm->mem_segs[i];
KASSERT(seg->wired, ("vm(%s) memory segment %#lx/%ld not wired",
vm_name(vm), seg->gpa, seg->len));
gpa = seg->gpa;
while (gpa < seg->gpa + seg->len) {
vp = vm_gpa_hold(vm, gpa, PAGE_SIZE, VM_PROT_WRITE,
&cookie);
KASSERT(vp != NULL, ("vm(%s) could not map gpa %#lx",
vm_name(vm), gpa));
vm_gpa_release(cookie);
hpa = DMAP_TO_PHYS((uintptr_t)vp);
if (map) {
iommu_create_mapping(vm->iommu, gpa, hpa, sz);
iommu_remove_mapping(host_domain, hpa, sz);
} else {
iommu_remove_mapping(vm->iommu, gpa, sz);
iommu_create_mapping(host_domain, hpa, hpa, sz);
}
gpa += PAGE_SIZE;
}
}
/*
* Invalidate the cached translations associated with the domain
* from which pages were removed.
*/
if (map)
iommu_invalidate_tlb(host_domain);
else
iommu_invalidate_tlb(vm->iommu);
}
#define vm_iommu_unmap(vm) vm_iommu_modify((vm), FALSE)
#define vm_iommu_map(vm) vm_iommu_modify((vm), TRUE)
int
vm_unassign_pptdev(struct vm *vm, int bus, int slot, int func)
{
int error;
error = ppt_unassign_device(vm, bus, slot, func);
if (error)
return (error);
if (ppt_num_devices(vm) == 0) {
vm_iommu_unmap(vm);
vm_gpa_unwire(vm);
}
return (0);
}
int
vm_assign_pptdev(struct vm *vm, int bus, int slot, int func)
{
int error;
vm_paddr_t maxaddr;
/*
* Virtual machines with pci passthru devices get special treatment:
* - the guest physical memory is wired
* - the iommu is programmed to do the 'gpa' to 'hpa' translation
*
* We need to do this before the first pci passthru device is attached.
*/
if (ppt_num_devices(vm) == 0) {
KASSERT(vm->iommu == NULL,
("vm_assign_pptdev: iommu must be NULL"));
maxaddr = vmm_mem_maxaddr();
vm->iommu = iommu_create_domain(maxaddr);
error = vm_gpa_wire(vm);
if (error)
return (error);
vm_iommu_map(vm);
}
error = ppt_assign_device(vm, bus, slot, func);
return (error);
}
void *
vm_gpa_hold(struct vm *vm, vm_paddr_t gpa, size_t len, int reqprot,
void **cookie)
{
int count, pageoff;
vm_page_t m;
pageoff = gpa & PAGE_MASK;
if (len > PAGE_SIZE - pageoff)
panic("vm_gpa_hold: invalid gpa/len: 0x%016lx/%lu", gpa, len);
count = vm_fault_quick_hold_pages(&vm->vmspace->vm_map,
trunc_page(gpa), PAGE_SIZE, reqprot, &m, 1);
if (count == 1) {
*cookie = m;
return ((void *)(PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m)) + pageoff));
} else {
*cookie = NULL;
return (NULL);
}
}
void
vm_gpa_release(void *cookie)
{
vm_page_t m = cookie;
vm_page_lock(m);
vm_page_unhold(m);
vm_page_unlock(m);
}
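Because guest memory is now pageable, any transient access from the host kernel has to bracket the access with these helpers. A minimal sketch follows (not part of the diff; 'copy_from_guest' is a hypothetical wrapper shown only to illustrate the calling convention):

/*
 * Hypothetical helper illustrating the hold/release protocol: hold the
 * backing page, access it through the returned direct-map address,
 * then drop the hold.
 */
static int
copy_from_guest(struct vm *vm, vm_paddr_t gpa, void *dst, size_t len)
{
    void *cookie, *hva;

    /* vm_gpa_hold() panics if the request crosses a page boundary */
    if (len > PAGE_SIZE - (gpa & PAGE_MASK))
        return (EINVAL);

    hva = vm_gpa_hold(vm, gpa, len, VM_PROT_READ, &cookie);
    if (hva == NULL)
        return (EFAULT);    /* 'gpa' is not backed by the guest vmspace */

    bcopy(hva, dst, len);
    vm_gpa_release(cookie);    /* release the hold on the backing page */
    return (0);
}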
int
@@ -508,13 +639,42 @@ vm_gpabase2memseg(struct vm *vm, vm_paddr_t gpabase,
for (i = 0; i < vm->num_mem_segs; i++) {
if (gpabase == vm->mem_segs[i].gpa) {
-            *seg = vm->mem_segs[i];
+            seg->gpa = vm->mem_segs[i].gpa;
+            seg->len = vm->mem_segs[i].len;
+            seg->wired = vm->mem_segs[i].wired;
return (0);
}
}
return (-1);
}
int
vm_get_memobj(struct vm *vm, vm_paddr_t gpa, size_t len,
vm_offset_t *offset, struct vm_object **object)
{
int i;
size_t seg_len;
vm_paddr_t seg_gpa;
vm_object_t seg_obj;
for (i = 0; i < vm->num_mem_segs; i++) {
if ((seg_obj = vm->mem_segs[i].object) == NULL)
continue;
seg_gpa = vm->mem_segs[i].gpa;
seg_len = vm->mem_segs[i].len;
if (gpa >= seg_gpa && gpa < seg_gpa + seg_len) {
*offset = gpa - seg_gpa;
*object = seg_obj;
vm_object_reference(seg_obj);
return (0);
}
}
return (EINVAL);
}
int
vm_get_register(struct vm *vm, int vcpu, int reg, uint64_t *retval)
{
@@ -633,26 +793,215 @@ save_guest_fpustate(struct vcpu *vcpu)
static VMM_STAT(VCPU_IDLE_TICKS, "number of ticks vcpu was idle");
static int
vcpu_set_state_locked(struct vcpu *vcpu, enum vcpu_state newstate)
{
int error;
vcpu_assert_locked(vcpu);
/*
* The following state transitions are allowed:
* IDLE -> FROZEN -> IDLE
* FROZEN -> RUNNING -> FROZEN
* FROZEN -> SLEEPING -> FROZEN
*/
switch (vcpu->state) {
case VCPU_IDLE:
case VCPU_RUNNING:
case VCPU_SLEEPING:
error = (newstate != VCPU_FROZEN);
break;
case VCPU_FROZEN:
error = (newstate == VCPU_FROZEN);
break;
default:
error = 1;
break;
}
if (error == 0)
vcpu->state = newstate;
else
error = EBUSY;
return (error);
}
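As an aside, a minimal sketch (not from the diff) of how an ioctl path is expected to drive this state machine: freeze the vcpu, operate on it, then return it to IDLE. Only the FROZEN transition is visible in the vmm_dev.c hunks below; the release back to IDLE is an assumption here.

    /* Illustrative only: the FROZEN bracket used around per-vcpu work. */
    error = vcpu_set_state(vm, vcpuid, VCPU_FROZEN);
    if (error == 0) {
        /* ... inspect or modify the frozen vcpu ... */
        (void)vcpu_set_state(vm, vcpuid, VCPU_IDLE);
    }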
static void
vcpu_require_state(struct vm *vm, int vcpuid, enum vcpu_state newstate)
{
int error;
if ((error = vcpu_set_state(vm, vcpuid, newstate)) != 0)
panic("Error %d setting state to %d\n", error, newstate);
}
static void
vcpu_require_state_locked(struct vcpu *vcpu, enum vcpu_state newstate)
{
int error;
if ((error = vcpu_set_state_locked(vcpu, newstate)) != 0)
panic("Error %d setting state to %d", error, newstate);
}
/*
* Emulate a guest 'hlt' by sleeping until the vcpu is ready to run.
*/
static int
vm_handle_hlt(struct vm *vm, int vcpuid, boolean_t *retu)
{
struct vcpu *vcpu;
int sleepticks, t;
vcpu = &vm->vcpu[vcpuid];
vcpu_lock(vcpu);
/*
* Figure out the number of host ticks until the next apic
* timer interrupt in the guest.
*/
sleepticks = lapic_timer_tick(vm, vcpuid);
/*
* If the guest local apic timer is disabled then sleep for
* a long time but not forever.
*/
if (sleepticks < 0)
sleepticks = hz;
/*
* Do a final check for pending NMI or interrupts before
* really putting this thread to sleep.
*
* These interrupts could have happened any time after we
* returned from VMRUN() and before we grabbed the vcpu lock.
*/
if (!vm_nmi_pending(vm, vcpuid) && lapic_pending_intr(vm, vcpuid) < 0) {
if (sleepticks <= 0)
panic("invalid sleepticks %d", sleepticks);
t = ticks;
vcpu_require_state_locked(vcpu, VCPU_SLEEPING);
msleep_spin(vcpu, &vcpu->mtx, "vmidle", sleepticks);
vcpu_require_state_locked(vcpu, VCPU_FROZEN);
vmm_stat_incr(vm, vcpuid, VCPU_IDLE_TICKS, ticks - t);
}
vcpu_unlock(vcpu);
return (0);
}
static int
vm_handle_paging(struct vm *vm, int vcpuid, boolean_t *retu)
{
int rv, ftype;
struct vm_map *map;
struct vcpu *vcpu;
struct vm_exit *vme;
vcpu = &vm->vcpu[vcpuid];
vme = &vcpu->exitinfo;
ftype = vme->u.paging.fault_type;
KASSERT(ftype == VM_PROT_READ ||
ftype == VM_PROT_WRITE || ftype == VM_PROT_EXECUTE,
("vm_handle_paging: invalid fault_type %d", ftype));
if (ftype == VM_PROT_READ || ftype == VM_PROT_WRITE) {
rv = pmap_emulate_accessed_dirty(vmspace_pmap(vm->vmspace),
vme->u.paging.gpa, ftype);
if (rv == 0)
goto done;
}
map = &vm->vmspace->vm_map;
rv = vm_fault(map, vme->u.paging.gpa, ftype, VM_FAULT_NORMAL);
VMM_CTR3(vm, vcpuid, "vm_handle_paging rv = %d, gpa = %#lx, ftype = %d",
rv, vme->u.paging.gpa, ftype);
if (rv != KERN_SUCCESS)
return (EFAULT);
done:
/* restart execution at the faulting instruction */
vme->inst_length = 0;
return (0);
}
static int
vm_handle_inst_emul(struct vm *vm, int vcpuid, boolean_t *retu)
{
struct vie *vie;
struct vcpu *vcpu;
struct vm_exit *vme;
int error, inst_length;
uint64_t rip, gla, gpa, cr3;
vcpu = &vm->vcpu[vcpuid];
vme = &vcpu->exitinfo;
rip = vme->rip;
inst_length = vme->inst_length;
gla = vme->u.inst_emul.gla;
gpa = vme->u.inst_emul.gpa;
cr3 = vme->u.inst_emul.cr3;
vie = &vme->u.inst_emul.vie;
vie_init(vie);
/* Fetch, decode and emulate the faulting instruction */
if (vmm_fetch_instruction(vm, vcpuid, rip, inst_length, cr3, vie) != 0)
return (EFAULT);
if (vmm_decode_instruction(vm, vcpuid, gla, vie) != 0)
return (EFAULT);
/* return to userland unless this is a local apic access */
if (gpa < DEFAULT_APIC_BASE || gpa >= DEFAULT_APIC_BASE + PAGE_SIZE) {
*retu = TRUE;
return (0);
}
error = vmm_emulate_instruction(vm, vcpuid, gpa, vie,
lapic_mmio_read, lapic_mmio_write, 0);
/* return to userland to spin up the AP */
if (error == 0 && vme->exitcode == VM_EXITCODE_SPINUP_AP)
*retu = TRUE;
return (error);
}
int
vm_run(struct vm *vm, struct vm_run *vmrun)
{
-    int error, vcpuid, sleepticks, t;
+    int error, vcpuid;
struct vcpu *vcpu;
struct pcb *pcb;
uint64_t tscval, rip;
struct vm_exit *vme;
boolean_t retu;
pmap_t pmap;
vcpuid = vmrun->cpuid;
if (vcpuid < 0 || vcpuid >= VM_MAXCPU)
return (EINVAL);
pmap = vmspace_pmap(vm->vmspace);
vcpu = &vm->vcpu[vcpuid];
-    vme = &vmrun->vm_exit;
+    vme = &vcpu->exitinfo;
rip = vmrun->rip;
restart:
critical_enter();
KASSERT(!CPU_ISSET(curcpu, &pmap->pm_active),
("vm_run: absurd pm_active"));
tscval = rdtsc();
pcb = PCPU_GET(curpcb);
@@ -661,62 +1010,44 @@ vm_run(struct vm *vm, struct vm_run *vmrun)
restore_guest_msrs(vm, vcpuid);
restore_guest_fpustate(vcpu);
vcpu_require_state(vm, vcpuid, VCPU_RUNNING);
vcpu->hostcpu = curcpu;
-    error = VMRUN(vm->cookie, vcpuid, rip);
+    error = VMRUN(vm->cookie, vcpuid, rip, pmap);
vcpu->hostcpu = NOCPU;
vcpu_require_state(vm, vcpuid, VCPU_FROZEN);
save_guest_fpustate(vcpu);
restore_host_msrs(vm, vcpuid);
vmm_stat_incr(vm, vcpuid, VCPU_TOTAL_RUNTIME, rdtsc() - tscval);
-    /* copy the exit information */
-    bcopy(&vcpu->exitinfo, vme, sizeof(struct vm_exit));
-
     critical_exit();

-    /*
-     * Oblige the guest's desire to 'hlt' by sleeping until the vcpu
-     * is ready to run.
-     */
-    if (error == 0 && vme->exitcode == VM_EXITCODE_HLT) {
-        vcpu_lock(vcpu);
-
-        /*
-         * Figure out the number of host ticks until the next apic
-         * timer interrupt in the guest.
-         */
-        sleepticks = lapic_timer_tick(vm, vcpuid);
-
-        /*
-         * If the guest local apic timer is disabled then sleep for
-         * a long time but not forever.
-         */
-        if (sleepticks < 0)
-            sleepticks = hz;
-
-        /*
-         * Do a final check for pending NMI or interrupts before
-         * really putting this thread to sleep.
-         *
-         * These interrupts could have happened any time after we
-         * returned from VMRUN() and before we grabbed the vcpu lock.
-         */
-        if (!vm_nmi_pending(vm, vcpuid) &&
-            lapic_pending_intr(vm, vcpuid) < 0) {
-            if (sleepticks <= 0)
-                panic("invalid sleepticks %d", sleepticks);
-            t = ticks;
-            msleep_spin(vcpu, &vcpu->mtx, "vmidle", sleepticks);
-            vmm_stat_incr(vm, vcpuid, VCPU_IDLE_TICKS, ticks - t);
+    if (error == 0) {
+        retu = FALSE;
+        switch (vme->exitcode) {
+        case VM_EXITCODE_HLT:
+            error = vm_handle_hlt(vm, vcpuid, &retu);
+            break;
+        case VM_EXITCODE_PAGING:
+            error = vm_handle_paging(vm, vcpuid, &retu);
+            break;
+        case VM_EXITCODE_INST_EMUL:
+            error = vm_handle_inst_emul(vm, vcpuid, &retu);
+            break;
+        default:
+            retu = TRUE;    /* handled in userland */
+            break;
+        }
     }
-        vcpu_unlock(vcpu);
+
+    if (error == 0 && retu == FALSE) {
+        rip = vme->rip + vme->inst_length;
+        goto restart;
+    }
+
+    /* copy the exit information */
+    bcopy(vme, &vmrun->vm_exit, sizeof(struct vm_exit));
return (error);
}
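For context, a hedged sketch of the userland half of this loop: when the kernel sets 'retu' the exit is returned to the thread that issued VM_RUN and dispatched by exit code, much as bhyve's vcpu loop does. The vm_run()/struct vm_exit names below are the libvmmapi wrappers referenced elsewhere in this change; treat the loop as an outline under that assumption, not the actual bhyve code.

    /* Outline of a userland vcpu loop built on the VM_RUN ioctl. */
    struct vm_exit vmexit;
    uint64_t rip = entry_rip;    /* assumed to be set up by the caller */

    for (;;) {
        if (vm_run(ctx, vcpu, rip, &vmexit) != 0)
            break;
        switch (vmexit.exitcode) {
        case VM_EXITCODE_INST_EMUL:
            /* MMIO access that the kernel punted to userland */
            break;
        default:
            break;
        }
        rip = vmexit.rip + vmexit.inst_length;
    }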
@@ -869,7 +1200,7 @@ vm_iommu_domain(struct vm *vm)
}
int
-vcpu_set_state(struct vm *vm, int vcpuid, enum vcpu_state state)
+vcpu_set_state(struct vm *vm, int vcpuid, enum vcpu_state newstate)
{
int error;
struct vcpu *vcpu;
@@ -880,20 +1211,7 @@ vcpu_set_state(struct vm *vm, int vcpuid, enum vcpu_state state)
vcpu = &vm->vcpu[vcpuid];
vcpu_lock(vcpu);
-    /*
-     * The following state transitions are allowed:
-     * IDLE -> RUNNING -> IDLE
-     * IDLE -> CANNOT_RUN -> IDLE
-     */
-    if ((vcpu->state == VCPU_IDLE && state != VCPU_IDLE) ||
-        (vcpu->state != VCPU_IDLE && state == VCPU_IDLE)) {
-        error = 0;
-        vcpu->state = state;
-    } else {
-        error = EBUSY;
-    }
+    error = vcpu_set_state_locked(vcpu, newstate);
vcpu_unlock(vcpu);
return (error);
@@ -979,16 +1297,7 @@ vm_interrupt_hostcpu(struct vm *vm, int vcpuid)
vcpu_lock(vcpu);
hostcpu = vcpu->hostcpu;
if (hostcpu == NOCPU) {
-        /*
-         * If the vcpu is 'RUNNING' but without a valid 'hostcpu' then
-         * the host thread must be sleeping waiting for an event to
-         * kick the vcpu out of 'hlt'.
-         *
-         * XXX this is racy because the condition exists right before
-         * and after calling VMRUN() in vm_run(). The wakeup() is
-         * benign in this case.
-         */
-        if (vcpu->state == VCPU_RUNNING)
+        if (vcpu->state == VCPU_SLEEPING)
wakeup_one(vcpu);
} else {
if (vcpu->state != VCPU_RUNNING)
@@ -998,3 +1307,10 @@ vm_interrupt_hostcpu(struct vm *vm, int vcpuid)
}
vcpu_unlock(vcpu);
}
struct vmspace *
vm_get_vmspace(struct vm *vm)
{
return (vm->vmspace);
}


@@ -44,6 +44,7 @@ __FBSDID("$FreeBSD$");
#include <vm/vm.h>
#include <vm/pmap.h>
#include <vm/vm_map.h>
#include <machine/pmap.h>
#include <machine/vmparam.h>
@@ -95,8 +96,9 @@ vmmdev_lookup2(struct cdev *cdev)
static int
vmmdev_rw(struct cdev *cdev, struct uio *uio, int flags)
{
-    int error, off, c;
-    vm_paddr_t hpa, gpa;
+    int error, off, c, prot;
+    vm_paddr_t gpa;
+    void *hpa, *cookie;
struct vmmdev_softc *sc;
static char zerobuf[PAGE_SIZE];
@@ -107,6 +109,7 @@ vmmdev_rw(struct cdev *cdev, struct uio *uio, int flags)
if (sc == NULL)
error = ENXIO;
prot = (uio->uio_rw == UIO_WRITE ? VM_PROT_WRITE : VM_PROT_READ);
while (uio->uio_resid > 0 && error == 0) {
gpa = uio->uio_offset;
off = gpa & PAGE_MASK;
@@ -120,14 +123,16 @@ vmmdev_rw(struct cdev *cdev, struct uio *uio, int flags)
* Since this device does not support lseek(2), dd(1) will
* read(2) blocks of data to simulate the lseek(2).
*/
-        hpa = vm_gpa2hpa(sc->vm, gpa, c);
-        if (hpa == (vm_paddr_t)-1) {
+        hpa = vm_gpa_hold(sc->vm, gpa, c, prot, &cookie);
+        if (hpa == NULL) {
             if (uio->uio_rw == UIO_READ)
                 error = uiomove(zerobuf, c, uio);
             else
                 error = EFAULT;
-        } else
-            error = uiomove((void *)PHYS_TO_DMAP(hpa), c, uio);
+        } else {
+            error = uiomove(hpa, c, uio);
+            vm_gpa_release(cookie);
+        }
}
mtx_unlock(&vmmdev_mtx);
@@ -139,7 +144,6 @@ vmmdev_ioctl(struct cdev *cdev, u_long cmd, caddr_t data, int fflag,
struct thread *td)
{
int error, vcpu, state_changed;
-    enum vcpu_state new_state;
struct vmmdev_softc *sc;
struct vm_memory_segment *seg;
struct vm_register *vmreg;
@@ -156,6 +160,7 @@ vmmdev_ioctl(struct cdev *cdev, u_long cmd, caddr_t data, int fflag,
struct vm_stats *vmstats;
struct vm_stat_desc *statdesc;
struct vm_x2apic *x2apic;
struct vm_gpa_pte *gpapte;
sc = vmmdev_lookup2(cdev);
if (sc == NULL)
@@ -189,12 +194,7 @@ vmmdev_ioctl(struct cdev *cdev, u_long cmd, caddr_t data, int fflag,
goto done;
}
-        if (cmd == VM_RUN)
-            new_state = VCPU_RUNNING;
-        else
-            new_state = VCPU_CANNOT_RUN;
-
-        error = vcpu_set_state(sc->vm, vcpu, new_state);
+        error = vcpu_set_state(sc->vm, vcpu, VCPU_FROZEN);
if (error)
goto done;
@@ -211,7 +211,7 @@ vmmdev_ioctl(struct cdev *cdev, u_long cmd, caddr_t data, int fflag,
*/
error = 0;
for (vcpu = 0; vcpu < VM_MAXCPU; vcpu++) {
-            error = vcpu_set_state(sc->vm, vcpu, VCPU_CANNOT_RUN);
+            error = vcpu_set_state(sc->vm, vcpu, VCPU_FROZEN);
if (error)
break;
}
@@ -271,13 +271,13 @@ vmmdev_ioctl(struct cdev *cdev, u_long cmd, caddr_t data, int fflag,
break;
case VM_BIND_PPTDEV:
pptdev = (struct vm_pptdev *)data;
-        error = ppt_assign_device(sc->vm, pptdev->bus, pptdev->slot,
-            pptdev->func);
+        error = vm_assign_pptdev(sc->vm, pptdev->bus, pptdev->slot,
+            pptdev->func);
break;
case VM_UNBIND_PPTDEV:
pptdev = (struct vm_pptdev *)data;
-        error = ppt_unassign_device(sc->vm, pptdev->bus, pptdev->slot,
-            pptdev->func);
+        error = vm_unassign_pptdev(sc->vm, pptdev->bus, pptdev->slot,
+            pptdev->func);
break;
case VM_INJECT_EVENT:
vmevent = (struct vm_event *)data;
@@ -348,6 +348,12 @@ vmmdev_ioctl(struct cdev *cdev, u_long cmd, caddr_t data, int fflag,
error = vm_get_x2apic_state(sc->vm,
x2apic->cpuid, &x2apic->state);
break;
case VM_GET_GPA_PMAP:
gpapte = (struct vm_gpa_pte *)data;
pmap_get_mapping(vmspace_pmap(vm_get_vmspace(sc->vm)),
gpapte->gpa, gpapte->pte, &gpapte->ptenum);
error = 0;
break;
default:
error = ENOTTY;
break;
@@ -361,25 +367,25 @@ vmmdev_ioctl(struct cdev *cdev, u_long cmd, caddr_t data, int fflag,
}
done:
/* Make sure that no handler returns a bogus value like ERESTART */
KASSERT(error >= 0, ("vmmdev_ioctl: invalid error return %d", error));
return (error);
}
static int
-vmmdev_mmap(struct cdev *cdev, vm_ooffset_t offset, vm_paddr_t *paddr,
-    int nprot, vm_memattr_t *memattr)
+vmmdev_mmap_single(struct cdev *cdev, vm_ooffset_t *offset,
+    vm_size_t size, struct vm_object **object, int nprot)
{
int error;
struct vmmdev_softc *sc;
-    error = -1;
     mtx_lock(&vmmdev_mtx);
     sc = vmmdev_lookup2(cdev);
-    if (sc != NULL && (nprot & PROT_EXEC) == 0) {
-        *paddr = vm_gpa2hpa(sc->vm, (vm_paddr_t)offset, PAGE_SIZE);
-        if (*paddr != (vm_paddr_t)-1)
-            error = 0;
-    }
+    if (sc != NULL && (nprot & PROT_EXEC) == 0)
+        error = vm_get_memobj(sc->vm, *offset, size, offset, object);
+    else
+        error = EINVAL;
mtx_unlock(&vmmdev_mtx);
@@ -446,7 +452,7 @@ static struct cdevsw vmmdevsw = {
.d_name = "vmmdev",
.d_version = D_VERSION,
.d_ioctl = vmmdev_ioctl,
-    .d_mmap        = vmmdev_mmap,
+    .d_mmap_single = vmmdev_mmap_single,
.d_read = vmmdev_rw,
.d_write = vmmdev_rw,
};


@@ -465,7 +465,7 @@ vmm_emulate_instruction(void *vm, int vcpuid, uint64_t gpa, struct vie *vie,
}
#ifdef _KERNEL
-static void
+void
vie_init(struct vie *vie)
{
@@ -479,9 +479,9 @@ static int
gla2gpa(struct vm *vm, uint64_t gla, uint64_t ptpphys,
uint64_t *gpa, uint64_t *gpaend)
{
-    vm_paddr_t hpa;
     int nlevels, ptpshift, ptpindex;
     uint64_t *ptpbase, pte, pgsize;
+    void *cookie;
/*
* XXX assumes 64-bit guest with 4 page walk levels
@@ -491,18 +491,19 @@ gla2gpa(struct vm *vm, uint64_t gla, uint64_t ptpphys,
/* Zero out the lower 12 bits and the upper 12 bits */
ptpphys >>= 12; ptpphys <<= 24; ptpphys >>= 12;
-        hpa = vm_gpa2hpa(vm, ptpphys, PAGE_SIZE);
-        if (hpa == -1)
+        ptpbase = vm_gpa_hold(vm, ptpphys, PAGE_SIZE, VM_PROT_READ,
+            &cookie);
+        if (ptpbase == NULL)
             goto error;
-
-        ptpbase = (uint64_t *)PHYS_TO_DMAP(hpa);
ptpshift = PAGE_SHIFT + nlevels * 9;
ptpindex = (gla >> ptpshift) & 0x1FF;
pgsize = 1UL << ptpshift;
pte = ptpbase[ptpindex];
vm_gpa_release(cookie);
if ((pte & PG_V) == 0)
goto error;
@@ -530,18 +531,18 @@ int
vmm_fetch_instruction(struct vm *vm, int cpuid, uint64_t rip, int inst_length,
uint64_t cr3, struct vie *vie)
{
-    int n, err;
-    uint64_t hpa, gpa, gpaend, off;
+    int n, err, prot;
+    uint64_t gpa, gpaend, off;
+    void *hpa, *cookie;
/*
* XXX cache previously fetched instructions using 'rip' as the tag
*/
prot = VM_PROT_READ | VM_PROT_EXECUTE;
if (inst_length > VIE_INST_SIZE)
panic("vmm_fetch_instruction: invalid length %d", inst_length);
vie_init(vie);
/* Copy the instruction into 'vie' */
while (vie->num_valid < inst_length) {
err = gla2gpa(vm, rip, cr3, &gpa, &gpaend);
@@ -551,11 +552,12 @@ vmm_fetch_instruction(struct vm *vm, int cpuid, uint64_t rip, int inst_length,
off = gpa & PAGE_MASK;
n = min(inst_length - vie->num_valid, PAGE_SIZE - off);
-        hpa = vm_gpa2hpa(vm, gpa, n);
-        if (hpa == -1)
+        if ((hpa = vm_gpa_hold(vm, gpa, n, prot, &cookie)) == NULL)
             break;

-        bcopy((void *)PHYS_TO_DMAP(hpa), &vie->inst[vie->num_valid], n);
+        bcopy(hpa, &vie->inst[vie->num_valid], n);
+        vm_gpa_release(cookie);
rip += n;
vie->num_valid += n;


@@ -30,40 +30,24 @@
__FBSDID("$FreeBSD$");
#include <sys/param.h>
-#include <sys/lock.h>
-#include <sys/mutex.h>
-#include <sys/linker.h>
 #include <sys/systm.h>
 #include <sys/malloc.h>
-#include <sys/kernel.h>
-#include <sys/sysctl.h>
+#include <sys/sglist.h>
+#include <sys/lock.h>
+#include <sys/rwlock.h>

 #include <vm/vm.h>
+#include <vm/vm_param.h>
+#include <vm/pmap.h>
+#include <vm/vm_map.h>
+#include <vm/vm_object.h>
 #include <vm/vm_page.h>
-#include <vm/vm_pageout.h>
+#include <vm/vm_pager.h>
#include <machine/md_var.h>
#include <machine/metadata.h>
#include <machine/pc/bios.h>
#include <machine/vmparam.h>
#include <machine/pmap.h>
#include "vmm_util.h"
#include "vmm_mem.h"
-SYSCTL_DECL(_hw_vmm);
-
-static u_long pages_allocated;
-SYSCTL_ULONG(_hw_vmm, OID_AUTO, pages_allocated, CTLFLAG_RD,
-    &pages_allocated, 0, "4KB pages allocated");
-
-static void
-update_pages_allocated(int howmany)
-{
-    pages_allocated += howmany;    /* XXX locking? */
-}
int
vmm_mem_init(void)
{
@@ -71,60 +55,95 @@ vmm_mem_init(void)
return (0);
}
-vm_paddr_t
-vmm_mem_alloc(size_t size)
+vm_object_t
+vmm_mmio_alloc(struct vmspace *vmspace, vm_paddr_t gpa, size_t len,
+    vm_paddr_t hpa)
 {
-    int flags;
-    vm_page_t m;
-    vm_paddr_t pa;
+    int error;
+    vm_object_t obj;
+    struct sglist *sg;

-    if (size != PAGE_SIZE)
-        panic("vmm_mem_alloc: invalid allocation size %lu", size);
+    sg = sglist_alloc(1, M_WAITOK);
+    error = sglist_append_phys(sg, hpa, len);
+    KASSERT(error == 0, ("error %d appending physaddr to sglist", error));

-    flags = VM_ALLOC_NORMAL | VM_ALLOC_NOOBJ | VM_ALLOC_WIRED |
-        VM_ALLOC_ZERO;
-
-    while (1) {
+    obj = vm_pager_allocate(OBJT_SG, sg, len, VM_PROT_RW, 0, NULL);
+    if (obj != NULL) {
         /*
-         * XXX need policy to determine when to back off the allocation
+         * VT-x ignores the MTRR settings when figuring out the
+         * memory type for translations obtained through EPT.
+         *
+         * Therefore we explicitly force the pages provided by
+         * this object to be mapped as uncacheable.
         */
-        m = vm_page_alloc(NULL, 0, flags);
-        if (m == NULL)
-            VM_WAIT;
-        else
-            break;
+        VM_OBJECT_WLOCK(obj);
+        error = vm_object_set_memattr(obj, VM_MEMATTR_UNCACHEABLE);
+        VM_OBJECT_WUNLOCK(obj);
+        if (error != KERN_SUCCESS) {
+            panic("vmm_mmio_alloc: vm_object_set_memattr error %d",
+                error);
+        }
+        error = vm_map_find(&vmspace->vm_map, obj, 0, &gpa, len, 0,
+            VMFS_NO_SPACE, VM_PROT_RW, VM_PROT_RW, 0);
+        if (error != KERN_SUCCESS) {
+            vm_object_deallocate(obj);
+            obj = NULL;
+        }
     }

-    pa = VM_PAGE_TO_PHYS(m);
-    if ((m->flags & PG_ZERO) == 0)
-        pagezero((void *)PHYS_TO_DMAP(pa));
-    m->valid = VM_PAGE_BITS_ALL;
+    /*
+     * Drop the reference on the sglist.
+     *
+     * If the scatter/gather object was successfully allocated then it
+     * has incremented the reference count on the sglist. Dropping the
+     * initial reference count ensures that the sglist will be freed
+     * when the object is deallocated.
+     *
+     * If the object could not be allocated then we end up freeing the
+     * sglist.
+     */
+    sglist_free(sg);

-    update_pages_allocated(1);
-
-    return (pa);
+    return (obj);
 }
void
-vmm_mem_free(vm_paddr_t base, size_t length)
+vmm_mmio_free(struct vmspace *vmspace, vm_paddr_t gpa, size_t len)
 {
-    vm_page_t m;
-
-    if (base & PAGE_MASK) {
-        panic("vmm_mem_free: base 0x%0lx must be aligned on a "
-            "0x%0x boundary\n", base, PAGE_SIZE);
+    vm_map_remove(&vmspace->vm_map, gpa, gpa + len);
 }
+
+vm_object_t
+vmm_mem_alloc(struct vmspace *vmspace, vm_paddr_t gpa, size_t len)
+{
+    int error;
+    vm_object_t obj;
+
+    if (gpa & PAGE_MASK)
+        panic("vmm_mem_alloc: invalid gpa %#lx", gpa);
+
+    if (len == 0 || (len & PAGE_MASK) != 0)
+        panic("vmm_mem_alloc: invalid allocation size %lu", len);
+
+    obj = vm_object_allocate(OBJT_DEFAULT, len >> PAGE_SHIFT);
+    if (obj != NULL) {
+        error = vm_map_find(&vmspace->vm_map, obj, 0, &gpa, len, 0,
+            VMFS_NO_SPACE, VM_PROT_ALL, VM_PROT_ALL, 0);
+        if (error != KERN_SUCCESS) {
+            vm_object_deallocate(obj);
+            obj = NULL;
+        }
+    }
-
-    if (length != PAGE_SIZE)
-        panic("vmm_mem_free: invalid length %lu", length);
+
+    return (obj);
 }
-
-    m = PHYS_TO_VM_PAGE(base);
-    m->wire_count--;
-    vm_page_free(m);
-    atomic_subtract_int(&cnt.v_wire_count, 1);
+
+void
+vmm_mem_free(struct vmspace *vmspace, vm_paddr_t gpa, size_t len)
+{
-    update_pages_allocated(-1);
+    vm_map_remove(&vmspace->vm_map, gpa, gpa + len);
 }
vm_paddr_t


@@ -29,9 +29,15 @@
#ifndef _VMM_MEM_H_
#define _VMM_MEM_H_
struct vmspace;
struct vm_object;
int vmm_mem_init(void);
-vm_paddr_t vmm_mem_alloc(size_t size);
-void       vmm_mem_free(vm_paddr_t start, size_t size);
+struct vm_object *vmm_mem_alloc(struct vmspace *, vm_paddr_t gpa, size_t size);
+struct vm_object *vmm_mmio_alloc(struct vmspace *, vm_paddr_t gpa, size_t len,
+    vm_paddr_t hpa);
+void       vmm_mem_free(struct vmspace *, vm_paddr_t gpa, size_t size);
+void       vmm_mmio_free(struct vmspace *, vm_paddr_t gpa, size_t size);
vm_paddr_t vmm_mem_maxaddr(void);
#endif


@@ -101,7 +101,7 @@ struct bhyvestats {
uint64_t vmexit_hlt;
uint64_t vmexit_pause;
uint64_t vmexit_mtrap;
-    uint64_t vmexit_paging;
+    uint64_t vmexit_inst_emul;
uint64_t cpu_switch_rotate;
uint64_t cpu_switch_direct;
int io_reset;
@@ -208,14 +208,12 @@ fbsdrun_addcpu(struct vmctx *ctx, int vcpu, uint64_t rip)
vmexit[vcpu].rip = rip;
vmexit[vcpu].inst_length = 0;
-    if (vcpu == BSP) {
-        mt_vmm_info[vcpu].mt_ctx = ctx;
-        mt_vmm_info[vcpu].mt_vcpu = vcpu;
-        error = pthread_create(&mt_vmm_info[vcpu].mt_thr, NULL,
-            fbsdrun_start_thread, &mt_vmm_info[vcpu]);
-        assert(error == 0);
-    }
+    mt_vmm_info[vcpu].mt_ctx = ctx;
+    mt_vmm_info[vcpu].mt_vcpu = vcpu;
+    error = pthread_create(&mt_vmm_info[vcpu].mt_thr, NULL,
+        fbsdrun_start_thread, &mt_vmm_info[vcpu]);
+    assert(error == 0);
}
static int
@@ -385,13 +383,13 @@ vmexit_mtrap(struct vmctx *ctx, struct vm_exit *vmexit, int *pvcpu)
}
static int
-vmexit_paging(struct vmctx *ctx, struct vm_exit *vmexit, int *pvcpu)
+vmexit_inst_emul(struct vmctx *ctx, struct vm_exit *vmexit, int *pvcpu)
 {
     int err;

-    stats.vmexit_paging++;
+    stats.vmexit_inst_emul++;

-    err = emulate_mem(ctx, *pvcpu, vmexit->u.paging.gpa,
-        &vmexit->u.paging.vie);
+    err = emulate_mem(ctx, *pvcpu, vmexit->u.inst_emul.gpa,
+        &vmexit->u.inst_emul.vie);
if (err) {
if (err == EINVAL) {
@@ -400,7 +398,7 @@ vmexit_paging(struct vmctx *ctx, struct vm_exit *vmexit, int *pvcpu)
vmexit->rip);
} else if (err == ESRCH) {
fprintf(stderr, "Unhandled memory access to 0x%lx\n",
-                vmexit->u.paging.gpa);
+                vmexit->u.inst_emul.gpa);
}
return (VMEXIT_ABORT);
@@ -416,7 +414,7 @@ static vmexit_handler_t handler[VM_EXITCODE_MAX] = {
[VM_EXITCODE_RDMSR] = vmexit_rdmsr,
[VM_EXITCODE_WRMSR] = vmexit_wrmsr,
[VM_EXITCODE_MTRAP] = vmexit_mtrap,
-    [VM_EXITCODE_PAGING] = vmexit_paging,
+    [VM_EXITCODE_INST_EMUL] = vmexit_inst_emul,
[VM_EXITCODE_SPINUP_AP] = vmexit_spinup_ap,
};


@@ -1048,7 +1048,7 @@ init_pci(struct vmctx *ctx)
* Accesses to memory addresses that are not allocated to system
* memory or PCI devices return 0xff's.
*/
-    error = vm_get_memory_seg(ctx, 0, &lowmem);
+    error = vm_get_memory_seg(ctx, 0, &lowmem, NULL);
assert(error == 0);
memset(&memp, 0, sizeof(struct mem_range));


@@ -341,14 +341,14 @@ rtc_init(struct vmctx *ctx)
* 0x34/0x35 - 64KB chunks above 16MB, below 4GB
* 0x5b/0x5c/0x5d - 64KB chunks above 4GB
*/
-    err = vm_get_memory_seg(ctx, 0, &lomem);
+    err = vm_get_memory_seg(ctx, 0, &lomem, NULL);
assert(err == 0);
lomem = (lomem - m_16MB) / m_64KB;
rtc_nvram[nvoff(RTC_LMEM_LSB)] = lomem;
rtc_nvram[nvoff(RTC_LMEM_MSB)] = lomem >> 8;
-    if (vm_get_memory_seg(ctx, m_4GB, &himem) == 0) {
+    if (vm_get_memory_seg(ctx, m_4GB, &himem, NULL) == 0) {
himem /= m_64KB;
rtc_nvram[nvoff(RTC_HMEM_LSB)] = himem;
rtc_nvram[nvoff(RTC_HMEM_SB)] = himem >> 8;
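As a worked example of the nvram encoding above (illustrative size, not taken from the diff): a guest with 1 GB below 4 GB stores (1 GB - 16 MB) / 64 KB = 16128 = 0x3f00 chunks, so RTC_LMEM_LSB reads back as 0x00 and RTC_LMEM_MSB as 0x3f.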


@@ -188,12 +188,13 @@ usage(void)
         "       [--unassign-pptdev=<bus/slot/func>]\n"
         "       [--set-mem=<memory in units of MB>]\n"
         "       [--get-lowmem]\n"
-        "       [--get-highmem]\n",
+        "       [--get-highmem]\n"
+        "       [--get-gpa-pmap]\n",
progname);
exit(1);
}
-static int get_stats, getcap, setcap, capval;
+static int get_stats, getcap, setcap, capval, get_gpa_pmap;
static const char *capname;
static int create, destroy, get_lowmem, get_highmem;
static uint64_t memsize;
@@ -377,18 +378,20 @@ enum {
SET_CAP,
CAPNAME,
UNASSIGN_PPTDEV,
GET_GPA_PMAP,
};
int
main(int argc, char *argv[])
{
char *vmname;
-    int error, ch, vcpu;
-    vm_paddr_t gpa;
+    int error, ch, vcpu, ptenum;
+    vm_paddr_t gpa, gpa_pmap;
size_t len;
struct vm_exit vmexit;
-    uint64_t ctl, eptp, bm, addr, u64;
+    uint64_t ctl, eptp, bm, addr, u64, pteval[4], *pte;
struct vmctx *ctx;
int wired;
uint64_t cr0, cr3, cr4, dr7, rsp, rip, rflags, efer, pat;
uint64_t rax, rbx, rcx, rdx, rsi, rdi, rbp;
@@ -427,6 +430,7 @@ main(int argc, char *argv[])
{ "capname", REQ_ARG, 0, CAPNAME },
{ "unassign-pptdev", REQ_ARG, 0, UNASSIGN_PPTDEV },
{ "setcap", REQ_ARG, 0, SET_CAP },
{ "get-gpa-pmap", REQ_ARG, 0, GET_GPA_PMAP },
{ "getcap", NO_ARG, &getcap, 1 },
{ "get-stats", NO_ARG, &get_stats, 1 },
{ "get-desc-ds",NO_ARG, &get_desc_ds, 1 },
@@ -666,6 +670,10 @@ main(int argc, char *argv[])
capval = strtoul(optarg, NULL, 0);
setcap = 1;
break;
case GET_GPA_PMAP:
gpa_pmap = strtoul(optarg, NULL, 0);
get_gpa_pmap = 1;
break;
case CAPNAME:
capname = optarg;
break;
@@ -819,16 +827,18 @@ main(int argc, char *argv[])
if (!error && (get_lowmem || get_all)) {
gpa = 0;
-        error = vm_get_memory_seg(ctx, gpa, &len);
+        error = vm_get_memory_seg(ctx, gpa, &len, &wired);
         if (error == 0)
-            printf("lowmem\t\t0x%016lx/%ld\n", gpa, len);
+            printf("lowmem\t\t0x%016lx/%ld%s\n", gpa, len,
+                wired ? " wired" : "");
}
if (!error && (get_highmem || get_all)) {
gpa = 4 * GB;
-        error = vm_get_memory_seg(ctx, gpa, &len);
+        error = vm_get_memory_seg(ctx, gpa, &len, &wired);
         if (error == 0)
-            printf("highmem\t\t0x%016lx/%ld\n", gpa, len);
+            printf("highmem\t\t0x%016lx/%ld%s\n", gpa, len,
+                wired ? " wired" : "");
}
if (!error && (get_efer || get_all)) {
@@ -1457,6 +1467,17 @@ main(int argc, char *argv[])
printf("Capability \"%s\" is not available\n", capname);
}
if (!error && get_gpa_pmap) {
error = vm_get_gpa_pmap(ctx, gpa_pmap, pteval, &ptenum);
if (error == 0) {
printf("gpa %#lx:", gpa_pmap);
pte = &pteval[0];
while (ptenum-- > 0)
printf(" %#lx", *pte++);
printf("\n");
}
}
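A small usage sketch of the new interface from a libvmmapi client (the VM name and gpa below are made up; vm_get_gpa_pmap() is the wrapper used above and vm_open() is assumed to be the usual context constructor):

    struct vmctx *ctx;
    uint64_t pte[4];
    int i, num;

    ctx = vm_open("testvm");    /* hypothetical VM name */
    if (ctx != NULL && vm_get_gpa_pmap(ctx, 0x100000, pte, &num) == 0) {
        /* print one entry per level of the nested page table walk */
        for (i = 0; i < num; i++)
            printf("level %d pte %#lx\n", i, pte[i]);
    }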
if (!error && (getcap || get_all)) {
int captype, val, getcaptype;


@@ -492,8 +492,8 @@ static void
cb_getmem(void *arg, uint64_t *ret_lowmem, uint64_t *ret_highmem)
{
-    vm_get_memory_seg(ctx, 0, ret_lowmem);
-    vm_get_memory_seg(ctx, 4 * GB, ret_highmem);
+    vm_get_memory_seg(ctx, 0, ret_lowmem, NULL);
+    vm_get_memory_seg(ctx, 4 * GB, ret_highmem, NULL);
}
static const char *