MFC - tracking commit

markm 2013-10-06 09:37:57 +00:00
parent 2368f0cbd1
commit 21998ad688
46 changed files with 2440 additions and 1295 deletions

View File

@ -124,7 +124,8 @@ vm_destroy(struct vmctx *vm)
}
int
vm_get_memory_seg(struct vmctx *ctx, vm_paddr_t gpa, size_t *ret_len)
vm_get_memory_seg(struct vmctx *ctx, vm_paddr_t gpa, size_t *ret_len,
int *wired)
{
int error;
struct vm_memory_segment seg;
@ -133,6 +134,8 @@ vm_get_memory_seg(struct vmctx *ctx, vm_paddr_t gpa, size_t *ret_len)
seg.gpa = gpa;
error = ioctl(ctx->fd, VM_GET_MEMORY_SEG, &seg);
*ret_len = seg.len;
if (wired != NULL)
*wired = seg.wired;
return (error);
}
@ -741,3 +744,23 @@ vcpu_reset(struct vmctx *vmctx, int vcpu)
done:
return (error);
}
int
vm_get_gpa_pmap(struct vmctx *ctx, uint64_t gpa, uint64_t *pte, int *num)
{
int error, i;
struct vm_gpa_pte gpapte;
bzero(&gpapte, sizeof(gpapte));
gpapte.gpa = gpa;
error = ioctl(ctx->fd, VM_GET_GPA_PMAP, &gpapte);
if (error == 0) {
*num = gpapte.ptenum;
for (i = 0; i < gpapte.ptenum; i++)
pte[i] = gpapte.pte[i];
}
return (error);
}
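
The two calls above are thin wrappers around new vmm ioctls. A minimal, hypothetical caller (not part of this commit; the guest name "testvm" and the error handling are illustrative) might look like this, using the prototypes from vmmapi.h shown below:

#include <sys/types.h>
#include <machine/vmm.h>
#include <vmmapi.h>

#include <stdio.h>
#include <stdlib.h>

int
main(void)
{
	struct vmctx *ctx;
	uint64_t pte[4];	/* struct vm_gpa_pte returns at most 4 entries */
	size_t len;
	int i, num, wired;

	ctx = vm_open("testvm");	/* placeholder guest name */
	if (ctx == NULL)
		exit(1);

	/* The new 'wired' out-parameter reports whether the segment is wired. */
	if (vm_get_memory_seg(ctx, 0, &len, &wired) == 0)
		printf("segment at 0: len %zu, wired %d\n", len, wired);

	/* Walk the nested page table entries mapping guest physical 0. */
	if (vm_get_gpa_pmap(ctx, 0, pte, &num) == 0) {
		for (i = 0; i < num; i++)
			printf("pte[%d] = %#lx\n", i, (unsigned long)pte[i]);
	}
	return (0);
}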

View File

@ -45,9 +45,11 @@ enum vm_mmap_style {
int vm_create(const char *name);
struct vmctx *vm_open(const char *name);
void vm_destroy(struct vmctx *ctx);
int vm_get_memory_seg(struct vmctx *ctx, vm_paddr_t gpa, size_t *ret_len);
int vm_get_memory_seg(struct vmctx *ctx, vm_paddr_t gpa, size_t *ret_len,
int *wired);
int vm_setup_memory(struct vmctx *ctx, size_t len, enum vm_mmap_style s);
void *vm_map_gpa(struct vmctx *ctx, vm_paddr_t gaddr, size_t len);
int vm_get_gpa_pmap(struct vmctx *, uint64_t gpa, uint64_t *pte, int *num);
uint32_t vm_get_lowmem_limit(struct vmctx *ctx);
void vm_set_lowmem_limit(struct vmctx *ctx, uint32_t limit);
int vm_set_desc(struct vmctx *ctx, int vcpu, int reg,

View File

@ -1574,7 +1574,7 @@ getmemsize(caddr_t kmdp, u_int64_t first)
/*
* map page into kernel: valid, read/write,non-cacheable
*/
*pte = pa | PG_V | PG_RW | PG_N;
*pte = pa | PG_V | PG_RW | PG_NC_PWT | PG_NC_PCD;
invltlb();
tmp = *(int *)ptr;

File diff suppressed because it is too large.

View File

@ -733,6 +733,14 @@ trap_pfault(frame, usermode)
}
}
/*
* If the trap was caused by errant bits in the PTE then panic.
*/
if (frame->tf_err & PGEX_RSV) {
trap_fatal(frame, eva);
return (-1);
}
/*
* PGEX_I is defined only if the execute disable bit capability is
* supported and enabled.
@ -822,10 +830,11 @@ trap_fatal(frame, eva)
#endif
if (type == T_PAGEFLT) {
printf("fault virtual address = 0x%lx\n", eva);
printf("fault code = %s %s %s, %s\n",
printf("fault code = %s %s %s%s, %s\n",
code & PGEX_U ? "user" : "supervisor",
code & PGEX_W ? "write" : "read",
code & PGEX_I ? "instruction" : "data",
code & PGEX_RSV ? " rsv" : "",
code & PGEX_P ? "protection violation" : "page not present");
}
printf("instruction pointer = 0x%lx:0x%lx\n",

View File

@ -62,7 +62,8 @@
u_int pc_cmci_mask; /* MCx banks for CMCI */ \
uint64_t pc_dbreg[16]; /* ddb debugging regs */ \
int pc_dbreg_cmd; /* ddb debugging reg cmd */ \
char __pad[161] /* be divisor of PAGE_SIZE \
u_int pc_vcpu_id; /* Xen vCPU ID */ \
char __pad[157] /* be divisor of PAGE_SIZE \
after cache alignment */
#define PC_DBREG_CMD_NONE 0

View File

@ -50,41 +50,74 @@
* of the fields not present here and there, depending on a lot of things.
*/
/* ---- Intel Nomenclature ---- */
#define PG_V 0x001 /* P Valid */
#define PG_RW 0x002 /* R/W Read/Write */
#define PG_U 0x004 /* U/S User/Supervisor */
#define PG_NC_PWT 0x008 /* PWT Write through */
#define PG_NC_PCD 0x010 /* PCD Cache disable */
#define PG_A 0x020 /* A Accessed */
#define PG_M 0x040 /* D Dirty */
#define PG_PS 0x080 /* PS Page size (0=4k,1=2M) */
#define PG_PTE_PAT 0x080 /* PAT PAT index */
#define PG_G 0x100 /* G Global */
#define PG_AVAIL1 0x200 /* / Available for system */
#define PG_AVAIL2 0x400 /* < programmers use */
#define PG_AVAIL3 0x800 /* \ */
#define PG_PDE_PAT 0x1000 /* PAT PAT index */
#define PG_NX (1ul<<63) /* No-execute */
/* Our various interpretations of the above */
#define PG_W PG_AVAIL1 /* "Wired" pseudoflag */
#define PG_MANAGED PG_AVAIL2
#define PG_FRAME (0x000ffffffffff000ul)
#define PG_PS_FRAME (0x000fffffffe00000ul)
#define PG_PROT (PG_RW|PG_U) /* all protection bits . */
#define PG_N (PG_NC_PWT|PG_NC_PCD) /* Non-cacheable */
#define X86_PG_V 0x001 /* P Valid */
#define X86_PG_RW 0x002 /* R/W Read/Write */
#define X86_PG_U 0x004 /* U/S User/Supervisor */
#define X86_PG_NC_PWT 0x008 /* PWT Write through */
#define X86_PG_NC_PCD 0x010 /* PCD Cache disable */
#define X86_PG_A 0x020 /* A Accessed */
#define X86_PG_M 0x040 /* D Dirty */
#define X86_PG_PS 0x080 /* PS Page size (0=4k,1=2M) */
#define X86_PG_PTE_PAT 0x080 /* PAT PAT index */
#define X86_PG_G 0x100 /* G Global */
#define X86_PG_AVAIL1 0x200 /* / Available for system */
#define X86_PG_AVAIL2 0x400 /* < programmers use */
#define X86_PG_AVAIL3 0x800 /* \ */
#define X86_PG_PDE_PAT 0x1000 /* PAT PAT index */
#define X86_PG_NX (1ul<<63) /* No-execute */
#define X86_PG_AVAIL(x) (1ul << (x))
/* Page level cache control fields used to determine the PAT type */
#define PG_PDE_CACHE (PG_PDE_PAT | PG_NC_PWT | PG_NC_PCD)
#define PG_PTE_CACHE (PG_PTE_PAT | PG_NC_PWT | PG_NC_PCD)
#define X86_PG_PDE_CACHE (X86_PG_PDE_PAT | X86_PG_NC_PWT | X86_PG_NC_PCD)
#define X86_PG_PTE_CACHE (X86_PG_PTE_PAT | X86_PG_NC_PWT | X86_PG_NC_PCD)
/*
* Intel extended page table (EPT) bit definitions.
*/
#define EPT_PG_READ 0x001 /* R Read */
#define EPT_PG_WRITE 0x002 /* W Write */
#define EPT_PG_EXECUTE 0x004 /* X Execute */
#define EPT_PG_IGNORE_PAT 0x040 /* IPAT Ignore PAT */
#define EPT_PG_PS 0x080 /* PS Page size */
#define EPT_PG_A 0x100 /* A Accessed */
#define EPT_PG_M 0x200 /* D Dirty */
#define EPT_PG_MEMORY_TYPE(x) ((x) << 3) /* MT Memory Type */
/*
* Define the PG_xx macros in terms of the bits on x86 PTEs.
*/
#define PG_V X86_PG_V
#define PG_RW X86_PG_RW
#define PG_U X86_PG_U
#define PG_NC_PWT X86_PG_NC_PWT
#define PG_NC_PCD X86_PG_NC_PCD
#define PG_A X86_PG_A
#define PG_M X86_PG_M
#define PG_PS X86_PG_PS
#define PG_PTE_PAT X86_PG_PTE_PAT
#define PG_G X86_PG_G
#define PG_AVAIL1 X86_PG_AVAIL1
#define PG_AVAIL2 X86_PG_AVAIL2
#define PG_AVAIL3 X86_PG_AVAIL3
#define PG_PDE_PAT X86_PG_PDE_PAT
#define PG_NX X86_PG_NX
#define PG_PDE_CACHE X86_PG_PDE_CACHE
#define PG_PTE_CACHE X86_PG_PTE_CACHE
/* Our various interpretations of the above */
#define PG_W X86_PG_AVAIL3 /* "Wired" pseudoflag */
#define PG_MANAGED X86_PG_AVAIL2
#define EPT_PG_EMUL_V X86_PG_AVAIL(52)
#define EPT_PG_EMUL_RW X86_PG_AVAIL(53)
#define PG_FRAME (0x000ffffffffff000ul)
#define PG_PS_FRAME (0x000fffffffe00000ul)
/*
* Promotion to a 2MB (PDE) page mapping requires that the corresponding 4KB
* (PTE) page mappings have identical settings for the following fields:
*/
#define PG_PTE_PROMOTE (PG_NX | PG_MANAGED | PG_W | PG_G | PG_PTE_PAT | \
PG_M | PG_A | PG_NC_PCD | PG_NC_PWT | PG_U | PG_RW | PG_V)
#define PG_PTE_PROMOTE (PG_NX | PG_MANAGED | PG_W | PG_G | PG_PTE_CACHE | \
PG_M | PG_A | PG_U | PG_RW | PG_V)
/*
* Page Protection Exception bits
@ -96,6 +129,28 @@
#define PGEX_RSV 0x08 /* reserved PTE field is non-zero */
#define PGEX_I 0x10 /* during an instruction fetch */
/*
* undef the PG_xx macros that define bits in the regular x86 PTEs that
* have a different position in nested PTEs. This is done when compiling
* code that needs to be aware of the differences between regular x86 and
* nested PTEs.
*
* The appropriate bitmask will be calculated at runtime based on the pmap
* type.
*/
#ifdef AMD64_NPT_AWARE
#undef PG_AVAIL1 /* X86_PG_AVAIL1 aliases with EPT_PG_M */
#undef PG_G
#undef PG_A
#undef PG_M
#undef PG_PDE_PAT
#undef PG_PDE_CACHE
#undef PG_PTE_PAT
#undef PG_PTE_CACHE
#undef PG_RW
#undef PG_V
#endif
/*
* Pte related macros. This is complicated by having to deal with
* the sign extension of the 48th bit.
@ -256,6 +311,11 @@ struct pmap {
int pm_flags;
};
/* flags */
#define PMAP_PDE_SUPERPAGE (1 << 0) /* supports 2MB superpages */
#define PMAP_EMULATE_AD_BITS (1 << 1) /* needs A/D bits emulation */
#define PMAP_SUPPORTS_EXEC_ONLY (1 << 2) /* execute only mappings ok */
typedef struct pmap *pmap_t;
#ifdef _KERNEL
@ -272,6 +332,9 @@ extern struct pmap kernel_pmap_store;
#define PMAP_MTX(pmap) (&(pmap)->pm_mtx)
#define PMAP_TRYLOCK(pmap) mtx_trylock(&(pmap)->pm_mtx)
#define PMAP_UNLOCK(pmap) mtx_unlock(&(pmap)->pm_mtx)
int pmap_pinit_type(pmap_t pmap, enum pmap_type pm_type, int flags);
int pmap_emulate_accessed_dirty(pmap_t pmap, vm_offset_t va, int ftype);
#endif
/*
@ -330,7 +393,7 @@ void pmap_invalidate_all(pmap_t);
void pmap_invalidate_cache(void);
void pmap_invalidate_cache_pages(vm_page_t *pages, int count);
void pmap_invalidate_cache_range(vm_offset_t sva, vm_offset_t eva);
void pmap_get_mapping(pmap_t pmap, vm_offset_t va, uint64_t *ptr, int *num);
#endif /* _KERNEL */
#endif /* !LOCORE */
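
The comment above notes that, under AMD64_NPT_AWARE, the PG_xx bits that differ between regular x86 PTEs and nested (EPT) PTEs must be chosen at runtime from the pmap type; the corresponding pmap implementation changes are in the large suppressed diff earlier in this commit. A hedged sketch of that pattern follows, using a hypothetical helper; it assumes the pmap records its type in a pm_type field and that a PT_X86 enumerator exists alongside the PT_EPT one used by ept.c later in this commit:

/*
 * Sketch only: pmap_valid_bit() is a hypothetical helper showing how
 * NPT-aware code can pick the "valid" bits at runtime.  An x86 pmap uses
 * the architectural present bit, while an EPT pmap either uses the
 * software-emulated valid bit or treats a readable entry as valid.
 */
static __inline pt_entry_t
pmap_valid_bit(pmap_t pmap)
{
	pt_entry_t mask;

	switch (pmap->pm_type) {
	case PT_X86:
		mask = X86_PG_V;
		break;
	case PT_EPT:
		if (pmap->pm_flags & PMAP_EMULATE_AD_BITS)
			mask = EPT_PG_EMUL_V;
		else
			mask = EPT_PG_READ;
		break;
	default:
		panic("pmap_valid_bit: invalid pm_type %d", pmap->pm_type);
	}
	return (mask);
}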

View File

@ -39,19 +39,18 @@ struct seg_desc;
struct vm_exit;
struct vm_run;
struct vlapic;
struct vmspace;
struct vm_object;
struct pmap;
enum x2apic_state;
typedef int (*vmm_init_func_t)(void);
typedef int (*vmm_cleanup_func_t)(void);
typedef void * (*vmi_init_func_t)(struct vm *vm); /* instance specific apis */
typedef int (*vmi_run_func_t)(void *vmi, int vcpu, register_t rip);
typedef void * (*vmi_init_func_t)(struct vm *vm, struct pmap *pmap);
typedef int (*vmi_run_func_t)(void *vmi, int vcpu, register_t rip,
struct pmap *pmap);
typedef void (*vmi_cleanup_func_t)(void *vmi);
typedef int (*vmi_mmap_set_func_t)(void *vmi, vm_paddr_t gpa,
vm_paddr_t hpa, size_t length,
vm_memattr_t attr, int prot,
boolean_t superpages_ok);
typedef vm_paddr_t (*vmi_mmap_get_func_t)(void *vmi, vm_paddr_t gpa);
typedef int (*vmi_get_register_t)(void *vmi, int vcpu, int num,
uint64_t *retval);
typedef int (*vmi_set_register_t)(void *vmi, int vcpu, int num,
@ -65,6 +64,8 @@ typedef int (*vmi_inject_event_t)(void *vmi, int vcpu,
uint32_t code, int code_valid);
typedef int (*vmi_get_cap_t)(void *vmi, int vcpu, int num, int *retval);
typedef int (*vmi_set_cap_t)(void *vmi, int vcpu, int num, int val);
typedef struct vmspace * (*vmi_vmspace_alloc)(vm_offset_t min, vm_offset_t max);
typedef void (*vmi_vmspace_free)(struct vmspace *vmspace);
struct vmm_ops {
vmm_init_func_t init; /* module wide initialization */
@ -73,8 +74,6 @@ struct vmm_ops {
vmi_init_func_t vminit; /* vm-specific initialization */
vmi_run_func_t vmrun;
vmi_cleanup_func_t vmcleanup;
vmi_mmap_set_func_t vmmmap_set;
vmi_mmap_get_func_t vmmmap_get;
vmi_get_register_t vmgetreg;
vmi_set_register_t vmsetreg;
vmi_get_desc_t vmgetdesc;
@ -82,6 +81,8 @@ struct vmm_ops {
vmi_inject_event_t vminject;
vmi_get_cap_t vmgetcap;
vmi_set_cap_t vmsetcap;
vmi_vmspace_alloc vmspace_alloc;
vmi_vmspace_free vmspace_free;
};
extern struct vmm_ops vmm_ops_intel;
@ -93,9 +94,14 @@ const char *vm_name(struct vm *vm);
int vm_malloc(struct vm *vm, vm_paddr_t gpa, size_t len);
int vm_map_mmio(struct vm *vm, vm_paddr_t gpa, size_t len, vm_paddr_t hpa);
int vm_unmap_mmio(struct vm *vm, vm_paddr_t gpa, size_t len);
vm_paddr_t vm_gpa2hpa(struct vm *vm, vm_paddr_t gpa, size_t size);
void *vm_gpa_hold(struct vm *, vm_paddr_t gpa, size_t len, int prot,
void **cookie);
void vm_gpa_release(void *cookie);
int vm_gpabase2memseg(struct vm *vm, vm_paddr_t gpabase,
struct vm_memory_segment *seg);
int vm_get_memobj(struct vm *vm, vm_paddr_t gpa, size_t len,
vm_offset_t *offset, struct vm_object **object);
boolean_t vm_mem_allocated(struct vm *vm, vm_paddr_t gpa);
int vm_get_register(struct vm *vm, int vcpu, int reg, uint64_t *retval);
int vm_set_register(struct vm *vm, int vcpu, int reg, uint64_t val);
int vm_get_seg_desc(struct vm *vm, int vcpu, int reg,
@ -130,8 +136,9 @@ void *vm_iommu_domain(struct vm *vm);
enum vcpu_state {
VCPU_IDLE,
VCPU_FROZEN,
VCPU_RUNNING,
VCPU_CANNOT_RUN,
VCPU_SLEEPING,
};
int vcpu_set_state(struct vm *vm, int vcpu, enum vcpu_state state);
@ -145,7 +152,9 @@ vcpu_is_running(struct vm *vm, int vcpu, int *hostcpu)
void *vcpu_stats(struct vm *vm, int vcpu);
void vm_interrupt_hostcpu(struct vm *vm, int vcpu);
struct vmspace *vm_get_vmspace(struct vm *vm);
int vm_assign_pptdev(struct vm *vm, int bus, int slot, int func);
int vm_unassign_pptdev(struct vm *vm, int bus, int slot, int func);
#endif /* KERNEL */
#include <machine/vmm_instruction_emul.h>
@ -247,6 +256,7 @@ enum vm_exitcode {
VM_EXITCODE_MTRAP,
VM_EXITCODE_PAUSE,
VM_EXITCODE_PAGING,
VM_EXITCODE_INST_EMUL,
VM_EXITCODE_SPINUP_AP,
VM_EXITCODE_MAX
};
@ -266,8 +276,15 @@ struct vm_exit {
} inout;
struct {
uint64_t gpa;
struct vie vie;
int fault_type;
int protection;
} paging;
struct {
uint64_t gpa;
uint64_t gla;
uint64_t cr3;
struct vie vie;
} inst_emul;
/*
* VMX specific payload. Used when there is no "better"
* exitcode to represent the VM-exit.
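
The vm_exit union above now separates a nested page fault (VM_EXITCODE_PAGING, carrying the gpa, fault type and allowed protection) from an access that requires instruction emulation (VM_EXITCODE_INST_EMUL, carrying gpa/gla/cr3 and the decoded instruction). A sketch of how a consumer of this structure might dispatch on the two exit codes; handle_paging() and handle_inst_emul() are placeholders, not functions added by this commit:

/* Placeholder handlers for illustration only. */
static int handle_paging(struct vm *vm, int vcpu, uint64_t gpa, int ftype);
static int handle_inst_emul(struct vm *vm, int vcpu, uint64_t gpa,
    uint64_t gla, uint64_t cr3, struct vie *vie);

static int
handle_exit(struct vm *vm, int vcpu, struct vm_exit *vme)
{
	switch (vme->exitcode) {
	case VM_EXITCODE_PAGING:
		/* Fault on memory the guest owns: fix up the mapping. */
		return (handle_paging(vm, vcpu, vme->u.paging.gpa,
		    vme->u.paging.fault_type));
	case VM_EXITCODE_INST_EMUL:
		/* Access outside guest memory: emulate the instruction. */
		return (handle_inst_emul(vm, vcpu, vme->u.inst_emul.gpa,
		    vme->u.inst_emul.gla, vme->u.inst_emul.cr3,
		    &vme->u.inst_emul.vie));
	default:
		return (EINVAL);
	}
}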

View File

@ -36,7 +36,8 @@ int vmmdev_cleanup(void);
struct vm_memory_segment {
vm_paddr_t gpa; /* in */
size_t len; /* in */
size_t len;
int wired;
};
struct vm_register {
@ -135,6 +136,12 @@ struct vm_x2apic {
enum x2apic_state state;
};
struct vm_gpa_pte {
uint64_t gpa; /* in */
uint64_t pte[4]; /* out */
int ptenum;
};
enum {
/* general routines */
IOCNUM_ABIVERS = 0,
@ -145,6 +152,7 @@ enum {
/* memory apis */
IOCNUM_MAP_MEMORY = 10,
IOCNUM_GET_MEMORY_SEG = 11,
IOCNUM_GET_GPA_PMAP = 12,
/* register/state accessors */
IOCNUM_SET_REGISTER = 20,
@ -215,4 +223,6 @@ enum {
_IOW('v', IOCNUM_SET_X2APIC_STATE, struct vm_x2apic)
#define VM_GET_X2APIC_STATE \
_IOWR('v', IOCNUM_GET_X2APIC_STATE, struct vm_x2apic)
#define VM_GET_GPA_PMAP \
_IOWR('v', IOCNUM_GET_GPA_PMAP, struct vm_gpa_pte)
#endif

View File

@ -102,11 +102,15 @@ int vmm_emulate_instruction(void *vm, int cpuid, uint64_t gpa, struct vie *vie,
#ifdef _KERNEL
/*
* APIs to fetch and decode the instruction from nested page fault handler.
*
* 'vie' must be initialized before calling 'vmm_fetch_instruction()'
*/
int vmm_fetch_instruction(struct vm *vm, int cpuid,
uint64_t rip, int inst_length, uint64_t cr3,
struct vie *vie);
void vie_init(struct vie *vie);
/*
* Decode the instruction fetched into 'vie' so it can be emulated.
*
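
The note above spells out the calling contract: vie_init() must run before vmm_fetch_instruction(), and only a fetched and decoded instruction can be emulated. A minimal sketch of that ordering, reusing the local APIC MMIO callbacks from the call that this commit removes from vmx.c (error codes and required headers are illustrative, not prescribed by this header):

static int
emulate_fault(struct vm *vm, int vcpuid, uint64_t rip, int inst_length,
    uint64_t cr3, uint64_t gla, uint64_t gpa)
{
	struct vie vie;

	vie_init(&vie);			/* must precede the fetch */

	if (vmm_fetch_instruction(vm, vcpuid, rip, inst_length, cr3, &vie) != 0)
		return (EFAULT);
	if (vmm_decode_instruction(vm, vcpuid, gla, &vie) != 0)
		return (EINVAL);

	/* Emulate against the guest physical address that faulted. */
	return (vmm_emulate_instruction(vm, vcpuid, gpa, &vie,
	    lapic_mmio_read, lapic_mmio_write, 0));
}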

View File

@ -54,7 +54,7 @@ amdv_cleanup(void)
}
static void *
amdv_vminit(struct vm *vm)
amdv_vminit(struct vm *vm, struct pmap *pmap)
{
printf("amdv_vminit: not implemented\n");
@ -62,7 +62,7 @@ amdv_vminit(struct vm *vm)
}
static int
amdv_vmrun(void *arg, int vcpu, register_t rip)
amdv_vmrun(void *arg, int vcpu, register_t rip, struct pmap *pmap)
{
printf("amdv_vmrun: not implemented\n");
@ -77,23 +77,6 @@ amdv_vmcleanup(void *arg)
return;
}
static int
amdv_vmmmap_set(void *arg, vm_paddr_t gpa, vm_paddr_t hpa, size_t length,
vm_memattr_t attr, int prot, boolean_t spok)
{
printf("amdv_vmmmap_set: not implemented\n");
return (EINVAL);
}
static vm_paddr_t
amdv_vmmmap_get(void *arg, vm_paddr_t gpa)
{
printf("amdv_vmmmap_get: not implemented\n");
return (EINVAL);
}
static int
amdv_getreg(void *arg, int vcpu, int regnum, uint64_t *retval)
{
@ -151,21 +134,37 @@ amdv_setcap(void *arg, int vcpu, int type, int val)
return (EINVAL);
}
static struct vmspace *
amdv_vmspace_alloc(vm_offset_t min, vm_offset_t max)
{
printf("amdv_vmspace_alloc: not implemented\n");
return (NULL);
}
static void
amdv_vmspace_free(struct vmspace *vmspace)
{
printf("amdv_vmspace_free: not implemented\n");
return;
}
struct vmm_ops vmm_ops_amd = {
amdv_init,
amdv_cleanup,
amdv_vminit,
amdv_vmrun,
amdv_vmcleanup,
amdv_vmmmap_set,
amdv_vmmmap_get,
amdv_getreg,
amdv_setreg,
amdv_getdesc,
amdv_setdesc,
amdv_inject_event,
amdv_getcap,
amdv_setcap
amdv_setcap,
amdv_vmspace_alloc,
amdv_vmspace_free,
};
static int

View File

@ -29,32 +29,31 @@
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
#include <sys/param.h>
#include <sys/kernel.h>
#include <sys/types.h>
#include <sys/errno.h>
#include <sys/systm.h>
#include <sys/malloc.h>
#include <sys/smp.h>
#include <sys/sysctl.h>
#include <vm/vm.h>
#include <vm/pmap.h>
#include <machine/param.h>
#include <machine/cpufunc.h>
#include <machine/pmap.h>
#include <machine/vmparam.h>
#include <vm/vm_extern.h>
#include <machine/vmm.h>
#include "vmx_cpufunc.h"
#include "vmx_msr.h"
#include "vmx.h"
#include "ept.h"
#define EPT_SUPPORTS_EXEC_ONLY(cap) ((cap) & (1UL << 0))
#define EPT_PWL4(cap) ((cap) & (1UL << 6))
#define EPT_MEMORY_TYPE_WB(cap) ((cap) & (1UL << 14))
#define EPT_PDE_SUPERPAGE(cap) ((cap) & (1UL << 16)) /* 2MB pages */
#define EPT_PDPTE_SUPERPAGE(cap) ((cap) & (1UL << 17)) /* 1GB pages */
#define INVVPID_SUPPORTED(cap) ((cap) & (1UL << 32))
#define INVEPT_SUPPORTED(cap) ((cap) & (1UL << 20))
#define AD_BITS_SUPPORTED(cap) ((cap) & (1UL << 21))
#define INVVPID_SUPPORTED(cap) ((cap) & (1UL << 32))
#define INVVPID_ALL_TYPES_MASK 0xF0000000000UL
#define INVVPID_ALL_TYPES_SUPPORTED(cap) \
@ -64,28 +63,22 @@ __FBSDID("$FreeBSD$");
#define INVEPT_ALL_TYPES_SUPPORTED(cap) \
(((cap) & INVEPT_ALL_TYPES_MASK) == INVEPT_ALL_TYPES_MASK)
#define EPT_PG_RD (1 << 0)
#define EPT_PG_WR (1 << 1)
#define EPT_PG_EX (1 << 2)
#define EPT_PG_MEMORY_TYPE(x) ((x) << 3)
#define EPT_PG_IGNORE_PAT (1 << 6)
#define EPT_PG_SUPERPAGE (1 << 7)
#define EPT_PWLEVELS 4 /* page walk levels */
#define EPT_ENABLE_AD_BITS (1 << 6)
#define EPT_ADDR_MASK ((uint64_t)-1 << 12)
SYSCTL_DECL(_hw_vmm);
SYSCTL_NODE(_hw_vmm, OID_AUTO, ept, CTLFLAG_RW, NULL, NULL);
MALLOC_DECLARE(M_VMX);
static int ept_enable_ad_bits;
static uint64_t page_sizes_mask;
/*
* Set this to 1 to have the EPT tables respect the guest PAT settings
*/
static int ept_pat_passthru;
static int ept_pmap_flags;
SYSCTL_INT(_hw_vmm_ept, OID_AUTO, pmap_flags, CTLFLAG_RD,
&ept_pmap_flags, 0, NULL);
int
ept_init(void)
{
int page_shift;
int use_hw_ad_bits, use_superpages, use_exec_only;
uint64_t cap;
cap = rdmsr(MSR_VMX_EPT_VPID_CAP);
@ -105,17 +98,22 @@ ept_init(void)
!INVEPT_ALL_TYPES_SUPPORTED(cap))
return (EINVAL);
/* Set bits in 'page_sizes_mask' for each valid page size */
page_shift = PAGE_SHIFT;
page_sizes_mask = 1UL << page_shift; /* 4KB page */
use_superpages = 1;
TUNABLE_INT_FETCH("hw.vmm.ept.use_superpages", &use_superpages);
if (use_superpages && EPT_PDE_SUPERPAGE(cap))
ept_pmap_flags |= PMAP_PDE_SUPERPAGE; /* 2MB superpage */
page_shift += 9;
if (EPT_PDE_SUPERPAGE(cap))
page_sizes_mask |= 1UL << page_shift; /* 2MB superpage */
use_hw_ad_bits = 1;
TUNABLE_INT_FETCH("hw.vmm.ept.use_hw_ad_bits", &use_hw_ad_bits);
if (use_hw_ad_bits && AD_BITS_SUPPORTED(cap))
ept_enable_ad_bits = 1;
else
ept_pmap_flags |= PMAP_EMULATE_AD_BITS;
page_shift += 9;
if (EPT_PDPTE_SUPERPAGE(cap))
page_sizes_mask |= 1UL << page_shift; /* 1GB superpage */
use_exec_only = 1;
TUNABLE_INT_FETCH("hw.vmm.ept.use_exec_only", &use_exec_only);
if (use_exec_only && EPT_SUPPORTS_EXEC_ONLY(cap))
ept_pmap_flags |= PMAP_SUPPORTS_EXEC_ONLY;
return (0);
}
@ -154,233 +152,6 @@ ept_dump(uint64_t *ptp, int nlevels)
}
#endif
static size_t
ept_create_mapping(uint64_t *ptp, vm_paddr_t gpa, vm_paddr_t hpa, size_t length,
vm_memattr_t attr, vm_prot_t prot, boolean_t spok)
{
int spshift, ptpshift, ptpindex, nlevels;
/*
* Compute the size of the mapping that we can accommodate.
*
* This is based on three factors:
* - super page sizes supported by the processor
* - alignment of the region starting at 'gpa' and 'hpa'
* - length of the region 'len'
*/
spshift = PAGE_SHIFT;
if (spok)
spshift += (EPT_PWLEVELS - 1) * 9;
while (spshift >= PAGE_SHIFT) {
uint64_t spsize = 1UL << spshift;
if ((page_sizes_mask & spsize) != 0 &&
(gpa & (spsize - 1)) == 0 &&
(hpa & (spsize - 1)) == 0 &&
length >= spsize) {
break;
}
spshift -= 9;
}
if (spshift < PAGE_SHIFT) {
panic("Invalid spshift for gpa 0x%016lx, hpa 0x%016lx, "
"length 0x%016lx, page_sizes_mask 0x%016lx",
gpa, hpa, length, page_sizes_mask);
}
nlevels = EPT_PWLEVELS;
while (--nlevels >= 0) {
ptpshift = PAGE_SHIFT + nlevels * 9;
ptpindex = (gpa >> ptpshift) & 0x1FF;
/* We have reached the leaf mapping */
if (spshift >= ptpshift)
break;
/*
* We are working on a non-leaf page table page.
*
* Create the next level page table page if necessary and point
* to it from the current page table.
*/
if (ptp[ptpindex] == 0) {
void *nlp = malloc(PAGE_SIZE, M_VMX, M_WAITOK | M_ZERO);
ptp[ptpindex] = vtophys(nlp);
ptp[ptpindex] |= EPT_PG_RD | EPT_PG_WR | EPT_PG_EX;
}
/* Work our way down to the next level page table page */
ptp = (uint64_t *)PHYS_TO_DMAP(ptp[ptpindex] & EPT_ADDR_MASK);
}
if ((gpa & ((1UL << ptpshift) - 1)) != 0) {
panic("ept_create_mapping: gpa 0x%016lx and ptpshift %d "
"mismatch\n", gpa, ptpshift);
}
if (prot != VM_PROT_NONE) {
/* Do the mapping */
ptp[ptpindex] = hpa;
/* Apply the access controls */
if (prot & VM_PROT_READ)
ptp[ptpindex] |= EPT_PG_RD;
if (prot & VM_PROT_WRITE)
ptp[ptpindex] |= EPT_PG_WR;
if (prot & VM_PROT_EXECUTE)
ptp[ptpindex] |= EPT_PG_EX;
/*
* By default the PAT type is ignored - this appears to
* be how other hypervisors handle EPT. Allow this to be
* overridden.
*/
ptp[ptpindex] |= EPT_PG_MEMORY_TYPE(attr);
if (!ept_pat_passthru)
ptp[ptpindex] |= EPT_PG_IGNORE_PAT;
if (nlevels > 0)
ptp[ptpindex] |= EPT_PG_SUPERPAGE;
} else {
/* Remove the mapping */
ptp[ptpindex] = 0;
}
return (1UL << ptpshift);
}
static vm_paddr_t
ept_lookup_mapping(uint64_t *ptp, vm_paddr_t gpa)
{
int nlevels, ptpshift, ptpindex;
uint64_t ptpval, hpabase, pgmask;
nlevels = EPT_PWLEVELS;
while (--nlevels >= 0) {
ptpshift = PAGE_SHIFT + nlevels * 9;
ptpindex = (gpa >> ptpshift) & 0x1FF;
ptpval = ptp[ptpindex];
/* Cannot make progress beyond this point */
if ((ptpval & (EPT_PG_RD | EPT_PG_WR | EPT_PG_EX)) == 0)
break;
if (nlevels == 0 || (ptpval & EPT_PG_SUPERPAGE)) {
pgmask = (1UL << ptpshift) - 1;
hpabase = ptpval & ~pgmask;
return (hpabase | (gpa & pgmask));
}
/* Work our way down to the next level page table page */
ptp = (uint64_t *)PHYS_TO_DMAP(ptpval & EPT_ADDR_MASK);
}
return ((vm_paddr_t)-1);
}
static void
ept_free_pt_entry(pt_entry_t pte)
{
if (pte == 0)
return;
/* sanity check */
if ((pte & EPT_PG_SUPERPAGE) != 0)
panic("ept_free_pt_entry: pte cannot have superpage bit");
return;
}
static void
ept_free_pd_entry(pd_entry_t pde)
{
pt_entry_t *pt;
int i;
if (pde == 0)
return;
if ((pde & EPT_PG_SUPERPAGE) == 0) {
pt = (pt_entry_t *)PHYS_TO_DMAP(pde & EPT_ADDR_MASK);
for (i = 0; i < NPTEPG; i++)
ept_free_pt_entry(pt[i]);
free(pt, M_VMX); /* free the page table page */
}
}
static void
ept_free_pdp_entry(pdp_entry_t pdpe)
{
pd_entry_t *pd;
int i;
if (pdpe == 0)
return;
if ((pdpe & EPT_PG_SUPERPAGE) == 0) {
pd = (pd_entry_t *)PHYS_TO_DMAP(pdpe & EPT_ADDR_MASK);
for (i = 0; i < NPDEPG; i++)
ept_free_pd_entry(pd[i]);
free(pd, M_VMX); /* free the page directory page */
}
}
static void
ept_free_pml4_entry(pml4_entry_t pml4e)
{
pdp_entry_t *pdp;
int i;
if (pml4e == 0)
return;
if ((pml4e & EPT_PG_SUPERPAGE) == 0) {
pdp = (pdp_entry_t *)PHYS_TO_DMAP(pml4e & EPT_ADDR_MASK);
for (i = 0; i < NPDPEPG; i++)
ept_free_pdp_entry(pdp[i]);
free(pdp, M_VMX); /* free the page directory ptr page */
}
}
void
ept_vmcleanup(struct vmx *vmx)
{
int i;
for (i = 0; i < NPML4EPG; i++)
ept_free_pml4_entry(vmx->pml4ept[i]);
}
int
ept_vmmmap_set(void *arg, vm_paddr_t gpa, vm_paddr_t hpa, size_t len,
vm_memattr_t attr, int prot, boolean_t spok)
{
size_t n;
struct vmx *vmx = arg;
while (len > 0) {
n = ept_create_mapping(vmx->pml4ept, gpa, hpa, len, attr,
prot, spok);
len -= n;
gpa += n;
hpa += n;
}
return (0);
}
vm_paddr_t
ept_vmmmap_get(void *arg, vm_paddr_t gpa)
{
vm_paddr_t hpa;
struct vmx *vmx;
vmx = arg;
hpa = ept_lookup_mapping(vmx->pml4ept, gpa);
return (hpa);
}
static void
invept_single_context(void *arg)
{
@ -390,11 +161,44 @@ invept_single_context(void *arg)
}
void
ept_invalidate_mappings(u_long pml4ept)
ept_invalidate_mappings(u_long eptp)
{
struct invept_desc invept_desc = { 0 };
invept_desc.eptp = EPTP(pml4ept);
invept_desc.eptp = eptp;
smp_rendezvous(NULL, invept_single_context, NULL, &invept_desc);
}
static int
ept_pinit(pmap_t pmap)
{
return (pmap_pinit_type(pmap, PT_EPT, ept_pmap_flags));
}
struct vmspace *
ept_vmspace_alloc(vm_offset_t min, vm_offset_t max)
{
return (vmspace_alloc(min, max, ept_pinit));
}
void
ept_vmspace_free(struct vmspace *vmspace)
{
vmspace_free(vmspace);
}
uint64_t
eptp(uint64_t pml4)
{
uint64_t eptp_val;
eptp_val = pml4 | (EPT_PWLEVELS - 1) << 3 | PAT_WRITE_BACK;
if (ept_enable_ad_bits)
eptp_val |= EPT_ENABLE_AD_BITS;
return (eptp_val);
}
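
The new eptp() helper packs the EPT pointer format: bits 5:3 hold the page-walk length minus one, the low three bits hold the memory type, and bit 6 enables hardware A/D tracking when ept_enable_ad_bits is set. A worked example, assuming PAT_WRITE_BACK has its usual x86 value of 6:

/*
 * eptp(0x123456000) with hardware A/D bits enabled:
 *
 *   pml4 physical address        0x123456000
 *   (EPT_PWLEVELS - 1) << 3      0x018   four-level page walk
 *   PAT_WRITE_BACK               0x006   write-back memory type
 *   EPT_ENABLE_AD_BITS           0x040   only if ept_enable_ad_bits != 0
 *   -----------------------------------------
 *   resulting EPTP               0x12345605e
 */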

View File

@ -31,13 +31,9 @@
struct vmx;
#define EPT_PWLEVELS 4 /* page walk levels */
#define EPTP(pml4) ((pml4) | (EPT_PWLEVELS - 1) << 3 | PAT_WRITE_BACK)
int ept_init(void);
int ept_vmmmap_set(void *arg, vm_paddr_t gpa, vm_paddr_t hpa, size_t length,
vm_memattr_t attr, int prot, boolean_t allow_superpage_mappings);
vm_paddr_t ept_vmmmap_get(void *arg, vm_paddr_t gpa);
void ept_invalidate_mappings(u_long ept_pml4);
void ept_vmcleanup(struct vmx *vmx);
void ept_invalidate_mappings(u_long eptp);
struct vmspace *ept_vmspace_alloc(vm_offset_t min, vm_offset_t max);
void ept_vmspace_free(struct vmspace *vmspace);
uint64_t eptp(uint64_t pml4);
#endif

View File

@ -318,14 +318,14 @@ done:
int
vmcs_set_defaults(struct vmcs *vmcs,
u_long host_rip, u_long host_rsp, u_long ept_pml4,
u_long host_rip, u_long host_rsp, uint64_t eptp,
uint32_t pinbased_ctls, uint32_t procbased_ctls,
uint32_t procbased_ctls2, uint32_t exit_ctls,
uint32_t entry_ctls, u_long msr_bitmap, uint16_t vpid)
{
int error, codesel, datasel, tsssel;
u_long cr0, cr4, efer;
uint64_t eptp, pat, fsbase, idtrbase;
uint64_t pat, fsbase, idtrbase;
uint32_t exc_bitmap;
codesel = vmm_get_host_codesel();
@ -432,7 +432,6 @@ vmcs_set_defaults(struct vmcs *vmcs,
goto done;
/* eptp */
eptp = EPTP(ept_pml4);
if ((error = vmwrite(VMCS_EPTP, eptp)) != 0)
goto done;

View File

@ -47,7 +47,7 @@ struct msr_entry {
int vmcs_set_msr_save(struct vmcs *vmcs, u_long g_area, u_int g_count);
int vmcs_set_defaults(struct vmcs *vmcs, u_long host_rip, u_long host_rsp,
u_long ept_pml4,
uint64_t eptp,
uint32_t pinbased_ctls, uint32_t procbased_ctls,
uint32_t procbased_ctls2, uint32_t exit_ctls,
uint32_t entry_ctls, u_long msr_bitmap,
@ -68,6 +68,8 @@ uint64_t vmcs_read(uint32_t encoding);
#define vmcs_guest_cr3() vmcs_read(VMCS_GUEST_CR3)
#define vmcs_gpa() vmcs_read(VMCS_GUEST_PHYSICAL_ADDRESS)
#define vmcs_gla() vmcs_read(VMCS_GUEST_LINEAR_ADDRESS)
#define vmcs_idt_vectoring_info() vmcs_read(VMCS_IDT_VECTORING_INFO)
#define vmcs_idt_vectoring_err() vmcs_read(VMCS_IDT_VECTORING_ERROR)
#endif /* _KERNEL */
@ -313,6 +315,12 @@ uint64_t vmcs_read(uint32_t encoding);
#define VMCS_INTERRUPTION_INFO_HW_INTR (0 << 8)
#define VMCS_INTERRUPTION_INFO_NMI (2 << 8)
/*
* VMCS IDT-Vectoring information fields
*/
#define VMCS_IDT_VEC_VALID (1 << 31)
#define VMCS_IDT_VEC_ERRCODE_VALID (1 << 11)
/*
* VMCS Guest interruptibility field
*/
@ -332,6 +340,9 @@ uint64_t vmcs_read(uint32_t encoding);
#define EPT_VIOLATION_DATA_READ (1UL << 0)
#define EPT_VIOLATION_DATA_WRITE (1UL << 1)
#define EPT_VIOLATION_INST_FETCH (1UL << 2)
#define EPT_VIOLATION_GPA_READABLE (1UL << 3)
#define EPT_VIOLATION_GPA_WRITEABLE (1UL << 4)
#define EPT_VIOLATION_GPA_EXECUTABLE (1UL << 5)
#define EPT_VIOLATION_GLA_VALID (1UL << 7)
#define EPT_VIOLATION_XLAT_VALID (1UL << 8)

View File

@ -49,8 +49,6 @@ __FBSDID("$FreeBSD$");
#include <machine/specialreg.h>
#include <machine/vmparam.h>
#include <x86/apicreg.h>
#include <machine/vmm.h>
#include "vmm_host.h"
#include "vmm_lapic.h"
@ -167,9 +165,6 @@ static int cap_pause_exit;
static int cap_unrestricted_guest;
static int cap_monitor_trap;
/* statistics */
static VMM_STAT_INTEL(VMEXIT_HLT_IGNORED, "number of times hlt was ignored");
static struct unrhdr *vpid_unr;
static u_int vpid_alloc_failed;
SYSCTL_UINT(_hw_vmm_vmx, OID_AUTO, vpid_alloc_failed, CTLFLAG_RD,
@ -740,7 +735,7 @@ vmx_setup_cr_shadow(int which, struct vmcs *vmcs, uint32_t initial)
#define vmx_setup_cr4_shadow(vmcs,init) vmx_setup_cr_shadow(4, (vmcs), (init))
static void *
vmx_vminit(struct vm *vm)
vmx_vminit(struct vm *vm, pmap_t pmap)
{
uint16_t vpid[VM_MAXCPU];
int i, error, guest_msr_count;
@ -753,6 +748,8 @@ vmx_vminit(struct vm *vm)
}
vmx->vm = vm;
vmx->eptp = eptp(vtophys((vm_offset_t)pmap->pm_pml4));
/*
* Clean up EPTP-tagged guest physical and combined mappings
*
@ -762,7 +759,7 @@ vmx_vminit(struct vm *vm)
*
* Combined mappings for this EP4TA are also invalidated for all VPIDs.
*/
ept_invalidate_mappings(vtophys(vmx->pml4ept));
ept_invalidate_mappings(vmx->eptp);
msr_bitmap_initialize(vmx->msr_bitmap);
@ -818,7 +815,7 @@ vmx_vminit(struct vm *vm)
error = vmcs_set_defaults(&vmx->vmcs[i],
(u_long)vmx_longjmp,
(u_long)&vmx->ctx[i],
vtophys(vmx->pml4ept),
vmx->eptp,
pinbased_ctls,
procbased_ctls,
procbased_ctls2,
@ -856,6 +853,9 @@ vmx_vminit(struct vm *vm)
error = vmx_setup_cr4_shadow(&vmx->vmcs[i], 0);
if (error != 0)
panic("vmx_setup_cr4_shadow %d", error);
vmx->ctx[i].pmap = pmap;
vmx->ctx[i].eptp = vmx->eptp;
}
return (vmx);
@ -1281,21 +1281,49 @@ vmx_emulate_cr_access(struct vmx *vmx, int vcpu, uint64_t exitqual)
}
static int
vmx_ept_fault(struct vm *vm, int cpu,
uint64_t gla, uint64_t gpa, uint64_t rip, int inst_length,
uint64_t cr3, uint64_t ept_qual, struct vie *vie)
ept_fault_type(uint64_t ept_qual)
{
int read, write, error;
int fault_type;
/* EPT violation on an instruction fetch doesn't make sense here */
if (ept_qual & EPT_VIOLATION_DATA_WRITE)
fault_type = VM_PROT_WRITE;
else if (ept_qual & EPT_VIOLATION_INST_FETCH)
fault_type = VM_PROT_EXECUTE;
else
fault_type = VM_PROT_READ;
return (fault_type);
}
static int
ept_protection(uint64_t ept_qual)
{
int prot = 0;
if (ept_qual & EPT_VIOLATION_GPA_READABLE)
prot |= VM_PROT_READ;
if (ept_qual & EPT_VIOLATION_GPA_WRITEABLE)
prot |= VM_PROT_WRITE;
if (ept_qual & EPT_VIOLATION_GPA_EXECUTABLE)
prot |= VM_PROT_EXECUTE;
return (prot);
}
static boolean_t
ept_emulation_fault(uint64_t ept_qual)
{
int read, write;
/* EPT fault on an instruction fetch doesn't make sense here */
if (ept_qual & EPT_VIOLATION_INST_FETCH)
return (UNHANDLED);
return (FALSE);
/* EPT violation must be a read fault or a write fault */
/* EPT fault must be a read fault or a write fault */
read = ept_qual & EPT_VIOLATION_DATA_READ ? 1 : 0;
write = ept_qual & EPT_VIOLATION_DATA_WRITE ? 1 : 0;
if ((read | write) == 0)
return (UNHANDLED);
return (FALSE);
/*
* The EPT violation must have been caused by accessing a
@ -1304,26 +1332,10 @@ vmx_ept_fault(struct vm *vm, int cpu,
*/
if ((ept_qual & EPT_VIOLATION_GLA_VALID) == 0 ||
(ept_qual & EPT_VIOLATION_XLAT_VALID) == 0) {
return (UNHANDLED);
return (FALSE);
}
/* Fetch, decode and emulate the faulting instruction */
if (vmm_fetch_instruction(vm, cpu, rip, inst_length, cr3, vie) != 0)
return (UNHANDLED);
if (vmm_decode_instruction(vm, cpu, gla, vie) != 0)
return (UNHANDLED);
/*
* Check if this is a local apic access
*/
if (gpa < DEFAULT_APIC_BASE || gpa >= DEFAULT_APIC_BASE + PAGE_SIZE)
return (UNHANDLED);
error = vmm_emulate_instruction(vm, cpu, gpa, vie,
lapic_mmio_read, lapic_mmio_write, 0);
return (error ? UNHANDLED : HANDLED);
return (TRUE);
}
static int
@ -1332,18 +1344,47 @@ vmx_exit_process(struct vmx *vmx, int vcpu, struct vm_exit *vmexit)
int error, handled;
struct vmcs *vmcs;
struct vmxctx *vmxctx;
uint32_t eax, ecx, edx;
uint64_t qual, gla, gpa, cr3, intr_info;
uint32_t eax, ecx, edx, idtvec_info, idtvec_err, reason;
uint64_t qual, gpa;
handled = 0;
vmcs = &vmx->vmcs[vcpu];
vmxctx = &vmx->ctx[vcpu];
qual = vmexit->u.vmx.exit_qualification;
reason = vmexit->u.vmx.exit_reason;
vmexit->exitcode = VM_EXITCODE_BOGUS;
vmm_stat_incr(vmx->vm, vcpu, VMEXIT_COUNT, 1);
switch (vmexit->u.vmx.exit_reason) {
/*
* VM exits that could be triggered during event injection on the
* previous VM entry need to be handled specially by re-injecting
* the event.
*
* See "Information for VM Exits During Event Delivery" in Intel SDM
* for details.
*/
switch (reason) {
case EXIT_REASON_EPT_FAULT:
case EXIT_REASON_EPT_MISCONFIG:
case EXIT_REASON_APIC:
case EXIT_REASON_TASK_SWITCH:
case EXIT_REASON_EXCEPTION:
idtvec_info = vmcs_idt_vectoring_info();
if (idtvec_info & VMCS_IDT_VEC_VALID) {
idtvec_info &= ~(1 << 12); /* clear undefined bit */
vmwrite(VMCS_ENTRY_INTR_INFO, idtvec_info);
if (idtvec_info & VMCS_IDT_VEC_ERRCODE_VALID) {
idtvec_err = vmcs_idt_vectoring_err();
vmwrite(VMCS_ENTRY_EXCEPTION_ERROR, idtvec_err);
}
vmwrite(VMCS_ENTRY_INST_LENGTH, vmexit->inst_length);
}
default:
break;
}
switch (reason) {
case EXIT_REASON_CR_ACCESS:
vmm_stat_incr(vmx->vm, vcpu, VMEXIT_CR_ACCESS, 1);
handled = vmx_emulate_cr_access(vmx, vcpu, qual);
@ -1374,19 +1415,7 @@ vmx_exit_process(struct vmx *vmx, int vcpu, struct vm_exit *vmexit)
break;
case EXIT_REASON_HLT:
vmm_stat_incr(vmx->vm, vcpu, VMEXIT_HLT, 1);
/*
* If there is an event waiting to be injected then there is
* no need to 'hlt'.
*/
error = vmread(VMCS_ENTRY_INTR_INFO, &intr_info);
if (error)
panic("vmx_exit_process: vmread(intrinfo) %d", error);
if (intr_info & VMCS_INTERRUPTION_INFO_VALID) {
handled = 1;
vmm_stat_incr(vmx->vm, vcpu, VMEXIT_HLT_IGNORED, 1);
} else
vmexit->exitcode = VM_EXITCODE_HLT;
vmexit->exitcode = VM_EXITCODE_HLT;
break;
case EXIT_REASON_MTF:
vmm_stat_incr(vmx->vm, vcpu, VMEXIT_MTRAP, 1);
@ -1440,15 +1469,22 @@ vmx_exit_process(struct vmx *vmx, int vcpu, struct vm_exit *vmexit)
break;
case EXIT_REASON_EPT_FAULT:
vmm_stat_incr(vmx->vm, vcpu, VMEXIT_EPT_FAULT, 1);
gla = vmcs_gla();
/*
* If 'gpa' lies within the address space allocated to
* memory then this must be a nested page fault otherwise
* this must be an instruction that accesses MMIO space.
*/
gpa = vmcs_gpa();
cr3 = vmcs_guest_cr3();
handled = vmx_ept_fault(vmx->vm, vcpu, gla, gpa,
vmexit->rip, vmexit->inst_length,
cr3, qual, &vmexit->u.paging.vie);
if (!handled) {
if (vm_mem_allocated(vmx->vm, gpa)) {
vmexit->exitcode = VM_EXITCODE_PAGING;
vmexit->u.paging.gpa = gpa;
vmexit->u.paging.fault_type = ept_fault_type(qual);
vmexit->u.paging.protection = ept_protection(qual);
} else if (ept_emulation_fault(qual)) {
vmexit->exitcode = VM_EXITCODE_INST_EMUL;
vmexit->u.inst_emul.gpa = gpa;
vmexit->u.inst_emul.gla = vmcs_gla();
vmexit->u.inst_emul.cr3 = vmcs_guest_cr3();
}
break;
default:
@ -1470,14 +1506,6 @@ vmx_exit_process(struct vmx *vmx, int vcpu, struct vm_exit *vmexit)
vm_exit_update_rip(vmexit);
vmexit->rip += vmexit->inst_length;
vmexit->inst_length = 0;
/*
* Special case for spinning up an AP - exit to userspace to
* give the controlling process a chance to intercept and
* spin up a thread for the AP.
*/
if (vmexit->exitcode == VM_EXITCODE_SPINUP_AP)
handled = 0;
} else {
if (vmexit->exitcode == VM_EXITCODE_BOGUS) {
/*
@ -1497,7 +1525,7 @@ vmx_exit_process(struct vmx *vmx, int vcpu, struct vm_exit *vmexit)
}
static int
vmx_run(void *arg, int vcpu, register_t rip)
vmx_run(void *arg, int vcpu, register_t rip, pmap_t pmap)
{
int error, vie, rc, handled, astpending;
uint32_t exit_reason;
@ -1505,7 +1533,7 @@ vmx_run(void *arg, int vcpu, register_t rip)
struct vmxctx *vmxctx;
struct vmcs *vmcs;
struct vm_exit *vmexit;
vmx = arg;
vmcs = &vmx->vmcs[vcpu];
vmxctx = &vmx->ctx[vcpu];
@ -1514,6 +1542,11 @@ vmx_run(void *arg, int vcpu, register_t rip)
astpending = 0;
vmexit = vm_exitinfo(vmx->vm, vcpu);
KASSERT(vmxctx->pmap == pmap,
("pmap %p different than ctx pmap %p", pmap, vmxctx->pmap));
KASSERT(vmxctx->eptp == vmx->eptp,
("eptp %p different than ctx eptp %#lx", eptp, vmxctx->eptp));
/*
* XXX Can we avoid doing this every time we do a vm run?
*/
@ -1576,6 +1609,9 @@ vmx_run(void *arg, int vcpu, register_t rip)
vmxctx->launch_error, vie);
#endif
goto err_exit;
case VMX_RETURN_INVEPT:
panic("vm %s:%d invept error %d",
vm_name(vmx->vm), vcpu, vmxctx->launch_error);
default:
panic("vmx_setjmp returned %d", rc);
}
@ -1654,7 +1690,6 @@ vmx_vmcleanup(void *arg)
if (error != 0)
panic("vmx_vmcleanup: vmclear error %d on vcpu 0", error);
ept_vmcleanup(vmx);
free(vmx, M_VMX);
return;
@ -2000,13 +2035,13 @@ struct vmm_ops vmm_ops_intel = {
vmx_vminit,
vmx_run,
vmx_vmcleanup,
ept_vmmmap_set,
ept_vmmmap_get,
vmx_getreg,
vmx_setreg,
vmx_getdesc,
vmx_setdesc,
vmx_inject,
vmx_getcap,
vmx_setcap
vmx_setcap,
ept_vmspace_alloc,
ept_vmspace_free,
};

View File

@ -31,6 +31,8 @@
#include "vmcs.h"
struct pmap;
#define GUEST_MSR_MAX_ENTRIES 64 /* arbitrary */
struct vmxctx {
@ -68,6 +70,15 @@ struct vmxctx {
int launched; /* vmcs launch state */
int launch_error;
long eptgen[MAXCPU]; /* cached pmap->pm_eptgen */
/*
* The 'eptp' and the 'pmap' do not change during the lifetime of
* the VM so it is safe to keep a copy in each vcpu's vmxctx.
*/
vm_paddr_t eptp;
struct pmap *pmap;
};
struct vmxcap {
@ -82,16 +93,15 @@ struct vmxstate {
/* virtual machine softc */
struct vmx {
pml4_entry_t pml4ept[NPML4EPG];
struct vmcs vmcs[VM_MAXCPU]; /* one vmcs per virtual cpu */
char msr_bitmap[PAGE_SIZE];
struct msr_entry guest_msrs[VM_MAXCPU][GUEST_MSR_MAX_ENTRIES];
struct vmxctx ctx[VM_MAXCPU];
struct vmxcap cap[VM_MAXCPU];
struct vmxstate state[VM_MAXCPU];
uint64_t eptp;
struct vm *vm;
};
CTASSERT((offsetof(struct vmx, pml4ept) & PAGE_MASK) == 0);
CTASSERT((offsetof(struct vmx, vmcs) & PAGE_MASK) == 0);
CTASSERT((offsetof(struct vmx, msr_bitmap) & PAGE_MASK) == 0);
CTASSERT((offsetof(struct vmx, guest_msrs) & 15) == 0);
@ -101,6 +111,7 @@ CTASSERT((offsetof(struct vmx, guest_msrs) & 15) == 0);
#define VMX_RETURN_VMRESUME 2
#define VMX_RETURN_VMLAUNCH 3
#define VMX_RETURN_AST 4
#define VMX_RETURN_INVEPT 5
/*
* vmx_setjmp() returns:
* - 0 when it returns directly
@ -108,6 +119,7 @@ CTASSERT((offsetof(struct vmx, guest_msrs) & 15) == 0);
* - 2 when it returns from vmx_resume (which would only be in the error case)
* - 3 when it returns from vmx_launch (which would only be in the error case)
* - 4 when it returns from vmx_resume or vmx_launch because of AST pending
* - 5 when it returns from vmx_launch/vmx_resume because of invept error
*/
int vmx_setjmp(struct vmxctx *ctx);
void vmx_longjmp(void); /* returns via vmx_setjmp */

View File

@ -72,6 +72,10 @@ ASSYM(VMXCTX_HOST_RBX, offsetof(struct vmxctx, host_rbx));
ASSYM(VMXCTX_HOST_RIP, offsetof(struct vmxctx, host_rip));
ASSYM(VMXCTX_LAUNCH_ERROR, offsetof(struct vmxctx, launch_error));
ASSYM(VMXCTX_EPTGEN, offsetof(struct vmxctx, eptgen));
ASSYM(VMXCTX_PMAP, offsetof(struct vmxctx, pmap));
ASSYM(VMXCTX_EPTP, offsetof(struct vmxctx, eptp));
ASSYM(VM_SUCCESS, VM_SUCCESS);
ASSYM(VM_FAIL_INVALID, VM_FAIL_INVALID);
@ -82,8 +86,13 @@ ASSYM(VMX_RETURN_LONGJMP, VMX_RETURN_LONGJMP);
ASSYM(VMX_RETURN_VMRESUME, VMX_RETURN_VMRESUME);
ASSYM(VMX_RETURN_VMLAUNCH, VMX_RETURN_VMLAUNCH);
ASSYM(VMX_RETURN_AST, VMX_RETURN_AST);
ASSYM(VMX_RETURN_INVEPT, VMX_RETURN_INVEPT);
ASSYM(TDF_ASTPENDING, TDF_ASTPENDING);
ASSYM(TDF_NEEDRESCHED, TDF_NEEDRESCHED);
ASSYM(TD_FLAGS, offsetof(struct thread, td_flags));
ASSYM(PC_CURTHREAD, offsetof(struct pcpu, pc_curthread));
ASSYM(PC_CPUID, offsetof(struct pcpu, pc_cpuid));
ASSYM(PM_ACTIVE, offsetof(struct pmap, pm_active));
ASSYM(PM_EPTGEN, offsetof(struct pmap, pm_eptgen));

View File

@ -30,6 +30,12 @@
#include "vmx_assym.s"
#ifdef SMP
#define LK lock ;
#else
#define LK
#endif
/*
* Disable interrupts before updating %rsp in VMX_CHECK_AST or
* VMX_GUEST_RESTORE.
@ -86,15 +92,73 @@
movq VMXCTX_GUEST_R15(%rdi),%r15; \
movq VMXCTX_GUEST_RDI(%rdi),%rdi; /* restore rdi the last */
#define VM_INSTRUCTION_ERROR(reg) \
/*
* Check for an error after executing a VMX instruction.
* 'errreg' will be zero on success and non-zero otherwise.
* 'ctxreg' points to the 'struct vmxctx' associated with the vcpu.
*/
#define VM_INSTRUCTION_ERROR(errreg, ctxreg) \
jnc 1f; \
movl $VM_FAIL_INVALID,reg; /* CF is set */ \
movl $VM_FAIL_INVALID,errreg; /* CF is set */ \
jmp 3f; \
1: jnz 2f; \
movl $VM_FAIL_VALID,reg; /* ZF is set */ \
movl $VM_FAIL_VALID,errreg; /* ZF is set */ \
jmp 3f; \
2: movl $VM_SUCCESS,reg; \
3: movl reg,VMXCTX_LAUNCH_ERROR(%rsp)
2: movl $VM_SUCCESS,errreg; \
3: movl errreg,VMXCTX_LAUNCH_ERROR(ctxreg)
/*
* set or clear the appropriate bit in 'pm_active'
* %rdi = vmxctx
* %rax, %r11 = scratch registers
*/
#define VMX_SET_PM_ACTIVE \
movq VMXCTX_PMAP(%rdi), %r11; \
movl PCPU(CPUID), %eax; \
LK btsl %eax, PM_ACTIVE(%r11)
#define VMX_CLEAR_PM_ACTIVE \
movq VMXCTX_PMAP(%rdi), %r11; \
movl PCPU(CPUID), %eax; \
LK btrl %eax, PM_ACTIVE(%r11)
/*
* If 'vmxctx->eptgen[curcpu]' is not identical to 'pmap->pm_eptgen'
* then we must invalidate all mappings associated with this eptp.
*
* %rdi = vmxctx
* %rax, %rbx, %r11 = scratch registers
*/
#define VMX_CHECK_EPTGEN \
movl PCPU(CPUID), %ebx; \
movq VMXCTX_PMAP(%rdi), %r11; \
movq PM_EPTGEN(%r11), %rax; \
cmpq %rax, VMXCTX_EPTGEN(%rdi, %rbx, 8); \
je 9f; \
\
/* Refresh 'vmxctx->eptgen[curcpu]' */ \
movq %rax, VMXCTX_EPTGEN(%rdi, %rbx, 8); \
\
/* Setup the invept descriptor at the top of tmpstk */ \
mov %rdi, %r11; \
addq $VMXCTX_TMPSTKTOP, %r11; \
movq VMXCTX_EPTP(%rdi), %rax; \
movq %rax, -16(%r11); \
movq $0x0, -8(%r11); \
mov $0x1, %eax; /* Single context invalidate */ \
invept -16(%r11), %rax; \
\
/* Check for invept error */ \
VM_INSTRUCTION_ERROR(%eax, %rdi); \
testl %eax, %eax; \
jz 9f; \
\
/* Return via vmx_setjmp with retval of VMX_RETURN_INVEPT */ \
movq $VMX_RETURN_INVEPT, %rsi; \
movq %rdi,%rsp; \
addq $VMXCTX_TMPSTKTOP, %rsp; \
callq vmx_return; \
9: ;
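
For readers who prefer C, here is a rough restatement of what VMX_CHECK_EPTGEN does just before VM entry. It is illustrative only: the real work must stay in assembly on the tmpstk, the invept() wrapper is assumed rather than taken from this commit, and the error path that returns VMX_RETURN_INVEPT is omitted.

/* Rough C equivalent of VMX_CHECK_EPTGEN (illustration only). */
static void
vmx_check_eptgen(struct vmxctx *vmxctx, int cpu)
{
	struct invept_desc desc = { 0 };
	long eptgen;

	eptgen = vmxctx->pmap->pm_eptgen;
	if (vmxctx->eptgen[cpu] == eptgen)
		return;			/* cached EPT mappings are current */

	/* Record the generation that the invalidation below makes visible. */
	vmxctx->eptgen[cpu] = eptgen;

	/* Single-context (type 1) invalidation of mappings for this EPTP. */
	desc.eptp = vmxctx->eptp;
	invept(1UL, desc);		/* invept() wrapper assumed here */
}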
.text
/*
@ -129,6 +193,9 @@ END(vmx_setjmp)
* Return to vmm context through vmx_setjmp() with a value of 'retval'.
*/
ENTRY(vmx_return)
/* The pmap is no longer active on the host cpu */
VMX_CLEAR_PM_ACTIVE
/* Restore host context. */
movq VMXCTX_HOST_R15(%rdi),%r15
movq VMXCTX_HOST_R14(%rdi),%r14
@ -193,6 +260,10 @@ ENTRY(vmx_resume)
VMX_CHECK_AST
VMX_SET_PM_ACTIVE /* This vcpu is now active on the host cpu */
VMX_CHECK_EPTGEN /* Check if we have to invalidate TLB */
/*
* Restore guest state that is not automatically loaded from the vmcs.
*/
@ -203,7 +274,7 @@ ENTRY(vmx_resume)
/*
* Capture the reason why vmresume failed.
*/
VM_INSTRUCTION_ERROR(%eax)
VM_INSTRUCTION_ERROR(%eax, %rsp)
/* Return via vmx_setjmp with return value of VMX_RETURN_VMRESUME */
movq %rsp,%rdi
@ -225,6 +296,10 @@ ENTRY(vmx_launch)
VMX_CHECK_AST
VMX_SET_PM_ACTIVE /* This vcpu is now active on the host cpu */
VMX_CHECK_EPTGEN /* Check if we have to invalidate TLB */
/*
* Restore guest state that is not automatically loaded from the vmcs.
*/
@ -235,7 +310,7 @@ ENTRY(vmx_launch)
/*
* Capture the reason why vmlaunch failed.
*/
VM_INSTRUCTION_ERROR(%eax)
VM_INSTRUCTION_ERROR(%eax, %rsp)
/* Return via vmx_setjmp with return value of VMX_RETURN_VMLAUNCH */
movq %rsp,%rdi

View File

@ -281,6 +281,43 @@ ppt_teardown_msix(struct pptdev *ppt)
ppt->msix.num_msgs = 0;
}
int
ppt_num_devices(struct vm *vm)
{
int i, num;
num = 0;
for (i = 0; i < num_pptdevs; i++) {
if (pptdevs[i].vm == vm)
num++;
}
return (num);
}
boolean_t
ppt_is_mmio(struct vm *vm, vm_paddr_t gpa)
{
int i, n;
struct pptdev *ppt;
struct vm_memory_segment *seg;
for (n = 0; n < num_pptdevs; n++) {
ppt = &pptdevs[n];
if (ppt->vm != vm)
continue;
for (i = 0; i < MAX_MMIOSEGS; i++) {
seg = &ppt->mmio[i];
if (seg->len == 0)
continue;
if (gpa >= seg->gpa && gpa < seg->gpa + seg->len)
return (TRUE);
}
}
return (FALSE);
}
int
ppt_assign_device(struct vm *vm, int bus, int slot, int func)
{
@ -336,7 +373,7 @@ ppt_unassign_all(struct vm *vm)
bus = pci_get_bus(dev);
slot = pci_get_slot(dev);
func = pci_get_function(dev);
ppt_unassign_device(vm, bus, slot, func);
vm_unassign_pptdev(vm, bus, slot, func);
}
}
@ -591,10 +628,3 @@ ppt_setup_msix(struct vm *vm, int vcpu, int bus, int slot, int func,
return (0);
}
int
ppt_num_devices(void)
{
return (num_pptdevs);
}

View File

@ -29,14 +29,20 @@
#ifndef _IO_PPT_H_
#define _IO_PPT_H_
int ppt_assign_device(struct vm *vm, int bus, int slot, int func);
int ppt_unassign_device(struct vm *vm, int bus, int slot, int func);
int ppt_unassign_all(struct vm *vm);
int ppt_map_mmio(struct vm *vm, int bus, int slot, int func,
vm_paddr_t gpa, size_t len, vm_paddr_t hpa);
int ppt_setup_msi(struct vm *vm, int vcpu, int bus, int slot, int func,
int destcpu, int vector, int numvec);
int ppt_setup_msix(struct vm *vm, int vcpu, int bus, int slot, int func,
int idx, uint32_t msg, uint32_t vector_control, uint64_t addr);
int ppt_num_devices(void);
int idx, uint32_t msg, uint32_t vector_control, uint64_t addr);
int ppt_num_devices(struct vm *vm);
boolean_t ppt_is_mmio(struct vm *vm, vm_paddr_t gpa);
/*
* The following functions should never be called directly.
* Use 'vm_assign_pptdev()' and 'vm_unassign_pptdev()' instead.
*/
int ppt_assign_device(struct vm *vm, int bus, int slot, int func);
int ppt_unassign_device(struct vm *vm, int bus, int slot, int func);
#endif

View File

@ -39,18 +39,28 @@ __FBSDID("$FreeBSD$");
#include <sys/lock.h>
#include <sys/mutex.h>
#include <sys/proc.h>
#include <sys/rwlock.h>
#include <sys/sched.h>
#include <sys/smp.h>
#include <sys/systm.h>
#include <vm/vm.h>
#include <vm/vm_object.h>
#include <vm/vm_page.h>
#include <vm/pmap.h>
#include <vm/vm_map.h>
#include <vm/vm_extern.h>
#include <vm/vm_param.h>
#include <machine/vm.h>
#include <machine/pcb.h>
#include <machine/smp.h>
#include <x86/apicreg.h>
#include <machine/pmap.h>
#include <machine/vmparam.h>
#include <machine/vmm.h>
#include "vmm_ktr.h"
#include "vmm_host.h"
#include "vmm_mem.h"
#include "vmm_util.h"
@ -84,15 +94,23 @@ struct vcpu {
#define vcpu_lock_init(v) mtx_init(&((v)->mtx), "vcpu lock", 0, MTX_SPIN)
#define vcpu_lock(v) mtx_lock_spin(&((v)->mtx))
#define vcpu_unlock(v) mtx_unlock_spin(&((v)->mtx))
#define vcpu_assert_locked(v) mtx_assert(&((v)->mtx), MA_OWNED)
struct mem_seg {
vm_paddr_t gpa;
size_t len;
boolean_t wired;
vm_object_t object;
};
#define VM_MAX_MEMORY_SEGMENTS 2
struct vm {
void *cookie; /* processor-specific data */
void *iommu; /* iommu-specific data */
struct vmspace *vmspace; /* guest's address space */
struct vcpu vcpu[VM_MAXCPU];
int num_mem_segs;
struct vm_memory_segment mem_segs[VM_MAX_MEMORY_SEGMENTS];
struct mem_seg mem_segs[VM_MAX_MEMORY_SEGMENTS];
char name[VM_MAX_NAMELEN];
/*
@ -109,16 +127,14 @@ static struct vmm_ops *ops;
#define VMM_INIT() (ops != NULL ? (*ops->init)() : 0)
#define VMM_CLEANUP() (ops != NULL ? (*ops->cleanup)() : 0)
#define VMINIT(vm) (ops != NULL ? (*ops->vminit)(vm): NULL)
#define VMRUN(vmi, vcpu, rip) \
(ops != NULL ? (*ops->vmrun)(vmi, vcpu, rip) : ENXIO)
#define VMINIT(vm, pmap) (ops != NULL ? (*ops->vminit)(vm, pmap): NULL)
#define VMRUN(vmi, vcpu, rip, pmap) \
(ops != NULL ? (*ops->vmrun)(vmi, vcpu, rip, pmap) : ENXIO)
#define VMCLEANUP(vmi) (ops != NULL ? (*ops->vmcleanup)(vmi) : NULL)
#define VMMMAP_SET(vmi, gpa, hpa, len, attr, prot, spm) \
(ops != NULL ? \
(*ops->vmmmap_set)(vmi, gpa, hpa, len, attr, prot, spm) : \
ENXIO)
#define VMMMAP_GET(vmi, gpa) \
(ops != NULL ? (*ops->vmmmap_get)(vmi, gpa) : ENXIO)
#define VMSPACE_ALLOC(min, max) \
(ops != NULL ? (*ops->vmspace_alloc)(min, max) : NULL)
#define VMSPACE_FREE(vmspace) \
(ops != NULL ? (*ops->vmspace_free)(vmspace) : ENXIO)
#define VMGETREG(vmi, vcpu, num, retval) \
(ops != NULL ? (*ops->vmgetreg)(vmi, vcpu, num, retval) : ENXIO)
#define VMSETREG(vmi, vcpu, num, val) \
@ -213,8 +229,7 @@ vmm_handler(module_t mod, int what, void *arg)
switch (what) {
case MOD_LOAD:
vmmdev_init();
if (ppt_num_devices() > 0)
iommu_init();
iommu_init();
error = vmm_init();
if (error == 0)
vmm_initialized = 1;
@ -265,7 +280,7 @@ vm_create(const char *name, struct vm **retvm)
{
int i;
struct vm *vm;
vm_paddr_t maxaddr;
struct vmspace *vmspace;
const int BSP = 0;
@ -279,59 +294,34 @@ vm_create(const char *name, struct vm **retvm)
if (name == NULL || strlen(name) >= VM_MAX_NAMELEN)
return (EINVAL);
vmspace = VMSPACE_ALLOC(VM_MIN_ADDRESS, VM_MAXUSER_ADDRESS);
if (vmspace == NULL)
return (ENOMEM);
vm = malloc(sizeof(struct vm), M_VM, M_WAITOK | M_ZERO);
strcpy(vm->name, name);
vm->cookie = VMINIT(vm);
vm->cookie = VMINIT(vm, vmspace_pmap(vmspace));
for (i = 0; i < VM_MAXCPU; i++) {
vcpu_init(vm, i);
guest_msrs_init(vm, i);
}
maxaddr = vmm_mem_maxaddr();
vm->iommu = iommu_create_domain(maxaddr);
vm_activate_cpu(vm, BSP);
vm->vmspace = vmspace;
*retvm = vm;
return (0);
}
static void
vm_free_mem_seg(struct vm *vm, struct vm_memory_segment *seg)
vm_free_mem_seg(struct vm *vm, struct mem_seg *seg)
{
size_t len;
vm_paddr_t hpa;
void *host_domain;
host_domain = iommu_host_domain();
if (seg->object != NULL)
vmm_mem_free(vm->vmspace, seg->gpa, seg->len);
len = 0;
while (len < seg->len) {
hpa = vm_gpa2hpa(vm, seg->gpa + len, PAGE_SIZE);
if (hpa == (vm_paddr_t)-1) {
panic("vm_free_mem_segs: cannot free hpa "
"associated with gpa 0x%016lx", seg->gpa + len);
}
/*
* Remove the 'gpa' to 'hpa' mapping in VMs domain.
* And resurrect the 1:1 mapping for 'hpa' in 'host_domain'.
*/
iommu_remove_mapping(vm->iommu, seg->gpa + len, PAGE_SIZE);
iommu_create_mapping(host_domain, hpa, hpa, PAGE_SIZE);
vmm_mem_free(hpa, PAGE_SIZE);
len += PAGE_SIZE;
}
/*
* Invalidate cached translations associated with 'vm->iommu' since
* we have now moved some pages from it.
*/
iommu_invalidate_tlb(vm->iommu);
bzero(seg, sizeof(struct vm_memory_segment));
bzero(seg, sizeof(*seg));
}
void
@ -341,6 +331,9 @@ vm_destroy(struct vm *vm)
ppt_unassign_all(vm);
if (vm->iommu != NULL)
iommu_destroy_domain(vm->iommu);
for (i = 0; i < vm->num_mem_segs; i++)
vm_free_mem_seg(vm, &vm->mem_segs[i]);
@ -349,7 +342,7 @@ vm_destroy(struct vm *vm)
for (i = 0; i < VM_MAXCPU; i++)
vcpu_cleanup(&vm->vcpu[i]);
iommu_destroy_domain(vm->iommu);
VMSPACE_FREE(vm->vmspace);
VMCLEANUP(vm->cookie);
@ -365,52 +358,48 @@ vm_name(struct vm *vm)
int
vm_map_mmio(struct vm *vm, vm_paddr_t gpa, size_t len, vm_paddr_t hpa)
{
const boolean_t spok = TRUE; /* superpage mappings are ok */
vm_object_t obj;
return (VMMMAP_SET(vm->cookie, gpa, hpa, len, VM_MEMATTR_UNCACHEABLE,
VM_PROT_RW, spok));
if ((obj = vmm_mmio_alloc(vm->vmspace, gpa, len, hpa)) == NULL)
return (ENOMEM);
else
return (0);
}
int
vm_unmap_mmio(struct vm *vm, vm_paddr_t gpa, size_t len)
{
const boolean_t spok = TRUE; /* superpage mappings are ok */
return (VMMMAP_SET(vm->cookie, gpa, 0, len, 0,
VM_PROT_NONE, spok));
vmm_mmio_free(vm->vmspace, gpa, len);
return (0);
}
/*
* Returns TRUE if 'gpa' is available for allocation and FALSE otherwise
*/
static boolean_t
vm_gpa_available(struct vm *vm, vm_paddr_t gpa)
boolean_t
vm_mem_allocated(struct vm *vm, vm_paddr_t gpa)
{
int i;
vm_paddr_t gpabase, gpalimit;
if (gpa & PAGE_MASK)
panic("vm_gpa_available: gpa (0x%016lx) not page aligned", gpa);
for (i = 0; i < vm->num_mem_segs; i++) {
gpabase = vm->mem_segs[i].gpa;
gpalimit = gpabase + vm->mem_segs[i].len;
if (gpa >= gpabase && gpa < gpalimit)
return (FALSE);
return (TRUE); /* 'gpa' is regular memory */
}
return (TRUE);
if (ppt_is_mmio(vm, gpa))
return (TRUE); /* 'gpa' is pci passthru mmio */
return (FALSE);
}
int
vm_malloc(struct vm *vm, vm_paddr_t gpa, size_t len)
{
int error, available, allocated;
struct vm_memory_segment *seg;
vm_paddr_t g, hpa;
void *host_domain;
const boolean_t spok = TRUE; /* superpage mappings are ok */
int available, allocated;
struct mem_seg *seg;
vm_object_t object;
vm_paddr_t g;
if ((gpa & PAGE_MASK) || (len & PAGE_MASK) || len == 0)
return (EINVAL);
@ -418,10 +407,10 @@ vm_malloc(struct vm *vm, vm_paddr_t gpa, size_t len)
available = allocated = 0;
g = gpa;
while (g < gpa + len) {
if (vm_gpa_available(vm, g))
available++;
else
if (vm_mem_allocated(vm, g))
allocated++;
else
available++;
g += PAGE_SIZE;
}
@ -443,61 +432,203 @@ vm_malloc(struct vm *vm, vm_paddr_t gpa, size_t len)
if (vm->num_mem_segs >= VM_MAX_MEMORY_SEGMENTS)
return (E2BIG);
host_domain = iommu_host_domain();
seg = &vm->mem_segs[vm->num_mem_segs];
error = 0;
if ((object = vmm_mem_alloc(vm->vmspace, gpa, len)) == NULL)
return (ENOMEM);
seg->gpa = gpa;
seg->len = 0;
while (seg->len < len) {
hpa = vmm_mem_alloc(PAGE_SIZE);
if (hpa == 0) {
error = ENOMEM;
break;
}
error = VMMMAP_SET(vm->cookie, gpa + seg->len, hpa, PAGE_SIZE,
VM_MEMATTR_WRITE_BACK, VM_PROT_ALL, spok);
if (error)
break;
/*
* Remove the 1:1 mapping for 'hpa' from the 'host_domain'.
* Add mapping for 'gpa + seg->len' to 'hpa' in the VMs domain.
*/
iommu_remove_mapping(host_domain, hpa, PAGE_SIZE);
iommu_create_mapping(vm->iommu, gpa + seg->len, hpa, PAGE_SIZE);
seg->len += PAGE_SIZE;
}
if (error) {
vm_free_mem_seg(vm, seg);
return (error);
}
/*
* Invalidate cached translations associated with 'host_domain' since
* we have now moved some pages from it.
*/
iommu_invalidate_tlb(host_domain);
seg->len = len;
seg->object = object;
seg->wired = FALSE;
vm->num_mem_segs++;
return (0);
}
vm_paddr_t
vm_gpa2hpa(struct vm *vm, vm_paddr_t gpa, size_t len)
static void
vm_gpa_unwire(struct vm *vm)
{
vm_paddr_t nextpage;
int i, rv;
struct mem_seg *seg;
nextpage = rounddown(gpa + PAGE_SIZE, PAGE_SIZE);
if (len > nextpage - gpa)
panic("vm_gpa2hpa: invalid gpa/len: 0x%016lx/%lu", gpa, len);
for (i = 0; i < vm->num_mem_segs; i++) {
seg = &vm->mem_segs[i];
if (!seg->wired)
continue;
return (VMMMAP_GET(vm->cookie, gpa));
rv = vm_map_unwire(&vm->vmspace->vm_map,
seg->gpa, seg->gpa + seg->len,
VM_MAP_WIRE_USER | VM_MAP_WIRE_NOHOLES);
KASSERT(rv == KERN_SUCCESS, ("vm(%s) memory segment "
"%#lx/%ld could not be unwired: %d",
vm_name(vm), seg->gpa, seg->len, rv));
seg->wired = FALSE;
}
}
static int
vm_gpa_wire(struct vm *vm)
{
int i, rv;
struct mem_seg *seg;
for (i = 0; i < vm->num_mem_segs; i++) {
seg = &vm->mem_segs[i];
if (seg->wired)
continue;
/* XXX rlimits? */
rv = vm_map_wire(&vm->vmspace->vm_map,
seg->gpa, seg->gpa + seg->len,
VM_MAP_WIRE_USER | VM_MAP_WIRE_NOHOLES);
if (rv != KERN_SUCCESS)
break;
seg->wired = TRUE;
}
if (i < vm->num_mem_segs) {
/*
* Undo the wiring before returning an error.
*/
vm_gpa_unwire(vm);
return (EAGAIN);
}
return (0);
}
static void
vm_iommu_modify(struct vm *vm, boolean_t map)
{
int i, sz;
vm_paddr_t gpa, hpa;
struct mem_seg *seg;
void *vp, *cookie, *host_domain;
sz = PAGE_SIZE;
host_domain = iommu_host_domain();
for (i = 0; i < vm->num_mem_segs; i++) {
seg = &vm->mem_segs[i];
KASSERT(seg->wired, ("vm(%s) memory segment %#lx/%ld not wired",
vm_name(vm), seg->gpa, seg->len));
gpa = seg->gpa;
while (gpa < seg->gpa + seg->len) {
vp = vm_gpa_hold(vm, gpa, PAGE_SIZE, VM_PROT_WRITE,
&cookie);
KASSERT(vp != NULL, ("vm(%s) could not map gpa %#lx",
vm_name(vm), gpa));
vm_gpa_release(cookie);
hpa = DMAP_TO_PHYS((uintptr_t)vp);
if (map) {
iommu_create_mapping(vm->iommu, gpa, hpa, sz);
iommu_remove_mapping(host_domain, hpa, sz);
} else {
iommu_remove_mapping(vm->iommu, gpa, sz);
iommu_create_mapping(host_domain, hpa, hpa, sz);
}
gpa += PAGE_SIZE;
}
}
/*
* Invalidate the cached translations associated with the domain
* from which pages were removed.
*/
if (map)
iommu_invalidate_tlb(host_domain);
else
iommu_invalidate_tlb(vm->iommu);
}
#define vm_iommu_unmap(vm) vm_iommu_modify((vm), FALSE)
#define vm_iommu_map(vm) vm_iommu_modify((vm), TRUE)
int
vm_unassign_pptdev(struct vm *vm, int bus, int slot, int func)
{
int error;
error = ppt_unassign_device(vm, bus, slot, func);
if (error)
return (error);
if (ppt_num_devices(vm) == 0) {
vm_iommu_unmap(vm);
vm_gpa_unwire(vm);
}
return (0);
}
int
vm_assign_pptdev(struct vm *vm, int bus, int slot, int func)
{
int error;
vm_paddr_t maxaddr;
/*
* Virtual machines with pci passthru devices get special treatment:
* - the guest physical memory is wired
* - the iommu is programmed to do the 'gpa' to 'hpa' translation
*
* We need to do this before the first pci passthru device is attached.
*/
if (ppt_num_devices(vm) == 0) {
KASSERT(vm->iommu == NULL,
("vm_assign_pptdev: iommu must be NULL"));
maxaddr = vmm_mem_maxaddr();
vm->iommu = iommu_create_domain(maxaddr);
error = vm_gpa_wire(vm);
if (error)
return (error);
vm_iommu_map(vm);
}
error = ppt_assign_device(vm, bus, slot, func);
return (error);
}
void *
vm_gpa_hold(struct vm *vm, vm_paddr_t gpa, size_t len, int reqprot,
void **cookie)
{
int count, pageoff;
vm_page_t m;
pageoff = gpa & PAGE_MASK;
if (len > PAGE_SIZE - pageoff)
panic("vm_gpa_hold: invalid gpa/len: 0x%016lx/%lu", gpa, len);
count = vm_fault_quick_hold_pages(&vm->vmspace->vm_map,
trunc_page(gpa), PAGE_SIZE, reqprot, &m, 1);
if (count == 1) {
*cookie = m;
return ((void *)(PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m)) + pageoff));
} else {
*cookie = NULL;
return (NULL);
}
}
void
vm_gpa_release(void *cookie)
{
vm_page_t m = cookie;
vm_page_lock(m);
vm_page_unhold(m);
vm_page_unlock(m);
}
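
vm_gpa_hold() and vm_gpa_release() above form a hold/release pair: the returned pointer is a direct-map address for the page containing 'gpa' and should only be used while the hold is in place. A minimal, hypothetical usage sketch (not code from this commit):

static int
read_guest_u32(struct vm *vm, vm_paddr_t gpa, uint32_t *valp)
{
	void *cookie, *vp;

	/* Request read access to the page containing 'gpa'. */
	vp = vm_gpa_hold(vm, gpa, sizeof(*valp), VM_PROT_READ, &cookie);
	if (vp == NULL)
		return (EFAULT);

	*valp = *(uint32_t *)vp;

	/* Drop the page hold as soon as the access is done. */
	vm_gpa_release(cookie);
	return (0);
}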
int
@ -508,13 +639,42 @@ vm_gpabase2memseg(struct vm *vm, vm_paddr_t gpabase,
for (i = 0; i < vm->num_mem_segs; i++) {
if (gpabase == vm->mem_segs[i].gpa) {
*seg = vm->mem_segs[i];
seg->gpa = vm->mem_segs[i].gpa;
seg->len = vm->mem_segs[i].len;
seg->wired = vm->mem_segs[i].wired;
return (0);
}
}
return (-1);
}
int
vm_get_memobj(struct vm *vm, vm_paddr_t gpa, size_t len,
vm_offset_t *offset, struct vm_object **object)
{
int i;
size_t seg_len;
vm_paddr_t seg_gpa;
vm_object_t seg_obj;
for (i = 0; i < vm->num_mem_segs; i++) {
if ((seg_obj = vm->mem_segs[i].object) == NULL)
continue;
seg_gpa = vm->mem_segs[i].gpa;
seg_len = vm->mem_segs[i].len;
if (gpa >= seg_gpa && gpa < seg_gpa + seg_len) {
*offset = gpa - seg_gpa;
*object = seg_obj;
vm_object_reference(seg_obj);
return (0);
}
}
return (EINVAL);
}
int
vm_get_register(struct vm *vm, int vcpu, int reg, uint64_t *retval)
{
@ -633,26 +793,215 @@ save_guest_fpustate(struct vcpu *vcpu)
static VMM_STAT(VCPU_IDLE_TICKS, "number of ticks vcpu was idle");
static int
vcpu_set_state_locked(struct vcpu *vcpu, enum vcpu_state newstate)
{
int error;
vcpu_assert_locked(vcpu);
/*
* The following state transitions are allowed:
* IDLE -> FROZEN -> IDLE
* FROZEN -> RUNNING -> FROZEN
* FROZEN -> SLEEPING -> FROZEN
*/
switch (vcpu->state) {
case VCPU_IDLE:
case VCPU_RUNNING:
case VCPU_SLEEPING:
error = (newstate != VCPU_FROZEN);
break;
case VCPU_FROZEN:
error = (newstate == VCPU_FROZEN);
break;
default:
error = 1;
break;
}
if (error == 0)
vcpu->state = newstate;
else
error = EBUSY;
return (error);
}
static void
vcpu_require_state(struct vm *vm, int vcpuid, enum vcpu_state newstate)
{
int error;
if ((error = vcpu_set_state(vm, vcpuid, newstate)) != 0)
panic("Error %d setting state to %d\n", error, newstate);
}
static void
vcpu_require_state_locked(struct vcpu *vcpu, enum vcpu_state newstate)
{
int error;
if ((error = vcpu_set_state_locked(vcpu, newstate)) != 0)
panic("Error %d setting state to %d", error, newstate);
}
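A small sketch (assumed, not taken from the commit) of how the transition rules above are used from an ioctl path: the caller freezes the vcpu, does its work, and returns it to IDLE, matching the IDLE -> FROZEN -> IDLE sequence documented in the comment and the VCPU_FROZEN usage in the vmmdev_ioctl() hunk below. The with_vcpu_frozen() wrapper name is illustrative only.

/*
 * Illustrative wrapper, not part of this commit.
 */
static int
with_vcpu_frozen(struct vm *vm, int vcpuid, int (*fn)(struct vm *, int))
{
	int error;

	/* IDLE -> FROZEN: take exclusive ownership of the vcpu. */
	error = vcpu_set_state(vm, vcpuid, VCPU_FROZEN);
	if (error)
		return (error);

	error = (*fn)(vm, vcpuid);

	/* FROZEN -> IDLE: release ownership. */
	(void)vcpu_set_state(vm, vcpuid, VCPU_IDLE);
	return (error);
}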
/*
* Emulate a guest 'hlt' by sleeping until the vcpu is ready to run.
*/
static int
vm_handle_hlt(struct vm *vm, int vcpuid, boolean_t *retu)
{
struct vcpu *vcpu;
int sleepticks, t;
vcpu = &vm->vcpu[vcpuid];
vcpu_lock(vcpu);
/*
* Figure out the number of host ticks until the next apic
* timer interrupt in the guest.
*/
sleepticks = lapic_timer_tick(vm, vcpuid);
/*
* If the guest local apic timer is disabled then sleep for
* a long time but not forever.
*/
if (sleepticks < 0)
sleepticks = hz;
/*
* Do a final check for pending NMI or interrupts before
* really putting this thread to sleep.
*
* These interrupts could have happened any time after we
* returned from VMRUN() and before we grabbed the vcpu lock.
*/
if (!vm_nmi_pending(vm, vcpuid) && lapic_pending_intr(vm, vcpuid) < 0) {
if (sleepticks <= 0)
panic("invalid sleepticks %d", sleepticks);
t = ticks;
vcpu_require_state_locked(vcpu, VCPU_SLEEPING);
msleep_spin(vcpu, &vcpu->mtx, "vmidle", sleepticks);
vcpu_require_state_locked(vcpu, VCPU_FROZEN);
vmm_stat_incr(vm, vcpuid, VCPU_IDLE_TICKS, ticks - t);
}
vcpu_unlock(vcpu);
return (0);
}
static int
vm_handle_paging(struct vm *vm, int vcpuid, boolean_t *retu)
{
int rv, ftype;
struct vm_map *map;
struct vcpu *vcpu;
struct vm_exit *vme;
vcpu = &vm->vcpu[vcpuid];
vme = &vcpu->exitinfo;
ftype = vme->u.paging.fault_type;
KASSERT(ftype == VM_PROT_READ ||
ftype == VM_PROT_WRITE || ftype == VM_PROT_EXECUTE,
("vm_handle_paging: invalid fault_type %d", ftype));
if (ftype == VM_PROT_READ || ftype == VM_PROT_WRITE) {
rv = pmap_emulate_accessed_dirty(vmspace_pmap(vm->vmspace),
vme->u.paging.gpa, ftype);
if (rv == 0)
goto done;
}
map = &vm->vmspace->vm_map;
rv = vm_fault(map, vme->u.paging.gpa, ftype, VM_FAULT_NORMAL);
VMM_CTR3(vm, vcpuid, "vm_handle_paging rv = %d, gpa = %#lx, ftype = %d",
rv, vme->u.paging.gpa, ftype);
if (rv != KERN_SUCCESS)
return (EFAULT);
done:
/* restart execution at the faulting instruction */
vme->inst_length = 0;
return (0);
}
static int
vm_handle_inst_emul(struct vm *vm, int vcpuid, boolean_t *retu)
{
struct vie *vie;
struct vcpu *vcpu;
struct vm_exit *vme;
int error, inst_length;
uint64_t rip, gla, gpa, cr3;
vcpu = &vm->vcpu[vcpuid];
vme = &vcpu->exitinfo;
rip = vme->rip;
inst_length = vme->inst_length;
gla = vme->u.inst_emul.gla;
gpa = vme->u.inst_emul.gpa;
cr3 = vme->u.inst_emul.cr3;
vie = &vme->u.inst_emul.vie;
vie_init(vie);
/* Fetch, decode and emulate the faulting instruction */
if (vmm_fetch_instruction(vm, vcpuid, rip, inst_length, cr3, vie) != 0)
return (EFAULT);
if (vmm_decode_instruction(vm, vcpuid, gla, vie) != 0)
return (EFAULT);
/* return to userland unless this is a local apic access */
if (gpa < DEFAULT_APIC_BASE || gpa >= DEFAULT_APIC_BASE + PAGE_SIZE) {
*retu = TRUE;
return (0);
}
error = vmm_emulate_instruction(vm, vcpuid, gpa, vie,
lapic_mmio_read, lapic_mmio_write, 0);
/* return to userland to spin up the AP */
if (error == 0 && vme->exitcode == VM_EXITCODE_SPINUP_AP)
*retu = TRUE;
return (error);
}
int
vm_run(struct vm *vm, struct vm_run *vmrun)
{
int error, vcpuid, sleepticks, t;
int error, vcpuid;
struct vcpu *vcpu;
struct pcb *pcb;
uint64_t tscval, rip;
struct vm_exit *vme;
boolean_t retu;
pmap_t pmap;
vcpuid = vmrun->cpuid;
if (vcpuid < 0 || vcpuid >= VM_MAXCPU)
return (EINVAL);
pmap = vmspace_pmap(vm->vmspace);
vcpu = &vm->vcpu[vcpuid];
vme = &vmrun->vm_exit;
vme = &vcpu->exitinfo;
rip = vmrun->rip;
restart:
critical_enter();
KASSERT(!CPU_ISSET(curcpu, &pmap->pm_active),
("vm_run: absurd pm_active"));
tscval = rdtsc();
pcb = PCPU_GET(curpcb);
@ -661,62 +1010,44 @@ restart:
restore_guest_msrs(vm, vcpuid);
restore_guest_fpustate(vcpu);
vcpu_require_state(vm, vcpuid, VCPU_RUNNING);
vcpu->hostcpu = curcpu;
error = VMRUN(vm->cookie, vcpuid, rip);
error = VMRUN(vm->cookie, vcpuid, rip, pmap);
vcpu->hostcpu = NOCPU;
vcpu_require_state(vm, vcpuid, VCPU_FROZEN);
save_guest_fpustate(vcpu);
restore_host_msrs(vm, vcpuid);
vmm_stat_incr(vm, vcpuid, VCPU_TOTAL_RUNTIME, rdtsc() - tscval);
/* copy the exit information */
bcopy(&vcpu->exitinfo, vme, sizeof(struct vm_exit));
critical_exit();
/*
* Oblige the guest's desire to 'hlt' by sleeping until the vcpu
* is ready to run.
*/
if (error == 0 && vme->exitcode == VM_EXITCODE_HLT) {
vcpu_lock(vcpu);
/*
* Figure out the number of host ticks until the next apic
* timer interrupt in the guest.
*/
sleepticks = lapic_timer_tick(vm, vcpuid);
/*
* If the guest local apic timer is disabled then sleep for
* a long time but not forever.
*/
if (sleepticks < 0)
sleepticks = hz;
/*
* Do a final check for pending NMI or interrupts before
* really putting this thread to sleep.
*
* These interrupts could have happened any time after we
* returned from VMRUN() and before we grabbed the vcpu lock.
*/
if (!vm_nmi_pending(vm, vcpuid) &&
lapic_pending_intr(vm, vcpuid) < 0) {
if (sleepticks <= 0)
panic("invalid sleepticks %d", sleepticks);
t = ticks;
msleep_spin(vcpu, &vcpu->mtx, "vmidle", sleepticks);
vmm_stat_incr(vm, vcpuid, VCPU_IDLE_TICKS, ticks - t);
if (error == 0) {
retu = FALSE;
switch (vme->exitcode) {
case VM_EXITCODE_HLT:
error = vm_handle_hlt(vm, vcpuid, &retu);
break;
case VM_EXITCODE_PAGING:
error = vm_handle_paging(vm, vcpuid, &retu);
break;
case VM_EXITCODE_INST_EMUL:
error = vm_handle_inst_emul(vm, vcpuid, &retu);
break;
default:
retu = TRUE; /* handled in userland */
break;
}
}
vcpu_unlock(vcpu);
if (error == 0 && retu == FALSE) {
rip = vme->rip + vme->inst_length;
goto restart;
}
/* copy the exit information */
bcopy(vme, &vmrun->vm_exit, sizeof(struct vm_exit));
return (error);
}
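For context, a rough sketch of the userland run loop this 'retu' split pairs with: the kernel now loops internally on HLT/PAGING/INST_EMUL exits, and only exits it declines to handle reach the loop below. bhyve's fbsdrun does the equivalent through its handler[] table (see the bhyverun.c hunk later in this commit); handle_exit() is a stand-in name, and the vm_run() prototype is the libvmmapi one assumed for this branch.

#include <machine/vmm.h>
#include <vmmapi.h>

static int handle_exit(struct vmctx *, int, struct vm_exit *);	/* stand-in */

static void
run_vcpu(struct vmctx *ctx, int vcpu, uint64_t rip)
{
	struct vm_exit vmexit;
	int error;

	for (;;) {
		error = vm_run(ctx, vcpu, rip, &vmexit);
		if (error != 0)
			break;		/* the kernel could not run the vcpu */
		/*
		 * Only exits the kernel chose not to handle (retu == TRUE)
		 * are seen here, e.g. inout, rdmsr/wrmsr or spinup_ap.
		 */
		if (handle_exit(ctx, vcpu, &vmexit) != 0)
			break;
		rip = vmexit.rip + vmexit.inst_length;
	}
}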
@ -869,7 +1200,7 @@ vm_iommu_domain(struct vm *vm)
}
int
vcpu_set_state(struct vm *vm, int vcpuid, enum vcpu_state state)
vcpu_set_state(struct vm *vm, int vcpuid, enum vcpu_state newstate)
{
int error;
struct vcpu *vcpu;
@ -880,20 +1211,7 @@ vcpu_set_state(struct vm *vm, int vcpuid, enum vcpu_state state)
vcpu = &vm->vcpu[vcpuid];
vcpu_lock(vcpu);
/*
* The following state transitions are allowed:
* IDLE -> RUNNING -> IDLE
* IDLE -> CANNOT_RUN -> IDLE
*/
if ((vcpu->state == VCPU_IDLE && state != VCPU_IDLE) ||
(vcpu->state != VCPU_IDLE && state == VCPU_IDLE)) {
error = 0;
vcpu->state = state;
} else {
error = EBUSY;
}
error = vcpu_set_state_locked(vcpu, newstate);
vcpu_unlock(vcpu);
return (error);
@ -979,16 +1297,7 @@ vm_interrupt_hostcpu(struct vm *vm, int vcpuid)
vcpu_lock(vcpu);
hostcpu = vcpu->hostcpu;
if (hostcpu == NOCPU) {
/*
* If the vcpu is 'RUNNING' but without a valid 'hostcpu' then
* the host thread must be sleeping waiting for an event to
* kick the vcpu out of 'hlt'.
*
* XXX this is racy because the condition exists right before
* and after calling VMRUN() in vm_run(). The wakeup() is
* benign in this case.
*/
if (vcpu->state == VCPU_RUNNING)
if (vcpu->state == VCPU_SLEEPING)
wakeup_one(vcpu);
} else {
if (vcpu->state != VCPU_RUNNING)
@ -998,3 +1307,10 @@ vm_interrupt_hostcpu(struct vm *vm, int vcpuid)
}
vcpu_unlock(vcpu);
}
struct vmspace *
vm_get_vmspace(struct vm *vm)
{
return (vm->vmspace);
}

View File

@ -44,6 +44,7 @@ __FBSDID("$FreeBSD$");
#include <vm/vm.h>
#include <vm/pmap.h>
#include <vm/vm_map.h>
#include <machine/pmap.h>
#include <machine/vmparam.h>
@ -95,8 +96,9 @@ vmmdev_lookup2(struct cdev *cdev)
static int
vmmdev_rw(struct cdev *cdev, struct uio *uio, int flags)
{
int error, off, c;
vm_paddr_t hpa, gpa;
int error, off, c, prot;
vm_paddr_t gpa;
void *hpa, *cookie;
struct vmmdev_softc *sc;
static char zerobuf[PAGE_SIZE];
@ -107,6 +109,7 @@ vmmdev_rw(struct cdev *cdev, struct uio *uio, int flags)
if (sc == NULL)
error = ENXIO;
prot = (uio->uio_rw == UIO_WRITE ? VM_PROT_WRITE : VM_PROT_READ);
while (uio->uio_resid > 0 && error == 0) {
gpa = uio->uio_offset;
off = gpa & PAGE_MASK;
@ -120,14 +123,16 @@ vmmdev_rw(struct cdev *cdev, struct uio *uio, int flags)
* Since this device does not support lseek(2), dd(1) will
* read(2) blocks of data to simulate the lseek(2).
*/
hpa = vm_gpa2hpa(sc->vm, gpa, c);
if (hpa == (vm_paddr_t)-1) {
hpa = vm_gpa_hold(sc->vm, gpa, c, prot, &cookie);
if (hpa == NULL) {
if (uio->uio_rw == UIO_READ)
error = uiomove(zerobuf, c, uio);
else
error = EFAULT;
} else
error = uiomove((void *)PHYS_TO_DMAP(hpa), c, uio);
} else {
error = uiomove(hpa, c, uio);
vm_gpa_release(cookie);
}
}
mtx_unlock(&vmmdev_mtx);
@ -139,7 +144,6 @@ vmmdev_ioctl(struct cdev *cdev, u_long cmd, caddr_t data, int fflag,
struct thread *td)
{
int error, vcpu, state_changed;
enum vcpu_state new_state;
struct vmmdev_softc *sc;
struct vm_memory_segment *seg;
struct vm_register *vmreg;
@ -156,6 +160,7 @@ vmmdev_ioctl(struct cdev *cdev, u_long cmd, caddr_t data, int fflag,
struct vm_stats *vmstats;
struct vm_stat_desc *statdesc;
struct vm_x2apic *x2apic;
struct vm_gpa_pte *gpapte;
sc = vmmdev_lookup2(cdev);
if (sc == NULL)
@ -189,12 +194,7 @@ vmmdev_ioctl(struct cdev *cdev, u_long cmd, caddr_t data, int fflag,
goto done;
}
if (cmd == VM_RUN)
new_state = VCPU_RUNNING;
else
new_state = VCPU_CANNOT_RUN;
error = vcpu_set_state(sc->vm, vcpu, new_state);
error = vcpu_set_state(sc->vm, vcpu, VCPU_FROZEN);
if (error)
goto done;
@ -211,7 +211,7 @@ vmmdev_ioctl(struct cdev *cdev, u_long cmd, caddr_t data, int fflag,
*/
error = 0;
for (vcpu = 0; vcpu < VM_MAXCPU; vcpu++) {
error = vcpu_set_state(sc->vm, vcpu, VCPU_CANNOT_RUN);
error = vcpu_set_state(sc->vm, vcpu, VCPU_FROZEN);
if (error)
break;
}
@ -271,13 +271,13 @@ vmmdev_ioctl(struct cdev *cdev, u_long cmd, caddr_t data, int fflag,
break;
case VM_BIND_PPTDEV:
pptdev = (struct vm_pptdev *)data;
error = ppt_assign_device(sc->vm, pptdev->bus, pptdev->slot,
pptdev->func);
error = vm_assign_pptdev(sc->vm, pptdev->bus, pptdev->slot,
pptdev->func);
break;
case VM_UNBIND_PPTDEV:
pptdev = (struct vm_pptdev *)data;
error = ppt_unassign_device(sc->vm, pptdev->bus, pptdev->slot,
pptdev->func);
error = vm_unassign_pptdev(sc->vm, pptdev->bus, pptdev->slot,
pptdev->func);
break;
case VM_INJECT_EVENT:
vmevent = (struct vm_event *)data;
@ -348,6 +348,12 @@ vmmdev_ioctl(struct cdev *cdev, u_long cmd, caddr_t data, int fflag,
error = vm_get_x2apic_state(sc->vm,
x2apic->cpuid, &x2apic->state);
break;
case VM_GET_GPA_PMAP:
gpapte = (struct vm_gpa_pte *)data;
pmap_get_mapping(vmspace_pmap(vm_get_vmspace(sc->vm)),
gpapte->gpa, gpapte->pte, &gpapte->ptenum);
error = 0;
break;
default:
error = ENOTTY;
break;
@ -361,25 +367,25 @@ vmmdev_ioctl(struct cdev *cdev, u_long cmd, caddr_t data, int fflag,
}
done:
/* Make sure that no handler returns a bogus value like ERESTART */
KASSERT(error >= 0, ("vmmdev_ioctl: invalid error return %d", error));
return (error);
}
static int
vmmdev_mmap(struct cdev *cdev, vm_ooffset_t offset, vm_paddr_t *paddr,
int nprot, vm_memattr_t *memattr)
vmmdev_mmap_single(struct cdev *cdev, vm_ooffset_t *offset,
vm_size_t size, struct vm_object **object, int nprot)
{
int error;
struct vmmdev_softc *sc;
error = -1;
mtx_lock(&vmmdev_mtx);
sc = vmmdev_lookup2(cdev);
if (sc != NULL && (nprot & PROT_EXEC) == 0) {
*paddr = vm_gpa2hpa(sc->vm, (vm_paddr_t)offset, PAGE_SIZE);
if (*paddr != (vm_paddr_t)-1)
error = 0;
}
if (sc != NULL && (nprot & PROT_EXEC) == 0)
error = vm_get_memobj(sc->vm, *offset, size, offset, object);
else
error = EINVAL;
mtx_unlock(&vmmdev_mtx);
@ -446,7 +452,7 @@ static struct cdevsw vmmdevsw = {
.d_name = "vmmdev",
.d_version = D_VERSION,
.d_ioctl = vmmdev_ioctl,
.d_mmap = vmmdev_mmap,
.d_mmap_single = vmmdev_mmap_single,
.d_read = vmmdev_rw,
.d_write = vmmdev_rw,
};

View File

@ -465,7 +465,7 @@ vmm_emulate_instruction(void *vm, int vcpuid, uint64_t gpa, struct vie *vie,
}
#ifdef _KERNEL
static void
void
vie_init(struct vie *vie)
{
@ -479,9 +479,9 @@ static int
gla2gpa(struct vm *vm, uint64_t gla, uint64_t ptpphys,
uint64_t *gpa, uint64_t *gpaend)
{
vm_paddr_t hpa;
int nlevels, ptpshift, ptpindex;
uint64_t *ptpbase, pte, pgsize;
void *cookie;
/*
* XXX assumes 64-bit guest with 4 page walk levels
@ -491,18 +491,19 @@ gla2gpa(struct vm *vm, uint64_t gla, uint64_t ptpphys,
/* Zero out the lower 12 bits and the upper 12 bits */
ptpphys >>= 12; ptpphys <<= 24; ptpphys >>= 12;
hpa = vm_gpa2hpa(vm, ptpphys, PAGE_SIZE);
if (hpa == -1)
ptpbase = vm_gpa_hold(vm, ptpphys, PAGE_SIZE, VM_PROT_READ,
&cookie);
if (ptpbase == NULL)
goto error;
ptpbase = (uint64_t *)PHYS_TO_DMAP(hpa);
ptpshift = PAGE_SHIFT + nlevels * 9;
ptpindex = (gla >> ptpshift) & 0x1FF;
pgsize = 1UL << ptpshift;
pte = ptpbase[ptpindex];
vm_gpa_release(cookie);
if ((pte & PG_V) == 0)
goto error;
@ -530,18 +531,18 @@ int
vmm_fetch_instruction(struct vm *vm, int cpuid, uint64_t rip, int inst_length,
uint64_t cr3, struct vie *vie)
{
int n, err;
uint64_t hpa, gpa, gpaend, off;
int n, err, prot;
uint64_t gpa, gpaend, off;
void *hpa, *cookie;
/*
* XXX cache previously fetched instructions using 'rip' as the tag
*/
prot = VM_PROT_READ | VM_PROT_EXECUTE;
if (inst_length > VIE_INST_SIZE)
panic("vmm_fetch_instruction: invalid length %d", inst_length);
vie_init(vie);
/* Copy the instruction into 'vie' */
while (vie->num_valid < inst_length) {
err = gla2gpa(vm, rip, cr3, &gpa, &gpaend);
@ -551,11 +552,12 @@ vmm_fetch_instruction(struct vm *vm, int cpuid, uint64_t rip, int inst_length,
off = gpa & PAGE_MASK;
n = min(inst_length - vie->num_valid, PAGE_SIZE - off);
hpa = vm_gpa2hpa(vm, gpa, n);
if (hpa == -1)
if ((hpa = vm_gpa_hold(vm, gpa, n, prot, &cookie)) == NULL)
break;
bcopy((void *)PHYS_TO_DMAP(hpa), &vie->inst[vie->num_valid], n);
bcopy(hpa, &vie->inst[vie->num_valid], n);
vm_gpa_release(cookie);
rip += n;
vie->num_valid += n;

View File

@ -30,40 +30,24 @@
__FBSDID("$FreeBSD$");
#include <sys/param.h>
#include <sys/lock.h>
#include <sys/mutex.h>
#include <sys/linker.h>
#include <sys/systm.h>
#include <sys/malloc.h>
#include <sys/kernel.h>
#include <sys/sysctl.h>
#include <sys/sglist.h>
#include <sys/lock.h>
#include <sys/rwlock.h>
#include <vm/vm.h>
#include <vm/vm_param.h>
#include <vm/pmap.h>
#include <vm/vm_map.h>
#include <vm/vm_object.h>
#include <vm/vm_page.h>
#include <vm/vm_pageout.h>
#include <vm/vm_pager.h>
#include <machine/md_var.h>
#include <machine/metadata.h>
#include <machine/pc/bios.h>
#include <machine/vmparam.h>
#include <machine/pmap.h>
#include "vmm_util.h"
#include "vmm_mem.h"
SYSCTL_DECL(_hw_vmm);
static u_long pages_allocated;
SYSCTL_ULONG(_hw_vmm, OID_AUTO, pages_allocated, CTLFLAG_RD,
&pages_allocated, 0, "4KB pages allocated");
static void
update_pages_allocated(int howmany)
{
pages_allocated += howmany; /* XXX locking? */
}
int
vmm_mem_init(void)
{
@ -71,60 +55,95 @@ vmm_mem_init(void)
return (0);
}
vm_paddr_t
vmm_mem_alloc(size_t size)
vm_object_t
vmm_mmio_alloc(struct vmspace *vmspace, vm_paddr_t gpa, size_t len,
vm_paddr_t hpa)
{
int flags;
vm_page_t m;
vm_paddr_t pa;
int error;
vm_object_t obj;
struct sglist *sg;
if (size != PAGE_SIZE)
panic("vmm_mem_alloc: invalid allocation size %lu", size);
sg = sglist_alloc(1, M_WAITOK);
error = sglist_append_phys(sg, hpa, len);
KASSERT(error == 0, ("error %d appending physaddr to sglist", error));
flags = VM_ALLOC_NORMAL | VM_ALLOC_NOOBJ | VM_ALLOC_WIRED |
VM_ALLOC_ZERO;
while (1) {
obj = vm_pager_allocate(OBJT_SG, sg, len, VM_PROT_RW, 0, NULL);
if (obj != NULL) {
/*
* XXX need policy to determine when to back off the allocation
* VT-x ignores the MTRR settings when figuring out the
* memory type for translations obtained through EPT.
*
* Therefore we explicitly force the pages provided by
* this object to be mapped as uncacheable.
*/
m = vm_page_alloc(NULL, 0, flags);
if (m == NULL)
VM_WAIT;
else
break;
VM_OBJECT_WLOCK(obj);
error = vm_object_set_memattr(obj, VM_MEMATTR_UNCACHEABLE);
VM_OBJECT_WUNLOCK(obj);
if (error != KERN_SUCCESS) {
panic("vmm_mmio_alloc: vm_object_set_memattr error %d",
error);
}
error = vm_map_find(&vmspace->vm_map, obj, 0, &gpa, len, 0,
VMFS_NO_SPACE, VM_PROT_RW, VM_PROT_RW, 0);
if (error != KERN_SUCCESS) {
vm_object_deallocate(obj);
obj = NULL;
}
}
pa = VM_PAGE_TO_PHYS(m);
if ((m->flags & PG_ZERO) == 0)
pagezero((void *)PHYS_TO_DMAP(pa));
m->valid = VM_PAGE_BITS_ALL;
/*
* Drop the reference on the sglist.
*
* If the scatter/gather object was successfully allocated then it
* has incremented the reference count on the sglist. Dropping the
* initial reference count ensures that the sglist will be freed
* when the object is deallocated.
*
* If the object could not be allocated then we end up freeing the
* sglist.
*/
sglist_free(sg);
update_pages_allocated(1);
return (pa);
return (obj);
}
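For completeness, a hedged sketch of how a caller such as the PCI passthru code might hand a device BAR to the guest with this interface; the map_passthru_bar() name is invented for the example and error handling is trimmed.

/*
 * Illustrative only, not part of this commit.
 */
static int
map_passthru_bar(struct vm *vm, vm_paddr_t gpa, size_t len, vm_paddr_t hpa)
{
	vm_object_t obj;

	/* Creates an sglist-backed, uncacheable object mapped 1:1 at 'gpa'. */
	obj = vmm_mmio_alloc(vm_get_vmspace(vm), gpa, len, hpa);
	if (obj == NULL)
		return (ENOMEM);
	return (0);
}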
void
vmm_mem_free(vm_paddr_t base, size_t length)
vmm_mmio_free(struct vmspace *vmspace, vm_paddr_t gpa, size_t len)
{
vm_page_t m;
if (base & PAGE_MASK) {
panic("vmm_mem_free: base 0x%0lx must be aligned on a "
"0x%0x boundary\n", base, PAGE_SIZE);
vm_map_remove(&vmspace->vm_map, gpa, gpa + len);
}
vm_object_t
vmm_mem_alloc(struct vmspace *vmspace, vm_paddr_t gpa, size_t len)
{
int error;
vm_object_t obj;
if (gpa & PAGE_MASK)
panic("vmm_mem_alloc: invalid gpa %#lx", gpa);
if (len == 0 || (len & PAGE_MASK) != 0)
panic("vmm_mem_alloc: invalid allocation size %lu", len);
obj = vm_object_allocate(OBJT_DEFAULT, len >> PAGE_SHIFT);
if (obj != NULL) {
error = vm_map_find(&vmspace->vm_map, obj, 0, &gpa, len, 0,
VMFS_NO_SPACE, VM_PROT_ALL, VM_PROT_ALL, 0);
if (error != KERN_SUCCESS) {
vm_object_deallocate(obj);
obj = NULL;
}
}
if (length != PAGE_SIZE)
panic("vmm_mem_free: invalid length %lu", length);
return (obj);
}
m = PHYS_TO_VM_PAGE(base);
m->wire_count--;
vm_page_free(m);
atomic_subtract_int(&cnt.v_wire_count, 1);
void
vmm_mem_free(struct vmspace *vmspace, vm_paddr_t gpa, size_t len)
{
update_pages_allocated(-1);
vm_map_remove(&vmspace->vm_map, gpa, gpa + len);
}
vm_paddr_t

View File

@ -29,9 +29,15 @@
#ifndef _VMM_MEM_H_
#define _VMM_MEM_H_
struct vmspace;
struct vm_object;
int vmm_mem_init(void);
vm_paddr_t vmm_mem_alloc(size_t size);
void vmm_mem_free(vm_paddr_t start, size_t size);
struct vm_object *vmm_mem_alloc(struct vmspace *, vm_paddr_t gpa, size_t size);
struct vm_object *vmm_mmio_alloc(struct vmspace *, vm_paddr_t gpa, size_t len,
vm_paddr_t hpa);
void vmm_mem_free(struct vmspace *, vm_paddr_t gpa, size_t size);
void vmm_mmio_free(struct vmspace *, vm_paddr_t gpa, size_t size);
vm_paddr_t vmm_mem_maxaddr(void);
#endif

View File

@ -2239,20 +2239,16 @@ cfiscsi_lun_disable(void *arg, struct ctl_id target_id, int lun_id)
}
static void
cfiscsi_datamove(union ctl_io *io)
cfiscsi_datamove_in(union ctl_io *io)
{
struct cfiscsi_session *cs;
struct icl_pdu *request, *response;
const struct iscsi_bhs_scsi_command *bhssc;
struct iscsi_bhs_data_in *bhsdi;
struct iscsi_bhs_r2t *bhsr2t;
struct cfiscsi_data_wait *cdw;
struct ctl_sg_entry ctl_sg_entry, *ctl_sglist;
size_t copy_len, len, off;
const char *addr;
int ctl_sg_count, error, i;
uint32_t target_transfer_tag;
bool done;
request = io->io_hdr.ctl_private[CTL_PRIV_FRONTEND].ptr;
cs = PDU_SESSION(request);
@ -2278,215 +2274,240 @@ cfiscsi_datamove(union ctl_io *io)
*/
PDU_TOTAL_TRANSFER_LEN(request) = io->scsiio.kern_total_len;
if ((io->io_hdr.flags & CTL_FLAG_DATA_MASK) == CTL_FLAG_DATA_IN) {
#if 0
if (ctl_sg_count > 1)
CFISCSI_SESSION_DEBUG(cs, "ctl_sg_count = %d", ctl_sg_count);
if (ctl_sg_count > 1)
CFISCSI_SESSION_DEBUG(cs, "ctl_sg_count = %d", ctl_sg_count);
#endif
/*
* This is the offset within the current SCSI command;
* i.e. for the first call of datamove(), it will be 0,
* and for subsequent ones it will be the sum of lengths
* of previous ones.
*/
off = htonl(io->scsiio.kern_rel_offset);
if (off > 1)
CFISCSI_SESSION_DEBUG(cs, "off = %zd", off);
/*
* This is the offset within the current SCSI command;
* i.e. for the first call of datamove(), it will be 0,
* and for subsequent ones it will be the sum of lengths
* of previous ones.
*/
off = htonl(io->scsiio.kern_rel_offset);
if (off > 1)
CFISCSI_SESSION_DEBUG(cs, "off = %zd", off);
i = 0;
addr = NULL;
len = 0;
response = NULL;
bhsdi = NULL;
for (;;) {
KASSERT(i < ctl_sg_count, ("i >= ctl_sg_count"));
i = 0;
addr = NULL;
len = 0;
response = NULL;
bhsdi = NULL;
for (;;) {
KASSERT(i < ctl_sg_count, ("i >= ctl_sg_count"));
if (response == NULL) {
response = cfiscsi_pdu_new_response(request, M_NOWAIT);
if (response == NULL) {
response =
cfiscsi_pdu_new_response(request, M_NOWAIT);
if (response == NULL) {
CFISCSI_SESSION_WARN(cs, "failed to "
"allocate memory; dropping connection");
icl_pdu_free(request);
cfiscsi_session_terminate(cs);
return;
}
bhsdi = (struct iscsi_bhs_data_in *)
response->ip_bhs;
bhsdi->bhsdi_opcode =
ISCSI_BHS_OPCODE_SCSI_DATA_IN;
bhsdi->bhsdi_initiator_task_tag =
bhssc->bhssc_initiator_task_tag;
bhsdi->bhsdi_datasn =
htonl(PDU_EXPDATASN(request));
PDU_EXPDATASN(request)++;
bhsdi->bhsdi_buffer_offset = htonl(off);
}
if (len == 0) {
addr = ctl_sglist[i].addr;
len = ctl_sglist[i].len;
KASSERT(len > 0, ("len <= 0"));
}
copy_len = len;
if (response->ip_data_len + copy_len >
cs->cs_max_data_segment_length)
copy_len = cs->cs_max_data_segment_length -
response->ip_data_len;
KASSERT(copy_len <= len, ("copy_len > len"));
error = icl_pdu_append_data(response, addr, copy_len, M_NOWAIT);
if (error != 0) {
CFISCSI_SESSION_WARN(cs, "failed to "
"allocate memory; dropping connection");
icl_pdu_free(request);
icl_pdu_free(response);
cfiscsi_session_terminate(cs);
return;
}
addr += copy_len;
len -= copy_len;
off += copy_len;
io->scsiio.ext_data_filled += copy_len;
if (len == 0) {
/*
* End of scatter-gather segment;
* proceed to the next one...
*/
if (i == ctl_sg_count - 1) {
/*
* ... unless this was the last one.
*/
break;
}
i++;
}
if (response->ip_data_len ==
cs->cs_max_data_segment_length) {
/*
* Can't stuff more data into the current PDU;
* queue it. Note that's not enough to check
* for kern_data_resid == 0 instead; there
* may be several Data-In PDUs for the final
* call to cfiscsi_datamove(), and we want
* to set the F flag only on the last of them.
*/
if (off == io->scsiio.kern_total_len)
bhsdi->bhsdi_flags |= BHSDI_FLAGS_F;
KASSERT(response->ip_data_len > 0,
("sending empty Data-In"));
cfiscsi_pdu_queue(response);
response = NULL;
bhsdi = NULL;
}
bhsdi = (struct iscsi_bhs_data_in *)response->ip_bhs;
bhsdi->bhsdi_opcode = ISCSI_BHS_OPCODE_SCSI_DATA_IN;
bhsdi->bhsdi_initiator_task_tag =
bhssc->bhssc_initiator_task_tag;
bhsdi->bhsdi_datasn = htonl(PDU_EXPDATASN(request));
PDU_EXPDATASN(request)++;
bhsdi->bhsdi_buffer_offset = htonl(off);
}
KASSERT(i == ctl_sg_count - 1, ("missed SG segment"));
KASSERT(len == 0, ("missed data from SG segment"));
if (response != NULL) {
if (off == io->scsiio.kern_total_len) {
bhsdi->bhsdi_flags |= BHSDI_FLAGS_F;
} else {
CFISCSI_SESSION_DEBUG(cs, "not setting the F flag; "
"have %zd, need %zd", off,
(size_t)io->scsiio.kern_total_len);
if (len == 0) {
addr = ctl_sglist[i].addr;
len = ctl_sglist[i].len;
KASSERT(len > 0, ("len <= 0"));
}
copy_len = len;
if (response->ip_data_len + copy_len >
cs->cs_max_data_segment_length)
copy_len = cs->cs_max_data_segment_length -
response->ip_data_len;
KASSERT(copy_len <= len, ("copy_len > len"));
error = icl_pdu_append_data(response, addr, copy_len, M_NOWAIT);
if (error != 0) {
CFISCSI_SESSION_WARN(cs, "failed to "
"allocate memory; dropping connection");
icl_pdu_free(request);
icl_pdu_free(response);
cfiscsi_session_terminate(cs);
return;
}
addr += copy_len;
len -= copy_len;
off += copy_len;
io->scsiio.ext_data_filled += copy_len;
if (len == 0) {
/*
* End of scatter-gather segment;
* proceed to the next one...
*/
if (i == ctl_sg_count - 1) {
/*
* ... unless this was the last one.
*/
break;
}
i++;
}
if (response->ip_data_len == cs->cs_max_data_segment_length) {
/*
* Can't stuff more data into the current PDU;
* queue it. Note that's not enough to check
* for kern_data_resid == 0 instead; there
* may be several Data-In PDUs for the final
* call to cfiscsi_datamove(), and we want
* to set the F flag only on the last of them.
*/
if (off == io->scsiio.kern_total_len)
bhsdi->bhsdi_flags |= BHSDI_FLAGS_F;
KASSERT(response->ip_data_len > 0,
("sending empty Data-In"));
cfiscsi_pdu_queue(response);
response = NULL;
bhsdi = NULL;
}
io->scsiio.be_move_done(io);
} else {
CFISCSI_SESSION_LOCK(cs);
target_transfer_tag = cs->cs_target_transfer_tag;
cs->cs_target_transfer_tag++;
CFISCSI_SESSION_UNLOCK(cs);
#if 0
CFISCSI_SESSION_DEBUG(cs, "expecting Data-Out with initiator "
"task tag 0x%x, target transfer tag 0x%x",
bhssc->bhssc_initiator_task_tag, target_transfer_tag);
#endif
cdw = uma_zalloc(cfiscsi_data_wait_zone, M_NOWAIT | M_ZERO);
if (cdw == NULL) {
CFISCSI_SESSION_WARN(cs, "failed to "
"allocate memory; dropping connection");
icl_pdu_free(request);
cfiscsi_session_terminate(cs);
}
KASSERT(i == ctl_sg_count - 1, ("missed SG segment"));
KASSERT(len == 0, ("missed data from SG segment"));
if (response != NULL) {
if (off == io->scsiio.kern_total_len) {
bhsdi->bhsdi_flags |= BHSDI_FLAGS_F;
} else {
CFISCSI_SESSION_DEBUG(cs, "not setting the F flag; "
"have %zd, need %zd", off,
(size_t)io->scsiio.kern_total_len);
}
cdw->cdw_ctl_io = io;
cdw->cdw_target_transfer_tag = htonl(target_transfer_tag);
cdw->cdw_initiator_task_tag = bhssc->bhssc_initiator_task_tag;
if (cs->cs_immediate_data &&
icl_pdu_data_segment_length(request) > 0) {
done = cfiscsi_handle_data_segment(request, cdw);
if (done) {
uma_zfree(cfiscsi_data_wait_zone, cdw);
io->scsiio.be_move_done(io);
return;
}
#if 0
if (io->scsiio.ext_data_filled != 0)
CFISCSI_SESSION_DEBUG(cs, "got %zd bytes of immediate data, need %zd",
io->scsiio.ext_data_filled, io->scsiio.kern_data_len);
#endif
}
CFISCSI_SESSION_LOCK(cs);
TAILQ_INSERT_TAIL(&cs->cs_waiting_for_data_out, cdw, cdw_next);
CFISCSI_SESSION_UNLOCK(cs);
/*
* XXX: We should limit the number of outstanding R2T PDUs
* per task to MaxOutstandingR2T.
*/
response = cfiscsi_pdu_new_response(request, M_NOWAIT);
if (response == NULL) {
CFISCSI_SESSION_WARN(cs, "failed to "
"allocate memory; dropping connection");
icl_pdu_free(request);
cfiscsi_session_terminate(cs);
}
bhsr2t = (struct iscsi_bhs_r2t *)response->ip_bhs;
bhsr2t->bhsr2t_opcode = ISCSI_BHS_OPCODE_R2T;
bhsr2t->bhsr2t_flags = 0x80;
bhsr2t->bhsr2t_lun = bhssc->bhssc_lun;
bhsr2t->bhsr2t_initiator_task_tag =
bhssc->bhssc_initiator_task_tag;
bhsr2t->bhsr2t_target_transfer_tag =
htonl(target_transfer_tag);
/*
* XXX: Here we assume that cfiscsi_datamove() won't ever
* be running concurrently on several CPUs for a given
* command.
*/
bhsr2t->bhsr2t_r2tsn = htonl(PDU_R2TSN(request));
PDU_R2TSN(request)++;
/*
* This is the offset within the current SCSI command;
* i.e. for the first call of datamove(), it will be 0,
* and for subsequent ones it will be the sum of lengths
* of previous ones.
*
* The ext_data_filled is to account for unsolicited
* (immediate) data that might have already arrived.
*/
bhsr2t->bhsr2t_buffer_offset =
htonl(io->scsiio.kern_rel_offset + io->scsiio.ext_data_filled);
/*
* This is the total length (sum of S/G lengths) this call
* to cfiscsi_datamove() is supposed to handle.
*
* XXX: Limit it to MaxBurstLength.
*/
bhsr2t->bhsr2t_desired_data_transfer_length =
htonl(io->scsiio.kern_data_len - io->scsiio.ext_data_filled);
KASSERT(response->ip_data_len > 0, ("sending empty Data-In"));
cfiscsi_pdu_queue(response);
}
io->scsiio.be_move_done(io);
}
static void
cfiscsi_datamove_out(union ctl_io *io)
{
struct cfiscsi_session *cs;
struct icl_pdu *request, *response;
const struct iscsi_bhs_scsi_command *bhssc;
struct iscsi_bhs_r2t *bhsr2t;
struct cfiscsi_data_wait *cdw;
uint32_t target_transfer_tag;
bool done;
request = io->io_hdr.ctl_private[CTL_PRIV_FRONTEND].ptr;
cs = PDU_SESSION(request);
bhssc = (const struct iscsi_bhs_scsi_command *)request->ip_bhs;
KASSERT((bhssc->bhssc_opcode & ~ISCSI_BHS_OPCODE_IMMEDIATE) ==
ISCSI_BHS_OPCODE_SCSI_COMMAND,
("bhssc->bhssc_opcode != ISCSI_BHS_OPCODE_SCSI_COMMAND"));
/*
* We need to record it so that we can properly report
* underflow/overflow.
*/
PDU_TOTAL_TRANSFER_LEN(request) = io->scsiio.kern_total_len;
CFISCSI_SESSION_LOCK(cs);
target_transfer_tag = cs->cs_target_transfer_tag;
cs->cs_target_transfer_tag++;
CFISCSI_SESSION_UNLOCK(cs);
#if 0
CFISCSI_SESSION_DEBUG(cs, "expecting Data-Out with initiator "
"task tag 0x%x, target transfer tag 0x%x",
bhssc->bhssc_initiator_task_tag, target_transfer_tag);
#endif
cdw = uma_zalloc(cfiscsi_data_wait_zone, M_NOWAIT | M_ZERO);
if (cdw == NULL) {
CFISCSI_SESSION_WARN(cs, "failed to "
"allocate memory; dropping connection");
icl_pdu_free(request);
cfiscsi_session_terminate(cs);
}
cdw->cdw_ctl_io = io;
cdw->cdw_target_transfer_tag = htonl(target_transfer_tag);
cdw->cdw_initiator_task_tag = bhssc->bhssc_initiator_task_tag;
if (cs->cs_immediate_data && icl_pdu_data_segment_length(request) > 0) {
done = cfiscsi_handle_data_segment(request, cdw);
if (done) {
uma_zfree(cfiscsi_data_wait_zone, cdw);
io->scsiio.be_move_done(io);
return;
}
#if 0
if (io->scsiio.ext_data_filled != 0)
CFISCSI_SESSION_DEBUG(cs, "got %zd bytes of immediate data, need %zd",
io->scsiio.ext_data_filled, io->scsiio.kern_data_len);
#endif
}
CFISCSI_SESSION_LOCK(cs);
TAILQ_INSERT_TAIL(&cs->cs_waiting_for_data_out, cdw, cdw_next);
CFISCSI_SESSION_UNLOCK(cs);
/*
* XXX: We should limit the number of outstanding R2T PDUs
* per task to MaxOutstandingR2T.
*/
response = cfiscsi_pdu_new_response(request, M_NOWAIT);
if (response == NULL) {
CFISCSI_SESSION_WARN(cs, "failed to "
"allocate memory; dropping connection");
icl_pdu_free(request);
cfiscsi_session_terminate(cs);
}
bhsr2t = (struct iscsi_bhs_r2t *)response->ip_bhs;
bhsr2t->bhsr2t_opcode = ISCSI_BHS_OPCODE_R2T;
bhsr2t->bhsr2t_flags = 0x80;
bhsr2t->bhsr2t_lun = bhssc->bhssc_lun;
bhsr2t->bhsr2t_initiator_task_tag = bhssc->bhssc_initiator_task_tag;
bhsr2t->bhsr2t_target_transfer_tag = htonl(target_transfer_tag);
/*
* XXX: Here we assume that cfiscsi_datamove() won't ever
* be running concurrently on several CPUs for a given
* command.
*/
bhsr2t->bhsr2t_r2tsn = htonl(PDU_R2TSN(request));
PDU_R2TSN(request)++;
/*
* This is the offset within the current SCSI command;
* i.e. for the first call of datamove(), it will be 0,
* and for subsequent ones it will be the sum of lengths
* of previous ones.
*
* The ext_data_filled is to account for unsolicited
* (immediate) data that might have already arrived.
*/
bhsr2t->bhsr2t_buffer_offset =
htonl(io->scsiio.kern_rel_offset + io->scsiio.ext_data_filled);
/*
* This is the total length (sum of S/G lengths) this call
* to cfiscsi_datamove() is supposed to handle.
*
* XXX: Limit it to MaxBurstLength.
*/
bhsr2t->bhsr2t_desired_data_transfer_length =
htonl(io->scsiio.kern_data_len - io->scsiio.ext_data_filled);
cfiscsi_pdu_queue(response);
}
static void
cfiscsi_datamove(union ctl_io *io)
{
if ((io->io_hdr.flags & CTL_FLAG_DATA_MASK) == CTL_FLAG_DATA_IN)
cfiscsi_datamove_in(io);
else
cfiscsi_datamove_out(io);
}
static void

View File

@ -2920,7 +2920,6 @@ kern/tty_pts.c standard
kern/tty_tty.c standard
kern/tty_ttydisc.c standard
kern/uipc_accf.c optional inet
kern/uipc_cow.c optional socket_send_cow
kern/uipc_debug.c optional ddb
kern/uipc_domain.c standard
kern/uipc_mbuf.c standard

View File

@ -2883,6 +2883,9 @@ igb_setup_msix(struct adapter *adapter)
if (queues > maxqueues)
queues = maxqueues;
/* reflect correct sysctl value */
igb_num_queues = queues;
/*
** One vector (RX/TX pair) per queue
** plus an additional for Link interrupt

View File

@ -29,6 +29,8 @@
/**
* Implements low-level interactions with Hyper-V/Azure
*/
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
#include <sys/param.h>
#include <sys/malloc.h>
@ -88,6 +90,14 @@ hv_vmbus_query_hypervisor_presence(void)
{
u_int regs[4];
int hyper_v_detected = 0;
/*
* When Xen is detected and native Xen PV support is enabled,
* ignore Xen's HyperV emulation.
*/
if (vm_guest == VM_GUEST_XEN)
return (0);
do_cpuid(1, regs);
if (regs[2] & 0x80000000) { /* if(a hypervisor is detected) */
/* make sure this really is Hyper-V */

View File

@ -234,6 +234,9 @@ MODULE_DEPEND(ixgbe, ether, 1, 1, 1);
** TUNEABLE PARAMETERS:
*/
static SYSCTL_NODE(_hw, OID_AUTO, ix, CTLFLAG_RD, 0,
"IXGBE driver parameters");
/*
** AIM: Adaptive Interrupt Moderation
** which means that the interrupt rate
@ -242,17 +245,29 @@ MODULE_DEPEND(ixgbe, ether, 1, 1, 1);
*/
static int ixgbe_enable_aim = TRUE;
TUNABLE_INT("hw.ixgbe.enable_aim", &ixgbe_enable_aim);
SYSCTL_INT(_hw_ix, OID_AUTO, enable_aim, CTLFLAG_RW, &ixgbe_enable_aim, 0,
"Enable adaptive interrupt moderation");
static int ixgbe_max_interrupt_rate = (4000000 / IXGBE_LOW_LATENCY);
TUNABLE_INT("hw.ixgbe.max_interrupt_rate", &ixgbe_max_interrupt_rate);
SYSCTL_INT(_hw_ix, OID_AUTO, max_interrupt_rate, CTLFLAG_RDTUN,
&ixgbe_max_interrupt_rate, 0, "Maximum interrupts per second");
/* How many packets rxeof tries to clean at a time */
static int ixgbe_rx_process_limit = 256;
TUNABLE_INT("hw.ixgbe.rx_process_limit", &ixgbe_rx_process_limit);
SYSCTL_INT(_hw_ix, OID_AUTO, rx_process_limit, CTLFLAG_RDTUN,
&ixgbe_rx_process_limit, 0,
"Maximum number of received packets to process at a time,"
"-1 means unlimited");
/* How many packets txeof tries to clean at a time */
static int ixgbe_tx_process_limit = 256;
TUNABLE_INT("hw.ixgbe.tx_process_limit", &ixgbe_tx_process_limit);
SYSCTL_INT(_hw_ix, OID_AUTO, tx_process_limit, CTLFLAG_RDTUN,
&ixgbe_tx_process_limit, 0,
"Maximum number of sent packets to process at a time,"
"-1 means unlimited");
/*
** Smart speed setting, default to on
@ -269,6 +284,8 @@ static int ixgbe_smart_speed = ixgbe_smart_speed_on;
*/
static int ixgbe_enable_msix = 1;
TUNABLE_INT("hw.ixgbe.enable_msix", &ixgbe_enable_msix);
SYSCTL_INT(_hw_ix, OID_AUTO, enable_msix, CTLFLAG_RDTUN, &ixgbe_enable_msix, 0,
"Enable MSI-X interrupts");
/*
* Number of Queues, can be set to 0,
@ -278,6 +295,8 @@ TUNABLE_INT("hw.ixgbe.enable_msix", &ixgbe_enable_msix);
*/
static int ixgbe_num_queues = 0;
TUNABLE_INT("hw.ixgbe.num_queues", &ixgbe_num_queues);
SYSCTL_INT(_hw_ix, OID_AUTO, num_queues, CTLFLAG_RDTUN, &ixgbe_num_queues, 0,
"Number of queues to configure, 0 indicates autoconfigure");
/*
** Number of TX descriptors per ring,
@ -286,10 +305,14 @@ TUNABLE_INT("hw.ixgbe.num_queues", &ixgbe_num_queues);
*/
static int ixgbe_txd = PERFORM_TXD;
TUNABLE_INT("hw.ixgbe.txd", &ixgbe_txd);
SYSCTL_INT(_hw_ix, OID_AUTO, txd, CTLFLAG_RDTUN, &ixgbe_txd, 0,
"Number of receive descriptors per queue");
/* Number of RX descriptors per ring */
static int ixgbe_rxd = PERFORM_RXD;
TUNABLE_INT("hw.ixgbe.rxd", &ixgbe_rxd);
SYSCTL_INT(_hw_ix, OID_AUTO, rxd, CTLFLAG_RDTUN, &ixgbe_rxd, 0,
"Number of receive descriptors per queue");
/*
** Defining this on will allow the use
@ -2442,6 +2465,9 @@ ixgbe_setup_msix(struct adapter *adapter)
else if ((ixgbe_num_queues == 0) && (queues > 8))
queues = 8;
/* reflect correct sysctl value */
ixgbe_num_queues = queues;
/*
** Want one vector (RX/TX pair) per queue
** plus an additional for Link.

View File

@ -1700,9 +1700,9 @@ vtnet_rxq_input(struct vtnet_rxq *rxq, struct mbuf *m,
rxq->vtnrx_stats.vrxs_ipackets++;
rxq->vtnrx_stats.vrxs_ibytes += m->m_pkthdr.len;
/* VTNET_RXQ_UNLOCK(rxq); */
VTNET_RXQ_UNLOCK(rxq);
(*ifp->if_input)(ifp, m);
/* VTNET_RXQ_LOCK(rxq); */
VTNET_RXQ_LOCK(rxq);
}
static int
@ -1782,6 +1782,10 @@ vtnet_rxq_eof(struct vtnet_rxq *rxq)
m_adj(m, adjsz);
vtnet_rxq_input(rxq, m, hdr);
/* Must recheck after dropping the Rx lock. */
if ((ifp->if_drv_flags & IFF_DRV_RUNNING) == 0)
break;
}
if (deq > 0)

View File

@ -396,7 +396,7 @@ xentimer_et_start(struct eventtimer *et,
{
int error = 0, i = 0;
struct xentimer_softc *sc = et->et_priv;
int cpu = PCPU_GET(acpi_id);
int cpu = PCPU_GET(vcpu_id);
struct xentimer_pcpu_data *pcpu = DPCPU_PTR(xentimer_pcpu);
uint64_t first_in_ns, next_time;
@ -433,7 +433,7 @@ xentimer_et_start(struct eventtimer *et,
static int
xentimer_et_stop(struct eventtimer *et)
{
int cpu = PCPU_GET(acpi_id);
int cpu = PCPU_GET(vcpu_id);
struct xentimer_pcpu_data *pcpu = DPCPU_PTR(xentimer_pcpu);
pcpu->timer = 0;

View File

@ -62,13 +62,13 @@ struct shadow_time_info {
vm_paddr_t *pc_pdir_shadow; \
uint64_t pc_processed_system_time; \
struct shadow_time_info pc_shadow_time; \
char __pad[189]
char __pad[185]
#else /* !XEN */
#define PCPU_XEN_FIELDS \
; \
char __pad[237]
char __pad[233]
#endif
@ -84,7 +84,8 @@ struct shadow_time_info {
u_int pc_acpi_id; /* ACPI CPU id */ \
u_int pc_apic_id; \
int pc_private_tss; /* Flag indicating private tss*/\
u_int pc_cmci_mask /* MCx banks for CMCI */ \
u_int pc_cmci_mask; /* MCx banks for CMCI */ \
u_int pc_vcpu_id /* Xen vCPU ID */ \
PCPU_XEN_FIELDS
#ifdef _KERNEL

View File

@ -783,13 +783,7 @@ start_all_aps(void)
dpcpu_init((void *)kmem_malloc(kernel_arena, DPCPU_SIZE,
M_WAITOK | M_ZERO), bootAP);
pc->pc_apic_id = cpu_apic_ids[bootAP];
/*
* The i386 PV port uses the apic_id as vCPU id, but the
* PVHVM port needs to use the acpi_id, so set it for PV
* also in order to work with shared devices between PV
* and PVHVM.
*/
pc->pc_acpi_id = cpu_apic_ids[bootAP];
pc->pc_vcpu_id = cpu_apic_ids[bootAP];
pc->pc_prvspace = pc;
pc->pc_curthread = 0;

View File

@ -88,7 +88,7 @@ mptable_setup_local(void)
{
PCPU_SET(apic_id, 0);
PCPU_SET(acpi_id, 0);
PCPU_SET(vcpu_id, 0);
return (0);
}

View File

@ -55,7 +55,6 @@ __FBSDID("$FreeBSD$");
#include <sys/kernel.h>
#include <sys/lock.h>
#include <sys/malloc.h>
#include <sys/mbuf.h>
#include <sys/mutex.h>
#include <sys/vmmeter.h>
#include <sys/proc.h>
@ -699,10 +698,10 @@ kmeminit(void)
* VM_KMEM_SIZE_MAX is dependent on the maximum KVA space
* available.
*
* Note that the kmem_map is also used by the zone allocator,
* Note that the kmem arena is also used by the zone allocator,
* so make sure that there is enough space.
*/
vm_kmem_size = VM_KMEM_SIZE + nmbclusters * PAGE_SIZE;
vm_kmem_size = VM_KMEM_SIZE;
mem_size = cnt.v_page_count;
#if defined(VM_KMEM_SIZE_SCALE)

View File

@ -1,182 +0,0 @@
/*--
* Copyright (c) 1997, Duke University
* All rights reserved.
*
* Author:
* Andrew Gallatin <gallatin@cs.duke.edu>
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. The name of Duke University may not be used to endorse or promote
* products derived from this software without specific prior written
* permission.
*
* THIS SOFTWARE IS PROVIDED BY DUKE UNIVERSITY ``AS IS'' AND ANY
* EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL DUKE UNIVERSITY BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER
* IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
* OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
* ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
/*
* This is a set of routines for enabling and disabling copy on write
* protection for data written into sockets.
*/
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/sysctl.h>
#include <sys/kernel.h>
#include <sys/proc.h>
#include <sys/lock.h>
#include <sys/mutex.h>
#include <sys/mbuf.h>
#include <sys/sf_buf.h>
#include <sys/socketvar.h>
#include <sys/uio.h>
#include <vm/vm.h>
#include <vm/vm_extern.h>
#include <vm/vm_param.h>
#include <vm/pmap.h>
#include <vm/vm_map.h>
#include <vm/vm_page.h>
#include <vm/vm_object.h>
FEATURE(zero_copy_sockets, "Zero copy sockets support");
struct netsend_cow_stats {
int attempted;
int fail_not_mapped;
int fail_sf_buf;
int success;
int iodone;
};
static struct netsend_cow_stats socow_stats;
static int socow_iodone(struct mbuf *m, void *addr, void *args);
static int
socow_iodone(struct mbuf *m, void *addr, void *args)
{
struct sf_buf *sf;
vm_page_t pp;
sf = args;
pp = sf_buf_page(sf);
sf_buf_free(sf);
/* remove COW mapping */
vm_page_lock(pp);
vm_page_cowclear(pp);
vm_page_unwire(pp, 0);
/*
* Check for the object going away on us. This can
* happen since we don't hold a reference to it.
* If so, we're responsible for freeing the page.
*/
if (pp->wire_count == 0 && pp->object == NULL)
vm_page_free(pp);
vm_page_unlock(pp);
socow_stats.iodone++;
return (EXT_FREE_OK);
}
int
socow_setup(struct mbuf *m0, struct uio *uio)
{
struct sf_buf *sf;
vm_page_t pp;
struct iovec *iov;
struct vmspace *vmspace;
struct vm_map *map;
vm_offset_t offset, uva;
vm_size_t len;
socow_stats.attempted++;
vmspace = curproc->p_vmspace;
map = &vmspace->vm_map;
uva = (vm_offset_t) uio->uio_iov->iov_base;
offset = uva & PAGE_MASK;
len = PAGE_SIZE - offset;
/*
* Verify that access to the given address is allowed from user-space.
*/
if (vm_fault_quick_hold_pages(map, uva, len, VM_PROT_READ, &pp, 1) <
0) {
socow_stats.fail_not_mapped++;
return(0);
}
/*
* set up COW
*/
vm_page_lock(pp);
if (vm_page_cowsetup(pp) != 0) {
vm_page_unhold(pp);
vm_page_unlock(pp);
return (0);
}
/*
* wire the page for I/O
*/
vm_page_wire(pp);
vm_page_unhold(pp);
vm_page_unlock(pp);
/*
* Allocate an sf buf
*/
sf = sf_buf_alloc(pp, SFB_CATCH);
if (sf == NULL) {
vm_page_lock(pp);
vm_page_cowclear(pp);
vm_page_unwire(pp, 0);
/*
* Check for the object going away on us. This can
* happen since we don't hold a reference to it.
* If so, we're responsible for freeing the page.
*/
if (pp->wire_count == 0 && pp->object == NULL)
vm_page_free(pp);
vm_page_unlock(pp);
socow_stats.fail_sf_buf++;
return(0);
}
/*
* attach to mbuf
*/
MEXTADD(m0, sf_buf_kva(sf), PAGE_SIZE, socow_iodone,
(void*)sf_buf_kva(sf), sf, M_RDONLY, EXT_SFBUF);
m0->m_len = len;
m0->m_data = (caddr_t)sf_buf_kva(sf) + offset;
socow_stats.success++;
iov = uio->uio_iov;
iov->iov_base = (char *)iov->iov_base + m0->m_len;
iov->iov_len -= m0->m_len;
uio->uio_resid -= m0->m_len;
uio->uio_offset += m0->m_len;
if (iov->iov_len == 0) {
uio->uio_iov++;
uio->uio_iovcnt--;
}
return(m0->m_len);
}

View File

@ -575,4 +575,4 @@ madt_set_ids(void *dummy)
la->la_acpi_id);
}
}
SYSINIT(madt_set_ids, SI_SUB_CPU, SI_ORDER_ANY, madt_set_ids, NULL);
SYSINIT(madt_set_ids, SI_SUB_CPU, SI_ORDER_MIDDLE, madt_set_ids, NULL);

View File

@ -700,6 +700,7 @@ xen_hvm_init(enum xen_hvm_init_type init_type)
setup_xen_features();
cpu_ops = xen_hvm_cpu_ops;
vm_guest = VM_GUEST_XEN;
break;
case XEN_HVM_INIT_RESUME:
if (error != 0)
@ -742,6 +743,22 @@ xen_hvm_sysinit(void *arg __unused)
xen_hvm_init(XEN_HVM_INIT_COLD);
}
static void
xen_set_vcpu_id(void)
{
struct pcpu *pc;
int i;
/* Set vcpu_id to acpi_id */
CPU_FOREACH(i) {
pc = pcpu_find(i);
pc->pc_vcpu_id = pc->pc_acpi_id;
if (bootverbose)
printf("XEN: CPU %u has VCPU ID %u\n",
i, pc->pc_vcpu_id);
}
}
static void
xen_hvm_cpu_init(void)
{
@ -762,7 +779,7 @@ xen_hvm_cpu_init(void)
}
vcpu_info = DPCPU_PTR(vcpu_local_info);
cpu = PCPU_GET(acpi_id);
cpu = PCPU_GET(vcpu_id);
info.mfn = vtophys(vcpu_info) >> PAGE_SHIFT;
info.offset = vtophys(vcpu_info) - trunc_page(vtophys(vcpu_info));
@ -778,3 +795,4 @@ SYSINIT(xen_hvm_init, SI_SUB_HYPERVISOR, SI_ORDER_FIRST, xen_hvm_sysinit, NULL);
SYSINIT(xen_setup_cpus, SI_SUB_SMP, SI_ORDER_FIRST, xen_setup_cpus, NULL);
#endif
SYSINIT(xen_hvm_cpu_init, SI_SUB_INTR, SI_ORDER_FIRST, xen_hvm_cpu_init, NULL);
SYSINIT(xen_set_vcpu_id, SI_SUB_CPU, SI_ORDER_ANY, xen_set_vcpu_id, NULL);

View File

@ -611,9 +611,9 @@ xen_rebind_ipi(struct xenisrc *isrc)
{
#ifdef SMP
int cpu = isrc->xi_cpu;
int acpi_id = pcpu_find(cpu)->pc_acpi_id;
int vcpu_id = pcpu_find(cpu)->pc_vcpu_id;
int error;
struct evtchn_bind_ipi bind_ipi = { .vcpu = acpi_id };
struct evtchn_bind_ipi bind_ipi = { .vcpu = vcpu_id };
error = HYPERVISOR_event_channel_op(EVTCHNOP_bind_ipi,
&bind_ipi);
@ -640,10 +640,10 @@ static void
xen_rebind_virq(struct xenisrc *isrc)
{
int cpu = isrc->xi_cpu;
int acpi_id = pcpu_find(cpu)->pc_acpi_id;
int vcpu_id = pcpu_find(cpu)->pc_vcpu_id;
int error;
struct evtchn_bind_virq bind_virq = { .virq = isrc->xi_virq,
.vcpu = acpi_id };
.vcpu = vcpu_id };
error = HYPERVISOR_event_channel_op(EVTCHNOP_bind_virq,
&bind_virq);
@ -796,7 +796,7 @@ xen_intr_assign_cpu(struct intsrc *base_isrc, u_int apic_id)
#ifdef SMP
struct evtchn_bind_vcpu bind_vcpu;
struct xenisrc *isrc;
u_int to_cpu, acpi_id;
u_int to_cpu, vcpu_id;
int error;
#ifdef XENHVM
@ -805,7 +805,7 @@ xen_intr_assign_cpu(struct intsrc *base_isrc, u_int apic_id)
#endif
to_cpu = apic_cpuid(apic_id);
acpi_id = pcpu_find(to_cpu)->pc_acpi_id;
vcpu_id = pcpu_find(to_cpu)->pc_vcpu_id;
xen_intr_intrcnt_add(to_cpu);
mtx_lock(&xen_intr_isrc_lock);
@ -830,7 +830,7 @@ xen_intr_assign_cpu(struct intsrc *base_isrc, u_int apic_id)
}
bind_vcpu.port = isrc->xi_port;
bind_vcpu.vcpu = acpi_id;
bind_vcpu.vcpu = vcpu_id;
/*
* Allow interrupts to be fielded on the new VCPU before
@ -1063,9 +1063,9 @@ xen_intr_bind_virq(device_t dev, u_int virq, u_int cpu,
driver_filter_t filter, driver_intr_t handler, void *arg,
enum intr_type flags, xen_intr_handle_t *port_handlep)
{
int acpi_id = pcpu_find(cpu)->pc_acpi_id;
int vcpu_id = pcpu_find(cpu)->pc_vcpu_id;
struct xenisrc *isrc;
struct evtchn_bind_virq bind_virq = { .virq = virq, .vcpu = acpi_id };
struct evtchn_bind_virq bind_virq = { .virq = virq, .vcpu = vcpu_id };
int error;
/* Ensure the target CPU is ready to handle evtchn interrupts. */
@ -1126,9 +1126,9 @@ xen_intr_alloc_and_bind_ipi(device_t dev, u_int cpu,
xen_intr_handle_t *port_handlep)
{
#ifdef SMP
int acpi_id = pcpu_find(cpu)->pc_acpi_id;
int vcpu_id = pcpu_find(cpu)->pc_vcpu_id;
struct xenisrc *isrc;
struct evtchn_bind_ipi bind_ipi = { .vcpu = acpi_id };
struct evtchn_bind_ipi bind_ipi = { .vcpu = vcpu_id };
int error;
/* Ensure the target CPU is ready to handle evtchn interrupts. */

View File

@ -101,7 +101,7 @@ struct bhyvestats {
uint64_t vmexit_hlt;
uint64_t vmexit_pause;
uint64_t vmexit_mtrap;
uint64_t vmexit_paging;
uint64_t vmexit_inst_emul;
uint64_t cpu_switch_rotate;
uint64_t cpu_switch_direct;
int io_reset;
@ -208,14 +208,12 @@ fbsdrun_addcpu(struct vmctx *ctx, int vcpu, uint64_t rip)
vmexit[vcpu].rip = rip;
vmexit[vcpu].inst_length = 0;
if (vcpu == BSP) {
mt_vmm_info[vcpu].mt_ctx = ctx;
mt_vmm_info[vcpu].mt_vcpu = vcpu;
error = pthread_create(&mt_vmm_info[vcpu].mt_thr, NULL,
fbsdrun_start_thread, &mt_vmm_info[vcpu]);
assert(error == 0);
}
mt_vmm_info[vcpu].mt_ctx = ctx;
mt_vmm_info[vcpu].mt_vcpu = vcpu;
error = pthread_create(&mt_vmm_info[vcpu].mt_thr, NULL,
fbsdrun_start_thread, &mt_vmm_info[vcpu]);
assert(error == 0);
}
static int
@ -385,13 +383,13 @@ vmexit_mtrap(struct vmctx *ctx, struct vm_exit *vmexit, int *pvcpu)
}
static int
vmexit_paging(struct vmctx *ctx, struct vm_exit *vmexit, int *pvcpu)
vmexit_inst_emul(struct vmctx *ctx, struct vm_exit *vmexit, int *pvcpu)
{
int err;
stats.vmexit_paging++;
stats.vmexit_inst_emul++;
err = emulate_mem(ctx, *pvcpu, vmexit->u.paging.gpa,
&vmexit->u.paging.vie);
err = emulate_mem(ctx, *pvcpu, vmexit->u.inst_emul.gpa,
&vmexit->u.inst_emul.vie);
if (err) {
if (err == EINVAL) {
@ -400,7 +398,7 @@ vmexit_paging(struct vmctx *ctx, struct vm_exit *vmexit, int *pvcpu)
vmexit->rip);
} else if (err == ESRCH) {
fprintf(stderr, "Unhandled memory access to 0x%lx\n",
vmexit->u.paging.gpa);
vmexit->u.inst_emul.gpa);
}
return (VMEXIT_ABORT);
@ -416,7 +414,7 @@ static vmexit_handler_t handler[VM_EXITCODE_MAX] = {
[VM_EXITCODE_RDMSR] = vmexit_rdmsr,
[VM_EXITCODE_WRMSR] = vmexit_wrmsr,
[VM_EXITCODE_MTRAP] = vmexit_mtrap,
[VM_EXITCODE_PAGING] = vmexit_paging,
[VM_EXITCODE_INST_EMUL] = vmexit_inst_emul,
[VM_EXITCODE_SPINUP_AP] = vmexit_spinup_ap,
};

View File

@ -1048,7 +1048,7 @@ init_pci(struct vmctx *ctx)
* Accesses to memory addresses that are not allocated to system
* memory or PCI devices return 0xff's.
*/
error = vm_get_memory_seg(ctx, 0, &lowmem);
error = vm_get_memory_seg(ctx, 0, &lowmem, NULL);
assert(error == 0);
memset(&memp, 0, sizeof(struct mem_range));

View File

@ -341,14 +341,14 @@ rtc_init(struct vmctx *ctx)
* 0x34/0x35 - 64KB chunks above 16MB, below 4GB
* 0x5b/0x5c/0x5d - 64KB chunks above 4GB
*/
err = vm_get_memory_seg(ctx, 0, &lomem);
err = vm_get_memory_seg(ctx, 0, &lomem, NULL);
assert(err == 0);
lomem = (lomem - m_16MB) / m_64KB;
rtc_nvram[nvoff(RTC_LMEM_LSB)] = lomem;
rtc_nvram[nvoff(RTC_LMEM_MSB)] = lomem >> 8;
if (vm_get_memory_seg(ctx, m_4GB, &himem) == 0) {
if (vm_get_memory_seg(ctx, m_4GB, &himem, NULL) == 0) {
himem /= m_64KB;
rtc_nvram[nvoff(RTC_HMEM_LSB)] = himem;
rtc_nvram[nvoff(RTC_HMEM_SB)] = himem >> 8;

View File

@ -188,12 +188,13 @@ usage(void)
" [--unassign-pptdev=<bus/slot/func>]\n"
" [--set-mem=<memory in units of MB>]\n"
" [--get-lowmem]\n"
" [--get-highmem]\n",
" [--get-highmem]\n"
" [--get-gpa-pmap]\n",
progname);
exit(1);
}
static int get_stats, getcap, setcap, capval;
static int get_stats, getcap, setcap, capval, get_gpa_pmap;
static const char *capname;
static int create, destroy, get_lowmem, get_highmem;
static uint64_t memsize;
@ -377,18 +378,20 @@ enum {
SET_CAP,
CAPNAME,
UNASSIGN_PPTDEV,
GET_GPA_PMAP,
};
int
main(int argc, char *argv[])
{
char *vmname;
int error, ch, vcpu;
vm_paddr_t gpa;
int error, ch, vcpu, ptenum;
vm_paddr_t gpa, gpa_pmap;
size_t len;
struct vm_exit vmexit;
uint64_t ctl, eptp, bm, addr, u64;
uint64_t ctl, eptp, bm, addr, u64, pteval[4], *pte;
struct vmctx *ctx;
int wired;
uint64_t cr0, cr3, cr4, dr7, rsp, rip, rflags, efer, pat;
uint64_t rax, rbx, rcx, rdx, rsi, rdi, rbp;
@ -427,6 +430,7 @@ main(int argc, char *argv[])
{ "capname", REQ_ARG, 0, CAPNAME },
{ "unassign-pptdev", REQ_ARG, 0, UNASSIGN_PPTDEV },
{ "setcap", REQ_ARG, 0, SET_CAP },
{ "get-gpa-pmap", REQ_ARG, 0, GET_GPA_PMAP },
{ "getcap", NO_ARG, &getcap, 1 },
{ "get-stats", NO_ARG, &get_stats, 1 },
{ "get-desc-ds",NO_ARG, &get_desc_ds, 1 },
@ -666,6 +670,10 @@ main(int argc, char *argv[])
capval = strtoul(optarg, NULL, 0);
setcap = 1;
break;
case GET_GPA_PMAP:
gpa_pmap = strtoul(optarg, NULL, 0);
get_gpa_pmap = 1;
break;
case CAPNAME:
capname = optarg;
break;
@ -819,16 +827,18 @@ main(int argc, char *argv[])
if (!error && (get_lowmem || get_all)) {
gpa = 0;
error = vm_get_memory_seg(ctx, gpa, &len);
error = vm_get_memory_seg(ctx, gpa, &len, &wired);
if (error == 0)
printf("lowmem\t\t0x%016lx/%ld\n", gpa, len);
printf("lowmem\t\t0x%016lx/%ld%s\n", gpa, len,
wired ? " wired" : "");
}
if (!error && (get_highmem || get_all)) {
gpa = 4 * GB;
error = vm_get_memory_seg(ctx, gpa, &len);
error = vm_get_memory_seg(ctx, gpa, &len, &wired);
if (error == 0)
printf("highmem\t\t0x%016lx/%ld\n", gpa, len);
printf("highmem\t\t0x%016lx/%ld%s\n", gpa, len,
wired ? " wired" : "");
}
if (!error && (get_efer || get_all)) {
@ -1457,6 +1467,17 @@ main(int argc, char *argv[])
printf("Capability \"%s\" is not available\n", capname);
}
if (!error && get_gpa_pmap) {
error = vm_get_gpa_pmap(ctx, gpa_pmap, pteval, &ptenum);
if (error == 0) {
printf("gpa %#lx:", gpa_pmap);
pte = &pteval[0];
while (ptenum-- > 0)
printf(" %#lx", *pte++);
printf("\n");
}
}
if (!error && (getcap || get_all)) {
int captype, val, getcaptype;

View File

@ -492,8 +492,8 @@ static void
cb_getmem(void *arg, uint64_t *ret_lowmem, uint64_t *ret_highmem)
{
vm_get_memory_seg(ctx, 0, ret_lowmem);
vm_get_memory_seg(ctx, 4 * GB, ret_highmem);
vm_get_memory_seg(ctx, 0, ret_lowmem, NULL);
vm_get_memory_seg(ctx, 4 * GB, ret_highmem, NULL);
}
static const char *