MFC - tracking commit

markm 2013-10-06 09:37:57 +00:00
parent 2368f0cbd1
commit 21998ad688
46 changed files with 2440 additions and 1295 deletions

View File

@ -124,7 +124,8 @@ vm_destroy(struct vmctx *vm)
}
int
vm_get_memory_seg(struct vmctx *ctx, vm_paddr_t gpa, size_t *ret_len)
vm_get_memory_seg(struct vmctx *ctx, vm_paddr_t gpa, size_t *ret_len,
int *wired)
{
int error;
struct vm_memory_segment seg;
@ -133,6 +134,8 @@ vm_get_memory_seg(struct vmctx *ctx, vm_paddr_t gpa, size_t *ret_len)
seg.gpa = gpa;
error = ioctl(ctx->fd, VM_GET_MEMORY_SEG, &seg);
*ret_len = seg.len;
if (wired != NULL)
*wired = seg.wired;
return (error);
}
@ -741,3 +744,23 @@ vcpu_reset(struct vmctx *vmctx, int vcpu)
done:
return (error);
}
int
vm_get_gpa_pmap(struct vmctx *ctx, uint64_t gpa, uint64_t *pte, int *num)
{
int error, i;
struct vm_gpa_pte gpapte;
bzero(&gpapte, sizeof(gpapte));
gpapte.gpa = gpa;
error = ioctl(ctx->fd, VM_GET_GPA_PMAP, &gpapte);
if (error == 0) {
*num = gpapte.ptenum;
for (i = 0; i < gpapte.ptenum; i++)
pte[i] = gpapte.pte[i];
}
return (error);
}
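
The two calls above are thin wrappers around new vmm ioctls. A minimal, hypothetical caller (not part of this commit; the guest name "testvm" and the error handling are illustrative) might look like this, using the prototypes from vmmapi.h shown below:

#include <sys/types.h>
#include <machine/vmm.h>
#include <vmmapi.h>

#include <stdio.h>
#include <stdlib.h>

int
main(void)
{
	struct vmctx *ctx;
	uint64_t pte[4];	/* struct vm_gpa_pte returns at most 4 entries */
	size_t len;
	int i, num, wired;

	ctx = vm_open("testvm");	/* placeholder guest name */
	if (ctx == NULL)
		exit(1);

	/* The new 'wired' out-parameter reports whether the segment is wired. */
	if (vm_get_memory_seg(ctx, 0, &len, &wired) == 0)
		printf("segment at 0: len %zu, wired %d\n", len, wired);

	/* Walk the nested page table entries mapping guest physical 0. */
	if (vm_get_gpa_pmap(ctx, 0, pte, &num) == 0) {
		for (i = 0; i < num; i++)
			printf("pte[%d] = %#lx\n", i, (unsigned long)pte[i]);
	}
	return (0);
}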

View File

@ -45,9 +45,11 @@ enum vm_mmap_style {
int vm_create(const char *name);
struct vmctx *vm_open(const char *name);
void vm_destroy(struct vmctx *ctx);
int vm_get_memory_seg(struct vmctx *ctx, vm_paddr_t gpa, size_t *ret_len);
int vm_get_memory_seg(struct vmctx *ctx, vm_paddr_t gpa, size_t *ret_len,
int *wired);
int vm_setup_memory(struct vmctx *ctx, size_t len, enum vm_mmap_style s);
void *vm_map_gpa(struct vmctx *ctx, vm_paddr_t gaddr, size_t len);
int vm_get_gpa_pmap(struct vmctx *, uint64_t gpa, uint64_t *pte, int *num);
uint32_t vm_get_lowmem_limit(struct vmctx *ctx);
void vm_set_lowmem_limit(struct vmctx *ctx, uint32_t limit);
int vm_set_desc(struct vmctx *ctx, int vcpu, int reg,

View File

@ -1574,7 +1574,7 @@ getmemsize(caddr_t kmdp, u_int64_t first)
/*
* map page into kernel: valid, read/write,non-cacheable
*/
*pte = pa | PG_V | PG_RW | PG_N;
*pte = pa | PG_V | PG_RW | PG_NC_PWT | PG_NC_PCD;
invltlb();
tmp = *(int *)ptr;

File diff suppressed because it is too large.

View File

@ -733,6 +733,14 @@ trap_pfault(frame, usermode)
}
}
/*
* If the trap was caused by errant bits in the PTE then panic.
*/
if (frame->tf_err & PGEX_RSV) {
trap_fatal(frame, eva);
return (-1);
}
/*
* PGEX_I is defined only if the execute disable bit capability is
* supported and enabled.
@ -822,10 +830,11 @@ trap_fatal(frame, eva)
#endif
if (type == T_PAGEFLT) {
printf("fault virtual address = 0x%lx\n", eva);
printf("fault code = %s %s %s, %s\n",
printf("fault code = %s %s %s%s, %s\n",
code & PGEX_U ? "user" : "supervisor",
code & PGEX_W ? "write" : "read",
code & PGEX_I ? "instruction" : "data",
code & PGEX_RSV ? " rsv" : "",
code & PGEX_P ? "protection violation" : "page not present");
}
printf("instruction pointer = 0x%lx:0x%lx\n",

View File

@ -62,7 +62,8 @@
u_int pc_cmci_mask; /* MCx banks for CMCI */ \
uint64_t pc_dbreg[16]; /* ddb debugging regs */ \
int pc_dbreg_cmd; /* ddb debugging reg cmd */ \
char __pad[161] /* be divisor of PAGE_SIZE \
u_int pc_vcpu_id; /* Xen vCPU ID */ \
char __pad[157] /* be divisor of PAGE_SIZE \
after cache alignment */
#define PC_DBREG_CMD_NONE 0

View File

@ -50,41 +50,74 @@
* of the fields not present here and there, depending on a lot of things.
*/
/* ---- Intel Nomenclature ---- */
#define PG_V 0x001 /* P Valid */
#define PG_RW 0x002 /* R/W Read/Write */
#define PG_U 0x004 /* U/S User/Supervisor */
#define PG_NC_PWT 0x008 /* PWT Write through */
#define PG_NC_PCD 0x010 /* PCD Cache disable */
#define PG_A 0x020 /* A Accessed */
#define PG_M 0x040 /* D Dirty */
#define PG_PS 0x080 /* PS Page size (0=4k,1=2M) */
#define PG_PTE_PAT 0x080 /* PAT PAT index */
#define PG_G 0x100 /* G Global */
#define PG_AVAIL1 0x200 /* / Available for system */
#define PG_AVAIL2 0x400 /* < programmers use */
#define PG_AVAIL3 0x800 /* \ */
#define PG_PDE_PAT 0x1000 /* PAT PAT index */
#define PG_NX (1ul<<63) /* No-execute */
/* Our various interpretations of the above */
#define PG_W PG_AVAIL1 /* "Wired" pseudoflag */
#define PG_MANAGED PG_AVAIL2
#define PG_FRAME (0x000ffffffffff000ul)
#define PG_PS_FRAME (0x000fffffffe00000ul)
#define PG_PROT (PG_RW|PG_U) /* all protection bits . */
#define PG_N (PG_NC_PWT|PG_NC_PCD) /* Non-cacheable */
#define X86_PG_V 0x001 /* P Valid */
#define X86_PG_RW 0x002 /* R/W Read/Write */
#define X86_PG_U 0x004 /* U/S User/Supervisor */
#define X86_PG_NC_PWT 0x008 /* PWT Write through */
#define X86_PG_NC_PCD 0x010 /* PCD Cache disable */
#define X86_PG_A 0x020 /* A Accessed */
#define X86_PG_M 0x040 /* D Dirty */
#define X86_PG_PS 0x080 /* PS Page size (0=4k,1=2M) */
#define X86_PG_PTE_PAT 0x080 /* PAT PAT index */
#define X86_PG_G 0x100 /* G Global */
#define X86_PG_AVAIL1 0x200 /* / Available for system */
#define X86_PG_AVAIL2 0x400 /* < programmers use */
#define X86_PG_AVAIL3 0x800 /* \ */
#define X86_PG_PDE_PAT 0x1000 /* PAT PAT index */
#define X86_PG_NX (1ul<<63) /* No-execute */
#define X86_PG_AVAIL(x) (1ul << (x))
/* Page level cache control fields used to determine the PAT type */
#define PG_PDE_CACHE (PG_PDE_PAT | PG_NC_PWT | PG_NC_PCD)
#define PG_PTE_CACHE (PG_PTE_PAT | PG_NC_PWT | PG_NC_PCD)
#define X86_PG_PDE_CACHE (X86_PG_PDE_PAT | X86_PG_NC_PWT | X86_PG_NC_PCD)
#define X86_PG_PTE_CACHE (X86_PG_PTE_PAT | X86_PG_NC_PWT | X86_PG_NC_PCD)
/*
* Intel extended page table (EPT) bit definitions.
*/
#define EPT_PG_READ 0x001 /* R Read */
#define EPT_PG_WRITE 0x002 /* W Write */
#define EPT_PG_EXECUTE 0x004 /* X Execute */
#define EPT_PG_IGNORE_PAT 0x040 /* IPAT Ignore PAT */
#define EPT_PG_PS 0x080 /* PS Page size */
#define EPT_PG_A 0x100 /* A Accessed */
#define EPT_PG_M 0x200 /* D Dirty */
#define EPT_PG_MEMORY_TYPE(x) ((x) << 3) /* MT Memory Type */
/*
* Define the PG_xx macros in terms of the bits on x86 PTEs.
*/
#define PG_V X86_PG_V
#define PG_RW X86_PG_RW
#define PG_U X86_PG_U
#define PG_NC_PWT X86_PG_NC_PWT
#define PG_NC_PCD X86_PG_NC_PCD
#define PG_A X86_PG_A
#define PG_M X86_PG_M
#define PG_PS X86_PG_PS
#define PG_PTE_PAT X86_PG_PTE_PAT
#define PG_G X86_PG_G
#define PG_AVAIL1 X86_PG_AVAIL1
#define PG_AVAIL2 X86_PG_AVAIL2
#define PG_AVAIL3 X86_PG_AVAIL3
#define PG_PDE_PAT X86_PG_PDE_PAT
#define PG_NX X86_PG_NX
#define PG_PDE_CACHE X86_PG_PDE_CACHE
#define PG_PTE_CACHE X86_PG_PTE_CACHE
/* Our various interpretations of the above */
#define PG_W X86_PG_AVAIL3 /* "Wired" pseudoflag */
#define PG_MANAGED X86_PG_AVAIL2
#define EPT_PG_EMUL_V X86_PG_AVAIL(52)
#define EPT_PG_EMUL_RW X86_PG_AVAIL(53)
#define PG_FRAME (0x000ffffffffff000ul)
#define PG_PS_FRAME (0x000fffffffe00000ul)
/*
* Promotion to a 2MB (PDE) page mapping requires that the corresponding 4KB
* (PTE) page mappings have identical settings for the following fields:
*/
#define PG_PTE_PROMOTE (PG_NX | PG_MANAGED | PG_W | PG_G | PG_PTE_PAT | \
PG_M | PG_A | PG_NC_PCD | PG_NC_PWT | PG_U | PG_RW | PG_V)
#define PG_PTE_PROMOTE (PG_NX | PG_MANAGED | PG_W | PG_G | PG_PTE_CACHE | \
PG_M | PG_A | PG_U | PG_RW | PG_V)
/*
* Page Protection Exception bits
@ -96,6 +129,28 @@
#define PGEX_RSV 0x08 /* reserved PTE field is non-zero */
#define PGEX_I 0x10 /* during an instruction fetch */
/*
* undef the PG_xx macros that define bits in the regular x86 PTEs that
* have a different position in nested PTEs. This is done when compiling
* code that needs to be aware of the differences between regular x86 and
* nested PTEs.
*
* The appropriate bitmask will be calculated at runtime based on the pmap
* type.
*/
#ifdef AMD64_NPT_AWARE
#undef PG_AVAIL1 /* X86_PG_AVAIL1 aliases with EPT_PG_M */
#undef PG_G
#undef PG_A
#undef PG_M
#undef PG_PDE_PAT
#undef PG_PDE_CACHE
#undef PG_PTE_PAT
#undef PG_PTE_CACHE
#undef PG_RW
#undef PG_V
#endif
/*
* Pte related macros. This is complicated by having to deal with
* the sign extension of the 48th bit.
@ -256,6 +311,11 @@ struct pmap {
int pm_flags;
};
/* flags */
#define PMAP_PDE_SUPERPAGE (1 << 0) /* supports 2MB superpages */
#define PMAP_EMULATE_AD_BITS (1 << 1) /* needs A/D bits emulation */
#define PMAP_SUPPORTS_EXEC_ONLY (1 << 2) /* execute only mappings ok */
typedef struct pmap *pmap_t;
#ifdef _KERNEL
@ -272,6 +332,9 @@ extern struct pmap kernel_pmap_store;
#define PMAP_MTX(pmap) (&(pmap)->pm_mtx)
#define PMAP_TRYLOCK(pmap) mtx_trylock(&(pmap)->pm_mtx)
#define PMAP_UNLOCK(pmap) mtx_unlock(&(pmap)->pm_mtx)
int pmap_pinit_type(pmap_t pmap, enum pmap_type pm_type, int flags);
int pmap_emulate_accessed_dirty(pmap_t pmap, vm_offset_t va, int ftype);
#endif
/*
@ -330,7 +393,7 @@ void pmap_invalidate_all(pmap_t);
void pmap_invalidate_cache(void);
void pmap_invalidate_cache_pages(vm_page_t *pages, int count);
void pmap_invalidate_cache_range(vm_offset_t sva, vm_offset_t eva);
void pmap_get_mapping(pmap_t pmap, vm_offset_t va, uint64_t *ptr, int *num);
#endif /* _KERNEL */
#endif /* !LOCORE */
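
The comment above notes that, under AMD64_NPT_AWARE, the PG_xx bits that differ between regular x86 PTEs and nested (EPT) PTEs must be chosen at runtime from the pmap type; the corresponding pmap implementation changes are in the large suppressed diff earlier in this commit. A hedged sketch of that pattern follows, using a hypothetical helper; it assumes the pmap records its type in a pm_type field and that a PT_X86 enumerator exists alongside the PT_EPT one used by ept.c later in this commit:

/*
 * Sketch only: pmap_valid_bit() is a hypothetical helper showing how
 * NPT-aware code can pick the "valid" bits at runtime.  An x86 pmap uses
 * the architectural present bit, while an EPT pmap either uses the
 * software-emulated valid bit or treats a readable entry as valid.
 */
static __inline pt_entry_t
pmap_valid_bit(pmap_t pmap)
{
	pt_entry_t mask;

	switch (pmap->pm_type) {
	case PT_X86:
		mask = X86_PG_V;
		break;
	case PT_EPT:
		if (pmap->pm_flags & PMAP_EMULATE_AD_BITS)
			mask = EPT_PG_EMUL_V;
		else
			mask = EPT_PG_READ;
		break;
	default:
		panic("pmap_valid_bit: invalid pm_type %d", pmap->pm_type);
	}
	return (mask);
}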

View File

@ -39,19 +39,18 @@ struct seg_desc;
struct vm_exit;
struct vm_run;
struct vlapic;
struct vmspace;
struct vm_object;
struct pmap;
enum x2apic_state;
typedef int (*vmm_init_func_t)(void);
typedef int (*vmm_cleanup_func_t)(void);
typedef void * (*vmi_init_func_t)(struct vm *vm); /* instance specific apis */
typedef int (*vmi_run_func_t)(void *vmi, int vcpu, register_t rip);
typedef void * (*vmi_init_func_t)(struct vm *vm, struct pmap *pmap);
typedef int (*vmi_run_func_t)(void *vmi, int vcpu, register_t rip,
struct pmap *pmap);
typedef void (*vmi_cleanup_func_t)(void *vmi);
typedef int (*vmi_mmap_set_func_t)(void *vmi, vm_paddr_t gpa,
vm_paddr_t hpa, size_t length,
vm_memattr_t attr, int prot,
boolean_t superpages_ok);
typedef vm_paddr_t (*vmi_mmap_get_func_t)(void *vmi, vm_paddr_t gpa);
typedef int (*vmi_get_register_t)(void *vmi, int vcpu, int num,
uint64_t *retval);
typedef int (*vmi_set_register_t)(void *vmi, int vcpu, int num,
@ -65,6 +64,8 @@ typedef int (*vmi_inject_event_t)(void *vmi, int vcpu,
uint32_t code, int code_valid);
typedef int (*vmi_get_cap_t)(void *vmi, int vcpu, int num, int *retval);
typedef int (*vmi_set_cap_t)(void *vmi, int vcpu, int num, int val);
typedef struct vmspace * (*vmi_vmspace_alloc)(vm_offset_t min, vm_offset_t max);
typedef void (*vmi_vmspace_free)(struct vmspace *vmspace);
struct vmm_ops {
vmm_init_func_t init; /* module wide initialization */
@ -73,8 +74,6 @@ struct vmm_ops {
vmi_init_func_t vminit; /* vm-specific initialization */
vmi_run_func_t vmrun;
vmi_cleanup_func_t vmcleanup;
vmi_mmap_set_func_t vmmmap_set;
vmi_mmap_get_func_t vmmmap_get;
vmi_get_register_t vmgetreg;
vmi_set_register_t vmsetreg;
vmi_get_desc_t vmgetdesc;
@ -82,6 +81,8 @@ struct vmm_ops {
vmi_inject_event_t vminject;
vmi_get_cap_t vmgetcap;
vmi_set_cap_t vmsetcap;
vmi_vmspace_alloc vmspace_alloc;
vmi_vmspace_free vmspace_free;
};
extern struct vmm_ops vmm_ops_intel;
@ -93,9 +94,14 @@ const char *vm_name(struct vm *vm);
int vm_malloc(struct vm *vm, vm_paddr_t gpa, size_t len);
int vm_map_mmio(struct vm *vm, vm_paddr_t gpa, size_t len, vm_paddr_t hpa);
int vm_unmap_mmio(struct vm *vm, vm_paddr_t gpa, size_t len);
vm_paddr_t vm_gpa2hpa(struct vm *vm, vm_paddr_t gpa, size_t size);
void *vm_gpa_hold(struct vm *, vm_paddr_t gpa, size_t len, int prot,
void **cookie);
void vm_gpa_release(void *cookie);
int vm_gpabase2memseg(struct vm *vm, vm_paddr_t gpabase,
struct vm_memory_segment *seg);
int vm_get_memobj(struct vm *vm, vm_paddr_t gpa, size_t len,
vm_offset_t *offset, struct vm_object **object);
boolean_t vm_mem_allocated(struct vm *vm, vm_paddr_t gpa);
int vm_get_register(struct vm *vm, int vcpu, int reg, uint64_t *retval);
int vm_set_register(struct vm *vm, int vcpu, int reg, uint64_t val);
int vm_get_seg_desc(struct vm *vm, int vcpu, int reg,
@ -130,8 +136,9 @@ void *vm_iommu_domain(struct vm *vm);
enum vcpu_state {
VCPU_IDLE,
VCPU_FROZEN,
VCPU_RUNNING,
VCPU_CANNOT_RUN,
VCPU_SLEEPING,
};
int vcpu_set_state(struct vm *vm, int vcpu, enum vcpu_state state);
@ -145,7 +152,9 @@ vcpu_is_running(struct vm *vm, int vcpu, int *hostcpu)
void *vcpu_stats(struct vm *vm, int vcpu);
void vm_interrupt_hostcpu(struct vm *vm, int vcpu);
struct vmspace *vm_get_vmspace(struct vm *vm);
int vm_assign_pptdev(struct vm *vm, int bus, int slot, int func);
int vm_unassign_pptdev(struct vm *vm, int bus, int slot, int func);
#endif /* KERNEL */
#include <machine/vmm_instruction_emul.h>
@ -247,6 +256,7 @@ enum vm_exitcode {
VM_EXITCODE_MTRAP,
VM_EXITCODE_PAUSE,
VM_EXITCODE_PAGING,
VM_EXITCODE_INST_EMUL,
VM_EXITCODE_SPINUP_AP,
VM_EXITCODE_MAX
};
@ -266,8 +276,15 @@ struct vm_exit {
} inout;
struct {
uint64_t gpa;
struct vie vie;
int fault_type;
int protection;
} paging;
struct {
uint64_t gpa;
uint64_t gla;
uint64_t cr3;
struct vie vie;
} inst_emul;
/*
* VMX specific payload. Used when there is no "better"
* exitcode to represent the VM-exit.
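
The vm_exit union above now separates a nested page fault (VM_EXITCODE_PAGING, carrying the gpa, fault type and allowed protection) from an access that requires instruction emulation (VM_EXITCODE_INST_EMUL, carrying gpa/gla/cr3 and the decoded instruction). A sketch of how a consumer of this structure might dispatch on the two exit codes; handle_paging() and handle_inst_emul() are placeholders, not functions added by this commit:

/* Placeholder handlers for illustration only. */
static int handle_paging(struct vm *vm, int vcpu, uint64_t gpa, int ftype);
static int handle_inst_emul(struct vm *vm, int vcpu, uint64_t gpa,
    uint64_t gla, uint64_t cr3, struct vie *vie);

static int
handle_exit(struct vm *vm, int vcpu, struct vm_exit *vme)
{
	switch (vme->exitcode) {
	case VM_EXITCODE_PAGING:
		/* Fault on memory the guest owns: fix up the mapping. */
		return (handle_paging(vm, vcpu, vme->u.paging.gpa,
		    vme->u.paging.fault_type));
	case VM_EXITCODE_INST_EMUL:
		/* Access outside guest memory: emulate the instruction. */
		return (handle_inst_emul(vm, vcpu, vme->u.inst_emul.gpa,
		    vme->u.inst_emul.gla, vme->u.inst_emul.cr3,
		    &vme->u.inst_emul.vie));
	default:
		return (EINVAL);
	}
}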

View File

@ -36,7 +36,8 @@ int vmmdev_cleanup(void);
struct vm_memory_segment {
vm_paddr_t gpa; /* in */
size_t len; /* in */
size_t len;
int wired;
};
struct vm_register {
@ -135,6 +136,12 @@ struct vm_x2apic {
enum x2apic_state state;
};
struct vm_gpa_pte {
uint64_t gpa; /* in */
uint64_t pte[4]; /* out */
int ptenum;
};
enum {
/* general routines */
IOCNUM_ABIVERS = 0,
@ -145,6 +152,7 @@ enum {
/* memory apis */
IOCNUM_MAP_MEMORY = 10,
IOCNUM_GET_MEMORY_SEG = 11,
IOCNUM_GET_GPA_PMAP = 12,
/* register/state accessors */
IOCNUM_SET_REGISTER = 20,
@ -215,4 +223,6 @@ enum {
_IOW('v', IOCNUM_SET_X2APIC_STATE, struct vm_x2apic)
#define VM_GET_X2APIC_STATE \
_IOWR('v', IOCNUM_GET_X2APIC_STATE, struct vm_x2apic)
#define VM_GET_GPA_PMAP \
_IOWR('v', IOCNUM_GET_GPA_PMAP, struct vm_gpa_pte)
#endif

View File

@ -102,11 +102,15 @@ int vmm_emulate_instruction(void *vm, int cpuid, uint64_t gpa, struct vie *vie,
#ifdef _KERNEL
/*
* APIs to fetch and decode the instruction from nested page fault handler.
*
* 'vie' must be initialized before calling 'vmm_fetch_instruction()'
*/
int vmm_fetch_instruction(struct vm *vm, int cpuid,
uint64_t rip, int inst_length, uint64_t cr3,
struct vie *vie);
void vie_init(struct vie *vie);
/*
* Decode the instruction fetched into 'vie' so it can be emulated.
*
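
The note above spells out the calling contract: vie_init() must run before vmm_fetch_instruction(), and only a fetched and decoded instruction can be emulated. A minimal sketch of that ordering, reusing the local APIC MMIO callbacks from the call that this commit removes from vmx.c (error codes and required headers are illustrative, not prescribed by this header):

static int
emulate_fault(struct vm *vm, int vcpuid, uint64_t rip, int inst_length,
    uint64_t cr3, uint64_t gla, uint64_t gpa)
{
	struct vie vie;

	vie_init(&vie);			/* must precede the fetch */

	if (vmm_fetch_instruction(vm, vcpuid, rip, inst_length, cr3, &vie) != 0)
		return (EFAULT);
	if (vmm_decode_instruction(vm, vcpuid, gla, &vie) != 0)
		return (EINVAL);

	/* Emulate against the guest physical address that faulted. */
	return (vmm_emulate_instruction(vm, vcpuid, gpa, &vie,
	    lapic_mmio_read, lapic_mmio_write, 0));
}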

View File

@ -54,7 +54,7 @@ amdv_cleanup(void)
}
static void *
amdv_vminit(struct vm *vm)
amdv_vminit(struct vm *vm, struct pmap *pmap)
{
printf("amdv_vminit: not implemented\n");
@ -62,7 +62,7 @@ amdv_vminit(struct vm *vm)
}
static int
amdv_vmrun(void *arg, int vcpu, register_t rip)
amdv_vmrun(void *arg, int vcpu, register_t rip, struct pmap *pmap)
{
printf("amdv_vmrun: not implemented\n");
@ -77,23 +77,6 @@ amdv_vmcleanup(void *arg)
return;
}
static int
amdv_vmmmap_set(void *arg, vm_paddr_t gpa, vm_paddr_t hpa, size_t length,
vm_memattr_t attr, int prot, boolean_t spok)
{
printf("amdv_vmmmap_set: not implemented\n");
return (EINVAL);
}
static vm_paddr_t
amdv_vmmmap_get(void *arg, vm_paddr_t gpa)
{
printf("amdv_vmmmap_get: not implemented\n");
return (EINVAL);
}
static int
amdv_getreg(void *arg, int vcpu, int regnum, uint64_t *retval)
{
@ -151,21 +134,37 @@ amdv_setcap(void *arg, int vcpu, int type, int val)
return (EINVAL);
}
static struct vmspace *
amdv_vmspace_alloc(vm_offset_t min, vm_offset_t max)
{
printf("amdv_vmspace_alloc: not implemented\n");
return (NULL);
}
static void
amdv_vmspace_free(struct vmspace *vmspace)
{
printf("amdv_vmspace_free: not implemented\n");
return;
}
struct vmm_ops vmm_ops_amd = {
amdv_init,
amdv_cleanup,
amdv_vminit,
amdv_vmrun,
amdv_vmcleanup,
amdv_vmmmap_set,
amdv_vmmmap_get,
amdv_getreg,
amdv_setreg,
amdv_getdesc,
amdv_setdesc,
amdv_inject_event,
amdv_getcap,
amdv_setcap
amdv_setcap,
amdv_vmspace_alloc,
amdv_vmspace_free,
};
static int

View File

@ -29,32 +29,31 @@
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
#include <sys/param.h>
#include <sys/kernel.h>
#include <sys/types.h>
#include <sys/errno.h>
#include <sys/systm.h>
#include <sys/malloc.h>
#include <sys/smp.h>
#include <sys/sysctl.h>
#include <vm/vm.h>
#include <vm/pmap.h>
#include <machine/param.h>
#include <machine/cpufunc.h>
#include <machine/pmap.h>
#include <machine/vmparam.h>
#include <vm/vm_extern.h>
#include <machine/vmm.h>
#include "vmx_cpufunc.h"
#include "vmx_msr.h"
#include "vmx.h"
#include "ept.h"
#define EPT_SUPPORTS_EXEC_ONLY(cap) ((cap) & (1UL << 0))
#define EPT_PWL4(cap) ((cap) & (1UL << 6))
#define EPT_MEMORY_TYPE_WB(cap) ((cap) & (1UL << 14))
#define EPT_PDE_SUPERPAGE(cap) ((cap) & (1UL << 16)) /* 2MB pages */
#define EPT_PDPTE_SUPERPAGE(cap) ((cap) & (1UL << 17)) /* 1GB pages */
#define INVVPID_SUPPORTED(cap) ((cap) & (1UL << 32))
#define INVEPT_SUPPORTED(cap) ((cap) & (1UL << 20))
#define AD_BITS_SUPPORTED(cap) ((cap) & (1UL << 21))
#define INVVPID_SUPPORTED(cap) ((cap) & (1UL << 32))
#define INVVPID_ALL_TYPES_MASK 0xF0000000000UL
#define INVVPID_ALL_TYPES_SUPPORTED(cap) \
@ -64,28 +63,22 @@ __FBSDID("$FreeBSD$");
#define INVEPT_ALL_TYPES_SUPPORTED(cap) \
(((cap) & INVEPT_ALL_TYPES_MASK) == INVEPT_ALL_TYPES_MASK)
#define EPT_PG_RD (1 << 0)
#define EPT_PG_WR (1 << 1)
#define EPT_PG_EX (1 << 2)
#define EPT_PG_MEMORY_TYPE(x) ((x) << 3)
#define EPT_PG_IGNORE_PAT (1 << 6)
#define EPT_PG_SUPERPAGE (1 << 7)
#define EPT_PWLEVELS 4 /* page walk levels */
#define EPT_ENABLE_AD_BITS (1 << 6)
#define EPT_ADDR_MASK ((uint64_t)-1 << 12)
SYSCTL_DECL(_hw_vmm);
SYSCTL_NODE(_hw_vmm, OID_AUTO, ept, CTLFLAG_RW, NULL, NULL);
MALLOC_DECLARE(M_VMX);
static int ept_enable_ad_bits;
static uint64_t page_sizes_mask;
/*
* Set this to 1 to have the EPT tables respect the guest PAT settings
*/
static int ept_pat_passthru;
static int ept_pmap_flags;
SYSCTL_INT(_hw_vmm_ept, OID_AUTO, pmap_flags, CTLFLAG_RD,
&ept_pmap_flags, 0, NULL);
int
ept_init(void)
{
int page_shift;
int use_hw_ad_bits, use_superpages, use_exec_only;
uint64_t cap;
cap = rdmsr(MSR_VMX_EPT_VPID_CAP);
@ -105,17 +98,22 @@ ept_init(void)
!INVEPT_ALL_TYPES_SUPPORTED(cap))
return (EINVAL);
/* Set bits in 'page_sizes_mask' for each valid page size */
page_shift = PAGE_SHIFT;
page_sizes_mask = 1UL << page_shift; /* 4KB page */
use_superpages = 1;
TUNABLE_INT_FETCH("hw.vmm.ept.use_superpages", &use_superpages);
if (use_superpages && EPT_PDE_SUPERPAGE(cap))
ept_pmap_flags |= PMAP_PDE_SUPERPAGE; /* 2MB superpage */
page_shift += 9;
if (EPT_PDE_SUPERPAGE(cap))
page_sizes_mask |= 1UL << page_shift; /* 2MB superpage */
use_hw_ad_bits = 1;
TUNABLE_INT_FETCH("hw.vmm.ept.use_hw_ad_bits", &use_hw_ad_bits);
if (use_hw_ad_bits && AD_BITS_SUPPORTED(cap))
ept_enable_ad_bits = 1;
else
ept_pmap_flags |= PMAP_EMULATE_AD_BITS;
page_shift += 9;
if (EPT_PDPTE_SUPERPAGE(cap))
page_sizes_mask |= 1UL << page_shift; /* 1GB superpage */
use_exec_only = 1;
TUNABLE_INT_FETCH("hw.vmm.ept.use_exec_only", &use_exec_only);
if (use_exec_only && EPT_SUPPORTS_EXEC_ONLY(cap))
ept_pmap_flags |= PMAP_SUPPORTS_EXEC_ONLY;
return (0);
}
@ -154,233 +152,6 @@ ept_dump(uint64_t *ptp, int nlevels)
}
#endif
static size_t
ept_create_mapping(uint64_t *ptp, vm_paddr_t gpa, vm_paddr_t hpa, size_t length,
vm_memattr_t attr, vm_prot_t prot, boolean_t spok)
{
int spshift, ptpshift, ptpindex, nlevels;
/*
* Compute the size of the mapping that we can accommodate.
*
* This is based on three factors:
* - super page sizes supported by the processor
* - alignment of the region starting at 'gpa' and 'hpa'
* - length of the region 'len'
*/
spshift = PAGE_SHIFT;
if (spok)
spshift += (EPT_PWLEVELS - 1) * 9;
while (spshift >= PAGE_SHIFT) {
uint64_t spsize = 1UL << spshift;
if ((page_sizes_mask & spsize) != 0 &&
(gpa & (spsize - 1)) == 0 &&
(hpa & (spsize - 1)) == 0 &&
length >= spsize) {
break;
}
spshift -= 9;
}
if (spshift < PAGE_SHIFT) {
panic("Invalid spshift for gpa 0x%016lx, hpa 0x%016lx, "
"length 0x%016lx, page_sizes_mask 0x%016lx",
gpa, hpa, length, page_sizes_mask);
}
nlevels = EPT_PWLEVELS;
while (--nlevels >= 0) {
ptpshift = PAGE_SHIFT + nlevels * 9;
ptpindex = (gpa >> ptpshift) & 0x1FF;
/* We have reached the leaf mapping */
if (spshift >= ptpshift)
break;
/*
* We are working on a non-leaf page table page.
*
* Create the next level page table page if necessary and point
* to it from the current page table.
*/
if (ptp[ptpindex] == 0) {
void *nlp = malloc(PAGE_SIZE, M_VMX, M_WAITOK | M_ZERO);
ptp[ptpindex] = vtophys(nlp);
ptp[ptpindex] |= EPT_PG_RD | EPT_PG_WR | EPT_PG_EX;
}
/* Work our way down to the next level page table page */
ptp = (uint64_t *)PHYS_TO_DMAP(ptp[ptpindex] & EPT_ADDR_MASK);
}
if ((gpa & ((1UL << ptpshift) - 1)) != 0) {
panic("ept_create_mapping: gpa 0x%016lx and ptpshift %d "
"mismatch\n", gpa, ptpshift);
}
if (prot != VM_PROT_NONE) {
/* Do the mapping */
ptp[ptpindex] = hpa;
/* Apply the access controls */
if (prot & VM_PROT_READ)
ptp[ptpindex] |= EPT_PG_RD;
if (prot & VM_PROT_WRITE)
ptp[ptpindex] |= EPT_PG_WR;
if (prot & VM_PROT_EXECUTE)
ptp[ptpindex] |= EPT_PG_EX;
/*
* By default the PAT type is ignored - this appears to
* be how other hypervisors handle EPT. Allow this to be
* overridden.
*/
ptp[ptpindex] |= EPT_PG_MEMORY_TYPE(attr);
if (!ept_pat_passthru)
ptp[ptpindex] |= EPT_PG_IGNORE_PAT;
if (nlevels > 0)
ptp[ptpindex] |= EPT_PG_SUPERPAGE;
} else {
/* Remove the mapping */
ptp[ptpindex] = 0;
}
return (1UL << ptpshift);
}
static vm_paddr_t
ept_lookup_mapping(uint64_t *ptp, vm_paddr_t gpa)
{
int nlevels, ptpshift, ptpindex;
uint64_t ptpval, hpabase, pgmask;
nlevels = EPT_PWLEVELS;
while (--nlevels >= 0) {
ptpshift = PAGE_SHIFT + nlevels * 9;
ptpindex = (gpa >> ptpshift) & 0x1FF;
ptpval = ptp[ptpindex];
/* Cannot make progress beyond this point */
if ((ptpval & (EPT_PG_RD | EPT_PG_WR | EPT_PG_EX)) == 0)
break;
if (nlevels == 0 || (ptpval & EPT_PG_SUPERPAGE)) {
pgmask = (1UL << ptpshift) - 1;
hpabase = ptpval & ~pgmask;
return (hpabase | (gpa & pgmask));
}
/* Work our way down to the next level page table page */
ptp = (uint64_t *)PHYS_TO_DMAP(ptpval & EPT_ADDR_MASK);
}
return ((vm_paddr_t)-1);
}
static void
ept_free_pt_entry(pt_entry_t pte)
{
if (pte == 0)
return;
/* sanity check */
if ((pte & EPT_PG_SUPERPAGE) != 0)
panic("ept_free_pt_entry: pte cannot have superpage bit");
return;
}
static void
ept_free_pd_entry(pd_entry_t pde)
{
pt_entry_t *pt;
int i;
if (pde == 0)
return;
if ((pde & EPT_PG_SUPERPAGE) == 0) {
pt = (pt_entry_t *)PHYS_TO_DMAP(pde & EPT_ADDR_MASK);
for (i = 0; i < NPTEPG; i++)
ept_free_pt_entry(pt[i]);
free(pt, M_VMX); /* free the page table page */
}
}
static void
ept_free_pdp_entry(pdp_entry_t pdpe)
{
pd_entry_t *pd;
int i;
if (pdpe == 0)
return;
if ((pdpe & EPT_PG_SUPERPAGE) == 0) {
pd = (pd_entry_t *)PHYS_TO_DMAP(pdpe & EPT_ADDR_MASK);
for (i = 0; i < NPDEPG; i++)
ept_free_pd_entry(pd[i]);
free(pd, M_VMX); /* free the page directory page */
}
}
static void
ept_free_pml4_entry(pml4_entry_t pml4e)
{
pdp_entry_t *pdp;
int i;
if (pml4e == 0)
return;
if ((pml4e & EPT_PG_SUPERPAGE) == 0) {
pdp = (pdp_entry_t *)PHYS_TO_DMAP(pml4e & EPT_ADDR_MASK);
for (i = 0; i < NPDPEPG; i++)
ept_free_pdp_entry(pdp[i]);
free(pdp, M_VMX); /* free the page directory ptr page */
}
}
void
ept_vmcleanup(struct vmx *vmx)
{
int i;
for (i = 0; i < NPML4EPG; i++)
ept_free_pml4_entry(vmx->pml4ept[i]);
}
int
ept_vmmmap_set(void *arg, vm_paddr_t gpa, vm_paddr_t hpa, size_t len,
vm_memattr_t attr, int prot, boolean_t spok)
{
size_t n;
struct vmx *vmx = arg;
while (len > 0) {
n = ept_create_mapping(vmx->pml4ept, gpa, hpa, len, attr,
prot, spok);
len -= n;
gpa += n;
hpa += n;
}
return (0);
}
vm_paddr_t
ept_vmmmap_get(void *arg, vm_paddr_t gpa)
{
vm_paddr_t hpa;
struct vmx *vmx;
vmx = arg;
hpa = ept_lookup_mapping(vmx->pml4ept, gpa);
return (hpa);
}
static void
invept_single_context(void *arg)
{
@ -390,11 +161,44 @@ invept_single_context(void *arg)
}
void
ept_invalidate_mappings(u_long pml4ept)
ept_invalidate_mappings(u_long eptp)
{
struct invept_desc invept_desc = { 0 };
invept_desc.eptp = EPTP(pml4ept);
invept_desc.eptp = eptp;
smp_rendezvous(NULL, invept_single_context, NULL, &invept_desc);
}
static int
ept_pinit(pmap_t pmap)
{
return (pmap_pinit_type(pmap, PT_EPT, ept_pmap_flags));
}
struct vmspace *
ept_vmspace_alloc(vm_offset_t min, vm_offset_t max)
{
return (vmspace_alloc(min, max, ept_pinit));
}
void
ept_vmspace_free(struct vmspace *vmspace)
{
vmspace_free(vmspace);
}
uint64_t
eptp(uint64_t pml4)
{
uint64_t eptp_val;
eptp_val = pml4 | (EPT_PWLEVELS - 1) << 3 | PAT_WRITE_BACK;
if (ept_enable_ad_bits)
eptp_val |= EPT_ENABLE_AD_BITS;
return (eptp_val);
}
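
The new eptp() helper packs the EPT pointer format: bits 5:3 hold the page-walk length minus one, the low three bits hold the memory type, and bit 6 enables hardware A/D tracking when ept_enable_ad_bits is set. A worked example, assuming PAT_WRITE_BACK has its usual x86 value of 6:

/*
 * eptp(0x123456000) with hardware A/D bits enabled:
 *
 *   pml4 physical address        0x123456000
 *   (EPT_PWLEVELS - 1) << 3      0x018   four-level page walk
 *   PAT_WRITE_BACK               0x006   write-back memory type
 *   EPT_ENABLE_AD_BITS           0x040   only if ept_enable_ad_bits != 0
 *   -----------------------------------------
 *   resulting EPTP               0x12345605e
 */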

View File

@ -31,13 +31,9 @@
struct vmx;
#define EPT_PWLEVELS 4 /* page walk levels */
#define EPTP(pml4) ((pml4) | (EPT_PWLEVELS - 1) << 3 | PAT_WRITE_BACK)
int ept_init(void);
int ept_vmmmap_set(void *arg, vm_paddr_t gpa, vm_paddr_t hpa, size_t length,
vm_memattr_t attr, int prot, boolean_t allow_superpage_mappings);
vm_paddr_t ept_vmmmap_get(void *arg, vm_paddr_t gpa);
void ept_invalidate_mappings(u_long ept_pml4);
void ept_vmcleanup(struct vmx *vmx);
void ept_invalidate_mappings(u_long eptp);
struct vmspace *ept_vmspace_alloc(vm_offset_t min, vm_offset_t max);
void ept_vmspace_free(struct vmspace *vmspace);
uint64_t eptp(uint64_t pml4);
#endif

View File

@ -318,14 +318,14 @@ done:
int
vmcs_set_defaults(struct vmcs *vmcs,
u_long host_rip, u_long host_rsp, u_long ept_pml4,
u_long host_rip, u_long host_rsp, uint64_t eptp,
uint32_t pinbased_ctls, uint32_t procbased_ctls,
uint32_t procbased_ctls2, uint32_t exit_ctls,
uint32_t entry_ctls, u_long msr_bitmap, uint16_t vpid)
{
int error, codesel, datasel, tsssel;
u_long cr0, cr4, efer;
uint64_t eptp, pat, fsbase, idtrbase;
uint64_t pat, fsbase, idtrbase;
uint32_t exc_bitmap;
codesel = vmm_get_host_codesel();
@ -432,7 +432,6 @@ vmcs_set_defaults(struct vmcs *vmcs,
goto done;
/* eptp */
eptp = EPTP(ept_pml4);
if ((error = vmwrite(VMCS_EPTP, eptp)) != 0)
goto done;

View File

@ -47,7 +47,7 @@ struct msr_entry {
int vmcs_set_msr_save(struct vmcs *vmcs, u_long g_area, u_int g_count);
int vmcs_set_defaults(struct vmcs *vmcs, u_long host_rip, u_long host_rsp,
u_long ept_pml4,
uint64_t eptp,
uint32_t pinbased_ctls, uint32_t procbased_ctls,
uint32_t procbased_ctls2, uint32_t exit_ctls,
uint32_t entry_ctls, u_long msr_bitmap,
@ -68,6 +68,8 @@ uint64_t vmcs_read(uint32_t encoding);
#define vmcs_guest_cr3() vmcs_read(VMCS_GUEST_CR3)
#define vmcs_gpa() vmcs_read(VMCS_GUEST_PHYSICAL_ADDRESS)
#define vmcs_gla() vmcs_read(VMCS_GUEST_LINEAR_ADDRESS)
#define vmcs_idt_vectoring_info() vmcs_read(VMCS_IDT_VECTORING_INFO)
#define vmcs_idt_vectoring_err() vmcs_read(VMCS_IDT_VECTORING_ERROR)
#endif /* _KERNEL */
@ -313,6 +315,12 @@ uint64_t vmcs_read(uint32_t encoding);
#define VMCS_INTERRUPTION_INFO_HW_INTR (0 << 8)
#define VMCS_INTERRUPTION_INFO_NMI (2 << 8)
/*
* VMCS IDT-Vectoring information fields
*/
#define VMCS_IDT_VEC_VALID (1 << 31)
#define VMCS_IDT_VEC_ERRCODE_VALID (1 << 11)
/*
* VMCS Guest interruptibility field
*/
@ -332,6 +340,9 @@ uint64_t vmcs_read(uint32_t encoding);
#define EPT_VIOLATION_DATA_READ (1UL << 0)
#define EPT_VIOLATION_DATA_WRITE (1UL << 1)
#define EPT_VIOLATION_INST_FETCH (1UL << 2)
#define EPT_VIOLATION_GPA_READABLE (1UL << 3)
#define EPT_VIOLATION_GPA_WRITEABLE (1UL << 4)
#define EPT_VIOLATION_GPA_EXECUTABLE (1UL << 5)
#define EPT_VIOLATION_GLA_VALID (1UL << 7)
#define EPT_VIOLATION_XLAT_VALID (1UL << 8)

View File

@ -49,8 +49,6 @@ __FBSDID("$FreeBSD$");
#include <machine/specialreg.h>
#include <machine/vmparam.h>
#include <x86/apicreg.h>
#include <machine/vmm.h>
#include "vmm_host.h"
#include "vmm_lapic.h"
@ -167,9 +165,6 @@ static int cap_pause_exit;
static int cap_unrestricted_guest;
static int cap_monitor_trap;
/* statistics */
static VMM_STAT_INTEL(VMEXIT_HLT_IGNORED, "number of times hlt was ignored");
static struct unrhdr *vpid_unr;
static u_int vpid_alloc_failed;
SYSCTL_UINT(_hw_vmm_vmx, OID_AUTO, vpid_alloc_failed, CTLFLAG_RD,
@ -740,7 +735,7 @@ vmx_setup_cr_shadow(int which, struct vmcs *vmcs, uint32_t initial)
#define vmx_setup_cr4_shadow(vmcs,init) vmx_setup_cr_shadow(4, (vmcs), (init))
static void *
vmx_vminit(struct vm *vm)
vmx_vminit(struct vm *vm, pmap_t pmap)
{
uint16_t vpid[VM_MAXCPU];
int i, error, guest_msr_count;
@ -753,6 +748,8 @@ vmx_vminit(struct vm *vm)
}
vmx->vm = vm;
vmx->eptp = eptp(vtophys((vm_offset_t)pmap->pm_pml4));
/*
* Clean up EPTP-tagged guest physical and combined mappings
*
@ -762,7 +759,7 @@ vmx_vminit(struct vm *vm)
*
* Combined mappings for this EP4TA are also invalidated for all VPIDs.
*/
ept_invalidate_mappings(vtophys(vmx->pml4ept));
ept_invalidate_mappings(vmx->eptp);
msr_bitmap_initialize(vmx->msr_bitmap);
@ -818,7 +815,7 @@ vmx_vminit(struct vm *vm)
error = vmcs_set_defaults(&vmx->vmcs[i],
(u_long)vmx_longjmp,
(u_long)&vmx->ctx[i],
vtophys(vmx->pml4ept),
vmx->eptp,
pinbased_ctls,
procbased_ctls,
procbased_ctls2,
@ -856,6 +853,9 @@ vmx_vminit(struct vm *vm)
error = vmx_setup_cr4_shadow(&vmx->vmcs[i], 0);
if (error != 0)
panic("vmx_setup_cr4_shadow %d", error);
vmx->ctx[i].pmap = pmap;
vmx->ctx[i].eptp = vmx->eptp;
}
return (vmx);
@ -1281,21 +1281,49 @@ vmx_emulate_cr_access(struct vmx *vmx, int vcpu, uint64_t exitqual)
}
static int
vmx_ept_fault(struct vm *vm, int cpu,
uint64_t gla, uint64_t gpa, uint64_t rip, int inst_length,
uint64_t cr3, uint64_t ept_qual, struct vie *vie)
ept_fault_type(uint64_t ept_qual)
{
int read, write, error;
int fault_type;
/* EPT violation on an instruction fetch doesn't make sense here */
if (ept_qual & EPT_VIOLATION_DATA_WRITE)
fault_type = VM_PROT_WRITE;
else if (ept_qual & EPT_VIOLATION_INST_FETCH)
fault_type = VM_PROT_EXECUTE;
else
fault_type = VM_PROT_READ;
return (fault_type);
}
static int
ept_protection(uint64_t ept_qual)
{
int prot = 0;
if (ept_qual & EPT_VIOLATION_GPA_READABLE)
prot |= VM_PROT_READ;
if (ept_qual & EPT_VIOLATION_GPA_WRITEABLE)
prot |= VM_PROT_WRITE;
if (ept_qual & EPT_VIOLATION_GPA_EXECUTABLE)
prot |= VM_PROT_EXECUTE;
return (prot);
}
static boolean_t
ept_emulation_fault(uint64_t ept_qual)
{
int read, write;
/* EPT fault on an instruction fetch doesn't make sense here */
if (ept_qual & EPT_VIOLATION_INST_FETCH)
return (UNHANDLED);
return (FALSE);
/* EPT violation must be a read fault or a write fault */
/* EPT fault must be a read fault or a write fault */
read = ept_qual & EPT_VIOLATION_DATA_READ ? 1 : 0;
write = ept_qual & EPT_VIOLATION_DATA_WRITE ? 1 : 0;
if ((read | write) == 0)
return (UNHANDLED);
return (FALSE);
/*
* The EPT violation must have been caused by accessing a
@ -1304,26 +1332,10 @@ vmx_ept_fault(struct vm *vm, int cpu,
*/
if ((ept_qual & EPT_VIOLATION_GLA_VALID) == 0 ||
(ept_qual & EPT_VIOLATION_XLAT_VALID) == 0) {
return (UNHANDLED);
return (FALSE);
}
/* Fetch, decode and emulate the faulting instruction */
if (vmm_fetch_instruction(vm, cpu, rip, inst_length, cr3, vie) != 0)
return (UNHANDLED);
if (vmm_decode_instruction(vm, cpu, gla, vie) != 0)
return (UNHANDLED);
/*
* Check if this is a local apic access
*/
if (gpa < DEFAULT_APIC_BASE || gpa >= DEFAULT_APIC_BASE + PAGE_SIZE)
return (UNHANDLED);
error = vmm_emulate_instruction(vm, cpu, gpa, vie,
lapic_mmio_read, lapic_mmio_write, 0);
return (error ? UNHANDLED : HANDLED);
return (TRUE);
}
static int
@ -1332,18 +1344,47 @@ vmx_exit_process(struct vmx *vmx, int vcpu, struct vm_exit *vmexit)
int error, handled;
struct vmcs *vmcs;
struct vmxctx *vmxctx;
uint32_t eax, ecx, edx;
uint64_t qual, gla, gpa, cr3, intr_info;
uint32_t eax, ecx, edx, idtvec_info, idtvec_err, reason;
uint64_t qual, gpa;
handled = 0;
vmcs = &vmx->vmcs[vcpu];
vmxctx = &vmx->ctx[vcpu];
qual = vmexit->u.vmx.exit_qualification;
reason = vmexit->u.vmx.exit_reason;
vmexit->exitcode = VM_EXITCODE_BOGUS;
vmm_stat_incr(vmx->vm, vcpu, VMEXIT_COUNT, 1);
switch (vmexit->u.vmx.exit_reason) {
/*
* VM exits that could be triggered during event injection on the
* previous VM entry need to be handled specially by re-injecting
* the event.
*
* See "Information for VM Exits During Event Delivery" in Intel SDM
* for details.
*/
switch (reason) {
case EXIT_REASON_EPT_FAULT:
case EXIT_REASON_EPT_MISCONFIG:
case EXIT_REASON_APIC:
case EXIT_REASON_TASK_SWITCH:
case EXIT_REASON_EXCEPTION:
idtvec_info = vmcs_idt_vectoring_info();
if (idtvec_info & VMCS_IDT_VEC_VALID) {
idtvec_info &= ~(1 << 12); /* clear undefined bit */
vmwrite(VMCS_ENTRY_INTR_INFO, idtvec_info);
if (idtvec_info & VMCS_IDT_VEC_ERRCODE_VALID) {
idtvec_err = vmcs_idt_vectoring_err();
vmwrite(VMCS_ENTRY_EXCEPTION_ERROR, idtvec_err);
}
vmwrite(VMCS_ENTRY_INST_LENGTH, vmexit->inst_length);
}
default:
break;
}
switch (reason) {
case EXIT_REASON_CR_ACCESS:
vmm_stat_incr(vmx->vm, vcpu, VMEXIT_CR_ACCESS, 1);
handled = vmx_emulate_cr_access(vmx, vcpu, qual);
@ -1374,19 +1415,7 @@ vmx_exit_process(struct vmx *vmx, int vcpu, struct vm_exit *vmexit)
break;
case EXIT_REASON_HLT:
vmm_stat_incr(vmx->vm, vcpu, VMEXIT_HLT, 1);
/*
* If there is an event waiting to be injected then there is
* no need to 'hlt'.
*/
error = vmread(VMCS_ENTRY_INTR_INFO, &intr_info);
if (error)
panic("vmx_exit_process: vmread(intrinfo) %d", error);
if (intr_info & VMCS_INTERRUPTION_INFO_VALID) {
handled = 1;
vmm_stat_incr(vmx->vm, vcpu, VMEXIT_HLT_IGNORED, 1);
} else
vmexit->exitcode = VM_EXITCODE_HLT;
vmexit->exitcode = VM_EXITCODE_HLT;
break;
case EXIT_REASON_MTF:
vmm_stat_incr(vmx->vm, vcpu, VMEXIT_MTRAP, 1);
@ -1440,15 +1469,22 @@ vmx_exit_process(struct vmx *vmx, int vcpu, struct vm_exit *vmexit)
break;
case EXIT_REASON_EPT_FAULT:
vmm_stat_incr(vmx->vm, vcpu, VMEXIT_EPT_FAULT, 1);
gla = vmcs_gla();
/*
* If 'gpa' lies within the address space allocated to
* memory then this must be a nested page fault otherwise
* this must be an instruction that accesses MMIO space.
*/
gpa = vmcs_gpa();
cr3 = vmcs_guest_cr3();
handled = vmx_ept_fault(vmx->vm, vcpu, gla, gpa,
vmexit->rip, vmexit->inst_length,
cr3, qual, &vmexit->u.paging.vie);
if (!handled) {
if (vm_mem_allocated(vmx->vm, gpa)) {
vmexit->exitcode = VM_EXITCODE_PAGING;
vmexit->u.paging.gpa = gpa;
vmexit->u.paging.fault_type = ept_fault_type(qual);
vmexit->u.paging.protection = ept_protection(qual);
} else if (ept_emulation_fault(qual)) {
vmexit->exitcode = VM_EXITCODE_INST_EMUL;
vmexit->u.inst_emul.gpa = gpa;
vmexit->u.inst_emul.gla = vmcs_gla();
vmexit->u.inst_emul.cr3 = vmcs_guest_cr3();
}
break;
default:
@ -1470,14 +1506,6 @@ vmx_exit_process(struct vmx *vmx, int vcpu, struct vm_exit *vmexit)
vm_exit_update_rip(vmexit);
vmexit->rip += vmexit->inst_length;
vmexit->inst_length = 0;
/*
* Special case for spinning up an AP - exit to userspace to
* give the controlling process a chance to intercept and
* spin up a thread for the AP.
*/
if (vmexit->exitcode == VM_EXITCODE_SPINUP_AP)
handled = 0;
} else {
if (vmexit->exitcode == VM_EXITCODE_BOGUS) {
/*
@ -1497,7 +1525,7 @@ vmx_exit_process(struct vmx *vmx, int vcpu, struct vm_exit *vmexit)
}
static int
vmx_run(void *arg, int vcpu, register_t rip)
vmx_run(void *arg, int vcpu, register_t rip, pmap_t pmap)
{
int error, vie, rc, handled, astpending;
uint32_t exit_reason;
@ -1505,7 +1533,7 @@ vmx_run(void *arg, int vcpu, register_t rip)
struct vmxctx *vmxctx;
struct vmcs *vmcs;
struct vm_exit *vmexit;
vmx = arg;
vmcs = &vmx->vmcs[vcpu];
vmxctx = &vmx->ctx[vcpu];
@ -1514,6 +1542,11 @@ vmx_run(void *arg, int vcpu, register_t rip)
astpending = 0;
vmexit = vm_exitinfo(vmx->vm, vcpu);
KASSERT(vmxctx->pmap == pmap,
("pmap %p different than ctx pmap %p", pmap, vmxctx->pmap));
KASSERT(vmxctx->eptp == vmx->eptp,
("eptp %p different than ctx eptp %#lx", eptp, vmxctx->eptp));
/*
* XXX Can we avoid doing this every time we do a vm run?
*/
@ -1576,6 +1609,9 @@ vmx_run(void *arg, int vcpu, register_t rip)
vmxctx->launch_error, vie);
#endif
goto err_exit;
case VMX_RETURN_INVEPT:
panic("vm %s:%d invept error %d",
vm_name(vmx->vm), vcpu, vmxctx->launch_error);
default:
panic("vmx_setjmp returned %d", rc);
}
@ -1654,7 +1690,6 @@ vmx_vmcleanup(void *arg)
if (error != 0)
panic("vmx_vmcleanup: vmclear error %d on vcpu 0", error);
ept_vmcleanup(vmx);
free(vmx, M_VMX);
return;
@ -2000,13 +2035,13 @@ struct vmm_ops vmm_ops_intel = {
vmx_vminit,
vmx_run,
vmx_vmcleanup,
ept_vmmmap_set,
ept_vmmmap_get,
vmx_getreg,
vmx_setreg,
vmx_getdesc,
vmx_setdesc,
vmx_inject,
vmx_getcap,
vmx_setcap
vmx_setcap,
ept_vmspace_alloc,
ept_vmspace_free,
};

View File

@ -31,6 +31,8 @@
#include "vmcs.h"
struct pmap;
#define GUEST_MSR_MAX_ENTRIES 64 /* arbitrary */
struct vmxctx {
@ -68,6 +70,15 @@ struct vmxctx {
int launched; /* vmcs launch state */
int launch_error;
long eptgen[MAXCPU]; /* cached pmap->pm_eptgen */
/*
* The 'eptp' and the 'pmap' do not change during the lifetime of
* the VM so it is safe to keep a copy in each vcpu's vmxctx.
*/
vm_paddr_t eptp;
struct pmap *pmap;
};
struct vmxcap {
@ -82,16 +93,15 @@ struct vmxstate {
/* virtual machine softc */
struct vmx {
pml4_entry_t pml4ept[NPML4EPG];
struct vmcs vmcs[VM_MAXCPU]; /* one vmcs per virtual cpu */
char msr_bitmap[PAGE_SIZE];
struct msr_entry guest_msrs[VM_MAXCPU][GUEST_MSR_MAX_ENTRIES];
struct vmxctx ctx[VM_MAXCPU];
struct vmxcap cap[VM_MAXCPU];
struct vmxstate state[VM_MAXCPU];
uint64_t eptp;
struct vm *vm;
};
CTASSERT((offsetof(struct vmx, pml4ept) & PAGE_MASK) == 0);
CTASSERT((offsetof(struct vmx, vmcs) & PAGE_MASK) == 0);
CTASSERT((offsetof(struct vmx, msr_bitmap) & PAGE_MASK) == 0);
CTASSERT((offsetof(struct vmx, guest_msrs) & 15) == 0);
@ -101,6 +111,7 @@ CTASSERT((offsetof(struct vmx, guest_msrs) & 15) == 0);
#define VMX_RETURN_VMRESUME 2
#define VMX_RETURN_VMLAUNCH 3
#define VMX_RETURN_AST 4
#define VMX_RETURN_INVEPT 5
/*
* vmx_setjmp() returns:
* - 0 when it returns directly
@ -108,6 +119,7 @@ CTASSERT((offsetof(struct vmx, guest_msrs) & 15) == 0);
* - 2 when it returns from vmx_resume (which would only be in the error case)
* - 3 when it returns from vmx_launch (which would only be in the error case)
* - 4 when it returns from vmx_resume or vmx_launch because of AST pending
* - 5 when it returns from vmx_launch/vmx_resume because of invept error
*/
int vmx_setjmp(struct vmxctx *ctx);
void vmx_longjmp(void); /* returns via vmx_setjmp */

View File

@ -72,6 +72,10 @@ ASSYM(VMXCTX_HOST_RBX, offsetof(struct vmxctx, host_rbx));
ASSYM(VMXCTX_HOST_RIP, offsetof(struct vmxctx, host_rip));
ASSYM(VMXCTX_LAUNCH_ERROR, offsetof(struct vmxctx, launch_error));
ASSYM(VMXCTX_EPTGEN, offsetof(struct vmxctx, eptgen));
ASSYM(VMXCTX_PMAP, offsetof(struct vmxctx, pmap));
ASSYM(VMXCTX_EPTP, offsetof(struct vmxctx, eptp));
ASSYM(VM_SUCCESS, VM_SUCCESS);
ASSYM(VM_FAIL_INVALID, VM_FAIL_INVALID);
@ -82,8 +86,13 @@ ASSYM(VMX_RETURN_LONGJMP, VMX_RETURN_LONGJMP);
ASSYM(VMX_RETURN_VMRESUME, VMX_RETURN_VMRESUME);
ASSYM(VMX_RETURN_VMLAUNCH, VMX_RETURN_VMLAUNCH);
ASSYM(VMX_RETURN_AST, VMX_RETURN_AST);
ASSYM(VMX_RETURN_INVEPT, VMX_RETURN_INVEPT);
ASSYM(TDF_ASTPENDING, TDF_ASTPENDING);
ASSYM(TDF_NEEDRESCHED, TDF_NEEDRESCHED);
ASSYM(TD_FLAGS, offsetof(struct thread, td_flags));
ASSYM(PC_CURTHREAD, offsetof(struct pcpu, pc_curthread));
ASSYM(PC_CPUID, offsetof(struct pcpu, pc_cpuid));
ASSYM(PM_ACTIVE, offsetof(struct pmap, pm_active));
ASSYM(PM_EPTGEN, offsetof(struct pmap, pm_eptgen));

View File

@ -30,6 +30,12 @@
#include "vmx_assym.s"
#ifdef SMP
#define LK lock ;
#else
#define LK
#endif
/*
* Disable interrupts before updating %rsp in VMX_CHECK_AST or
* VMX_GUEST_RESTORE.
@ -86,15 +92,73 @@
movq VMXCTX_GUEST_R15(%rdi),%r15; \
movq VMXCTX_GUEST_RDI(%rdi),%rdi; /* restore rdi the last */
#define VM_INSTRUCTION_ERROR(reg) \
/*
* Check for an error after executing a VMX instruction.
* 'errreg' will be zero on success and non-zero otherwise.
* 'ctxreg' points to the 'struct vmxctx' associated with the vcpu.
*/
#define VM_INSTRUCTION_ERROR(errreg, ctxreg) \
jnc 1f; \
movl $VM_FAIL_INVALID,reg; /* CF is set */ \
movl $VM_FAIL_INVALID,errreg; /* CF is set */ \
jmp 3f; \
1: jnz 2f; \
movl $VM_FAIL_VALID,reg; /* ZF is set */ \
movl $VM_FAIL_VALID,errreg; /* ZF is set */ \
jmp 3f; \
2: movl $VM_SUCCESS,reg; \
3: movl reg,VMXCTX_LAUNCH_ERROR(%rsp)
2: movl $VM_SUCCESS,errreg; \
3: movl errreg,VMXCTX_LAUNCH_ERROR(ctxreg)
/*
* set or clear the appropriate bit in 'pm_active'
* %rdi = vmxctx
* %rax, %r11 = scratch registers
*/
#define VMX_SET_PM_ACTIVE \
movq VMXCTX_PMAP(%rdi), %r11; \
movl PCPU(CPUID), %eax; \
LK btsl %eax, PM_ACTIVE(%r11)
#define VMX_CLEAR_PM_ACTIVE \
movq VMXCTX_PMAP(%rdi), %r11; \
movl PCPU(CPUID), %eax; \
LK btrl %eax, PM_ACTIVE(%r11)
/*
* If 'vmxctx->eptgen[curcpu]' is not identical to 'pmap->pm_eptgen'
* then we must invalidate all mappings associated with this eptp.
*
* %rdi = vmxctx
* %rax, %rbx, %r11 = scratch registers
*/
#define VMX_CHECK_EPTGEN \
movl PCPU(CPUID), %ebx; \
movq VMXCTX_PMAP(%rdi), %r11; \
movq PM_EPTGEN(%r11), %rax; \
cmpq %rax, VMXCTX_EPTGEN(%rdi, %rbx, 8); \
je 9f; \
\
/* Refresh 'vmxctx->eptgen[curcpu]' */ \
movq %rax, VMXCTX_EPTGEN(%rdi, %rbx, 8); \
\
/* Setup the invept descriptor at the top of tmpstk */ \
mov %rdi, %r11; \
addq $VMXCTX_TMPSTKTOP, %r11; \
movq VMXCTX_EPTP(%rdi), %rax; \
movq %rax, -16(%r11); \
movq $0x0, -8(%r11); \
mov $0x1, %eax; /* Single context invalidate */ \
invept -16(%r11), %rax; \
\
/* Check for invept error */ \
VM_INSTRUCTION_ERROR(%eax, %rdi); \
testl %eax, %eax; \
jz 9f; \
\
/* Return via vmx_setjmp with retval of VMX_RETURN_INVEPT */ \
movq $VMX_RETURN_INVEPT, %rsi; \
movq %rdi,%rsp; \
addq $VMXCTX_TMPSTKTOP, %rsp; \
callq vmx_return; \
9: ;
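
For readers who prefer C, here is a rough restatement of what VMX_CHECK_EPTGEN does just before VM entry. It is illustrative only: the real work must stay in assembly on the tmpstk, the invept() wrapper is assumed rather than taken from this commit, and the error path that returns VMX_RETURN_INVEPT is omitted.

/* Rough C equivalent of VMX_CHECK_EPTGEN (illustration only). */
static void
vmx_check_eptgen(struct vmxctx *vmxctx, int cpu)
{
	struct invept_desc desc = { 0 };
	long eptgen;

	eptgen = vmxctx->pmap->pm_eptgen;
	if (vmxctx->eptgen[cpu] == eptgen)
		return;			/* cached EPT mappings are current */

	/* Record the generation that the invalidation below makes visible. */
	vmxctx->eptgen[cpu] = eptgen;

	/* Single-context (type 1) invalidation of mappings for this EPTP. */
	desc.eptp = vmxctx->eptp;
	invept(1UL, desc);		/* invept() wrapper assumed here */
}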
.text
/*
@ -129,6 +193,9 @@ END(vmx_setjmp)
* Return to vmm context through vmx_setjmp() with a value of 'retval'.
*/
ENTRY(vmx_return)
/* The pmap is no longer active on the host cpu */
VMX_CLEAR_PM_ACTIVE
/* Restore host context. */
movq VMXCTX_HOST_R15(%rdi),%r15
movq VMXCTX_HOST_R14(%rdi),%r14
@ -193,6 +260,10 @@ ENTRY(vmx_resume)
VMX_CHECK_AST
VMX_SET_PM_ACTIVE /* This vcpu is now active on the host cpu */
VMX_CHECK_EPTGEN /* Check if we have to invalidate TLB */
/*
* Restore guest state that is not automatically loaded from the vmcs.
*/
@ -203,7 +274,7 @@ ENTRY(vmx_resume)
/*
* Capture the reason why vmresume failed.
*/
VM_INSTRUCTION_ERROR(%eax)
VM_INSTRUCTION_ERROR(%eax, %rsp)
/* Return via vmx_setjmp with return value of VMX_RETURN_VMRESUME */
movq %rsp,%rdi
@ -225,6 +296,10 @@ ENTRY(vmx_launch)
VMX_CHECK_AST
VMX_SET_PM_ACTIVE /* This vcpu is now active on the host cpu */
VMX_CHECK_EPTGEN /* Check if we have to invalidate TLB */
/*
* Restore guest state that is not automatically loaded from the vmcs.
*/
@ -235,7 +310,7 @@ ENTRY(vmx_launch)
/*
* Capture the reason why vmlaunch failed.
*/
VM_INSTRUCTION_ERROR(%eax)
VM_INSTRUCTION_ERROR(%eax, %rsp)
/* Return via vmx_setjmp with return value of VMX_RETURN_VMLAUNCH */
movq %rsp,%rdi

View File

@ -281,6 +281,43 @@ ppt_teardown_msix(struct pptdev *ppt)
ppt->msix.num_msgs = 0;
}
int
ppt_num_devices(struct vm *vm)
{
int i, num;
num = 0;
for (i = 0; i < num_pptdevs; i++) {
if (pptdevs[i].vm == vm)
num++;
}
return (num);
}
boolean_t
ppt_is_mmio(struct vm *vm, vm_paddr_t gpa)
{
int i, n;
struct pptdev *ppt;
struct vm_memory_segment *seg;
for (n = 0; n < num_pptdevs; n++) {
ppt = &pptdevs[n];
if (ppt->vm != vm)
continue;
for (i = 0; i < MAX_MMIOSEGS; i++) {
seg = &ppt->mmio[i];
if (seg->len == 0)
continue;
if (gpa >= seg->gpa && gpa < seg->gpa + seg->len)
return (TRUE);
}
}
return (FALSE);
}
int
ppt_assign_device(struct vm *vm, int bus, int slot, int func)
{
@ -336,7 +373,7 @@ ppt_unassign_all(struct vm *vm)
bus = pci_get_bus(dev);
slot = pci_get_slot(dev);
func = pci_get_function(dev);
ppt_unassign_device(vm, bus, slot, func);
vm_unassign_pptdev(vm, bus, slot, func);
}
}
@ -591,10 +628,3 @@ ppt_setup_msix(struct vm *vm, int vcpu, int bus, int slot, int func,
return (0);
}
int
ppt_num_devices(void)
{
return (num_pptdevs);
}

View File

@ -29,14 +29,20 @@
#ifndef _IO_PPT_H_
#define _IO_PPT_H_
int ppt_assign_device(struct vm *vm, int bus, int slot, int func);
int ppt_unassign_device(struct vm *vm, int bus, int slot, int func);
int ppt_unassign_all(struct vm *vm);
int ppt_map_mmio(struct vm *vm, int bus, int slot, int func,
vm_paddr_t gpa, size_t len, vm_paddr_t hpa);
int ppt_setup_msi(struct vm *vm, int vcpu, int bus, int slot, int func,
int destcpu, int vector, int numvec);
int ppt_setup_msix(struct vm *vm, int vcpu, int bus, int slot, int func,
int idx, uint32_t msg, uint32_t vector_control, uint64_t addr);
int ppt_num_devices(void);
int idx, uint32_t msg, uint32_t vector_control, uint64_t addr);
int ppt_num_devices(struct vm *vm);
boolean_t ppt_is_mmio(struct vm *vm, vm_paddr_t gpa);
/*
* The following functions should never be called directly.
* Use 'vm_assign_pptdev()' and 'vm_unassign_pptdev()' instead.
*/
int ppt_assign_device(struct vm *vm, int bus, int slot, int func);
int ppt_unassign_device(struct vm *vm, int bus, int slot, int func);
#endif

View File

@ -39,18 +39,28 @@ __FBSDID("$FreeBSD$");
#include <sys/lock.h>
#include <sys/mutex.h>
#include <sys/proc.h>
#include <sys/rwlock.h>
#include <sys/sched.h>
#include <sys/smp.h>
#include <sys/systm.h>
#include <vm/vm.h>
#include <vm/vm_object.h>
#include <vm/vm_page.h>
#include <vm/pmap.h>
#include <vm/vm_map.h>
#include <vm/vm_extern.h>
#include <vm/vm_param.h>
#include <machine/vm.h>
#include <machine/pcb.h>
#include <machine/smp.h>
#include <x86/apicreg.h>
#include <machine/pmap.h>
#include <machine/vmparam.h>
#include <machine/vmm.h>
#include "vmm_ktr.h"
#include "vmm_host.h"
#include "vmm_mem.h"
#include "vmm_util.h"
@ -84,15 +94,23 @@ struct vcpu {
#define vcpu_lock_init(v) mtx_init(&((v)->mtx), "vcpu lock", 0, MTX_SPIN)
#define vcpu_lock(v) mtx_lock_spin(&((v)->mtx))
#define vcpu_unlock(v) mtx_unlock_spin(&((v)->mtx))
#define vcpu_assert_locked(v) mtx_assert(&((v)->mtx), MA_OWNED)
struct mem_seg {
vm_paddr_t gpa;
size_t len;
boolean_t wired;
vm_object_t object;
};
#define VM_MAX_MEMORY_SEGMENTS 2
struct vm {
void *cookie; /* processor-specific data */
void *iommu; /* iommu-specific data */
struct vmspace *vmspace; /* guest's address space */
struct vcpu vcpu[VM_MAXCPU];
int num_mem_segs;
struct vm_memory_segment mem_segs[VM_MAX_MEMORY_SEGMENTS];
struct mem_seg mem_segs[VM_MAX_MEMORY_SEGMENTS];
char name[VM_MAX_NAMELEN];
/*
@ -109,16 +127,14 @@ static struct vmm_ops *ops;
#define VMM_INIT() (ops != NULL ? (*ops->init)() : 0)
#define VMM_CLEANUP() (ops != NULL ? (*ops->cleanup)() : 0)
#define VMINIT(vm) (ops != NULL ? (*ops->vminit)(vm): NULL)
#define VMRUN(vmi, vcpu, rip) \
(ops != NULL ? (*ops->vmrun)(vmi, vcpu, rip) : ENXIO)
#define VMINIT(vm, pmap) (ops != NULL ? (*ops->vminit)(vm, pmap): NULL)
#define VMRUN(vmi, vcpu, rip, pmap) \
(ops != NULL ? (*ops->vmrun)(vmi, vcpu, rip, pmap) : ENXIO)
#define VMCLEANUP(vmi) (ops != NULL ? (*ops->vmcleanup)(vmi) : NULL)
#define VMMMAP_SET(vmi, gpa, hpa, len, attr, prot, spm) \
(ops != NULL ? \
(*ops->vmmmap_set)(vmi, gpa, hpa, len, attr, prot, spm) : \
ENXIO)
#define VMMMAP_GET(vmi, gpa) \
(ops != NULL ? (*ops->vmmmap_get)(vmi, gpa) : ENXIO)
#define VMSPACE_ALLOC(min, max) \
(ops != NULL ? (*ops->vmspace_alloc)(min, max) : NULL)
#define VMSPACE_FREE(vmspace) \
(ops != NULL ? (*ops->vmspace_free)(vmspace) : ENXIO)
#define VMGETREG(vmi, vcpu, num, retval) \
(ops != NULL ? (*ops->vmgetreg)(vmi, vcpu, num, retval) : ENXIO)
#define VMSETREG(vmi, vcpu, num, val) \
@ -213,8 +229,7 @@ vmm_handler(module_t mod, int what, void *arg)
switch (what) {
case MOD_LOAD:
vmmdev_init();
if (ppt_num_devices() > 0)
iommu_init();
iommu_init();
error = vmm_init();
if (error == 0)
vmm_initialized = 1;
@ -265,7 +280,7 @@ vm_create(const char *name, struct vm **retvm)
{
int i;
struct vm *vm;
vm_paddr_t maxaddr;
struct vmspace *vmspace;
const int BSP = 0;
@ -279,59 +294,34 @@ vm_create(const char *name, struct vm **retvm)
if (name == NULL || strlen(name) >= VM_MAX_NAMELEN)
return (EINVAL);
vmspace = VMSPACE_ALLOC(VM_MIN_ADDRESS, VM_MAXUSER_ADDRESS);
if (vmspace == NULL)
return (ENOMEM);
vm = malloc(sizeof(struct vm), M_VM, M_WAITOK | M_ZERO);
strcpy(vm->name, name);
vm->cookie = VMINIT(vm);
vm->cookie = VMINIT(vm, vmspace_pmap(vmspace));
for (i = 0; i < VM_MAXCPU; i++) {
vcpu_init(vm, i);
guest_msrs_init(vm, i);
}
maxaddr = vmm_mem_maxaddr();
vm->iommu = iommu_create_domain(maxaddr);
vm_activate_cpu(vm, BSP);
vm->vmspace = vmspace;
*retvm = vm;
return (0);
}
static void
vm_free_mem_seg(struct vm *vm, struct vm_memory_segment *seg)
vm_free_mem_seg(struct vm *vm, struct mem_seg *seg)
{
size_t len;
vm_paddr_t hpa;
void *host_domain;
host_domain = iommu_host_domain();
if (seg->object != NULL)
vmm_mem_free(vm->vmspace, seg->gpa, seg->len);
len = 0;
while (len < seg->len) {
hpa = vm_gpa2hpa(vm, seg->gpa + len, PAGE_SIZE);
if (hpa == (vm_paddr_t)-1) {
panic("vm_free_mem_segs: cannot free hpa "
"associated with gpa 0x%016lx", seg->gpa + len);
}
/*
* Remove the 'gpa' to 'hpa' mapping in VMs domain.
* And resurrect the 1:1 mapping for 'hpa' in 'host_domain'.
*/
iommu_remove_mapping(vm->iommu, seg->gpa + len, PAGE_SIZE);
iommu_create_mapping(host_domain, hpa, hpa, PAGE_SIZE);
vmm_mem_free(hpa, PAGE_SIZE);
len += PAGE_SIZE;
}
/*
* Invalidate cached translations associated with 'vm->iommu' since
* we have now moved some pages from it.
*/
iommu_invalidate_tlb(vm->iommu);
bzero(seg, sizeof(struct vm_memory_segment));
bzero(seg, sizeof(*seg));
}
void
@ -341,6 +331,9 @@ vm_destroy(struct vm *vm)
ppt_unassign_all(vm);
if (vm->iommu != NULL)
iommu_destroy_domain(vm->iommu);
for (i = 0; i < vm->num_mem_segs; i++)
vm_free_mem_seg(vm, &vm->mem_segs[i]);
@ -349,7 +342,7 @@ vm_destroy(struct vm *vm)
for (i = 0; i < VM_MAXCPU; i++)
vcpu_cleanup(&vm->vcpu[i]);
iommu_destroy_domain(vm->iommu);
VMSPACE_FREE(vm->vmspace);
VMCLEANUP(vm->cookie);
@ -365,52 +358,48 @@ vm_name(struct vm *vm)
int
vm_map_mmio(struct vm *vm, vm_paddr_t gpa, size_t len, vm_paddr_t hpa)
{
const boolean_t spok = TRUE; /* superpage mappings are ok */
vm_object_t obj;
return (VMMMAP_SET(vm->cookie, gpa, hpa, len, VM_MEMATTR_UNCACHEABLE,
VM_PROT_RW, spok));
if ((obj = vmm_mmio_alloc(vm->vmspace, gpa, len, hpa)) == NULL)
return (ENOMEM);
else
return (0);
}
int
vm_unmap_mmio(struct vm *vm, vm_paddr_t gpa, size_t len)
{
const boolean_t spok = TRUE; /* superpage mappings are ok */
return (VMMMAP_SET(vm->cookie, gpa, 0, len, 0,
VM_PROT_NONE, spok));
vmm_mmio_free(vm->vmspace, gpa, len);
return (0);
}
/*
* Returns TRUE if 'gpa' is available for allocation and FALSE otherwise
*/
static boolean_t
vm_gpa_available(struct vm *vm, vm_paddr_t gpa)
boolean_t
vm_mem_allocated(struct vm *vm, vm_paddr_t gpa)
{
int i;
vm_paddr_t gpabase, gpalimit;
if (gpa & PAGE_MASK)
panic("vm_gpa_available: gpa (0x%016lx) not page aligned", gpa);
for (i = 0; i < vm->num_mem_segs; i++) {
gpabase = vm->mem_segs[i].gpa;
gpalimit = gpabase + vm->mem_segs[i].len;
if (gpa >= gpabase && gpa < gpalimit)
return (FALSE);
return (TRUE); /* 'gpa' is regular memory */
}
return (TRUE);
if (ppt_is_mmio(vm, gpa))
return (TRUE); /* 'gpa' is pci passthru mmio */
return (FALSE);
}
int
vm_malloc(struct vm *vm, vm_paddr_t gpa, size_t len)
{
int error, available, allocated;
struct vm_memory_segment *seg;
vm_paddr_t g, hpa;
void *host_domain;
const boolean_t spok = TRUE; /* superpage mappings are ok */
int available, allocated;
struct mem_seg *seg;
vm_object_t object;
vm_paddr_t g;
if ((gpa & PAGE_MASK) || (len & PAGE_MASK) || len == 0)
return (EINVAL);
@ -418,10 +407,10 @@ vm_malloc(struct vm *vm, vm_paddr_t gpa, size_t len)
available = allocated = 0;
g = gpa;
while (g < gpa + len) {
if (vm_gpa_available(vm, g))
available++;
else
if (vm_mem_allocated(vm, g))
allocated++;
else
available++;
g += PAGE_SIZE;
}
@ -443,61 +432,203 @@ vm_malloc(struct vm *vm, vm_paddr_t gpa, size_t len)
if (vm->num_mem_segs >= VM_MAX_MEMORY_SEGMENTS)
return (E2BIG);
host_domain = iommu_host_domain();
seg = &vm->mem_segs[vm->num_mem_segs];
error = 0;
if ((object = vmm_mem_alloc(vm->vmspace, gpa, len)) == NULL)
return (ENOMEM);
seg->gpa = gpa;
seg->len = 0;
while (seg->len < len) {
hpa = vmm_mem_alloc(PAGE_SIZE);
if (hpa == 0) {
error = ENOMEM;
break;
}
error = VMMMAP_SET(vm->cookie, gpa + seg->len, hpa, PAGE_SIZE,
VM_MEMATTR_WRITE_BACK, VM_PROT_ALL, spok);
if (error)
break;
/*
* Remove the 1:1 mapping for 'hpa' from the 'host_domain'.
* Add mapping for 'gpa + seg->len' to 'hpa' in the VMs domain.
*/
iommu_remove_mapping(host_domain, hpa, PAGE_SIZE);
iommu_create_mapping(vm->iommu, gpa + seg->len, hpa, PAGE_SIZE);
seg->len += PAGE_SIZE;
}
if (error) {
vm_free_mem_seg(vm, seg);
return (error);
}
/*
* Invalidate cached translations associated with 'host_domain' since
* we have now moved some pages from it.
*/
iommu_invalidate_tlb(host_domain);
seg->len = len;
seg->object = object;
seg->wired = FALSE;
vm->num_mem_segs++;
return (0);
}
vm_paddr_t
vm_gpa2hpa(struct vm *vm, vm_paddr_t gpa, size_t len)
static void
vm_gpa_unwire(struct vm *vm)
{
vm_paddr_t nextpage;
int i, rv;
struct mem_seg *seg;
nextpage = rounddown(gpa + PAGE_SIZE, PAGE_SIZE);
if (len > nextpage - gpa)
panic("vm_gpa2hpa: invalid gpa/len: 0x%016lx/%lu", gpa, len);
for (i = 0; i < vm->num_mem_segs; i++) {
seg = &vm->mem_segs[i];
if (!seg->wired)
continue;
return (VMMMAP_GET(vm->cookie, gpa));
rv = vm_map_unwire(&vm->vmspace->vm_map,
seg->gpa, seg->gpa + seg->len,
VM_MAP_WIRE_USER | VM_MAP_WIRE_NOHOLES);
KASSERT(rv == KERN_SUCCESS, ("vm(%s) memory segment "
"%#lx/%ld could not be unwired: %d",
vm_name(vm), seg->gpa, seg->len, rv));
seg->wired = FALSE;
}
}
static int
vm_gpa_wire(struct vm *vm)
{
int i, rv;
struct mem_seg *seg;
for (i = 0; i < vm->num_mem_segs; i++) {
seg = &vm->mem_segs[i];
if (seg->wired)
continue;
/* XXX rlimits? */
rv = vm_map_wire(&vm->vmspace->vm_map,
seg->gpa, seg->gpa + seg->len,
VM_MAP_WIRE_USER | VM_MAP_WIRE_NOHOLES);
if (rv != KERN_SUCCESS)
break;
seg->wired = TRUE;
}
if (i < vm->num_mem_segs) {
/*
* Undo the wiring before returning an error.
*/
vm_gpa_unwire(vm);
return (EAGAIN);
}
return (0);
}
static void
vm_iommu_modify(struct vm *vm, boolean_t map)
{
int i, sz;
vm_paddr_t gpa, hpa;
struct mem_seg *seg;
void *vp, *cookie, *host_domain;
sz = PAGE_SIZE;
host_domain = iommu_host_domain();
for (i = 0; i < vm->num_mem_segs; i++) {
seg = &vm->mem_segs[i];
KASSERT(seg->wired, ("vm(%s) memory segment %#lx/%ld not wired",
vm_name(vm), seg->gpa, seg->len));
gpa = seg->gpa;
while (gpa < seg->gpa + seg->len) {
vp = vm_gpa_hold(vm, gpa, PAGE_SIZE, VM_PROT_WRITE,
&cookie);
KASSERT(vp != NULL, ("vm(%s) could not map gpa %#lx",
vm_name(vm), gpa));
vm_gpa_release(cookie);
hpa = DMAP_TO_PHYS((uintptr_t)vp);
if (map) {
iommu_create_mapping(vm->iommu, gpa, hpa, sz);
iommu_remove_mapping(host_domain, hpa, sz);
} else {
iommu_remove_mapping(vm->iommu, gpa, sz);
iommu_create_mapping(host_domain, hpa, hpa, sz);
}
gpa += PAGE_SIZE;
}
}
/*
* Invalidate the cached translations associated with the domain
* from which pages were removed.
*/
if (map)
iommu_invalidate_tlb(host_domain);
else
iommu_invalidate_tlb(vm->iommu);
}
#define vm_iommu_unmap(vm) vm_iommu_modify((vm), FALSE)
#define vm_iommu_map(vm) vm_iommu_modify((vm), TRUE)
int
vm_unassign_pptdev(struct vm *vm, int bus, int slot, int func)
{
int error;
error = ppt_unassign_device(vm, bus, slot, func);
if (error)
return (error);
if (ppt_num_devices(vm) == 0) {
vm_iommu_unmap(vm);
vm_gpa_unwire(vm);
}
return (0);
}
int
vm_assign_pptdev(struct vm *vm, int bus, int slot, int func)
{
int error;
vm_paddr_t maxaddr;
/*
* Virtual machines with pci passthru devices get special treatment:
* - the guest physical memory is wired
* - the iommu is programmed to do the 'gpa' to 'hpa' translation
*
* We need to do this before the first pci passthru device is attached.
*/
if (ppt_num_devices(vm) == 0) {
KASSERT(vm->iommu == NULL,
("vm_assign_pptdev: iommu must be NULL"));
maxaddr = vmm_mem_maxaddr();
vm->iommu = iommu_create_domain(maxaddr);
error = vm_gpa_wire(vm);
if (error)
return (error);
vm_iommu_map(vm);
}
error = ppt_assign_device(vm, bus, slot, func);
return (error);
}
void *
vm_gpa_hold(struct vm *vm, vm_paddr_t gpa, size_t len, int reqprot,
void **cookie)
{
int count, pageoff;
vm_page_t m;
pageoff = gpa & PAGE_MASK;
if (len > PAGE_SIZE - pageoff)
panic("vm_gpa_hold: invalid gpa/len: 0x%016lx/%lu", gpa, len);
count = vm_fault_quick_hold_pages(&vm->vmspace->vm_map,
trunc_page(gpa), PAGE_SIZE, reqprot, &m, 1);
if (count == 1) {
*cookie = m;
return ((void *)(PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m)) + pageoff));
} else {
*cookie = NULL;
return (NULL);
}
}
void
vm_gpa_release(void *cookie)
{
vm_page_t m = cookie;
vm_page_lock(m);
vm_page_unhold(m);
vm_page_unlock(m);
}
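
vm_gpa_hold() and vm_gpa_release() above form a hold/release pair: the returned pointer is a direct-map address for the page containing 'gpa' and should only be used while the hold is in place. A minimal, hypothetical usage sketch (not code from this commit):

static int
read_guest_u32(struct vm *vm, vm_paddr_t gpa, uint32_t *valp)
{
	void *cookie, *vp;

	/* Request read access to the page containing 'gpa'. */
	vp = vm_gpa_hold(vm, gpa, sizeof(*valp), VM_PROT_READ, &cookie);
	if (vp == NULL)
		return (EFAULT);

	*valp = *(uint32_t *)vp;

	/* Drop the page hold as soon as the access is done. */
	vm_gpa_release(cookie);
	return (0);
}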
int
@ -508,13 +639,42 @@ vm_gpabase2memseg(struct vm *vm, vm_paddr_t gpabase,
for (i = 0; i < vm->num_mem_segs; i++) {
if (gpabase == vm->mem_segs[i].gpa) {
*seg = vm->mem_segs[i];
seg->gpa = vm->mem_segs[i].gpa;
seg->len = vm->mem_segs[i].len;
seg->wired = vm->mem_segs[i].wired;
return (0);
}
}
return (-1);
}
int
vm_get_memobj(struct vm *vm, vm_paddr_t gpa, size_t len,
vm_offset_t *offset, struct vm_object **object)
{
int i;
size_t seg_len;
vm_paddr_t seg_gpa;
vm_object_t seg_obj;
for (i = 0; i < vm->num_mem_segs; i++) {
if ((seg_obj = vm->mem_segs[i].object) == NULL)
continue;
seg_gpa = vm->mem_segs[i].gpa;
seg_len = vm->mem_segs[i].len;
if (gpa >= seg_gpa && gpa < seg_gpa + seg_len) {
*offset = gpa - seg_gpa;
*object = seg_obj;
vm_object_reference(seg_obj);
return (0);
}
}
return (EINVAL);
}
int
vm_get_register(struct vm *vm, int vcpu, int reg, uint64_t *retval)
{
@ -633,26 +793,215 @@ save_guest_fpustate(struct vcpu *vcpu)
static VMM_STAT(VCPU_IDLE_TICKS, "number of ticks vcpu was idle");
static int
vcpu_set_state_locked(struct vcpu *vcpu, enum vcpu_state newstate)
{
int error;
vcpu_assert_locked(vcpu);
/*
* The following state transitions are allowed:
* IDLE -> FROZEN -> IDLE
* FROZEN -> RUNNING -> FROZEN
* FROZEN -> SLEEPING -> FROZEN
*/
switch (vcpu->state) {
case VCPU_IDLE:
case VCPU_RUNNING:
case VCPU_SLEEPING:
error = (newstate != VCPU_FROZEN);
break;
case VCPU_FROZEN:
error = (newstate == VCPU_FROZEN);
break;
default:
error = 1;
break;
}
if (error == 0)
vcpu->state = newstate;
else
error = EBUSY;
return (error);
}
static void
vcpu_require_state(struct vm *vm, int vcpuid, enum vcpu_state newstate)
{
int error;
if ((error = vcpu_set_state(vm, vcpuid, newstate)) != 0)
panic("Error %d setting state to %d\n", error, newstate);
}
static void
vcpu_require_state_locked(struct vcpu *vcpu, enum vcpu_state newstate)
{
int error;
if ((error = vcpu_set_state_locked(vcpu, newstate)) != 0)
panic("Error %d setting state to %d", error, newstate);
}
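A small sketch (assumed, not taken from the commit) of how the transition rules above are used from an ioctl path: the caller freezes the vcpu, does its work, and returns it to IDLE, matching the IDLE -> FROZEN -> IDLE sequence documented in the comment and the VCPU_FROZEN usage in the vmmdev_ioctl() hunk below. The with_vcpu_frozen() wrapper name is illustrative only.

/*
 * Illustrative wrapper, not part of this commit.
 */
static int
with_vcpu_frozen(struct vm *vm, int vcpuid, int (*fn)(struct vm *, int))
{
	int error;

	/* IDLE -> FROZEN: take exclusive ownership of the vcpu. */
	error = vcpu_set_state(vm, vcpuid, VCPU_FROZEN);
	if (error)
		return (error);

	error = (*fn)(vm, vcpuid);

	/* FROZEN -> IDLE: release ownership. */
	(void)vcpu_set_state(vm, vcpuid, VCPU_IDLE);
	return (error);
}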
/*
* Emulate a guest 'hlt' by sleeping until the vcpu is ready to run.
*/
static int
vm_handle_hlt(struct vm *vm, int vcpuid, boolean_t *retu)
{
struct vcpu *vcpu;
int sleepticks, t;
vcpu = &vm->vcpu[vcpuid];
vcpu_lock(vcpu);
/*
* Figure out the number of host ticks until the next apic
* timer interrupt in the guest.
*/
sleepticks = lapic_timer_tick(vm, vcpuid);
/*
* If the guest local apic timer is disabled then sleep for
* a long time but not forever.
*/
if (sleepticks < 0)
sleepticks = hz;
/*
* Do a final check for pending NMI or interrupts before
* really putting this thread to sleep.
*
* These interrupts could have happened any time after we
* returned from VMRUN() and before we grabbed the vcpu lock.
*/
if (!vm_nmi_pending(vm, vcpuid) && lapic_pending_intr(vm, vcpuid) < 0) {
if (sleepticks <= 0)
panic("invalid sleepticks %d", sleepticks);
t = ticks;
vcpu_require_state_locked(vcpu, VCPU_SLEEPING);
msleep_spin(vcpu, &vcpu->mtx, "vmidle", sleepticks);
vcpu_require_state_locked(vcpu, VCPU_FROZEN);
vmm_stat_incr(vm, vcpuid, VCPU_IDLE_TICKS, ticks - t);
}
vcpu_unlock(vcpu);
return (0);
}
static int
vm_handle_paging(struct vm *vm, int vcpuid, boolean_t *retu)
{
int rv, ftype;
struct vm_map *map;
struct vcpu *vcpu;
struct vm_exit *vme;
vcpu = &vm->vcpu[vcpuid];
vme = &vcpu->exitinfo;
ftype = vme->u.paging.fault_type;
KASSERT(ftype == VM_PROT_READ ||
ftype == VM_PROT_WRITE || ftype == VM_PROT_EXECUTE,
("vm_handle_paging: invalid fault_type %d", ftype));
if (ftype == VM_PROT_READ || ftype == VM_PROT_WRITE) {
rv = pmap_emulate_accessed_dirty(vmspace_pmap(vm->vmspace),
vme->u.paging.gpa, ftype);
if (rv == 0)
goto done;
}
map = &vm->vmspace->vm_map;
rv = vm_fault(map, vme->u.paging.gpa, ftype, VM_FAULT_NORMAL);
VMM_CTR3(vm, vcpuid, "vm_handle_paging rv = %d, gpa = %#lx, ftype = %d",
rv, vme->u.paging.gpa, ftype);
if (rv != KERN_SUCCESS)
return (EFAULT);
done:
/* restart execution at the faulting instruction */
vme->inst_length = 0;
return (0);
}
static int
vm_handle_inst_emul(struct vm *vm, int vcpuid, boolean_t *retu)
{
struct vie *vie;
struct vcpu *vcpu;
struct vm_exit *vme;
int error, inst_length;
uint64_t rip, gla, gpa, cr3;
vcpu = &vm->vcpu[vcpuid];
vme = &vcpu->exitinfo;
rip = vme->rip;
inst_length = vme->inst_length;
gla = vme->u.inst_emul.gla;
gpa = vme->u.inst_emul.gpa;
cr3 = vme->u.inst_emul.cr3;
vie = &vme->u.inst_emul.vie;
vie_init(vie);
/* Fetch, decode and emulate the faulting instruction */
if (vmm_fetch_instruction(vm, vcpuid, rip, inst_length, cr3, vie) != 0)
return (EFAULT);
if (vmm_decode_instruction(vm, vcpuid, gla, vie) != 0)
return (EFAULT);
/* return to userland unless this is a local apic access */
if (gpa < DEFAULT_APIC_BASE || gpa >= DEFAULT_APIC_BASE + PAGE_SIZE) {
*retu = TRUE;
return (0);
}
error = vmm_emulate_instruction(vm, vcpuid, gpa, vie,
lapic_mmio_read, lapic_mmio_write, 0);
/* return to userland to spin up the AP */
if (error == 0 && vme->exitcode == VM_EXITCODE_SPINUP_AP)
*retu = TRUE;
return (error);
}
int
vm_run(struct vm *vm, struct vm_run *vmrun)
{
int error, vcpuid, sleepticks, t;
int error, vcpuid;
struct vcpu *vcpu;
struct pcb *pcb;
uint64_t tscval, rip;
struct vm_exit *vme;
boolean_t retu;
pmap_t pmap;
vcpuid = vmrun->cpuid;
if (vcpuid < 0 || vcpuid >= VM_MAXCPU)
return (EINVAL);
pmap = vmspace_pmap(vm->vmspace);
vcpu = &vm->vcpu[vcpuid];
vme = &vmrun->vm_exit;
vme = &vcpu->exitinfo;
rip = vmrun->rip;
restart:
critical_enter();
KASSERT(!CPU_ISSET(curcpu, &pmap->pm_active),
("vm_run: absurd pm_active"));
tscval = rdtsc();
pcb = PCPU_GET(curpcb);
@ -661,62 +1010,44 @@ restart:
restore_guest_msrs(vm, vcpuid);
restore_guest_fpustate(vcpu);
vcpu_require_state(vm, vcpuid, VCPU_RUNNING);
vcpu->hostcpu = curcpu;
error = VMRUN(vm->cookie, vcpuid, rip);
error = VMRUN(vm->cookie, vcpuid, rip, pmap);
vcpu->hostcpu = NOCPU;
vcpu_require_state(vm, vcpuid, VCPU_FROZEN);
save_guest_fpustate(vcpu);
restore_host_msrs(vm, vcpuid);
vmm_stat_incr(vm, vcpuid, VCPU_TOTAL_RUNTIME, rdtsc() - tscval);
/* copy the exit information */
bcopy(&vcpu->exitinfo, vme, sizeof(struct vm_exit));
critical_exit();
/*
* Oblige the guest's desire to 'hlt' by sleeping until the vcpu
* is ready to run.
*/
if (error == 0 && vme->exitcode == VM_EXITCODE_HLT) {
vcpu_lock(vcpu);
/*
* Figure out the number of host ticks until the next apic
* timer interrupt in the guest.
*/
sleepticks = lapic_timer_tick(vm, vcpuid);
/*
* If the guest local apic timer is disabled then sleep for
* a long time but not forever.
*/
if (sleepticks < 0)
sleepticks = hz;
/*
* Do a final check for pending NMI or interrupts before
* really putting this thread to sleep.
*
* These interrupts could have happened any time after we
* returned from VMRUN() and before we grabbed the vcpu lock.
*/
if (!vm_nmi_pending(vm, vcpuid) &&
lapic_pending_intr(vm, vcpuid) < 0) {
if (sleepticks <= 0)
panic("invalid sleepticks %d", sleepticks);
t = ticks;
msleep_spin(vcpu, &vcpu->mtx, "vmidle", sleepticks);
vmm_stat_incr(vm, vcpuid, VCPU_IDLE_TICKS, ticks - t);
if (error == 0) {
retu = FALSE;
switch (vme->exitcode) {
case VM_EXITCODE_HLT:
error = vm_handle_hlt(vm, vcpuid, &retu);
break;
case VM_EXITCODE_PAGING:
error = vm_handle_paging(vm, vcpuid, &retu);
break;
case VM_EXITCODE_INST_EMUL:
error = vm_handle_inst_emul(vm, vcpuid, &retu);
break;
default:
retu = TRUE; /* handled in userland */
break;
}
}
vcpu_unlock(vcpu);
if (error == 0 && retu == FALSE) {
rip = vme->rip + vme->inst_length;
goto restart;
}
/* copy the exit information */
bcopy(vme, &vmrun->vm_exit, sizeof(struct vm_exit));
return (error);
}
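For context, a rough sketch of the userland run loop this 'retu' split pairs with: the kernel now loops internally on HLT/PAGING/INST_EMUL exits, and only exits it declines to handle reach the loop below. bhyve's fbsdrun does the equivalent through its handler[] table (see the bhyverun.c hunk later in this commit); handle_exit() is a stand-in name, and the vm_run() prototype is the libvmmapi one assumed for this branch.

#include <machine/vmm.h>
#include <vmmapi.h>

static int handle_exit(struct vmctx *, int, struct vm_exit *);	/* stand-in */

static void
run_vcpu(struct vmctx *ctx, int vcpu, uint64_t rip)
{
	struct vm_exit vmexit;
	int error;

	for (;;) {
		error = vm_run(ctx, vcpu, rip, &vmexit);
		if (error != 0)
			break;		/* the kernel could not run the vcpu */
		/*
		 * Only exits the kernel chose not to handle (retu == TRUE)
		 * are seen here, e.g. inout, rdmsr/wrmsr or spinup_ap.
		 */
		if (handle_exit(ctx, vcpu, &vmexit) != 0)
			break;
		rip = vmexit.rip + vmexit.inst_length;
	}
}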
@ -869,7 +1200,7 @@ vm_iommu_domain(struct vm *vm)
}
int
vcpu_set_state(struct vm *vm, int vcpuid, enum vcpu_state state)
vcpu_set_state(struct vm *vm, int vcpuid, enum vcpu_state newstate)
{
int error;
struct vcpu *vcpu;
@ -880,20 +1211,7 @@ vcpu_set_state(struct vm *vm, int vcpuid, enum vcpu_state state)
vcpu = &vm->vcpu[vcpuid];
vcpu_lock(vcpu);
/*
* The following state transitions are allowed:
* IDLE -> RUNNING -> IDLE
* IDLE -> CANNOT_RUN -> IDLE
*/
if ((vcpu->state == VCPU_IDLE && state != VCPU_IDLE) ||
(vcpu->state != VCPU_IDLE && state == VCPU_IDLE)) {
error = 0;
vcpu->state = state;
} else {
error = EBUSY;
}
error = vcpu_set_state_locked(vcpu, newstate);
vcpu_unlock(vcpu);
return (error);
@ -979,16 +1297,7 @@ vm_interrupt_hostcpu(struct vm *vm, int vcpuid)
vcpu_lock(vcpu);
hostcpu = vcpu->hostcpu;
if (hostcpu == NOCPU) {
/*
* If the vcpu is 'RUNNING' but without a valid 'hostcpu' then
* the host thread must be sleeping waiting for an event to
* kick the vcpu out of 'hlt'.
*
* XXX this is racy because the condition exists right before
* and after calling VMRUN() in vm_run(). The wakeup() is
* benign in this case.
*/
if (vcpu->state == VCPU_RUNNING)
if (vcpu->state == VCPU_SLEEPING)
wakeup_one(vcpu);
} else {
if (vcpu->state != VCPU_RUNNING)
@ -998,3 +1307,10 @@ vm_interrupt_hostcpu(struct vm *vm, int vcpuid)
}
vcpu_unlock(vcpu);
}
struct vmspace *
vm_get_vmspace(struct vm *vm)
{
return (vm->vmspace);
}

View File

@ -44,6 +44,7 @@ __FBSDID("$FreeBSD$");
#include <vm/vm.h>
#include <vm/pmap.h>
#include <vm/vm_map.h>
#include <machine/pmap.h>
#include <machine/vmparam.h>
@ -95,8 +96,9 @@ vmmdev_lookup2(struct cdev *cdev)
static int
vmmdev_rw(struct cdev *cdev, struct uio *uio, int flags)
{
int error, off, c;
vm_paddr_t hpa, gpa;
int error, off, c, prot;
vm_paddr_t gpa;
void *hpa, *cookie;
struct vmmdev_softc *sc;
static char zerobuf[PAGE_SIZE];
@ -107,6 +109,7 @@ vmmdev_rw(struct cdev *cdev, struct uio *uio, int flags)
if (sc == NULL)
error = ENXIO;
prot = (uio->uio_rw == UIO_WRITE ? VM_PROT_WRITE : VM_PROT_READ);
while (uio->uio_resid > 0 && error == 0) {
gpa = uio->uio_offset;
off = gpa & PAGE_MASK;
@ -120,14 +123,16 @@ vmmdev_rw(struct cdev *cdev, struct uio *uio, int flags)
* Since this device does not support lseek(2), dd(1) will
* read(2) blocks of data to simulate the lseek(2).
*/
hpa = vm_gpa2hpa(sc->vm, gpa, c);
if (hpa == (vm_paddr_t)-1) {
hpa = vm_gpa_hold(sc->vm, gpa, c, prot, &cookie);
if (hpa == NULL) {
if (uio->uio_rw == UIO_READ)
error = uiomove(zerobuf, c, uio);
else
error = EFAULT;
} else
error = uiomove((void *)PHYS_TO_DMAP(hpa), c, uio);
} else {
error = uiomove(hpa, c, uio);
vm_gpa_release(cookie);
}
}
mtx_unlock(&vmmdev_mtx);
@ -139,7 +144,6 @@ vmmdev_ioctl(struct cdev *cdev, u_long cmd, caddr_t data, int fflag,
struct thread *td)
{
int error, vcpu, state_changed;
enum vcpu_state new_state;
struct vmmdev_softc *sc;
struct vm_memory_segment *seg;
struct vm_register *vmreg;
@ -156,6 +160,7 @@ vmmdev_ioctl(struct cdev *cdev, u_long cmd, caddr_t data, int fflag,
struct vm_stats *vmstats;
struct vm_stat_desc *statdesc;
struct vm_x2apic *x2apic;
struct vm_gpa_pte *gpapte;
sc = vmmdev_lookup2(cdev);
if (sc == NULL)
@ -189,12 +194,7 @@ vmmdev_ioctl(struct cdev *cdev, u_long cmd, caddr_t data, int fflag,
goto done;
}
if (cmd == VM_RUN)
new_state = VCPU_RUNNING;
else
new_state = VCPU_CANNOT_RUN;
error = vcpu_set_state(sc->vm, vcpu, new_state);
error = vcpu_set_state(sc->vm, vcpu, VCPU_FROZEN);
if (error)
goto done;
@ -211,7 +211,7 @@ vmmdev_ioctl(struct cdev *cdev, u_long cmd, caddr_t data, int fflag,
*/
error = 0;
for (vcpu = 0; vcpu < VM_MAXCPU; vcpu++) {
error = vcpu_set_state(sc->vm, vcpu, VCPU_CANNOT_RUN);
error = vcpu_set_state(sc->vm, vcpu, VCPU_FROZEN);
if (error)
break;
}
@ -271,13 +271,13 @@ vmmdev_ioctl(struct cdev *cdev, u_long cmd, caddr_t data, int fflag,
break;
case VM_BIND_PPTDEV:
pptdev = (struct vm_pptdev *)data;
error = ppt_assign_device(sc->vm, pptdev->bus, pptdev->slot,
pptdev->func);
error = vm_assign_pptdev(sc->vm, pptdev->bus, pptdev->slot,
pptdev->func);
break;
case VM_UNBIND_PPTDEV:
pptdev = (struct vm_pptdev *)data;
error = ppt_unassign_device(sc->vm, pptdev->bus, pptdev->slot,
pptdev->func);
error = vm_unassign_pptdev(sc->vm, pptdev->bus, pptdev->slot,
pptdev->func);
break;
case VM_INJECT_EVENT:
vmevent = (struct vm_event *)data;
@ -348,6 +348,12 @@ vmmdev_ioctl(struct cdev *cdev, u_long cmd, caddr_t data, int fflag,
error = vm_get_x2apic_state(sc->vm,
x2apic->cpuid, &x2apic->state);
break;
case VM_GET_GPA_PMAP:
gpapte = (struct vm_gpa_pte *)data;
pmap_get_mapping(vmspace_pmap(vm_get_vmspace(sc->vm)),
gpapte->gpa, gpapte->pte, &gpapte->ptenum);
error = 0;
break;
default:
error = ENOTTY;
break;
@ -361,25 +367,25 @@ vmmdev_ioctl(struct cdev *cdev, u_long cmd, caddr_t data, int fflag,
}
done:
/* Make sure that no handler returns a bogus value like ERESTART */
KASSERT(error >= 0, ("vmmdev_ioctl: invalid error return %d", error));
return (error);
}
static int
vmmdev_mmap(struct cdev *cdev, vm_ooffset_t offset, vm_paddr_t *paddr,
int nprot, vm_memattr_t *memattr)
vmmdev_mmap_single(struct cdev *cdev, vm_ooffset_t *offset,
vm_size_t size, struct vm_object **object, int nprot)
{
int error;
struct vmmdev_softc *sc;
error = -1;
mtx_lock(&vmmdev_mtx);
sc = vmmdev_lookup2(cdev);
if (sc != NULL && (nprot & PROT_EXEC) == 0) {
*paddr = vm_gpa2hpa(sc->vm, (vm_paddr_t)offset, PAGE_SIZE);
if (*paddr != (vm_paddr_t)-1)
error = 0;
}
if (sc != NULL && (nprot & PROT_EXEC) == 0)
error = vm_get_memobj(sc->vm, *offset, size, offset, object);
else
error = EINVAL;
mtx_unlock(&vmmdev_mtx);
@ -446,7 +452,7 @@ static struct cdevsw vmmdevsw = {
.d_name = "vmmdev",
.d_version = D_VERSION,
.d_ioctl = vmmdev_ioctl,
.d_mmap = vmmdev_mmap,
.d_mmap_single = vmmdev_mmap_single,
.d_read = vmmdev_rw,
.d_write = vmmdev_rw,
};

View File

@ -465,7 +465,7 @@ vmm_emulate_instruction(void *vm, int vcpuid, uint64_t gpa, struct vie *vie,
}
#ifdef _KERNEL
static void
void
vie_init(struct vie *vie)
{
@ -479,9 +479,9 @@ static int
gla2gpa(struct vm *vm, uint64_t gla, uint64_t ptpphys,
uint64_t *gpa, uint64_t *gpaend)
{
vm_paddr_t hpa;
int nlevels, ptpshift, ptpindex;
uint64_t *ptpbase, pte, pgsize;
void *cookie;
/*
* XXX assumes 64-bit guest with 4 page walk levels
@ -491,18 +491,19 @@ gla2gpa(struct vm *vm, uint64_t gla, uint64_t ptpphys,
/* Zero out the lower 12 bits and the upper 12 bits */
ptpphys >>= 12; ptpphys <<= 24; ptpphys >>= 12;
hpa = vm_gpa2hpa(vm, ptpphys, PAGE_SIZE);
if (hpa == -1)
ptpbase = vm_gpa_hold(vm, ptpphys, PAGE_SIZE, VM_PROT_READ,
&cookie);
if (ptpbase == NULL)
goto error;
ptpbase = (uint64_t *)PHYS_TO_DMAP(hpa);
ptpshift = PAGE_SHIFT + nlevels * 9;
ptpindex = (gla >> ptpshift) & 0x1FF;
pgsize = 1UL << ptpshift;
pte = ptpbase[ptpindex];
vm_gpa_release(cookie);
if ((pte & PG_V) == 0)
goto error;
@ -530,18 +531,18 @@ int
vmm_fetch_instruction(struct vm *vm, int cpuid, uint64_t rip, int inst_length,
uint64_t cr3, struct vie *vie)
{
int n, err;
uint64_t hpa, gpa, gpaend, off;
int n, err, prot;
uint64_t gpa, gpaend, off;
void *hpa, *cookie;
/*
* XXX cache previously fetched instructions using 'rip' as the tag
*/
prot = VM_PROT_READ | VM_PROT_EXECUTE;
if (inst_length > VIE_INST_SIZE)
panic("vmm_fetch_instruction: invalid length %d", inst_length);
vie_init(vie);
/* Copy the instruction into 'vie' */
while (vie->num_valid < inst_length) {
err = gla2gpa(vm, rip, cr3, &gpa, &gpaend);
@ -551,11 +552,12 @@ vmm_fetch_instruction(struct vm *vm, int cpuid, uint64_t rip, int inst_length,
off = gpa & PAGE_MASK;
n = min(inst_length - vie->num_valid, PAGE_SIZE - off);
hpa = vm_gpa2hpa(vm, gpa, n);
if (hpa == -1)
if ((hpa = vm_gpa_hold(vm, gpa, n, prot, &cookie)) == NULL)
break;
bcopy((void *)PHYS_TO_DMAP(hpa), &vie->inst[vie->num_valid], n);
bcopy(hpa, &vie->inst[vie->num_valid], n);
vm_gpa_release(cookie);
rip += n;
vie->num_valid += n;

View File

@ -30,40 +30,24 @@
__FBSDID("$FreeBSD$");
#include <sys/param.h>
#include <sys/lock.h>
#include <sys/mutex.h>
#include <sys/linker.h>
#include <sys/systm.h>
#include <sys/malloc.h>
#include <sys/kernel.h>
#include <sys/sysctl.h>
#include <sys/sglist.h>
#include <sys/lock.h>
#include <sys/rwlock.h>
#include <vm/vm.h>
#include <vm/vm_param.h>
#include <vm/pmap.h>
#include <vm/vm_map.h>
#include <vm/vm_object.h>
#include <vm/vm_page.h>
#include <vm/vm_pageout.h>
#include <vm/vm_pager.h>
#include <machine/md_var.h>
#include <machine/metadata.h>
#include <machine/pc/bios.h>
#include <machine/vmparam.h>
#include <machine/pmap.h>
#include "vmm_util.h"
#include "vmm_mem.h"
SYSCTL_DECL(_hw_vmm);
static u_long pages_allocated;
SYSCTL_ULONG(_hw_vmm, OID_AUTO, pages_allocated, CTLFLAG_RD,
&pages_allocated, 0, "4KB pages allocated");
static void
update_pages_allocated(int howmany)
{
pages_allocated += howmany; /* XXX locking? */
}
int
vmm_mem_init(void)
{
@ -71,60 +55,95 @@ vmm_mem_init(void)
return (0);
}
vm_paddr_t
vmm_mem_alloc(size_t size)
vm_object_t
vmm_mmio_alloc(struct vmspace *vmspace, vm_paddr_t gpa, size_t len,
vm_paddr_t hpa)
{
int flags;
vm_page_t m;
vm_paddr_t pa;
int error;
vm_object_t obj;
struct sglist *sg;
if (size != PAGE_SIZE)
panic("vmm_mem_alloc: invalid allocation size %lu", size);
sg = sglist_alloc(1, M_WAITOK);
error = sglist_append_phys(sg, hpa, len);
KASSERT(error == 0, ("error %d appending physaddr to sglist", error));
flags = VM_ALLOC_NORMAL | VM_ALLOC_NOOBJ | VM_ALLOC_WIRED |
VM_ALLOC_ZERO;
while (1) {
obj = vm_pager_allocate(OBJT_SG, sg, len, VM_PROT_RW, 0, NULL);
if (obj != NULL) {
/*
* XXX need policy to determine when to back off the allocation
* VT-x ignores the MTRR settings when figuring out the
* memory type for translations obtained through EPT.
*
* Therefore we explicitly force the pages provided by
* this object to be mapped as uncacheable.
*/
m = vm_page_alloc(NULL, 0, flags);
if (m == NULL)
VM_WAIT;
else
break;
VM_OBJECT_WLOCK(obj);
error = vm_object_set_memattr(obj, VM_MEMATTR_UNCACHEABLE);
VM_OBJECT_WUNLOCK(obj);
if (error != KERN_SUCCESS) {
panic("vmm_mmio_alloc: vm_object_set_memattr error %d",
error);
}
error = vm_map_find(&vmspace->vm_map, obj, 0, &gpa, len, 0,
VMFS_NO_SPACE, VM_PROT_RW, VM_PROT_RW, 0);
if (error != KERN_SUCCESS) {
vm_object_deallocate(obj);
obj = NULL;
}
}
pa = VM_PAGE_TO_PHYS(m);
if ((m->flags & PG_ZERO) == 0)
pagezero((void *)PHYS_TO_DMAP(pa));
m->valid = VM_PAGE_BITS_ALL;
/*
* Drop the reference on the sglist.
*
* If the scatter/gather object was successfully allocated then it
* has incremented the reference count on the sglist. Dropping the
* initial reference count ensures that the sglist will be freed
* when the object is deallocated.
*
* If the object could not be allocated then we end up freeing the
* sglist.
*/
sglist_free(sg);
update_pages_allocated(1);
return (pa);
return (obj);
}
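For completeness, a hedged sketch of how a caller such as the PCI passthru code might hand a device BAR to the guest with this interface; the map_passthru_bar() name is invented for the example and error handling is trimmed.

/*
 * Illustrative only, not part of this commit.
 */
static int
map_passthru_bar(struct vm *vm, vm_paddr_t gpa, size_t len, vm_paddr_t hpa)
{
	vm_object_t obj;

	/* Creates an sglist-backed, uncacheable object mapped 1:1 at 'gpa'. */
	obj = vmm_mmio_alloc(vm_get_vmspace(vm), gpa, len, hpa);
	if (obj == NULL)
		return (ENOMEM);
	return (0);
}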
void
vmm_mem_free(vm_paddr_t base, size_t length)
vmm_mmio_free(struct vmspace *vmspace, vm_paddr_t gpa, size_t len)
{
vm_page_t m;
if (base & PAGE_MASK) {
panic("vmm_mem_free: base 0x%0lx must be aligned on a "
"0x%0x boundary\n", base, PAGE_SIZE);
vm_map_remove(&vmspace->vm_map, gpa, gpa + len);
}
vm_object_t
vmm_mem_alloc(struct vmspace *vmspace, vm_paddr_t gpa, size_t len)
{
int error;
vm_object_t obj;
if (gpa & PAGE_MASK)
panic("vmm_mem_alloc: invalid gpa %#lx", gpa);
if (len == 0 || (len & PAGE_MASK) != 0)
panic("vmm_mem_alloc: invalid allocation size %lu", len);
obj = vm_object_allocate(OBJT_DEFAULT, len >> PAGE_SHIFT);
if (obj != NULL) {
error = vm_map_find(&vmspace->vm_map, obj, 0, &gpa, len, 0,
VMFS_NO_SPACE, VM_PROT_ALL, VM_PROT_ALL, 0);
if (error != KERN_SUCCESS) {
vm_object_deallocate(obj);
obj = NULL;
}
}
if (length != PAGE_SIZE)
panic("vmm_mem_free: invalid length %lu", length);
return (obj);
}
m = PHYS_TO_VM_PAGE(base);
m->wire_count--;
vm_page_free(m);
atomic_subtract_int(&cnt.v_wire_count, 1);
void
vmm_mem_free(struct vmspace *vmspace, vm_paddr_t gpa, size_t len)
{
update_pages_allocated(-1);
vm_map_remove(&vmspace->vm_map, gpa, gpa + len);
}
vm_paddr_t

View File

@ -29,9 +29,15 @@
#ifndef _VMM_MEM_H_
#define _VMM_MEM_H_
struct vmspace;
struct vm_object;
int vmm_mem_init(void);
vm_paddr_t vmm_mem_alloc(size_t size);
void vmm_mem_free(vm_paddr_t start, size_t size);
struct vm_object *vmm_mem_alloc(struct vmspace *, vm_paddr_t gpa, size_t size);
struct vm_object *vmm_mmio_alloc(struct vmspace *, vm_paddr_t gpa, size_t len,
vm_paddr_t hpa);
void vmm_mem_free(struct vmspace *, vm_paddr_t gpa, size_t size);
void vmm_mmio_free(struct vmspace *, vm_paddr_t gpa, size_t size);
vm_paddr_t vmm_mem_maxaddr(void);
#endif

View File

@ -2239,20 +2239,16 @@ cfiscsi_lun_disable(void *arg, struct ctl_id target_id, int lun_id)
}
static void
cfiscsi_datamove(union ctl_io *io)
cfiscsi_datamove_in(union ctl_io *io)
{
struct cfiscsi_session *cs;
struct icl_pdu *request, *response;
const struct iscsi_bhs_scsi_command *bhssc;
struct iscsi_bhs_data_in *bhsdi;
struct iscsi_bhs_r2t *bhsr2t;
struct cfiscsi_data_wait *cdw;
struct ctl_sg_entry ctl_sg_entry, *ctl_sglist;
size_t copy_len, len, off;
const char *addr;
int ctl_sg_count, error, i;
uint32_t target_transfer_tag;
bool done;
request = io->io_hdr.ctl_private[CTL_PRIV_FRONTEND].ptr;
cs = PDU_SESSION(request);
@ -2278,215 +2274,240 @@ cfiscsi_datamove(union ctl_io *io)
*/
PDU_TOTAL_TRANSFER_LEN(request) = io->scsiio.kern_total_len;
if ((io->io_hdr.flags & CTL_FLAG_DATA_MASK) == CTL_FLAG_DATA_IN) {
#if 0
if (ctl_sg_count > 1)
CFISCSI_SESSION_DEBUG(cs, "ctl_sg_count = %d", ctl_sg_count);
if (ctl_sg_count > 1)
CFISCSI_SESSION_DEBUG(cs, "ctl_sg_count = %d", ctl_sg_count);
#endif
/*
* This is the offset within the current SCSI command;
* i.e. for the first call of datamove(), it will be 0,
* and for subsequent ones it will be the sum of lengths
* of previous ones.
*/
off = htonl(io->scsiio.kern_rel_offset);
if (off > 1)
CFISCSI_SESSION_DEBUG(cs, "off = %zd", off);
/*
* This is the offset within the current SCSI command;
* i.e. for the first call of datamove(), it will be 0,
* and for subsequent ones it will be the sum of lengths
* of previous ones.
*/
off = htonl(io->scsiio.kern_rel_offset);
if (off > 1)
CFISCSI_SESSION_DEBUG(cs, "off = %zd", off);
i = 0;
addr = NULL;
len = 0;
response = NULL;
bhsdi = NULL;
for (;;) {
KASSERT(i < ctl_sg_count, ("i >= ctl_sg_count"));
i = 0;
addr = NULL;
len = 0;
response = NULL;
bhsdi = NULL;
for (;;) {
KASSERT(i < ctl_sg_count, ("i >= ctl_sg_count"));
if (response == NULL) {
response = cfiscsi_pdu_new_response(request, M_NOWAIT);
if (response == NULL) {
response =
cfiscsi_pdu_new_response(request, M_NOWAIT);
if (response == NULL) {
CFISCSI_SESSION_WARN(cs, "failed to "
"allocate memory; dropping connection");
icl_pdu_free(request);
cfiscsi_session_terminate(cs);
return;
}
bhsdi = (struct iscsi_bhs_data_in *)
response->ip_bhs;
bhsdi->bhsdi_opcode =
ISCSI_BHS_OPCODE_SCSI_DATA_IN;
bhsdi->bhsdi_initiator_task_tag =
bhssc->bhssc_initiator_task_tag;
bhsdi->bhsdi_datasn =
htonl(PDU_EXPDATASN(request));
PDU_EXPDATASN(request)++;
bhsdi->bhsdi_buffer_offset = htonl(off);
}
if (len == 0) {
addr = ctl_sglist[i].addr;
len = ctl_sglist[i].len;
KASSERT(len > 0, ("len <= 0"));
}
copy_len = len;
if (response->ip_data_len + copy_len >
cs->cs_max_data_segment_length)
copy_len = cs->cs_max_data_segment_length -
response->ip_data_len;
KASSERT(copy_len <= len, ("copy_len > len"));
error = icl_pdu_append_data(response, addr, copy_len, M_NOWAIT);
if (error != 0) {
CFISCSI_SESSION_WARN(cs, "failed to "
"allocate memory; dropping connection");
icl_pdu_free(request);
icl_pdu_free(response);
cfiscsi_session_terminate(cs);
return;
}
addr += copy_len;
len -= copy_len;
off += copy_len;
io->scsiio.ext_data_filled += copy_len;
if (len == 0) {
/*
* End of scatter-gather segment;
* proceed to the next one...
*/
if (i == ctl_sg_count - 1) {
/*
* ... unless this was the last one.
*/
break;
}
i++;
}
if (response->ip_data_len ==
cs->cs_max_data_segment_length) {
/*
* Can't stuff more data into the current PDU;
* queue it. Note that's not enough to check
* for kern_data_resid == 0 instead; there
* may be several Data-In PDUs for the final
* call to cfiscsi_datamove(), and we want
* to set the F flag only on the last of them.
*/
if (off == io->scsiio.kern_total_len)
bhsdi->bhsdi_flags |= BHSDI_FLAGS_F;
KASSERT(response->ip_data_len > 0,
("sending empty Data-In"));
cfiscsi_pdu_queue(response);
response = NULL;
bhsdi = NULL;
}
bhsdi = (struct iscsi_bhs_data_in *)response->ip_bhs;
bhsdi->bhsdi_opcode = ISCSI_BHS_OPCODE_SCSI_DATA_IN;
bhsdi->bhsdi_initiator_task_tag =
bhssc->bhssc_initiator_task_tag;
bhsdi->bhsdi_datasn = htonl(PDU_EXPDATASN(request));
PDU_EXPDATASN(request)++;
bhsdi->bhsdi_buffer_offset = htonl(off);
}
KASSERT(i == ctl_sg_count - 1, ("missed SG segment"));
KASSERT(len == 0, ("missed data from SG segment"));
if (response != NULL) {
if (off == io->scsiio.kern_total_len) {
bhsdi->bhsdi_flags |= BHSDI_FLAGS_F;
} else {
CFISCSI_SESSION_DEBUG(cs, "not setting the F flag; "
"have %zd, need %zd", off,
(size_t)io->scsiio.kern_total_len);
if (len == 0) {
addr = ctl_sglist[i].addr;
len = ctl_sglist[i].len;
KASSERT(len > 0, ("len <= 0"));
}
copy_len = len;
if (response->ip_data_len + copy_len >
cs->cs_max_data_segment_length)
copy_len = cs->cs_max_data_segment_length -
response->ip_data_len;
KASSERT(copy_len <= len, ("copy_len > len"));
error = icl_pdu_append_data(response, addr, copy_len, M_NOWAIT);
if (error != 0) {
CFISCSI_SESSION_WARN(cs, "failed to "
"allocate memory; dropping connection");
icl_pdu_free(request);
icl_pdu_free(response);
cfiscsi_session_terminate(cs);
return;
}
addr += copy_len;
len -= copy_len;
off += copy_len;
io->scsiio.ext_data_filled += copy_len;
if (len == 0) {
/*
* End of scatter-gather segment;
* proceed to the next one...
*/
if (i == ctl_sg_count - 1) {
/*
* ... unless this was the last one.
*/
break;
}
i++;
}
if (response->ip_data_len == cs->cs_max_data_segment_length) {
/*
* Can't stuff more data into the current PDU;
* queue it. Note that's not enough to check
* for kern_data_resid == 0 instead; there
* may be several Data-In PDUs for the final
* call to cfiscsi_datamove(), and we want
* to set the F flag only on the last of them.
*/
if (off == io->scsiio.kern_total_len)
bhsdi->bhsdi_flags |= BHSDI_FLAGS_F;
KASSERT(response->ip_data_len > 0,
("sending empty Data-In"));
cfiscsi_pdu_queue(response);
response = NULL;
bhsdi = NULL;
}
io->scsiio.be_move_done(io);
} else {
CFISCSI_SESSION_LOCK(cs);
target_transfer_tag = cs->cs_target_transfer_tag;
cs->cs_target_transfer_tag++;
CFISCSI_SESSION_UNLOCK(cs);
#if 0
CFISCSI_SESSION_DEBUG(cs, "expecting Data-Out with initiator "
"task tag 0x%x, target transfer tag 0x%x",
bhssc->bhssc_initiator_task_tag, target_transfer_tag);
#endif
cdw = uma_zalloc(cfiscsi_data_wait_zone, M_NOWAIT | M_ZERO);
if (cdw == NULL) {
CFISCSI_SESSION_WARN(cs, "failed to "
"allocate memory; dropping connection");
icl_pdu_free(request);
cfiscsi_session_terminate(cs);
}
KASSERT(i == ctl_sg_count - 1, ("missed SG segment"));
KASSERT(len == 0, ("missed data from SG segment"));
if (response != NULL) {
if (off == io->scsiio.kern_total_len) {
bhsdi->bhsdi_flags |= BHSDI_FLAGS_F;
} else {
CFISCSI_SESSION_DEBUG(cs, "not setting the F flag; "
"have %zd, need %zd", off,
(size_t)io->scsiio.kern_total_len);
}
cdw->cdw_ctl_io = io;
cdw->cdw_target_transfer_tag = htonl(target_transfer_tag);
cdw->cdw_initiator_task_tag = bhssc->bhssc_initiator_task_tag;
if (cs->cs_immediate_data &&
icl_pdu_data_segment_length(request) > 0) {
done = cfiscsi_handle_data_segment(request, cdw);
if (done) {
uma_zfree(cfiscsi_data_wait_zone, cdw);
io->scsiio.be_move_done(io);
return;
}
#if 0
if (io->scsiio.ext_data_filled != 0)
CFISCSI_SESSION_DEBUG(cs, "got %zd bytes of immediate data, need %zd",
io->scsiio.ext_data_filled, io->scsiio.kern_data_len);
#endif
}
CFISCSI_SESSION_LOCK(cs);
TAILQ_INSERT_TAIL(&cs->cs_waiting_for_data_out, cdw, cdw_next);
CFISCSI_SESSION_UNLOCK(cs);
/*
* XXX: We should limit the number of outstanding R2T PDUs
* per task to MaxOutstandingR2T.
*/
response = cfiscsi_pdu_new_response(request, M_NOWAIT);
if (response == NULL) {
CFISCSI_SESSION_WARN(cs, "failed to "
"allocate memory; dropping connection");
icl_pdu_free(request);
cfiscsi_session_terminate(cs);
}
bhsr2t = (struct iscsi_bhs_r2t *)response->ip_bhs;
bhsr2t->bhsr2t_opcode = ISCSI_BHS_OPCODE_R2T;
bhsr2t->bhsr2t_flags = 0x80;
bhsr2t->bhsr2t_lun = bhssc->bhssc_lun;
bhsr2t->bhsr2t_initiator_task_tag =
bhssc->bhssc_initiator_task_tag;
bhsr2t->bhsr2t_target_transfer_tag =
htonl(target_transfer_tag);
/*
* XXX: Here we assume that cfiscsi_datamove() won't ever
* be running concurrently on several CPUs for a given
* command.
*/
bhsr2t->bhsr2t_r2tsn = htonl(PDU_R2TSN(request));
PDU_R2TSN(request)++;
/*
* This is the offset within the current SCSI command;
* i.e. for the first call of datamove(), it will be 0,
* and for subsequent ones it will be the sum of lengths
* of previous ones.
*
* The ext_data_filled is to account for unsolicited
* (immediate) data that might have already arrived.
*/
bhsr2t->bhsr2t_buffer_offset =
htonl(io->scsiio.kern_rel_offset + io->scsiio.ext_data_filled);
/*
* This is the total length (sum of S/G lengths) this call
* to cfiscsi_datamove() is supposed to handle.
*
* XXX: Limit it to MaxBurstLength.
*/
bhsr2t->bhsr2t_desired_data_transfer_length =
htonl(io->scsiio.kern_data_len - io->scsiio.ext_data_filled);
KASSERT(response->ip_data_len > 0, ("sending empty Data-In"));
cfiscsi_pdu_queue(response);
}
io->scsiio.be_move_done(io);
}
static void
cfiscsi_datamove_out(union ctl_io *io)
{
struct cfiscsi_session *cs;
struct icl_pdu *request, *response;
const struct iscsi_bhs_scsi_command *bhssc;
struct iscsi_bhs_r2t *bhsr2t;
struct cfiscsi_data_wait *cdw;
uint32_t target_transfer_tag;
bool done;
request = io->io_hdr.ctl_private[CTL_PRIV_FRONTEND].ptr;
cs = PDU_SESSION(request);
bhssc = (const struct iscsi_bhs_scsi_command *)request->ip_bhs;
KASSERT((bhssc->bhssc_opcode & ~ISCSI_BHS_OPCODE_IMMEDIATE) ==
ISCSI_BHS_OPCODE_SCSI_COMMAND,
("bhssc->bhssc_opcode != ISCSI_BHS_OPCODE_SCSI_COMMAND"));
/*
* We need to record it so that we can properly report
* underflow/overflow.
*/
PDU_TOTAL_TRANSFER_LEN(request) = io->scsiio.kern_total_len;
CFISCSI_SESSION_LOCK(cs);
target_transfer_tag = cs->cs_target_transfer_tag;
cs->cs_target_transfer_tag++;
CFISCSI_SESSION_UNLOCK(cs);
#if 0
CFISCSI_SESSION_DEBUG(cs, "expecting Data-Out with initiator "
"task tag 0x%x, target transfer tag 0x%x",
bhssc->bhssc_initiator_task_tag, target_transfer_tag);
#endif
cdw = uma_zalloc(cfiscsi_data_wait_zone, M_NOWAIT | M_ZERO);
if (cdw == NULL) {
CFISCSI_SESSION_WARN(cs, "failed to "
"allocate memory; dropping connection");
icl_pdu_free(request);
cfiscsi_session_terminate(cs);
}
cdw->cdw_ctl_io = io;
cdw->cdw_target_transfer_tag = htonl(target_transfer_tag);
cdw->cdw_initiator_task_tag = bhssc->bhssc_initiator_task_tag;
if (cs->cs_immediate_data && icl_pdu_data_segment_length(request) > 0) {
done = cfiscsi_handle_data_segment(request, cdw);
if (done) {
uma_zfree(cfiscsi_data_wait_zone, cdw);
io->scsiio.be_move_done(io);
return;
}
#if 0
if (io->scsiio.ext_data_filled != 0)
CFISCSI_SESSION_DEBUG(cs, "got %zd bytes of immediate data, need %zd",
io->scsiio.ext_data_filled, io->scsiio.kern_data_len);
#endif
}
CFISCSI_SESSION_LOCK(cs);
TAILQ_INSERT_TAIL(&cs->cs_waiting_for_data_out, cdw, cdw_next);
CFISCSI_SESSION_UNLOCK(cs);
/*
* XXX: We should limit the number of outstanding R2T PDUs
* per task to MaxOutstandingR2T.
*/
response = cfiscsi_pdu_new_response(request, M_NOWAIT);
if (response == NULL) {
CFISCSI_SESSION_WARN(cs, "failed to "
"allocate memory; dropping connection");
icl_pdu_free(request);
cfiscsi_session_terminate(cs);
}
bhsr2t = (struct iscsi_bhs_r2t *)response->ip_bhs;
bhsr2t->bhsr2t_opcode = ISCSI_BHS_OPCODE_R2T;
bhsr2t->bhsr2t_flags = 0x80;
bhsr2t->bhsr2t_lun = bhssc->bhssc_lun;
bhsr2t->bhsr2t_initiator_task_tag = bhssc->bhssc_initiator_task_tag;
bhsr2t->bhsr2t_target_transfer_tag = htonl(target_transfer_tag);
/*
* XXX: Here we assume that cfiscsi_datamove() won't ever
* be running concurrently on several CPUs for a given
* command.
*/
bhsr2t->bhsr2t_r2tsn = htonl(PDU_R2TSN(request));
PDU_R2TSN(request)++;
/*
* This is the offset within the current SCSI command;
* i.e. for the first call of datamove(), it will be 0,
* and for subsequent ones it will be the sum of lengths
* of previous ones.
*
* The ext_data_filled is to account for unsolicited
* (immediate) data that might have already arrived.
*/
bhsr2t->bhsr2t_buffer_offset =
htonl(io->scsiio.kern_rel_offset + io->scsiio.ext_data_filled);
/*
* This is the total length (sum of S/G lengths) this call
* to cfiscsi_datamove() is supposed to handle.
*
* XXX: Limit it to MaxBurstLength.
*/
bhsr2t->bhsr2t_desired_data_transfer_length =
htonl(io->scsiio.kern_data_len - io->scsiio.ext_data_filled);
cfiscsi_pdu_queue(response);
}
static void
cfiscsi_datamove(union ctl_io *io)
{
if ((io->io_hdr.flags & CTL_FLAG_DATA_MASK) == CTL_FLAG_DATA_IN)
cfiscsi_datamove_in(io);
else
cfiscsi_datamove_out(io);
}
static void

View File

@ -2920,7 +2920,6 @@ kern/tty_pts.c standard
kern/tty_tty.c standard
kern/tty_ttydisc.c standard
kern/uipc_accf.c optional inet
kern/uipc_cow.c optional socket_send_cow
kern/uipc_debug.c optional ddb
kern/uipc_domain.c standard
kern/uipc_mbuf.c standard

View File

@ -2883,6 +2883,9 @@ igb_setup_msix(struct adapter *adapter)
if (queues > maxqueues)
queues = maxqueues;
/* reflect correct sysctl value */
igb_num_queues = queues;
/*
** One vector (RX/TX pair) per queue
** plus an additional for Link interrupt

View File

@ -29,6 +29,8 @@
/**
* Implements low-level interactions with Hyper-V/Azure
*/
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
#include <sys/param.h>
#include <sys/malloc.h>
@ -88,6 +90,14 @@ hv_vmbus_query_hypervisor_presence(void)
{
u_int regs[4];
int hyper_v_detected = 0;
/*
* When Xen is detected and native Xen PV support is enabled,
* ignore Xen's HyperV emulation.
*/
if (vm_guest == VM_GUEST_XEN)
return (0);
do_cpuid(1, regs);
if (regs[2] & 0x80000000) { /* if(a hypervisor is detected) */
/* make sure this really is Hyper-V */

View File

@ -234,6 +234,9 @@ MODULE_DEPEND(ixgbe, ether, 1, 1, 1);
** TUNEABLE PARAMETERS:
*/
static SYSCTL_NODE(_hw, OID_AUTO, ix, CTLFLAG_RD, 0,
"IXGBE driver parameters");
/*
** AIM: Adaptive Interrupt Moderation
** which means that the interrupt rate
@ -242,17 +245,29 @@ MODULE_DEPEND(ixgbe, ether, 1, 1, 1);
*/
static int ixgbe_enable_aim = TRUE;
TUNABLE_INT("hw.ixgbe.enable_aim", &ixgbe_enable_aim);
SYSCTL_INT(_hw_ix, OID_AUTO, enable_aim, CTLFLAG_RW, &ixgbe_enable_aim, 0,
"Enable adaptive interrupt moderation");
static int ixgbe_max_interrupt_rate = (4000000 / IXGBE_LOW_LATENCY);
TUNABLE_INT("hw.ixgbe.max_interrupt_rate", &ixgbe_max_interrupt_rate);
SYSCTL_INT(_hw_ix, OID_AUTO, max_interrupt_rate, CTLFLAG_RDTUN,
&ixgbe_max_interrupt_rate, 0, "Maximum interrupts per second");
/* How many packets rxeof tries to clean at a time */
static int ixgbe_rx_process_limit = 256;
TUNABLE_INT("hw.ixgbe.rx_process_limit", &ixgbe_rx_process_limit);
SYSCTL_INT(_hw_ix, OID_AUTO, rx_process_limit, CTLFLAG_RDTUN,
&ixgbe_rx_process_limit, 0,
"Maximum number of received packets to process at a time,"
"-1 means unlimited");
/* How many packets txeof tries to clean at a time */
static int ixgbe_tx_process_limit = 256;
TUNABLE_INT("hw.ixgbe.tx_process_limit", &ixgbe_tx_process_limit);
SYSCTL_INT(_hw_ix, OID_AUTO, tx_process_limit, CTLFLAG_RDTUN,
&ixgbe_tx_process_limit, 0,
"Maximum number of sent packets to process at a time,"
"-1 means unlimited");
/*
** Smart speed setting, default to on
@ -269,6 +284,8 @@ static int ixgbe_smart_speed = ixgbe_smart_speed_on;
*/
static int ixgbe_enable_msix = 1;
TUNABLE_INT("hw.ixgbe.enable_msix", &ixgbe_enable_msix);
SYSCTL_INT(_hw_ix, OID_AUTO, enable_msix, CTLFLAG_RDTUN, &ixgbe_enable_msix, 0,
"Enable MSI-X interrupts");
/*
* Number of Queues, can be set to 0,
@ -278,6 +295,8 @@ TUNABLE_INT("hw.ixgbe.enable_msix", &ixgbe_enable_msix);
*/
static int ixgbe_num_queues = 0;
TUNABLE_INT("hw.ixgbe.num_queues", &ixgbe_num_queues);
SYSCTL_INT(_hw_ix, OID_AUTO, num_queues, CTLFLAG_RDTUN, &ixgbe_num_queues, 0,
"Number of queues to configure, 0 indicates autoconfigure");
/*
** Number of TX descriptors per ring,
@ -286,10 +305,14 @@ TUNABLE_INT("hw.ixgbe.num_queues", &ixgbe_num_queues);
*/
static int ixgbe_txd = PERFORM_TXD;
TUNABLE_INT("hw.ixgbe.txd", &ixgbe_txd);
SYSCTL_INT(_hw_ix, OID_AUTO, txd, CTLFLAG_RDTUN, &ixgbe_txd, 0,
"Number of receive descriptors per queue");
/* Number of RX descriptors per ring */
static int ixgbe_rxd = PERFORM_RXD;
TUNABLE_INT("hw.ixgbe.rxd", &ixgbe_rxd);
SYSCTL_INT(_hw_ix, OID_AUTO, rxd, CTLFLAG_RDTUN, &ixgbe_rxd, 0,
"Number of receive descriptors per queue");
/*
** Defining this on will allow the use
@ -2442,6 +2465,9 @@ ixgbe_setup_msix(struct adapter *adapter)
else if ((ixgbe_num_queues == 0) && (queues > 8))
queues = 8;
/* reflect correct sysctl value */
ixgbe_num_queues = queues;
/*
** Want one vector (RX/TX pair) per queue
** plus an additional for Link.

View File

@ -1700,9 +1700,9 @@ vtnet_rxq_input(struct vtnet_rxq *rxq, struct mbuf *m,
rxq->vtnrx_stats.vrxs_ipackets++;
rxq->vtnrx_stats.vrxs_ibytes += m->m_pkthdr.len;
/* VTNET_RXQ_UNLOCK(rxq); */
VTNET_RXQ_UNLOCK(rxq);
(*ifp->if_input)(ifp, m);
/* VTNET_RXQ_LOCK(rxq); */
VTNET_RXQ_LOCK(rxq);
}
static int
@ -1782,6 +1782,10 @@ vtnet_rxq_eof(struct vtnet_rxq *rxq)
m_adj(m, adjsz);
vtnet_rxq_input(rxq, m, hdr);
/* Must recheck after dropping the Rx lock. */
if ((ifp->if_drv_flags & IFF_DRV_RUNNING) == 0)
break;
}
if (deq > 0)

View File

@ -396,7 +396,7 @@ xentimer_et_start(struct eventtimer *et,
{
int error = 0, i = 0;
struct xentimer_softc *sc = et->et_priv;
int cpu = PCPU_GET(acpi_id);
int cpu = PCPU_GET(vcpu_id);
struct xentimer_pcpu_data *pcpu = DPCPU_PTR(xentimer_pcpu);
uint64_t first_in_ns, next_time;
@ -433,7 +433,7 @@ xentimer_et_start(struct eventtimer *et,
static int
xentimer_et_stop(struct eventtimer *et)
{
int cpu = PCPU_GET(acpi_id);
int cpu = PCPU_GET(vcpu_id);
struct xentimer_pcpu_data *pcpu = DPCPU_PTR(xentimer_pcpu);
pcpu->timer = 0;

View File

@ -62,13 +62,13 @@ struct shadow_time_info {
vm_paddr_t *pc_pdir_shadow; \
uint64_t pc_processed_system_time; \
struct shadow_time_info pc_shadow_time; \
char __pad[189]
char __pad[185]
#else /* !XEN */
#define PCPU_XEN_FIELDS \
; \
char __pad[237]
char __pad[233]
#endif
@ -84,7 +84,8 @@ struct shadow_time_info {
u_int pc_acpi_id; /* ACPI CPU id */ \
u_int pc_apic_id; \
int pc_private_tss; /* Flag indicating private tss*/\
u_int pc_cmci_mask /* MCx banks for CMCI */ \
u_int pc_cmci_mask; /* MCx banks for CMCI */ \
u_int pc_vcpu_id /* Xen vCPU ID */ \
PCPU_XEN_FIELDS
#ifdef _KERNEL

View File

@ -783,13 +783,7 @@ start_all_aps(void)
dpcpu_init((void *)kmem_malloc(kernel_arena, DPCPU_SIZE,
M_WAITOK | M_ZERO), bootAP);
pc->pc_apic_id = cpu_apic_ids[bootAP];
/*
* The i386 PV port uses the apic_id as vCPU id, but the
* PVHVM port needs to use the acpi_id, so set it for PV
* also in order to work with shared devices between PV
* and PVHVM.
*/
pc->pc_acpi_id = cpu_apic_ids[bootAP];
pc->pc_vcpu_id = cpu_apic_ids[bootAP];
pc->pc_prvspace = pc;
pc->pc_curthread = 0;

View File

@ -88,7 +88,7 @@ mptable_setup_local(void)
{
PCPU_SET(apic_id, 0);
PCPU_SET(acpi_id, 0);
PCPU_SET(vcpu_id, 0);
return (0);
}

View File

@ -55,7 +55,6 @@ __FBSDID("$FreeBSD$");
#include <sys/kernel.h>
#include <sys/lock.h>
#include <sys/malloc.h>
#include <sys/mbuf.h>
#include <sys/mutex.h>
#include <sys/vmmeter.h>
#include <sys/proc.h>
@ -699,10 +698,10 @@ kmeminit(void)
* VM_KMEM_SIZE_MAX is dependent on the maximum KVA space
* available.
*
* Note that the kmem_map is also used by the zone allocator,
* Note that the kmem arena is also used by the zone allocator,
* so make sure that there is enough space.
*/
vm_kmem_size = VM_KMEM_SIZE + nmbclusters * PAGE_SIZE;
vm_kmem_size = VM_KMEM_SIZE;
mem_size = cnt.v_page_count;
#if defined(VM_KMEM_SIZE_SCALE)

View File

@ -1,182 +0,0 @@
/*--
* Copyright (c) 1997, Duke University
* All rights reserved.
*
* Author:
* Andrew Gallatin <gallatin@cs.duke.edu>
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. The name of Duke University may not be used to endorse or promote
* products derived from this software without specific prior written
* permission.
*
* THIS SOFTWARE IS PROVIDED BY DUKE UNIVERSITY ``AS IS'' AND ANY
* EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL DUKE UNIVERSITY BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER
* IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
* OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
* ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
/*
* This is a set of routines for enabling and disabling copy on write
* protection for data written into sockets.
*/
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/sysctl.h>
#include <sys/kernel.h>
#include <sys/proc.h>
#include <sys/lock.h>
#include <sys/mutex.h>
#include <sys/mbuf.h>
#include <sys/sf_buf.h>
#include <sys/socketvar.h>
#include <sys/uio.h>
#include <vm/vm.h>
#include <vm/vm_extern.h>
#include <vm/vm_param.h>
#include <vm/pmap.h>
#include <vm/vm_map.h>
#include <vm/vm_page.h>
#include <vm/vm_object.h>
FEATURE(zero_copy_sockets, "Zero copy sockets support");
struct netsend_cow_stats {
int attempted;
int fail_not_mapped;
int fail_sf_buf;
int success;
int iodone;
};
static struct netsend_cow_stats socow_stats;
static int socow_iodone(struct mbuf *m, void *addr, void *args);
static int
socow_iodone(struct mbuf *m, void *addr, void *args)
{
struct sf_buf *sf;
vm_page_t pp;
sf = args;
pp = sf_buf_page(sf);
sf_buf_free(sf);
/* remove COW mapping */
vm_page_lock(pp);
vm_page_cowclear(pp);
vm_page_unwire(pp, 0);
/*
* Check for the object going away on us. This can
* happen since we don't hold a reference to it.
* If so, we're responsible for freeing the page.
*/
if (pp->wire_count == 0 && pp->object == NULL)
vm_page_free(pp);
vm_page_unlock(pp);
socow_stats.iodone++;
return (EXT_FREE_OK);
}
int
socow_setup(struct mbuf *m0, struct uio *uio)
{
struct sf_buf *sf;
vm_page_t pp;
struct iovec *iov;
struct vmspace *vmspace;
struct vm_map *map;
vm_offset_t offset, uva;
vm_size_t len;
socow_stats.attempted++;
vmspace = curproc->p_vmspace;
map = &vmspace->vm_map;
uva = (vm_offset_t) uio->uio_iov->iov_base;
offset = uva & PAGE_MASK;
len = PAGE_SIZE - offset;
/*
* Verify that access to the given address is allowed from user-space.
*/
if (vm_fault_quick_hold_pages(map, uva, len, VM_PROT_READ, &pp, 1) <
0) {
socow_stats.fail_not_mapped++;
return(0);
}
/*
* set up COW
*/
vm_page_lock(pp);
if (vm_page_cowsetup(pp) != 0) {
vm_page_unhold(pp);
vm_page_unlock(pp);
return (0);
}
/*
* wire the page for I/O
*/
vm_page_wire(pp);
vm_page_unhold(pp);
vm_page_unlock(pp);
/*
* Allocate an sf buf
*/
sf = sf_buf_alloc(pp, SFB_CATCH);
if (sf == NULL) {
vm_page_lock(pp);
vm_page_cowclear(pp);
vm_page_unwire(pp, 0);
/*
* Check for the object going away on us. This can
* happen since we don't hold a reference to it.
* If so, we're responsible for freeing the page.
*/
if (pp->wire_count == 0 && pp->object == NULL)
vm_page_free(pp);
vm_page_unlock(pp);
socow_stats.fail_sf_buf++;
return(0);
}
/*
* attach to mbuf
*/
MEXTADD(m0, sf_buf_kva(sf), PAGE_SIZE, socow_iodone,
(void*)sf_buf_kva(sf), sf, M_RDONLY, EXT_SFBUF);
m0->m_len = len;
m0->m_data = (caddr_t)sf_buf_kva(sf) + offset;
socow_stats.success++;
iov = uio->uio_iov;
iov->iov_base = (char *)iov->iov_base + m0->m_len;
iov->iov_len -= m0->m_len;
uio->uio_resid -= m0->m_len;
uio->uio_offset += m0->m_len;
if (iov->iov_len == 0) {
uio->uio_iov++;
uio->uio_iovcnt--;
}
return(m0->m_len);
}

View File

@ -575,4 +575,4 @@ madt_set_ids(void *dummy)
la->la_acpi_id);
}
}
SYSINIT(madt_set_ids, SI_SUB_CPU, SI_ORDER_ANY, madt_set_ids, NULL);
SYSINIT(madt_set_ids, SI_SUB_CPU, SI_ORDER_MIDDLE, madt_set_ids, NULL);

View File

@ -700,6 +700,7 @@ xen_hvm_init(enum xen_hvm_init_type init_type)
setup_xen_features();
cpu_ops = xen_hvm_cpu_ops;
vm_guest = VM_GUEST_XEN;
break;
case XEN_HVM_INIT_RESUME:
if (error != 0)
@ -742,6 +743,22 @@ xen_hvm_sysinit(void *arg __unused)
xen_hvm_init(XEN_HVM_INIT_COLD);
}
static void
xen_set_vcpu_id(void)
{
struct pcpu *pc;
int i;
/* Set vcpu_id to acpi_id */
CPU_FOREACH(i) {
pc = pcpu_find(i);
pc->pc_vcpu_id = pc->pc_acpi_id;
if (bootverbose)
printf("XEN: CPU %u has VCPU ID %u\n",
i, pc->pc_vcpu_id);
}
}
static void
xen_hvm_cpu_init(void)
{
@ -762,7 +779,7 @@ xen_hvm_cpu_init(void)
}
vcpu_info = DPCPU_PTR(vcpu_local_info);
cpu = PCPU_GET(acpi_id);
cpu = PCPU_GET(vcpu_id);
info.mfn = vtophys(vcpu_info) >> PAGE_SHIFT;
info.offset = vtophys(vcpu_info) - trunc_page(vtophys(vcpu_info));
@ -778,3 +795,4 @@ SYSINIT(xen_hvm_init, SI_SUB_HYPERVISOR, SI_ORDER_FIRST, xen_hvm_sysinit, NULL);
SYSINIT(xen_setup_cpus, SI_SUB_SMP, SI_ORDER_FIRST, xen_setup_cpus, NULL);
#endif
SYSINIT(xen_hvm_cpu_init, SI_SUB_INTR, SI_ORDER_FIRST, xen_hvm_cpu_init, NULL);
SYSINIT(xen_set_vcpu_id, SI_SUB_CPU, SI_ORDER_ANY, xen_set_vcpu_id, NULL);

View File

@ -611,9 +611,9 @@ xen_rebind_ipi(struct xenisrc *isrc)
{
#ifdef SMP
int cpu = isrc->xi_cpu;
int acpi_id = pcpu_find(cpu)->pc_acpi_id;
int vcpu_id = pcpu_find(cpu)->pc_vcpu_id;
int error;
struct evtchn_bind_ipi bind_ipi = { .vcpu = acpi_id };
struct evtchn_bind_ipi bind_ipi = { .vcpu = vcpu_id };
error = HYPERVISOR_event_channel_op(EVTCHNOP_bind_ipi,
&bind_ipi);
@ -640,10 +640,10 @@ static void
xen_rebind_virq(struct xenisrc *isrc)
{
int cpu = isrc->xi_cpu;
int acpi_id = pcpu_find(cpu)->pc_acpi_id;
int vcpu_id = pcpu_find(cpu)->pc_vcpu_id;
int error;
struct evtchn_bind_virq bind_virq = { .virq = isrc->xi_virq,
.vcpu = acpi_id };
.vcpu = vcpu_id };
error = HYPERVISOR_event_channel_op(EVTCHNOP_bind_virq,
&bind_virq);
@ -796,7 +796,7 @@ xen_intr_assign_cpu(struct intsrc *base_isrc, u_int apic_id)
#ifdef SMP
struct evtchn_bind_vcpu bind_vcpu;
struct xenisrc *isrc;
u_int to_cpu, acpi_id;
u_int to_cpu, vcpu_id;
int error;
#ifdef XENHVM
@ -805,7 +805,7 @@ xen_intr_assign_cpu(struct intsrc *base_isrc, u_int apic_id)
#endif
to_cpu = apic_cpuid(apic_id);
acpi_id = pcpu_find(to_cpu)->pc_acpi_id;
vcpu_id = pcpu_find(to_cpu)->pc_vcpu_id;
xen_intr_intrcnt_add(to_cpu);
mtx_lock(&xen_intr_isrc_lock);
@ -830,7 +830,7 @@ xen_intr_assign_cpu(struct intsrc *base_isrc, u_int apic_id)
}
bind_vcpu.port = isrc->xi_port;
bind_vcpu.vcpu = acpi_id;
bind_vcpu.vcpu = vcpu_id;
/*
* Allow interrupts to be fielded on the new VCPU before
@ -1063,9 +1063,9 @@ xen_intr_bind_virq(device_t dev, u_int virq, u_int cpu,
driver_filter_t filter, driver_intr_t handler, void *arg,
enum intr_type flags, xen_intr_handle_t *port_handlep)
{
int acpi_id = pcpu_find(cpu)->pc_acpi_id;
int vcpu_id = pcpu_find(cpu)->pc_vcpu_id;
struct xenisrc *isrc;
struct evtchn_bind_virq bind_virq = { .virq = virq, .vcpu = acpi_id };
struct evtchn_bind_virq bind_virq = { .virq = virq, .vcpu = vcpu_id };
int error;
/* Ensure the target CPU is ready to handle evtchn interrupts. */
@ -1126,9 +1126,9 @@ xen_intr_alloc_and_bind_ipi(device_t dev, u_int cpu,
xen_intr_handle_t *port_handlep)
{
#ifdef SMP
int acpi_id = pcpu_find(cpu)->pc_acpi_id;
int vcpu_id = pcpu_find(cpu)->pc_vcpu_id;
struct xenisrc *isrc;
struct evtchn_bind_ipi bind_ipi = { .vcpu = acpi_id };
struct evtchn_bind_ipi bind_ipi = { .vcpu = vcpu_id };
int error;
/* Ensure the target CPU is ready to handle evtchn interrupts. */

View File

@ -101,7 +101,7 @@ struct bhyvestats {
uint64_t vmexit_hlt;
uint64_t vmexit_pause;
uint64_t vmexit_mtrap;
uint64_t vmexit_paging;
uint64_t vmexit_inst_emul;
uint64_t cpu_switch_rotate;
uint64_t cpu_switch_direct;
int io_reset;
@ -208,14 +208,12 @@ fbsdrun_addcpu(struct vmctx *ctx, int vcpu, uint64_t rip)
vmexit[vcpu].rip = rip;
vmexit[vcpu].inst_length = 0;
if (vcpu == BSP) {
mt_vmm_info[vcpu].mt_ctx = ctx;
mt_vmm_info[vcpu].mt_vcpu = vcpu;
error = pthread_create(&mt_vmm_info[vcpu].mt_thr, NULL,
fbsdrun_start_thread, &mt_vmm_info[vcpu]);
assert(error == 0);
}
mt_vmm_info[vcpu].mt_ctx = ctx;
mt_vmm_info[vcpu].mt_vcpu = vcpu;
error = pthread_create(&mt_vmm_info[vcpu].mt_thr, NULL,
fbsdrun_start_thread, &mt_vmm_info[vcpu]);
assert(error == 0);
}
static int
@ -385,13 +383,13 @@ vmexit_mtrap(struct vmctx *ctx, struct vm_exit *vmexit, int *pvcpu)
}
static int
vmexit_paging(struct vmctx *ctx, struct vm_exit *vmexit, int *pvcpu)
vmexit_inst_emul(struct vmctx *ctx, struct vm_exit *vmexit, int *pvcpu)
{
int err;
stats.vmexit_paging++;
stats.vmexit_inst_emul++;
err = emulate_mem(ctx, *pvcpu, vmexit->u.paging.gpa,
&vmexit->u.paging.vie);
err = emulate_mem(ctx, *pvcpu, vmexit->u.inst_emul.gpa,
&vmexit->u.inst_emul.vie);
if (err) {
if (err == EINVAL) {
@ -400,7 +398,7 @@ vmexit_paging(struct vmctx *ctx, struct vm_exit *vmexit, int *pvcpu)
vmexit->rip);
} else if (err == ESRCH) {
fprintf(stderr, "Unhandled memory access to 0x%lx\n",
vmexit->u.paging.gpa);
vmexit->u.inst_emul.gpa);
}
return (VMEXIT_ABORT);
@ -416,7 +414,7 @@ static vmexit_handler_t handler[VM_EXITCODE_MAX] = {
[VM_EXITCODE_RDMSR] = vmexit_rdmsr,
[VM_EXITCODE_WRMSR] = vmexit_wrmsr,
[VM_EXITCODE_MTRAP] = vmexit_mtrap,
[VM_EXITCODE_PAGING] = vmexit_paging,
[VM_EXITCODE_INST_EMUL] = vmexit_inst_emul,
[VM_EXITCODE_SPINUP_AP] = vmexit_spinup_ap,
};

View File

@ -1048,7 +1048,7 @@ init_pci(struct vmctx *ctx)
* Accesses to memory addresses that are not allocated to system
* memory or PCI devices return 0xff's.
*/
error = vm_get_memory_seg(ctx, 0, &lowmem);
error = vm_get_memory_seg(ctx, 0, &lowmem, NULL);
assert(error == 0);
memset(&memp, 0, sizeof(struct mem_range));

View File

@ -341,14 +341,14 @@ rtc_init(struct vmctx *ctx)
* 0x34/0x35 - 64KB chunks above 16MB, below 4GB
* 0x5b/0x5c/0x5d - 64KB chunks above 4GB
*/
err = vm_get_memory_seg(ctx, 0, &lomem);
err = vm_get_memory_seg(ctx, 0, &lomem, NULL);
assert(err == 0);
lomem = (lomem - m_16MB) / m_64KB;
rtc_nvram[nvoff(RTC_LMEM_LSB)] = lomem;
rtc_nvram[nvoff(RTC_LMEM_MSB)] = lomem >> 8;
if (vm_get_memory_seg(ctx, m_4GB, &himem) == 0) {
if (vm_get_memory_seg(ctx, m_4GB, &himem, NULL) == 0) {
himem /= m_64KB;
rtc_nvram[nvoff(RTC_HMEM_LSB)] = himem;
rtc_nvram[nvoff(RTC_HMEM_SB)] = himem >> 8;

View File

@ -188,12 +188,13 @@ usage(void)
" [--unassign-pptdev=<bus/slot/func>]\n"
" [--set-mem=<memory in units of MB>]\n"
" [--get-lowmem]\n"
" [--get-highmem]\n",
" [--get-highmem]\n"
" [--get-gpa-pmap]\n",
progname);
exit(1);
}
static int get_stats, getcap, setcap, capval;
static int get_stats, getcap, setcap, capval, get_gpa_pmap;
static const char *capname;
static int create, destroy, get_lowmem, get_highmem;
static uint64_t memsize;
@ -377,18 +378,20 @@ enum {
SET_CAP,
CAPNAME,
UNASSIGN_PPTDEV,
GET_GPA_PMAP,
};
int
main(int argc, char *argv[])
{
char *vmname;
int error, ch, vcpu;
vm_paddr_t gpa;
int error, ch, vcpu, ptenum;
vm_paddr_t gpa, gpa_pmap;
size_t len;
struct vm_exit vmexit;
uint64_t ctl, eptp, bm, addr, u64;
uint64_t ctl, eptp, bm, addr, u64, pteval[4], *pte;
struct vmctx *ctx;
int wired;
uint64_t cr0, cr3, cr4, dr7, rsp, rip, rflags, efer, pat;
uint64_t rax, rbx, rcx, rdx, rsi, rdi, rbp;
@ -427,6 +430,7 @@ main(int argc, char *argv[])
{ "capname", REQ_ARG, 0, CAPNAME },
{ "unassign-pptdev", REQ_ARG, 0, UNASSIGN_PPTDEV },
{ "setcap", REQ_ARG, 0, SET_CAP },
{ "get-gpa-pmap", REQ_ARG, 0, GET_GPA_PMAP },
{ "getcap", NO_ARG, &getcap, 1 },
{ "get-stats", NO_ARG, &get_stats, 1 },
{ "get-desc-ds",NO_ARG, &get_desc_ds, 1 },
@ -666,6 +670,10 @@ main(int argc, char *argv[])
capval = strtoul(optarg, NULL, 0);
setcap = 1;
break;
case GET_GPA_PMAP:
gpa_pmap = strtoul(optarg, NULL, 0);
get_gpa_pmap = 1;
break;
case CAPNAME:
capname = optarg;
break;
@ -819,16 +827,18 @@ main(int argc, char *argv[])
if (!error && (get_lowmem || get_all)) {
gpa = 0;
error = vm_get_memory_seg(ctx, gpa, &len);
error = vm_get_memory_seg(ctx, gpa, &len, &wired);
if (error == 0)
printf("lowmem\t\t0x%016lx/%ld\n", gpa, len);
printf("lowmem\t\t0x%016lx/%ld%s\n", gpa, len,
wired ? " wired" : "");
}
if (!error && (get_highmem || get_all)) {
gpa = 4 * GB;
error = vm_get_memory_seg(ctx, gpa, &len);
error = vm_get_memory_seg(ctx, gpa, &len, &wired);
if (error == 0)
printf("highmem\t\t0x%016lx/%ld\n", gpa, len);
printf("highmem\t\t0x%016lx/%ld%s\n", gpa, len,
wired ? " wired" : "");
}
if (!error && (get_efer || get_all)) {
@ -1457,6 +1467,17 @@ main(int argc, char *argv[])
printf("Capability \"%s\" is not available\n", capname);
}
if (!error && get_gpa_pmap) {
error = vm_get_gpa_pmap(ctx, gpa_pmap, pteval, &ptenum);
if (error == 0) {
printf("gpa %#lx:", gpa_pmap);
pte = &pteval[0];
while (ptenum-- > 0)
printf(" %#lx", *pte++);
printf("\n");
}
}
if (!error && (getcap || get_all)) {
int captype, val, getcaptype;

View File

@ -492,8 +492,8 @@ static void
cb_getmem(void *arg, uint64_t *ret_lowmem, uint64_t *ret_highmem)
{
vm_get_memory_seg(ctx, 0, ret_lowmem);
vm_get_memory_seg(ctx, 4 * GB, ret_highmem);
vm_get_memory_seg(ctx, 0, ret_lowmem, NULL);
vm_get_memory_seg(ctx, 4 * GB, ret_highmem, NULL);
}
static const char *