diff --git a/lib/libvmmapi/vmmapi.c b/lib/libvmmapi/vmmapi.c index 93955c7c233e..982887688664 100644 --- a/lib/libvmmapi/vmmapi.c +++ b/lib/libvmmapi/vmmapi.c @@ -368,14 +368,13 @@ vm_get_register(struct vmctx *ctx, int vcpu, int reg, uint64_t *ret_val) } int -vm_run(struct vmctx *ctx, int vcpu, uint64_t rip, struct vm_exit *vmexit) +vm_run(struct vmctx *ctx, int vcpu, struct vm_exit *vmexit) { int error; struct vm_run vmrun; bzero(&vmrun, sizeof(vmrun)); vmrun.cpuid = vcpu; - vmrun.rip = rip; error = ioctl(ctx->fd, VM_RUN, &vmrun); bcopy(&vmrun.vm_exit, vmexit, sizeof(struct vm_exit)); @@ -399,35 +398,21 @@ vm_reinit(struct vmctx *ctx) return (ioctl(ctx->fd, VM_REINIT, 0)); } -static int -vm_inject_exception_real(struct vmctx *ctx, int vcpu, int vector, - int error_code, int error_code_valid) +int +vm_inject_exception(struct vmctx *ctx, int vcpu, int vector, int errcode_valid, + uint32_t errcode, int restart_instruction) { struct vm_exception exc; - bzero(&exc, sizeof(exc)); exc.cpuid = vcpu; exc.vector = vector; - exc.error_code = error_code; - exc.error_code_valid = error_code_valid; + exc.error_code = errcode; + exc.error_code_valid = errcode_valid; + exc.restart_instruction = restart_instruction; return (ioctl(ctx->fd, VM_INJECT_EXCEPTION, &exc)); } -int -vm_inject_exception(struct vmctx *ctx, int vcpu, int vector) -{ - - return (vm_inject_exception_real(ctx, vcpu, vector, 0, 0)); -} - -int -vm_inject_exception2(struct vmctx *ctx, int vcpu, int vector, int errcode) -{ - - return (vm_inject_exception_real(ctx, vcpu, vector, errcode, 1)); -} - int vm_apicid2vcpu(struct vmctx *ctx, int apicid) { @@ -1002,6 +987,7 @@ int vm_copy_setup(struct vmctx *ctx, int vcpu, struct vm_guest_paging *paging, uint64_t gla, size_t len, int prot, struct iovec *iov, int iovcnt) { + void *va; uint64_t gpa; int error, fault, i, n, off; @@ -1021,7 +1007,11 @@ vm_copy_setup(struct vmctx *ctx, int vcpu, struct vm_guest_paging *paging, off = gpa & PAGE_MASK; n = min(len, PAGE_SIZE - off); - iov->iov_base = (void *)gpa; + va = vm_map_gpa(ctx, gpa, n); + if (va == NULL) + return (-1); + + iov->iov_base = va; iov->iov_len = n; iov++; iovcnt--; @@ -1032,20 +1022,25 @@ vm_copy_setup(struct vmctx *ctx, int vcpu, struct vm_guest_paging *paging, return (0); } +void +vm_copy_teardown(struct vmctx *ctx, int vcpu, struct iovec *iov, int iovcnt) +{ + + return; +} + void vm_copyin(struct vmctx *ctx, int vcpu, struct iovec *iov, void *vp, size_t len) { const char *src; char *dst; - uint64_t gpa; size_t n; dst = vp; while (len) { assert(iov->iov_len); - gpa = (uint64_t)iov->iov_base; n = min(len, iov->iov_len); - src = vm_map_gpa(ctx, gpa, n); + src = iov->iov_base; bcopy(src, dst, n); iov++; @@ -1060,15 +1055,13 @@ vm_copyout(struct vmctx *ctx, int vcpu, const void *vp, struct iovec *iov, { const char *src; char *dst; - uint64_t gpa; size_t n; src = vp; while (len) { assert(iov->iov_len); - gpa = (uint64_t)iov->iov_base; n = min(len, iov->iov_len); - dst = vm_map_gpa(ctx, gpa, n); + dst = iov->iov_base; bcopy(src, dst, n); iov++; @@ -1146,3 +1139,63 @@ vm_set_intinfo(struct vmctx *ctx, int vcpu, uint64_t info1) error = ioctl(ctx->fd, VM_SET_INTINFO, &vmii); return (error); } + +int +vm_rtc_write(struct vmctx *ctx, int offset, uint8_t value) +{ + struct vm_rtc_data rtcdata; + int error; + + bzero(&rtcdata, sizeof(struct vm_rtc_data)); + rtcdata.offset = offset; + rtcdata.value = value; + error = ioctl(ctx->fd, VM_RTC_WRITE, &rtcdata); + return (error); +} + +int +vm_rtc_read(struct vmctx *ctx, int offset, uint8_t *retval) +{ + 
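/*
+	 * Read one byte from the virtual RTC via the VM_RTC_READ ioctl:
+	 * offsets 0-13 are the MC146818 date/time and control registers,
+	 * offsets 14-127 are NVRAM. For example, a caller could test the
+	 * 24-hour mode bit (RTCSB_24HR, 0x02) in register B:
+	 *
+	 *	uint8_t reg_b;
+	 *	if (vm_rtc_read(ctx, 11, &reg_b) == 0 && (reg_b & 0x02))
+	 *		guest RTC is in 24-hour mode;
+	 */
+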
struct vm_rtc_data rtcdata; + int error; + + bzero(&rtcdata, sizeof(struct vm_rtc_data)); + rtcdata.offset = offset; + error = ioctl(ctx->fd, VM_RTC_READ, &rtcdata); + if (error == 0) + *retval = rtcdata.value; + return (error); +} + +int +vm_rtc_settime(struct vmctx *ctx, time_t secs) +{ + struct vm_rtc_time rtctime; + int error; + + bzero(&rtctime, sizeof(struct vm_rtc_time)); + rtctime.secs = secs; + error = ioctl(ctx->fd, VM_RTC_SETTIME, &rtctime); + return (error); +} + +int +vm_rtc_gettime(struct vmctx *ctx, time_t *secs) +{ + struct vm_rtc_time rtctime; + int error; + + bzero(&rtctime, sizeof(struct vm_rtc_time)); + error = ioctl(ctx->fd, VM_RTC_GETTIME, &rtctime); + if (error == 0) + *secs = rtctime.secs; + return (error); +} + +int +vm_restart_instruction(void *arg, int vcpu) +{ + struct vmctx *ctx = arg; + + return (ioctl(ctx->fd, VM_RESTART_INSTRUCTION, &vcpu)); +} diff --git a/lib/libvmmapi/vmmapi.h b/lib/libvmmapi/vmmapi.h index fbb6ddd3acfb..06b29302640d 100644 --- a/lib/libvmmapi/vmmapi.h +++ b/lib/libvmmapi/vmmapi.h @@ -32,6 +32,12 @@ #include #include +/* + * API version for out-of-tree consumers like grub-bhyve for making compile + * time decisions. + */ +#define VMMAPI_VERSION 0101 /* 2 digit major followed by 2 digit minor */ + struct iovec; struct vmctx; enum x2apic_state; @@ -70,13 +76,12 @@ int vm_get_seg_desc(struct vmctx *ctx, int vcpu, int reg, struct seg_desc *seg_desc); int vm_set_register(struct vmctx *ctx, int vcpu, int reg, uint64_t val); int vm_get_register(struct vmctx *ctx, int vcpu, int reg, uint64_t *retval); -int vm_run(struct vmctx *ctx, int vcpu, uint64_t rip, - struct vm_exit *ret_vmexit); +int vm_run(struct vmctx *ctx, int vcpu, struct vm_exit *ret_vmexit); int vm_suspend(struct vmctx *ctx, enum vm_suspend_how how); int vm_reinit(struct vmctx *ctx); int vm_apicid2vcpu(struct vmctx *ctx, int apicid); -int vm_inject_exception(struct vmctx *ctx, int vcpu, int vec); -int vm_inject_exception2(struct vmctx *ctx, int vcpu, int vec, int errcode); +int vm_inject_exception(struct vmctx *ctx, int vcpu, int vector, + int errcode_valid, uint32_t errcode, int restart_instruction); int vm_lapic_irq(struct vmctx *ctx, int vcpu, int vector); int vm_lapic_local_irq(struct vmctx *ctx, int vcpu, int vector); int vm_lapic_msi(struct vmctx *ctx, uint64_t addr, uint64_t msg); @@ -132,6 +137,14 @@ void vm_copyin(struct vmctx *ctx, int vcpu, struct iovec *guest_iov, void *host_dst, size_t len); void vm_copyout(struct vmctx *ctx, int vcpu, const void *host_src, struct iovec *guest_iov, size_t len); +void vm_copy_teardown(struct vmctx *ctx, int vcpu, struct iovec *iov, + int iovcnt); + +/* RTC */ +int vm_rtc_write(struct vmctx *ctx, int offset, uint8_t value); +int vm_rtc_read(struct vmctx *ctx, int offset, uint8_t *retval); +int vm_rtc_settime(struct vmctx *ctx, time_t secs); +int vm_rtc_gettime(struct vmctx *ctx, time_t *secs); /* Reset vcpu register state */ int vcpu_reset(struct vmctx *ctx, int vcpu); diff --git a/share/examples/bhyve/vmrun.sh b/share/examples/bhyve/vmrun.sh index e1cbfcf90be2..d3d5cdf02212 100755 --- a/share/examples/bhyve/vmrun.sh +++ b/share/examples/bhyve/vmrun.sh @@ -39,7 +39,13 @@ DEFAULT_CONSOLE=stdio DEFAULT_VIRTIO_DISK="./diskdev" DEFAULT_ISOFILE="./release.iso" +errmsg() { + echo "*** $1" +} + usage() { + local msg=$1 + echo "Usage: vmrun.sh [-ahi] [-c ] [-C ] [-d ]" echo " [-e ] [-g ] [-H ]" echo " [-I ] [-m ]" @@ -58,18 +64,18 @@ usage() { echo " -m: memory size (default is ${DEFAULT_MEMSIZE})" echo " -t: tap device for virtio-net (default is 
$DEFAULT_TAPDEV)" echo "" - echo " This script needs to be executed with superuser privileges" - echo "" + [ -n "$msg" ] && errmsg "$msg" exit 1 } if [ `id -u` -ne 0 ]; then - usage + errmsg "This script must be executed with superuser privileges" + exit 1 fi kldstat -n vmm > /dev/null 2>&1 if [ $? -ne 0 ]; then - echo "vmm.ko is not loaded!" + errmsg "vmm.ko is not loaded" exit 1 fi @@ -143,7 +149,7 @@ fi shift $((${OPTIND} - 1)) if [ $# -ne 1 ]; then - usage + usage "virtual machine name not specified" fi vmname="$1" diff --git a/sys/amd64/include/vmm.h b/sys/amd64/include/vmm.h index 8a8c3f471831..cf7f5bcae0cd 100644 --- a/sys/amd64/include/vmm.h +++ b/sys/amd64/include/vmm.h @@ -286,9 +286,10 @@ int vm_unassign_pptdev(struct vm *vm, int bus, int slot, int func); struct vatpic *vm_atpic(struct vm *vm); struct vatpit *vm_atpit(struct vm *vm); struct vpmtmr *vm_pmtmr(struct vm *vm); +struct vrtc *vm_rtc(struct vm *vm); /* - * Inject exception 'vme' into the guest vcpu. This function returns 0 on + * Inject exception 'vector' into the guest vcpu. This function returns 0 on * success and non-zero on failure. * * Wrapper functions like 'vm_inject_gp()' should be preferred to calling @@ -298,7 +299,8 @@ struct vpmtmr *vm_pmtmr(struct vm *vm); * This function should only be called in the context of the thread that is * executing this vcpu. */ -int vm_inject_exception(struct vm *vm, int vcpuid, struct vm_exception *vme); +int vm_inject_exception(struct vm *vm, int vcpuid, int vector, int err_valid, + uint32_t errcode, int restart_instruction); /* * This function is called after a VM-exit that occurred during exception or @@ -444,8 +446,11 @@ struct vie { rex_x:1, rex_b:1, rex_present:1, + repz_present:1, /* REP/REPE/REPZ prefix */ + repnz_present:1, /* REPNE/REPNZ prefix */ opsize_override:1, /* Operand size override */ - addrsize_override:1; /* Address size override */ + addrsize_override:1, /* Address size override */ + segment_override:1; /* Segment override */ uint8_t mod:2, /* ModRM byte */ reg:4, @@ -461,6 +466,7 @@ struct vie { uint8_t scale; int base_register; /* VM_REG_GUEST_xyz */ int index_register; /* VM_REG_GUEST_xyz */ + int segment_register; /* VM_REG_GUEST_xyz */ int64_t displacement; /* optional addr displacement */ int64_t immediate; /* optional immediate operand */ @@ -627,4 +633,6 @@ vm_inject_ss(void *vm, int vcpuid, int errcode) void vm_inject_pf(void *vm, int vcpuid, int error_code, uint64_t cr2); +int vm_restart_instruction(void *vm, int vcpuid); + #endif /* _VMM_H_ */ diff --git a/sys/amd64/include/vmm_dev.h b/sys/amd64/include/vmm_dev.h index e4d839ef6549..9d031a9525e5 100644 --- a/sys/amd64/include/vmm_dev.h +++ b/sys/amd64/include/vmm_dev.h @@ -54,7 +54,6 @@ struct vm_seg_desc { /* data or code segment */ struct vm_run { int cpuid; - uint64_t rip; /* start running here */ struct vm_exit vm_exit; }; @@ -63,6 +62,7 @@ struct vm_exception { int vector; uint32_t error_code; int error_code_valid; + int restart_instruction; }; struct vm_lapic_msi { @@ -195,6 +195,15 @@ struct vm_intinfo { uint64_t info2; }; +struct vm_rtc_time { + time_t secs; +}; + +struct vm_rtc_data { + int offset; + uint8_t value; +}; + enum { /* general routines */ IOCNUM_ABIVERS = 0, @@ -228,6 +237,7 @@ enum { IOCNUM_LAPIC_MSI = 36, IOCNUM_LAPIC_LOCAL_IRQ = 37, IOCNUM_IOAPIC_PINCOUNT = 38, + IOCNUM_RESTART_INSTRUCTION = 39, /* PCI pass-thru */ IOCNUM_BIND_PPTDEV = 40, @@ -254,6 +264,12 @@ enum { /* vm_cpuset */ IOCNUM_ACTIVATE_CPU = 90, IOCNUM_GET_CPUSET = 91, + + /* RTC */ + IOCNUM_RTC_READ = 100, + 
IOCNUM_RTC_WRITE = 101, + IOCNUM_RTC_SETTIME = 102, + IOCNUM_RTC_GETTIME = 103, }; #define VM_RUN \ @@ -336,4 +352,14 @@ enum { _IOW('v', IOCNUM_SET_INTINFO, struct vm_intinfo) #define VM_GET_INTINFO \ _IOWR('v', IOCNUM_GET_INTINFO, struct vm_intinfo) +#define VM_RTC_WRITE \ + _IOW('v', IOCNUM_RTC_WRITE, struct vm_rtc_data) +#define VM_RTC_READ \ + _IOWR('v', IOCNUM_RTC_READ, struct vm_rtc_data) +#define VM_RTC_SETTIME \ + _IOW('v', IOCNUM_RTC_SETTIME, struct vm_rtc_time) +#define VM_RTC_GETTIME \ + _IOR('v', IOCNUM_RTC_GETTIME, struct vm_rtc_time) +#define VM_RESTART_INSTRUCTION \ + _IOW('v', IOCNUM_RESTART_INSTRUCTION, int) #endif diff --git a/sys/amd64/vmm/amd/svm.c b/sys/amd64/vmm/amd/svm.c index ab47041b4d88..88a846d5e36d 100644 --- a/sys/amd64/vmm/amd/svm.c +++ b/sys/amd64/vmm/amd/svm.c @@ -80,6 +80,7 @@ SYSCTL_NODE(_hw_vmm, OID_AUTO, svm, CTLFLAG_RW, NULL, NULL); #define AMD_CPUID_SVM_DECODE_ASSIST BIT(7) /* Decode assist */ #define AMD_CPUID_SVM_PAUSE_INC BIT(10) /* Pause intercept filter. */ #define AMD_CPUID_SVM_PAUSE_FTH BIT(12) /* Pause filter threshold */ +#define AMD_CPUID_SVM_AVIC BIT(13) /* AVIC present */ #define VMCB_CACHE_DEFAULT (VMCB_CACHE_ASID | \ VMCB_CACHE_IOPM | \ @@ -554,6 +555,7 @@ svm_vminit(struct vm *vm, pmap_t pmap) pml4_pa = svm_sc->nptp; for (i = 0; i < VM_MAXCPU; i++) { vcpu = svm_get_vcpu(svm_sc, i); + vcpu->nextrip = ~0; vcpu->lastcpu = NOCPU; vcpu->vmcb_pa = vtophys(&vcpu->vmcb); vmcb_init(svm_sc, i, iopm_pa, msrpm_pa, pml4_pa); @@ -1200,7 +1202,6 @@ svm_vmexit(struct svm_softc *svm_sc, int vcpu, struct vm_exit *vmexit) struct vmcb_state *state; struct vmcb_ctrl *ctrl; struct svm_regctx *ctx; - struct vm_exception exception; uint64_t code, info1, info2, val; uint32_t eax, ecx, edx; int error, errcode_valid, handled, idtvec, reflect; @@ -1314,6 +1315,7 @@ svm_vmexit(struct svm_softc *svm_sc, int vcpu, struct vm_exit *vmexit) /* fallthru */ default: errcode_valid = 0; + info1 = 0; break; } KASSERT(vmexit->inst_length == 0, ("invalid inst_length (%d) " @@ -1322,14 +1324,10 @@ svm_vmexit(struct svm_softc *svm_sc, int vcpu, struct vm_exit *vmexit) if (reflect) { /* Reflect the exception back into the guest */ - exception.vector = idtvec; - exception.error_code_valid = errcode_valid; - exception.error_code = errcode_valid ? info1 : 0; VCPU_CTR2(svm_sc->vm, vcpu, "Reflecting exception " - "%d/%#x into the guest", exception.vector, - exception.error_code); - error = vm_inject_exception(svm_sc->vm, vcpu, - &exception); + "%d/%#x into the guest", idtvec, (int)info1); + error = vm_inject_exception(svm_sc->vm, vcpu, idtvec, + errcode_valid, info1, 0); KASSERT(error == 0, ("%s: vm_inject_exception error %d", __func__, error)); } @@ -1476,15 +1474,24 @@ svm_inj_interrupts(struct svm_softc *sc, int vcpu, struct vlapic *vlapic) { struct vmcb_ctrl *ctrl; struct vmcb_state *state; + struct svm_vcpu *vcpustate; uint8_t v_tpr; int vector, need_intr_window, pending_apic_vector; state = svm_get_vmcb_state(sc, vcpu); ctrl = svm_get_vmcb_ctrl(sc, vcpu); + vcpustate = svm_get_vcpu(sc, vcpu); need_intr_window = 0; pending_apic_vector = 0; + if (vcpustate->nextrip != state->rip) { + ctrl->intr_shadow = 0; + VCPU_CTR2(sc->vm, vcpu, "Guest interrupt blocking " + "cleared due to rip change: %#lx/%#lx", + vcpustate->nextrip, state->rip); + } + /* * Inject pending events or exceptions for this vcpu. * @@ -1634,7 +1641,7 @@ done: * VMRUN. 
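	 * v_tpr is an unsigned quantity and the TPR is architecturally a
	 * 4-bit field, so only the upper bound needs to be asserted below.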
*/ v_tpr = vlapic_get_cr8(vlapic); - KASSERT(v_tpr >= 0 && v_tpr <= 15, ("invalid v_tpr %#x", v_tpr)); + KASSERT(v_tpr <= 15, ("invalid v_tpr %#x", v_tpr)); if (ctrl->v_tpr != v_tpr) { VCPU_CTR2(sc->vm, vcpu, "VMCB V_TPR changed from %#x to %#x", ctrl->v_tpr, v_tpr); @@ -1801,14 +1808,14 @@ static __inline void disable_gintr(void) { - __asm __volatile("clgi" : : :); + __asm __volatile("clgi"); } static __inline void enable_gintr(void) { - __asm __volatile("stgi" : : :); + __asm __volatile("stgi"); } /* @@ -1955,6 +1962,9 @@ svm_vmrun(void *arg, int vcpu, register_t rip, pmap_t pmap, /* #VMEXIT disables interrupts so re-enable them here. */ enable_gintr(); + /* Update 'nextrip' */ + vcpustate->nextrip = state->rip; + /* Handle #VMEXIT and if required return to user space. */ handled = svm_vmexit(svm_sc, vcpu, vmexit); } while (handled); diff --git a/sys/amd64/vmm/amd/svm_softc.h b/sys/amd64/vmm/amd/svm_softc.h index a5bb57c72883..de0c3f78903d 100644 --- a/sys/amd64/vmm/amd/svm_softc.h +++ b/sys/amd64/vmm/amd/svm_softc.h @@ -45,6 +45,7 @@ struct svm_vcpu { struct vmcb vmcb; /* hardware saved vcpu context */ struct svm_regctx swctx; /* software saved vcpu context */ uint64_t vmcb_pa; /* VMCB physical address */ + uint64_t nextrip; /* next instruction to be executed by guest */ int lastcpu; /* host cpu that the vcpu last ran on */ uint32_t dirty; /* state cache bits that must be cleared */ long eptgen; /* pmap->pm_eptgen when the vcpu last ran */ diff --git a/sys/amd64/vmm/amd/svm_support.S b/sys/amd64/vmm/amd/svm_support.S index 72327bdaf138..b363101f11f7 100644 --- a/sys/amd64/vmm/amd/svm_support.S +++ b/sys/amd64/vmm/amd/svm_support.S @@ -22,6 +22,8 @@ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ * + * $FreeBSD$ */ #include @@ -35,6 +37,10 @@ #define VENTER push %rbp ; mov %rsp,%rbp #define VLEAVE pop %rbp +#define VMLOAD .byte 0x0f, 0x01, 0xda +#define VMRUN .byte 0x0f, 0x01, 0xd8 +#define VMSAVE .byte 0x0f, 0x01, 0xdb + /* * svm_launch(uint64_t vmcb, struct svm_regctx *gctx) * %rdi: physical address of VMCB @@ -79,9 +85,9 @@ ENTRY(svm_launch) movq SCTX_RDI(%rsi), %rdi movq SCTX_RSI(%rsi), %rsi /* %rsi must be restored last */ - vmload %rax - vmrun %rax - vmsave %rax + VMLOAD + VMRUN + VMSAVE pop %rax /* pop guest context pointer from the stack */ diff --git a/sys/amd64/vmm/intel/vmcs.c b/sys/amd64/vmm/intel/vmcs.c index ae4d9db3274c..59625262314a 100644 --- a/sys/amd64/vmm/intel/vmcs.c +++ b/sys/amd64/vmm/intel/vmcs.c @@ -342,18 +342,6 @@ vmcs_init(struct vmcs *vmcs) */ VMPTRLD(vmcs); - /* Initialize guest IA32_PAT MSR with the default value */ - pat = PAT_VALUE(0, PAT_WRITE_BACK) | - PAT_VALUE(1, PAT_WRITE_THROUGH) | - PAT_VALUE(2, PAT_UNCACHED) | - PAT_VALUE(3, PAT_UNCACHEABLE) | - PAT_VALUE(4, PAT_WRITE_BACK) | - PAT_VALUE(5, PAT_WRITE_THROUGH) | - PAT_VALUE(6, PAT_UNCACHED) | - PAT_VALUE(7, PAT_UNCACHEABLE); - if ((error = vmwrite(VMCS_GUEST_IA32_PAT, pat)) != 0) - goto done; - /* Host state */ /* Initialize host IA32_PAT MSR */ diff --git a/sys/amd64/vmm/intel/vmx.c b/sys/amd64/vmm/intel/vmx.c index c3dd04e66e1f..b81e48b05f91 100644 --- a/sys/amd64/vmm/intel/vmx.c +++ b/sys/amd64/vmm/intel/vmx.c @@ -100,13 +100,11 @@ __FBSDID("$FreeBSD$"); (VM_EXIT_HOST_LMA | \ VM_EXIT_SAVE_EFER | \ VM_EXIT_LOAD_EFER | \ - VM_EXIT_ACKNOWLEDGE_INTERRUPT | \ - VM_EXIT_SAVE_PAT | \ - VM_EXIT_LOAD_PAT) + VM_EXIT_ACKNOWLEDGE_INTERRUPT) #define VM_EXIT_CTLS_ZERO_SETTING VM_EXIT_SAVE_DEBUG_CONTROLS -#define VM_ENTRY_CTLS_ONE_SETTING (VM_ENTRY_LOAD_EFER | VM_ENTRY_LOAD_PAT) +#define VM_ENTRY_CTLS_ONE_SETTING (VM_ENTRY_LOAD_EFER) #define VM_ENTRY_CTLS_ZERO_SETTING \ (VM_ENTRY_LOAD_DEBUG_CONTROLS | \ @@ -859,10 +857,6 @@ vmx_vminit(struct vm *vm, pmap_t pmap) * VM exit and entry respectively. It is also restored from the * host VMCS area on a VM exit. * - * MSR_PAT is saved and restored in the guest VMCS are on a VM exit - * and entry respectively. It is also restored from the host VMCS - * area on a VM exit. - * * The TSC MSR is exposed read-only. Writes are disallowed as that * will impact the host TSC. 
* XXX Writes would be implemented with a wrmsr trap, and @@ -874,7 +868,6 @@ vmx_vminit(struct vm *vm, pmap_t pmap) guest_msr_rw(vmx, MSR_SYSENTER_ESP_MSR) || guest_msr_rw(vmx, MSR_SYSENTER_EIP_MSR) || guest_msr_rw(vmx, MSR_EFER) || - guest_msr_rw(vmx, MSR_PAT) || guest_msr_ro(vmx, MSR_TSC)) panic("vmx_vminit: error setting guest msr access"); @@ -941,6 +934,7 @@ vmx_vminit(struct vm *vm, pmap_t pmap) vmx->cap[i].proc_ctls = procbased_ctls; vmx->cap[i].proc_ctls2 = procbased_ctls2; + vmx->state[i].nextrip = ~0; vmx->state[i].lastcpu = NOCPU; vmx->state[i].vpid = vpid[i]; @@ -1169,12 +1163,24 @@ vmx_inject_nmi(struct vmx *vmx, int vcpu) } static void -vmx_inject_interrupts(struct vmx *vmx, int vcpu, struct vlapic *vlapic) +vmx_inject_interrupts(struct vmx *vmx, int vcpu, struct vlapic *vlapic, + uint64_t guestrip) { int vector, need_nmi_exiting, extint_pending; uint64_t rflags, entryinfo; uint32_t gi, info; + if (vmx->state[vcpu].nextrip != guestrip) { + gi = vmcs_read(VMCS_GUEST_INTERRUPTIBILITY); + if (gi & HWINTR_BLOCKING) { + VCPU_CTR2(vmx->vm, vcpu, "Guest interrupt blocking " + "cleared due to rip change: %#lx/%#lx", + vmx->state[vcpu].nextrip, guestrip); + gi &= ~HWINTR_BLOCKING; + vmcs_write(VMCS_GUEST_INTERRUPTIBILITY, gi); + } + } + if (vm_entry_intinfo(vmx->vm, vcpu, &entryinfo)) { KASSERT((entryinfo & VMCS_INTR_VALID) != 0, ("%s: entry " "intinfo is not valid: %#lx", __func__, entryinfo)); @@ -1771,7 +1777,7 @@ vmexit_inst_emul(struct vm_exit *vmexit, uint64_t gpa, uint64_t gla) { struct vm_guest_paging *paging; uint32_t csar; - + paging = &vmexit->u.inst_emul.paging; vmexit->exitcode = VM_EXITCODE_INST_EMUL; @@ -2060,12 +2066,11 @@ emulate_rdmsr(struct vmx *vmx, int vcpuid, u_int num, bool *retu) static int vmx_exit_process(struct vmx *vmx, int vcpu, struct vm_exit *vmexit) { - int error, handled, in; + int error, errcode, errcode_valid, handled, in; struct vmxctx *vmxctx; struct vlapic *vlapic; struct vm_inout_str *vis; struct vm_task_switch *ts; - struct vm_exception vmexc; uint32_t eax, ecx, edx, idtvec_info, idtvec_err, intr_info, inst_info; uint32_t intr_type, intr_vec, reason; uint64_t exitintinfo, qual, gpa; @@ -2250,6 +2255,7 @@ vmx_exit_process(struct vmx *vmx, int vcpu, struct vm_exit *vmexit) case EXIT_REASON_MTF: vmm_stat_incr(vmx->vm, vcpu, VMEXIT_MTRAP, 1); vmexit->exitcode = VM_EXITCODE_MTRAP; + vmexit->inst_length = 0; break; case EXIT_REASON_PAUSE: vmm_stat_incr(vmx->vm, vcpu, VMEXIT_PAUSE, 1); @@ -2376,15 +2382,15 @@ vmx_exit_process(struct vmx *vmx, int vcpu, struct vm_exit *vmexit) vmcs_write(VMCS_ENTRY_INST_LENGTH, vmexit->inst_length); /* Reflect all other exceptions back into the guest */ - bzero(&vmexc, sizeof(struct vm_exception)); - vmexc.vector = intr_vec; + errcode_valid = errcode = 0; if (intr_info & VMCS_INTR_DEL_ERRCODE) { - vmexc.error_code_valid = 1; - vmexc.error_code = vmcs_read(VMCS_EXIT_INTR_ERRCODE); + errcode_valid = 1; + errcode = vmcs_read(VMCS_EXIT_INTR_ERRCODE); } VCPU_CTR2(vmx->vm, vcpu, "Reflecting exception %d/%#x into " - "the guest", vmexc.vector, vmexc.error_code); - error = vm_inject_exception(vmx->vm, vcpu, &vmexc); + "the guest", intr_vec, errcode); + error = vm_inject_exception(vmx->vm, vcpu, intr_vec, + errcode_valid, errcode, 0); KASSERT(error == 0, ("%s: vm_inject_exception error %d", __func__, error)); return (1); @@ -2399,6 +2405,7 @@ vmx_exit_process(struct vmx *vmx, int vcpu, struct vm_exit *vmexit) if (vm_mem_allocated(vmx->vm, gpa) || apic_access_fault(vmx, vcpu, gpa)) { vmexit->exitcode = VM_EXITCODE_PAGING; + 
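/*
+			 * A paging exit always restarts the faulting
+			 * instruction (vm_handle_paging() asserts as much),
+			 * so report an instruction length of zero:
+			 */
+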
vmexit->inst_length = 0; vmexit->u.paging.gpa = gpa; vmexit->u.paging.fault_type = ept_fault_type(qual); vmm_stat_incr(vmx->vm, vcpu, VMEXIT_NESTED_FAULT, 1); @@ -2540,7 +2547,7 @@ vmx_exit_handle_nmi(struct vmx *vmx, int vcpuid, struct vm_exit *vmexit) } static int -vmx_run(void *arg, int vcpu, register_t startrip, pmap_t pmap, +vmx_run(void *arg, int vcpu, register_t rip, pmap_t pmap, void *rendezvous_cookie, void *suspend_cookie) { int rc, handled, launched; @@ -2550,7 +2557,6 @@ vmx_run(void *arg, int vcpu, register_t startrip, pmap_t pmap, struct vmcs *vmcs; struct vm_exit *vmexit; struct vlapic *vlapic; - uint64_t rip; uint32_t exit_reason; vmx = arg; @@ -2578,11 +2584,13 @@ vmx_run(void *arg, int vcpu, register_t startrip, pmap_t pmap, */ vmcs_write(VMCS_HOST_CR3, rcr3()); - vmcs_write(VMCS_GUEST_RIP, startrip); + vmcs_write(VMCS_GUEST_RIP, rip); vmx_set_pcpu_defaults(vmx, vcpu, pmap); do { - handled = UNHANDLED; + KASSERT(vmcs_guest_rip() == rip, ("%s: vmcs guest rip mismatch " + "%#lx/%#lx", __func__, vmcs_guest_rip(), rip)); + handled = UNHANDLED; /* * Interrupts are disabled from this point on until the * guest starts executing. This is done for the following @@ -2602,7 +2610,7 @@ vmx_run(void *arg, int vcpu, register_t startrip, pmap_t pmap, * pmap_invalidate_ept(). */ disable_intr(); - vmx_inject_interrupts(vmx, vcpu, vlapic); + vmx_inject_interrupts(vmx, vcpu, vlapic, rip); /* * Check for vcpu suspension after injecting events because @@ -2611,20 +2619,20 @@ vmx_run(void *arg, int vcpu, register_t startrip, pmap_t pmap, */ if (vcpu_suspended(suspend_cookie)) { enable_intr(); - vm_exit_suspended(vmx->vm, vcpu, vmcs_guest_rip()); + vm_exit_suspended(vmx->vm, vcpu, rip); break; } if (vcpu_rendezvous_pending(rendezvous_cookie)) { enable_intr(); - vm_exit_rendezvous(vmx->vm, vcpu, vmcs_guest_rip()); + vm_exit_rendezvous(vmx->vm, vcpu, rip); break; } if (vcpu_should_yield(vm, vcpu)) { enable_intr(); - vm_exit_astpending(vmx->vm, vcpu, vmcs_guest_rip()); - vmx_astpending_trace(vmx, vcpu, vmexit->rip); + vm_exit_astpending(vmx->vm, vcpu, rip); + vmx_astpending_trace(vmx, vcpu, rip); handled = HANDLED; break; } @@ -2638,6 +2646,9 @@ vmx_run(void *arg, int vcpu, register_t startrip, pmap_t pmap, vmexit->u.vmx.exit_reason = exit_reason = vmcs_exit_reason(); vmexit->u.vmx.exit_qualification = vmcs_exit_qualification(); + /* Update 'nextrip' */ + vmx->state[vcpu].nextrip = rip; + if (rc == VMX_GUEST_VMEXIT) { vmx_exit_handle_nmi(vmx, vcpu, vmexit); enable_intr(); @@ -2648,6 +2659,7 @@ vmx_run(void *arg, int vcpu, register_t startrip, pmap_t pmap, } launched = 1; vmx_exit_trace(vmx, vcpu, rip, exit_reason, handled); + rip = vmexit->rip; } while (handled); /* diff --git a/sys/amd64/vmm/intel/vmx.h b/sys/amd64/vmm/intel/vmx.h index 2124554887dc..bc488614ff74 100644 --- a/sys/amd64/vmm/intel/vmx.h +++ b/sys/amd64/vmm/intel/vmx.h @@ -78,6 +78,7 @@ struct vmxcap { }; struct vmxstate { + uint64_t nextrip; /* next instruction to be executed by guest */ int lastcpu; /* host cpu that this 'vcpu' last ran on */ uint16_t vpid; }; @@ -102,6 +103,7 @@ enum { IDX_MSR_STAR, IDX_MSR_SF_MASK, IDX_MSR_KGSBASE, + IDX_MSR_PAT, GUEST_MSR_NUM /* must be the last enumeration */ }; diff --git a/sys/amd64/vmm/intel/vmx_msr.c b/sys/amd64/vmm/intel/vmx_msr.c index f6bbf2aed7d9..e5177786e4a4 100644 --- a/sys/amd64/vmm/intel/vmx_msr.c +++ b/sys/amd64/vmm/intel/vmx_msr.c @@ -230,6 +230,25 @@ westmere_cpu(void) return (false); } +static bool +pat_valid(uint64_t val) +{ + int i, pa; + + /* + * From Intel SDM: Table 
"Memory Types That Can Be Encoded With PAT" + * + * Extract PA0 through PA7 and validate that each one encodes a + * valid memory type. + */ + for (i = 0; i < 8; i++) { + pa = (val >> (i * 8)) & 0xff; + if (pa == 2 || pa == 3 || pa >= 8) + return (false); + } + return (true); +} + void vmx_msr_init(void) { @@ -302,6 +321,10 @@ vmx_msr_init(void) void vmx_msr_guest_init(struct vmx *vmx, int vcpuid) { + uint64_t *guest_msrs; + + guest_msrs = vmx->guest_msrs[vcpuid]; + /* * The permissions bitmap is shared between all vcpus so initialize it * once when initializing the vBSP. @@ -313,6 +336,19 @@ vmx_msr_guest_init(struct vmx *vmx, int vcpuid) guest_msr_rw(vmx, MSR_SF_MASK); guest_msr_rw(vmx, MSR_KGSBASE); } + + /* + * Initialize guest IA32_PAT MSR with default value after reset. + */ + guest_msrs[IDX_MSR_PAT] = PAT_VALUE(0, PAT_WRITE_BACK) | + PAT_VALUE(1, PAT_WRITE_THROUGH) | + PAT_VALUE(2, PAT_UNCACHED) | + PAT_VALUE(3, PAT_UNCACHEABLE) | + PAT_VALUE(4, PAT_WRITE_BACK) | + PAT_VALUE(5, PAT_WRITE_THROUGH) | + PAT_VALUE(6, PAT_UNCACHED) | + PAT_VALUE(7, PAT_UNCACHEABLE); + return; } @@ -353,7 +389,11 @@ vmx_msr_guest_exit(struct vmx *vmx, int vcpuid) int vmx_rdmsr(struct vmx *vmx, int vcpuid, u_int num, uint64_t *val, bool *retu) { - int error = 0; + const uint64_t *guest_msrs; + int error; + + guest_msrs = vmx->guest_msrs[vcpuid]; + error = 0; switch (num) { case MSR_IA32_MISC_ENABLE: @@ -366,6 +406,9 @@ vmx_rdmsr(struct vmx *vmx, int vcpuid, u_int num, uint64_t *val, bool *retu) case MSR_TURBO_RATIO_LIMIT1: *val = turbo_ratio_limit; break; + case MSR_PAT: + *val = guest_msrs[IDX_MSR_PAT]; + break; default: error = EINVAL; break; @@ -376,10 +419,13 @@ vmx_rdmsr(struct vmx *vmx, int vcpuid, u_int num, uint64_t *val, bool *retu) int vmx_wrmsr(struct vmx *vmx, int vcpuid, u_int num, uint64_t val, bool *retu) { + uint64_t *guest_msrs; uint64_t changed; int error; + guest_msrs = vmx->guest_msrs[vcpuid]; error = 0; + switch (num) { case MSR_IA32_MISC_ENABLE: changed = val ^ misc_enable; @@ -401,6 +447,12 @@ vmx_wrmsr(struct vmx *vmx, int vcpuid, u_int num, uint64_t val, bool *retu) error = EINVAL; break; + case MSR_PAT: + if (pat_valid(val)) + guest_msrs[IDX_MSR_PAT] = val; + else + vm_inject_gp(vmx->vm, vcpuid); + break; default: error = EINVAL; break; diff --git a/sys/amd64/vmm/io/vhpet.c b/sys/amd64/vmm/io/vhpet.c index 46e5ca723927..a4c96cd19b31 100644 --- a/sys/amd64/vmm/io/vhpet.c +++ b/sys/amd64/vmm/io/vhpet.c @@ -104,7 +104,6 @@ vhpet_capabilities(void) uint64_t cap = 0; cap |= 0x8086 << 16; /* vendor id */ - cap |= HPET_CAP_LEG_RT; /* legacy routing capable */ cap |= (VHPET_NUM_TIMERS - 1) << 8; /* number of timers */ cap |= 1; /* revision */ cap &= ~HPET_CAP_COUNT_SIZE; /* 32-bit timer */ @@ -127,15 +126,6 @@ vhpet_timer_msi_enabled(struct vhpet *vhpet, int n) { const uint64_t msi_enable = HPET_TCAP_FSB_INT_DEL | HPET_TCNF_FSB_EN; - /* - * LegacyReplacement Route configuration takes precedence over MSI - * for timers 0 and 1. - */ - if (n == 0 || n == 1) { - if (vhpet->config & HPET_CNF_LEG_RT) - return (false); - } - if ((vhpet->timer[n].cap_config & msi_enable) == msi_enable) return (true); else @@ -152,41 +142,9 @@ vhpet_timer_ioapic_pin(struct vhpet *vhpet, int n) if (vhpet_timer_msi_enabled(vhpet, n)) return (0); - if (vhpet->config & HPET_CNF_LEG_RT) { - /* - * In "legacy routing" timers 0 and 1 are connected to - * ioapic pins 2 and 8 respectively. 
- */ - switch (n) { - case 0: - return (2); - case 1: - return (8); - } - } - return ((vhpet->timer[n].cap_config & HPET_TCNF_INT_ROUTE) >> 9); } -static __inline int -vhpet_timer_atpic_pin(struct vhpet *vhpet, int n) -{ - if (vhpet->config & HPET_CNF_LEG_RT) { - /* - * In "legacy routing" timers 0 and 1 are connected to - * 8259 master pin 0 and slave pin 0 respectively. - */ - switch (n) { - case 0: - return (0); - case 1: - return (8); - } - } - - return (-1); -} - static uint32_t vhpet_counter(struct vhpet *vhpet, sbintime_t *nowptr) { @@ -216,17 +174,12 @@ vhpet_counter(struct vhpet *vhpet, sbintime_t *nowptr) static void vhpet_timer_clear_isr(struct vhpet *vhpet, int n) { - int pin, legacy_pin; + int pin; if (vhpet->isr & (1 << n)) { pin = vhpet_timer_ioapic_pin(vhpet, n); KASSERT(pin != 0, ("vhpet timer %d irq incorrectly routed", n)); vioapic_deassert_irq(vhpet->vm, pin); - - legacy_pin = vhpet_timer_atpic_pin(vhpet, n); - if (legacy_pin != -1) - vatpic_deassert_irq(vhpet->vm, legacy_pin); - vhpet->isr &= ~(1 << n); } } @@ -252,12 +205,6 @@ vhpet_timer_edge_trig(struct vhpet *vhpet, int n) KASSERT(!vhpet_timer_msi_enabled(vhpet, n), ("vhpet_timer_edge_trig: " "timer %d is using MSI", n)); - /* The legacy replacement interrupts are always edge triggered */ - if (vhpet->config & HPET_CNF_LEG_RT) { - if (n == 0 || n == 1) - return (true); - } - if ((vhpet->timer[n].cap_config & HPET_TCNF_INT_TYPE) == 0) return (true); else @@ -267,7 +214,7 @@ vhpet_timer_edge_trig(struct vhpet *vhpet, int n) static void vhpet_timer_interrupt(struct vhpet *vhpet, int n) { - int pin, legacy_pin; + int pin; /* If interrupts are not enabled for this timer then just return. */ if (!vhpet_timer_interrupt_enabled(vhpet, n)) @@ -293,17 +240,11 @@ vhpet_timer_interrupt(struct vhpet *vhpet, int n) return; } - legacy_pin = vhpet_timer_atpic_pin(vhpet, n); - if (vhpet_timer_edge_trig(vhpet, n)) { vioapic_pulse_irq(vhpet->vm, pin); - if (legacy_pin != -1) - vatpic_pulse_irq(vhpet->vm, legacy_pin); } else { vhpet->isr |= 1 << n; vioapic_assert_irq(vhpet->vm, pin); - if (legacy_pin != -1) - vatpic_assert_irq(vhpet->vm, legacy_pin); } } @@ -579,6 +520,13 @@ vhpet_mmio_write(void *vm, int vcpuid, uint64_t gpa, uint64_t val, int size, counter = vhpet_counter(vhpet, nowptr); oldval = vhpet->config; update_register(&vhpet->config, data, mask); + + /* + * LegacyReplacement Routing is not supported so clear the + * bit explicitly. + */ + vhpet->config &= ~HPET_CNF_LEG_RT; + if ((oldval ^ vhpet->config) & HPET_CNF_ENABLE) { if (vhpet_counter_enabled(vhpet)) { vhpet_start_counting(vhpet); diff --git a/sys/amd64/vmm/io/vrtc.c b/sys/amd64/vmm/io/vrtc.c new file mode 100644 index 000000000000..d5e93dc07381 --- /dev/null +++ b/sys/amd64/vmm/io/vrtc.c @@ -0,0 +1,952 @@ +/*- + * Copyright (c) 2014, Neel Natu (neel@freebsd.org) + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice unmodified, this list of conditions, and the following + * disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. + * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT + * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF + * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include +__FBSDID("$FreeBSD$"); + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include + +#include "vmm_ktr.h" +#include "vatpic.h" +#include "vioapic.h" +#include "vrtc.h" + +/* Register layout of the RTC */ +struct rtcdev { + uint8_t sec; + uint8_t alarm_sec; + uint8_t min; + uint8_t alarm_min; + uint8_t hour; + uint8_t alarm_hour; + uint8_t day_of_week; + uint8_t day_of_month; + uint8_t month; + uint8_t year; + uint8_t reg_a; + uint8_t reg_b; + uint8_t reg_c; + uint8_t reg_d; + uint8_t nvram[128 - 14]; +} __packed; +CTASSERT(sizeof(struct rtcdev) == 128); + +struct vrtc { + struct vm *vm; + struct mtx mtx; + struct callout callout; + u_int addr; /* RTC register to read or write */ + sbintime_t base_uptime; + time_t base_rtctime; + struct rtcdev rtcdev; +}; + +#define VRTC_LOCK(vrtc) mtx_lock(&((vrtc)->mtx)) +#define VRTC_UNLOCK(vrtc) mtx_unlock(&((vrtc)->mtx)) +#define VRTC_LOCKED(vrtc) mtx_owned(&((vrtc)->mtx)) + +/* + * RTC time is considered "broken" if: + * - RTC updates are halted by the guest + * - RTC date/time fields have invalid values + */ +#define VRTC_BROKEN_TIME ((time_t)-1) + +#define RTC_IRQ 8 +#define RTCSB_BIN 0x04 +#define RTCSB_ALL_INTRS (RTCSB_UINTR | RTCSB_AINTR | RTCSB_PINTR) +#define rtc_halted(vrtc) ((vrtc->rtcdev.reg_b & RTCSB_HALT) != 0) +#define aintr_enabled(vrtc) (((vrtc)->rtcdev.reg_b & RTCSB_AINTR) != 0) +#define pintr_enabled(vrtc) (((vrtc)->rtcdev.reg_b & RTCSB_PINTR) != 0) +#define uintr_enabled(vrtc) (((vrtc)->rtcdev.reg_b & RTCSB_UINTR) != 0) + +static void vrtc_callout_handler(void *arg); +static void vrtc_set_reg_c(struct vrtc *vrtc, uint8_t newval); + +static MALLOC_DEFINE(M_VRTC, "vrtc", "bhyve virtual rtc"); + +SYSCTL_DECL(_hw_vmm); +SYSCTL_NODE(_hw_vmm, OID_AUTO, vrtc, CTLFLAG_RW, NULL, NULL); + +static int rtc_flag_broken_time = 1; +SYSCTL_INT(_hw_vmm_vrtc, OID_AUTO, flag_broken_time, CTLFLAG_RDTUN, + &rtc_flag_broken_time, 0, "Stop guest when invalid RTC time is detected"); + +static __inline bool +divider_enabled(int reg_a) +{ + /* + * The RTC is counting only when dividers are not held in reset. 
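+	 * Bits 6:4 of reg_a select the time base: 010 (0x20) is normal
+	 * 32.768 kHz operation and 11x holds the divider chain in reset,
+	 * so only 0x20 is treated as a running clock here.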
+ */ + return ((reg_a & 0x70) == 0x20); +} + +static __inline bool +update_enabled(struct vrtc *vrtc) +{ + /* + * RTC date/time can be updated only if: + * - divider is not held in reset + * - guest has not disabled updates + * - the date/time fields have valid contents + */ + if (!divider_enabled(vrtc->rtcdev.reg_a)) + return (false); + + if (rtc_halted(vrtc)) + return (false); + + if (vrtc->base_rtctime == VRTC_BROKEN_TIME) + return (false); + + return (true); +} + +static time_t +vrtc_curtime(struct vrtc *vrtc) +{ + sbintime_t now, delta; + time_t t; + + KASSERT(VRTC_LOCKED(vrtc), ("%s: vrtc not locked", __func__)); + + t = vrtc->base_rtctime; + if (update_enabled(vrtc)) { + now = sbinuptime(); + delta = now - vrtc->base_uptime; + KASSERT(delta >= 0, ("vrtc_curtime: uptime went backwards: " + "%#lx to %#lx", vrtc->base_uptime, now)); + t += delta / SBT_1S; + } + return (t); +} + +static __inline uint8_t +rtcset(struct rtcdev *rtc, int val) +{ + + KASSERT(val >= 0 && val < 100, ("%s: invalid bin2bcd index %d", + __func__, val)); + + return ((rtc->reg_b & RTCSB_BIN) ? val : bin2bcd_data[val]); +} + +static void +secs_to_rtc(time_t rtctime, struct vrtc *vrtc, int force_update) +{ + struct clocktime ct; + struct timespec ts; + struct rtcdev *rtc; + int hour; + + KASSERT(VRTC_LOCKED(vrtc), ("%s: vrtc not locked", __func__)); + + if (rtctime < 0) { + KASSERT(rtctime == VRTC_BROKEN_TIME, + ("%s: invalid vrtc time %#lx", __func__, rtctime)); + return; + } + + /* + * If the RTC is halted then the guest has "ownership" of the + * date/time fields. Don't update the RTC date/time fields in + * this case (unless forced). + */ + if (rtc_halted(vrtc) && !force_update) + return; + + ts.tv_sec = rtctime; + ts.tv_nsec = 0; + clock_ts_to_ct(&ts, &ct); + + KASSERT(ct.sec >= 0 && ct.sec <= 59, ("invalid clocktime sec %d", + ct.sec)); + KASSERT(ct.min >= 0 && ct.min <= 59, ("invalid clocktime min %d", + ct.min)); + KASSERT(ct.hour >= 0 && ct.hour <= 23, ("invalid clocktime hour %d", + ct.hour)); + KASSERT(ct.dow >= 0 && ct.dow <= 6, ("invalid clocktime wday %d", + ct.dow)); + KASSERT(ct.day >= 1 && ct.day <= 31, ("invalid clocktime mday %d", + ct.day)); + KASSERT(ct.mon >= 1 && ct.mon <= 12, ("invalid clocktime month %d", + ct.mon)); + KASSERT(ct.year >= POSIX_BASE_YEAR, ("invalid clocktime year %d", + ct.year)); + + rtc = &vrtc->rtcdev; + rtc->sec = rtcset(rtc, ct.sec); + rtc->min = rtcset(rtc, ct.min); + + hour = ct.hour; + if ((rtc->reg_b & RTCSB_24HR) == 0) + hour = (hour % 12) + 1; /* convert to a 12-hour format */ + + rtc->hour = rtcset(rtc, hour); + + if ((rtc->reg_b & RTCSB_24HR) == 0 && ct.hour >= 12) + rtc->hour |= 0x80; /* set MSB to indicate PM */ + + rtc->day_of_week = rtcset(rtc, ct.dow + 1); + rtc->day_of_month = rtcset(rtc, ct.day); + rtc->month = rtcset(rtc, ct.mon); + rtc->year = rtcset(rtc, ct.year % 100); +} + +static int +rtcget(struct rtcdev *rtc, int val, int *retval) +{ + uint8_t upper, lower; + + if (rtc->reg_b & RTCSB_BIN) { + *retval = val; + return (0); + } + + lower = val & 0xf; + upper = (val >> 4) & 0xf; + + if (lower > 9 || upper > 9) + return (-1); + + *retval = upper * 10 + lower; + return (0); +} + +static time_t +rtc_to_secs(struct vrtc *vrtc) +{ + struct clocktime ct; + struct timespec ts; + struct rtcdev *rtc; + struct vm *vm; + int error, hour, pm, year; + + KASSERT(VRTC_LOCKED(vrtc), ("%s: vrtc not locked", __func__)); + + vm = vrtc->vm; + rtc = &vrtc->rtcdev; + + bzero(&ct, sizeof(struct clocktime)); + + error = rtcget(rtc, rtc->sec, &ct.sec); + if (error || ct.sec < 
0 || ct.sec > 59) { + VM_CTR2(vm, "Invalid RTC sec %#x/%d", rtc->sec, ct.sec); + goto fail; + } + + error = rtcget(rtc, rtc->min, &ct.min); + if (error || ct.min < 0 || ct.min > 59) { + VM_CTR2(vm, "Invalid RTC min %#x/%d", rtc->min, ct.min); + goto fail; + } + + pm = 0; + hour = rtc->hour; + if ((rtc->reg_b & RTCSB_24HR) == 0) { + if (hour & 0x80) { + hour &= ~0x80; + pm = 1; + } + } + error = rtcget(rtc, hour, &ct.hour); + if ((rtc->reg_b & RTCSB_24HR) == 0) { + ct.hour -= 1; + if (pm) + ct.hour += 12; + } + + if (error || ct.hour < 0 || ct.hour > 23) { + VM_CTR2(vm, "Invalid RTC hour %#x/%d", rtc->hour, ct.hour); + goto fail; + } + + /* + * Ignore 'rtc->dow' because some guests like Linux don't bother + * setting it at all while others like OpenBSD/i386 set it incorrectly. + * + * clock_ct_to_ts() does not depend on 'ct.dow' anyways so ignore it. + */ + ct.dow = -1; + + error = rtcget(rtc, rtc->day_of_month, &ct.day); + if (error || ct.day < 1 || ct.day > 31) { + VM_CTR2(vm, "Invalid RTC mday %#x/%d", rtc->day_of_month, + ct.day); + goto fail; + } + + error = rtcget(rtc, rtc->month, &ct.mon); + if (error || ct.mon < 1 || ct.mon > 12) { + VM_CTR2(vm, "Invalid RTC month %#x/%d", rtc->month, ct.mon); + goto fail; + } + + error = rtcget(rtc, rtc->year, &year); + if (error || year < 0 || year > 99) { + VM_CTR2(vm, "Invalid RTC year %#x/%d", rtc->year, year); + goto fail; + } + if (year >= 70) + ct.year = 1900 + year; + else + ct.year = 2000 + year; + + error = clock_ct_to_ts(&ct, &ts); + if (error || ts.tv_sec < 0) { + VM_CTR3(vm, "Invalid RTC clocktime.date %04d-%02d-%02d", + ct.year, ct.mon, ct.day); + VM_CTR3(vm, "Invalid RTC clocktime.time %02d:%02d:%02d", + ct.hour, ct.min, ct.sec); + goto fail; + } + return (ts.tv_sec); /* success */ +fail: + return (VRTC_BROKEN_TIME); /* failure */ +} + +static int +vrtc_time_update(struct vrtc *vrtc, time_t newtime) +{ + struct rtcdev *rtc; + time_t oldtime; + uint8_t alarm_sec, alarm_min, alarm_hour; + + KASSERT(VRTC_LOCKED(vrtc), ("%s: vrtc not locked", __func__)); + + rtc = &vrtc->rtcdev; + alarm_sec = rtc->alarm_sec; + alarm_min = rtc->alarm_min; + alarm_hour = rtc->alarm_hour; + + oldtime = vrtc->base_rtctime; + VM_CTR2(vrtc->vm, "Updating RTC time from %#lx to %#lx", + oldtime, newtime); + + if (newtime == oldtime) + return (0); + + /* + * If 'newtime' indicates that RTC updates are disabled then just + * record that and return. There is no need to do alarm interrupt + * processing or update 'base_uptime' in this case. + */ + if (newtime == VRTC_BROKEN_TIME) { + vrtc->base_rtctime = VRTC_BROKEN_TIME; + return (0); + } + + /* + * Return an error if RTC updates are halted by the guest. + */ + if (rtc_halted(vrtc)) { + VM_CTR0(vrtc->vm, "RTC update halted by guest"); + return (EBUSY); + } + + do { + /* + * If the alarm interrupt is enabled and 'oldtime' is valid + * then visit all the seconds between 'oldtime' and 'newtime' + * to check for the alarm condition. + * + * Otherwise move the RTC time forward directly to 'newtime'. + */ + if (aintr_enabled(vrtc) && oldtime != VRTC_BROKEN_TIME) + vrtc->base_rtctime++; + else + vrtc->base_rtctime = newtime; + + if (aintr_enabled(vrtc)) { + /* + * Update the RTC date/time fields before checking + * if the alarm conditions are satisfied. 
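+			 * An alarm register with both high order bits set
+			 * (values 0xc0 - 0xff) is the RTC "don't care"
+			 * encoding and matches any value of the corresponding
+			 * time field, hence the '>= 0xC0' tests below.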
+ */ + secs_to_rtc(vrtc->base_rtctime, vrtc, 0); + + if ((alarm_sec >= 0xC0 || alarm_sec == rtc->sec) && + (alarm_min >= 0xC0 || alarm_min == rtc->min) && + (alarm_hour >= 0xC0 || alarm_hour == rtc->hour)) { + vrtc_set_reg_c(vrtc, rtc->reg_c | RTCIR_ALARM); + } + } + } while (vrtc->base_rtctime != newtime); + + if (uintr_enabled(vrtc)) + vrtc_set_reg_c(vrtc, rtc->reg_c | RTCIR_UPDATE); + + vrtc->base_uptime = sbinuptime(); + + return (0); +} + +static sbintime_t +vrtc_freq(struct vrtc *vrtc) +{ + int ratesel; + + static sbintime_t pf[16] = { + 0, + SBT_1S / 256, + SBT_1S / 128, + SBT_1S / 8192, + SBT_1S / 4096, + SBT_1S / 2048, + SBT_1S / 1024, + SBT_1S / 512, + SBT_1S / 256, + SBT_1S / 128, + SBT_1S / 64, + SBT_1S / 32, + SBT_1S / 16, + SBT_1S / 8, + SBT_1S / 4, + SBT_1S / 2, + }; + + KASSERT(VRTC_LOCKED(vrtc), ("%s: vrtc not locked", __func__)); + + /* + * If both periodic and alarm interrupts are enabled then use the + * periodic frequency to drive the callout. The minimum periodic + * frequency (2 Hz) is higher than the alarm frequency (1 Hz) so + * piggyback the alarm on top of it. The same argument applies to + * the update interrupt. + */ + if (pintr_enabled(vrtc) && divider_enabled(vrtc->rtcdev.reg_a)) { + ratesel = vrtc->rtcdev.reg_a & 0xf; + return (pf[ratesel]); + } else if (aintr_enabled(vrtc) && update_enabled(vrtc)) { + return (SBT_1S); + } else if (uintr_enabled(vrtc) && update_enabled(vrtc)) { + return (SBT_1S); + } else { + return (0); + } +} + +static void +vrtc_callout_reset(struct vrtc *vrtc, sbintime_t freqsbt) +{ + + KASSERT(VRTC_LOCKED(vrtc), ("%s: vrtc not locked", __func__)); + + if (freqsbt == 0) { + if (callout_active(&vrtc->callout)) { + VM_CTR0(vrtc->vm, "RTC callout stopped"); + callout_stop(&vrtc->callout); + } + return; + } + VM_CTR1(vrtc->vm, "RTC callout frequency %d hz", SBT_1S / freqsbt); + callout_reset_sbt(&vrtc->callout, freqsbt, 0, vrtc_callout_handler, + vrtc, 0); +} + +static void +vrtc_callout_handler(void *arg) +{ + struct vrtc *vrtc = arg; + sbintime_t freqsbt; + time_t rtctime; + int error; + + VM_CTR0(vrtc->vm, "vrtc callout fired"); + + VRTC_LOCK(vrtc); + if (callout_pending(&vrtc->callout)) /* callout was reset */ + goto done; + + if (!callout_active(&vrtc->callout)) /* callout was stopped */ + goto done; + + callout_deactivate(&vrtc->callout); + + KASSERT((vrtc->rtcdev.reg_b & RTCSB_ALL_INTRS) != 0, + ("gratuitous vrtc callout")); + + if (pintr_enabled(vrtc)) + vrtc_set_reg_c(vrtc, vrtc->rtcdev.reg_c | RTCIR_PERIOD); + + if (aintr_enabled(vrtc) || uintr_enabled(vrtc)) { + rtctime = vrtc_curtime(vrtc); + error = vrtc_time_update(vrtc, rtctime); + KASSERT(error == 0, ("%s: vrtc_time_update error %d", + __func__, error)); + } + + freqsbt = vrtc_freq(vrtc); + KASSERT(freqsbt != 0, ("%s: vrtc frequency cannot be zero", __func__)); + vrtc_callout_reset(vrtc, freqsbt); +done: + VRTC_UNLOCK(vrtc); +} + +static __inline void +vrtc_callout_check(struct vrtc *vrtc, sbintime_t freq) +{ + int active; + + active = callout_active(&vrtc->callout) ? 1 : 0; + KASSERT((freq == 0 && !active) || (freq != 0 && active), + ("vrtc callout %s with frequency %#lx", + active ? 
"active" : "inactive", freq)); +} + +static void +vrtc_set_reg_c(struct vrtc *vrtc, uint8_t newval) +{ + struct rtcdev *rtc; + int oldirqf, newirqf; + uint8_t oldval, changed; + + KASSERT(VRTC_LOCKED(vrtc), ("%s: vrtc not locked", __func__)); + + rtc = &vrtc->rtcdev; + newval &= RTCIR_ALARM | RTCIR_PERIOD | RTCIR_UPDATE; + + oldirqf = rtc->reg_c & RTCIR_INT; + if ((aintr_enabled(vrtc) && (newval & RTCIR_ALARM) != 0) || + (pintr_enabled(vrtc) && (newval & RTCIR_PERIOD) != 0) || + (uintr_enabled(vrtc) && (newval & RTCIR_UPDATE) != 0)) { + newirqf = RTCIR_INT; + } else { + newirqf = 0; + } + + oldval = rtc->reg_c; + rtc->reg_c = newirqf | newval; + changed = oldval ^ rtc->reg_c; + if (changed) { + VM_CTR2(vrtc->vm, "RTC reg_c changed from %#x to %#x", + oldval, rtc->reg_c); + } + + if (!oldirqf && newirqf) { + VM_CTR1(vrtc->vm, "RTC irq %d asserted", RTC_IRQ); + vatpic_pulse_irq(vrtc->vm, RTC_IRQ); + vioapic_pulse_irq(vrtc->vm, RTC_IRQ); + } else if (oldirqf && !newirqf) { + VM_CTR1(vrtc->vm, "RTC irq %d deasserted", RTC_IRQ); + } +} + +static int +vrtc_set_reg_b(struct vrtc *vrtc, uint8_t newval) +{ + struct rtcdev *rtc; + sbintime_t oldfreq, newfreq; + time_t curtime, rtctime; + int error; + uint8_t oldval, changed; + + KASSERT(VRTC_LOCKED(vrtc), ("%s: vrtc not locked", __func__)); + + rtc = &vrtc->rtcdev; + oldval = rtc->reg_b; + oldfreq = vrtc_freq(vrtc); + + rtc->reg_b = newval; + changed = oldval ^ newval; + if (changed) { + VM_CTR2(vrtc->vm, "RTC reg_b changed from %#x to %#x", + oldval, newval); + } + + if (changed & RTCSB_HALT) { + if ((newval & RTCSB_HALT) == 0) { + rtctime = rtc_to_secs(vrtc); + if (rtctime == VRTC_BROKEN_TIME) { + /* + * Stop updating the RTC if the date/time + * programmed by the guest is not correct. + */ + VM_CTR0(vrtc->vm, "Invalid RTC date/time " + "programming detected"); + + if (rtc_flag_broken_time) + return (-1); + } + } else { + curtime = vrtc_curtime(vrtc); + KASSERT(curtime == vrtc->base_rtctime, ("%s: mismatch " + "between vrtc basetime (%#lx) and curtime (%#lx)", + __func__, vrtc->base_rtctime, curtime)); + + /* + * Force a refresh of the RTC date/time fields so + * they reflect the time right before the guest set + * the HALT bit. + */ + secs_to_rtc(curtime, vrtc, 1); + + /* + * Updates are halted so mark 'base_rtctime' to denote + * that the RTC date/time is in flux. + */ + rtctime = VRTC_BROKEN_TIME; + rtc->reg_b &= ~RTCSB_UINTR; + } + error = vrtc_time_update(vrtc, rtctime); + KASSERT(error == 0, ("vrtc_time_update error %d", error)); + } + + /* + * Side effect of changes to the interrupt enable bits. + */ + if (changed & RTCSB_ALL_INTRS) + vrtc_set_reg_c(vrtc, vrtc->rtcdev.reg_c); + + /* + * Change the callout frequency if it has changed. + */ + newfreq = vrtc_freq(vrtc); + if (newfreq != oldfreq) + vrtc_callout_reset(vrtc, newfreq); + else + vrtc_callout_check(vrtc, newfreq); + + /* + * The side effect of bits that control the RTC date/time format + * is handled lazily when those fields are actually read. 
+ */ + return (0); +} + +static void +vrtc_set_reg_a(struct vrtc *vrtc, uint8_t newval) +{ + sbintime_t oldfreq, newfreq; + uint8_t oldval, changed; + + KASSERT(VRTC_LOCKED(vrtc), ("%s: vrtc not locked", __func__)); + + newval &= ~RTCSA_TUP; + oldval = vrtc->rtcdev.reg_a; + oldfreq = vrtc_freq(vrtc); + + if (divider_enabled(oldval) && !divider_enabled(newval)) { + VM_CTR2(vrtc->vm, "RTC divider held in reset at %#lx/%#lx", + vrtc->base_rtctime, vrtc->base_uptime); + } else if (!divider_enabled(oldval) && divider_enabled(newval)) { + /* + * If the dividers are coming out of reset then update + * 'base_uptime' before this happens. This is done to + * maintain the illusion that the RTC date/time was frozen + * while the dividers were disabled. + */ + vrtc->base_uptime = sbinuptime(); + VM_CTR2(vrtc->vm, "RTC divider out of reset at %#lx/%#lx", + vrtc->base_rtctime, vrtc->base_uptime); + } else { + /* NOTHING */ + } + + vrtc->rtcdev.reg_a = newval; + changed = oldval ^ newval; + if (changed) { + VM_CTR2(vrtc->vm, "RTC reg_a changed from %#x to %#x", + oldval, newval); + } + + /* + * Side effect of changes to rate select and divider enable bits. + */ + newfreq = vrtc_freq(vrtc); + if (newfreq != oldfreq) + vrtc_callout_reset(vrtc, newfreq); + else + vrtc_callout_check(vrtc, newfreq); +} + +int +vrtc_set_time(struct vm *vm, time_t secs) +{ + struct vrtc *vrtc; + int error; + + vrtc = vm_rtc(vm); + VRTC_LOCK(vrtc); + error = vrtc_time_update(vrtc, secs); + VRTC_UNLOCK(vrtc); + + if (error) { + VM_CTR2(vrtc->vm, "Error %d setting RTC time to %#lx", error, + secs); + } else { + VM_CTR1(vrtc->vm, "RTC time set to %#lx", secs); + } + + return (error); +} + +time_t +vrtc_get_time(struct vm *vm) +{ + struct vrtc *vrtc; + time_t t; + + vrtc = vm_rtc(vm); + VRTC_LOCK(vrtc); + t = vrtc_curtime(vrtc); + VRTC_UNLOCK(vrtc); + + return (t); +} + +int +vrtc_nvram_write(struct vm *vm, int offset, uint8_t value) +{ + struct vrtc *vrtc; + uint8_t *ptr; + + vrtc = vm_rtc(vm); + + /* + * Don't allow writes to RTC control registers or the date/time fields. + */ + if (offset < offsetof(struct rtcdev, nvram[0]) || + offset >= sizeof(struct rtcdev)) { + VM_CTR1(vrtc->vm, "RTC nvram write to invalid offset %d", + offset); + return (EINVAL); + } + + VRTC_LOCK(vrtc); + ptr = (uint8_t *)(&vrtc->rtcdev); + ptr[offset] = value; + VM_CTR2(vrtc->vm, "RTC nvram write %#x to offset %#x", value, offset); + VRTC_UNLOCK(vrtc); + + return (0); +} + +int +vrtc_nvram_read(struct vm *vm, int offset, uint8_t *retval) +{ + struct vrtc *vrtc; + time_t curtime; + uint8_t *ptr; + + /* + * Allow all offsets in the RTC to be read. + */ + if (offset < 0 || offset >= sizeof(struct rtcdev)) + return (EINVAL); + + vrtc = vm_rtc(vm); + VRTC_LOCK(vrtc); + + /* + * Update RTC date/time fields if necessary. 
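+	 * Offsets 0 through 9 are the time-of-day, alarm and date
+	 * registers, so refresh them from the current RTC time before
+	 * satisfying the read.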
+ */ + if (offset < 10) { + curtime = vrtc_curtime(vrtc); + secs_to_rtc(curtime, vrtc, 0); + } + + ptr = (uint8_t *)(&vrtc->rtcdev); + *retval = ptr[offset]; + + VRTC_UNLOCK(vrtc); + return (0); +} + +int +vrtc_addr_handler(struct vm *vm, int vcpuid, bool in, int port, int bytes, + uint32_t *val) +{ + struct vrtc *vrtc; + + vrtc = vm_rtc(vm); + + if (bytes != 1) + return (-1); + + if (in) { + *val = 0xff; + return (0); + } + + VRTC_LOCK(vrtc); + vrtc->addr = *val & 0x7f; + VRTC_UNLOCK(vrtc); + + return (0); +} + +int +vrtc_data_handler(struct vm *vm, int vcpuid, bool in, int port, int bytes, + uint32_t *val) +{ + struct vrtc *vrtc; + struct rtcdev *rtc; + time_t curtime; + int error, offset; + + vrtc = vm_rtc(vm); + rtc = &vrtc->rtcdev; + + if (bytes != 1) + return (-1); + + VRTC_LOCK(vrtc); + offset = vrtc->addr; + if (offset >= sizeof(struct rtcdev)) { + VRTC_UNLOCK(vrtc); + return (-1); + } + + error = 0; + curtime = vrtc_curtime(vrtc); + vrtc_time_update(vrtc, curtime); + + if (in) { + /* + * Update RTC date/time fields if necessary. + */ + if (offset < 10) + secs_to_rtc(curtime, vrtc, 0); + + if (offset == 12) { + /* + * XXX + * reg_c interrupt flags are updated only if the + * corresponding interrupt enable bit in reg_b is set. + */ + *val = vrtc->rtcdev.reg_c; + vrtc_set_reg_c(vrtc, 0); + } else { + *val = *((uint8_t *)rtc + offset); + } + VCPU_CTR2(vm, vcpuid, "Read value %#x from RTC offset %#x", + *val, offset); + } else { + switch (offset) { + case 10: + VCPU_CTR1(vm, vcpuid, "RTC reg_a set to %#x", *val); + vrtc_set_reg_a(vrtc, *val); + break; + case 11: + VCPU_CTR1(vm, vcpuid, "RTC reg_b set to %#x", *val); + error = vrtc_set_reg_b(vrtc, *val); + break; + case 12: + VCPU_CTR1(vm, vcpuid, "RTC reg_c set to %#x (ignored)", + *val); + break; + case 13: + VCPU_CTR1(vm, vcpuid, "RTC reg_d set to %#x (ignored)", + *val); + break; + case 0: + /* + * High order bit of 'seconds' is readonly. + */ + *val &= 0x7f; + /* FALLTHRU */ + default: + VCPU_CTR2(vm, vcpuid, "RTC offset %#x set to %#x", + offset, *val); + *((uint8_t *)rtc + offset) = *val; + break; + } + } + VRTC_UNLOCK(vrtc); + return (error); +} + +void +vrtc_reset(struct vrtc *vrtc) +{ + struct rtcdev *rtc; + + VRTC_LOCK(vrtc); + + rtc = &vrtc->rtcdev; + vrtc_set_reg_b(vrtc, rtc->reg_b & ~(RTCSB_ALL_INTRS | RTCSB_SQWE)); + vrtc_set_reg_c(vrtc, 0); + KASSERT(!callout_active(&vrtc->callout), ("rtc callout still active")); + + VRTC_UNLOCK(vrtc); +} + +struct vrtc * +vrtc_init(struct vm *vm) +{ + struct vrtc *vrtc; + struct rtcdev *rtc; + time_t curtime; + + vrtc = malloc(sizeof(struct vrtc), M_VRTC, M_WAITOK | M_ZERO); + vrtc->vm = vm; + mtx_init(&vrtc->mtx, "vrtc lock", NULL, MTX_DEF); + callout_init(&vrtc->callout, 1); + + /* Allow dividers to keep time but disable everything else */ + rtc = &vrtc->rtcdev; + rtc->reg_a = 0x20; + rtc->reg_b = RTCSB_24HR; + rtc->reg_c = 0; + rtc->reg_d = RTCSD_PWR; + + /* Reset the index register to a safe value. */ + vrtc->addr = RTC_STATUSD; + + /* + * Initialize RTC time to 00:00:00 Jan 1, 1970. 
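+	 * A time_t of zero is the POSIX epoch; the hypervisor is expected
+	 * to program the actual wall clock time later, e.g. via the
+	 * VM_RTC_SETTIME ioctl.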
+ */ + curtime = 0; + + VRTC_LOCK(vrtc); + vrtc->base_rtctime = VRTC_BROKEN_TIME; + vrtc_time_update(vrtc, curtime); + secs_to_rtc(curtime, vrtc, 0); + VRTC_UNLOCK(vrtc); + + return (vrtc); +} + +void +vrtc_cleanup(struct vrtc *vrtc) +{ + + callout_drain(&vrtc->callout); + free(vrtc, M_VRTC); +} diff --git a/sys/amd64/vmm/io/vrtc.h b/sys/amd64/vmm/io/vrtc.h new file mode 100644 index 000000000000..6fbbc9c810ac --- /dev/null +++ b/sys/amd64/vmm/io/vrtc.h @@ -0,0 +1,50 @@ +/*- + * Copyright (c) 2014 Neel Natu (neel@freebsd.org) + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice unmodified, this list of conditions, and the following + * disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. + * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT + * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF + * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ * + * $FreeBSD$ + */ + +#ifndef _VRTC_H_ +#define _VRTC_H_ + +#include + +struct vrtc; + +struct vrtc *vrtc_init(struct vm *vm); +void vrtc_cleanup(struct vrtc *vrtc); +void vrtc_reset(struct vrtc *vrtc); + +time_t vrtc_get_time(struct vm *vm); +int vrtc_set_time(struct vm *vm, time_t secs); +int vrtc_nvram_write(struct vm *vm, int offset, uint8_t value); +int vrtc_nvram_read(struct vm *vm, int offset, uint8_t *retval); + +int vrtc_addr_handler(struct vm *vm, int vcpuid, bool in, int port, int bytes, + uint32_t *val); +int vrtc_data_handler(struct vm *vm, int vcpuid, bool in, int port, int bytes, + uint32_t *val); + +#endif diff --git a/sys/amd64/vmm/vmm.c b/sys/amd64/vmm/vmm.c index 4739a86ca7a3..7f90c6101242 100644 --- a/sys/amd64/vmm/vmm.c +++ b/sys/amd64/vmm/vmm.c @@ -75,6 +75,7 @@ __FBSDID("$FreeBSD$"); #include "vioapic.h" #include "vlapic.h" #include "vpmtmr.h" +#include "vrtc.h" #include "vmm_ipi.h" #include "vmm_stat.h" #include "vmm_lapic.h" @@ -100,12 +101,15 @@ struct vcpu { uint64_t exitintinfo; /* (i) events pending at VM exit */ int nmi_pending; /* (i) NMI pending */ int extint_pending; /* (i) INTR pending */ - struct vm_exception exception; /* (x) exception collateral */ int exception_pending; /* (i) exception pending */ + int exc_vector; /* (x) exception collateral */ + int exc_errcode_valid; + uint32_t exc_errcode; struct savefpu *guestfpu; /* (a,i) guest fpu state */ uint64_t guest_xcr0; /* (i) guest %xcr0 register */ void *stats; /* (a,i) statistics */ struct vm_exit exitinfo; /* (x) exit reason and collateral */ + uint64_t nextrip; /* (x) next instruction to execute */ }; #define vcpu_lock_initialized(v) mtx_initialized(&((v)->mtx)) @@ -136,6 +140,7 @@ struct vm { struct vatpic *vatpic; /* (i) virtual atpic */ struct vatpit *vatpit; /* (i) virtual atpit */ struct vpmtmr *vpmtmr; /* (i) virtual ACPI PM timer */ + struct vrtc *vrtc; /* (o) virtual RTC */ volatile cpuset_t active_cpus; /* (i) active vcpus */ int suspend; /* (i) stop VM execution */ volatile cpuset_t suspended_cpus; /* (i) suspended vcpus */ @@ -376,6 +381,8 @@ vm_init(struct vm *vm, bool create) vm->vatpic = vatpic_init(vm); vm->vatpit = vatpit_init(vm); vm->vpmtmr = vpmtmr_init(vm); + if (create) + vm->vrtc = vrtc_init(vm); CPU_ZERO(&vm->active_cpus); @@ -438,6 +445,10 @@ vm_cleanup(struct vm *vm, bool destroy) if (vm->iommu != NULL) iommu_destroy_domain(vm->iommu); + if (destroy) + vrtc_cleanup(vm->vrtc); + else + vrtc_reset(vm->vrtc); vpmtmr_cleanup(vm->vpmtmr); vatpit_cleanup(vm->vatpit); vhpet_cleanup(vm->vhpet); @@ -841,16 +852,26 @@ vm_get_register(struct vm *vm, int vcpu, int reg, uint64_t *retval) } int -vm_set_register(struct vm *vm, int vcpu, int reg, uint64_t val) +vm_set_register(struct vm *vm, int vcpuid, int reg, uint64_t val) { + struct vcpu *vcpu; + int error; - if (vcpu < 0 || vcpu >= VM_MAXCPU) + if (vcpuid < 0 || vcpuid >= VM_MAXCPU) return (EINVAL); if (reg >= VM_REG_LAST) return (EINVAL); - return (VMSETREG(vm->cookie, vcpu, reg, val)); + error = VMSETREG(vm->cookie, vcpuid, reg, val); + if (error || reg != VM_REG_GUEST_RIP) + return (error); + + /* Set 'nextrip' to match the value of %rip */ + VCPU_CTR1(vm, vcpuid, "Setting nextrip to %#lx", val); + vcpu = &vm->vcpu[vcpuid]; + vcpu->nextrip = val; + return (0); } static boolean_t @@ -1102,7 +1123,7 @@ vm_handle_hlt(struct vm *vm, int vcpuid, bool intr_disabled, bool *retu) { struct vcpu *vcpu; const char *wmesg; - int error, t, vcpu_halted, vm_halted; + int t, vcpu_halted, vm_halted; KASSERT(!CPU_ISSET(vcpuid, &vm->halted_cpus), 
("vcpu already halted")); @@ -1110,22 +1131,6 @@ vm_handle_hlt(struct vm *vm, int vcpuid, bool intr_disabled, bool *retu) vcpu_halted = 0; vm_halted = 0; - /* - * The typical way to halt a cpu is to execute: "sti; hlt" - * - * STI sets RFLAGS.IF to enable interrupts. However, the processor - * remains in an "interrupt shadow" for an additional instruction - * following the STI. This guarantees that "sti; hlt" sequence is - * atomic and a pending interrupt will be recognized after the HLT. - * - * After the HLT emulation is done the vcpu is no longer in an - * interrupt shadow and a pending interrupt can be injected on - * the next entry into the guest. - */ - error = vm_set_register(vm, vcpuid, VM_REG_GUEST_INTR_SHADOW, 0); - KASSERT(error == 0, ("%s: error %d clearing interrupt shadow", - __func__, error)); - vcpu_lock(vcpu); while (1) { /* @@ -1206,6 +1211,9 @@ vm_handle_paging(struct vm *vm, int vcpuid, bool *retu) vcpu = &vm->vcpu[vcpuid]; vme = &vcpu->exitinfo; + KASSERT(vme->inst_length == 0, ("%s: invalid inst_length %d", + __func__, vme->inst_length)); + ftype = vme->u.paging.fault_type; KASSERT(ftype == VM_PROT_READ || ftype == VM_PROT_WRITE || ftype == VM_PROT_EXECUTE, @@ -1231,9 +1239,6 @@ vm_handle_paging(struct vm *vm, int vcpuid, bool *retu) if (rv != KERN_SUCCESS) return (EFAULT); done: - /* restart execution at the faulting instruction */ - vme->inst_length = 0; - return (0); } @@ -1288,10 +1293,13 @@ vm_handle_inst_emul(struct vm *vm, int vcpuid, bool *retu) return (EFAULT); /* - * If the instruction length is not specified the update it now. + * If the instruction length was not specified then update it now + * along with 'nextrip'. */ - if (vme->inst_length == 0) + if (vme->inst_length == 0) { vme->inst_length = vie->num_processed; + vcpu->nextrip += vie->num_processed; + } /* return to userland unless this is an in-kernel emulated device */ if (gpa >= DEFAULT_APIC_BASE && gpa < DEFAULT_APIC_BASE + PAGE_SIZE) { @@ -1440,7 +1448,7 @@ vm_run(struct vm *vm, struct vm_run *vmrun) int error, vcpuid; struct vcpu *vcpu; struct pcb *pcb; - uint64_t tscval, rip; + uint64_t tscval; struct vm_exit *vme; bool retu, intr_disabled; pmap_t pmap; @@ -1462,7 +1470,6 @@ vm_run(struct vm *vm, struct vm_run *vmrun) pmap = vmspace_pmap(vm->vmspace); vcpu = &vm->vcpu[vcpuid]; vme = &vcpu->exitinfo; - rip = vmrun->rip; restart: critical_enter(); @@ -1477,7 +1484,7 @@ restart: restore_guest_fpustate(vcpu); vcpu_require_state(vm, vcpuid, VCPU_RUNNING); - error = VMRUN(vm->cookie, vcpuid, rip, pmap, rptr, sptr); + error = VMRUN(vm->cookie, vcpuid, vcpu->nextrip, pmap, rptr, sptr); vcpu_require_state(vm, vcpuid, VCPU_FROZEN); save_guest_fpustate(vcpu); @@ -1488,6 +1495,7 @@ restart: if (error == 0) { retu = false; + vcpu->nextrip = vme->rip + vme->inst_length; switch (vme->exitcode) { case VM_EXITCODE_SUSPENDED: error = vm_handle_suspend(vm, vcpuid, &retu); @@ -1524,16 +1532,57 @@ restart: } } - if (error == 0 && retu == false) { - rip = vme->rip + vme->inst_length; + if (error == 0 && retu == false) goto restart; - } /* copy the exit information */ bcopy(vme, &vmrun->vm_exit, sizeof(struct vm_exit)); return (error); } +int +vm_restart_instruction(void *arg, int vcpuid) +{ + struct vm *vm; + struct vcpu *vcpu; + enum vcpu_state state; + uint64_t rip; + int error; + + vm = arg; + if (vcpuid < 0 || vcpuid >= VM_MAXCPU) + return (EINVAL); + + vcpu = &vm->vcpu[vcpuid]; + state = vcpu_get_state(vm, vcpuid, NULL); + if (state == VCPU_RUNNING) { + /* + * When a vcpu is "running" the next instruction is 
determined + * by adding 'rip' and 'inst_length' in the vcpu's 'exitinfo'. + * Thus setting 'inst_length' to zero will cause the current + * instruction to be restarted. + */ + vcpu->exitinfo.inst_length = 0; + VCPU_CTR1(vm, vcpuid, "restarting instruction at %#lx by " + "setting inst_length to zero", vcpu->exitinfo.rip); + } else if (state == VCPU_FROZEN) { + /* + * When a vcpu is "frozen" it is outside the critical section + * around VMRUN() and 'nextrip' points to the next instruction. + * Thus instruction restart is achieved by setting 'nextrip' + * to the vcpu's %rip. + */ + error = vm_get_register(vm, vcpuid, VM_REG_GUEST_RIP, &rip); + KASSERT(!error, ("%s: error %d getting rip", __func__, error)); + VCPU_CTR2(vm, vcpuid, "restarting instruction by updating " + "nextrip from %#lx to %#lx", vcpu->nextrip, rip); + vcpu->nextrip = rip; + } else { + panic("%s: invalid state %d", __func__, state); + } + return (0); +} + int vm_exit_intinfo(struct vm *vm, int vcpuid, uint64_t info) { @@ -1664,11 +1713,11 @@ vcpu_exception_intinfo(struct vcpu *vcpu) uint64_t info = 0; if (vcpu->exception_pending) { - info = vcpu->exception.vector & 0xff; + info = vcpu->exc_vector & 0xff; info |= VM_INTINFO_VALID | VM_INTINFO_HWEXCEPTION; - if (vcpu->exception.error_code_valid) { + if (vcpu->exc_errcode_valid) { info |= VM_INTINFO_DEL_ERRCODE; - info |= (uint64_t)vcpu->exception.error_code << 32; + info |= (uint64_t)vcpu->exc_errcode << 32; } } return (info); @@ -1693,7 +1742,7 @@ vm_entry_intinfo(struct vm *vm, int vcpuid, uint64_t *retinfo) info2 = vcpu_exception_intinfo(vcpu); vcpu->exception_pending = 0; VCPU_CTR2(vm, vcpuid, "Exception %d delivered: %#lx", - vcpu->exception.vector, info2); + vcpu->exc_vector, info2); } if ((info1 & VM_INTINFO_VALID) && (info2 & VM_INTINFO_VALID)) { @@ -1731,14 +1780,16 @@ vm_get_intinfo(struct vm *vm, int vcpuid, uint64_t *info1, uint64_t *info2) } int -vm_inject_exception(struct vm *vm, int vcpuid, struct vm_exception *exception) +vm_inject_exception(struct vm *vm, int vcpuid, int vector, int errcode_valid, + uint32_t errcode, int restart_instruction) { struct vcpu *vcpu; + int error; if (vcpuid < 0 || vcpuid >= VM_MAXCPU) return (EINVAL); - if (exception->vector < 0 || exception->vector >= 32) + if (vector < 0 || vector >= 32) return (EINVAL); /* @@ -1746,21 +1797,35 @@ vm_inject_exception(struct vm *vm, int vcpuid, struct vm_exception *exception) * the guest. It is a derived exception that results from specific * combinations of nested faults. */ - if (exception->vector == IDT_DF) + if (vector == IDT_DF) return (EINVAL); vcpu = &vm->vcpu[vcpuid]; if (vcpu->exception_pending) { VCPU_CTR2(vm, vcpuid, "Unable to inject exception %d due to " - "pending exception %d", exception->vector, - vcpu->exception.vector); + "pending exception %d", vector, vcpu->exc_vector); return (EBUSY); } + /* + * From section 26.6.1 "Interruptibility State" in Intel SDM: + * + * Event blocking by "STI" or "MOV SS" is cleared after guest executes + * one instruction or incurs an exception. 
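+ * + * For example, if the guest executes "mov %eax,%ss" and bhyve must inject a #PF on the very next instruction, clearing the shadow below ensures the MOV SS blocking does not survive into the guest's exception handler.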
+ */ + error = vm_set_register(vm, vcpuid, VM_REG_GUEST_INTR_SHADOW, 0); + KASSERT(error == 0, ("%s: error %d clearing interrupt shadow", + __func__, error)); + + if (restart_instruction) + vm_restart_instruction(vm, vcpuid); + vcpu->exception_pending = 1; - vcpu->exception = *exception; - VCPU_CTR1(vm, vcpuid, "Exception %d pending", exception->vector); + vcpu->exc_vector = vector; + vcpu->exc_errcode = errcode; + vcpu->exc_errcode_valid = errcode_valid; + VCPU_CTR1(vm, vcpuid, "Exception %d pending", vector); return (0); } @@ -1768,28 +1833,15 @@ void vm_inject_fault(void *vmarg, int vcpuid, int vector, int errcode_valid, int errcode) { - struct vm_exception exception; - struct vm_exit *vmexit; struct vm *vm; - int error; + int error, restart_instruction; vm = vmarg; + restart_instruction = 1; - exception.vector = vector; - exception.error_code = errcode; - exception.error_code_valid = errcode_valid; - error = vm_inject_exception(vm, vcpuid, &exception); + error = vm_inject_exception(vm, vcpuid, vector, errcode_valid, + errcode, restart_instruction); KASSERT(error == 0, ("vm_inject_exception error %d", error)); - - /* - * A fault-like exception allows the instruction to be restarted - * after the exception handler returns. - * - * By setting the inst_length to 0 we ensure that the instruction - * pointer remains at the faulting instruction. - */ - vmexit = vm_exitinfo(vm, vcpuid); - vmexit->inst_length = 0; } void @@ -2223,6 +2275,13 @@ vm_pmtmr(struct vm *vm) return (vm->vpmtmr); } +struct vrtc * +vm_rtc(struct vm *vm) +{ + + return (vm->vrtc); +} + enum vm_reg_name vm_segment_name(int seg) { diff --git a/sys/amd64/vmm/vmm_dev.c b/sys/amd64/vmm/vmm_dev.c index a85109edaa1d..0293d19141ff 100644 --- a/sys/amd64/vmm/vmm_dev.c +++ b/sys/amd64/vmm/vmm_dev.c @@ -58,6 +58,7 @@ __FBSDID("$FreeBSD$"); #include "io/vatpic.h" #include "io/vioapic.h" #include "io/vhpet.h" +#include "io/vrtc.h" struct vmmdev_softc { struct vm *vm; /* vm instance cookie */ @@ -174,6 +175,8 @@ vmmdev_ioctl(struct cdev *cdev, u_long cmd, caddr_t data, int fflag, struct vm_activate_cpu *vac; struct vm_cpuset *vm_cpuset; struct vm_intinfo *vmii; + struct vm_rtc_time *rtctime; + struct vm_rtc_data *rtcdata; sc = vmmdev_lookup2(cdev); if (sc == NULL) @@ -202,6 +205,7 @@ vmmdev_ioctl(struct cdev *cdev, u_long cmd, caddr_t data, int fflag, case VM_ACTIVATE_CPU: case VM_SET_INTINFO: case VM_GET_INTINFO: + case VM_RESTART_INSTRUCTION: /* * XXX fragile, handle with care * Assumes that the first field of the ioctl data is the vcpu. 
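[Aside: a minimal userland sketch of the reworked injection API, assuming an open VM handle 'ctx'; vcpu 0 and the vector (IDT_UD, #UD, from <machine/segments.h>) are illustrative. A single call now replaces the old vm_inject_exception()/vm_inject_exception2() pair and the manual zeroing of 'inst_length' that the bhyverun.c hunk further below deletes.]

	int error;

	/* Inject #UD, no error code, and restart the faulting instruction. */
	error = vm_inject_exception(ctx, 0, IDT_UD, 0, 0, 1);
	assert(error == 0);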
@@ -307,7 +311,9 @@ vmmdev_ioctl(struct cdev *cdev, u_long cmd, caddr_t data, int fflag, break; case VM_INJECT_EXCEPTION: vmexc = (struct vm_exception *)data; - error = vm_inject_exception(sc->vm, vmexc->cpuid, vmexc); + error = vm_inject_exception(sc->vm, vmexc->cpuid, + vmexc->vector, vmexc->error_code_valid, vmexc->error_code, + vmexc->restart_instruction); break; case VM_INJECT_NMI: vmnmi = (struct vm_nmi *)data; @@ -482,6 +488,28 @@ vmmdev_ioctl(struct cdev *cdev, u_long cmd, caddr_t data, int fflag, error = vm_get_intinfo(sc->vm, vmii->vcpuid, &vmii->info1, &vmii->info2); break; + case VM_RTC_WRITE: + rtcdata = (struct vm_rtc_data *)data; + error = vrtc_nvram_write(sc->vm, rtcdata->offset, + rtcdata->value); + break; + case VM_RTC_READ: + rtcdata = (struct vm_rtc_data *)data; + error = vrtc_nvram_read(sc->vm, rtcdata->offset, + &rtcdata->value); + break; + case VM_RTC_SETTIME: + rtctime = (struct vm_rtc_time *)data; + error = vrtc_set_time(sc->vm, rtctime->secs); + break; + case VM_RTC_GETTIME: + error = 0; + rtctime = (struct vm_rtc_time *)data; + rtctime->secs = vrtc_get_time(sc->vm); + break; + case VM_RESTART_INSTRUCTION: + error = vm_restart_instruction(sc->vm, vcpu); + break; default: error = ENOTTY; break; diff --git a/sys/amd64/vmm/vmm_instruction_emul.c b/sys/amd64/vmm/vmm_instruction_emul.c index d1d7173447d5..3db890e9559a 100644 --- a/sys/amd64/vmm/vmm_instruction_emul.c +++ b/sys/amd64/vmm/vmm_instruction_emul.c @@ -70,6 +70,7 @@ enum { VIE_OP_TYPE_PUSH, VIE_OP_TYPE_CMP, VIE_OP_TYPE_POP, + VIE_OP_TYPE_MOVS, VIE_OP_TYPE_LAST }; @@ -78,6 +79,7 @@ enum { #define VIE_OP_F_IMM8 (1 << 1) /* 8-bit immediate operand */ #define VIE_OP_F_MOFFSET (1 << 2) /* 16/32/64-bit immediate moffset */ #define VIE_OP_F_NO_MODRM (1 << 3) +#define VIE_OP_F_NO_GLA_VERIFICATION (1 << 4) static const struct vie_op two_byte_opcodes[256] = { [0xB6] = { @@ -133,6 +135,16 @@ static const struct vie_op one_byte_opcodes[256] = { .op_type = VIE_OP_TYPE_MOV, .op_flags = VIE_OP_F_MOFFSET | VIE_OP_F_NO_MODRM, }, + [0xA4] = { + .op_byte = 0xA4, + .op_type = VIE_OP_TYPE_MOVS, + .op_flags = VIE_OP_F_NO_MODRM | VIE_OP_F_NO_GLA_VERIFICATION + }, + [0xA5] = { + .op_byte = 0xA5, + .op_type = VIE_OP_TYPE_MOVS, + .op_flags = VIE_OP_F_NO_MODRM | VIE_OP_F_NO_GLA_VERIFICATION + }, [0xC6] = { /* XXX Group 11 extended opcode - not just MOV */ .op_byte = 0xC6, @@ -559,6 +571,217 @@ emulate_movx(void *vm, int vcpuid, uint64_t gpa, struct vie *vie, return (error); } +/* + * Helper function to calculate and validate a linear address. + * + * Returns 0 on success and 1 if an exception was injected into the guest. 
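+ * + * For example, emulate_movs() below validates the implicit %es:%rdi destination of a string move with: + * + *	get_gla(vm, vcpuid, vie, paging, opsize, vie->addrsize, PROT_WRITE, + *	    VM_REG_GUEST_ES, VM_REG_GUEST_RDI, &dstaddr)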
+ */ +static int +get_gla(void *vm, int vcpuid, struct vie *vie, struct vm_guest_paging *paging, + int opsize, int addrsize, int prot, enum vm_reg_name seg, + enum vm_reg_name gpr, uint64_t *gla) +{ + struct seg_desc desc; + uint64_t cr0, val, rflags; + int error; + + error = vie_read_register(vm, vcpuid, VM_REG_GUEST_CR0, &cr0); + KASSERT(error == 0, ("%s: error %d getting cr0", __func__, error)); + + error = vie_read_register(vm, vcpuid, VM_REG_GUEST_RFLAGS, &rflags); + KASSERT(error == 0, ("%s: error %d getting rflags", __func__, error)); + + error = vm_get_seg_desc(vm, vcpuid, seg, &desc); + KASSERT(error == 0, ("%s: error %d getting segment descriptor %d", + __func__, error, seg)); + + error = vie_read_register(vm, vcpuid, gpr, &val); + KASSERT(error == 0, ("%s: error %d getting register %d", __func__, + error, gpr)); + + if (vie_calculate_gla(paging->cpu_mode, seg, &desc, val, opsize, + addrsize, prot, gla)) { + if (seg == VM_REG_GUEST_SS) + vm_inject_ss(vm, vcpuid, 0); + else + vm_inject_gp(vm, vcpuid); + return (1); + } + + if (vie_canonical_check(paging->cpu_mode, *gla)) { + if (seg == VM_REG_GUEST_SS) + vm_inject_ss(vm, vcpuid, 0); + else + vm_inject_gp(vm, vcpuid); + return (1); + } + + if (vie_alignment_check(paging->cpl, opsize, cr0, rflags, *gla)) { + vm_inject_ac(vm, vcpuid, 0); + return (1); + } + + return (0); +} + +static int +emulate_movs(void *vm, int vcpuid, uint64_t gpa, struct vie *vie, + struct vm_guest_paging *paging, mem_region_read_t memread, + mem_region_write_t memwrite, void *arg) +{ +#ifdef _KERNEL + struct vm_copyinfo copyinfo[2]; +#else + struct iovec copyinfo[2]; +#endif + uint64_t dstaddr, srcaddr, val; + uint64_t rcx, rdi, rsi, rflags; + int error, opsize, seg, repeat; + + opsize = (vie->op.op_byte == 0xA4) ? 1 : vie->opsize; + val = 0; + error = 0; + + /* + * XXX although the MOVS instruction is only supposed to be used with + * the "rep" prefix some guests like FreeBSD will use "repnz" instead. + * + * Empirically the "repnz" prefix has identical behavior to "rep" + * and the zero flag does not make a difference. + */ + repeat = vie->repz_present | vie->repnz_present; + + if (repeat) { + error = vie_read_register(vm, vcpuid, VM_REG_GUEST_RCX, &rcx); + KASSERT(!error, ("%s: error %d getting rcx", __func__, error)); + + /* + * The count register is %rcx, %ecx or %cx depending on the + * address size of the instruction. + */ + if ((rcx & vie_size2mask(vie->addrsize)) == 0) + return (0); + } + + /* + * Source Destination Comments + * -------------------------------------------- + * (1) memory memory n/a + * (2) memory mmio emulated + * (3) mmio memory emulated + * (4) mmio mmio not emulated + * + * At this point we don't have sufficient information to distinguish + * between (2), (3) and (4). We use 'vm_copy_setup()' to tease this + * out because it will succeed only when operating on regular memory. + * + * XXX the emulation doesn't properly handle the case where 'gpa' + * is straddling the boundary between the normal memory and MMIO. + */ + + seg = vie->segment_override ? vie->segment_register : VM_REG_GUEST_DS; + error = get_gla(vm, vcpuid, vie, paging, opsize, vie->addrsize, + PROT_READ, seg, VM_REG_GUEST_RSI, &srcaddr); + if (error) + goto done; + + error = vm_copy_setup(vm, vcpuid, paging, srcaddr, opsize, PROT_READ, + copyinfo, nitems(copyinfo)); + if (error == 0) { + /* + * case (2): read from system memory and write to mmio. 
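+ * + * The copy runs in two legs: vm_copyin() pulls the bytes out of guest memory through the iovec prepared by vm_copy_setup() and memwrite() then performs the emulated MMIO store of 'val'.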
+ */ + vm_copyin(vm, vcpuid, copyinfo, &val, opsize); + vm_copy_teardown(vm, vcpuid, copyinfo, nitems(copyinfo)); + error = memwrite(vm, vcpuid, gpa, val, opsize, arg); + goto done; + } else if (error > 0) { + /* + * Resume guest execution to handle fault. + */ + goto done; + } else { + /* + * 'vm_copy_setup()' is expected to fail for cases (3) and (4) + * if 'srcaddr' is in the mmio space. + */ + } + + error = get_gla(vm, vcpuid, vie, paging, opsize, vie->addrsize, + PROT_WRITE, VM_REG_GUEST_ES, VM_REG_GUEST_RDI, &dstaddr); + if (error) + goto done; + + error = vm_copy_setup(vm, vcpuid, paging, dstaddr, opsize, + PROT_WRITE, copyinfo, nitems(copyinfo)); + if (error == 0) { + /* + * case (3): read from MMIO and write to system memory. + * + * A MMIO read can have side-effects so we commit to it + * only after vm_copy_setup() is successful. If a page-fault + * needs to be injected into the guest then it will happen + * before the MMIO read is attempted. + */ + error = memread(vm, vcpuid, gpa, &val, opsize, arg); + if (error) + goto done; + + vm_copyout(vm, vcpuid, &val, copyinfo, opsize); + vm_copy_teardown(vm, vcpuid, copyinfo, nitems(copyinfo)); + } else if (error > 0) { + /* + * Resume guest execution to handle fault. + */ + goto done; + } else { + goto done; + } + + error = vie_read_register(vm, vcpuid, VM_REG_GUEST_RSI, &rsi); + KASSERT(error == 0, ("%s: error %d getting rsi", __func__, error)); + + error = vie_read_register(vm, vcpuid, VM_REG_GUEST_RDI, &rdi); + KASSERT(error == 0, ("%s: error %d getting rdi", __func__, error)); + + error = vie_read_register(vm, vcpuid, VM_REG_GUEST_RFLAGS, &rflags); + KASSERT(error == 0, ("%s: error %d getting rflags", __func__, error)); + + if (rflags & PSL_D) { + rsi -= opsize; + rdi -= opsize; + } else { + rsi += opsize; + rdi += opsize; + } + + error = vie_update_register(vm, vcpuid, VM_REG_GUEST_RSI, rsi, + vie->addrsize); + KASSERT(error == 0, ("%s: error %d updating rsi", __func__, error)); + + error = vie_update_register(vm, vcpuid, VM_REG_GUEST_RDI, rdi, + vie->addrsize); + KASSERT(error == 0, ("%s: error %d updating rdi", __func__, error)); + + if (repeat) { + rcx = rcx - 1; + error = vie_update_register(vm, vcpuid, VM_REG_GUEST_RCX, + rcx, vie->addrsize); + KASSERT(!error, ("%s: error %d updating rcx", __func__, error)); + + /* + * Repeat the instruction if the count register is not zero. 
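+ * + * Restarting through vm_restart_instruction() rather than looping here keeps %rip on the MOVS itself, so pending interrupts can still be recognized between iterations of a long "rep movs".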
+ */ + if ((rcx & vie_size2mask(vie->addrsize)) != 0) + vm_restart_instruction(vm, vcpuid); + } +done: + if (error < 0) + return (EFAULT); + else + return (0); +} + static int emulate_and(void *vm, int vcpuid, uint64_t gpa, struct vie *vie, mem_region_read_t memread, mem_region_write_t memwrite, void *arg) @@ -926,9 +1149,7 @@ emulate_stack_op(void *vm, int vcpuid, uint64_t mmio_gpa, struct vie *vie, error = memwrite(vm, vcpuid, mmio_gpa, val, size, arg); rsp += size; } -#ifdef _KERNEL vm_copy_teardown(vm, vcpuid, copyinfo, nitems(copyinfo)); -#endif if (error == 0) { error = vie_update_register(vm, vcpuid, VM_REG_GUEST_RSP, rsp, @@ -1012,6 +1233,10 @@ vmm_emulate_instruction(void *vm, int vcpuid, uint64_t gpa, struct vie *vie, error = emulate_movx(vm, vcpuid, gpa, vie, memread, memwrite, memarg); break; + case VIE_OP_TYPE_MOVS: + error = emulate_movs(vm, vcpuid, gpa, vie, paging, memread, + memwrite, memarg); + break; case VIE_OP_TYPE_AND: error = emulate_and(vm, vcpuid, gpa, vie, memread, memwrite, memarg); @@ -1193,6 +1418,7 @@ vie_init(struct vie *vie, const char *inst_bytes, int inst_length) vie->base_register = VM_REG_LAST; vie->index_register = VM_REG_LAST; + vie->segment_register = VM_REG_LAST; if (inst_length) { bcopy(inst_bytes, vie->inst, inst_length); @@ -1458,6 +1684,35 @@ vie_advance(struct vie *vie) vie->num_processed++; } +static bool +segment_override(uint8_t x, int *seg) +{ + + switch (x) { + case 0x2E: + *seg = VM_REG_GUEST_CS; + break; + case 0x36: + *seg = VM_REG_GUEST_SS; + break; + case 0x3E: + *seg = VM_REG_GUEST_DS; + break; + case 0x26: + *seg = VM_REG_GUEST_ES; + break; + case 0x64: + *seg = VM_REG_GUEST_FS; + break; + case 0x65: + *seg = VM_REG_GUEST_GS; + break; + default: + return (false); + } + return (true); +} + static int decode_prefixes(struct vie *vie, enum vm_cpu_mode cpu_mode, int cs_d) { @@ -1471,6 +1726,12 @@ decode_prefixes(struct vie *vie, enum vm_cpu_mode cpu_mode, int cs_d) vie->opsize_override = 1; else if (x == 0x67) vie->addrsize_override = 1; + else if (x == 0xF3) + vie->repz_present = 1; + else if (x == 0xF2) + vie->repnz_present = 1; + else if (segment_override(x, &vie->segment_register)) + vie->segment_override = 1; else break; @@ -1923,8 +2184,10 @@ vmm_decode_instruction(struct vm *vm, int cpuid, uint64_t gla, if (verify_inst_length(vie)) return (-1); - if (verify_gla(vm, cpuid, gla, vie)) - return (-1); + if ((vie->op.op_flags & VIE_OP_F_NO_GLA_VERIFICATION) == 0) { + if (verify_gla(vm, cpuid, gla, vie)) + return (-1); + } vie->decoded = 1; /* success */ diff --git a/sys/amd64/vmm/vmm_ioport.c b/sys/amd64/vmm/vmm_ioport.c index e5535997c9f3..fc68a6160739 100644 --- a/sys/amd64/vmm/vmm_ioport.c +++ b/sys/amd64/vmm/vmm_ioport.c @@ -42,6 +42,7 @@ __FBSDID("$FreeBSD$"); #include "vatpic.h" #include "vatpit.h" #include "vpmtmr.h" +#include "vrtc.h" #include "vmm_ioport.h" #include "vmm_ktr.h" @@ -60,6 +61,8 @@ ioport_handler_func_t ioport_handler[MAX_IOPORTS] = { [IO_ELCR1] = vatpic_elc_handler, [IO_ELCR2] = vatpic_elc_handler, [IO_PMTMR] = vpmtmr_handler, + [IO_RTC] = vrtc_addr_handler, + [IO_RTC + 1] = vrtc_data_handler, }; #ifdef KTR @@ -71,7 +74,7 @@ inout_instruction(struct vm_exit *vmexit) static const char *iodesc[] = { "outb", "outw", "outl", "inb", "inw", "inl", - "outsb", "outsw", "outsd" + "outsb", "outsw", "outsd", "insb", "insw", "insd", }; diff --git a/sys/modules/vmm/Makefile b/sys/modules/vmm/Makefile index 6aeaf80cb1ac..6e1cf7fe2624 100644 --- a/sys/modules/vmm/Makefile +++ b/sys/modules/vmm/Makefile @@ -35,7 +35,8 @@ SRCS+= 
iommu.c \ vhpet.c \ vioapic.c \ vlapic.c \ - vpmtmr.c + vpmtmr.c \ + vrtc.c # intel-specific files .PATH: ${.CURDIR}/../../amd64/vmm/intel diff --git a/usr.sbin/bhyve/bhyve.8 b/usr.sbin/bhyve/bhyve.8 index 8de098902a74..e15f9ac3f7f7 100644 --- a/usr.sbin/bhyve/bhyve.8 +++ b/usr.sbin/bhyve/bhyve.8 @@ -32,7 +32,7 @@ .Nd "run a guest operating system inside a virtual machine" .Sh SYNOPSIS .Nm -.Op Fl abehwxACHPWY +.Op Fl abehuwxACHPWY .Op Fl c Ar numcpus .Op Fl g Ar gdbport .Op Fl l Ar lpcdev Ns Op , Ns Ar conf @@ -239,6 +239,8 @@ The host device must have been reserved at boot-time using the loader variable as described in .Xr vmm 4 . .El +.It Fl u +RTC keeps UTC time. .It Fl U Ar uuid Set the universally unique identifier .Pq UUID diff --git a/usr.sbin/bhyve/bhyverun.c b/usr.sbin/bhyve/bhyverun.c index 5971993bad7c..97ed0460b3f3 100644 --- a/usr.sbin/bhyve/bhyverun.c +++ b/usr.sbin/bhyve/bhyverun.c @@ -122,7 +122,7 @@ usage(int code) { fprintf(stderr, - "Usage: %s [-abehwxACHPWY] [-c vcpus] [-g ] [-l ]\n" + "Usage: %s [-abehuwxACHPWY] [-c vcpus] [-g ] [-l ]\n" " %*s [-m mem] [-p vcpu:hostcpu] [-s ] [-U uuid] \n" " -a: local apic is in xAPIC mode (deprecated)\n" " -A: create ACPI tables\n" @@ -137,6 +137,7 @@ usage(int code) " -p: pin 'vcpu' to 'hostcpu'\n" " -P: vmexit from the guest on pause\n" " -s: PCI slot config\n" + " -u: RTC keeps UTC time\n" " -U: uuid\n" " -w: ignore unimplemented MSRs\n" " -W: force virtio to use single-vector MSI\n" @@ -185,20 +186,14 @@ vm_inject_fault(void *arg, int vcpu, int vector, int errcode_valid, int errcode) { struct vmctx *ctx; - int error; + int error, restart_instruction; ctx = arg; - if (errcode_valid) - error = vm_inject_exception2(ctx, vcpu, vector, errcode); - else - error = vm_inject_exception(ctx, vcpu, vector); - assert(error == 0); + restart_instruction = 1; - /* - * Set the instruction length to 0 to ensure that the instruction is - * restarted when the fault handler returns. - */ - vmexit[vcpu].inst_length = 0; + error = vm_inject_exception(ctx, vcpu, vector, errcode_valid, errcode, + restart_instruction); + assert(error == 0); } void * @@ -329,12 +324,6 @@ vmexit_inout(struct vmctx *ctx, struct vm_exit *vme, int *pvcpu) } error = emulate_inout(ctx, vcpu, vme, strictio); - if (!error && in && !string) { - error = vm_set_register(ctx, vcpu, VM_REG_GUEST_RAX, - vme->u.inout.eax); - assert(error == 0); - } - if (error) { fprintf(stderr, "Unhandled %s%c 0x%04x\n", in ? "in" : "out", bytes == 1 ? 'b' : (bytes == 2 ? 
'w' : 'l'), port); @@ -358,7 +347,7 @@ vmexit_rdmsr(struct vmctx *ctx, struct vm_exit *vme, int *pvcpu) vme->u.msr.code, *pvcpu); if (strictmsr) { vm_inject_gp(ctx, *pvcpu); - return (VMEXIT_RESTART); + return (VMEXIT_CONTINUE); } } @@ -384,7 +373,7 @@ vmexit_wrmsr(struct vmctx *ctx, struct vm_exit *vme, int *pvcpu) vme->u.msr.code, vme->u.msr.wval, *pvcpu); if (strictmsr) { vm_inject_gp(ctx, *pvcpu); - return (VMEXIT_RESTART); + return (VMEXIT_CONTINUE); } } return (VMEXIT_CONTINUE); @@ -462,9 +451,11 @@ static int vmexit_bogus(struct vmctx *ctx, struct vm_exit *vmexit, int *pvcpu) { + assert(vmexit->inst_length == 0); + stats.vmexit_bogus++; - return (VMEXIT_RESTART); + return (VMEXIT_CONTINUE); } static int @@ -494,9 +485,11 @@ static int vmexit_mtrap(struct vmctx *ctx, struct vm_exit *vmexit, int *pvcpu) { + assert(vmexit->inst_length == 0); + stats.vmexit_mtrap++; - return (VMEXIT_RESTART); + return (VMEXIT_CONTINUE); } static int @@ -581,7 +574,7 @@ static vmexit_handler_t handler[VM_EXITCODE_MAX] = { }; static void -vm_loop(struct vmctx *ctx, int vcpu, uint64_t rip) +vm_loop(struct vmctx *ctx, int vcpu, uint64_t startrip) { int error, rc, prevcpu; enum vm_exitcode exitcode; @@ -596,8 +589,11 @@ vm_loop(struct vmctx *ctx, int vcpu, uint64_t rip) error = vm_active_cpus(ctx, &active_cpus); assert(CPU_ISSET(vcpu, &active_cpus)); + error = vm_set_register(ctx, vcpu, VM_REG_GUEST_RIP, startrip); + assert(error == 0); + while (1) { - error = vm_run(ctx, vcpu, rip, &vmexit[vcpu]); + error = vm_run(ctx, vcpu, &vmexit[vcpu]); if (error != 0) break; @@ -614,10 +610,6 @@ vm_loop(struct vmctx *ctx, int vcpu, uint64_t rip) switch (rc) { case VMEXIT_CONTINUE: - rip = vmexit[vcpu].rip + vmexit[vcpu].inst_length; - break; - case VMEXIT_RESTART: - rip = vmexit[vcpu].rip; break; case VMEXIT_ABORT: abort(); @@ -694,6 +686,7 @@ main(int argc, char *argv[]) { int c, error, gdb_port, err, bvmcons; int dump_guest_memory, max_vcpus, mptgen; + int rtc_localtime; struct vmctx *ctx; uint64_t rip; size_t memsize; @@ -705,8 +698,9 @@ main(int argc, char *argv[]) guest_ncpus = 1; memsize = 256 * MB; mptgen = 1; + rtc_localtime = 1; - while ((c = getopt(argc, argv, "abehwxACHIPWYp:g:c:s:m:l:U:")) != -1) { + while ((c = getopt(argc, argv, "abehuwxACHIPWYp:g:c:s:m:l:U:")) != -1) { switch (c) { case 'a': x2apic_mode = 0; @@ -766,6 +760,9 @@ main(int argc, char *argv[]) case 'e': strictio = 1; break; + case 'u': + rtc_localtime = 0; + break; case 'U': guest_uuid_str = optarg; break; @@ -829,7 +826,7 @@ main(int argc, char *argv[]) pci_irq_init(ctx); ioapic_init(ctx); - rtc_init(ctx); + rtc_init(ctx, rtc_localtime); sci_init(ctx); /* diff --git a/usr.sbin/bhyve/bhyverun.h b/usr.sbin/bhyve/bhyverun.h index 87824ef9f4f4..c51bf48b7258 100644 --- a/usr.sbin/bhyve/bhyverun.h +++ b/usr.sbin/bhyve/bhyverun.h @@ -35,9 +35,8 @@ #define __CTASSERT(x, y) typedef char __assert ## y[(x) ? 
1 : -1] #endif -#define VMEXIT_CONTINUE 1 /* continue from next instruction */ -#define VMEXIT_RESTART 2 /* restart current instruction */ -#define VMEXIT_ABORT 3 /* abort the vm run loop */ +#define VMEXIT_CONTINUE (0) +#define VMEXIT_ABORT (-1) struct vmctx; extern int guest_ncpus; diff --git a/usr.sbin/bhyve/inout.c b/usr.sbin/bhyve/inout.c index 1041a59d2194..402b953be882 100644 --- a/usr.sbin/bhyve/inout.c +++ b/usr.sbin/bhyve/inout.c @@ -104,7 +104,7 @@ int emulate_inout(struct vmctx *ctx, int vcpu, struct vm_exit *vmexit, int strict) { int addrsize, bytes, flags, in, port, prot, rep; - uint32_t val; + uint32_t eax, val; inout_func_t handler; void *arg; int error, retval; @@ -214,16 +214,20 @@ emulate_inout(struct vmctx *ctx, int vcpu, struct vm_exit *vmexit, int strict) } /* Restart the instruction if more iterations remain */ - if (retval == 0 && count != 0) - vmexit->inst_length = 0; - } else { - if (!in) { - val = vmexit->u.inout.eax & vie_size2mask(bytes); + if (retval == 0 && count != 0) { + error = vm_restart_instruction(ctx, vcpu); + assert(error == 0); } + } else { + eax = vmexit->u.inout.eax; + val = eax & vie_size2mask(bytes); retval = handler(ctx, vcpu, in, port, bytes, &val, arg); if (retval == 0 && in) { - vmexit->u.inout.eax &= ~vie_size2mask(bytes); - vmexit->u.inout.eax |= val & vie_size2mask(bytes); + eax &= ~vie_size2mask(bytes); + eax |= val & vie_size2mask(bytes); + error = vm_set_register(ctx, vcpu, VM_REG_GUEST_RAX, + eax); + assert(error == 0); } } return (retval); diff --git a/usr.sbin/bhyve/pci_ahci.c b/usr.sbin/bhyve/pci_ahci.c index 9dbabcd361a4..31e02f83034e 100644 --- a/usr.sbin/bhyve/pci_ahci.c +++ b/usr.sbin/bhyve/pci_ahci.c @@ -2299,7 +2299,8 @@ pci_ahci_init(struct vmctx *ctx, struct pci_devinst *pi, char *opts, int atapi) open_fail: if (ret) { - blockif_close(sc->port[0].bctx); + if (sc->port[0].bctx != NULL) + blockif_close(sc->port[0].bctx); free(sc); } diff --git a/usr.sbin/bhyve/rtc.c b/usr.sbin/bhyve/rtc.c index 459c9000687a..5c7015400a74 100644 --- a/usr.sbin/bhyve/rtc.c +++ b/usr.sbin/bhyve/rtc.c @@ -30,10 +30,7 @@ __FBSDID("$FreeBSD$"); #include -#include -#include -#include #include #include @@ -41,47 +38,11 @@ __FBSDID("$FreeBSD$"); #include #include "acpi.h" -#include "inout.h" #include "pci_lpc.h" #include "rtc.h" -#define IO_RTC 0x70 +#define IO_RTC 0x70 -#define RTC_SEC 0x00 /* seconds */ -#define RTC_SEC_ALARM 0x01 -#define RTC_MIN 0x02 -#define RTC_MIN_ALARM 0x03 -#define RTC_HRS 0x04 -#define RTC_HRS_ALARM 0x05 -#define RTC_WDAY 0x06 -#define RTC_DAY 0x07 -#define RTC_MONTH 0x08 -#define RTC_YEAR 0x09 -#define RTC_CENTURY 0x32 /* current century */ - -#define RTC_STATUSA 0xA -#define RTCSA_TUP 0x80 /* time update, don't look now */ - -#define RTC_STATUSB 0xB -#define RTCSB_DST 0x01 -#define RTCSB_24HR 0x02 -#define RTCSB_BIN 0x04 /* 0 = BCD, 1 = Binary */ -#define RTCSB_PINTR 0x40 /* 1 = enable periodic clock interrupt */ -#define RTCSB_HALT 0x80 /* stop clock updates */ - -#define RTC_INTR 0x0c /* status register C (R) interrupt source */ - -#define RTC_STATUSD 0x0d /* status register D (R) Lost Power */ -#define RTCSD_PWR 0x80 /* clock power OK */ - -#define RTC_NVRAM_START 0x0e -#define RTC_NVRAM_END 0x7f -#define RTC_NVRAM_SZ (128 - RTC_NVRAM_START) -#define nvoff(x) ((x) - RTC_NVRAM_START) - -#define RTC_DIAG 0x0e -#define RTC_RSTCODE 0x0f -#define RTC_EQUIPMENT 0x14 #define RTC_LMEM_LSB 0x34 #define RTC_LMEM_MSB 0x35 #define RTC_HMEM_LSB 0x5b @@ -92,249 +53,30 @@ __FBSDID("$FreeBSD$"); #define m_16MB (16*1024*1024) #define 
m_4GB (4ULL*1024*1024*1024) -static int addr; - -static uint8_t rtc_nvram[RTC_NVRAM_SZ]; - -/* XXX initialize these to default values as they would be from BIOS */ -static uint8_t status_a, status_b; - -static struct { - uint8_t hours; - uint8_t mins; - uint8_t secs; -} rtc_alarm; - -static u_char const bin2bcd_data[] = { - 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, - 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, - 0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, 0x28, 0x29, - 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, 0x38, 0x39, - 0x40, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, 0x48, 0x49, - 0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, 0x58, 0x59, - 0x60, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, - 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, 0x78, 0x79, - 0x80, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87, 0x88, 0x89, - 0x90, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97, 0x98, 0x99 -}; -#define bin2bcd(bin) (bin2bcd_data[bin]) - -#define rtcout(val) ((status_b & RTCSB_BIN) ? (val) : bin2bcd((val))) - -static void -timevalfix(struct timeval *t1) +/* + * Returns the current RTC time as number of seconds since 00:00:00 Jan 1, 1970 + */ +static time_t +rtc_time(struct vmctx *ctx, int use_localtime) { - - if (t1->tv_usec < 0) { - t1->tv_sec--; - t1->tv_usec += 1000000; - } - if (t1->tv_usec >= 1000000) { - t1->tv_sec++; - t1->tv_usec -= 1000000; - } -} - -static void -timevalsub(struct timeval *t1, const struct timeval *t2) -{ - - t1->tv_sec -= t2->tv_sec; - t1->tv_usec -= t2->tv_usec; - timevalfix(t1); -} - -static int -rtc_addr_handler(struct vmctx *ctx, int vcpu, int in, int port, int bytes, - uint32_t *eax, void *arg) -{ - if (bytes != 1) - return (-1); - - if (in) { - /* straight read of this register will return 0xFF */ - *eax = 0xff; - return (0); - } - - switch (*eax & 0x7f) { - case RTC_SEC: - case RTC_SEC_ALARM: - case RTC_MIN: - case RTC_MIN_ALARM: - case RTC_HRS: - case RTC_HRS_ALARM: - case RTC_WDAY: - case RTC_DAY: - case RTC_MONTH: - case RTC_YEAR: - case RTC_STATUSA: - case RTC_STATUSB: - case RTC_INTR: - case RTC_STATUSD: - case RTC_NVRAM_START ... RTC_NVRAM_END: - break; - default: - return (-1); - } - - addr = *eax & 0x7f; - return (0); -} - -static int -rtc_data_handler(struct vmctx *ctx, int vcpu, int in, int port, int bytes, - uint32_t *eax, void *arg) -{ - int hour; + struct tm tm; time_t t; - struct timeval cur, delta; - static struct timeval last; - static struct tm tm; - - if (bytes != 1) - return (-1); - - gettimeofday(&cur, NULL); - - /* - * Increment the cached time only once per second so we can guarantee - * that the guest has at least one second to read the hour:min:sec - * separately and still get a coherent view of the time. - */ - delta = cur; - timevalsub(&delta, &last); - if (delta.tv_sec >= 1 && (status_b & RTCSB_HALT) == 0) { - t = cur.tv_sec; + time(&t); + if (use_localtime) { localtime_r(&t, &tm); - last = cur; + t = timegm(&tm); } - - if (in) { - switch (addr) { - case RTC_SEC_ALARM: - *eax = rtc_alarm.secs; - break; - case RTC_MIN_ALARM: - *eax = rtc_alarm.mins; - break; - case RTC_HRS_ALARM: - *eax = rtc_alarm.hours; - break; - case RTC_SEC: - *eax = rtcout(tm.tm_sec); - return (0); - case RTC_MIN: - *eax = rtcout(tm.tm_min); - return (0); - case RTC_HRS: - if (status_b & RTCSB_24HR) - hour = tm.tm_hour; - else - hour = (tm.tm_hour % 12) + 1; - - *eax = rtcout(hour); - - /* - * If we are representing time in the 12-hour format - * then set the MSB to indicate PM. 
- */ - if ((status_b & RTCSB_24HR) == 0 && tm.tm_hour >= 12) - *eax |= 0x80; - - return (0); - case RTC_WDAY: - *eax = rtcout(tm.tm_wday + 1); - return (0); - case RTC_DAY: - *eax = rtcout(tm.tm_mday); - return (0); - case RTC_MONTH: - *eax = rtcout(tm.tm_mon + 1); - return (0); - case RTC_YEAR: - *eax = rtcout(tm.tm_year % 100); - return (0); - case RTC_STATUSA: - *eax = status_a; - return (0); - case RTC_STATUSB: - *eax = status_b; - return (0); - case RTC_INTR: - *eax = 0; - return (0); - case RTC_STATUSD: - *eax = RTCSD_PWR; - return (0); - case RTC_NVRAM_START ... RTC_NVRAM_END: - *eax = rtc_nvram[addr - RTC_NVRAM_START]; - return (0); - default: - return (-1); - } - } - - switch (addr) { - case RTC_STATUSA: - status_a = *eax & ~RTCSA_TUP; - break; - case RTC_STATUSB: - /* XXX not implemented yet XXX */ - if (*eax & RTCSB_PINTR) - return (-1); - status_b = *eax; - break; - case RTC_STATUSD: - /* ignore write */ - break; - case RTC_SEC_ALARM: - rtc_alarm.secs = *eax; - break; - case RTC_MIN_ALARM: - rtc_alarm.mins = *eax; - break; - case RTC_HRS_ALARM: - rtc_alarm.hours = *eax; - break; - case RTC_SEC: - case RTC_MIN: - case RTC_HRS: - case RTC_WDAY: - case RTC_DAY: - case RTC_MONTH: - case RTC_YEAR: - /* - * Ignore writes to the time of day registers - */ - break; - case RTC_NVRAM_START ... RTC_NVRAM_END: - rtc_nvram[addr - RTC_NVRAM_START] = *eax; - break; - default: - return (-1); - } - return (0); + return (t); } void -rtc_init(struct vmctx *ctx) +rtc_init(struct vmctx *ctx, int use_localtime) { - struct timeval cur; - struct tm tm; size_t himem; size_t lomem; int err; - err = gettimeofday(&cur, NULL); - assert(err == 0); - (void) localtime_r(&cur.tv_sec, &tm); - - memset(rtc_nvram, 0, sizeof(rtc_nvram)); - - rtc_nvram[nvoff(RTC_CENTURY)] = bin2bcd((tm.tm_year + 1900) / 100); - /* XXX init diag/reset code/equipment/checksum ? 
*/ /* @@ -344,17 +86,22 @@ rtc_init(struct vmctx *ctx) * 0x5b/0x5c/0x5d - 64KB chunks above 4GB */ lomem = (vm_get_lowmem_size(ctx) - m_16MB) / m_64KB; - rtc_nvram[nvoff(RTC_LMEM_LSB)] = lomem; - rtc_nvram[nvoff(RTC_LMEM_MSB)] = lomem >> 8; + err = vm_rtc_write(ctx, RTC_LMEM_LSB, lomem); + assert(err == 0); + err = vm_rtc_write(ctx, RTC_LMEM_MSB, lomem >> 8); + assert(err == 0); himem = vm_get_highmem_size(ctx) / m_64KB; - rtc_nvram[nvoff(RTC_HMEM_LSB)] = himem; - rtc_nvram[nvoff(RTC_HMEM_SB)] = himem >> 8; - rtc_nvram[nvoff(RTC_HMEM_MSB)] = himem >> 16; -} + err = vm_rtc_write(ctx, RTC_HMEM_LSB, himem); + assert(err == 0); + err = vm_rtc_write(ctx, RTC_HMEM_SB, himem >> 8); + assert(err == 0); + err = vm_rtc_write(ctx, RTC_HMEM_MSB, himem >> 16); + assert(err == 0); -INOUT_PORT(rtc, IO_RTC, IOPORT_F_INOUT, rtc_addr_handler); -INOUT_PORT(rtc, IO_RTC + 1, IOPORT_F_INOUT, rtc_data_handler); + err = vm_rtc_settime(ctx, rtc_time(ctx, use_localtime)); + assert(err == 0); +} static void rtc_dsdt(void) diff --git a/usr.sbin/bhyve/rtc.h b/usr.sbin/bhyve/rtc.h index 72cffb3b09b7..5b08ca3463d4 100644 --- a/usr.sbin/bhyve/rtc.h +++ b/usr.sbin/bhyve/rtc.h @@ -29,6 +29,6 @@ #ifndef _RTC_H_ #define _RTC_H_ -void rtc_init(struct vmctx *ctx); +void rtc_init(struct vmctx *ctx, int use_localtime); #endif /* _RTC_H_ */ diff --git a/usr.sbin/bhyve/task_switch.c b/usr.sbin/bhyve/task_switch.c index b939c1a98614..ba6a9d26b0df 100644 --- a/usr.sbin/bhyve/task_switch.c +++ b/usr.sbin/bhyve/task_switch.c @@ -725,20 +725,10 @@ vmexit_task_switch(struct vmctx *ctx, struct vm_exit *vmexit, int *pvcpu) assert(paging->cpu_mode == CPU_MODE_PROTECTED); /* - * Calculate the %eip to store in the old TSS before modifying the - * 'inst_length'. + * Calculate the instruction pointer to store in the old TSS. */ eip = vmexit->rip + vmexit->inst_length; - /* - * Set the 'inst_length' to '0'. - * - * If an exception is triggered during emulation of the task switch - * then the exception handler should return to the instruction that - * caused the task switch as opposed to the subsequent instruction. - */ - vmexit->inst_length = 0; - /* * Section 4.6, "Access Rights" in Intel SDM Vol 3. * The following page table accesses are implicitly supervisor mode: @@ -883,8 +873,8 @@ vmexit_task_switch(struct vmctx *ctx, struct vm_exit *vmexit, int *pvcpu) * after this point will be handled in the context of the new task and * the saved instruction pointer will belong to the new task. */ - vmexit->rip = newtss.tss_eip; - assert(vmexit->inst_length == 0); + error = vm_set_register(ctx, vcpu, VM_REG_GUEST_RIP, newtss.tss_eip); + assert(error == 0); /* Load processor state from new TSS */ error = tss32_restore(ctx, vcpu, task_switch, ot_sel, &newtss, nt_iov); diff --git a/usr.sbin/bhyve/xmsr.c b/usr.sbin/bhyve/xmsr.c index d50a93945449..5b7bfbbc71e3 100644 --- a/usr.sbin/bhyve/xmsr.c +++ b/usr.sbin/bhyve/xmsr.c @@ -185,6 +185,15 @@ emulate_rdmsr(struct vmctx *ctx, int vcpu, uint32_t num, uint64_t *val) *val = 0; break; + /* + * OpenBSD guests test bit 0 of this MSR to detect if the + * workaround for erratum 721 is already applied. 
+ * http://support.amd.com/TechDocs/41322_10h_Rev_Gd.pdf + */ + case 0xC0011029: + *val = 1; + break; + default: error = -1; break; diff --git a/usr.sbin/bhyvectl/bhyvectl.c b/usr.sbin/bhyvectl/bhyvectl.c index 0c4457e6ee87..e2b514d8ec7b 100644 --- a/usr.sbin/bhyvectl/bhyvectl.c +++ b/usr.sbin/bhyvectl/bhyvectl.c @@ -45,6 +45,7 @@ __FBSDID("$FreeBSD$"); #include #include #include +#include #include #include @@ -157,6 +158,11 @@ usage(bool cpu_intel) " [--inject-nmi]\n" " [--force-reset]\n" " [--force-poweroff]\n" + " [--get-rtc-time]\n" + " [--set-rtc-time=]\n" + " [--get-rtc-nvram]\n" + " [--set-rtc-nvram=]\n" + " [--rtc-nvram-offset=]\n" " [--get-active-cpus]\n" " [--get-suspended-cpus]\n" " [--get-intinfo]\n" @@ -220,6 +226,12 @@ usage(bool cpu_intel) exit(1); } +static int get_rtc_time, set_rtc_time; +static int get_rtc_nvram, set_rtc_nvram; +static int rtc_nvram_offset; +static uint8_t rtc_nvram_value; +static time_t rtc_secs; + static int get_stats, getcap, setcap, capval, get_gpa_pmap; static int inject_nmi, assert_lapic_lvt; static int force_reset, force_poweroff; @@ -545,6 +557,9 @@ enum { UNASSIGN_PPTDEV, GET_GPA_PMAP, ASSERT_LAPIC_LVT, + SET_RTC_TIME, + SET_RTC_NVRAM, + RTC_NVRAM_OFFSET, }; static void @@ -1269,6 +1284,11 @@ setup_options(bool cpu_intel) { "setcap", REQ_ARG, 0, SET_CAP }, { "get-gpa-pmap", REQ_ARG, 0, GET_GPA_PMAP }, { "assert-lapic-lvt", REQ_ARG, 0, ASSERT_LAPIC_LVT }, + { "get-rtc-time", NO_ARG, &get_rtc_time, 1 }, + { "set-rtc-time", REQ_ARG, 0, SET_RTC_TIME }, + { "rtc-nvram-offset", REQ_ARG, 0, RTC_NVRAM_OFFSET }, + { "get-rtc-nvram", NO_ARG, &get_rtc_nvram, 1 }, + { "set-rtc-nvram", REQ_ARG, 0, SET_RTC_NVRAM }, { "getcap", NO_ARG, &getcap, 1 }, { "get-stats", NO_ARG, &get_stats, 1 }, { "get-desc-ds",NO_ARG, &get_desc_ds, 1 }, @@ -1462,6 +1482,33 @@ setup_options(bool cpu_intel) return (all_opts); } +static const char * +wday_str(int idx) +{ + static const char *weekdays[] = { + "Sun", "Mon", "Tue", "Wed", "Thu", "Fri", "Sat" + }; + + if (idx >= 0 && idx < 7) + return (weekdays[idx]); + else + return ("UNK"); +} + +static const char * +mon_str(int idx) +{ + static const char *months[] = { + "Jan", "Feb", "Mar", "Apr", "May", "Jun", + "Jul", "Aug", "Sep", "Oct", "Nov", "Dec" + }; + + if (idx >= 0 && idx < 12) + return (months[idx]); + else + return ("UNK"); +} + int main(int argc, char *argv[]) { @@ -1477,6 +1524,7 @@ main(int argc, char *argv[]) cpuset_t cpus; bool cpu_intel; uint64_t cs, ds, es, fs, gs, ss, tr, ldtr; + struct tm tm; struct option *opts; cpu_intel = cpu_vendor_intel(); @@ -1594,6 +1642,17 @@ main(int argc, char *argv[]) capval = strtoul(optarg, NULL, 0); setcap = 1; break; + case SET_RTC_TIME: + rtc_secs = strtoul(optarg, NULL, 0); + set_rtc_time = 1; + break; + case SET_RTC_NVRAM: + rtc_nvram_value = (uint8_t)strtoul(optarg, NULL, 0); + set_rtc_nvram = 1; + break; + case RTC_NVRAM_OFFSET: + rtc_nvram_offset = strtoul(optarg, NULL, 0); + break; case GET_GPA_PMAP: gpa_pmap = strtoul(optarg, NULL, 0); get_gpa_pmap = 1; @@ -1971,6 +2030,31 @@ main(int argc, char *argv[]) } } + if (!error && set_rtc_nvram) + error = vm_rtc_write(ctx, rtc_nvram_offset, rtc_nvram_value); + + if (!error && (get_rtc_nvram || get_all)) { + error = vm_rtc_read(ctx, rtc_nvram_offset, &rtc_nvram_value); + if (error == 0) { + printf("rtc nvram[%03d]: 0x%02x\n", rtc_nvram_offset, + rtc_nvram_value); + } + } + + if (!error && set_rtc_time) + error = vm_rtc_settime(ctx, rtc_secs); + + if (!error && (get_rtc_time || get_all)) { + error = vm_rtc_gettime(ctx, &rtc_secs); + if 
(error == 0) { + gmtime_r(&rtc_secs, &tm); + printf("rtc time %#lx: %s %s %02d %02d:%02d:%02d %d\n", + rtc_secs, wday_str(tm.tm_wday), mon_str(tm.tm_mon), + tm.tm_mday, tm.tm_hour, tm.tm_min, tm.tm_sec, + 1900 + tm.tm_year); + } + } + if (!error && (getcap || get_all)) { int captype, val, getcaptype; @@ -2034,10 +2118,7 @@ main(int argc, char *argv[]) } if (!error && run) { - error = vm_get_register(ctx, vcpu, VM_REG_GUEST_RIP, &rip); - assert(error == 0); - - error = vm_run(ctx, vcpu, rip, &vmexit); + error = vm_run(ctx, vcpu, &vmexit); if (error == 0) dump_vm_run_exitcode(&vmexit, vcpu); else
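[Aside: an illustrative libvmmapi sequence exercising the new RTC plumbing end to end; 'ctx' is assumed to be an open VM handle and the NVRAM offset/value are arbitrary examples within the 0x0e-0x7f NVRAM range.]

	uint8_t value;
	time_t now;
	int error;

	/* Program the virtual RTC from the host clock. */
	time(&now);
	error = vm_rtc_settime(ctx, now);
	assert(error == 0);

	/* Stash a byte in RTC NVRAM and read it back. */
	error = vm_rtc_write(ctx, 0x20, 0xa5);
	assert(error == 0);
	error = vm_rtc_read(ctx, 0x20, &value);
	assert(error == 0 && value == 0xa5);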