From 3e49998fdfce1f7d28cbd813dab17114caeb6392 Mon Sep 17 00:00:00 2001
From: neel
Date: Wed, 26 Mar 2014 23:34:27 +0000
Subject: [PATCH] Add an ioctl to suspend a virtual machine (VM_SUSPEND).

The ioctl can be called from any context, i.e., it is not required to be
called from a vcpu thread. It simply sets the state variable 'vm->suspend'
to '1' and returns. Each vcpu inspects 'vm->suspend' in its run loop and,
if it is set, breaks out of the loop with an exit reason of
'VM_EXITCODE_SUSPENDED'. The suspend handler then waits until all
'vm->active_cpus' have transitioned to 'vm->suspended_cpus' before
returning to userspace.

Discussed with:	grehan
---
 lib/libvmmapi/vmmapi.c      |  7 ++++
 lib/libvmmapi/vmmapi.h      |  1 +
 sys/amd64/include/vmm.h     | 12 +++++-
 sys/amd64/include/vmm_dev.h |  3 ++
 sys/amd64/vmm/amd/amdv.c    |  3 +-
 sys/amd64/vmm/intel/vmx.c   | 22 ++++++++--
 sys/amd64/vmm/vmm.c         | 83 +++++++++++++++++++++++++++++++++++--
 sys/amd64/vmm/vmm_dev.c     |  3 ++
 usr.sbin/bhyve/bhyverun.c   | 35 +++++++++++++++-
 9 files changed, 159 insertions(+), 10 deletions(-)

diff --git a/lib/libvmmapi/vmmapi.c b/lib/libvmmapi/vmmapi.c
index 22b536a104ee..60d3105db206 100644
--- a/lib/libvmmapi/vmmapi.c
+++ b/lib/libvmmapi/vmmapi.c
@@ -342,6 +342,13 @@ vm_run(struct vmctx *ctx, int vcpu, uint64_t rip, struct vm_exit *vmexit)
 	return (error);
 }
 
+int
+vm_suspend(struct vmctx *ctx)
+{
+
+	return (ioctl(ctx->fd, VM_SUSPEND, 0));
+}
+
 static int
 vm_inject_exception_real(struct vmctx *ctx, int vcpu, int vector,
     int error_code, int error_code_valid)
diff --git a/lib/libvmmapi/vmmapi.h b/lib/libvmmapi/vmmapi.h
index 085af94a00d9..ce150d8c3d5e 100644
--- a/lib/libvmmapi/vmmapi.h
+++ b/lib/libvmmapi/vmmapi.h
@@ -61,6 +61,7 @@ int	vm_set_register(struct vmctx *ctx, int vcpu, int reg, uint64_t val);
 int	vm_get_register(struct vmctx *ctx, int vcpu, int reg, uint64_t *retval);
 int	vm_run(struct vmctx *ctx, int vcpu, uint64_t rip,
 	    struct vm_exit *ret_vmexit);
+int	vm_suspend(struct vmctx *ctx);
 int	vm_apicid2vcpu(struct vmctx *ctx, int apicid);
 int	vm_inject_exception(struct vmctx *ctx, int vcpu, int vec);
 int	vm_inject_exception2(struct vmctx *ctx, int vcpu, int vec, int errcode);
diff --git a/sys/amd64/include/vmm.h b/sys/amd64/include/vmm.h
index d50e7bcf6dd2..364bbb855e7f 100644
--- a/sys/amd64/include/vmm.h
+++ b/sys/amd64/include/vmm.h
@@ -53,7 +53,8 @@ typedef int	(*vmm_cleanup_func_t)(void);
 typedef void	(*vmm_resume_func_t)(void);
 typedef void *	(*vmi_init_func_t)(struct vm *vm, struct pmap *pmap);
 typedef int	(*vmi_run_func_t)(void *vmi, int vcpu, register_t rip,
-		    struct pmap *pmap, void *rendezvous_cookie);
+		    struct pmap *pmap, void *rendezvous_cookie,
+		    void *suspend_cookie);
 typedef void	(*vmi_cleanup_func_t)(void *vmi);
 typedef int	(*vmi_get_register_t)(void *vmi, int vcpu, int num,
 				      uint64_t *retval);
@@ -114,6 +115,7 @@ int vm_get_seg_desc(struct vm *vm, int vcpu, int reg,
 int vm_set_seg_desc(struct vm *vm, int vcpu, int reg,
 		    struct seg_desc *desc);
 int vm_run(struct vm *vm, struct vm_run *vmrun);
+int vm_suspend(struct vm *vm);
 int vm_inject_nmi(struct vm *vm, int vcpu);
 int vm_nmi_pending(struct vm *vm, int vcpuid);
 void vm_nmi_clear(struct vm *vm, int vcpuid);
@@ -158,6 +160,13 @@ vcpu_rendezvous_pending(void *rendezvous_cookie)
 	return (*(uintptr_t *)rendezvous_cookie != 0);
 }
 
+static __inline int
+vcpu_suspended(void *suspend_cookie)
+{
+
+	return (*(int *)suspend_cookie);
+}
+
 /*
  * Return 1 if device indicated by bus/slot/func is supposed to be a
  * pci passthrough device.
@@ -311,6 +320,7 @@ enum vm_exitcode {
 	VM_EXITCODE_SPINDOWN_CPU,
 	VM_EXITCODE_RENDEZVOUS,
 	VM_EXITCODE_IOAPIC_EOI,
+	VM_EXITCODE_SUSPENDED,
 	VM_EXITCODE_MAX
 };
 
diff --git a/sys/amd64/include/vmm_dev.h b/sys/amd64/include/vmm_dev.h
index eda9b94f23c3..475a07fb7e8b 100644
--- a/sys/amd64/include/vmm_dev.h
+++ b/sys/amd64/include/vmm_dev.h
@@ -165,6 +165,7 @@ enum {
 	IOCNUM_RUN = 1,
 	IOCNUM_SET_CAPABILITY = 2,
 	IOCNUM_GET_CAPABILITY = 3,
+	IOCNUM_SUSPEND = 4,
 
 	/* memory apis */
 	IOCNUM_MAP_MEMORY = 10,
@@ -212,6 +213,8 @@ enum {
 #define	VM_RUN		\
 	_IOWR('v', IOCNUM_RUN, struct vm_run)
+#define	VM_SUSPEND	\
+	_IO('v', IOCNUM_SUSPEND)
 #define	VM_MAP_MEMORY	\
 	_IOWR('v', IOCNUM_MAP_MEMORY, struct vm_memory_segment)
 #define	VM_GET_MEMORY_SEG \
diff --git a/sys/amd64/vmm/amd/amdv.c b/sys/amd64/vmm/amd/amdv.c
index 39f0ef737d57..4c88d12bee54 100644
--- a/sys/amd64/vmm/amd/amdv.c
+++ b/sys/amd64/vmm/amd/amdv.c
@@ -67,7 +67,8 @@ amdv_vminit(struct vm *vm, struct pmap *pmap)
 }
 
 static int
-amdv_vmrun(void *arg, int vcpu, register_t rip, struct pmap *pmap, void *cookie)
+amdv_vmrun(void *arg, int vcpu, register_t rip, struct pmap *pmap,
+	void *rptr, void *sptr)
 {
 
 	printf("amdv_vmrun: not implemented\n");
diff --git a/sys/amd64/vmm/intel/vmx.c b/sys/amd64/vmm/intel/vmx.c
index a4b799d08885..54322968530a 100644
--- a/sys/amd64/vmm/intel/vmx.c
+++ b/sys/amd64/vmm/intel/vmx.c
@@ -2037,6 +2037,16 @@ vmx_exit_rendezvous(struct vmx *vmx, int vcpu, struct vm_exit *vmexit)
 	return (UNHANDLED);
 }
 
+static __inline int
+vmx_exit_suspended(struct vmx *vmx, int vcpu, struct vm_exit *vmexit)
+{
+
+	vmexit->rip = vmcs_guest_rip();
+	vmexit->inst_length = 0;
+	vmexit->exitcode = VM_EXITCODE_SUSPENDED;
+	return (UNHANDLED);
+}
+
 static __inline int
 vmx_exit_inst_error(struct vmxctx *vmxctx, int rc, struct vm_exit *vmexit)
 {
@@ -2097,7 +2107,7 @@ vmx_exit_handle_nmi(struct vmx *vmx, int vcpuid, struct vm_exit *vmexit)
 
 static int
 vmx_run(void *arg, int vcpu, register_t startrip, pmap_t pmap,
-    void *rendezvous_cookie)
+    void *rendezvous_cookie, void *suspend_cookie)
 {
 	int rc, handled, launched;
 	struct vmx *vmx;
@@ -2154,9 +2164,9 @@ vmx_run(void *arg, int vcpu, register_t startrip, pmap_t pmap,
 		 * pmap_invalidate_ept().
 		 */
 		disable_intr();
-		if (curthread->td_flags & (TDF_ASTPENDING | TDF_NEEDRESCHED)) {
+		if (vcpu_suspended(suspend_cookie)) {
 			enable_intr();
-			handled = vmx_exit_astpending(vmx, vcpu, vmexit);
+			handled = vmx_exit_suspended(vmx, vcpu, vmexit);
 			break;
 		}
 
@@ -2166,6 +2176,12 @@ vmx_run(void *arg, int vcpu, register_t startrip, pmap_t pmap,
 			break;
 		}
 
+		if (curthread->td_flags & (TDF_ASTPENDING | TDF_NEEDRESCHED)) {
+			enable_intr();
+			handled = vmx_exit_astpending(vmx, vcpu, vmexit);
+			break;
+		}
+
 		vmx_inject_interrupts(vmx, vcpu, vlapic);
 		vmx_run_trace(vmx, vcpu);
 		rc = vmx_enter_guest(vmxctx, vmx, launched);
diff --git a/sys/amd64/vmm/vmm.c b/sys/amd64/vmm/vmm.c
index 9d740d10f532..a835c351cb82 100644
--- a/sys/amd64/vmm/vmm.c
+++ b/sys/amd64/vmm/vmm.c
@@ -139,6 +139,9 @@ struct vm {
 	cpuset_t	rendezvous_done_cpus;
 	void		*rendezvous_arg;
 	vm_rendezvous_func_t rendezvous_func;
+
+	int		suspend;
+	volatile cpuset_t suspended_cpus;
 };
 
 static int vmm_initialized;
@@ -149,8 +152,8 @@ static struct vmm_ops *ops;
 #define	VMM_RESUME()	(ops != NULL ? (*ops->resume)() : 0)
 
 #define	VMINIT(vm, pmap) (ops != NULL ? (*ops->vminit)(vm, pmap): NULL)
-#define	VMRUN(vmi, vcpu, rip, pmap, rptr) \
-	(ops != NULL ? (*ops->vmrun)(vmi, vcpu, rip, pmap, rptr) : ENXIO)
+#define	VMRUN(vmi, vcpu, rip, pmap, rptr, sptr) \
+	(ops != NULL ? (*ops->vmrun)(vmi, vcpu, rip, pmap, rptr, sptr) : ENXIO)
 #define	VMCLEANUP(vmi)	(ops != NULL ? (*ops->vmcleanup)(vmi) : NULL)
 #define	VMSPACE_ALLOC(min, max) \
 	(ops != NULL ? (*ops->vmspace_alloc)(min, max) : NULL)
@@ -1019,7 +1022,8 @@ vm_handle_hlt(struct vm *vm, int vcpuid, bool intr_disabled, bool *retu)
 	 * These interrupts could have happened any time after we
 	 * returned from VMRUN() and before we grabbed the vcpu lock.
 	 */
-	if (!vm_nmi_pending(vm, vcpuid) &&
+	if (vm->rendezvous_func == NULL &&
+	    !vm_nmi_pending(vm, vcpuid) &&
 	    (intr_disabled || !vlapic_pending_intr(vcpu->vlapic, NULL))) {
 		t = ticks;
 		vcpu_require_state_locked(vcpu, VCPU_SLEEPING);
@@ -1152,6 +1156,71 @@ vm_handle_inst_emul(struct vm *vm, int vcpuid, bool *retu)
 	return (error);
 }
 
+static int
+vm_handle_suspend(struct vm *vm, int vcpuid, bool *retu)
+{
+	int i, done;
+	struct vcpu *vcpu;
+
+	done = 0;
+	vcpu = &vm->vcpu[vcpuid];
+
+	CPU_SET_ATOMIC(vcpuid, &vm->suspended_cpus);
+
+	/*
+	 * Wait until all 'active_cpus' have suspended themselves.
+	 *
+	 * Since a VM may be suspended at any time including when one or
+	 * more vcpus are doing a rendezvous we need to call the rendezvous
+	 * handler while we are waiting to prevent a deadlock.
+	 */
+	vcpu_lock(vcpu);
+	while (1) {
+		if (CPU_CMP(&vm->suspended_cpus, &vm->active_cpus) == 0) {
+			VCPU_CTR0(vm, vcpuid, "All vcpus suspended");
+			break;
+		}
+
+		if (vm->rendezvous_func == NULL) {
+			VCPU_CTR0(vm, vcpuid, "Sleeping during suspend");
+			vcpu_require_state_locked(vcpu, VCPU_SLEEPING);
+			msleep_spin(vcpu, &vcpu->mtx, "vmsusp", hz);
+			vcpu_require_state_locked(vcpu, VCPU_FROZEN);
+		} else {
+			VCPU_CTR0(vm, vcpuid, "Rendezvous during suspend");
+			vcpu_unlock(vcpu);
+			vm_handle_rendezvous(vm, vcpuid);
+			vcpu_lock(vcpu);
+		}
+	}
+	vcpu_unlock(vcpu);
+
+	/*
+	 * Wakeup the other sleeping vcpus and return to userspace.
+	 */
+	for (i = 0; i < VM_MAXCPU; i++) {
+		if (CPU_ISSET(i, &vm->suspended_cpus)) {
+			vcpu_notify_event(vm, i, false);
+		}
+	}
+
+	*retu = true;
+	return (0);
+}
+
+int
+vm_suspend(struct vm *vm)
+{
+
+	if (atomic_cmpset_int(&vm->suspend, 0, 1)) {
+		VM_CTR0(vm, "virtual machine suspended");
+		return (0);
+	} else {
+		VM_CTR0(vm, "virtual machine already suspended");
+		return (EALREADY);
+	}
+}
+
 int
 vm_run(struct vm *vm, struct vm_run *vmrun)
 {
@@ -1162,12 +1231,15 @@ vm_run(struct vm *vm, struct vm_run *vmrun)
 	struct vm_exit *vme;
 	bool retu, intr_disabled;
 	pmap_t pmap;
+	void *rptr, *sptr;
 
 	vcpuid = vmrun->cpuid;
 
 	if (vcpuid < 0 || vcpuid >= VM_MAXCPU)
 		return (EINVAL);
 
+	rptr = &vm->rendezvous_func;
+	sptr = &vm->suspend;
 	pmap = vmspace_pmap(vm->vmspace);
 	vcpu = &vm->vcpu[vcpuid];
 	vme = &vcpu->exitinfo;
@@ -1187,7 +1259,7 @@ restart:
 	restore_guest_fpustate(vcpu);
 
 	vcpu_require_state(vm, vcpuid, VCPU_RUNNING);
-	error = VMRUN(vm->cookie, vcpuid, rip, pmap, &vm->rendezvous_func);
+	error = VMRUN(vm->cookie, vcpuid, rip, pmap, rptr, sptr);
 	vcpu_require_state(vm, vcpuid, VCPU_FROZEN);
 
 	save_guest_fpustate(vcpu);
@@ -1200,6 +1272,9 @@ restart:
 	if (error == 0) {
 		retu = false;
 		switch (vme->exitcode) {
+		case VM_EXITCODE_SUSPENDED:
+			error = vm_handle_suspend(vm, vcpuid, &retu);
+			break;
 		case VM_EXITCODE_IOAPIC_EOI:
 			vioapic_process_eoi(vm, vcpuid,
 			    vme->u.ioapic_eoi.vector);
diff --git a/sys/amd64/vmm/vmm_dev.c b/sys/amd64/vmm/vmm_dev.c
index 5733685ee9ac..f8f854be3a3f 100644
--- a/sys/amd64/vmm/vmm_dev.c
+++ b/sys/amd64/vmm/vmm_dev.c
@@ -239,6 +239,9 @@ vmmdev_ioctl(struct cdev *cdev, u_long cmd, caddr_t data, int fflag,
 		vmrun = (struct vm_run *)data;
 		error = vm_run(sc->vm, vmrun);
 		break;
+	case VM_SUSPEND:
+		error = vm_suspend(sc->vm);
+		break;
 	case VM_STAT_DESC: {
 		statdesc = (struct vm_stat_desc *)data;
 		error = vmm_stat_desc_copy(statdesc->index,
diff --git a/usr.sbin/bhyve/bhyverun.c b/usr.sbin/bhyve/bhyverun.c
index 99194f24e18e..6aac50e8703b 100644
--- a/usr.sbin/bhyve/bhyverun.c
+++ b/usr.sbin/bhyve/bhyverun.c
@@ -463,6 +463,33 @@ vmexit_inst_emul(struct vmctx *ctx, struct vm_exit *vmexit, int *pvcpu)
 	return (VMEXIT_CONTINUE);
 }
 
+static pthread_mutex_t resetcpu_mtx = PTHREAD_MUTEX_INITIALIZER;
+static pthread_cond_t resetcpu_cond = PTHREAD_COND_INITIALIZER;
+static int resetcpu = -1;
+
+static int
+vmexit_suspend(struct vmctx *ctx, struct vm_exit *vmexit, int *pvcpu)
+{
+
+	assert(resetcpu != -1);
+
+	fbsdrun_deletecpu(ctx, *pvcpu);
+
+	if (*pvcpu != resetcpu) {
+		pthread_mutex_lock(&resetcpu_mtx);
+		pthread_cond_signal(&resetcpu_cond);
+		pthread_mutex_unlock(&resetcpu_mtx);
+		pthread_exit(NULL);
+	}
+
+	pthread_mutex_lock(&resetcpu_mtx);
+	while (!CPU_EMPTY(&cpumask)) {
+		pthread_cond_wait(&resetcpu_cond, &resetcpu_mtx);
+	}
+	pthread_mutex_unlock(&resetcpu_mtx);
+	exit(0);
+}
+
 static vmexit_handler_t handler[VM_EXITCODE_MAX] = {
 	[VM_EXITCODE_INOUT] = vmexit_inout,
 	[VM_EXITCODE_VMX] = vmexit_vmx,
@@ -473,6 +500,7 @@ static vmexit_handler_t handler[VM_EXITCODE_MAX] = {
 	[VM_EXITCODE_INST_EMUL] = vmexit_inst_emul,
 	[VM_EXITCODE_SPINUP_AP] = vmexit_spinup_ap,
 	[VM_EXITCODE_SPINDOWN_CPU] = vmexit_spindown_cpu,
+	[VM_EXITCODE_SUSPENDED] = vmexit_suspend
 };
 
 static void
@@ -514,7 +542,12 @@ vm_loop(struct vmctx *ctx, int vcpu, uint64_t rip)
 			rip = vmexit[vcpu].rip;
 			break;
 		case VMEXIT_RESET:
-			exit(0);
+			if (vm_suspend(ctx) == 0) {
+				assert(resetcpu == -1);
+				resetcpu = vcpu;
+			}
+			rip = vmexit[vcpu].rip + vmexit[vcpu].inst_length;
+			break;
 		default:
 			exit(1);
 		}
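
Usage note (illustrative, not part of the patch): since VM_SUSPEND is a plain
_IO ioctl on the VM's /dev/vmm/<name> device node, suspension can be requested
by any process able to open that node, not just the bhyve process that owns
the vcpu threads. The standalone program below is a minimal sketch of that,
using only vm_open() and the vm_suspend() wrapper added above; the file name,
build line, and error handling are assumptions for the example.

/*
 * suspendvm.c: minimal sketch, assuming a system with this patch applied.
 * Build: cc suspendvm.c -lvmmapi
 */
#include <sys/types.h>

#include <machine/vmm.h>

#include <errno.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#include <vmmapi.h>

int
main(int argc, char *argv[])
{
	struct vmctx *ctx;

	if (argc != 2) {
		fprintf(stderr, "usage: %s <vmname>\n", argv[0]);
		exit(1);
	}

	/* Attach to an existing VM, e.g. one started by bhyve(8). */
	ctx = vm_open(argv[1]);
	if (ctx == NULL) {
		fprintf(stderr, "vm_open: %s\n", strerror(errno));
		exit(1);
	}

	/*
	 * Sets 'vm->suspend' to '1' in the kernel and returns at once;
	 * the vcpu threads notice the flag in their run loops and exit
	 * to userspace with VM_EXITCODE_SUSPENDED.
	 */
	if (vm_suspend(ctx) != 0) {
		if (errno == EALREADY)
			fprintf(stderr, "%s: suspend already in progress\n",
			    argv[1]);
		else
			fprintf(stderr, "vm_suspend: %s\n", strerror(errno));
		exit(1);
	}
	return (0);
}

A second suspend request fails because vm_suspend() in vmm.c returns EALREADY
once 'vm->suspend' is set, which the libvmmapi wrapper surfaces as -1 with
errno set; callers can therefore distinguish "suspend initiated" from
"suspend already in progress".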