Simplify instruction restart logic in bhyve.

Keep track of the next instruction to be executed by the vcpu as 'nextrip'.
As a result the VM_RUN ioctl no longer takes the %rip where a vcpu should
start execution.

Also, instruction restart happens implicitly via 'vm_inject_exception()' or
explicitly via 'vm_restart_instruction()'. The APIs behave identically in
both kernel and userspace contexts. The main beneficiary is the instruction
emulation code that executes in both contexts.

bhyve(8) VM exit handlers now treat 'vmexit->rip' and 'vmexit->inst_length'
as read-only (see the sketch after this list):
- Restarting an instruction is now done by calling 'vm_restart_instruction()'
  as opposed to setting 'vmexit->inst_length' to 0 (e.g. emulate_inout())
- Resuming vcpu at an arbitrary %rip is now done by setting VM_REG_GUEST_RIP
  as opposed to changing 'vmexit->rip' (e.g. vmexit_task_switch())
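In bhyve(8) terms, the two new idioms look roughly like this (a minimal sketch
against the changed libvmmapi interface; the helper names are illustrative, not
part of this commit):

#include <sys/types.h>
#include <assert.h>
#include <machine/vmm.h>
#include <vmmapi.h>

/* Replaces the old 'vmexit->inst_length = 0' idiom in exit handlers. */
static void
retry_current_instruction(struct vmctx *ctx, int vcpu)
{
        int error;

        error = vm_restart_instruction(ctx, vcpu);
        assert(error == 0);
}

/* Replaces passing %rip to vm_run(): seed it via the register API. */
static int
run_vcpu_from(struct vmctx *ctx, int vcpu, uint64_t rip, struct vm_exit *vmexit)
{
        int error;

        error = vm_set_register(ctx, vcpu, VM_REG_GUEST_RIP, rip);
        assert(error == 0);
        return (vm_run(ctx, vcpu, vmexit));
}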

Differential Revision:	https://reviews.freebsd.org/D1526
Reviewed by:		grehan
MFC after:		2 weeks
Neel Natu, 2015-01-18 03:08:30 +0000
parent 1bd468a4e9, commit d087a39935
Notes (svn2git, 2020-12-20 02:59:44 +00:00): svn path=/head/; revision=277310
11 changed files with 125 additions and 98 deletions


@@ -368,14 +368,13 @@ vm_get_register(struct vmctx *ctx, int vcpu, int reg, uint64_t *ret_val)
}
int
- vm_run(struct vmctx *ctx, int vcpu, uint64_t rip, struct vm_exit *vmexit)
+ vm_run(struct vmctx *ctx, int vcpu, struct vm_exit *vmexit)
{
int error;
struct vm_run vmrun;
bzero(&vmrun, sizeof(vmrun));
vmrun.cpuid = vcpu;
- vmrun.rip = rip;
error = ioctl(ctx->fd, VM_RUN, &vmrun);
bcopy(&vmrun.vm_exit, vmexit, sizeof(struct vm_exit));
@@ -399,35 +398,21 @@ vm_reinit(struct vmctx *ctx)
return (ioctl(ctx->fd, VM_REINIT, 0));
}
- static int
- vm_inject_exception_real(struct vmctx *ctx, int vcpu, int vector,
- int error_code, int error_code_valid)
+ int
+ vm_inject_exception(struct vmctx *ctx, int vcpu, int vector, int errcode_valid,
+ uint32_t errcode, int restart_instruction)
{
struct vm_exception exc;
bzero(&exc, sizeof(exc));
exc.cpuid = vcpu;
exc.vector = vector;
- exc.error_code = error_code;
- exc.error_code_valid = error_code_valid;
+ exc.error_code = errcode;
+ exc.error_code_valid = errcode_valid;
+ exc.restart_instruction = restart_instruction;
return (ioctl(ctx->fd, VM_INJECT_EXCEPTION, &exc));
}
- int
- vm_inject_exception(struct vmctx *ctx, int vcpu, int vector)
- {
- return (vm_inject_exception_real(ctx, vcpu, vector, 0, 0));
- }
- int
- vm_inject_exception2(struct vmctx *ctx, int vcpu, int vector, int errcode)
- {
- return (vm_inject_exception_real(ctx, vcpu, vector, errcode, 1));
- }
int
vm_apicid2vcpu(struct vmctx *ctx, int apicid)
{
@@ -1198,3 +1183,11 @@ vm_rtc_gettime(struct vmctx *ctx, time_t *secs)
*secs = rtctime.secs;
return (error);
}
+ int
+ vm_restart_instruction(void *arg, int vcpu)
+ {
+ struct vmctx *ctx = arg;
+ return (ioctl(ctx->fd, VM_RESTART_INSTRUCTION, &vcpu));
+ }


@@ -32,6 +32,12 @@
#include <sys/param.h>
#include <sys/cpuset.h>
+ /*
+ * API version for out-of-tree consumers like grub-bhyve for making compile
+ * time decisions.
+ */
+ #define VMMAPI_VERSION 0101 /* 2 digit major followed by 2 digit minor */
struct iovec;
struct vmctx;
enum x2apic_state;
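Out-of-tree consumers can test this macro at compile time to pick the right
interface, e.g. (a sketch; grub-bhyve's actual usage may differ):

#if defined(VMMAPI_VERSION) && VMMAPI_VERSION >= 0101
        error = vm_run(ctx, vcpu, &vmexit);             /* new: no %rip */
#else
        error = vm_run(ctx, vcpu, rip, &vmexit);        /* old interface */
#endif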
@@ -70,13 +76,12 @@ int vm_get_seg_desc(struct vmctx *ctx, int vcpu, int reg,
struct seg_desc *seg_desc);
int vm_set_register(struct vmctx *ctx, int vcpu, int reg, uint64_t val);
int vm_get_register(struct vmctx *ctx, int vcpu, int reg, uint64_t *retval);
- int vm_run(struct vmctx *ctx, int vcpu, uint64_t rip,
- struct vm_exit *ret_vmexit);
+ int vm_run(struct vmctx *ctx, int vcpu, struct vm_exit *ret_vmexit);
int vm_suspend(struct vmctx *ctx, enum vm_suspend_how how);
int vm_reinit(struct vmctx *ctx);
int vm_apicid2vcpu(struct vmctx *ctx, int apicid);
- int vm_inject_exception(struct vmctx *ctx, int vcpu, int vec);
- int vm_inject_exception2(struct vmctx *ctx, int vcpu, int vec, int errcode);
+ int vm_inject_exception(struct vmctx *ctx, int vcpu, int vector,
+ int errcode_valid, uint32_t errcode, int restart_instruction);
int vm_lapic_irq(struct vmctx *ctx, int vcpu, int vector);
int vm_lapic_local_irq(struct vmctx *ctx, int vcpu, int vector);
int vm_lapic_msi(struct vmctx *ctx, uint64_t addr, uint64_t msg);


@@ -54,7 +54,6 @@ struct vm_seg_desc { /* data or code segment */
struct vm_run {
int cpuid;
- uint64_t rip; /* start running here */
struct vm_exit vm_exit;
};
@@ -238,6 +237,7 @@ enum {
IOCNUM_LAPIC_MSI = 36,
IOCNUM_LAPIC_LOCAL_IRQ = 37,
IOCNUM_IOAPIC_PINCOUNT = 38,
+ IOCNUM_RESTART_INSTRUCTION = 39,
/* PCI pass-thru */
IOCNUM_BIND_PPTDEV = 40,
@@ -360,4 +360,6 @@ enum {
_IOW('v', IOCNUM_RTC_SETTIME, struct vm_rtc_time)
#define VM_RTC_GETTIME \
_IOR('v', IOCNUM_RTC_GETTIME, struct vm_rtc_time)
+ #define VM_RESTART_INSTRUCTION \
+ _IOW('v', IOCNUM_RESTART_INSTRUCTION, int)
#endif
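Used without the libvmmapi wrapper, the new ioctl looks like this (a sketch;
'vmfd' stands for an open /dev/vmm/<name> descriptor, and the vcpu id is passed
by address because the vmm device reads the target vcpu from the start of the
ioctl payload):

#include <sys/ioctl.h>
#include <machine/vmm_dev.h>
#include <err.h>

        int vcpu = 0;

        if (ioctl(vmfd, VM_RESTART_INSTRUCTION, &vcpu) == -1)
                err(1, "VM_RESTART_INSTRUCTION");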


@@ -2412,6 +2412,7 @@ vmx_exit_process(struct vmx *vmx, int vcpu, struct vm_exit *vmexit)
if (vm_mem_allocated(vmx->vm, gpa) ||
apic_access_fault(vmx, vcpu, gpa)) {
vmexit->exitcode = VM_EXITCODE_PAGING;
+ vmexit->inst_length = 0;
vmexit->u.paging.gpa = gpa;
vmexit->u.paging.fault_type = ept_fault_type(qual);
vmm_stat_incr(vmx->vm, vcpu, VMEXIT_NESTED_FAULT, 1);


@@ -109,6 +109,7 @@ struct vcpu {
uint64_t guest_xcr0; /* (i) guest %xcr0 register */
void *stats; /* (a,i) statistics */
struct vm_exit exitinfo; /* (x) exit reason and collateral */
+ uint64_t nextrip; /* (x) next instruction to execute */
};
#define vcpu_lock_initialized(v) mtx_initialized(&((v)->mtx))
@@ -850,16 +851,26 @@ vm_get_register(struct vm *vm, int vcpu, int reg, uint64_t *retval)
}
int
- vm_set_register(struct vm *vm, int vcpu, int reg, uint64_t val)
+ vm_set_register(struct vm *vm, int vcpuid, int reg, uint64_t val)
{
+ struct vcpu *vcpu;
+ int error;
- if (vcpu < 0 || vcpu >= VM_MAXCPU)
+ if (vcpuid < 0 || vcpuid >= VM_MAXCPU)
return (EINVAL);
if (reg >= VM_REG_LAST)
return (EINVAL);
- return (VMSETREG(vm->cookie, vcpu, reg, val));
+ error = VMSETREG(vm->cookie, vcpuid, reg, val);
+ if (error || reg != VM_REG_GUEST_RIP)
+ return (error);
+ /* Set 'nextrip' to match the value of %rip */
+ VCPU_CTR1(vm, vcpuid, "Setting nextrip to %#lx", val);
+ vcpu = &vm->vcpu[vcpuid];
+ vcpu->nextrip = val;
+ return (0);
}
static boolean_t
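Since VMRUN() now always resumes at 'vcpu->nextrip', any code that moves %rip
must keep the two in sync, which the hook in vm_set_register() above does
centrally. A sketch of what callers rely on (kernel context; the target
address is illustrative):

        /*
         * Vector the vcpu to a new address. Without the hook, a stale
         * 'nextrip' from the last exit would win on the next VMRUN().
         */
        error = vm_set_register(vm, vcpuid, VM_REG_GUEST_RIP, 0xfff0);
        /* side effect inside vm_set_register(): vcpu->nextrip = 0xfff0 */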
@@ -1199,6 +1210,9 @@ vm_handle_paging(struct vm *vm, int vcpuid, bool *retu)
vcpu = &vm->vcpu[vcpuid];
vme = &vcpu->exitinfo;
+ KASSERT(vme->inst_length == 0, ("%s: invalid inst_length %d",
+ __func__, vme->inst_length));
ftype = vme->u.paging.fault_type;
KASSERT(ftype == VM_PROT_READ ||
ftype == VM_PROT_WRITE || ftype == VM_PROT_EXECUTE,
@@ -1224,9 +1238,6 @@ vm_handle_paging(struct vm *vm, int vcpuid, bool *retu)
if (rv != KERN_SUCCESS)
return (EFAULT);
done:
- /* restart execution at the faulting instruction */
- vm_restart_instruction(vm, vcpuid);
return (0);
}
@@ -1281,10 +1292,13 @@ vm_handle_inst_emul(struct vm *vm, int vcpuid, bool *retu)
return (EFAULT);
/*
- * If the instruction length is not specified the update it now.
+ * If the instruction length was not specified then update it now
+ * along with 'nextrip'.
*/
- if (vme->inst_length == 0)
+ if (vme->inst_length == 0) {
vme->inst_length = vie->num_processed;
+ vcpu->nextrip += vie->num_processed;
+ }
/* return to userland unless this is an in-kernel emulated device */
if (gpa >= DEFAULT_APIC_BASE && gpa < DEFAULT_APIC_BASE + PAGE_SIZE) {
@@ -1433,7 +1447,7 @@ vm_run(struct vm *vm, struct vm_run *vmrun)
int error, vcpuid;
struct vcpu *vcpu;
struct pcb *pcb;
- uint64_t tscval, rip;
+ uint64_t tscval;
struct vm_exit *vme;
bool retu, intr_disabled;
pmap_t pmap;
@@ -1455,7 +1469,6 @@ vm_run(struct vm *vm, struct vm_run *vmrun)
pmap = vmspace_pmap(vm->vmspace);
vcpu = &vm->vcpu[vcpuid];
vme = &vcpu->exitinfo;
- rip = vmrun->rip;
restart:
critical_enter();
@@ -1470,7 +1483,7 @@ vm_run(struct vm *vm, struct vm_run *vmrun)
restore_guest_fpustate(vcpu);
vcpu_require_state(vm, vcpuid, VCPU_RUNNING);
- error = VMRUN(vm->cookie, vcpuid, rip, pmap, rptr, sptr);
+ error = VMRUN(vm->cookie, vcpuid, vcpu->nextrip, pmap, rptr, sptr);
vcpu_require_state(vm, vcpuid, VCPU_FROZEN);
save_guest_fpustate(vcpu);
@@ -1481,6 +1494,7 @@ vm_run(struct vm *vm, struct vm_run *vmrun)
if (error == 0) {
retu = false;
+ vcpu->nextrip = vme->rip + vme->inst_length;
switch (vme->exitcode) {
case VM_EXITCODE_SUSPENDED:
error = vm_handle_suspend(vm, vcpuid, &retu);
@@ -1517,10 +1531,8 @@ vm_run(struct vm *vm, struct vm_run *vmrun)
}
}
- if (error == 0 && retu == false) {
- rip = vme->rip + vme->inst_length;
+ if (error == 0 && retu == false)
goto restart;
- }
/* copy the exit information */
bcopy(vme, &vmrun->vm_exit, sizeof(struct vm_exit));
@@ -1530,14 +1542,43 @@ vm_run(struct vm *vm, struct vm_run *vmrun)
int
vm_restart_instruction(void *arg, int vcpuid)
{
- struct vm *vm;
struct vcpu *vcpu;
+ struct vm *vm = arg;
+ enum vcpu_state state;
+ uint64_t rip;
+ int error;
- vm = arg;
if (vcpuid < 0 || vcpuid >= VM_MAXCPU)
return (EINVAL);
vcpu = &vm->vcpu[vcpuid];
- vcpu->exitinfo.inst_length = 0;
+ state = vcpu_get_state(vm, vcpuid, NULL);
+ if (state == VCPU_RUNNING) {
+ /*
+ * When a vcpu is "running" the next instruction is determined
+ * by adding 'rip' and 'inst_length' in the vcpu's 'exitinfo'.
+ * Thus setting 'inst_length' to zero will cause the current
+ * instruction to be restarted.
+ */
+ vcpu->exitinfo.inst_length = 0;
+ VCPU_CTR1(vm, vcpuid, "restarting instruction at %#lx by "
+ "setting inst_length to zero", vcpu->exitinfo.rip);
+ } else if (state == VCPU_FROZEN) {
+ /*
+ * When a vcpu is "frozen" it is outside the critical section
+ * around VMRUN() and 'nextrip' points to the next instruction.
+ * Thus instruction restart is achieved by setting 'nextrip'
+ * to the vcpu's %rip.
+ */
+ error = vm_get_register(vm, vcpuid, VM_REG_GUEST_RIP, &rip);
+ KASSERT(!error, ("%s: error %d getting rip", __func__, error));
+ VCPU_CTR2(vm, vcpuid, "restarting instruction by updating "
+ "nextrip from %#lx to %#lx", vcpu->nextrip, rip);
+ vcpu->nextrip = rip;
+ } else {
+ panic("%s: invalid state %d", __func__, state);
+ }
return (0);
}
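The kernel implementation above and the libvmmapi wrapper shown earlier share
one signature, 'int vm_restart_instruction(void *arg, int vcpuid)', so
emulation code that is compiled into both the kernel and bhyve(8) can restart
an instruction without knowing which context it is running in; a sketch (the
wrapper function is hypothetical):

static int
emul_restart(void *vm_or_ctx, int vcpuid)
{
        /* 'struct vm *' in the kernel, 'struct vmctx *' in bhyve(8) */
        return (vm_restart_instruction(vm_or_ctx, vcpuid));
}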


@@ -205,6 +205,7 @@ vmmdev_ioctl(struct cdev *cdev, u_long cmd, caddr_t data, int fflag,
case VM_ACTIVATE_CPU:
case VM_SET_INTINFO:
case VM_GET_INTINFO:
+ case VM_RESTART_INSTRUCTION:
/*
* XXX fragile, handle with care
* Assumes that the first field of the ioctl data is the vcpu.
@@ -506,6 +507,9 @@ vmmdev_ioctl(struct cdev *cdev, u_long cmd, caddr_t data, int fflag,
rtctime = (struct vm_rtc_time *)data;
rtctime->secs = vrtc_get_time(sc->vm);
break;
+ case VM_RESTART_INSTRUCTION:
+ error = vm_restart_instruction(sc->vm, vcpu);
+ break;
default:
error = ENOTTY;
break;


@@ -185,20 +185,14 @@ vm_inject_fault(void *arg, int vcpu, int vector, int errcode_valid,
int errcode)
{
struct vmctx *ctx;
- int error;
+ int error, restart_instruction;
ctx = arg;
- if (errcode_valid)
- error = vm_inject_exception2(ctx, vcpu, vector, errcode);
- else
- error = vm_inject_exception(ctx, vcpu, vector);
- assert(error == 0);
+ restart_instruction = 1;
- /*
- * Set the instruction length to 0 to ensure that the instruction is
- * restarted when the fault handler returns.
- */
- vmexit[vcpu].inst_length = 0;
+ error = vm_inject_exception(ctx, vcpu, vector, errcode_valid, errcode,
+ restart_instruction);
+ assert(error == 0);
}
void *
@@ -329,12 +323,6 @@ vmexit_inout(struct vmctx *ctx, struct vm_exit *vme, int *pvcpu)
}
error = emulate_inout(ctx, vcpu, vme, strictio);
- if (!error && in && !string) {
- error = vm_set_register(ctx, vcpu, VM_REG_GUEST_RAX,
- vme->u.inout.eax);
- assert(error == 0);
- }
if (error) {
fprintf(stderr, "Unhandled %s%c 0x%04x\n", in ? "in" : "out",
bytes == 1 ? 'b' : (bytes == 2 ? 'w' : 'l'), port);
@@ -358,7 +346,7 @@ vmexit_rdmsr(struct vmctx *ctx, struct vm_exit *vme, int *pvcpu)
vme->u.msr.code, *pvcpu);
if (strictmsr) {
vm_inject_gp(ctx, *pvcpu);
- return (VMEXIT_RESTART);
+ return (VMEXIT_CONTINUE);
}
}
@@ -384,7 +372,7 @@ vmexit_wrmsr(struct vmctx *ctx, struct vm_exit *vme, int *pvcpu)
vme->u.msr.code, vme->u.msr.wval, *pvcpu);
if (strictmsr) {
vm_inject_gp(ctx, *pvcpu);
return (VMEXIT_RESTART);
return (VMEXIT_CONTINUE);
}
}
return (VMEXIT_CONTINUE);
@@ -462,9 +450,11 @@ static int
vmexit_bogus(struct vmctx *ctx, struct vm_exit *vmexit, int *pvcpu)
{
+ assert(vmexit->inst_length == 0);
stats.vmexit_bogus++;
- return (VMEXIT_RESTART);
+ return (VMEXIT_CONTINUE);
}
static int
@@ -494,9 +484,11 @@ static int
vmexit_mtrap(struct vmctx *ctx, struct vm_exit *vmexit, int *pvcpu)
{
+ assert(vmexit->inst_length == 0);
stats.vmexit_mtrap++;
- return (VMEXIT_RESTART);
+ return (VMEXIT_CONTINUE);
}
static int
@@ -581,7 +573,7 @@ static vmexit_handler_t handler[VM_EXITCODE_MAX] = {
};
static void
- vm_loop(struct vmctx *ctx, int vcpu, uint64_t rip)
+ vm_loop(struct vmctx *ctx, int vcpu, uint64_t startrip)
{
int error, rc, prevcpu;
enum vm_exitcode exitcode;
@@ -596,8 +588,11 @@ vm_loop(struct vmctx *ctx, int vcpu, uint64_t rip)
error = vm_active_cpus(ctx, &active_cpus);
assert(CPU_ISSET(vcpu, &active_cpus));
+ error = vm_set_register(ctx, vcpu, VM_REG_GUEST_RIP, startrip);
+ assert(error == 0);
while (1) {
- error = vm_run(ctx, vcpu, rip, &vmexit[vcpu]);
+ error = vm_run(ctx, vcpu, &vmexit[vcpu]);
if (error != 0)
break;
@@ -614,10 +609,6 @@ vm_loop(struct vmctx *ctx, int vcpu, uint64_t rip)
switch (rc) {
case VMEXIT_CONTINUE:
- rip = vmexit[vcpu].rip + vmexit[vcpu].inst_length;
break;
- case VMEXIT_RESTART:
- rip = vmexit[vcpu].rip;
- break;
case VMEXIT_ABORT:
abort();


@@ -35,9 +35,8 @@
#define __CTASSERT(x, y) typedef char __assert ## y[(x) ? 1 : -1]
#endif
- #define VMEXIT_CONTINUE 1 /* continue from next instruction */
- #define VMEXIT_RESTART 2 /* restart current instruction */
- #define VMEXIT_ABORT 3 /* abort the vm run loop */
+ #define VMEXIT_CONTINUE (0)
+ #define VMEXIT_ABORT (-1)
struct vmctx;
extern int guest_ncpus;


@@ -104,7 +104,7 @@ int
emulate_inout(struct vmctx *ctx, int vcpu, struct vm_exit *vmexit, int strict)
{
int addrsize, bytes, flags, in, port, prot, rep;
- uint32_t val;
+ uint32_t eax, val;
inout_func_t handler;
void *arg;
int error, retval;
@@ -214,16 +214,20 @@ emulate_inout(struct vmctx *ctx, int vcpu, struct vm_exit *vmexit, int strict)
}
/* Restart the instruction if more iterations remain */
- if (retval == 0 && count != 0)
- vmexit->inst_length = 0;
- } else {
- if (!in) {
- val = vmexit->u.inout.eax & vie_size2mask(bytes);
+ if (retval == 0 && count != 0) {
+ error = vm_restart_instruction(ctx, vcpu);
+ assert(error == 0);
+ }
+ } else {
+ eax = vmexit->u.inout.eax;
+ val = eax & vie_size2mask(bytes);
retval = handler(ctx, vcpu, in, port, bytes, &val, arg);
if (retval == 0 && in) {
- vmexit->u.inout.eax &= ~vie_size2mask(bytes);
- vmexit->u.inout.eax |= val & vie_size2mask(bytes);
+ eax &= ~vie_size2mask(bytes);
+ eax |= val & vie_size2mask(bytes);
+ error = vm_set_register(ctx, vcpu, VM_REG_GUEST_RAX,
+ eax);
+ assert(error == 0);
}
}
return (retval);
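The mask arithmetic above merges the handler's result into the low bytes of
guest %eax while preserving the upper bytes. A worked example, assuming
vie_size2mask(2) yields 0xffff:

        uint32_t eax = 0xdeadbeef;      /* guest %eax before a 2-byte 'in' */
        uint32_t val = 0x1234;          /* value produced by the handler */

        eax &= ~vie_size2mask(2);       /* eax == 0xdead0000 */
        eax |= val & vie_size2mask(2);  /* eax == 0xdead1234 */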


@@ -725,20 +725,10 @@ vmexit_task_switch(struct vmctx *ctx, struct vm_exit *vmexit, int *pvcpu)
assert(paging->cpu_mode == CPU_MODE_PROTECTED);
/*
- * Calculate the %eip to store in the old TSS before modifying the
- * 'inst_length'.
+ * Calculate the instruction pointer to store in the old TSS.
*/
eip = vmexit->rip + vmexit->inst_length;
- /*
- * Set the 'inst_length' to '0'.
- *
- * If an exception is triggered during emulation of the task switch
- * then the exception handler should return to the instruction that
- * caused the task switch as opposed to the subsequent instruction.
- */
- vmexit->inst_length = 0;
/*
* Section 4.6, "Access Rights" in Intel SDM Vol 3.
* The following page table accesses are implicitly supervisor mode:
@@ -883,8 +873,8 @@ vmexit_task_switch(struct vmctx *ctx, struct vm_exit *vmexit, int *pvcpu)
* after this point will be handled in the context of the new task and
* the saved instruction pointer will belong to the new task.
*/
- vmexit->rip = newtss.tss_eip;
+ assert(vmexit->inst_length == 0);
+ error = vm_set_register(ctx, vcpu, VM_REG_GUEST_RIP, newtss.tss_eip);
+ assert(error == 0);
/* Load processor state from new TSS */
error = tss32_restore(ctx, vcpu, task_switch, ot_sel, &newtss, nt_iov);


@@ -2118,10 +2118,7 @@ main(int argc, char *argv[])
}
if (!error && run) {
- error = vm_get_register(ctx, vcpu, VM_REG_GUEST_RIP, &rip);
- assert(error == 0);
- error = vm_run(ctx, vcpu, rip, &vmexit);
+ error = vm_run(ctx, vcpu, &vmexit);
if (error == 0)
dump_vm_run_exitcode(&vmexit, vcpu);
else