Add an API to rendezvous all active vcpus in a virtual machine. The rendezvous
can be initiated in the context of a vcpu thread or from the bhyve(8) control
process.

The first use of this functionality is to update the vlapic trigger-mode
register when the IOAPIC pin configuration is changed.

Prior to this change we would update the TMR in the virtual-APIC page at
the time of interrupt delivery. But this doesn't work with Posted Interrupts
because there is no way to program the EOI_exit_bitmap[] in the VMCS of
the target at the time of interrupt delivery.

Discussed with:	grehan@
This commit is contained in:
Neel Natu 2014-01-14 01:55:58 +00:00
parent df48f4170e
commit 5b8a8cd1fe
Notes: svn2git 2020-12-20 02:59:44 +00:00
svn path=/head/; revision=260619
9 changed files with 318 additions and 27 deletions

View File

@ -52,7 +52,7 @@ typedef int (*vmm_cleanup_func_t)(void);
typedef void (*vmm_resume_func_t)(void);
typedef void * (*vmi_init_func_t)(struct vm *vm, struct pmap *pmap);
typedef int (*vmi_run_func_t)(void *vmi, int vcpu, register_t rip,
struct pmap *pmap);
struct pmap *pmap, void *rendezvous_cookie);
typedef void (*vmi_cleanup_func_t)(void *vmi);
typedef int (*vmi_get_register_t)(void *vmi, int vcpu, int num,
uint64_t *retval);
@ -135,6 +135,31 @@ void vm_activate_cpu(struct vm *vm, int vcpu);
cpuset_t vm_active_cpus(struct vm *vm);
struct vm_exit *vm_exitinfo(struct vm *vm, int vcpuid);
/*
* Rendezvous all vcpus specified in 'dest' and execute 'func(arg)'.
* The rendezvous 'func(arg)' is not allowed to do anything that will
* cause the thread to be put to sleep.
*
* Each participating vcpu invokes the callback as 'func(vm, vcpuid, arg)'
* where 'vcpuid' identifies the vcpu that is running the callback.
*
* If the rendezvous is being initiated from a vcpu context then the
* 'vcpuid' must refer to that vcpu, otherwise it should be set to -1.
*
* The caller cannot hold any locks when initiating the rendezvous.
*
* The call does not return until the rendezvous it initiated has completed.
*
* The implementation of this API may cause vcpus other than those specified
* by 'dest' to be stalled. The caller should not rely on any vcpus making
* forward progress when the rendezvous is in progress.
*/
typedef void (*vm_rendezvous_func_t)(struct vm *vm, int vcpuid, void *arg);
void vm_smp_rendezvous(struct vm *vm, int vcpuid, cpuset_t dest,
vm_rendezvous_func_t func, void *arg);
/*
 * Report whether a rendezvous is currently requested.
 *
 * 'rendezvous_cookie' points at a pointer-sized word; the rendezvous is
 * considered pending whenever that word is non-zero.
 *
 * Returns 1 if a rendezvous is pending and 0 otherwise.
 */
static __inline int
vcpu_rendezvous_pending(void *rendezvous_cookie)
{
	const uintptr_t *pending;

	pending = rendezvous_cookie;
	if (*pending == 0)
		return (0);
	return (1);
}
/*
* Return 1 if device indicated by bus/slot/func is supposed to be a
* pci passthrough device.
@ -272,6 +297,7 @@ enum vm_exitcode {
VM_EXITCODE_INST_EMUL,
VM_EXITCODE_SPINUP_AP,
VM_EXITCODE_SPINDOWN_CPU,
VM_EXITCODE_RENDEZVOUS,
VM_EXITCODE_MAX
};

View File

@ -67,7 +67,7 @@ amdv_vminit(struct vm *vm, struct pmap *pmap)
}
static int
amdv_vmrun(void *arg, int vcpu, register_t rip, struct pmap *pmap)
amdv_vmrun(void *arg, int vcpu, register_t rip, struct pmap *pmap, void *cookie)
{
printf("amdv_vmrun: not implemented\n");

View File

@ -1668,6 +1668,18 @@ vmx_exit_astpending(struct vmx *vmx, int vcpu, struct vm_exit *vmexit)
return (HANDLED);
}
/*
 * Describe a rendezvous-induced exit in 'vmexit'.
 *
 * The exit is reported at the current guest %rip with a zero instruction
 * length and exit code VM_EXITCODE_RENDEZVOUS. The VMEXIT_RENDEZVOUS
 * statistic of 'vcpu' is bumped by one.
 *
 * Always returns UNHANDLED.
 */
static __inline int
vmx_exit_rendezvous(struct vmx *vmx, int vcpu, struct vm_exit *vmexit)
{

	vmexit->exitcode = VM_EXITCODE_RENDEZVOUS;
	vmexit->inst_length = 0;
	vmexit->rip = vmcs_guest_rip();

	vmm_stat_incr(vmx->vm, vcpu, VMEXIT_RENDEZVOUS, 1);

	return (UNHANDLED);
}
static __inline int
vmx_exit_inst_error(struct vmxctx *vmxctx, int rc, struct vm_exit *vmexit)
{
@ -1697,10 +1709,12 @@ vmx_exit_inst_error(struct vmxctx *vmxctx, int rc, struct vm_exit *vmexit)
}
static int
vmx_run(void *arg, int vcpu, register_t startrip, pmap_t pmap)
vmx_run(void *arg, int vcpu, register_t startrip, pmap_t pmap,
void *rendezvous_cookie)
{
int rc, handled, launched;
struct vmx *vmx;
struct vm *vm;
struct vmxctx *vmxctx;
struct vmcs *vmcs;
struct vm_exit *vmexit;
@ -1709,10 +1723,11 @@ vmx_run(void *arg, int vcpu, register_t startrip, pmap_t pmap)
uint32_t exit_reason;
vmx = arg;
vm = vmx->vm;
vmcs = &vmx->vmcs[vcpu];
vmxctx = &vmx->ctx[vcpu];
vlapic = vm_lapic(vmx->vm, vcpu);
vmexit = vm_exitinfo(vmx->vm, vcpu);
vlapic = vm_lapic(vm, vcpu);
vmexit = vm_exitinfo(vm, vcpu);
launched = 0;
KASSERT(vmxctx->pmap == pmap,
@ -1760,6 +1775,12 @@ vmx_run(void *arg, int vcpu, register_t startrip, pmap_t pmap)
break;
}
if (vcpu_rendezvous_pending(rendezvous_cookie)) {
enable_intr();
handled = vmx_exit_rendezvous(vmx, vcpu, vmexit);
break;
}
vmx_inject_interrupts(vmx, vcpu, vlapic);
vmx_run_trace(vmx, vcpu);
rc = vmx_enter_guest(vmxctx, launched);
@ -1793,9 +1814,9 @@ vmx_run(void *arg, int vcpu, register_t startrip, pmap_t pmap)
}
if (!handled)
vmm_stat_incr(vmx->vm, vcpu, VMEXIT_USERSPACE, 1);
vmm_stat_incr(vm, vcpu, VMEXIT_USERSPACE, 1);
VCPU_CTR1(vmx->vm, vcpu, "returning from vmx_run: exitcode %d",
VCPU_CTR1(vm, vcpu, "returning from vmx_run: exitcode %d",
vmexit->exitcode);
VMCLEAR(vmcs);

View File

@ -222,8 +222,52 @@ vioapic_pulse_irq(struct vm *vm, int irq)
return (vioapic_set_irqstate(vm, irq, IRQSTATE_PULSE));
}
/*
 * Recompute the trigger-mode register of the vlapic that belongs to
 * 'vcpuid' so that it mirrors the current ioapic pin configuration.
 *
 * The signature matches vm_rendezvous_func_t; 'arg' is not used.
 */
static void
vioapic_update_tmr(struct vm *vm, int vcpuid, void *arg)
{
	struct vlapic *vlapic;
	struct vioapic *vioapic;
	uint64_t rte;
	uint32_t lo, hi, dest;
	int pin, vector, delmode;
	bool phys;

	vlapic = vm_lapic(vm, vcpuid);
	vioapic = vm_ioapic(vm);

	VIOAPIC_LOCK(vioapic);

	/* Begin with every vector marked edge-triggered. */
	vlapic_reset_tmr(vlapic);

	for (pin = 0; pin < REDIR_ENTRIES; pin++) {
		rte = vioapic->rtbl[pin].reg;
		lo = rte;
		hi = rte >> 32;

		if ((lo & IOART_TRGRLVL) != 0) {
			/*
			 * The pin is level-triggered. Hand the routing
			 * information to the vlapic and let it decide whether
			 * an assertion of this pin would be delivered to it;
			 * if so it marks the vector as level-triggered in
			 * its TMR.
			 */
			dest = hi >> APIC_ID_SHIFT;
			phys = ((lo & IOART_DESTMOD) == IOART_DESTPHY);
			delmode = lo & IOART_DELMOD;
			vector = lo & IOART_INTVEC;
			vlapic_set_tmr_level(vlapic, dest, phys, delmode,
			    vector);
		}
	}

	VIOAPIC_UNLOCK(vioapic);
}
static uint32_t
vioapic_read(struct vioapic *vioapic, uint32_t addr)
vioapic_read(struct vioapic *vioapic, int vcpuid, uint32_t addr)
{
int regnum, pin, rshift;
@ -258,10 +302,12 @@ vioapic_read(struct vioapic *vioapic, uint32_t addr)
}
static void
vioapic_write(struct vioapic *vioapic, uint32_t addr, uint32_t data)
vioapic_write(struct vioapic *vioapic, int vcpuid, uint32_t addr, uint32_t data)
{
uint64_t data64, mask64;
uint64_t last, changed;
int regnum, pin, lshift;
cpuset_t allvcpus;
regnum = addr & 0xff;
switch (regnum) {
@ -285,6 +331,8 @@ vioapic_write(struct vioapic *vioapic, uint32_t addr, uint32_t data)
else
lshift = 0;
last = vioapic->rtbl[pin].reg;
data64 = (uint64_t)data << lshift;
mask64 = (uint64_t)0xffffffff << lshift;
vioapic->rtbl[pin].reg &= ~mask64 | RTBL_RO_BITS;
@ -293,6 +341,22 @@ vioapic_write(struct vioapic *vioapic, uint32_t addr, uint32_t data)
VIOAPIC_CTR2(vioapic, "ioapic pin%d: redir table entry %#lx",
pin, vioapic->rtbl[pin].reg);
/*
* If any fields in the redirection table entry (except mask
* or polarity) have changed then rendezvous all the vcpus
* to update their vlapic trigger-mode registers.
*/
changed = last ^ vioapic->rtbl[pin].reg;
if (changed & ~(IOART_INTMASK | IOART_INTPOL)) {
VIOAPIC_CTR1(vioapic, "ioapic pin%d: recalculate "
"vlapic trigger-mode register", pin);
VIOAPIC_UNLOCK(vioapic);
allvcpus = vm_active_cpus(vioapic->vm);
vm_smp_rendezvous(vioapic->vm, vcpuid, allvcpus,
vioapic_update_tmr, NULL);
VIOAPIC_LOCK(vioapic);
}
/*
* Generate an interrupt if the following conditions are met:
* - pin is not masked
@ -310,8 +374,8 @@ vioapic_write(struct vioapic *vioapic, uint32_t addr, uint32_t data)
}
static int
vioapic_mmio_rw(struct vioapic *vioapic, uint64_t gpa, uint64_t *data,
int size, bool doread)
vioapic_mmio_rw(struct vioapic *vioapic, int vcpuid, uint64_t gpa,
uint64_t *data, int size, bool doread)
{
uint64_t offset;
@ -334,10 +398,13 @@ vioapic_mmio_rw(struct vioapic *vioapic, uint64_t gpa, uint64_t *data,
else
vioapic->ioregsel = *data;
} else {
if (doread)
*data = vioapic_read(vioapic, vioapic->ioregsel);
else
vioapic_write(vioapic, vioapic->ioregsel, *data);
if (doread) {
*data = vioapic_read(vioapic, vcpuid,
vioapic->ioregsel);
} else {
vioapic_write(vioapic, vcpuid, vioapic->ioregsel,
*data);
}
}
VIOAPIC_UNLOCK(vioapic);
@ -352,7 +419,7 @@ vioapic_mmio_read(void *vm, int vcpuid, uint64_t gpa, uint64_t *rval,
struct vioapic *vioapic;
vioapic = vm_ioapic(vm);
error = vioapic_mmio_rw(vioapic, gpa, rval, size, true);
error = vioapic_mmio_rw(vioapic, vcpuid, gpa, rval, size, true);
return (error);
}
@ -364,7 +431,7 @@ vioapic_mmio_write(void *vm, int vcpuid, uint64_t gpa, uint64_t wval,
struct vioapic *vioapic;
vioapic = vm_ioapic(vm);
error = vioapic_mmio_rw(vioapic, gpa, &wval, size, false);
error = vioapic_mmio_rw(vioapic, vcpuid, gpa, &wval, size, false);
return (error);
}

View File

@ -285,15 +285,13 @@ vlapic_set_intr_ready(struct vlapic *vlapic, int vector, bool level)
atomic_set_int(&irrptr[idx], mask);
/*
* Upon acceptance of an interrupt into the IRR the corresponding
* TMR bit is cleared for edge-triggered interrupts and set for
* level-triggered interrupts.
* Verify that the trigger-mode of the interrupt matches with
* the vlapic TMR registers.
*/
tmrptr = &lapic->tmr0;
if (level)
atomic_set_int(&tmrptr[idx], mask);
else
atomic_clear_int(&tmrptr[idx], mask);
KASSERT((tmrptr[idx] & mask) == (level ? mask : 0),
("vlapic TMR[%d] is 0x%08x but interrupt is %s-triggered",
idx / 4, tmrptr[idx], level ? "level" : "edge"));
VLAPIC_CTR_IRR(vlapic, "vlapic_set_intr_ready");
return (1);
@ -1458,3 +1456,57 @@ vlapic_enabled(struct vlapic *vlapic)
else
return (false);
}
/*
 * Mark every vector as edge-triggered by clearing all eight trigger-mode
 * registers (tmr0 through tmr7) in the vlapic's APIC page.
 */
void
vlapic_reset_tmr(struct vlapic *vlapic)
{
	uint32_t *tmrptr;
	int reg;

	VLAPIC_CTR0(vlapic, "vlapic resetting all vectors to edge-triggered");

	/*
	 * Consecutive TMR registers are four 32-bit words apart in the APIC
	 * page, the same stride used when indexing the TMR by vector.
	 */
	tmrptr = &vlapic->apic_page->tmr0;
	for (reg = 0; reg < 8; reg++)
		tmrptr[reg * 4] = 0;
}
/*
 * Mark 'vector' as level-triggered in this vlapic's trigger-mode register
 * if an interrupt described by (dest, phys, delmode) would be delivered to
 * this vlapic.
 *
 * Nothing is changed when 'delmode' is neither fixed nor lowest-priority,
 * or when this vlapic's vcpu is not part of the calculated destination set.
 *
 * 'vector' must be in the range [0, 255].
 */
void
vlapic_set_tmr_level(struct vlapic *vlapic, uint32_t dest, bool phys,
    int delmode, int vector)
{
	struct LAPIC *lapic;
	uint32_t *tmrptr, mask;
	cpuset_t dmask;
	int idx;
	bool lowprio;

	KASSERT(vector >= 0 && vector <= 255, ("invalid vector %d", vector));

	/*
	 * A level trigger is valid only for fixed and lowprio delivery modes.
	 */
	if (delmode != APIC_DELMODE_FIXED && delmode != APIC_DELMODE_LOWPRIO) {
		VLAPIC_CTR1(vlapic, "Ignoring level trigger-mode for "
		    "delivery-mode %d", delmode);
		return;
	}

	lowprio = (delmode == APIC_DELMODE_LOWPRIO);
	vlapic_calcdest(vlapic->vm, &dmask, dest, phys, lowprio, false);

	if (!CPU_ISSET(vlapic->vcpuid, &dmask))
		return;

	lapic = vlapic->apic_page;
	tmrptr = &lapic->tmr0;
	/* Each TMR register covers 32 vectors and is 4 words from the next. */
	idx = (vector / 32) * 4;
	/*
	 * Use an unsigned constant: shifting a signed 1 left by 31 (vectors
	 * 31, 63, ..., 255) would overflow 'int', which is undefined.
	 */
	mask = 1U << (vector % 32);
	tmrptr[idx] |= mask;

	VLAPIC_CTR1(vlapic, "vector %d set to level-triggered", vector);
}

View File

@ -81,6 +81,17 @@ bool vlapic_enabled(struct vlapic *vlapic);
void vlapic_deliver_intr(struct vm *vm, bool level, uint32_t dest, bool phys,
int delmode, int vec);
/* Reset the trigger-mode bits for all vectors to be edge-triggered */
void vlapic_reset_tmr(struct vlapic *vlapic);
/*
* Set the trigger-mode bit associated with 'vector' to level-triggered if
* the (dest,phys,delmode) tuple resolves to an interrupt being delivered to
* this 'vlapic'.
*/
void vlapic_set_tmr_level(struct vlapic *vlapic, uint32_t dest, bool phys,
int delmode, int vector);
/* APIC write handlers */
void vlapic_id_write_handler(struct vlapic *vlapic);
void vlapic_ldr_write_handler(struct vlapic *vlapic);

View File

@ -125,6 +125,12 @@ struct vm {
* explicitly (AP) by sending it a startup ipi.
*/
cpuset_t active_cpus;
struct mtx rendezvous_mtx;
cpuset_t rendezvous_req_cpus;
cpuset_t rendezvous_done_cpus;
void *rendezvous_arg;
vm_rendezvous_func_t rendezvous_func;
};
static int vmm_initialized;
@ -135,8 +141,8 @@ static struct vmm_ops *ops;
#define VMM_RESUME() (ops != NULL ? (*ops->resume)() : 0)
#define VMINIT(vm, pmap) (ops != NULL ? (*ops->vminit)(vm, pmap): NULL)
#define VMRUN(vmi, vcpu, rip, pmap) \
(ops != NULL ? (*ops->vmrun)(vmi, vcpu, rip, pmap) : ENXIO)
#define VMRUN(vmi, vcpu, rip, pmap, rptr) \
(ops != NULL ? (*ops->vmrun)(vmi, vcpu, rip, pmap, rptr) : ENXIO)
#define VMCLEANUP(vmi) (ops != NULL ? (*ops->vmcleanup)(vmi) : NULL)
#define VMSPACE_ALLOC(min, max) \
(ops != NULL ? (*ops->vmspace_alloc)(min, max) : NULL)
@ -176,6 +182,8 @@ static int vmm_ipinum;
SYSCTL_INT(_hw_vmm, OID_AUTO, ipinum, CTLFLAG_RD, &vmm_ipinum, 0,
"IPI vector used for vcpu notifications");
static void vm_deactivate_cpu(struct vm *vm, int vcpuid);
static void
vcpu_cleanup(struct vm *vm, int i)
{
@ -330,6 +338,7 @@ vm_create(const char *name, struct vm **retvm)
vm = malloc(sizeof(struct vm), M_VM, M_WAITOK | M_ZERO);
strcpy(vm->name, name);
vm->vmspace = vmspace;
mtx_init(&vm->rendezvous_mtx, "vm rendezvous lock", 0, MTX_DEF);
vm->cookie = VMINIT(vm, vmspace_pmap(vmspace));
vm->vioapic = vioapic_init(vm);
vm->vhpet = vhpet_init(vm);
@ -896,6 +905,59 @@ vcpu_require_state_locked(struct vcpu *vcpu, enum vcpu_state newstate)
panic("Error %d setting state to %d", error, newstate);
}
/*
* Install 'func' as the rendezvous callback of 'vm'. Passing NULL marks the
* rendezvous as no longer in progress.
*
* The caller must hold 'rendezvous_mtx'; this is asserted.
*
* The address of 'rendezvous_func' is also what vm_run() hands to VMRUN() as
* the rendezvous cookie, so this word is what the vcpu run loop polls.
*/
static void
vm_set_rendezvous_func(struct vm *vm, vm_rendezvous_func_t func)
{
KASSERT(mtx_owned(&vm->rendezvous_mtx), ("rendezvous_mtx not locked"));
/*
* Update 'rendezvous_func' and execute a write memory barrier to
* ensure that it is visible across all host cpus. This is not needed
* for correctness but it does ensure that all the vcpus will notice
* that the rendezvous is requested immediately.
*/
vm->rendezvous_func = func;
wmb();
}
/*
 * Emit a trace record for a rendezvous event. A non-negative 'vcpuid' is
 * traced through VCPU_CTR0(); otherwise the record is traced through
 * VM_CTR0(), which takes no vcpu argument.
 *
 * 'vcpuid' is parenthesized in the comparison so that an expression
 * argument cannot change its meaning through operator precedence.
 */
#define	RENDEZVOUS_CTR0(vm, vcpuid, fmt)				\
	do {								\
		if ((vcpuid) >= 0)					\
			VCPU_CTR0(vm, vcpuid, fmt);			\
		else							\
			VM_CTR0(vm, fmt);				\
	} while (0)
/*
 * Take part in, or wait for the completion of, a rendezvous on 'vm'.
 *
 * 'vcpuid' is the calling vcpu, or -1 if the caller is not a vcpu. While a
 * rendezvous callback is installed:
 * - a vcpu listed in 'rendezvous_req_cpus' runs the callback and is then
 *   recorded in 'rendezvous_done_cpus';
 * - the caller that finds the requested and done sets equal clears the
 *   callback and wakes up everybody sleeping on 'rendezvous_func';
 * - any other caller sleeps on 'rendezvous_func' and re-evaluates on wakeup.
 *
 * Returns once no rendezvous callback is installed.
 */
static void
vm_handle_rendezvous(struct vm *vm, int vcpuid)
{

	KASSERT(vcpuid == -1 || (vcpuid >= 0 && vcpuid < VM_MAXCPU),
	    ("vm_handle_rendezvous: invalid vcpuid %d", vcpuid));

	mtx_lock(&vm->rendezvous_mtx);
	while (vm->rendezvous_func != NULL) {
		if (vcpuid != -1 &&
		    CPU_ISSET(vcpuid, &vm->rendezvous_req_cpus)) {
			VCPU_CTR0(vm, vcpuid, "Calling rendezvous func");
			(*vm->rendezvous_func)(vm, vcpuid, vm->rendezvous_arg);
			CPU_SET(vcpuid, &vm->rendezvous_done_cpus);
		}
		if (CPU_CMP(&vm->rendezvous_req_cpus,
		    &vm->rendezvous_done_cpus) == 0) {
			/*
			 * 'vcpuid' may be -1 on this path, so use the trace
			 * macro that copes with a non-vcpu caller.
			 */
			RENDEZVOUS_CTR0(vm, vcpuid, "Rendezvous completed");
			vm_set_rendezvous_func(vm, NULL);
			wakeup(&vm->rendezvous_func);
			break;
		}
		RENDEZVOUS_CTR0(vm, vcpuid, "Wait for rendezvous completion");
		mtx_sleep(&vm->rendezvous_func, &vm->rendezvous_mtx, 0,
		    "vmrndv", 0);
	}
	mtx_unlock(&vm->rendezvous_mtx);
}
/*
* Emulate a guest 'hlt' by sleeping until the vcpu is ready to run.
*/
@ -936,6 +998,7 @@ vm_handle_hlt(struct vm *vm, int vcpuid, bool intr_disabled, bool *retu)
*retu = true;
vmexit = vm_exitinfo(vm, vcpuid);
vmexit->exitcode = VM_EXITCODE_SPINDOWN_CPU;
vm_deactivate_cpu(vm, vcpuid);
VCPU_CTR0(vm, vcpuid, "spinning down cpu");
}
vcpu_require_state_locked(vcpu, VCPU_FROZEN);
@ -1072,7 +1135,7 @@ vm_run(struct vm *vm, struct vm_run *vmrun)
vcpu_require_state(vm, vcpuid, VCPU_RUNNING);
vcpu->hostcpu = curcpu;
error = VMRUN(vm->cookie, vcpuid, rip, pmap);
error = VMRUN(vm->cookie, vcpuid, rip, pmap, &vm->rendezvous_func);
vcpu->hostcpu = NOCPU;
vcpu_require_state(vm, vcpuid, VCPU_FROZEN);
@ -1086,6 +1149,10 @@ vm_run(struct vm *vm, struct vm_run *vmrun)
if (error == 0) {
retu = false;
switch (vme->exitcode) {
case VM_EXITCODE_RENDEZVOUS:
vm_handle_rendezvous(vm, vcpuid);
error = 0;
break;
case VM_EXITCODE_HLT:
intr_disabled = ((vme->u.hlt.rflags & PSL_I) == 0);
error = vm_handle_hlt(vm, vcpuid, intr_disabled, &retu);
@ -1321,6 +1388,14 @@ vm_activate_cpu(struct vm *vm, int vcpuid)
CPU_SET(vcpuid, &vm->active_cpus);
}
/*
 * Remove 'vcpuid' from the set of active vcpus of 'vm'. A 'vcpuid' outside
 * the range [0, VM_MAXCPU) is silently ignored.
 */
static void
vm_deactivate_cpu(struct vm *vm, int vcpuid)
{

	if (vcpuid < 0 || vcpuid >= VM_MAXCPU)
		return;

	CPU_CLR(vcpuid, &vm->active_cpus);
}
cpuset_t
vm_active_cpus(struct vm *vm)
{
@ -1411,3 +1486,40 @@ vm_apicid2vcpuid(struct vm *vm, int apicid)
*/
return (apicid);
}
/*
 * Initiate a rendezvous of the vcpus in 'dest', each of which will execute
 * 'func(vm, vcpuid, arg)', and take part in it until it completes.
 *
 * 'vcpuid' is the calling vcpu or -1 if the caller is not a vcpu. If some
 * other rendezvous is already in progress the caller first participates in
 * that one and then retries.
 */
void
vm_smp_rendezvous(struct vm *vm, int vcpuid, cpuset_t dest,
    vm_rendezvous_func_t func, void *arg)
{

	/* The caller is not permitted to hold any locks at this point. */
	WITNESS_WARN(WARN_PANIC, NULL, "vm_smp_rendezvous");
	KASSERT(vcpuid == -1 || (vcpuid >= 0 && vcpuid < VM_MAXCPU),
	    ("vm_smp_rendezvous: invalid vcpuid %d", vcpuid));

	for (;;) {
		mtx_lock(&vm->rendezvous_mtx);
		if (vm->rendezvous_func == NULL)
			break;		/* leave the loop with the mutex held */

		/*
		 * Another rendezvous is in flight. This 'vcpuid' may be one
		 * of its targets, so service it before trying again.
		 */
		RENDEZVOUS_CTR0(vm, vcpuid, "Rendezvous already in progress");
		mtx_unlock(&vm->rendezvous_mtx);
		vm_handle_rendezvous(vm, vcpuid);
	}
	KASSERT(vm->rendezvous_func == NULL, ("vm_smp_rendezvous: previous "
	    "rendezvous is still in progress"));

	RENDEZVOUS_CTR0(vm, vcpuid, "Initiating rendezvous");
	CPU_ZERO(&vm->rendezvous_done_cpus);
	vm->rendezvous_req_cpus = dest;
	vm->rendezvous_arg = arg;
	/* Install the callback last so the request state is already set up. */
	vm_set_rendezvous_func(vm, func);
	mtx_unlock(&vm->rendezvous_mtx);

	vm_handle_rendezvous(vm, vcpuid);
}

View File

@ -150,3 +150,4 @@ VMM_STAT(VMEXIT_EPT_FAULT, "vm exits due to nested page fault");
VMM_STAT(VMEXIT_UNKNOWN, "number of vm exits for unknown reason");
VMM_STAT(VMEXIT_ASTPENDING, "number of times astpending at exit");
VMM_STAT(VMEXIT_USERSPACE, "number of vm exits handled in userspace");
VMM_STAT(VMEXIT_RENDEZVOUS, "number of times rendezvous pending at exit");

View File

@ -120,4 +120,5 @@ VMM_STAT_DECLARE(VMEXIT_EPT_FAULT);
VMM_STAT_DECLARE(VMEXIT_UNKNOWN);
VMM_STAT_DECLARE(VMEXIT_ASTPENDING);
VMM_STAT_DECLARE(VMEXIT_USERSPACE);
VMM_STAT_DECLARE(VMEXIT_RENDEZVOUS);
#endif