Use callout(9) to drive the vlapic timer instead of clocking it on each VM exit.

This decouples the guest's 'hz' from the host's 'hz' setting. For example, it is
now possible to have a guest run at 'hz=1000' while the host is at 'hz=100'.

Discussed with:	grehan@
Tested by:	Tycho Nightingale (tycho.nightingale@pluribusnetworks.com)
This commit is contained in:
neel 2013-12-07 23:11:12 +00:00
parent f24ecdf104
commit e7ebb9541a
6 changed files with 233 additions and 144 deletions

View File

@ -1563,7 +1563,6 @@ vmx_run(void *arg, int vcpu, register_t rip, pmap_t pmap)
panic("vmx_run: error %d setting up pcpu defaults", error);
do {
lapic_timer_tick(vmx->vm, vcpu);
vmx_inject_interrupts(vmx, vcpu);
vmx_run_trace(vmx, vcpu);
rc = vmx_setjmp(vmxctx);

View File

@ -30,8 +30,10 @@
__FBSDID("$FreeBSD$");
#include <sys/param.h>
#include <sys/lock.h>
#include <sys/kernel.h>
#include <sys/malloc.h>
#include <sys/mutex.h>
#include <sys/systm.h>
#include <sys/smp.h>
@ -103,12 +105,15 @@ struct vlapic {
struct vm *vm;
int vcpuid;
struct LAPIC apic;
struct LAPIC apic;
int esr_update;
int divisor;
int ccr_ticks;
struct callout callout; /* vlapic timer */
struct bintime timer_fire_bt; /* callout expiry time */
struct bintime timer_freq_bt; /* timer frequency */
struct bintime timer_period_bt; /* timer period */
struct mtx timer_mtx;
/*
* The 'isrvec_stk' is a stack of vectors injected by the local apic.
@ -123,6 +128,21 @@ struct vlapic {
enum boot_state boot_state;
};
/*
* The 'vlapic->timer_mtx' is used to provide mutual exclusion between the
* vlapic_callout_handler() and vcpu accesses to the following registers:
* - initial count register aka icr_timer
* - current count register aka ccr_timer
* - divide config register aka dcr_timer
* - timer LVT register
*
* Note that the vlapic_callout_handler() does not write to any of these
* registers so they can be safely read from the vcpu context without locking.
*/
#define VLAPIC_TIMER_LOCK(vlapic) mtx_lock(&((vlapic)->timer_mtx))
#define VLAPIC_TIMER_UNLOCK(vlapic) mtx_unlock(&((vlapic)->timer_mtx))
#define VLAPIC_TIMER_LOCKED(vlapic) mtx_owned(&((vlapic)->timer_mtx))
#define VLAPIC_BUS_FREQ tsc_freq
static int
@ -170,11 +190,62 @@ vlapic_dump_lvt(uint32_t offset, uint32_t *lvt)
}
#endif
static uint64_t
static uint32_t
vlapic_get_ccr(struct vlapic *vlapic)
{
struct LAPIC *lapic = &vlapic->apic;
return lapic->ccr_timer;
struct bintime bt_now, bt_rem;
struct LAPIC *lapic;
uint32_t ccr;
ccr = 0;
lapic = &vlapic->apic;
VLAPIC_TIMER_LOCK(vlapic);
if (callout_active(&vlapic->callout)) {
/*
* If the timer is scheduled to expire in the future then
* compute the value of 'ccr' based on the remaining time.
*/
binuptime(&bt_now);
if (bintime_cmp(&vlapic->timer_fire_bt, &bt_now, >)) {
bt_rem = vlapic->timer_fire_bt;
bintime_sub(&bt_rem, &bt_now);
ccr += bt_rem.sec * BT2FREQ(&vlapic->timer_freq_bt);
ccr += bt_rem.frac / vlapic->timer_freq_bt.frac;
}
}
KASSERT(ccr <= lapic->icr_timer, ("vlapic_get_ccr: invalid ccr %#x, "
"icr_timer is %#x", ccr, lapic->icr_timer));
VLAPIC_CTR2(vlapic, "vlapic ccr_timer = %#x, icr_timer = %#x",
ccr, lapic->icr_timer);
VLAPIC_TIMER_UNLOCK(vlapic);
return (ccr);
}
/*
 * Store a new divide configuration register value and recompute the
 * timer frequency and timer period that depend on it.  The whole update
 * is performed with the vlapic timer lock held.
 */
static void
vlapic_set_dcr(struct vlapic *vlapic, uint32_t dcr)
{
	int div;

	VLAPIC_TIMER_LOCK(vlapic);

	vlapic->apic.dcr_timer = dcr;
	div = vlapic_timer_divisor(dcr);
	VLAPIC_CTR2(vlapic, "vlapic dcr_timer=%#x, divisor=%d", dcr, div);

	/*
	 * Refresh 'timer_freq_bt' from the divided bus frequency and then
	 * derive 'timer_period_bt' by scaling it with the initial count.
	 *
	 * XXX changes to the frequency divider will not take effect until
	 * the timer is reloaded.
	 */
	FREQ2BT(VLAPIC_BUS_FREQ / div, &vlapic->timer_freq_bt);
	vlapic->timer_period_bt = vlapic->timer_freq_bt;
	bintime_mul(&vlapic->timer_period_bt, vlapic->apic.icr_timer);

	VLAPIC_TIMER_UNLOCK(vlapic);
}
static void
@ -203,7 +274,7 @@ vlapic_reset(struct vlapic *vlapic)
memset(lapic, 0, sizeof(*lapic));
lapic->apr = vlapic->vcpuid;
vlapic_init_ipi(vlapic);
vlapic->divisor = vlapic_timer_divisor(lapic->dcr_timer);
vlapic_set_dcr(vlapic, 0);
if (vlapic->vcpuid == 0)
vlapic->boot_state = BS_RUNNING; /* BSP */
@ -250,30 +321,8 @@ vlapic_set_intr_ready(struct vlapic *vlapic, int vector, bool level)
VLAPIC_CTR_IRR(vlapic, "vlapic_set_intr_ready");
}
/*
 * (Re)load the current count register from the initial count register,
 * less the 'elapsed' counts, and remember the host tick at which this
 * happened.
 */
static void
vlapic_start_timer(struct vlapic *vlapic, uint32_t elapsed)
{
	const uint32_t initial = vlapic->apic.icr_timer;

	vlapic->ccr_ticks = ticks;

	/*
	 * 'elapsed' can reach or exceed the initial count when the guest
	 * is trying to run its local apic timer faster than the host's
	 * 'hz' setting.  In that case the current count is clamped to
	 * zero, which runs the guest local apic timer at the rate of the
	 * host's 'hz'.
	 */
	vlapic->apic.ccr_timer = (elapsed < initial) ? initial - elapsed : 0;
}
static __inline uint32_t *
vlapic_get_lvt(struct vlapic *vlapic, uint32_t offset)
vlapic_get_lvtptr(struct vlapic *vlapic, uint32_t offset)
{
struct LAPIC *lapic = &vlapic->apic;
int i;
@ -285,6 +334,33 @@ vlapic_get_lvt(struct vlapic *vlapic, uint32_t offset)
return ((&lapic->lvt_timer) + i);;
}
/*
 * Return the current value of the LVT register located at 'offset'.
 */
static __inline uint32_t
vlapic_get_lvt(struct vlapic *vlapic, uint32_t offset)
{
	uint32_t *lvtptr;

	lvtptr = vlapic_get_lvtptr(vlapic, offset);
	return (*lvtptr);
}
/*
 * Write 'val' to the LVT register at 'offset'.
 *
 * The mask bit is forced on while the apic is software-disabled.  Writes
 * to the timer LVT are done with the vlapic timer lock held; the other
 * LVT registers are written without it.
 */
static void
vlapic_set_lvt(struct vlapic *vlapic, uint32_t offset, uint32_t val)
{
	uint32_t *reg;
	int is_timer_lvt;

	reg = vlapic_get_lvtptr(vlapic, offset);
	is_timer_lvt = (offset == APIC_OFFSET_TIMER_LVT);

	if (is_timer_lvt)
		VLAPIC_TIMER_LOCK(vlapic);

	if ((vlapic->apic.svr & APIC_SVR_ENABLE) == 0)
		val |= APIC_LVT_M;
	*reg = val;

	if (is_timer_lvt)
		VLAPIC_TIMER_UNLOCK(vlapic);
}
#if 1
static void
dump_isrvec_stk(struct vlapic *vlapic)
@ -407,15 +483,16 @@ vlapic_process_eoi(struct vlapic *vlapic)
}
static __inline int
vlapic_get_lvt_field(uint32_t *lvt, uint32_t mask)
vlapic_get_lvt_field(uint32_t lvt, uint32_t mask)
{
return (*lvt & mask);
return (lvt & mask);
}
static __inline int
vlapic_periodic_timer(struct vlapic *vlapic)
{
uint32_t *lvt;
uint32_t lvt;
lvt = vlapic_get_lvt(vlapic, APIC_OFFSET_TIMER_LVT);
@ -428,17 +505,111 @@ static void
vlapic_fire_timer(struct vlapic *vlapic)
{
int vector;
uint32_t *lvt;
uint32_t lvt;
KASSERT(VLAPIC_TIMER_LOCKED(vlapic), ("vlapic_fire_timer not locked"));
lvt = vlapic_get_lvt(vlapic, APIC_OFFSET_TIMER_LVT);
if (!vlapic_get_lvt_field(lvt, APIC_LVTT_M)) {
vmm_stat_incr(vlapic->vm, vlapic->vcpuid, VLAPIC_INTR_TIMER, 1);
vector = vlapic_get_lvt_field(lvt,APIC_LVTT_VECTOR);
vector = vlapic_get_lvt_field(lvt, APIC_LVTT_VECTOR);
vlapic_set_intr_ready(vlapic, vector, false);
vcpu_notify_event(vlapic->vm, vlapic->vcpuid);
}
}
/*
 * Callout handler for the vlapic timer.
 *
 * 'arg' is the 'struct vlapic' whose timer expired.  With the timer lock
 * held the handler fires the timer interrupt and, if the timer LVT is in
 * periodic mode, re-arms the callout for the next period.  Nothing is done
 * if the callout was reset or stopped after this invocation was scheduled.
 */
static void
vlapic_callout_handler(void *arg)
{
	struct vlapic *vlapic;
	struct bintime bt, btnow;
	sbintime_t rem_sbt;

	vlapic = arg;

	VLAPIC_TIMER_LOCK(vlapic);
	if (callout_pending(&vlapic->callout))	/* callout was reset */
		goto done;

	if (!callout_active(&vlapic->callout))	/* callout was stopped */
		goto done;

	callout_deactivate(&vlapic->callout);

	KASSERT(vlapic->apic.icr_timer != 0, ("vlapic timer is disabled"));

	vlapic_fire_timer(vlapic);

	if (vlapic_periodic_timer(vlapic)) {
		binuptime(&btnow);
		/*
		 * The last conversion used to read ".#%lx", which printed a
		 * literal '#' and lost the "0x" prefix on the fraction.
		 */
		KASSERT(bintime_cmp(&btnow, &vlapic->timer_fire_bt, >=),
		    ("vlapic callout at %#lx.%#lx, expected at %#lx.%#lx",
		    btnow.sec, btnow.frac, vlapic->timer_fire_bt.sec,
		    vlapic->timer_fire_bt.frac));

		/*
		 * Compute the delta between when the timer was supposed to
		 * fire and the present time.
		 */
		bt = btnow;
		bintime_sub(&bt, &vlapic->timer_fire_bt);

		rem_sbt = bttosbt(vlapic->timer_period_bt);
		if (bintime_cmp(&bt, &vlapic->timer_period_bt, <)) {
			/*
			 * Adjust the time until the next countdown downward
			 * to account for the lost time.
			 */
			rem_sbt -= bttosbt(bt);
		} else {
			/*
			 * If the delta is greater than the timer period then
			 * just reset our time base instead of trying to catch
			 * up.
			 */
			vlapic->timer_fire_bt = btnow;
			VLAPIC_CTR2(vlapic, "vlapic timer lagging by %lu "
			    "usecs, period is %lu usecs - resetting time base",
			    bttosbt(bt) / SBT_1US,
			    bttosbt(vlapic->timer_period_bt) / SBT_1US);
		}

		/* Advance the expected expiry by one period and re-arm. */
		bintime_add(&vlapic->timer_fire_bt, &vlapic->timer_period_bt);
		callout_reset_sbt(&vlapic->callout, rem_sbt, 0,
		    vlapic_callout_handler, vlapic, 0);
	}
done:
	VLAPIC_TIMER_UNLOCK(vlapic);
}
/*
 * Program the initial count register of the vlapic timer.
 *
 * The timer period is recomputed from the current timer frequency.  A
 * non-zero count arms the callout to expire one period from now and
 * records the expected expiry time; a count of zero stops the callout.
 * Everything is done with the vlapic timer lock held.
 */
static void
vlapic_set_icr_timer(struct vlapic *vlapic, uint32_t icr_timer)
{
	struct LAPIC *regs;
	sbintime_t timeout;

	VLAPIC_TIMER_LOCK(vlapic);

	regs = &vlapic->apic;
	regs->icr_timer = icr_timer;

	vlapic->timer_period_bt = vlapic->timer_freq_bt;
	bintime_mul(&vlapic->timer_period_bt, icr_timer);

	if (icr_timer == 0) {
		callout_stop(&vlapic->callout);
	} else {
		binuptime(&vlapic->timer_fire_bt);
		bintime_add(&vlapic->timer_fire_bt, &vlapic->timer_period_bt);

		timeout = bttosbt(vlapic->timer_period_bt);
		callout_reset_sbt(&vlapic->callout, timeout, 0,
		    vlapic_callout_handler, vlapic, 0);
	}

	VLAPIC_TIMER_UNLOCK(vlapic);
}
static VMM_STAT_ARRAY(IPIS_SENT, VM_MAXCPU, "ipis sent to vcpu");
static int
@ -564,7 +735,6 @@ vlapic_pending_intr(struct vlapic *vlapic)
break;
}
}
VLAPIC_CTR0(vlapic, "no pending intr");
return (-1);
}
@ -613,9 +783,21 @@ lapic_set_svr(struct vlapic *vlapic, uint32_t new)
changed = old ^ new;
if ((changed & APIC_SVR_ENABLE) != 0) {
if ((new & APIC_SVR_ENABLE) == 0) {
/*
* The apic is now disabled so stop the apic timer.
*/
VLAPIC_CTR0(vlapic, "vlapic is software-disabled");
VLAPIC_TIMER_LOCK(vlapic);
callout_stop(&vlapic->callout);
VLAPIC_TIMER_UNLOCK(vlapic);
} else {
/*
* The apic is now enabled so restart the apic timer
* if it is configured in periodic mode.
*/
VLAPIC_CTR0(vlapic, "vlapic is software-enabled");
if (vlapic_periodic_timer(vlapic))
vlapic_set_icr_timer(vlapic, lapic->icr_timer);
}
}
lapic->svr = new;
@ -691,8 +873,7 @@ vlapic_read(struct vlapic *vlapic, uint64_t offset, uint64_t *data)
*data = lapic->icr_hi;
break;
case APIC_OFFSET_TIMER_LVT ... APIC_OFFSET_ERROR_LVT:
reg = vlapic_get_lvt(vlapic, offset);
*data = *(reg);
*data = vlapic_get_lvt(vlapic, offset);
break;
case APIC_OFFSET_ICR:
*data = lapic->icr_timer;
@ -717,7 +898,6 @@ int
vlapic_write(struct vlapic *vlapic, uint64_t offset, uint64_t data)
{
struct LAPIC *lapic = &vlapic->apic;
uint32_t *reg;
int retval;
VLAPIC_CTR2(vlapic, "vlapic write offset %#x, data %#lx", offset, data);
@ -760,21 +940,14 @@ vlapic_write(struct vlapic *vlapic, uint64_t offset, uint64_t data)
}
break;
case APIC_OFFSET_TIMER_LVT ... APIC_OFFSET_ERROR_LVT:
reg = vlapic_get_lvt(vlapic, offset);
if (!(lapic->svr & APIC_SVR_ENABLE)) {
data |= APIC_LVT_M;
}
*reg = data;
// vlapic_dump_lvt(offset, reg);
vlapic_set_lvt(vlapic, offset, data);
break;
case APIC_OFFSET_ICR:
lapic->icr_timer = data;
vlapic_start_timer(vlapic, 0);
vlapic_set_icr_timer(vlapic, data);
break;
case APIC_OFFSET_DCR:
lapic->dcr_timer = data;
vlapic->divisor = vlapic_timer_divisor(data);
vlapic_set_dcr(vlapic, data);
break;
case APIC_OFFSET_ESR:
@ -796,70 +969,6 @@ vlapic_write(struct vlapic *vlapic, uint64_t offset, uint64_t data)
return (retval);
}
/*
 * Advance the emulated local apic timer by the number of host ticks that
 * have elapsed since 'ccr_ticks' was last recorded.
 *
 * Returns -1 if the timer is disabled or is a one-shot timer that has
 * already counted down to zero, 0 if the timer interrupt was fired by
 * this call, and otherwise a positive value computed as
 * (remaining count / counts per host tick) + 1.
 */
int
vlapic_timer_tick(struct vlapic *vlapic)
{
int curticks, delta, periodic, fired;
uint32_t ccr;
uint32_t decrement, leftover;
restart:
/* Host ticks elapsed since the current count was last updated. */
curticks = ticks;
delta = curticks - vlapic->ccr_ticks;
/* Local APIC timer is disabled */
if (vlapic->apic.icr_timer == 0)
return (-1);
/* One-shot mode and timer has already counted down to zero */
periodic = vlapic_periodic_timer(vlapic);
if (!periodic && vlapic->apic.ccr_timer == 0)
return (-1);
/*
* The 'curticks' and 'ccr_ticks' are out of sync by more than
* 2^31 ticks. We deal with this by restarting the timer.
*/
if (delta < 0) {
vlapic_start_timer(vlapic, 0);
goto restart;
}
fired = 0;
/* Number of timer counts that correspond to one host tick. */
decrement = (VLAPIC_BUS_FREQ / vlapic->divisor) / hz;
vlapic->ccr_ticks = curticks;
ccr = vlapic->apic.ccr_timer;
/*
* Consume one host tick per iteration; the loop ends at the first
* expiry, so at most one interrupt is fired per call.
*/
while (delta-- > 0) {
if (ccr > decrement) {
ccr -= decrement;
continue;
}
/* Trigger the local apic timer interrupt */
vlapic_fire_timer(vlapic);
if (periodic) {
/* Carry the overshoot into the next period. */
leftover = decrement - ccr;
vlapic_start_timer(vlapic, leftover);
ccr = vlapic->apic.ccr_timer;
} else {
/*
* One-shot timer has counted down to zero.
*/
ccr = 0;
}
fired = 1;
break;
}
vlapic->apic.ccr_timer = ccr;
if (!fired)
return ((ccr / decrement) + 1);
else
return (0);
}
struct vlapic *
vlapic_init(struct vm *vm, int vcpuid)
{
@ -869,6 +978,9 @@ vlapic_init(struct vm *vm, int vcpuid)
vlapic->vm = vm;
vlapic->vcpuid = vcpuid;
mtx_init(&vlapic->timer_mtx, "vlapic timer mtx", NULL, MTX_DEF);
callout_init(&vlapic->callout, 1);
vlapic->msr_apicbase = DEFAULT_APIC_BASE | APICBASE_ENABLED;
if (vcpuid == 0)
@ -883,6 +995,7 @@ void
vlapic_cleanup(struct vlapic *vlapic)
{

	/*
	 * Tear down the vlapic timer state before releasing the structure:
	 * drain the callout first so that its handler, which acquires
	 * 'timer_mtx', is no longer running or scheduled, and only then
	 * destroy the mutex that was set up with mtx_init() in
	 * vlapic_init().
	 */
	callout_drain(&vlapic->callout);
	mtx_destroy(&vlapic->timer_mtx);
	free(vlapic, M_VLAPIC);
}

View File

@ -95,7 +95,6 @@ int vlapic_read(struct vlapic *vlapic, uint64_t offset, uint64_t *data);
int vlapic_pending_intr(struct vlapic *vlapic);
void vlapic_intr_accepted(struct vlapic *vlapic, int vector);
void vlapic_set_intr_ready(struct vlapic *vlapic, int vector, bool level);
int vlapic_timer_tick(struct vlapic *vlapic);
uint64_t vlapic_get_apicbase(struct vlapic *vlapic);
void vlapic_set_apicbase(struct vlapic *vlapic, uint64_t val);

View File

@ -865,25 +865,12 @@ vm_handle_hlt(struct vm *vm, int vcpuid, boolean_t intr_disabled,
{
struct vm_exit *vmexit;
struct vcpu *vcpu;
int sleepticks, t;
int t, timo;
vcpu = &vm->vcpu[vcpuid];
vcpu_lock(vcpu);
/*
* Figure out the number of host ticks until the next apic
* timer interrupt in the guest.
*/
sleepticks = lapic_timer_tick(vm, vcpuid);
/*
* If the guest local apic timer is disabled then sleep for
* a long time but not forever.
*/
if (sleepticks < 0)
sleepticks = hz;
/*
* Do a final check for pending NMI or interrupts before
* really putting this thread to sleep.
@ -893,12 +880,15 @@ vm_handle_hlt(struct vm *vm, int vcpuid, boolean_t intr_disabled,
*/
if (!vm_nmi_pending(vm, vcpuid) &&
(intr_disabled || vlapic_pending_intr(vcpu->vlapic) < 0)) {
if (sleepticks <= 0)
panic("invalid sleepticks %d", sleepticks);
t = ticks;
vcpu_require_state_locked(vcpu, VCPU_SLEEPING);
if (vlapic_enabled(vcpu->vlapic)) {
msleep_spin(vcpu, &vcpu->mtx, "vmidle", sleepticks);
/*
* XXX msleep_spin() is not interruptible so use the
* 'timo' to put an upper bound on the sleep time.
*/
timo = hz;
msleep_spin(vcpu, &vcpu->mtx, "vmidle", timo);
} else {
/*
* Spindown the vcpu if the apic is disabled and it

View File

@ -80,16 +80,6 @@ lapic_set_intr(struct vm *vm, int cpu, int vector, bool level)
return (0);
}
/*
 * Look up the vlapic of vcpu 'cpu' in 'vm' and hand back the result of
 * vlapic_timer_tick() on it.
 */
int
lapic_timer_tick(struct vm *vm, int cpu)
{

	return (vlapic_timer_tick(vm_lapic(vm, cpu)));
}
static boolean_t
x2apic_msr(u_int msr)
{

View File

@ -40,8 +40,6 @@ int lapic_mmio_read(void *vm, int cpu, uint64_t gpa,
int lapic_mmio_write(void *vm, int cpu, uint64_t gpa,
uint64_t wval, int size, void *arg);
int lapic_timer_tick(struct vm *vm, int cpu);
/*
* Returns a vector between 32 and 255 if an interrupt is pending in the
* IRR that can be delivered based on the current state of ISR and TPR.