Improve robustness of NMI handling, for NMIs recognized in kernel mode.

- Make the NMI handler run on its own stack (TSS_IST2).
- Store the GSBASE value for each CPU just before the start of
  each NMI stack, permitting efficient retrieval using %rsp-relative
  addressing.
- For NMIs taken from kernel mode, program MSR_GSBASE explicitly
  since one or both of MSR_GSBASE and MSR_KGSBASE can be potentially
  invalid.  The current contents of MSR_GSBASE are saved and restored
  at exit.
- For NMIs handled from user mode, continue to use 'swapgs' to
  load the per-CPU GSBASE.
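
A minimal C sketch of the layout this creates. The 'struct nmi_pcpu'
matches the diff below; the stack name, size, and attribute spelling
here are illustrative:

	#include <stdint.h>

	typedef uint64_t register_t;	/* amd64 */

	/* As added to intr_machdep.h below. */
	struct nmi_pcpu {
		register_t np_pcpu;	/* canonical per-CPU (GS.base) pointer */
		register_t __padding;	/* pad to 16 bytes */
	};

	static char nmi_stack0[4096] __attribute__((aligned(16)));

	/*
	 * Layout of the NMI stack page (addresses grow upward):
	 *
	 *   +------------------+ <- &nmi_stack0[4096]
	 *   | struct nmi_pcpu  |    np_pcpu = this CPU's pcpu pointer
	 *   +------------------+ <- tss_ist2 = initial %rsp for the NMI gate
	 *   |       ...        |    handler stack grows downward
	 *   +------------------+ <- &nmi_stack0[0]
	 */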

Reviewed by:	jeff
Debugging help:	jeff
Tested by:	gnn, Artem Belevich <artemb at gmail dot com>
commit bb471e3315 (parent 2c204a1631)
Author:	Joseph Koshy
Date:	2009-02-03 09:01:45 +00:00
4 changed files with 84 additions and 26 deletions

sys/amd64/amd64/exception.S

@@ -383,22 +383,24 @@ IDTVEC(fast_syscall32)
 /*
  * NMI handling is special.
  *
  * First, NMIs do not respect the state of the processor's RFLAGS.IF
- * bit and the NMI handler may be invoked at any time, including when
- * the processor is in a critical section with RFLAGS.IF == 0. In
- * particular, this means that the processor's GS.base values could be
- * inconsistent on entry to the handler, and so we need to read
- * MSR_GSBASE to determine if a 'swapgs' is needed. We use '%ebx', a
- * C-preserved register, to remember whether to swap GS back on the
- * exit path.
+ * bit. The NMI handler may be entered at any time, including when
+ * the processor is in a critical section with RFLAGS.IF == 0.
+ * The processor's GS.base value could be invalid on entry to the
+ * handler.
  *
  * Second, the processor treats NMIs specially, blocking further NMIs
- * until an 'iretq' instruction is executed. We therefore need to
- * execute the NMI handler with interrupts disabled to prevent a
- * nested interrupt from executing an 'iretq' instruction and
- * inadvertently taking the processor out of NMI mode.
+ * until an 'iretq' instruction is executed. We thus need to execute
+ * the NMI handler with interrupts disabled, to prevent a nested interrupt
+ * from executing an 'iretq' instruction and inadvertently taking the
+ * processor out of NMI mode.
  *
- * Third, the NMI handler runs on its own stack (tss_ist1), shared
- * with the double fault handler.
+ * Third, the NMI handler runs on its own stack (tss_ist2). The canonical
+ * GS.base value for the processor is stored just above the bottom of its
+ * NMI stack. For NMIs taken from kernel mode, the current value in
+ * the processor's GS.base is saved at entry to C-preserved register %r12,
+ * the canonical value for GS.base is then loaded into the processor, and
+ * the saved value is restored at exit time. For NMIs taken from user mode,
+ * the cheaper 'SWAPGS' instructions are used for swapping GS.base.
  */
 IDTVEC(nmi)
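
Why kernel-mode entry cannot trust GS.base: an NMI can arrive in the
window where a syscall or interrupt path has entered the kernel but has
not yet executed 'swapgs' (or has already swapped for the return to user
mode), so MSR_GSBASE may hold either the kernel or the user value. A C
rendering of the entry decision, as a sketch only; the *_sketch names
stand in for the rdmsr/wrmsr/swapgs primitives and are not kernel API:

	#include <stdbool.h>
	#include <stdint.h>

	#define	MSR_GSBASE	0xc0000101u

	extern uint64_t	rdmsr_sketch(uint32_t msr);
	extern void	wrmsr_sketch(uint32_t msr, uint64_t val);
	extern void	swapgs_sketch(void);

	struct nmi_pcpu { uint64_t np_pcpu, __padding; };

	/*
	 * 'from_user' corresponds to (tf_cs & SEL_RPL_MASK) != 0; 'np' is
	 * the record stored just above the NMI stack. Returns the GS.base
	 * value to restore at exit (kernel-mode case only; kept in %r12
	 * by the real handler).
	 */
	static uint64_t
	nmi_entry_sketch(bool from_user, const struct nmi_pcpu *np)
	{
		uint64_t saved = 0;

		if (from_user) {
			/* User GS.base is live; one swapgs loads the kernel value. */
			swapgs_sketch();
		} else {
			/* GS.base may be stale: save it, install the canonical value. */
			saved = rdmsr_sketch(MSR_GSBASE);
			wrmsr_sketch(MSR_GSBASE, np->np_pcpu);
		}
		return (saved);
	}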
@@ -423,12 +425,22 @@ IDTVEC(nmi)
 	movq	%r15,TF_R15(%rsp)
 	xorl	%ebx,%ebx
 	testb	$SEL_RPL_MASK,TF_CS(%rsp)
-	jnz	nmi_needswapgs		/* we came from userland */
+	jnz	nmi_fromuserspace
+	/*
+	 * We've interrupted the kernel. Preserve GS.base in %r12.
+	 */
 	movl	$MSR_GSBASE,%ecx
 	rdmsr
-	cmpl	$VM_MAXUSER_ADDRESS >> 32,%edx
-	jae	nmi_calltrap		/* GS.base holds a kernel VA */
-nmi_needswapgs:
+	movq	%rax,%r12
+	shlq	$32,%rdx
+	orq	%rdx,%r12
+	/* Retrieve and load the canonical value for GS.base. */
+	movq	TF_SIZE(%rsp),%rdx
+	movl	%edx,%eax
+	shrq	$32,%rdx
+	wrmsr
+	jmp	nmi_calltrap
+nmi_fromuserspace:
 	incl	%ebx
 	swapgs
 	/* Note: this label is also used by ddb and gdb: */
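
Two details of the kernel-mode path above deserve a note. First,
'movq TF_SIZE(%rsp),%rdx' finds the canonical GS.base because the CPU
loaded %rsp from tss_ist2, which points at the nmi_pcpu record, and the
entry stub then built a TF_SIZE-byte trapframe below it, so the record
sits at exactly TF_SIZE(%rsp). Second, rdmsr/wrmsr move 64-bit values
through the %edx:%eax pair; a self-contained sketch of the split and
combine arithmetic mirrored by the instructions above:

	#include <stdint.h>

	/* After rdmsr: movq %rax,%r12; shlq $32,%rdx; orq %rdx,%r12. */
	static inline uint64_t
	combine_edx_eax(uint32_t eax, uint32_t edx)
	{
		return (((uint64_t)edx << 32) | eax);
	}

	/* Before wrmsr: movl %edx,%eax; shrq $32,%rdx. */
	static inline void
	split_for_wrmsr(uint64_t val, uint32_t *eax, uint32_t *edx)
	{
		*eax = (uint32_t)val;
		*edx = (uint32_t)(val >> 32);
	}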
@@ -439,14 +451,19 @@ nmi_calltrap:
 	MEXITCOUNT
 #ifdef HWPMC_HOOKS
 	/*
-	 * Check if the current trap was from user mode and if so
-	 * whether the current thread needs a user call chain to be
-	 * captured. We are still in NMI mode at this point.
+	 * Capture a userspace callchain if needed.
+	 *
+	 * - Check if the current trap was from user mode.
+	 * - Check if the current thread is valid.
+	 * - Check if the thread requires a user call chain to be
+	 *   captured.
+	 *
+	 * We are still in NMI mode at this point.
 	 */
-	testb	$SEL_RPL_MASK,TF_CS(%rsp)
-	jz	nocallchain
-	movq	PCPU(CURTHREAD),%rax	/* curthread present? */
-	orq	%rax,%rax
+	testl	%ebx,%ebx
+	jz	nocallchain		/* not from userspace */
+	movq	PCPU(CURTHREAD),%rax
+	orq	%rax,%rax		/* curthread present? */
 	jz	nocallchain
 	testl	$TDP_CALLCHAIN,TD_PFLAGS(%rax) /* flagged for capture? */
 	jz	nocallchain
@@ -498,8 +515,18 @@ outofnmi:
 nocallchain:
 #endif
 	testl	%ebx,%ebx
-	jz	nmi_restoreregs
+	jz	nmi_kernelexit
 	swapgs
+	jmp	nmi_restoreregs
+nmi_kernelexit:
+	/*
+	 * Put back the preserved MSR_GSBASE value.
+	 */
+	movl	$MSR_GSBASE,%ecx
+	movq	%r12,%rdx
+	movl	%edx,%eax
+	shrq	$32,%rdx
+	wrmsr
 nmi_restoreregs:
 	movq	TF_RDI(%rsp),%rdi
 	movq	TF_RSI(%rsp),%rsi

sys/amd64/amd64/machdep.c

@@ -809,6 +809,9 @@ struct gate_descriptor *idt = &idt0[0]; /* interrupt descriptor table */
 
 static char dblfault_stack[PAGE_SIZE] __aligned(16);
 
+static char nmi0_stack[PAGE_SIZE] __aligned(16);
+CTASSERT(sizeof(struct nmi_pcpu) == 16);
+
 struct amd64tss common_tss[MAXCPU];
 
 /* software prototypes -- in more palatable form */
@@ -1291,6 +1294,7 @@ hammer_time(u_int64_t modulep, u_int64_t physfree)
 	caddr_t kmdp;
 	int gsel_tss, x;
 	struct pcpu *pc;
+	struct nmi_pcpu *np;
 	u_int64_t msr;
 	char *env;
@@ -1365,7 +1369,7 @@ hammer_time(u_int64_t modulep, u_int64_t physfree)
 		setidt(x, &IDTVEC(rsvd), SDT_SYSIGT, SEL_KPL, 0);
 	setidt(IDT_DE, &IDTVEC(div), SDT_SYSIGT, SEL_KPL, 0);
 	setidt(IDT_DB, &IDTVEC(dbg), SDT_SYSIGT, SEL_KPL, 0);
-	setidt(IDT_NMI, &IDTVEC(nmi), SDT_SYSIGT, SEL_KPL, 1);
+	setidt(IDT_NMI, &IDTVEC(nmi), SDT_SYSIGT, SEL_KPL, 2);
 	setidt(IDT_BP, &IDTVEC(bpt), SDT_SYSIGT, SEL_UPL, 0);
 	setidt(IDT_OF, &IDTVEC(ofl), SDT_SYSIGT, SEL_KPL, 0);
 	setidt(IDT_BR, &IDTVEC(bnd), SDT_SYSIGT, SEL_KPL, 0);
@@ -1438,6 +1442,14 @@ hammer_time(u_int64_t modulep, u_int64_t physfree)
 
 	/* doublefault stack space, runs on ist1 */
 	common_tss[0].tss_ist1 = (long)&dblfault_stack[sizeof(dblfault_stack)];
 
+	/*
+	 * NMI stack, runs on ist2. The pcpu pointer is stored just
+	 * above the start of the ist2 stack.
+	 */
+	np = ((struct nmi_pcpu *) &nmi0_stack[sizeof(nmi0_stack)]) - 1;
+	np->np_pcpu = (register_t) pc;
+	common_tss[0].tss_ist2 = (long) np;
+
 	/* Set the IO permission bitmap (empty due to tss seg limit) */
 	common_tss[0].tss_iobase = sizeof(struct amd64tss);
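
A worked, userland-runnable check of the pointer arithmetic above; the
page size and names are chosen to mirror the diff:

	#include <assert.h>
	#include <stdint.h>

	#define	PAGE_SIZE	4096

	struct nmi_pcpu { uint64_t np_pcpu, __padding; };

	static char nmi0_stack[PAGE_SIZE] __attribute__((aligned(16)));

	int
	main(void)
	{
		/* The same computation hammer_time() performs above. */
		struct nmi_pcpu *np =
		    (struct nmi_pcpu *)&nmi0_stack[sizeof(nmi0_stack)] - 1;

		/* The record occupies the top 16 bytes of the page... */
		assert((char *)np == nmi0_stack + PAGE_SIZE - 16);
		/*
		 * ...and tss_ist2 = (long)np means the first push on NMI
		 * delivery lands just below the record, leaving np_pcpu
		 * intact for the handler to read.
		 */
		return (0);
	}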

sys/amd64/amd64/mp_machdep.c

@@ -92,6 +92,7 @@ void *bootstacks[MAXCPU];
 
 /* Temporary holder for double fault stack */
 char *doublefault_stack;
+char *nmi_stack;
 
 /* Hotwire a 0->4MB V==P mapping */
 extern pt_entry_t *KPTphys;
@@ -437,6 +438,7 @@ void
 init_secondary(void)
 {
 	struct pcpu *pc;
+	struct nmi_pcpu *np;
 	u_int64_t msr, cr0;
 	int cpu, gsel_tss, x;
 	struct region_descriptor ap_gdt;
@@ -450,6 +452,10 @@ init_secondary(void)
 	common_tss[cpu].tss_iobase = sizeof(struct amd64tss);
 	common_tss[cpu].tss_ist1 = (long)&doublefault_stack[PAGE_SIZE];
 
+	/* The NMI stack runs on IST2. */
+	np = ((struct nmi_pcpu *) &nmi_stack[PAGE_SIZE]) - 1;
+	common_tss[cpu].tss_ist2 = (long) np;
+
 	/* Prepare private GDT */
 	gdt_segs[GPROC0_SEL].ssd_base = (long) &common_tss[cpu];
 	ssdtosyssd(&gdt_segs[GPROC0_SEL],
@@ -474,6 +480,9 @@ init_secondary(void)
 	pc->pc_rsp0 = 0;
 	pc->pc_gs32p = &gdt[NGDT * cpu + GUGS32_SEL];
 
+	/* Save the per-cpu pointer for use by the NMI handler. */
+	np->np_pcpu = (register_t) pc;
+
 	wrmsr(MSR_FSBASE, 0);		/* User value */
 	wrmsr(MSR_GSBASE, (u_int64_t)pc);
 	wrmsr(MSR_KGSBASE, (u_int64_t)pc); /* XXX User value while we're in the kernel */
@@ -725,6 +734,7 @@ start_all_aps(void)
 		/* allocate and set up an idle stack data page */
 		bootstacks[cpu] = (void *)kmem_alloc(kernel_map, KSTACK_PAGES * PAGE_SIZE);
 		doublefault_stack = (char *)kmem_alloc(kernel_map, PAGE_SIZE);
+		nmi_stack = (char *)kmem_alloc(kernel_map, PAGE_SIZE);
 
 		bootSTK = (char *)bootstacks[cpu] + KSTACK_PAGES * PAGE_SIZE - 8;
 		bootAP = cpu;

sys/amd64/include/intr_machdep.h

@@ -120,6 +120,15 @@ struct intsrc {
 
 struct trapframe;
 
+/*
+ * The following data structure holds per-cpu data, and is placed just
+ * above the top of the space used for the NMI stack.
+ */
+struct nmi_pcpu {
+	register_t	np_pcpu;
+	register_t	__padding;	/* pad to 16 bytes */
+};
+
 extern struct mtx icu_lock;
 extern int elcr_found;
 
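
Why the structure is pinned to 16 bytes: the machdep.c hunk above adds
CTASSERT(sizeof(struct nmi_pcpu) == 16). On amd64 the processor aligns
%rsp down to a 16-byte boundary when delivering an interrupt, so a
16-byte record at the top of a 16-byte-aligned stack page leaves
tss_ist2 itself aligned, and the handler's fixed TF_SIZE(%rsp) offset
stays valid. The C11 spelling of the same compile-time check, for
illustration (uint64_t stands in for register_t):

	#include <stdint.h>

	struct nmi_pcpu {
		uint64_t np_pcpu;
		uint64_t __padding;	/* pad to 16 bytes */
	};

	_Static_assert(sizeof(struct nmi_pcpu) == 16,
	    "struct nmi_pcpu must be exactly 16 bytes");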