Improve robustness of NMI handling, for NMIs recognized in kernel mode.

- Make the NMI handler run on its own stack (TSS_IST2).
- Store the GSBASE value for each CPU just before the start of each
  NMI stack, permitting efficient retrieval using %rsp-relative
  addressing.
- For NMIs taken from kernel mode, program MSR_GSBASE explicitly,
  since either or both of MSR_GSBASE and MSR_KGSBASE may be invalid.
  The current contents of MSR_GSBASE are saved and restored at exit.
- For NMIs handled from user mode, continue to use 'swapgs' to load
  the per-CPU GSBASE.

Reviewed by:	jeff
Debugging help:	jeff
Tested by:	gnn, Artem Belevich <artemb at gmail dot com>
commit bb471e3315 (parent 2c204a1631)
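Taken together, the hunks below implement one trick worth spelling out: each CPU's canonical GS.base (its pcpu pointer) lives in a struct nmi_pcpu carved out of the top of that CPU's NMI stack page, so the handler can recover it with a single %rsp-relative load (the `movq TF_SIZE(%rsp),%rdx` in exception.S, since the trapframe sits directly below the nmi_pcpu). A minimal C sketch of that layout; struct nmi_pcpu and the carve-out expression come from the patch, while NMI_STACK_SIZE and the demo function are illustrative:

#include <stdint.h>

#define NMI_STACK_SIZE	4096	/* one page, as in the patch */

/* Per-cpu data stored just above the top of the NMI stack (from the patch). */
struct nmi_pcpu {
	uintptr_t	np_pcpu;	/* canonical GS.base, i.e. the pcpu pointer */
	uintptr_t	__padding;	/* pad to 16 bytes */
};

static char nmi_stack_page[NMI_STACK_SIZE] __attribute__((aligned(16)));

/*
 * Carve struct nmi_pcpu out of the top 16 bytes of the stack page and
 * make the IST2 stack top point just below it.  Because the trapframe
 * the NMI handler builds ends exactly at the nmi_pcpu, the handler can
 * later fetch np_pcpu at a fixed offset from %rsp:
 *
 *	movq	TF_SIZE(%rsp),%rdx	# loads np_pcpu, the canonical GS.base
 */
uintptr_t
setup_nmi_ist_demo(void *pcpu)
{
	struct nmi_pcpu *np;

	np = ((struct nmi_pcpu *)&nmi_stack_page[NMI_STACK_SIZE]) - 1;
	np->np_pcpu = (uintptr_t)pcpu;
	return ((uintptr_t)np);	/* value to program into tss_ist2 */
}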
--- a/sys/amd64/amd64/exception.S
+++ b/sys/amd64/amd64/exception.S
@@ -383,22 +383,24 @@ IDTVEC(fast_syscall32)
  * NMI handling is special.
  *
  * First, NMIs do not respect the state of the processor's RFLAGS.IF
- * bit and the NMI handler may be invoked at any time, including when
- * the processor is in a critical section with RFLAGS.IF == 0. In
- * particular, this means that the processor's GS.base values could be
- * inconsistent on entry to the handler, and so we need to read
- * MSR_GSBASE to determine if a 'swapgs' is needed. We use '%ebx', a
- * C-preserved register, to remember whether to swap GS back on the
- * exit path.
+ * bit. The NMI handler may be entered at any time, including when
+ * the processor is in a critical section with RFLAGS.IF == 0.
+ * The processor's GS.base value could be invalid on entry to the
+ * handler.
  *
  * Second, the processor treats NMIs specially, blocking further NMIs
- * until an 'iretq' instruction is executed. We therefore need to
- * execute the NMI handler with interrupts disabled to prevent a
- * nested interrupt from executing an 'iretq' instruction and
- * inadvertently taking the processor out of NMI mode.
+ * until an 'iretq' instruction is executed. We thus need to execute
+ * the NMI handler with interrupts disabled, to prevent a nested interrupt
+ * from executing an 'iretq' instruction and inadvertently taking the
+ * processor out of NMI mode.
  *
- * Third, the NMI handler runs on its own stack (tss_ist1), shared
- * with the double fault handler.
+ * Third, the NMI handler runs on its own stack (tss_ist2). The canonical
+ * GS.base value for the processor is stored just above the bottom of its
+ * NMI stack. For NMIs taken from kernel mode, the current value in
+ * the processor's GS.base is saved at entry to C-preserved register %r12,
+ * the canonical value for GS.base is then loaded into the processor, and
+ * the saved value is restored at exit time. For NMIs taken from user mode,
+ * the cheaper 'SWAPGS' instructions are used for swapping GS.base.
  */
 
 IDTVEC(nmi)
@@ -423,12 +425,22 @@ IDTVEC(nmi)
 	movq	%r15,TF_R15(%rsp)
 	xorl	%ebx,%ebx
 	testb	$SEL_RPL_MASK,TF_CS(%rsp)
-	jnz	nmi_needswapgs	/* we came from userland */
+	jnz	nmi_fromuserspace
+	/*
+	 * We've interrupted the kernel. Preserve GS.base in %r12.
+	 */
 	movl	$MSR_GSBASE,%ecx
 	rdmsr
-	cmpl	$VM_MAXUSER_ADDRESS >> 32,%edx
-	jae	nmi_calltrap	/* GS.base holds a kernel VA */
-nmi_needswapgs:
+	movq	%rax,%r12
+	shlq	$32,%rdx
+	orq	%rdx,%r12
+	/* Retrieve and load the canonical value for GS.base. */
+	movq	TF_SIZE(%rsp),%rdx
+	movl	%edx,%eax
+	shrq	$32,%rdx
+	wrmsr
+	jmp	nmi_calltrap
+nmi_fromuserspace:
+	incl	%ebx
 	swapgs
 	/* Note: this label is also used by ddb and gdb: */
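A note on the rdmsr/wrmsr sequences above: both instructions transfer the 64-bit MSR value through the EDX:EAX register pair (high and low halves), with ECX naming the MSR. That is all the shlq/orq and movl/shrq shuffling does. A C sketch of the same arithmetic (helper names are mine, not the kernel's):

#include <stdint.h>

/* rdmsr leaves the MSR's low 32 bits in EAX and high 32 bits in EDX;
 * the handler recombines them into %r12 with "shlq $32,%rdx ; orq". */
static inline uint64_t
msr_combine(uint32_t lo_eax, uint32_t hi_edx)
{
	return (((uint64_t)hi_edx << 32) | lo_eax);
}

/* wrmsr consumes the same EDX:EAX split; the handler produces it with
 * "movl %edx,%eax ; shrq $32,%rdx" from a 64-bit value in %rdx. */
static inline void
msr_split(uint64_t val, uint32_t *lo_eax, uint32_t *hi_edx)
{
	*lo_eax = (uint32_t)val;
	*hi_edx = (uint32_t)(val >> 32);
}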
@@ -439,14 +451,19 @@ nmi_calltrap:
 	MEXITCOUNT
 #ifdef HWPMC_HOOKS
 	/*
-	 * Check if the current trap was from user mode and if so
-	 * whether the current thread needs a user call chain to be
-	 * captured. We are still in NMI mode at this point.
+	 * Capture a userspace callchain if needed.
+	 *
+	 * - Check if the current trap was from user mode.
+	 * - Check if the current thread is valid.
+	 * - Check if the thread requires a user call chain to be
+	 *   captured.
+	 *
+	 * We are still in NMI mode at this point.
 	 */
-	testb	$SEL_RPL_MASK,TF_CS(%rsp)
-	jz	nocallchain
-	movq	PCPU(CURTHREAD),%rax	/* curthread present? */
-	orq	%rax,%rax
+	testl	%ebx,%ebx
+	jz	nocallchain	/* not from userspace */
+	movq	PCPU(CURTHREAD),%rax
+	orq	%rax,%rax	/* curthread present? */
 	jz	nocallchain
 	testl	$TDP_CALLCHAIN,TD_PFLAGS(%rax)	/* flagged for capture? */
 	jz	nocallchain
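For readers following the control flow, the new code reduces to three guards in sequence, keyed off %ebx (which now records whether the NMI came from user mode) rather than re-testing TF_CS. A rough C restatement, using stand-in helpers and an illustrative flag value:

struct thread;
extern struct thread *get_curthread(void);	/* stand-in for PCPU(CURTHREAD) */
extern int get_td_pflags(struct thread *);	/* stand-in for TD_PFLAGS(%rax) */
extern void capture_user_callchain(void);	/* stand-in for the code that follows */
#define TDP_CALLCHAIN	0x01000000		/* illustrative value */

static void
hwpmc_callchain_gate(int ebx)	/* %ebx != 0 iff the NMI came from user mode */
{
	struct thread *td;

	if (ebx == 0)
		return;					/* not from userspace */
	if ((td = get_curthread()) == NULL)
		return;					/* no curthread */
	if ((get_td_pflags(td) & TDP_CALLCHAIN) == 0)
		return;					/* not flagged for capture */
	capture_user_callchain();
}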
@@ -498,8 +515,18 @@ outofnmi:
 nocallchain:
 #endif
 	testl	%ebx,%ebx
-	jz	nmi_restoreregs
+	jz	nmi_kernelexit
 	swapgs
+	jmp	nmi_restoreregs
+nmi_kernelexit:
+	/*
+	 * Put back the preserved MSR_GSBASE value.
+	 */
+	movl	$MSR_GSBASE,%ecx
+	movq	%r12,%rdx
+	movl	%edx,%eax
+	shrq	$32,%rdx
+	wrmsr
nmi_restoreregs:
 	movq	TF_RDI(%rsp),%rdi
 	movq	TF_RSI(%rsp),%rsi
--- a/sys/amd64/amd64/machdep.c
+++ b/sys/amd64/amd64/machdep.c
@@ -809,6 +809,9 @@ struct gate_descriptor *idt = &idt0[0]; /* interrupt descriptor table */
 
 static char dblfault_stack[PAGE_SIZE] __aligned(16);
 
+static char nmi0_stack[PAGE_SIZE] __aligned(16);
+CTASSERT(sizeof(struct nmi_pcpu) == 16);
+
 struct amd64tss common_tss[MAXCPU];
 
 /* software prototypes -- in more palatable form */
@@ -1291,6 +1294,7 @@ hammer_time(u_int64_t modulep, u_int64_t physfree)
 	caddr_t kmdp;
 	int gsel_tss, x;
 	struct pcpu *pc;
+	struct nmi_pcpu *np;
 	u_int64_t msr;
 	char *env;
 
@@ -1365,7 +1369,7 @@ hammer_time(u_int64_t modulep, u_int64_t physfree)
 		setidt(x, &IDTVEC(rsvd), SDT_SYSIGT, SEL_KPL, 0);
 	setidt(IDT_DE, &IDTVEC(div), SDT_SYSIGT, SEL_KPL, 0);
 	setidt(IDT_DB, &IDTVEC(dbg), SDT_SYSIGT, SEL_KPL, 0);
-	setidt(IDT_NMI, &IDTVEC(nmi), SDT_SYSIGT, SEL_KPL, 1);
+	setidt(IDT_NMI, &IDTVEC(nmi), SDT_SYSIGT, SEL_KPL, 2);
 	setidt(IDT_BP, &IDTVEC(bpt), SDT_SYSIGT, SEL_UPL, 0);
 	setidt(IDT_OF, &IDTVEC(ofl), SDT_SYSIGT, SEL_KPL, 0);
 	setidt(IDT_BR, &IDTVEC(bnd), SDT_SYSIGT, SEL_KPL, 0);
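The only change in this hunk is the final setidt() argument, which selects the IST slot recorded in the interrupt gate: 0 requests the legacy behavior (stack switch only on a privilege change), while 1 through 7 select tss_ist1 through tss_ist7 unconditionally. The NMI gate therefore moves from IST1, which it previously shared with the double fault handler, to the new IST2. A sketch of the 16-byte long-mode gate carrying that field; bit layout per the AMD64 architecture manual, field names assumed to mirror the amd64 segments.h:

#include <stdint.h>

struct gate_descriptor_sketch {
	uint64_t gd_looffset:16;	/* handler offset, bits 0..15 */
	uint64_t gd_selector:16;	/* kernel code segment selector */
	uint64_t gd_ist:3;		/* 0 = legacy; 1..7 = tss_ist1..tss_ist7 */
	uint64_t gd_xx:5;		/* reserved */
	uint64_t gd_type:5;		/* gate type, e.g. SDT_SYSIGT */
	uint64_t gd_dpl:2;		/* privilege level required to invoke */
	uint64_t gd_p:1;		/* present */
	uint64_t gd_hioffset:48;	/* handler offset, bits 16..63
					   (the real header packs this field
					   across the 64-bit boundary) */
	uint64_t gd_xx1:32;		/* reserved */
};
/* setidt(IDT_NMI, ..., 2) stores 2 in gd_ist, pointing the CPU at tss_ist2. */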
@@ -1438,6 +1442,14 @@ hammer_time(u_int64_t modulep, u_int64_t physfree)
 	/* doublefault stack space, runs on ist1 */
 	common_tss[0].tss_ist1 = (long)&dblfault_stack[sizeof(dblfault_stack)];
 
+	/*
+	 * NMI stack, runs on ist2.  The pcpu pointer is stored just
+	 * above the start of the ist2 stack.
+	 */
+	np = ((struct nmi_pcpu *) &nmi0_stack[sizeof(nmi0_stack)]) - 1;
+	np->np_pcpu = (register_t) pc;
+	common_tss[0].tss_ist2 = (long) np;
+
 	/* Set the IO permission bitmap (empty due to tss seg limit) */
 	common_tss[0].tss_iobase = sizeof(struct amd64tss);
 
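On NMI delivery the CPU reads its new stack pointer from the TSS slot named by the gate's IST index, so the value stored into tss_ist2 here becomes %rsp at handler entry, before the hardware pushes SS/RSP/RFLAGS/CS/RIP. With PAGE_SIZE = 4096 and the 16-byte nmi_pcpu, the expression above resolves to &nmi0_stack[4080], so the first push lands at byte 4072 and the stack grows down from there. A sketch of the TSS fields involved; offsets per the AMD64 manual, names assumed to mirror the amd64 struct:

#include <stdint.h>

/* 64-bit TSS, abbreviated to the fields this patch touches (sketch). */
struct amd64tss_sketch {
	uint32_t tss_rsvd0;
	uint64_t tss_rsp0;	/* stack used on a CPL 3 -> 0 transition */
	uint64_t tss_rsp1;
	uint64_t tss_rsp2;
	uint32_t tss_rsvd1;
	uint32_t tss_rsvd2;
	uint64_t tss_ist1;	/* doublefault stack (unchanged) */
	uint64_t tss_ist2;	/* NMI stack top == address of struct nmi_pcpu */
	uint64_t tss_ist3;
	uint64_t tss_ist4;
	uint64_t tss_ist5;
	uint64_t tss_ist6;
	uint64_t tss_ist7;
	/* ... reserved fields and tss_iobase follow ... */
} __attribute__((packed));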
--- a/sys/amd64/amd64/mp_machdep.c
+++ b/sys/amd64/amd64/mp_machdep.c
@@ -92,6 +92,7 @@ void *bootstacks[MAXCPU];
 
 /* Temporary holder for double fault stack */
 char *doublefault_stack;
+char *nmi_stack;
 
 /* Hotwire a 0->4MB V==P mapping */
 extern pt_entry_t *KPTphys;
@@ -437,6 +438,7 @@ void
 init_secondary(void)
 {
 	struct pcpu *pc;
+	struct nmi_pcpu *np;
 	u_int64_t msr, cr0;
 	int cpu, gsel_tss, x;
 	struct region_descriptor ap_gdt;
@@ -450,6 +452,10 @@ init_secondary(void)
 	common_tss[cpu].tss_iobase = sizeof(struct amd64tss);
 	common_tss[cpu].tss_ist1 = (long)&doublefault_stack[PAGE_SIZE];
 
+	/* The NMI stack runs on IST2. */
+	np = ((struct nmi_pcpu *) &nmi_stack[PAGE_SIZE]) - 1;
+	common_tss[cpu].tss_ist2 = (long) np;
+
 	/* Prepare private GDT */
 	gdt_segs[GPROC0_SEL].ssd_base = (long) &common_tss[cpu];
 	ssdtosyssd(&gdt_segs[GPROC0_SEL],
@@ -474,6 +480,9 @@ init_secondary(void)
 	pc->pc_rsp0 = 0;
 	pc->pc_gs32p = &gdt[NGDT * cpu + GUGS32_SEL];
 
+	/* Save the per-cpu pointer for use by the NMI handler. */
+	np->np_pcpu = (register_t) pc;
+
 	wrmsr(MSR_FSBASE, 0);		/* User value */
 	wrmsr(MSR_GSBASE, (u_int64_t)pc);
 	wrmsr(MSR_KGSBASE, (u_int64_t)pc);	/* XXX User value while we're in the kernel */
@@ -725,6 +734,7 @@ start_all_aps(void)
 		/* allocate and set up an idle stack data page */
 		bootstacks[cpu] = (void *)kmem_alloc(kernel_map, KSTACK_PAGES * PAGE_SIZE);
 		doublefault_stack = (char *)kmem_alloc(kernel_map, PAGE_SIZE);
+		nmi_stack = (char *)kmem_alloc(kernel_map, PAGE_SIZE);
 
 		bootSTK = (char *)bootstacks[cpu] + KSTACK_PAGES * PAGE_SIZE - 8;
 		bootAP = cpu;
--- a/sys/amd64/include/intr_machdep.h
+++ b/sys/amd64/include/intr_machdep.h
@@ -120,6 +120,15 @@ struct intsrc {
 
 struct trapframe;
 
+/*
+ * The following data structure holds per-cpu data, and is placed just
+ * above the top of the space used for the NMI stack.
+ */
+struct nmi_pcpu {
+	register_t	np_pcpu;
+	register_t	__padding;	/* pad to 16 bytes */
+};
+
 extern struct mtx icu_lock;
 extern int elcr_found;
 
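Last, the structure itself. The explicit __padding keeps sizeof(struct nmi_pcpu) at 16, which both satisfies the CTASSERT added in machdep.c and keeps the NMI stack top (tss_ist2, computed by subtracting one struct from the page end) 16-byte aligned. A standalone C11 spelling of that invariant, as a sketch:

typedef unsigned long register_t;	/* amd64 */

struct nmi_pcpu {
	register_t	np_pcpu;
	register_t	__padding;	/* pad to 16 bytes */
};

/* Equivalent of the patch's CTASSERT(sizeof(struct nmi_pcpu) == 16). */
_Static_assert(sizeof(struct nmi_pcpu) == 16, "struct nmi_pcpu must be 16 bytes");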