Use a dedicated per-CPU stack for machine check exceptions.

Similar to NMIs, machine check exceptions can fire at any time and are
not masked by IF.  This means that machine checks can fire when the
kstack is too deep to hold a trap frame, or at critical sections in
trap handlers when a user %gs is used with a kernel %cs.  Use the same
strategy used for NMIs of using a dedicated per-CPU stack configured
in IST 3.  Store the CPU's pcpu pointer at the top of the stack so
that the machine check handler can reliably find the proper value for
%gs (also borrowed from NMIs).

This should also fix a similar issue with PTI with a MC# occurring
while the CPU is executing on the trampoline stack.

While here, bypass trap() entirely and just call mca_intr().  This
avoids a bogus call to kdb_reenter() (there's no reason to try to
reenter kdb if a MC# is raised).

Reviewed by:	kib
Tested by:	avg (on AMD without PTI)
Differential Revision:	https://reviews.freebsd.org/D13962
This commit is contained in:
jhb 2018-01-18 23:50:21 +00:00
parent 2832bf2f2a
commit e2ed91ad09
7 changed files with 125 additions and 11 deletions

View File

@ -200,6 +200,7 @@ db_nextframe(struct amd64_frame **fp, db_addr_t *ip, struct thread *td)
if (name != NULL) {
if (strcmp(name, "calltrap") == 0 ||
strcmp(name, "fork_trampoline") == 0 ||
strcmp(name, "mchk_calltrap") == 0 ||
strcmp(name, "nmi_calltrap") == 0 ||
strcmp(name, "Xdblfault") == 0)
frame_type = TRAP;

View File

@ -141,7 +141,6 @@ X\l:
TRAP ill, T_PRIVINFLT
TRAP dna, T_DNA
TRAP fpusegm, T_FPOPFLT
TRAP mchk, T_MCHK
TRAP rsvd, T_RESERVED
TRAP fpu, T_ARITHTRAP
TRAP xmm, T_XMMFLT
@ -678,6 +677,103 @@ nocallchain:
addq $TF_RIP,%rsp
jmp doreti_iret
/*
 * MC# handling is similar to NMI.
 *
 * As with NMIs, machine check exceptions do not respect RFLAGS.IF and
 * can occur at any time with a GS.base value that does not correspond
 * to the privilege level in CS.
 *
 * Machine checks are not unblocked by iretq, but it is best to run
 * the handler with interrupts disabled since the exception may have
 * interrupted a critical section.
 *
 * The MC# handler runs on its own stack (tss_ist3). The canonical
 * GS.base value for the processor is stored just above the bottom of
 * its MC# stack. For exceptions taken from kernel mode, the current
 * value in the processor's GS.base is saved at entry to C-preserved
 * register %r12, the canonical value for GS.base is then loaded into
 * the processor, and the saved value is restored at exit time. For
 * exceptions taken from user mode, the cheaper 'SWAPGS' instructions
 * are used for swapping GS.base.
 */
IDTVEC(mchk)
/*
 * The CPU delivered us on the IST3 stack with SS:RSP, RFLAGS and
 * CS:RIP already pushed.  Allocate the rest of the trapframe and
 * save the general-purpose registers by hand.  #MC pushes no
 * hardware error code, so the trapno/addr/err fields are filled in
 * with explicit stores.
 */
subq $TF_RIP,%rsp
movl $(T_MCHK),TF_TRAPNO(%rsp)
movq $0,TF_ADDR(%rsp)
movq $0,TF_ERR(%rsp)
movq %rdi,TF_RDI(%rsp)
movq %rsi,TF_RSI(%rsp)
movq %rdx,TF_RDX(%rsp)
movq %rcx,TF_RCX(%rsp)
movq %r8,TF_R8(%rsp)
movq %r9,TF_R9(%rsp)
movq %rax,TF_RAX(%rsp)
movq %rbx,TF_RBX(%rsp)
movq %rbp,TF_RBP(%rsp)
movq %r10,TF_R10(%rsp)
movq %r11,TF_R11(%rsp)
movq %r12,TF_R12(%rsp)
movq %r13,TF_R13(%rsp)
movq %r14,TF_R14(%rsp)
movq %r15,TF_R15(%rsp)
SAVE_SEGS
movl $TF_HASSEGS,TF_FLAGS(%rsp)
/* Clear DF before calling into C, as the ABI requires. */
cld
/* %ebx tracks where we came from: 0 = kernel, set below if user. */
xorl %ebx,%ebx
testb $SEL_RPL_MASK,TF_CS(%rsp)
jnz mchk_fromuserspace
/*
 * We've interrupted the kernel. Preserve GS.base in %r12
 * and %cr3 in %r13.
 */
movl $MSR_GSBASE,%ecx
rdmsr
movq %rax,%r12
shlq $32,%rdx
orq %rdx,%r12
/*
 * Retrieve and load the canonical value for GS.base.  It lives in
 * the nmi_pcpu save area sitting just above the trapframe at the
 * top of this IST stack, i.e. at offset TF_SIZE from the frame.
 */
movq TF_SIZE(%rsp),%rdx
movl %edx,%eax
shrq $32,%rdx
wrmsr
/*
 * Save the interrupted %cr3 and switch to the kernel page table.
 * A KCR3 value of ~0 means there is no separate kernel %cr3 to
 * load (PTI disabled), so leave %cr3 alone in that case.
 */
movq %cr3,%r13
movq PCPU(KCR3),%rax
cmpq $~0,%rax
je mchk_calltrap
movq %rax,%cr3
jmp mchk_calltrap
mchk_fromuserspace:
/* From user mode: SWAPGS gives us the kernel GS.base cheaply. */
incl %ebx
swapgs
/* As above: save user %cr3, switch to KCR3 unless PTI is off (~0). */
movq %cr3,%r13
movq PCPU(KCR3),%rax
cmpq $~0,%rax
je 1f
movq %rax,%cr3
1:
/* Note: this label is also used by ddb and gdb: */
mchk_calltrap:
FAKE_MCOUNT(TF_RIP(%rsp))
/* Bypass trap() and hand the frame straight to the MCA handler. */
movq %rsp,%rdi
call mca_intr
MEXITCOUNT
testl %ebx,%ebx /* %ebx != 0 => return to userland via doreti */
jnz doreti_exit
/*
 * Put back the preserved MSR_GSBASE value.
 */
movl $MSR_GSBASE,%ecx
movq %r12,%rdx
movl %edx,%eax
shrq $32,%rdx
wrmsr
/* Restore the %cr3 saved at entry and return directly. */
movq %r13,%cr3
RESTORE_REGS
addq $TF_RIP,%rsp
jmp doreti_iret
ENTRY(fork_trampoline)
movq %r12,%rdi /* function */
movq %rbx,%rsi /* arg1 */

View File

@ -662,7 +662,7 @@ static struct gate_descriptor idt0[NIDT];
struct gate_descriptor *idt = &idt0[0]; /* interrupt descriptor table */
static char dblfault_stack[PAGE_SIZE] __aligned(16);
static char mce0_stack[PAGE_SIZE] __aligned(16);
static char nmi0_stack[PAGE_SIZE] __aligned(16);
CTASSERT(sizeof(struct nmi_pcpu) == 16);
@ -819,7 +819,7 @@ extern inthand_t
IDTVEC(div_pti), IDTVEC(dbg_pti), IDTVEC(bpt_pti),
IDTVEC(ofl_pti), IDTVEC(bnd_pti), IDTVEC(ill_pti), IDTVEC(dna_pti),
IDTVEC(fpusegm_pti), IDTVEC(tss_pti), IDTVEC(missing_pti),
IDTVEC(stk_pti), IDTVEC(prot_pti), IDTVEC(page_pti), IDTVEC(mchk_pti),
IDTVEC(stk_pti), IDTVEC(prot_pti), IDTVEC(page_pti),
IDTVEC(rsvd_pti), IDTVEC(fpu_pti), IDTVEC(align_pti),
IDTVEC(xmm_pti),
#ifdef KDTRACE_HOOKS
@ -1658,8 +1658,7 @@ hammer_time(u_int64_t modulep, u_int64_t physfree)
SEL_KPL, 0);
setidt(IDT_AC, pti ? &IDTVEC(align_pti) : &IDTVEC(align), SDT_SYSIGT,
SEL_KPL, 0);
setidt(IDT_MC, pti ? &IDTVEC(mchk_pti) : &IDTVEC(mchk), SDT_SYSIGT,
SEL_KPL, 0);
setidt(IDT_MC, &IDTVEC(mchk), SDT_SYSIGT, SEL_KPL, 3);
setidt(IDT_XF, pti ? &IDTVEC(xmm_pti) : &IDTVEC(xmm), SDT_SYSIGT,
SEL_KPL, 0);
#ifdef KDTRACE_HOOKS
@ -1704,6 +1703,14 @@ hammer_time(u_int64_t modulep, u_int64_t physfree)
np->np_pcpu = (register_t) pc;
common_tss[0].tss_ist2 = (long) np;
/*
* MC# stack, runs on ist3. The pcpu pointer is stored just
* above the start of the ist3 stack.
*/
np = ((struct nmi_pcpu *) &mce0_stack[sizeof(mce0_stack)]) - 1;
np->np_pcpu = (register_t) pc;
common_tss[0].tss_ist3 = (long) np;
/* Set the IO permission bitmap (empty due to tss seg limit) */
common_tss[0].tss_iobase = sizeof(struct amd64tss) + IOPERM_BITMAP_SIZE;

View File

@ -87,6 +87,7 @@ extern struct pcpu __pcpu[];
/* Temporary variables for init_secondary() */
char *doublefault_stack;
char *mce_stack;
char *nmi_stack;
/*
@ -212,6 +213,10 @@ init_secondary(void)
np = ((struct nmi_pcpu *) &nmi_stack[PAGE_SIZE]) - 1;
common_tss[cpu].tss_ist2 = (long) np;
/* The MC# stack runs on IST3. */
np = ((struct nmi_pcpu *) &mce_stack[PAGE_SIZE]) - 1;
common_tss[cpu].tss_ist3 = (long) np;
/* Prepare private GDT */
gdt_segs[GPROC0_SEL].ssd_base = (long) &common_tss[cpu];
for (x = 0; x < NGDT; x++) {
@ -250,6 +255,11 @@ init_secondary(void)
PC_PTI_STACK_SZ * sizeof(uint64_t)) & ~0xful : 0;
/* Save the per-cpu pointer for use by the NMI handler. */
np = ((struct nmi_pcpu *) &nmi_stack[PAGE_SIZE]) - 1;
np->np_pcpu = (register_t) pc;
/* Save the per-cpu pointer for use by the MC# handler. */
np = ((struct nmi_pcpu *) &mce_stack[PAGE_SIZE]) - 1;
np->np_pcpu = (register_t) pc;
wrmsr(MSR_FSBASE, 0); /* User value */
@ -346,6 +356,8 @@ native_start_all_aps(void)
kstack_pages * PAGE_SIZE, M_WAITOK | M_ZERO);
doublefault_stack = (char *)kmem_malloc(kernel_arena,
PAGE_SIZE, M_WAITOK | M_ZERO);
mce_stack = (char *)kmem_malloc(kernel_arena, PAGE_SIZE,
M_WAITOK | M_ZERO);
nmi_stack = (char *)kmem_malloc(kernel_arena, PAGE_SIZE,
M_WAITOK | M_ZERO);
dpcpu = (void *)kmem_malloc(kernel_arena, DPCPU_SIZE,

View File

@ -7596,6 +7596,9 @@ pmap_pti_init(void)
/* NMI stack IST 2 */
va = common_tss[i].tss_ist2 + sizeof(struct nmi_pcpu);
pmap_pti_add_kva_locked(va - PAGE_SIZE, va, false);
/* MC# stack IST 3 */
va = common_tss[i].tss_ist3 + sizeof(struct nmi_pcpu);
pmap_pti_add_kva_locked(va - PAGE_SIZE, va, false);
}
pmap_pti_add_kva_locked((vm_offset_t)kernphys + KERNBASE,
(vm_offset_t)etext, true);

View File

@ -220,11 +220,6 @@ trap(struct trapframe *frame)
#endif
}
if (type == T_MCHK) {
mca_intr();
return;
}
if ((frame->tf_rflags & PSL_I) == 0) {
/*
* Buggy application or kernel code has disabled

View File

@ -139,7 +139,7 @@ struct trapframe;
/*
* The following data structure holds per-cpu data, and is placed just
* above the top of the space used for the NMI stack.
* above the top of the space used for the NMI and MC# stacks.
*/
struct nmi_pcpu {
register_t np_pcpu;