Use a dedicated per-CPU stack for machine check exceptions.
Similar to NMIs, machine check exceptions can fire at any time and are
not masked by IF.  This means that machine checks can fire when the
kstack is too deep to hold a trap frame, or at critical sections in
trap handlers when a user %gs is used with a kernel %cs.  Use the same
strategy used for NMIs of running on a dedicated per-CPU stack
configured in IST 3.  Store the CPU's pcpu pointer at the top of the
stack so that the machine check handler can reliably find the proper
value for %gs (also borrowed from NMIs).

This should also fix a similar issue with PTI when a MC# occurs while
the CPU is executing on the trampoline stack.

While here, bypass trap() entirely and just call mca_intr().  This
avoids a bogus call to kdb_reenter() (there's no reason to try to
reenter kdb if a MC# is raised).

Reviewed by:	kib
Tested by:	avg (on AMD without PTI)
Differential Revision:	https://reviews.freebsd.org/D13962
commit e2ed91ad09
parent 2832bf2f2a
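Before the diff, a minimal C sketch of the mechanism this change relies on may help: the dedicated MC# stack is a page whose topmost bytes hold a struct nmi_pcpu, so a handler entered through IST 3 can always recover the CPU's pcpu pointer at a fixed offset above its trap frame. The names mce0_stack, struct nmi_pcpu, and tss_ist3 are taken from the diff below; the helper functions, their arguments, and the padding member are hypothetical and only illustrate the layout, not the kernel's actual interfaces.

/*
 * Sketch only: not the kernel's code.  Shows how a pcpu pointer parked
 * at the top of a dedicated IST stack can be recovered by the handler.
 */
#include <stddef.h>
#include <stdint.h>

typedef uintptr_t register_t;	/* stand-in for the kernel type */

struct nmi_pcpu {
	register_t np_pcpu;	/* canonical pcpu pointer for this CPU */
	register_t np__pad;	/* pad to 16 bytes (CTASSERTed in the diff) */
};

#define PAGE_SIZE 4096
static char mce0_stack[PAGE_SIZE] __attribute__((aligned(16)));

/*
 * Boot-time setup: place the nmi_pcpu record at the very top of the
 * MC# stack and point IST 3 at it, so the CPU builds the trap frame
 * immediately below the record.
 */
static void
mce_stack_init(void *pcpu_ptr, uint64_t *tss_ist3)
{
	struct nmi_pcpu *np;

	np = ((struct nmi_pcpu *)&mce0_stack[sizeof(mce0_stack)]) - 1;
	np->np_pcpu = (register_t)pcpu_ptr;
	*tss_ist3 = (uint64_t)(uintptr_t)np;
}

/*
 * Handler side (done in assembly in the diff): once the full trap
 * frame of size tf_size has been built, the record sits at a fixed
 * offset above the stack pointer, so the pcpu pointer is reachable
 * regardless of what %gs held when the exception fired.
 */
static register_t
mce_handler_pcpu(char *rsp_after_frame, size_t tf_size)
{
	struct nmi_pcpu *np;

	np = (struct nmi_pcpu *)(rsp_after_frame + tf_size);
	return (np->np_pcpu);
}

Parking the pointer above the trap frame is what lets the handler load the correct GS.base even when the exception interrupts code running with a user %gs and a kernel %cs, which is the case the commit message calls out.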
@@ -200,6 +200,7 @@ db_nextframe(struct amd64_frame **fp, db_addr_t *ip, struct thread *td)
 	if (name != NULL) {
 		if (strcmp(name, "calltrap") == 0 ||
 		    strcmp(name, "fork_trampoline") == 0 ||
+		    strcmp(name, "mchk_calltrap") == 0 ||
 		    strcmp(name, "nmi_calltrap") == 0 ||
 		    strcmp(name, "Xdblfault") == 0)
 			frame_type = TRAP;
@@ -141,7 +141,6 @@ X\l:
 	TRAP	ill, T_PRIVINFLT
 	TRAP	dna, T_DNA
 	TRAP	fpusegm, T_FPOPFLT
-	TRAP	mchk, T_MCHK
 	TRAP	rsvd, T_RESERVED
 	TRAP	fpu, T_ARITHTRAP
 	TRAP	xmm, T_XMMFLT
@@ -678,6 +677,103 @@ nocallchain:
 	addq	$TF_RIP,%rsp
 	jmp	doreti_iret
 
+/*
+ * MC# handling is similar to NMI.
+ *
+ * As with NMIs, machine check exceptions do not respect RFLAGS.IF and
+ * can occur at any time with a GS.base value that does not correspond
+ * to the privilege level in CS.
+ *
+ * Machine checks are not unblocked by iretq, but it is best to run
+ * the handler with interrupts disabled since the exception may have
+ * interrupted a critical section.
+ *
+ * The MC# handler runs on its own stack (tss_ist3). The canonical
+ * GS.base value for the processor is stored just above the bottom of
+ * its MC# stack. For exceptions taken from kernel mode, the current
+ * value in the processor's GS.base is saved at entry to C-preserved
+ * register %r12, the canonical value for GS.base is then loaded into
+ * the processor, and the saved value is restored at exit time. For
+ * exceptions taken from user mode, the cheaper 'SWAPGS' instructions
+ * are used for swapping GS.base.
+ */
+
+IDTVEC(mchk)
+	subq	$TF_RIP,%rsp
+	movl	$(T_MCHK),TF_TRAPNO(%rsp)
+	movq	$0,TF_ADDR(%rsp)
+	movq	$0,TF_ERR(%rsp)
+	movq	%rdi,TF_RDI(%rsp)
+	movq	%rsi,TF_RSI(%rsp)
+	movq	%rdx,TF_RDX(%rsp)
+	movq	%rcx,TF_RCX(%rsp)
+	movq	%r8,TF_R8(%rsp)
+	movq	%r9,TF_R9(%rsp)
+	movq	%rax,TF_RAX(%rsp)
+	movq	%rbx,TF_RBX(%rsp)
+	movq	%rbp,TF_RBP(%rsp)
+	movq	%r10,TF_R10(%rsp)
+	movq	%r11,TF_R11(%rsp)
+	movq	%r12,TF_R12(%rsp)
+	movq	%r13,TF_R13(%rsp)
+	movq	%r14,TF_R14(%rsp)
+	movq	%r15,TF_R15(%rsp)
+	SAVE_SEGS
+	movl	$TF_HASSEGS,TF_FLAGS(%rsp)
+	cld
+	xorl	%ebx,%ebx
+	testb	$SEL_RPL_MASK,TF_CS(%rsp)
+	jnz	mchk_fromuserspace
+	/*
+	 * We've interrupted the kernel. Preserve GS.base in %r12
+	 * and %cr3 in %r13.
+	 */
+	movl	$MSR_GSBASE,%ecx
+	rdmsr
+	movq	%rax,%r12
+	shlq	$32,%rdx
+	orq	%rdx,%r12
+	/* Retrieve and load the canonical value for GS.base. */
+	movq	TF_SIZE(%rsp),%rdx
+	movl	%edx,%eax
+	shrq	$32,%rdx
+	wrmsr
+	movq	%cr3,%r13
+	movq	PCPU(KCR3),%rax
+	cmpq	$~0,%rax
+	je	mchk_calltrap
+	movq	%rax,%cr3
+	jmp	mchk_calltrap
+mchk_fromuserspace:
+	incl	%ebx
+	swapgs
+	movq	%cr3,%r13
+	movq	PCPU(KCR3),%rax
+	cmpq	$~0,%rax
+	je	1f
+	movq	%rax,%cr3
+1:
+	/* Note: this label is also used by ddb and gdb: */
+mchk_calltrap:
+	FAKE_MCOUNT(TF_RIP(%rsp))
+	movq	%rsp,%rdi
+	call	mca_intr
+	MEXITCOUNT
+	testl	%ebx,%ebx	/* %ebx == 0 => return to userland */
+	jnz	doreti_exit
+	/*
+	 * Put back the preserved MSR_GSBASE value.
+	 */
+	movl	$MSR_GSBASE,%ecx
+	movq	%r12,%rdx
+	movl	%edx,%eax
+	shrq	$32,%rdx
+	wrmsr
+	movq	%r13,%cr3
+	RESTORE_REGS
+	addq	$TF_RIP,%rsp
+	jmp	doreti_iret
+
 ENTRY(fork_trampoline)
 	movq	%r12,%rdi	/* function */
 	movq	%rbx,%rsi	/* arg1 */
@@ -662,7 +662,7 @@ static struct gate_descriptor idt0[NIDT];
 struct gate_descriptor *idt = &idt0[0];	/* interrupt descriptor table */
 
 static char dblfault_stack[PAGE_SIZE] __aligned(16);
-
+static char mce0_stack[PAGE_SIZE] __aligned(16);
 static char nmi0_stack[PAGE_SIZE] __aligned(16);
 CTASSERT(sizeof(struct nmi_pcpu) == 16);
 
@@ -819,7 +819,7 @@ extern inthand_t
 	IDTVEC(div_pti), IDTVEC(dbg_pti), IDTVEC(bpt_pti),
 	IDTVEC(ofl_pti), IDTVEC(bnd_pti), IDTVEC(ill_pti), IDTVEC(dna_pti),
 	IDTVEC(fpusegm_pti), IDTVEC(tss_pti), IDTVEC(missing_pti),
-	IDTVEC(stk_pti), IDTVEC(prot_pti), IDTVEC(page_pti), IDTVEC(mchk_pti),
+	IDTVEC(stk_pti), IDTVEC(prot_pti), IDTVEC(page_pti),
 	IDTVEC(rsvd_pti), IDTVEC(fpu_pti), IDTVEC(align_pti),
 	IDTVEC(xmm_pti),
 #ifdef KDTRACE_HOOKS
@@ -1658,8 +1658,7 @@ hammer_time(u_int64_t modulep, u_int64_t physfree)
 	    SEL_KPL, 0);
 	setidt(IDT_AC, pti ? &IDTVEC(align_pti) : &IDTVEC(align), SDT_SYSIGT,
 	    SEL_KPL, 0);
-	setidt(IDT_MC, pti ? &IDTVEC(mchk_pti) : &IDTVEC(mchk), SDT_SYSIGT,
-	    SEL_KPL, 0);
+	setidt(IDT_MC, &IDTVEC(mchk), SDT_SYSIGT, SEL_KPL, 3);
 	setidt(IDT_XF, pti ? &IDTVEC(xmm_pti) : &IDTVEC(xmm), SDT_SYSIGT,
 	    SEL_KPL, 0);
 #ifdef KDTRACE_HOOKS
@@ -1704,6 +1703,14 @@ hammer_time(u_int64_t modulep, u_int64_t physfree)
 	np->np_pcpu = (register_t) pc;
 	common_tss[0].tss_ist2 = (long) np;
 
+	/*
+	 * MC# stack, runs on ist3. The pcpu pointer is stored just
+	 * above the start of the ist3 stack.
+	 */
+	np = ((struct nmi_pcpu *) &mce0_stack[sizeof(mce0_stack)]) - 1;
+	np->np_pcpu = (register_t) pc;
+	common_tss[0].tss_ist3 = (long) np;
+
 	/* Set the IO permission bitmap (empty due to tss seg limit) */
 	common_tss[0].tss_iobase = sizeof(struct amd64tss) + IOPERM_BITMAP_SIZE;
 
@@ -87,6 +87,7 @@ extern struct pcpu __pcpu[];
 
 /* Temporary variables for init_secondary() */
 char *doublefault_stack;
+char *mce_stack;
 char *nmi_stack;
 
 /*
@@ -212,6 +213,10 @@ init_secondary(void)
 	np = ((struct nmi_pcpu *) &nmi_stack[PAGE_SIZE]) - 1;
 	common_tss[cpu].tss_ist2 = (long) np;
 
+	/* The MC# stack runs on IST3. */
+	np = ((struct nmi_pcpu *) &mce_stack[PAGE_SIZE]) - 1;
+	common_tss[cpu].tss_ist3 = (long) np;
+
 	/* Prepare private GDT */
 	gdt_segs[GPROC0_SEL].ssd_base = (long) &common_tss[cpu];
 	for (x = 0; x < NGDT; x++) {
@@ -250,6 +255,11 @@ init_secondary(void)
 	    PC_PTI_STACK_SZ * sizeof(uint64_t)) & ~0xful : 0;
 
 	/* Save the per-cpu pointer for use by the NMI handler. */
 	np = ((struct nmi_pcpu *) &nmi_stack[PAGE_SIZE]) - 1;
 	np->np_pcpu = (register_t) pc;
+
+	/* Save the per-cpu pointer for use by the MC# handler. */
+	np = ((struct nmi_pcpu *) &mce_stack[PAGE_SIZE]) - 1;
+	np->np_pcpu = (register_t) pc;
+
 	wrmsr(MSR_FSBASE, 0);		/* User value */
@@ -346,6 +356,8 @@ native_start_all_aps(void)
 		    kstack_pages * PAGE_SIZE, M_WAITOK | M_ZERO);
 		doublefault_stack = (char *)kmem_malloc(kernel_arena,
 		    PAGE_SIZE, M_WAITOK | M_ZERO);
+		mce_stack = (char *)kmem_malloc(kernel_arena, PAGE_SIZE,
+		    M_WAITOK | M_ZERO);
 		nmi_stack = (char *)kmem_malloc(kernel_arena, PAGE_SIZE,
 		    M_WAITOK | M_ZERO);
 		dpcpu = (void *)kmem_malloc(kernel_arena, DPCPU_SIZE,
@@ -7596,6 +7596,9 @@ pmap_pti_init(void)
 		/* NMI stack IST 2 */
 		va = common_tss[i].tss_ist2 + sizeof(struct nmi_pcpu);
 		pmap_pti_add_kva_locked(va - PAGE_SIZE, va, false);
+		/* MC# stack IST 3 */
+		va = common_tss[i].tss_ist3 + sizeof(struct nmi_pcpu);
+		pmap_pti_add_kva_locked(va - PAGE_SIZE, va, false);
 	}
 	pmap_pti_add_kva_locked((vm_offset_t)kernphys + KERNBASE,
 	    (vm_offset_t)etext, true);
@@ -220,11 +220,6 @@ trap(struct trapframe *frame)
 #endif
 	}
 
-	if (type == T_MCHK) {
-		mca_intr();
-		return;
-	}
-
 	if ((frame->tf_rflags & PSL_I) == 0) {
 		/*
 		 * Buggy application or kernel code has disabled
@@ -139,7 +139,7 @@ struct trapframe;
 
 /*
  * The following data structure holds per-cpu data, and is placed just
- * above the top of the space used for the NMI stack.
+ * above the top of the space used for the NMI and MC# stacks.
  */
 struct nmi_pcpu {
 	register_t np_pcpu;