Prepare DB# handler for deferred trigger of watchpoints.

Since pop %ss/mov %ss instructions defer all interrupts and exceptions
until the next instruction has executed, it is possible that the
userspace watchpoint trap is delivered on the first instruction of the
kernel entry for syscall/bpt.
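
Illustrative only (not part of this change): the problematic user-mode
pattern is a stack-segment load immediately followed by an instruction
that enters the kernel.  In the sketch below, 'ss_slot' is a hypothetical
local variable; with a hardware watchpoint armed on it (set up
separately, e.g. by an attached debugger via the debug registers), the
DB# for the watchpoint hit is deferred past the mov %ss and delivered on
the first instruction of the int3 (BP#) kernel entry point.

	#include <signal.h>
	#include <stdio.h>

	static void
	sigtrap(int sig)
	{
		(void)sig;		/* int3 arrives here as SIGTRAP */
	}

	int
	main(void)
	{
		volatile unsigned short ss_slot; /* hypothetical watched slot */

		signal(SIGTRAP, sigtrap);
		/* Save the current %ss so the reload below is a no-op. */
		__asm__ volatile("movw %%ss, %0" : "=m" (ss_slot));
		__asm__ volatile(
		    "movw %0, %%ss\n\t"	/* exceptions deferred past this insn */
		    "int3"		/* deferred DB# lands on the BP# entry */
		    : : "m" (ss_slot));
		printf("returned to user mode\n");
		return (0);
	}

Without a watchpoint armed this just takes an ordinary SIGTRAP; the
deferral window is what lets a watchpoint on ss_slot raise DB# in
kernel mode.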

In this case, DB# should be treated similarly to NMI: on amd64 we must
always load GSBASE even if the trap comes from kernel mode, and we must
load the kernel page table root into %cr3.  Moreover, the handler must
run on a dedicated stack, because we are still on the user stack when
the trap occurs on syscall entry.
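
Concretely (taken from the amd64 machdep.c hunk below), the DB# gate no
longer has a separate PTI entry and is pointed at the IST-backed handler;
the last setidt() argument selects IST slot 4, the new dedicated DB#
stack, mirroring the NMI gate on IST slot 2:

	setidt(IDT_DB, &IDTVEC(dbg), SDT_SYSIGT, SEL_KPL, 4);
	setidt(IDT_NMI, &IDTVEC(nmi), SDT_SYSIGT, SEL_KPL, 2);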

For i386, we must reload %cr3.  The syscall instruction is not
configured there, so there is no issue with still being on the user
stack when the trap occurs.

Due to some CPU errata it is not always possible to detect that the
userspace watchpoint fired by inspecting %dr6.  In trap(), compare the
trapping %rip with the known unsafe entry points and, if it matches,
pretend that the watchpoint did not fire at all.
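
In essence, the amd64 trap() hunk below performs the following check (a
simplified restatement, non-PTI case, COMPAT_FREEBSD32 conditionals
omitted):

	/* Deferred DB# hit a kernel entry point: ignore the watchpoint. */
	if (frame->tf_rip == (uintptr_t)IDTVEC(fast_syscall) ||
	    frame->tf_rip == (uintptr_t)IDTVEC(int0x80_syscall) ||
	    frame->tf_rip == (uintptr_t)IDTVEC(bpt) ||
	    frame->tf_rip == (uintptr_t)IDTVEC(dbg) ||
	    frame->tf_rip == (uintptr_t)IDTVEC(fast_syscall32))	/* AMD */
		return;

With PTI enabled, the corresponding *_pti vectors are checked instead,
and the i386 version compares frame->tf_eip against the same vectors
adjusted by setidt_disp.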

Thank you to the MSRC Incident Response Team, and in particular Greg
Lenti and Nate Warfield, for coordinating the response to this issue
across multiple vendors.

Thanks to Computer Recycling at The Working Center of Kitchener for
making hardware available to allow us to test the patch on additional
CPU families.

Reviewed by:	jhb
Discussed with:	Matthew Dillon
Tested by:	emaste
Sponsored by:	The FreeBSD Foundation
Security:	CVE-2018-8897
Security:	FreeBSD-SA-18:06.debugreg
kib 2018-05-08 17:00:34 +00:00
parent 3d0eb99e32
commit e393d204ee
7 changed files with 257 additions and 6 deletions

View File

@@ -115,7 +115,6 @@ X\l: subq $TF_RIP,%rsp
jmp alltraps_noen
.endm
TRAP_NOEN dbg, T_TRCTRAP
TRAP_NOEN bpt, T_BPTFLT
#ifdef KDTRACE_HOOKS
TRAP_NOEN dtrace_ret, T_DTRACE_RET
@@ -537,6 +536,121 @@ fast_syscall_common:
IDTVEC(fast_syscall32)
sysret
/*
* DB# handler is very similar to the NMI handler, because 'mov/pop %ss'
* delays generation of the exception until the next instruction is
* executed, which might be a kernel entry.  So we must execute the
* handler on the IST stack and be ready for a non-kernel GSBASE.
*/
IDTVEC(dbg)
subq $TF_RIP,%rsp
movl $(T_TRCTRAP),TF_TRAPNO(%rsp)
movq $0,TF_ADDR(%rsp)
movq $0,TF_ERR(%rsp)
movq %rdi,TF_RDI(%rsp)
movq %rsi,TF_RSI(%rsp)
movq %rdx,TF_RDX(%rsp)
movq %rcx,TF_RCX(%rsp)
movq %r8,TF_R8(%rsp)
movq %r9,TF_R9(%rsp)
movq %rax,TF_RAX(%rsp)
movq %rbx,TF_RBX(%rsp)
movq %rbp,TF_RBP(%rsp)
movq %r10,TF_R10(%rsp)
movq %r11,TF_R11(%rsp)
movq %r12,TF_R12(%rsp)
movq %r13,TF_R13(%rsp)
movq %r14,TF_R14(%rsp)
movq %r15,TF_R15(%rsp)
SAVE_SEGS
movl $TF_HASSEGS,TF_FLAGS(%rsp)
cld
testb $SEL_RPL_MASK,TF_CS(%rsp)
jnz dbg_fromuserspace
/*
* We've interrupted the kernel. Preserve GS.base in %r12,
* %cr3 in %r13, and possibly the lower half of MSR_IA32_SPEC_CTRL in %r14d.
*/
movl $MSR_GSBASE,%ecx
rdmsr
movq %rax,%r12
shlq $32,%rdx
orq %rdx,%r12
/* Retrieve and load the canonical value for GS.base. */
movq TF_SIZE(%rsp),%rdx
movl %edx,%eax
shrq $32,%rdx
wrmsr
movq %cr3,%r13
movq PCPU(KCR3),%rax
cmpq $~0,%rax
je 1f
movq %rax,%cr3
1: testl $CPUID_STDEXT3_IBPB,cpu_stdext_feature3(%rip)
je 2f
movl $MSR_IA32_SPEC_CTRL,%ecx
rdmsr
movl %eax,%r14d
call handle_ibrs_entry
2: FAKE_MCOUNT(TF_RIP(%rsp))
movq %rsp,%rdi
call trap
MEXITCOUNT
testl $CPUID_STDEXT3_IBPB,cpu_stdext_feature3(%rip)
je 3f
movl %r14d,%eax
xorl %edx,%edx
movl $MSR_IA32_SPEC_CTRL,%ecx
wrmsr
/*
* Put back the preserved MSR_GSBASE value.
*/
3: movl $MSR_GSBASE,%ecx
movq %r12,%rdx
movl %edx,%eax
shrq $32,%rdx
wrmsr
movq %r13,%cr3
RESTORE_REGS
addq $TF_RIP,%rsp
jmp doreti_iret
dbg_fromuserspace:
/*
* Switch to kernel GSBASE and kernel page table, and copy frame
* from the IST stack to the normal kernel stack, since trap()
* re-enables interrupts, and since we might trap on DB# while
* in trap().
*/
swapgs
movq PCPU(KCR3),%rax
cmpq $~0,%rax
je 1f
movq %rax,%cr3
1: movq PCPU(RSP0),%rax
movl $TF_SIZE,%ecx
subq %rcx,%rax
movq %rax,%rdi
movq %rsp,%rsi
rep;movsb
movq %rax,%rsp
call handle_ibrs_entry
movq PCPU(CURPCB),%rdi
orl $PCB_FULL_IRET,PCB_FLAGS(%rdi)
testb $CPUID_STDEXT_FSGSBASE,cpu_stdext_feature(%rip)
jz 3f
cmpw $KUF32SEL,TF_FS(%rsp)
jne 2f
rdfsbase %rax
movq %rax,PCB_FSBASE(%rdi)
2: cmpw $KUG32SEL,TF_GS(%rsp)
jne 3f
movl $MSR_KGSBASE,%ecx
rdmsr
shlq $32,%rdx
orq %rdx,%rax
movq %rax,PCB_GSBASE(%rdi)
3: jmp calltrap
/*
* NMI handling is special.
*

View File

@@ -669,6 +669,7 @@ struct gate_descriptor *idt = &idt0[0]; /* interrupt descriptor table */
static char dblfault_stack[PAGE_SIZE] __aligned(16);
static char mce0_stack[PAGE_SIZE] __aligned(16);
static char nmi0_stack[PAGE_SIZE] __aligned(16);
static char dbg0_stack[PAGE_SIZE] __aligned(16);
CTASSERT(sizeof(struct nmi_pcpu) == 16);
struct amd64tss common_tss[MAXCPU];
@@ -821,7 +822,7 @@ extern inthand_t
IDTVEC(tss), IDTVEC(missing), IDTVEC(stk), IDTVEC(prot),
IDTVEC(page), IDTVEC(mchk), IDTVEC(rsvd), IDTVEC(fpu), IDTVEC(align),
IDTVEC(xmm), IDTVEC(dblfault),
IDTVEC(div_pti), IDTVEC(dbg_pti), IDTVEC(bpt_pti),
IDTVEC(div_pti), IDTVEC(bpt_pti),
IDTVEC(ofl_pti), IDTVEC(bnd_pti), IDTVEC(ill_pti), IDTVEC(dna_pti),
IDTVEC(fpusegm_pti), IDTVEC(tss_pti), IDTVEC(missing_pti),
IDTVEC(stk_pti), IDTVEC(prot_pti), IDTVEC(page_pti),
@@ -1632,8 +1633,7 @@ hammer_time(u_int64_t modulep, u_int64_t physfree)
SEL_KPL, 0);
setidt(IDT_DE, pti ? &IDTVEC(div_pti) : &IDTVEC(div), SDT_SYSIGT,
SEL_KPL, 0);
setidt(IDT_DB, pti ? &IDTVEC(dbg_pti) : &IDTVEC(dbg), SDT_SYSIGT,
SEL_KPL, 0);
setidt(IDT_DB, &IDTVEC(dbg), SDT_SYSIGT, SEL_KPL, 4);
setidt(IDT_NMI, &IDTVEC(nmi), SDT_SYSIGT, SEL_KPL, 2);
setidt(IDT_BP, pti ? &IDTVEC(bpt_pti) : &IDTVEC(bpt), SDT_SYSIGT,
SEL_UPL, 0);
@@ -1715,6 +1715,13 @@ hammer_time(u_int64_t modulep, u_int64_t physfree)
np = ((struct nmi_pcpu *) &mce0_stack[sizeof(mce0_stack)]) - 1;
np->np_pcpu = (register_t) pc;
common_tss[0].tss_ist3 = (long) np;
/*
* DB# stack, runs on ist4.
*/
np = ((struct nmi_pcpu *) &dbg0_stack[sizeof(dbg0_stack)]) - 1;
np->np_pcpu = (register_t) pc;
common_tss[0].tss_ist4 = (long) np;
/* Set the IO permission bitmap (empty due to tss seg limit) */
common_tss[0].tss_iobase = sizeof(struct amd64tss) + IOPERM_BITMAP_SIZE;

View File

@@ -91,6 +91,7 @@ extern struct pcpu __pcpu[];
char *doublefault_stack;
char *mce_stack;
char *nmi_stack;
char *dbg_stack;
/*
* Local data and functions.
@@ -251,6 +252,10 @@ init_secondary(void)
np = ((struct nmi_pcpu *) &mce_stack[PAGE_SIZE]) - 1;
common_tss[cpu].tss_ist3 = (long) np;
/* The DB# stack runs on IST4. */
np = ((struct nmi_pcpu *) &dbg_stack[PAGE_SIZE]) - 1;
common_tss[cpu].tss_ist4 = (long) np;
/* Prepare private GDT */
gdt_segs[GPROC0_SEL].ssd_base = (long) &common_tss[cpu];
for (x = 0; x < NGDT; x++) {
@@ -297,6 +302,10 @@ init_secondary(void)
np = ((struct nmi_pcpu *) &mce_stack[PAGE_SIZE]) - 1;
np->np_pcpu = (register_t) pc;
/* Save the per-cpu pointer for use by the DB# handler. */
np = ((struct nmi_pcpu *) &dbg_stack[PAGE_SIZE]) - 1;
np->np_pcpu = (register_t) pc;
wrmsr(MSR_FSBASE, 0); /* User value */
wrmsr(MSR_GSBASE, (u_int64_t)pc);
wrmsr(MSR_KGSBASE, (u_int64_t)pc); /* XXX User value while we're in the kernel */
@@ -392,6 +401,8 @@ native_start_all_aps(void)
M_WAITOK | M_ZERO);
nmi_stack = (char *)kmem_malloc(kernel_arena, PAGE_SIZE,
M_WAITOK | M_ZERO);
dbg_stack = (char *)kmem_malloc(kernel_arena, PAGE_SIZE,
M_WAITOK | M_ZERO);
dpcpu = (void *)kmem_malloc(kernel_arena, DPCPU_SIZE,
M_WAITOK | M_ZERO);

View File

@@ -7843,6 +7843,9 @@ pmap_pti_init(void)
/* MC# stack IST 3 */
va = common_tss[i].tss_ist3 + sizeof(struct nmi_pcpu);
pmap_pti_add_kva_locked(va - PAGE_SIZE, va, false);
/* DB# stack IST 4 */
va = common_tss[i].tss_ist4 + sizeof(struct nmi_pcpu);
pmap_pti_add_kva_locked(va - PAGE_SIZE, va, false);
}
pmap_pti_add_kva_locked((vm_offset_t)kernphys + KERNBASE,
(vm_offset_t)etext, true);

View File

@@ -47,6 +47,7 @@ __FBSDID("$FreeBSD$");
*/
#include "opt_clock.h"
#include "opt_compat.h"
#include "opt_cpu.h"
#include "opt_hwpmc_hooks.h"
#include "opt_isa.h"
@@ -102,6 +103,10 @@ PMC_SOFT_DEFINE( , , page_fault, write);
#include <sys/dtrace_bsd.h>
#endif
extern inthand_t IDTVEC(bpt), IDTVEC(bpt_pti), IDTVEC(dbg),
IDTVEC(fast_syscall), IDTVEC(fast_syscall_pti), IDTVEC(fast_syscall32),
IDTVEC(int0x80_syscall_pti), IDTVEC(int0x80_syscall);
void __noinline trap(struct trapframe *frame);
void trap_check(struct trapframe *frame);
void dblfault_handler(struct trapframe *frame);
@@ -535,6 +540,52 @@ trap(struct trapframe *frame)
load_dr6(rdr6() & ~0xf);
return;
}
/*
* Malicious user code can configure a debug
* register watchpoint to trap on data access
* to the top of stack and then execute 'pop
* %ss; int 3'. Due to exception deferral for
* 'pop %ss', the CPU will not interrupt 'int
* 3' to raise the DB# exception for the debug
* register but will postpone the DB# until
* execution of the first instruction of the
* BP# handler (in kernel mode). Normally the
* previous check would ignore DB# exceptions
* for watchpoints on user addresses raised in
* kernel mode. However, some CPU errata
* include cases where DB# exceptions do not
* properly set bits in %dr6, e.g. Haswell
* HSD23 and Skylake-X SKZ24.
*
* A deferred DB# can also be raised on the
* first instructions of system call entry
* points or single-step traps via similar use
* of 'pop %ss' or 'mov xxx, %ss'.
*/
if (pti) {
if (frame->tf_rip ==
(uintptr_t)IDTVEC(fast_syscall_pti) ||
#ifdef COMPAT_FREEBSD32
frame->tf_rip ==
(uintptr_t)IDTVEC(int0x80_syscall_pti) ||
#endif
frame->tf_rip == (uintptr_t)IDTVEC(bpt_pti))
return;
} else {
if (frame->tf_rip ==
(uintptr_t)IDTVEC(fast_syscall) ||
#ifdef COMPAT_FREEBSD32
frame->tf_rip ==
(uintptr_t)IDTVEC(int0x80_syscall) ||
#endif
frame->tf_rip == (uintptr_t)IDTVEC(bpt))
return;
}
if (frame->tf_rip == (uintptr_t)IDTVEC(dbg) ||
/* Needed for AMD. */
frame->tf_rip == (uintptr_t)IDTVEC(fast_syscall32))
return;
/*
* FALLTHROUGH (TRCTRAP kernel mode, kernel address)
*/

View File

@@ -103,8 +103,6 @@ MCOUNT_LABEL(btrap)
IDTVEC(div)
pushl $0; TRAP(T_DIVIDE)
IDTVEC(dbg)
pushl $0; TRAP(T_TRCTRAP)
IDTVEC(bpt)
pushl $0; TRAP(T_BPTFLT)
IDTVEC(dtrace_ret)
@@ -287,6 +285,39 @@ norm_ill:
jmp alltraps
#endif
/*
* See the comment for the kernel T_TRCTRAP case in trap.c.
* The exception handler must be ready to execute with the wrong %cr3.
* We save the original %cr3 in frame->tf_err, similarly to the NMI and
* MCE handlers.
*/
IDTVEC(dbg)
pushl $0
pushl $T_TRCTRAP
PUSH_FRAME2
SET_KERNEL_SREGS
cld
movl %cr3, %eax
movl %eax, TF_ERR(%esp)
call 1f
1: popl %eax
movl (tramp_idleptd - 1b)(%eax), %eax
movl %eax, %cr3
FAKE_MCOUNT(TF_EIP(%esp))
testl $PSL_VM, TF_EFLAGS(%esp)
jnz dbg_user
testb $SEL_RPL_MASK,TF_CS(%esp)
jz calltrap
dbg_user:
NMOVE_STACKS
pushl %esp
movl $trap,%eax
call *%eax
add $4, %esp
movl $T_RESERVED, TF_TRAPNO(%esp)
MEXITCOUNT
jmp doreti
IDTVEC(mchk)
pushl $0
pushl $T_MCHK
@@ -469,6 +500,8 @@ doreti_exit:
je doreti_iret_nmi
cmpl $T_MCHK, TF_TRAPNO(%esp)
je doreti_iret_nmi
cmpl $T_TRCTRAP, TF_TRAPNO(%esp)
je doreti_iret_nmi
testl $SEL_RPL_MASK, TF_CS(%esp)
jz doreti_popl_fs
movl %esp, %esi

View File

@@ -118,6 +118,8 @@ static int trap_pfault(struct trapframe *, int, vm_offset_t);
static void trap_fatal(struct trapframe *, vm_offset_t);
void dblfault_handler(void);
extern inthand_t IDTVEC(bpt), IDTVEC(dbg), IDTVEC(int0x80_syscall);
#define MAX_TRAP_MSG 32
struct trap_data {
@@ -662,6 +664,36 @@ kernel_trctrap:
load_dr6(rdr6() & ~0xf);
return;
}
/*
* Malicious user code can configure a debug
* register watchpoint to trap on data access
* to the top of stack and then execute 'pop
* %ss; int 3'. Due to exception deferral for
* 'pop %ss', the CPU will not interrupt 'int
* 3' to raise the DB# exception for the debug
* register but will postpone the DB# until
* execution of the first instruction of the
* BP# handler (in kernel mode). Normally the
* previous check would ignore DB# exceptions
* for watchpoints on user addresses raised in
* kernel mode. However, some CPU errata
* include cases where DB# exceptions do not
* properly set bits in %dr6, e.g. Haswell
* HSD23 and Skylake-X SKZ24.
*
* A deferred DB# can also be raised on the
* first instructions of system call entry
* points or single-step traps via similar use
* of 'pop %ss' or 'mov xxx, %ss'.
*/
if (frame->tf_eip ==
(uintptr_t)IDTVEC(int0x80_syscall) + setidt_disp ||
frame->tf_eip == (uintptr_t)IDTVEC(bpt) +
setidt_disp ||
frame->tf_eip == (uintptr_t)IDTVEC(dbg) +
setidt_disp)
return;
/*
* FALLTHROUGH (TRCTRAP kernel mode, kernel address)
*/