Revert r323722. A better fix will be committed shortly, along with
some still-useful bits of the reverted revision.

The problem with the committed fix is that there are still issues with
returning from an NMI when the NMI interrupted the kernel at a moment
when the kernel segment selectors had not yet been loaded into the
registers.  If this happens, the NMI return would lose the userspace
selectors, because r323722 does not reload the segment registers on
return to kernel mode.

Fixing the problem is complicated.  Since an alternative approach to
handling the original bug exists, it makes sense to stop adding more
complexity.

Discussed with:	bde
Sponsored by:	The FreeBSD Foundation
MFC after:	1 week
This commit is contained in:
Konstantin Belousov 2017-09-28 08:38:24 +00:00
parent 352035746f
commit d3c968bf84
9 changed files with 105 additions and 134 deletions

View File

@ -189,7 +189,8 @@ IDTVEC(xen_intr_upcall)
SUPERALIGN_TEXT
invltlb_ret:
call as_lapic_eoi
jmp doreti
POP_FRAME
iret
SUPERALIGN_TEXT
IDTVEC(invltlb)
@ -273,7 +274,9 @@ IDTVEC(cpustop)
call as_lapic_eoi
call cpustop_handler
jmp doreti
POP_FRAME
iret
/*
* Executed by a CPU when it receives an IPI_SUSPEND from another CPU.
@ -287,7 +290,9 @@ IDTVEC(cpususpend)
call as_lapic_eoi
call cpususpend_handler
jmp doreti
POP_FRAME
jmp doreti_iret
/*
* Executed by a CPU when it receives a RENDEZVOUS IPI from another CPU.
@ -309,6 +314,7 @@ IDTVEC(rendezvous)
call smp_rendezvous_action
call as_lapic_eoi
jmp doreti
POP_FRAME
iret
#endif /* SMP */

View File

@ -326,7 +326,8 @@ db_nextframe(struct i386_frame **fp, db_addr_t *ip, struct thread *td)
else if (strncmp(name, "Xatpic_intr", 11) == 0 ||
strncmp(name, "Xapic_isr", 9) == 0)
frame_type = INTERRUPT;
else if (strcmp(name, "Xint0x80_syscall") == 0)
else if (strcmp(name, "Xlcall_syscall") == 0 ||
strcmp(name, "Xint0x80_syscall") == 0)
frame_type = SYSCALL;
else if (strcmp(name, "dblfault_handler") == 0)
frame_type = DOUBLE_FAULT;

View File

@ -98,16 +98,15 @@ MCOUNT_LABEL(user)
MCOUNT_LABEL(btrap)
#define TRAP(a) pushl $(a) ; jmp alltraps
#define TRAP_NOEN(a) pushl $(a) ; jmp alltraps_noen
IDTVEC(div)
pushl $0; TRAP(T_DIVIDE)
IDTVEC(dbg)
pushl $0; TRAP_NOEN(T_TRCTRAP)
pushl $0; TRAP(T_TRCTRAP)
IDTVEC(nmi)
pushl $0; TRAP_NOEN(T_NMI)
pushl $0; TRAP(T_NMI)
IDTVEC(bpt)
pushl $0; TRAP_NOEN(T_BPTFLT)
pushl $0; TRAP(T_BPTFLT)
IDTVEC(dtrace_ret)
pushl $0; TRAP(T_DTRACE_RET)
IDTVEC(ofl)
@ -131,7 +130,7 @@ IDTVEC(stk)
IDTVEC(prot)
TRAP(T_PROTFLT)
IDTVEC(page)
TRAP_NOEN(T_PAGEFLT)
TRAP(T_PAGEFLT)
IDTVEC(mchk)
pushl $0; TRAP(T_MCHK)
IDTVEC(rsvd)
@ -143,21 +142,6 @@ IDTVEC(align)
IDTVEC(xmm)
pushl $0; TRAP(T_XMMFLT)
SUPERALIGN_TEXT
.globl alltraps_noen
alltraps_noen:
pushal
pushl $0
movw %ds,(%esp)
pushl $0
movw %es,(%esp)
pushl $0
movw %fs,(%esp)
SET_KERNEL_SREGS
cld
FAKE_MCOUNT(TF_EIP(%esp))
jmp calltrap
/*
* All traps except ones for syscalls jump to alltraps. If
* interrupts were enabled when the trap occurred, then interrupts
@ -180,7 +164,6 @@ alltraps:
movw %fs,(%esp)
alltraps_with_regs_pushed:
SET_KERNEL_SREGS
sti
cld
FAKE_MCOUNT(TF_EIP(%esp))
calltrap:
@ -241,6 +224,40 @@ norm_ill:
TRAP(T_PRIVINFLT)
#endif
/*
* Call gate entry for syscalls (lcall 7,0).
* This is used by FreeBSD 1.x a.out executables and "old" NetBSD executables.
*
* The intersegment call has been set up to specify one dummy parameter.
* This leaves a place to put eflags so that the call frame can be
* converted to a trap frame. Note that the eflags is (semi-)bogusly
* pushed into (what will be) tf_err and then copied later into the
* final spot. It has to be done this way because esp can't be just
* temporarily altered for the pushfl - an interrupt might come in
* and clobber the saved cs/eip.
*/
SUPERALIGN_TEXT
IDTVEC(lcall_syscall)
pushfl /* save eflags */
popl 8(%esp) /* shuffle into tf_eflags */
pushl $7 /* sizeof "lcall 7,0" */
pushl $0 /* tf_trapno */
pushal
pushl $0
movw %ds,(%esp)
pushl $0
movw %es,(%esp)
pushl $0
movw %fs,(%esp)
SET_KERNEL_SREGS
cld
FAKE_MCOUNT(TF_EIP(%esp))
pushl %esp
call syscall
add $4, %esp
MEXITCOUNT
jmp doreti
/*
* Trap gate entry for syscalls (int 0x80).
* This is used by FreeBSD ELF executables, "new" NetBSD executables, and all
@ -262,7 +279,6 @@ IDTVEC(int0x80_syscall)
pushl $0
movw %fs,(%esp)
SET_KERNEL_SREGS
sti
cld
FAKE_MCOUNT(TF_EIP(%esp))
pushl %esp
@ -346,7 +362,7 @@ doreti_next:
#ifdef HWPMC_HOOKS
je doreti_nmi
#else
je doreti_notvm86
je doreti_exit
#endif
/*
* PSL_VM must be checked first since segment registers only
@ -362,7 +378,7 @@ doreti_next:
doreti_notvm86:
testb $SEL_RPL_MASK,TF_CS(%esp) /* are we returning to user mode? */
jz doreti_nosegs /* can't handle ASTs now if not */
jz doreti_exit /* can't handle ASTs now if not */
doreti_ast:
/*
@ -399,12 +415,6 @@ doreti_popl_es:
.globl doreti_popl_ds
doreti_popl_ds:
popl %ds
jmp doreti_iret_popal
doreti_nosegs:
MEXITCOUNT
addl $12,%esp
doreti_iret_popal:
popal
addl $8,%esp
.globl doreti_iret
@ -447,7 +457,7 @@ doreti_nmi:
* needs a user call chain capture.
*/
testb $SEL_RPL_MASK,TF_CS(%esp)
jz doreti_nosegs
jz doreti_exit
movl PCPU(CURTHREAD),%eax /* curthread present? */
orl %eax,%eax
jz doreti_exit

View File

@ -335,44 +335,6 @@ osigcode:
pushl %eax /* junk to fake return addr. */
int $0x80 /* enter kernel with args */
0: jmp 0b
/*
* The lcall $7,$0 handler cannot use the call gate that does an
* inter-privilege transition. The reason is that the call gate
* does not disable interrupts, and, before the kernel segment registers
* are loaded, we would have a window where the ring 0 code is
* executed with the wrong segments.
*
* Instead, set LDT descriptor 0 as code segment, which reflects
* the lcall $7,$0 back to ring 3 trampoline. The trampoline sets up
* the frame for int $0x80.
*/
ALIGN_TEXT
lcall_tramp:
cmpl $SYS_vfork,%eax
je 1f
pushl %ebp
movl %esp,%ebp
pushl 0x24(%ebp) /* arg 6 */
pushl 0x20(%ebp)
pushl 0x1c(%ebp)
pushl 0x18(%ebp)
pushl 0x14(%ebp)
pushl 0x10(%ebp) /* arg 1 */
subl $4,%esp /* gap */
int $0x80
leavel
lretl
1:
/*
* vfork handling is special and relies on the libc stub saving
* the return ip in %ecx. Also, we assume that the call was done
* with ucode32 selector in %cs.
*/
int $0x80
movl $0x33,4(%esp) /* GUCODE32_SEL | SEL_UPL */
movl %ecx,(%esp)
lretl
#endif /* COMPAT_43 */
ALIGN_TEXT
@ -391,9 +353,6 @@ szfreebsd4_sigcode:
.globl szosigcode
szosigcode:
.long esigcode-osigcode
.globl szlcallcode
szlcallcode:
.long esigcode-lcall_tramp
#endif
.text

View File

@ -1513,7 +1513,7 @@ extern inthand_t
#ifdef XENHVM
IDTVEC(xen_intr_upcall),
#endif
IDTVEC(int0x80_syscall);
IDTVEC(lcall_syscall), IDTVEC(int0x80_syscall);
#ifdef DDB
/*
@ -2157,9 +2157,7 @@ i386_kdb_init(void)
register_t
init386(int first)
{
#ifdef COMPAT_43
struct segment_descriptor *gdp;
#endif
struct gate_descriptor *gdp;
int gsel_tss, metadata_missing, x, pa;
struct pcpu *pc;
struct xstate_hdr *xhdr;
@ -2248,9 +2246,9 @@ init386(int first)
/* exceptions */
for (x = 0; x < NIDT; x++)
setidt(x, &IDTVEC(rsvd), SDT_SYS386IGT, SEL_KPL,
setidt(x, &IDTVEC(rsvd), SDT_SYS386TGT, SEL_KPL,
GSEL(GCODE_SEL, SEL_KPL));
setidt(IDT_DE, &IDTVEC(div), SDT_SYS386IGT, SEL_KPL,
setidt(IDT_DE, &IDTVEC(div), SDT_SYS386TGT, SEL_KPL,
GSEL(GCODE_SEL, SEL_KPL));
setidt(IDT_DB, &IDTVEC(dbg), SDT_SYS386IGT, SEL_KPL,
GSEL(GCODE_SEL, SEL_KPL));
@ -2258,39 +2256,39 @@ init386(int first)
GSEL(GCODE_SEL, SEL_KPL));
setidt(IDT_BP, &IDTVEC(bpt), SDT_SYS386IGT, SEL_UPL,
GSEL(GCODE_SEL, SEL_KPL));
setidt(IDT_OF, &IDTVEC(ofl), SDT_SYS386IGT, SEL_UPL,
setidt(IDT_OF, &IDTVEC(ofl), SDT_SYS386TGT, SEL_UPL,
GSEL(GCODE_SEL, SEL_KPL));
setidt(IDT_BR, &IDTVEC(bnd), SDT_SYS386IGT, SEL_KPL,
setidt(IDT_BR, &IDTVEC(bnd), SDT_SYS386TGT, SEL_KPL,
GSEL(GCODE_SEL, SEL_KPL));
setidt(IDT_UD, &IDTVEC(ill), SDT_SYS386IGT, SEL_KPL,
setidt(IDT_UD, &IDTVEC(ill), SDT_SYS386TGT, SEL_KPL,
GSEL(GCODE_SEL, SEL_KPL));
setidt(IDT_NM, &IDTVEC(dna), SDT_SYS386IGT, SEL_KPL
setidt(IDT_NM, &IDTVEC(dna), SDT_SYS386TGT, SEL_KPL
, GSEL(GCODE_SEL, SEL_KPL));
setidt(IDT_DF, 0, SDT_SYSTASKGT, SEL_KPL, GSEL(GPANIC_SEL, SEL_KPL));
setidt(IDT_FPUGP, &IDTVEC(fpusegm), SDT_SYS386IGT, SEL_KPL,
setidt(IDT_FPUGP, &IDTVEC(fpusegm), SDT_SYS386TGT, SEL_KPL,
GSEL(GCODE_SEL, SEL_KPL));
setidt(IDT_TS, &IDTVEC(tss), SDT_SYS386IGT, SEL_KPL,
setidt(IDT_TS, &IDTVEC(tss), SDT_SYS386TGT, SEL_KPL,
GSEL(GCODE_SEL, SEL_KPL));
setidt(IDT_NP, &IDTVEC(missing), SDT_SYS386IGT, SEL_KPL,
setidt(IDT_NP, &IDTVEC(missing), SDT_SYS386TGT, SEL_KPL,
GSEL(GCODE_SEL, SEL_KPL));
setidt(IDT_SS, &IDTVEC(stk), SDT_SYS386IGT, SEL_KPL,
setidt(IDT_SS, &IDTVEC(stk), SDT_SYS386TGT, SEL_KPL,
GSEL(GCODE_SEL, SEL_KPL));
setidt(IDT_GP, &IDTVEC(prot), SDT_SYS386IGT, SEL_KPL,
setidt(IDT_GP, &IDTVEC(prot), SDT_SYS386TGT, SEL_KPL,
GSEL(GCODE_SEL, SEL_KPL));
setidt(IDT_PF, &IDTVEC(page), SDT_SYS386IGT, SEL_KPL,
GSEL(GCODE_SEL, SEL_KPL));
setidt(IDT_MF, &IDTVEC(fpu), SDT_SYS386IGT, SEL_KPL,
setidt(IDT_MF, &IDTVEC(fpu), SDT_SYS386TGT, SEL_KPL,
GSEL(GCODE_SEL, SEL_KPL));
setidt(IDT_AC, &IDTVEC(align), SDT_SYS386IGT, SEL_KPL,
setidt(IDT_AC, &IDTVEC(align), SDT_SYS386TGT, SEL_KPL,
GSEL(GCODE_SEL, SEL_KPL));
setidt(IDT_MC, &IDTVEC(mchk), SDT_SYS386IGT, SEL_KPL,
setidt(IDT_MC, &IDTVEC(mchk), SDT_SYS386TGT, SEL_KPL,
GSEL(GCODE_SEL, SEL_KPL));
setidt(IDT_XF, &IDTVEC(xmm), SDT_SYS386IGT, SEL_KPL,
setidt(IDT_XF, &IDTVEC(xmm), SDT_SYS386TGT, SEL_KPL,
GSEL(GCODE_SEL, SEL_KPL));
setidt(IDT_SYSCALL, &IDTVEC(int0x80_syscall), SDT_SYS386IGT, SEL_UPL,
setidt(IDT_SYSCALL, &IDTVEC(int0x80_syscall), SDT_SYS386TGT, SEL_UPL,
GSEL(GCODE_SEL, SEL_KPL));
#ifdef KDTRACE_HOOKS
setidt(IDT_DTRACE_RET, &IDTVEC(dtrace_ret), SDT_SYS386IGT, SEL_UPL,
setidt(IDT_DTRACE_RET, &IDTVEC(dtrace_ret), SDT_SYS386TGT, SEL_UPL,
GSEL(GCODE_SEL, SEL_KPL));
#endif
#ifdef XENHVM
@ -2331,9 +2329,9 @@ init386(int first)
clock_init();
finishidentcpu(); /* Final stage of CPU initialization */
setidt(IDT_UD, &IDTVEC(ill), SDT_SYS386IGT, SEL_KPL,
setidt(IDT_UD, &IDTVEC(ill), SDT_SYS386TGT, SEL_KPL,
GSEL(GCODE_SEL, SEL_KPL));
setidt(IDT_GP, &IDTVEC(prot), SDT_SYS386IGT, SEL_KPL,
setidt(IDT_GP, &IDTVEC(prot), SDT_SYS386TGT, SEL_KPL,
GSEL(GCODE_SEL, SEL_KPL));
initializecpu(); /* Initialize CPU registers */
initializecpucache();
@ -2438,21 +2436,17 @@ init386(int first)
gdt[GPROC0_SEL].sd.sd_type = SDT_SYS386TSS; /* clear busy bit */
ltr(gsel_tss);
#ifdef COMPAT_43
/*
* Make a code descriptor to emulate lcall $7,$0 with int
* $0x80. sd_hibase and sd_lobase are set after the sigtramp
* base in the shared table is known.
*/
gdp = &ldt[LSYS5CALLS_SEL].sd;
gdp->sd_type = SDT_MEMERA;
gdp->sd_dpl = SEL_UPL;
gdp->sd_p = 1;
gdp->sd_def32 = 1;
gdp->sd_gran = 1;
gdp->sd_lolimit = 0xffff;
gdp->sd_hilimit = 0xf;
#endif
/* make a call gate to reenter kernel with */
gdp = &ldt[LSYS5CALLS_SEL].gd;
x = (int) &IDTVEC(lcall_syscall);
gdp->gd_looffset = x;
gdp->gd_selector = GSEL(GCODE_SEL,SEL_KPL);
gdp->gd_stkcpy = 1;
gdp->gd_type = SDT_SYS386CGT;
gdp->gd_dpl = SEL_UPL;
gdp->gd_p = 1;
gdp->gd_hioffset = x >> 16;
/* transfer to user mode */

View File

@ -114,6 +114,8 @@ static int trap_pfault(struct trapframe *, int, vm_offset_t);
static void trap_fatal(struct trapframe *, vm_offset_t);
void dblfault_handler(void);
extern inthand_t IDTVEC(lcall_syscall);
#define MAX_TRAP_MSG 32
static char *trap_msg[] = {
"", /* 0 unused */
@ -627,6 +629,23 @@ user_trctrap_out:
case T_TRCTRAP: /* trace trap */
kernel_trctrap:
if (frame->tf_eip == (int)IDTVEC(lcall_syscall)) {
/*
* We've just entered system mode via the
* syscall lcall. Continue single stepping
* silently until the syscall handler has
* saved the flags.
*/
return;
}
if (frame->tf_eip == (int)IDTVEC(lcall_syscall) + 1) {
/*
* The syscall handler has now saved the
* flags. Stop single stepping it.
*/
frame->tf_eflags &= ~PSL_T;
return;
}
/*
* Ignore debug register trace traps due to
* accesses in the user's address space, which

View File

@ -43,7 +43,6 @@ extern int szfreebsd4_sigcode;
#endif
#ifdef COMPAT_43
extern int szosigcode;
extern int szlcallcode;
#endif
extern uint32_t *vm_page_dump;

View File

@ -237,7 +237,7 @@ npx_probe(void)
}
save_idt_npxtrap = idt[IDT_MF];
setidt(IDT_MF, probetrap, SDT_SYS386IGT, SEL_KPL,
setidt(IDT_MF, probetrap, SDT_SYS386TGT, SEL_KPL,
GSEL(GCODE_SEL, SEL_KPL));
/*

View File

@ -27,8 +27,6 @@
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
#include "opt_compat.h"
#include <sys/param.h>
#include <sys/exec.h>
#include <sys/imgact.h>
@ -339,18 +337,3 @@ exec_aout_imgact(struct image_params *imgp)
*/
static struct execsw aout_execsw = { exec_aout_imgact, "a.out" };
EXEC_SET(aout, aout_execsw);
#if defined(__i386__) && defined(COMPAT_43)
static void
exec_init_lcall(void *arg __unused)
{
struct segment_descriptor *gdp;
u_int lcall_addr;
gdp = &ldt[LSYS5CALLS_SEL].sd;
lcall_addr = aout_sysvec.sv_psstrings - szlcallcode;
gdp->sd_hibase = lcall_addr >> 24;
gdp->sd_lobase = lcall_addr;
}
SYSINIT(aout, SI_SUB_EXEC + 1, SI_ORDER_ANY, exec_init_lcall, NULL);
#endif