Improve the lcall $7,$0 syscall emulation on amd64.

The current code, which copies the potential syscall arguments into the
current frame, puts an arbitrary limit on the number of syscall
arguments.  Apparently, mmap(2) and lseek(2) (?) require a larger
number.  However, the stack is only required to be mapped far enough to
contain the arguments actually needed by the syscall, so copying an
arbitrarily large number of words from the stack is not completely safe.

Use a different approach: convert the lcall frame into an int $0x80
frame in place, by doing the retl in the kernel.  This also makes it
possible to stop handling the vfork case specially, and to stop making
assumptions about %cs at syscall time.

Also, improve comments with the formulations provided by bde.

Reviewed and tested by:	bde
Sponsored by:	The FreeBSD Foundation
MFC after:	1 week
This commit is contained in:
Konstantin Belousov 2018-03-24 12:57:58 +00:00
parent e24e568336
commit a37d4032ed
2 changed files with 39 additions and 32 deletions

View File

@ -78,44 +78,23 @@ ia32_osigcode:
1:
jmp 1b
/*
* The lcall $7,$0 emulator cannot use the call gate that does an
* inter-privilege transition. The reason is that the call gate
* does not disable interrupts, and, before the swapgs is
* executed, we would have a window where the ring 0 code is
* executed with the wrong gsbase.
* Our lcall $7,$0 handler remains in user mode (ring 3), since lcalls
* don't change the interrupt mask, so if this one went directly to the
* kernel then there would be a window with interrupts enabled in kernel
* mode, and all interrupt handlers would have to be almost as complicated
* as the NMI handler to support this.
*
* Instead, set LDT descriptor 0 as code segment, which reflects
* the lcall $7,$0 back to ring 3 trampoline. The trampoline sets up
* the frame for int $0x80.
* Instead, convert the lcall to an int0x80 call. The kernel does most
* of the conversion by popping the lcall return values off the user
* stack and returning to them instead of to here, except when the
* conversion itself fails. Adjusting the stack here is impossible for
* vfork() and harder for other syscalls.
*/
ALIGN_TEXT
lcall_tramp:
cmpl $SYS_vfork,%eax
je 1f
pushl %ebp
movl %esp,%ebp
pushl 0x24(%ebp) /* arg 6 */
pushl 0x20(%ebp)
pushl 0x1c(%ebp)
pushl 0x18(%ebp)
pushl 0x14(%ebp)
pushl 0x10(%ebp) /* arg 1 */
subl $4,%esp /* gap */
int $0x80
leavel
lretl
1:
/*
* vfork handling is special and relies on the libc stub saving
* the return ip in %ecx. Also, we assume that the call was done
* with ucode32 selector in %cs.
*/
int $0x80
movl $0x33,4(%esp) /* GUCODE32_SEL | SEL_UPL */
movl %ecx,(%esp)
lretl
1: jmp 1b
#endif
ALIGN_TEXT

View File

@ -116,11 +116,39 @@ ia32_fetch_syscall_args(struct thread *td)
caddr_t params;
u_int32_t args[8], tmp;
int error, i;
#ifdef COMPAT_43
u_int32_t eip;
int cs;
#endif
p = td->td_proc;
frame = td->td_frame;
sa = &td->td_sa;
#ifdef COMPAT_43
if (__predict_false(frame->tf_cs == 7 && frame->tf_rip == 2)) {
/*
* In lcall $7,$0 after int $0x80. Convert the user
* frame to what it would be for a direct int 0x80 instead
* of lcall $7,$0, by popping the lcall return address.
*/
error = fueword32((void *)frame->tf_rsp, &eip);
if (error == -1)
return (EFAULT);
cs = fuword16((void *)(frame->tf_rsp + sizeof(u_int32_t)));
if (cs == -1)
return (EFAULT);
/*
* Unwind in-kernel frame after all stack frame pieces
* were successfully read.
*/
frame->tf_rip = eip;
frame->tf_cs = cs;
frame->tf_rsp += 2 * sizeof(u_int32_t);
}
#endif
params = (caddr_t)frame->tf_rsp + sizeof(u_int32_t);
sa->code = frame->tf_rax;