2003-05-14 04:10:49 +00:00
|
|
|
/*-
|
2017-11-18 14:26:50 +00:00
|
|
|
* SPDX-License-Identifier: BSD-4-Clause
|
|
|
|
*
|
2003-05-14 04:10:49 +00:00
|
|
|
* Copyright (C) 1994, David Greenman
|
|
|
|
* Copyright (c) 1990, 1993
|
|
|
|
* The Regents of the University of California. All rights reserved.
|
|
|
|
*
|
|
|
|
* This code is derived from software contributed to Berkeley by
|
|
|
|
* the University of Utah, and William Jolitz.
|
|
|
|
*
|
|
|
|
* Redistribution and use in source and binary forms, with or without
|
|
|
|
* modification, are permitted provided that the following conditions
|
|
|
|
* are met:
|
|
|
|
* 1. Redistributions of source code must retain the above copyright
|
|
|
|
* notice, this list of conditions and the following disclaimer.
|
|
|
|
* 2. Redistributions in binary form must reproduce the above copyright
|
|
|
|
* notice, this list of conditions and the following disclaimer in the
|
|
|
|
* documentation and/or other materials provided with the distribution.
|
|
|
|
* 3. All advertising materials mentioning features or use of this software
|
|
|
|
* must display the following acknowledgement:
|
|
|
|
* This product includes software developed by the University of
|
|
|
|
* California, Berkeley and its contributors.
|
|
|
|
* 4. Neither the name of the University nor the names of its contributors
|
|
|
|
* may be used to endorse or promote products derived from this software
|
|
|
|
* without specific prior written permission.
|
|
|
|
*
|
|
|
|
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
|
|
|
|
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
|
|
|
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
|
|
|
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
|
|
|
|
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
|
|
|
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
|
|
|
|
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
|
|
|
|
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
|
|
|
|
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
|
|
|
|
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
|
|
|
|
* SUCH DAMAGE.
|
|
|
|
*/
|
|
|
|
|
2003-07-25 21:19:19 +00:00
|
|
|
#include <sys/cdefs.h>
|
|
|
|
__FBSDID("$FreeBSD$");
|
|
|
|
|
2003-05-14 04:10:49 +00:00
|
|
|
/*
|
|
|
|
* 386 Trap and System call handling
|
|
|
|
*/
|
|
|
|
|
|
|
|
#include "opt_clock.h"
|
|
|
|
#include "opt_cpu.h"
|
|
|
|
#include "opt_isa.h"
|
|
|
|
|
|
|
|
#include <sys/param.h>
|
|
|
|
#include <sys/bus.h>
|
|
|
|
#include <sys/systm.h>
|
|
|
|
#include <sys/proc.h>
|
|
|
|
#include <sys/pioctl.h>
|
|
|
|
#include <sys/kernel.h>
|
|
|
|
#include <sys/ktr.h>
|
|
|
|
#include <sys/lock.h>
|
|
|
|
#include <sys/mutex.h>
|
Reorganize syscall entry and leave handling.
Extend struct sysvec with three new elements:
sv_fetch_syscall_args - the method to fetch syscall arguments from
usermode into struct syscall_args. The structure is machine-depended
(this might be reconsidered after all architectures are converted).
sv_set_syscall_retval - the method to set a return value for usermode
from the syscall. It is a generalization of
cpu_set_syscall_retval(9) to allow ABIs to override the way to set a
return value.
sv_syscallnames - the table of syscall names.
Use sv_set_syscall_retval in kern_sigsuspend() instead of hardcoding
the call to cpu_set_syscall_retval().
The new functions syscallenter(9) and syscallret(9) are provided that
use sv_*syscall* pointers and contain the common repeated code from
the syscall() implementations for the architecture-specific syscall
trap handlers.
Syscallenter() fetches arguments, calls syscall implementation from
ABI sysent table, and set up return frame. The end of syscall
bookkeeping is done by syscallret().
Take advantage of single place for MI syscall handling code and
implement ptrace_lwpinfo pl_flags PL_FLAG_SCE, PL_FLAG_SCX and
PL_FLAG_EXEC. The SCE and SCX flags notify the debugger that the
thread is stopped at syscall entry or return point respectively. The
EXEC flag augments SCX and notifies debugger that the process address
space was changed by one of exec(2)-family syscalls.
The i386, amd64, sparc64, sun4v, powerpc and ia64 syscall()s are
changed to use syscallenter()/syscallret(). MIPS and arm are not
converted and use the mostly unchanged syscall() implementation.
Reviewed by: jhb, marcel, marius, nwhitehorn, stas
Tested by: marcel (ia64), marius (sparc64), nwhitehorn (powerpc),
stas (mips)
MFC after: 1 month
2010-05-23 18:32:02 +00:00
|
|
|
#include <sys/proc.h>
|
2006-07-27 19:50:16 +00:00
|
|
|
#include <sys/ptrace.h>
|
2003-05-14 04:10:49 +00:00
|
|
|
#include <sys/resourcevar.h>
|
|
|
|
#include <sys/signalvar.h>
|
|
|
|
#include <sys/syscall.h>
|
|
|
|
#include <sys/sysctl.h>
|
|
|
|
#include <sys/sysent.h>
|
|
|
|
#include <sys/uio.h>
|
|
|
|
#include <sys/vmmeter.h>
|
2006-02-04 20:37:20 +00:00
|
|
|
#include <security/audit/audit.h>
|
2003-05-14 04:10:49 +00:00
|
|
|
|
|
|
|
#include <vm/vm.h>
|
|
|
|
#include <vm/vm_param.h>
|
|
|
|
#include <vm/pmap.h>
|
|
|
|
#include <vm/vm_kern.h>
|
|
|
|
#include <vm/vm_map.h>
|
|
|
|
#include <vm/vm_page.h>
|
|
|
|
#include <vm/vm_extern.h>
|
|
|
|
|
|
|
|
#include <machine/cpu.h>
|
2003-11-17 08:58:16 +00:00
|
|
|
#include <machine/intr_machdep.h>
|
2003-05-14 04:10:49 +00:00
|
|
|
#include <machine/md_var.h>
|
|
|
|
|
2011-04-01 11:16:29 +00:00
|
|
|
#include <compat/freebsd32/freebsd32_signal.h>
|
Reorganize syscall entry and leave handling.
Extend struct sysvec with three new elements:
sv_fetch_syscall_args - the method to fetch syscall arguments from
usermode into struct syscall_args. The structure is machine-depended
(this might be reconsidered after all architectures are converted).
sv_set_syscall_retval - the method to set a return value for usermode
from the syscall. It is a generalization of
cpu_set_syscall_retval(9) to allow ABIs to override the way to set a
return value.
sv_syscallnames - the table of syscall names.
Use sv_set_syscall_retval in kern_sigsuspend() instead of hardcoding
the call to cpu_set_syscall_retval().
The new functions syscallenter(9) and syscallret(9) are provided that
use sv_*syscall* pointers and contain the common repeated code from
the syscall() implementations for the architecture-specific syscall
trap handlers.
Syscallenter() fetches arguments, calls syscall implementation from
ABI sysent table, and set up return frame. The end of syscall
bookkeeping is done by syscallret().
Take advantage of single place for MI syscall handling code and
implement ptrace_lwpinfo pl_flags PL_FLAG_SCE, PL_FLAG_SCX and
PL_FLAG_EXEC. The SCE and SCX flags notify the debugger that the
thread is stopped at syscall entry or return point respectively. The
EXEC flag augments SCX and notifies debugger that the process address
space was changed by one of exec(2)-family syscalls.
The i386, amd64, sparc64, sun4v, powerpc and ia64 syscall()s are
changed to use syscallenter()/syscallret(). MIPS and arm are not
converted and use the mostly unchanged syscall() implementation.
Reviewed by: jhb, marcel, marius, nwhitehorn, stas
Tested by: marcel (ia64), marius (sparc64), nwhitehorn (powerpc),
stas (mips)
MFC after: 1 month
2010-05-23 18:32:02 +00:00
|
|
|
#include <compat/freebsd32/freebsd32_util.h>
|
2011-04-01 11:16:29 +00:00
|
|
|
#include <compat/ia32/ia32_signal.h>
|
|
|
|
#include <machine/psl.h>
|
|
|
|
#include <machine/segments.h>
|
|
|
|
#include <machine/specialreg.h>
|
|
|
|
#include <machine/sysarch.h>
|
|
|
|
#include <machine/frame.h>
|
|
|
|
#include <machine/md_var.h>
|
|
|
|
#include <machine/pcb.h>
|
|
|
|
#include <machine/cpufunc.h>
|
Reorganize syscall entry and leave handling.
Extend struct sysvec with three new elements:
sv_fetch_syscall_args - the method to fetch syscall arguments from
usermode into struct syscall_args. The structure is machine-depended
(this might be reconsidered after all architectures are converted).
sv_set_syscall_retval - the method to set a return value for usermode
from the syscall. It is a generalization of
cpu_set_syscall_retval(9) to allow ABIs to override the way to set a
return value.
sv_syscallnames - the table of syscall names.
Use sv_set_syscall_retval in kern_sigsuspend() instead of hardcoding
the call to cpu_set_syscall_retval().
The new functions syscallenter(9) and syscallret(9) are provided that
use sv_*syscall* pointers and contain the common repeated code from
the syscall() implementations for the architecture-specific syscall
trap handlers.
Syscallenter() fetches arguments, calls syscall implementation from
ABI sysent table, and set up return frame. The end of syscall
bookkeeping is done by syscallret().
Take advantage of single place for MI syscall handling code and
implement ptrace_lwpinfo pl_flags PL_FLAG_SCE, PL_FLAG_SCX and
PL_FLAG_EXEC. The SCE and SCX flags notify the debugger that the
thread is stopped at syscall entry or return point respectively. The
EXEC flag augments SCX and notifies debugger that the process address
space was changed by one of exec(2)-family syscalls.
The i386, amd64, sparc64, sun4v, powerpc and ia64 syscall()s are
changed to use syscallenter()/syscallret(). MIPS and arm are not
converted and use the mostly unchanged syscall() implementation.
Reviewed by: jhb, marcel, marius, nwhitehorn, stas
Tested by: marcel (ia64), marius (sparc64), nwhitehorn (powerpc),
stas (mips)
MFC after: 1 month
2010-05-23 18:32:02 +00:00
|
|
|
|
2003-05-14 04:10:49 +00:00
|
|
|
#define IDTVEC(name) __CONCAT(X,name)
|
|
|
|
|
PTI for amd64.
The implementation of the Kernel Page Table Isolation (KPTI) for
amd64, first version. It provides a workaround for the 'meltdown'
vulnerability. PTI is turned off by default for now, enable with the
loader tunable vm.pmap.pti=1.
The pmap page table is split into kernel-mode table and user-mode
table. Kernel-mode table is identical to the non-PTI table, while
usermode table is obtained from kernel table by leaving userspace
mappings intact, but only leaving the following parts of the kernel
mapped:
kernel text (but not modules text)
PCPU
GDT/IDT/user LDT/task structures
IST stacks for NMI and doublefault handlers.
Kernel switches to user page table before returning to usermode, and
restores full kernel page table on the entry. Initial kernel-mode
stack for PTI trampoline is allocated in PCPU, it is only 16
qwords. Kernel entry trampoline switches page tables. then the
hardware trap frame is copied to the normal kstack, and execution
continues.
IST stacks are kept mapped and no trampoline is needed for
NMI/doublefault, but of course page table switch is performed.
On return to usermode, the trampoline is used again, iret frame is
copied to the trampoline stack, page tables are switched and iretq is
executed. The case of iretq faulting due to the invalid usermode
context is tricky, since the frame for fault is appended to the
trampoline frame. Besides copying the fault frame and original
(corrupted) frame to kstack, the fault frame must be patched to make
it look as if the fault occured on the kstack, see the comment in
doret_iret detection code in trap().
Currently kernel pages which are mapped during trampoline operation
are identical for all pmaps. They are registered using
pmap_pti_add_kva(). Besides initial registrations done during boot,
LDT and non-common TSS segments are registered if user requested their
use. In principle, they can be installed into kernel page table per
pmap with some work. Similarly, PCPU can be hidden from userspace
mapping using trampoline PCPU page, but again I do not see much
benefits besides complexity.
PDPE pages for the kernel half of the user page tables are
pre-allocated during boot because we need to know pml4 entries which
are copied to the top-level paging structure page, in advance on a new
pmap creation. I enforce this to avoid iterating over the all
existing pmaps if a new PDPE page is needed for PTI kernel mappings.
The iteration is a known problematic operation on i386.
The need to flush hidden kernel translations on the switch to user
mode make global tables (PG_G) meaningless and even harming, so PG_G
use is disabled for PTI case. Our existing use of PCID is
incompatible with PTI and is automatically disabled if PTI is
enabled. PCID can be forced on only for developer's benefit.
MCE is known to be broken, it requires IST stack to operate completely
correctly even for non-PTI case, and absolutely needs dedicated IST
stack because MCE delivery while trampoline did not switched from PTI
stack is fatal. The fix is pending.
Reviewed by: markj (partially)
Tested by: pho (previous version)
Discussed with: jeff, jhb
Sponsored by: The FreeBSD Foundation
MFC after: 2 weeks
2018-01-17 11:44:21 +00:00
|
|
|
extern inthand_t IDTVEC(int0x80_syscall), IDTVEC(int0x80_syscall_pti),
|
|
|
|
IDTVEC(rsvd), IDTVEC(rsvd_pti);
|
2003-05-14 04:10:49 +00:00
|
|
|
|
2006-12-17 06:48:40 +00:00
|
|
|
void ia32_syscall(struct trapframe *frame); /* Called from asm code */
|
2003-05-14 04:10:49 +00:00
|
|
|
|
Reorganize syscall entry and leave handling.
Extend struct sysvec with three new elements:
sv_fetch_syscall_args - the method to fetch syscall arguments from
usermode into struct syscall_args. The structure is machine-depended
(this might be reconsidered after all architectures are converted).
sv_set_syscall_retval - the method to set a return value for usermode
from the syscall. It is a generalization of
cpu_set_syscall_retval(9) to allow ABIs to override the way to set a
return value.
sv_syscallnames - the table of syscall names.
Use sv_set_syscall_retval in kern_sigsuspend() instead of hardcoding
the call to cpu_set_syscall_retval().
The new functions syscallenter(9) and syscallret(9) are provided that
use sv_*syscall* pointers and contain the common repeated code from
the syscall() implementations for the architecture-specific syscall
trap handlers.
Syscallenter() fetches arguments, calls syscall implementation from
ABI sysent table, and set up return frame. The end of syscall
bookkeeping is done by syscallret().
Take advantage of single place for MI syscall handling code and
implement ptrace_lwpinfo pl_flags PL_FLAG_SCE, PL_FLAG_SCX and
PL_FLAG_EXEC. The SCE and SCX flags notify the debugger that the
thread is stopped at syscall entry or return point respectively. The
EXEC flag augments SCX and notifies debugger that the process address
space was changed by one of exec(2)-family syscalls.
The i386, amd64, sparc64, sun4v, powerpc and ia64 syscall()s are
changed to use syscallenter()/syscallret(). MIPS and arm are not
converted and use the mostly unchanged syscall() implementation.
Reviewed by: jhb, marcel, marius, nwhitehorn, stas
Tested by: marcel (ia64), marius (sparc64), nwhitehorn (powerpc),
stas (mips)
MFC after: 1 month
2010-05-23 18:32:02 +00:00
|
|
|
void
|
|
|
|
ia32_set_syscall_retval(struct thread *td, int error)
|
|
|
|
{
|
2010-01-23 11:45:35 +00:00
|
|
|
|
Reorganize syscall entry and leave handling.
Extend struct sysvec with three new elements:
sv_fetch_syscall_args - the method to fetch syscall arguments from
usermode into struct syscall_args. The structure is machine-depended
(this might be reconsidered after all architectures are converted).
sv_set_syscall_retval - the method to set a return value for usermode
from the syscall. It is a generalization of
cpu_set_syscall_retval(9) to allow ABIs to override the way to set a
return value.
sv_syscallnames - the table of syscall names.
Use sv_set_syscall_retval in kern_sigsuspend() instead of hardcoding
the call to cpu_set_syscall_retval().
The new functions syscallenter(9) and syscallret(9) are provided that
use sv_*syscall* pointers and contain the common repeated code from
the syscall() implementations for the architecture-specific syscall
trap handlers.
Syscallenter() fetches arguments, calls syscall implementation from
ABI sysent table, and set up return frame. The end of syscall
bookkeeping is done by syscallret().
Take advantage of single place for MI syscall handling code and
implement ptrace_lwpinfo pl_flags PL_FLAG_SCE, PL_FLAG_SCX and
PL_FLAG_EXEC. The SCE and SCX flags notify the debugger that the
thread is stopped at syscall entry or return point respectively. The
EXEC flag augments SCX and notifies debugger that the process address
space was changed by one of exec(2)-family syscalls.
The i386, amd64, sparc64, sun4v, powerpc and ia64 syscall()s are
changed to use syscallenter()/syscallret(). MIPS and arm are not
converted and use the mostly unchanged syscall() implementation.
Reviewed by: jhb, marcel, marius, nwhitehorn, stas
Tested by: marcel (ia64), marius (sparc64), nwhitehorn (powerpc),
stas (mips)
MFC after: 1 month
2010-05-23 18:32:02 +00:00
|
|
|
cpu_set_syscall_retval(td, error);
|
|
|
|
}
|
|
|
|
|
|
|
|
int
|
2017-06-12 21:03:23 +00:00
|
|
|
ia32_fetch_syscall_args(struct thread *td)
|
2010-01-23 11:45:35 +00:00
|
|
|
{
|
|
|
|
struct proc *p;
|
|
|
|
struct trapframe *frame;
|
2017-06-12 21:03:23 +00:00
|
|
|
struct syscall_args *sa;
|
Reorganize syscall entry and leave handling.
Extend struct sysvec with three new elements:
sv_fetch_syscall_args - the method to fetch syscall arguments from
usermode into struct syscall_args. The structure is machine-depended
(this might be reconsidered after all architectures are converted).
sv_set_syscall_retval - the method to set a return value for usermode
from the syscall. It is a generalization of
cpu_set_syscall_retval(9) to allow ABIs to override the way to set a
return value.
sv_syscallnames - the table of syscall names.
Use sv_set_syscall_retval in kern_sigsuspend() instead of hardcoding
the call to cpu_set_syscall_retval().
The new functions syscallenter(9) and syscallret(9) are provided that
use sv_*syscall* pointers and contain the common repeated code from
the syscall() implementations for the architecture-specific syscall
trap handlers.
Syscallenter() fetches arguments, calls syscall implementation from
ABI sysent table, and set up return frame. The end of syscall
bookkeeping is done by syscallret().
Take advantage of single place for MI syscall handling code and
implement ptrace_lwpinfo pl_flags PL_FLAG_SCE, PL_FLAG_SCX and
PL_FLAG_EXEC. The SCE and SCX flags notify the debugger that the
thread is stopped at syscall entry or return point respectively. The
EXEC flag augments SCX and notifies debugger that the process address
space was changed by one of exec(2)-family syscalls.
The i386, amd64, sparc64, sun4v, powerpc and ia64 syscall()s are
changed to use syscallenter()/syscallret(). MIPS and arm are not
converted and use the mostly unchanged syscall() implementation.
Reviewed by: jhb, marcel, marius, nwhitehorn, stas
Tested by: marcel (ia64), marius (sparc64), nwhitehorn (powerpc),
stas (mips)
MFC after: 1 month
2010-05-23 18:32:02 +00:00
|
|
|
caddr_t params;
|
2014-10-28 15:28:20 +00:00
|
|
|
u_int32_t args[8], tmp;
|
2010-01-23 11:45:35 +00:00
|
|
|
int error, i;
|
2018-03-24 12:57:58 +00:00
|
|
|
#ifdef COMPAT_43
|
|
|
|
u_int32_t eip;
|
|
|
|
int cs;
|
|
|
|
#endif
|
2003-05-14 04:10:49 +00:00
|
|
|
|
2010-01-23 11:45:35 +00:00
|
|
|
p = td->td_proc;
|
|
|
|
frame = td->td_frame;
|
2017-06-12 21:03:23 +00:00
|
|
|
sa = &td->td_sa;
|
2010-01-23 11:45:35 +00:00
|
|
|
|
2018-03-24 12:57:58 +00:00
|
|
|
#ifdef COMPAT_43
|
|
|
|
if (__predict_false(frame->tf_cs == 7 && frame->tf_rip == 2)) {
|
|
|
|
/*
|
|
|
|
* In lcall $7,$0 after int $0x80. Convert the user
|
|
|
|
* frame to what it would be for a direct int 0x80 instead
|
|
|
|
* of lcall $7,$0, by popping the lcall return address.
|
|
|
|
*/
|
|
|
|
error = fueword32((void *)frame->tf_rsp, &eip);
|
|
|
|
if (error == -1)
|
|
|
|
return (EFAULT);
|
|
|
|
cs = fuword16((void *)(frame->tf_rsp + sizeof(u_int32_t)));
|
|
|
|
if (cs == -1)
|
|
|
|
return (EFAULT);
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Unwind in-kernel frame after all stack frame pieces
|
|
|
|
* were successfully read.
|
|
|
|
*/
|
|
|
|
frame->tf_rip = eip;
|
|
|
|
frame->tf_cs = cs;
|
|
|
|
frame->tf_rsp += 2 * sizeof(u_int32_t);
|
2018-04-05 11:03:21 +00:00
|
|
|
frame->tf_err = 7; /* size of lcall $7,$0 */
|
2018-03-24 12:57:58 +00:00
|
|
|
}
|
|
|
|
#endif
|
|
|
|
|
Reorganize syscall entry and leave handling.
Extend struct sysvec with three new elements:
sv_fetch_syscall_args - the method to fetch syscall arguments from
usermode into struct syscall_args. The structure is machine-depended
(this might be reconsidered after all architectures are converted).
sv_set_syscall_retval - the method to set a return value for usermode
from the syscall. It is a generalization of
cpu_set_syscall_retval(9) to allow ABIs to override the way to set a
return value.
sv_syscallnames - the table of syscall names.
Use sv_set_syscall_retval in kern_sigsuspend() instead of hardcoding
the call to cpu_set_syscall_retval().
The new functions syscallenter(9) and syscallret(9) are provided that
use sv_*syscall* pointers and contain the common repeated code from
the syscall() implementations for the architecture-specific syscall
trap handlers.
Syscallenter() fetches arguments, calls syscall implementation from
ABI sysent table, and set up return frame. The end of syscall
bookkeeping is done by syscallret().
Take advantage of single place for MI syscall handling code and
implement ptrace_lwpinfo pl_flags PL_FLAG_SCE, PL_FLAG_SCX and
PL_FLAG_EXEC. The SCE and SCX flags notify the debugger that the
thread is stopped at syscall entry or return point respectively. The
EXEC flag augments SCX and notifies debugger that the process address
space was changed by one of exec(2)-family syscalls.
The i386, amd64, sparc64, sun4v, powerpc and ia64 syscall()s are
changed to use syscallenter()/syscallret(). MIPS and arm are not
converted and use the mostly unchanged syscall() implementation.
Reviewed by: jhb, marcel, marius, nwhitehorn, stas
Tested by: marcel (ia64), marius (sparc64), nwhitehorn (powerpc),
stas (mips)
MFC after: 1 month
2010-05-23 18:32:02 +00:00
|
|
|
params = (caddr_t)frame->tf_rsp + sizeof(u_int32_t);
|
2010-01-23 11:45:35 +00:00
|
|
|
sa->code = frame->tf_rax;
|
2003-05-14 04:10:49 +00:00
|
|
|
|
Reorganize syscall entry and leave handling.
Extend struct sysvec with three new elements:
sv_fetch_syscall_args - the method to fetch syscall arguments from
usermode into struct syscall_args. The structure is machine-depended
(this might be reconsidered after all architectures are converted).
sv_set_syscall_retval - the method to set a return value for usermode
from the syscall. It is a generalization of
cpu_set_syscall_retval(9) to allow ABIs to override the way to set a
return value.
sv_syscallnames - the table of syscall names.
Use sv_set_syscall_retval in kern_sigsuspend() instead of hardcoding
the call to cpu_set_syscall_retval().
The new functions syscallenter(9) and syscallret(9) are provided that
use sv_*syscall* pointers and contain the common repeated code from
the syscall() implementations for the architecture-specific syscall
trap handlers.
Syscallenter() fetches arguments, calls syscall implementation from
ABI sysent table, and set up return frame. The end of syscall
bookkeeping is done by syscallret().
Take advantage of single place for MI syscall handling code and
implement ptrace_lwpinfo pl_flags PL_FLAG_SCE, PL_FLAG_SCX and
PL_FLAG_EXEC. The SCE and SCX flags notify the debugger that the
thread is stopped at syscall entry or return point respectively. The
EXEC flag augments SCX and notifies debugger that the process address
space was changed by one of exec(2)-family syscalls.
The i386, amd64, sparc64, sun4v, powerpc and ia64 syscall()s are
changed to use syscallenter()/syscallret(). MIPS and arm are not
converted and use the mostly unchanged syscall() implementation.
Reviewed by: jhb, marcel, marius, nwhitehorn, stas
Tested by: marcel (ia64), marius (sparc64), nwhitehorn (powerpc),
stas (mips)
MFC after: 1 month
2010-05-23 18:32:02 +00:00
|
|
|
/*
|
|
|
|
* Need to check if this is a 32 bit or 64 bit syscall.
|
|
|
|
*/
|
|
|
|
if (sa->code == SYS_syscall) {
|
2003-05-14 04:10:49 +00:00
|
|
|
/*
|
Reorganize syscall entry and leave handling.
Extend struct sysvec with three new elements:
sv_fetch_syscall_args - the method to fetch syscall arguments from
usermode into struct syscall_args. The structure is machine-depended
(this might be reconsidered after all architectures are converted).
sv_set_syscall_retval - the method to set a return value for usermode
from the syscall. It is a generalization of
cpu_set_syscall_retval(9) to allow ABIs to override the way to set a
return value.
sv_syscallnames - the table of syscall names.
Use sv_set_syscall_retval in kern_sigsuspend() instead of hardcoding
the call to cpu_set_syscall_retval().
The new functions syscallenter(9) and syscallret(9) are provided that
use sv_*syscall* pointers and contain the common repeated code from
the syscall() implementations for the architecture-specific syscall
trap handlers.
Syscallenter() fetches arguments, calls syscall implementation from
ABI sysent table, and set up return frame. The end of syscall
bookkeeping is done by syscallret().
Take advantage of single place for MI syscall handling code and
implement ptrace_lwpinfo pl_flags PL_FLAG_SCE, PL_FLAG_SCX and
PL_FLAG_EXEC. The SCE and SCX flags notify the debugger that the
thread is stopped at syscall entry or return point respectively. The
EXEC flag augments SCX and notifies debugger that the process address
space was changed by one of exec(2)-family syscalls.
The i386, amd64, sparc64, sun4v, powerpc and ia64 syscall()s are
changed to use syscallenter()/syscallret(). MIPS and arm are not
converted and use the mostly unchanged syscall() implementation.
Reviewed by: jhb, marcel, marius, nwhitehorn, stas
Tested by: marcel (ia64), marius (sparc64), nwhitehorn (powerpc),
stas (mips)
MFC after: 1 month
2010-05-23 18:32:02 +00:00
|
|
|
* Code is first argument, followed by actual args.
|
2003-05-14 04:10:49 +00:00
|
|
|
*/
|
2014-10-28 15:28:20 +00:00
|
|
|
error = fueword32(params, &tmp);
|
|
|
|
if (error == -1)
|
|
|
|
return (EFAULT);
|
|
|
|
sa->code = tmp;
|
Reorganize syscall entry and leave handling.
Extend struct sysvec with three new elements:
sv_fetch_syscall_args - the method to fetch syscall arguments from
usermode into struct syscall_args. The structure is machine-depended
(this might be reconsidered after all architectures are converted).
sv_set_syscall_retval - the method to set a return value for usermode
from the syscall. It is a generalization of
cpu_set_syscall_retval(9) to allow ABIs to override the way to set a
return value.
sv_syscallnames - the table of syscall names.
Use sv_set_syscall_retval in kern_sigsuspend() instead of hardcoding
the call to cpu_set_syscall_retval().
The new functions syscallenter(9) and syscallret(9) are provided that
use sv_*syscall* pointers and contain the common repeated code from
the syscall() implementations for the architecture-specific syscall
trap handlers.
Syscallenter() fetches arguments, calls syscall implementation from
ABI sysent table, and set up return frame. The end of syscall
bookkeeping is done by syscallret().
Take advantage of single place for MI syscall handling code and
implement ptrace_lwpinfo pl_flags PL_FLAG_SCE, PL_FLAG_SCX and
PL_FLAG_EXEC. The SCE and SCX flags notify the debugger that the
thread is stopped at syscall entry or return point respectively. The
EXEC flag augments SCX and notifies debugger that the process address
space was changed by one of exec(2)-family syscalls.
The i386, amd64, sparc64, sun4v, powerpc and ia64 syscall()s are
changed to use syscallenter()/syscallret(). MIPS and arm are not
converted and use the mostly unchanged syscall() implementation.
Reviewed by: jhb, marcel, marius, nwhitehorn, stas
Tested by: marcel (ia64), marius (sparc64), nwhitehorn (powerpc),
stas (mips)
MFC after: 1 month
2010-05-23 18:32:02 +00:00
|
|
|
params += sizeof(int);
|
|
|
|
} else if (sa->code == SYS___syscall) {
|
2003-05-14 04:10:49 +00:00
|
|
|
/*
|
Reorganize syscall entry and leave handling.
Extend struct sysvec with three new elements:
sv_fetch_syscall_args - the method to fetch syscall arguments from
usermode into struct syscall_args. The structure is machine-depended
(this might be reconsidered after all architectures are converted).
sv_set_syscall_retval - the method to set a return value for usermode
from the syscall. It is a generalization of
cpu_set_syscall_retval(9) to allow ABIs to override the way to set a
return value.
sv_syscallnames - the table of syscall names.
Use sv_set_syscall_retval in kern_sigsuspend() instead of hardcoding
the call to cpu_set_syscall_retval().
The new functions syscallenter(9) and syscallret(9) are provided that
use sv_*syscall* pointers and contain the common repeated code from
the syscall() implementations for the architecture-specific syscall
trap handlers.
Syscallenter() fetches arguments, calls syscall implementation from
ABI sysent table, and set up return frame. The end of syscall
bookkeeping is done by syscallret().
Take advantage of single place for MI syscall handling code and
implement ptrace_lwpinfo pl_flags PL_FLAG_SCE, PL_FLAG_SCX and
PL_FLAG_EXEC. The SCE and SCX flags notify the debugger that the
thread is stopped at syscall entry or return point respectively. The
EXEC flag augments SCX and notifies debugger that the process address
space was changed by one of exec(2)-family syscalls.
The i386, amd64, sparc64, sun4v, powerpc and ia64 syscall()s are
changed to use syscallenter()/syscallret(). MIPS and arm are not
converted and use the mostly unchanged syscall() implementation.
Reviewed by: jhb, marcel, marius, nwhitehorn, stas
Tested by: marcel (ia64), marius (sparc64), nwhitehorn (powerpc),
stas (mips)
MFC after: 1 month
2010-05-23 18:32:02 +00:00
|
|
|
* Like syscall, but code is a quad, so as to maintain
|
|
|
|
* quad alignment for the rest of the arguments.
|
|
|
|
* We use a 32-bit fetch in case params is not
|
|
|
|
* aligned.
|
2003-05-14 04:10:49 +00:00
|
|
|
*/
|
2014-10-28 15:28:20 +00:00
|
|
|
error = fueword32(params, &tmp);
|
|
|
|
if (error == -1)
|
|
|
|
return (EFAULT);
|
|
|
|
sa->code = tmp;
|
Reorganize syscall entry and leave handling.
Extend struct sysvec with three new elements:
sv_fetch_syscall_args - the method to fetch syscall arguments from
usermode into struct syscall_args. The structure is machine-depended
(this might be reconsidered after all architectures are converted).
sv_set_syscall_retval - the method to set a return value for usermode
from the syscall. It is a generalization of
cpu_set_syscall_retval(9) to allow ABIs to override the way to set a
return value.
sv_syscallnames - the table of syscall names.
Use sv_set_syscall_retval in kern_sigsuspend() instead of hardcoding
the call to cpu_set_syscall_retval().
The new functions syscallenter(9) and syscallret(9) are provided that
use sv_*syscall* pointers and contain the common repeated code from
the syscall() implementations for the architecture-specific syscall
trap handlers.
Syscallenter() fetches arguments, calls syscall implementation from
ABI sysent table, and set up return frame. The end of syscall
bookkeeping is done by syscallret().
Take advantage of single place for MI syscall handling code and
implement ptrace_lwpinfo pl_flags PL_FLAG_SCE, PL_FLAG_SCX and
PL_FLAG_EXEC. The SCE and SCX flags notify the debugger that the
thread is stopped at syscall entry or return point respectively. The
EXEC flag augments SCX and notifies debugger that the process address
space was changed by one of exec(2)-family syscalls.
The i386, amd64, sparc64, sun4v, powerpc and ia64 syscall()s are
changed to use syscallenter()/syscallret(). MIPS and arm are not
converted and use the mostly unchanged syscall() implementation.
Reviewed by: jhb, marcel, marius, nwhitehorn, stas
Tested by: marcel (ia64), marius (sparc64), nwhitehorn (powerpc),
stas (mips)
MFC after: 1 month
2010-05-23 18:32:02 +00:00
|
|
|
params += sizeof(quad_t);
|
2003-05-14 04:10:49 +00:00
|
|
|
}
|
|
|
|
if (p->p_sysent->sv_mask)
|
2010-01-23 11:45:35 +00:00
|
|
|
sa->code &= p->p_sysent->sv_mask;
|
|
|
|
if (sa->code >= p->p_sysent->sv_size)
|
|
|
|
sa->callp = &p->p_sysent->sv_table[0];
|
2003-05-14 04:10:49 +00:00
|
|
|
else
|
2010-01-23 11:45:35 +00:00
|
|
|
sa->callp = &p->p_sysent->sv_table[sa->code];
|
|
|
|
sa->narg = sa->callp->sy_narg;
|
2003-05-14 04:10:49 +00:00
|
|
|
|
Reorganize syscall entry and leave handling.
Extend struct sysvec with three new elements:
sv_fetch_syscall_args - the method to fetch syscall arguments from
usermode into struct syscall_args. The structure is machine-depended
(this might be reconsidered after all architectures are converted).
sv_set_syscall_retval - the method to set a return value for usermode
from the syscall. It is a generalization of
cpu_set_syscall_retval(9) to allow ABIs to override the way to set a
return value.
sv_syscallnames - the table of syscall names.
Use sv_set_syscall_retval in kern_sigsuspend() instead of hardcoding
the call to cpu_set_syscall_retval().
The new functions syscallenter(9) and syscallret(9) are provided that
use sv_*syscall* pointers and contain the common repeated code from
the syscall() implementations for the architecture-specific syscall
trap handlers.
Syscallenter() fetches arguments, calls syscall implementation from
ABI sysent table, and set up return frame. The end of syscall
bookkeeping is done by syscallret().
Take advantage of single place for MI syscall handling code and
implement ptrace_lwpinfo pl_flags PL_FLAG_SCE, PL_FLAG_SCX and
PL_FLAG_EXEC. The SCE and SCX flags notify the debugger that the
thread is stopped at syscall entry or return point respectively. The
EXEC flag augments SCX and notifies debugger that the process address
space was changed by one of exec(2)-family syscalls.
The i386, amd64, sparc64, sun4v, powerpc and ia64 syscall()s are
changed to use syscallenter()/syscallret(). MIPS and arm are not
converted and use the mostly unchanged syscall() implementation.
Reviewed by: jhb, marcel, marius, nwhitehorn, stas
Tested by: marcel (ia64), marius (sparc64), nwhitehorn (powerpc),
stas (mips)
MFC after: 1 month
2010-05-23 18:32:02 +00:00
|
|
|
if (params != NULL && sa->narg != 0)
|
|
|
|
error = copyin(params, (caddr_t)args,
|
2010-01-23 11:45:35 +00:00
|
|
|
(u_int)(sa->narg * sizeof(int)));
|
2003-05-14 04:10:49 +00:00
|
|
|
else
|
|
|
|
error = 0;
|
|
|
|
|
2010-01-23 11:45:35 +00:00
|
|
|
for (i = 0; i < sa->narg; i++)
|
Reorganize syscall entry and leave handling.
Extend struct sysvec with three new elements:
sv_fetch_syscall_args - the method to fetch syscall arguments from
usermode into struct syscall_args. The structure is machine-depended
(this might be reconsidered after all architectures are converted).
sv_set_syscall_retval - the method to set a return value for usermode
from the syscall. It is a generalization of
cpu_set_syscall_retval(9) to allow ABIs to override the way to set a
return value.
sv_syscallnames - the table of syscall names.
Use sv_set_syscall_retval in kern_sigsuspend() instead of hardcoding
the call to cpu_set_syscall_retval().
The new functions syscallenter(9) and syscallret(9) are provided that
use sv_*syscall* pointers and contain the common repeated code from
the syscall() implementations for the architecture-specific syscall
trap handlers.
Syscallenter() fetches arguments, calls syscall implementation from
ABI sysent table, and set up return frame. The end of syscall
bookkeeping is done by syscallret().
Take advantage of single place for MI syscall handling code and
implement ptrace_lwpinfo pl_flags PL_FLAG_SCE, PL_FLAG_SCX and
PL_FLAG_EXEC. The SCE and SCX flags notify the debugger that the
thread is stopped at syscall entry or return point respectively. The
EXEC flag augments SCX and notifies debugger that the process address
space was changed by one of exec(2)-family syscalls.
The i386, amd64, sparc64, sun4v, powerpc and ia64 syscall()s are
changed to use syscallenter()/syscallret(). MIPS and arm are not
converted and use the mostly unchanged syscall() implementation.
Reviewed by: jhb, marcel, marius, nwhitehorn, stas
Tested by: marcel (ia64), marius (sparc64), nwhitehorn (powerpc),
stas (mips)
MFC after: 1 month
2010-05-23 18:32:02 +00:00
|
|
|
sa->args[i] = args[i];
|
2003-05-14 04:10:49 +00:00
|
|
|
|
Reorganize syscall entry and leave handling.
Extend struct sysvec with three new elements:
sv_fetch_syscall_args - the method to fetch syscall arguments from
usermode into struct syscall_args. The structure is machine-depended
(this might be reconsidered after all architectures are converted).
sv_set_syscall_retval - the method to set a return value for usermode
from the syscall. It is a generalization of
cpu_set_syscall_retval(9) to allow ABIs to override the way to set a
return value.
sv_syscallnames - the table of syscall names.
Use sv_set_syscall_retval in kern_sigsuspend() instead of hardcoding
the call to cpu_set_syscall_retval().
The new functions syscallenter(9) and syscallret(9) are provided that
use sv_*syscall* pointers and contain the common repeated code from
the syscall() implementations for the architecture-specific syscall
trap handlers.
Syscallenter() fetches arguments, calls syscall implementation from
ABI sysent table, and set up return frame. The end of syscall
bookkeeping is done by syscallret().
Take advantage of single place for MI syscall handling code and
implement ptrace_lwpinfo pl_flags PL_FLAG_SCE, PL_FLAG_SCX and
PL_FLAG_EXEC. The SCE and SCX flags notify the debugger that the
thread is stopped at syscall entry or return point respectively. The
EXEC flag augments SCX and notifies debugger that the process address
space was changed by one of exec(2)-family syscalls.
The i386, amd64, sparc64, sun4v, powerpc and ia64 syscall()s are
changed to use syscallenter()/syscallret(). MIPS and arm are not
converted and use the mostly unchanged syscall() implementation.
Reviewed by: jhb, marcel, marius, nwhitehorn, stas
Tested by: marcel (ia64), marius (sparc64), nwhitehorn (powerpc),
stas (mips)
MFC after: 1 month
2010-05-23 18:32:02 +00:00
|
|
|
if (error == 0) {
|
|
|
|
td->td_retval[0] = 0;
|
|
|
|
td->td_retval[1] = frame->tf_rdx;
|
|
|
|
}
|
2010-01-23 11:45:35 +00:00
|
|
|
|
|
|
|
return (error);
|
|
|
|
}
|
|
|
|
|
2011-09-11 16:05:09 +00:00
|
|
|
#include "../../kern/subr_syscall.c"
|
|
|
|
|
2010-01-23 11:45:35 +00:00
|
|
|
void
|
|
|
|
ia32_syscall(struct trapframe *frame)
|
|
|
|
{
|
|
|
|
struct thread *td;
|
|
|
|
register_t orig_tf_rflags;
|
|
|
|
int error;
|
|
|
|
ksiginfo_t ksi;
|
|
|
|
|
Reorganize syscall entry and leave handling.
Extend struct sysvec with three new elements:
sv_fetch_syscall_args - the method to fetch syscall arguments from
usermode into struct syscall_args. The structure is machine-depended
(this might be reconsidered after all architectures are converted).
sv_set_syscall_retval - the method to set a return value for usermode
from the syscall. It is a generalization of
cpu_set_syscall_retval(9) to allow ABIs to override the way to set a
return value.
sv_syscallnames - the table of syscall names.
Use sv_set_syscall_retval in kern_sigsuspend() instead of hardcoding
the call to cpu_set_syscall_retval().
The new functions syscallenter(9) and syscallret(9) are provided that
use sv_*syscall* pointers and contain the common repeated code from
the syscall() implementations for the architecture-specific syscall
trap handlers.
Syscallenter() fetches arguments, calls syscall implementation from
ABI sysent table, and set up return frame. The end of syscall
bookkeeping is done by syscallret().
Take advantage of single place for MI syscall handling code and
implement ptrace_lwpinfo pl_flags PL_FLAG_SCE, PL_FLAG_SCX and
PL_FLAG_EXEC. The SCE and SCX flags notify the debugger that the
thread is stopped at syscall entry or return point respectively. The
EXEC flag augments SCX and notifies debugger that the process address
space was changed by one of exec(2)-family syscalls.
The i386, amd64, sparc64, sun4v, powerpc and ia64 syscall()s are
changed to use syscallenter()/syscallret(). MIPS and arm are not
converted and use the mostly unchanged syscall() implementation.
Reviewed by: jhb, marcel, marius, nwhitehorn, stas
Tested by: marcel (ia64), marius (sparc64), nwhitehorn (powerpc),
stas (mips)
MFC after: 1 month
2010-05-23 18:32:02 +00:00
|
|
|
orig_tf_rflags = frame->tf_rflags;
|
2010-01-23 11:45:35 +00:00
|
|
|
td = curthread;
|
|
|
|
td->td_frame = frame;
|
2003-05-14 04:10:49 +00:00
|
|
|
|
2017-06-12 21:03:23 +00:00
|
|
|
error = syscallenter(td);
|
2003-05-14 04:10:49 +00:00
|
|
|
|
|
|
|
/*
|
|
|
|
* Traced syscall.
|
|
|
|
*/
|
|
|
|
if (orig_tf_rflags & PSL_T) {
|
2006-12-17 06:48:40 +00:00
|
|
|
frame->tf_rflags &= ~PSL_T;
|
1. Change prototype of trapsignal and sendsig to use ksiginfo_t *, most
changes in MD code are trivial, before this change, trapsignal and
sendsig use discrete parameters, now they uses member fields of
ksiginfo_t structure. For sendsig, this change allows us to pass
POSIX realtime signal value to user code.
2. Remove cpu_thread_siginfo, it is no longer needed because we now always
generate ksiginfo_t data and feed it to libpthread.
3. Add p_sigqueue to proc structure to hold shared signals which were
blocked by all threads in the proc.
4. Add td_sigqueue to thread structure to hold all signals delivered to
thread.
5. i386 and amd64 now return POSIX standard si_code, other arches will
be fixed.
6. In this sigqueue implementation, pending signal set is kept as before,
an extra siginfo list holds additional siginfo_t data for signals.
kernel code uses psignal() still behavior as before, it won't be failed
even under memory pressure, only exception is when deleting a signal,
we should call sigqueue_delete to remove signal from sigqueue but
not SIGDELSET. Current there is no kernel code will deliver a signal
with additional data, so kernel should be as stable as before,
a ksiginfo can carry more information, for example, allow signal to
be delivered but throw away siginfo data if memory is not enough.
SIGKILL and SIGSTOP have fast path in sigqueue_add, because they can
not be caught or masked.
The sigqueue() syscall allows user code to queue a signal to target
process, if resource is unavailable, EAGAIN will be returned as
specification said.
Just before thread exits, signal queue memory will be freed by
sigqueue_flush.
Current, all signals are allowed to be queued, not only realtime signals.
Earlier patch reviewed by: jhb, deischen
Tested on: i386, amd64
2005-10-14 12:43:47 +00:00
|
|
|
ksiginfo_init_trap(&ksi);
|
|
|
|
ksi.ksi_signo = SIGTRAP;
|
|
|
|
ksi.ksi_code = TRAP_TRACE;
|
2006-12-17 06:48:40 +00:00
|
|
|
ksi.ksi_addr = (void *)frame->tf_rip;
|
1. Change prototype of trapsignal and sendsig to use ksiginfo_t *, most
changes in MD code are trivial, before this change, trapsignal and
sendsig use discrete parameters, now they uses member fields of
ksiginfo_t structure. For sendsig, this change allows us to pass
POSIX realtime signal value to user code.
2. Remove cpu_thread_siginfo, it is no longer needed because we now always
generate ksiginfo_t data and feed it to libpthread.
3. Add p_sigqueue to proc structure to hold shared signals which were
blocked by all threads in the proc.
4. Add td_sigqueue to thread structure to hold all signals delivered to
thread.
5. i386 and amd64 now return POSIX standard si_code, other arches will
be fixed.
6. In this sigqueue implementation, pending signal set is kept as before,
an extra siginfo list holds additional siginfo_t data for signals.
kernel code uses psignal() still behavior as before, it won't be failed
even under memory pressure, only exception is when deleting a signal,
we should call sigqueue_delete to remove signal from sigqueue but
not SIGDELSET. Current there is no kernel code will deliver a signal
with additional data, so kernel should be as stable as before,
a ksiginfo can carry more information, for example, allow signal to
be delivered but throw away siginfo data if memory is not enough.
SIGKILL and SIGSTOP have fast path in sigqueue_add, because they can
not be caught or masked.
The sigqueue() syscall allows user code to queue a signal to target
process, if resource is unavailable, EAGAIN will be returned as
specification said.
Just before thread exits, signal queue memory will be freed by
sigqueue_flush.
Current, all signals are allowed to be queued, not only realtime signals.
Earlier patch reviewed by: jhb, deischen
Tested on: i386, amd64
2005-10-14 12:43:47 +00:00
|
|
|
trapsignal(td, &ksi);
|
2003-05-14 04:10:49 +00:00
|
|
|
}
|
|
|
|
|
2017-06-12 21:03:23 +00:00
|
|
|
syscallret(td, error);
|
2003-05-14 04:10:49 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
static void
|
|
|
|
ia32_syscall_enable(void *dummy)
|
|
|
|
{
|
|
|
|
|
PTI for amd64.
The implementation of the Kernel Page Table Isolation (KPTI) for
amd64, first version. It provides a workaround for the 'meltdown'
vulnerability. PTI is turned off by default for now, enable with the
loader tunable vm.pmap.pti=1.
The pmap page table is split into kernel-mode table and user-mode
table. Kernel-mode table is identical to the non-PTI table, while
usermode table is obtained from kernel table by leaving userspace
mappings intact, but only leaving the following parts of the kernel
mapped:
kernel text (but not modules text)
PCPU
GDT/IDT/user LDT/task structures
IST stacks for NMI and doublefault handlers.
Kernel switches to user page table before returning to usermode, and
restores full kernel page table on the entry. Initial kernel-mode
stack for PTI trampoline is allocated in PCPU, it is only 16
qwords. Kernel entry trampoline switches page tables. then the
hardware trap frame is copied to the normal kstack, and execution
continues.
IST stacks are kept mapped and no trampoline is needed for
NMI/doublefault, but of course page table switch is performed.
On return to usermode, the trampoline is used again, iret frame is
copied to the trampoline stack, page tables are switched and iretq is
executed. The case of iretq faulting due to the invalid usermode
context is tricky, since the frame for fault is appended to the
trampoline frame. Besides copying the fault frame and original
(corrupted) frame to kstack, the fault frame must be patched to make
it look as if the fault occured on the kstack, see the comment in
doret_iret detection code in trap().
Currently kernel pages which are mapped during trampoline operation
are identical for all pmaps. They are registered using
pmap_pti_add_kva(). Besides initial registrations done during boot,
LDT and non-common TSS segments are registered if user requested their
use. In principle, they can be installed into kernel page table per
pmap with some work. Similarly, PCPU can be hidden from userspace
mapping using trampoline PCPU page, but again I do not see much
benefits besides complexity.
PDPE pages for the kernel half of the user page tables are
pre-allocated during boot because we need to know pml4 entries which
are copied to the top-level paging structure page, in advance on a new
pmap creation. I enforce this to avoid iterating over the all
existing pmaps if a new PDPE page is needed for PTI kernel mappings.
The iteration is a known problematic operation on i386.
The need to flush hidden kernel translations on the switch to user
mode make global tables (PG_G) meaningless and even harming, so PG_G
use is disabled for PTI case. Our existing use of PCID is
incompatible with PTI and is automatically disabled if PTI is
enabled. PCID can be forced on only for developer's benefit.
MCE is known to be broken, it requires IST stack to operate completely
correctly even for non-PTI case, and absolutely needs dedicated IST
stack because MCE delivery while trampoline did not switched from PTI
stack is fatal. The fix is pending.
Reviewed by: markj (partially)
Tested by: pho (previous version)
Discussed with: jeff, jhb
Sponsored by: The FreeBSD Foundation
MFC after: 2 weeks
2018-01-17 11:44:21 +00:00
|
|
|
setidt(IDT_SYSCALL, pti ? &IDTVEC(int0x80_syscall_pti) :
|
|
|
|
&IDTVEC(int0x80_syscall), SDT_SYSIGT, SEL_UPL, 0);
|
2003-05-14 04:10:49 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
static void
|
|
|
|
ia32_syscall_disable(void *dummy)
|
|
|
|
{
|
|
|
|
|
PTI for amd64.
The implementation of the Kernel Page Table Isolation (KPTI) for
amd64, first version. It provides a workaround for the 'meltdown'
vulnerability. PTI is turned off by default for now, enable with the
loader tunable vm.pmap.pti=1.
The pmap page table is split into kernel-mode table and user-mode
table. Kernel-mode table is identical to the non-PTI table, while
usermode table is obtained from kernel table by leaving userspace
mappings intact, but only leaving the following parts of the kernel
mapped:
kernel text (but not modules text)
PCPU
GDT/IDT/user LDT/task structures
IST stacks for NMI and doublefault handlers.
Kernel switches to user page table before returning to usermode, and
restores full kernel page table on the entry. Initial kernel-mode
stack for PTI trampoline is allocated in PCPU, it is only 16
qwords. Kernel entry trampoline switches page tables. then the
hardware trap frame is copied to the normal kstack, and execution
continues.
IST stacks are kept mapped and no trampoline is needed for
NMI/doublefault, but of course page table switch is performed.
On return to usermode, the trampoline is used again, iret frame is
copied to the trampoline stack, page tables are switched and iretq is
executed. The case of iretq faulting due to the invalid usermode
context is tricky, since the frame for fault is appended to the
trampoline frame. Besides copying the fault frame and original
(corrupted) frame to kstack, the fault frame must be patched to make
it look as if the fault occured on the kstack, see the comment in
doret_iret detection code in trap().
Currently kernel pages which are mapped during trampoline operation
are identical for all pmaps. They are registered using
pmap_pti_add_kva(). Besides initial registrations done during boot,
LDT and non-common TSS segments are registered if user requested their
use. In principle, they can be installed into kernel page table per
pmap with some work. Similarly, PCPU can be hidden from userspace
mapping using trampoline PCPU page, but again I do not see much
benefits besides complexity.
PDPE pages for the kernel half of the user page tables are
pre-allocated during boot because we need to know pml4 entries which
are copied to the top-level paging structure page, in advance on a new
pmap creation. I enforce this to avoid iterating over the all
existing pmaps if a new PDPE page is needed for PTI kernel mappings.
The iteration is a known problematic operation on i386.
The need to flush hidden kernel translations on the switch to user
mode make global tables (PG_G) meaningless and even harming, so PG_G
use is disabled for PTI case. Our existing use of PCID is
incompatible with PTI and is automatically disabled if PTI is
enabled. PCID can be forced on only for developer's benefit.
MCE is known to be broken, it requires IST stack to operate completely
correctly even for non-PTI case, and absolutely needs dedicated IST
stack because MCE delivery while trampoline did not switched from PTI
stack is fatal. The fix is pending.
Reviewed by: markj (partially)
Tested by: pho (previous version)
Discussed with: jeff, jhb
Sponsored by: The FreeBSD Foundation
MFC after: 2 weeks
2018-01-17 11:44:21 +00:00
|
|
|
setidt(IDT_SYSCALL, pti ? &IDTVEC(rsvd_pti) : &IDTVEC(rsvd),
|
|
|
|
SDT_SYSIGT, SEL_KPL, 0);
|
2003-05-14 04:10:49 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
SYSINIT(ia32_syscall, SI_SUB_EXEC, SI_ORDER_ANY, ia32_syscall_enable, NULL);
|
|
|
|
SYSUNINIT(ia32_syscall, SI_SUB_EXEC, SI_ORDER_ANY, ia32_syscall_disable, NULL);
|
2011-04-01 11:16:29 +00:00
|
|
|
|
|
|
|
#ifdef COMPAT_43
|
|
|
|
int
|
|
|
|
setup_lcall_gate(void)
|
|
|
|
{
|
|
|
|
struct i386_ldt_args uap;
|
2014-12-27 23:19:08 +00:00
|
|
|
struct user_segment_descriptor desc;
|
2011-04-01 11:16:29 +00:00
|
|
|
uint32_t lcall_addr;
|
|
|
|
int error;
|
|
|
|
|
|
|
|
bzero(&uap, sizeof(uap));
|
|
|
|
uap.start = 0;
|
2014-12-27 23:19:08 +00:00
|
|
|
uap.num = 1;
|
|
|
|
lcall_addr = curproc->p_sysent->sv_psstrings - sz_lcall_tramp;
|
|
|
|
bzero(&desc, sizeof(desc));
|
|
|
|
desc.sd_type = SDT_MEMERA;
|
|
|
|
desc.sd_dpl = SEL_UPL;
|
|
|
|
desc.sd_p = 1;
|
|
|
|
desc.sd_def32 = 1;
|
|
|
|
desc.sd_gran = 1;
|
|
|
|
desc.sd_lolimit = 0xffff;
|
|
|
|
desc.sd_hilimit = 0xf;
|
|
|
|
desc.sd_lobase = lcall_addr;
|
|
|
|
desc.sd_hibase = lcall_addr >> 24;
|
|
|
|
error = amd64_set_ldt(curthread, &uap, &desc);
|
2011-04-01 11:16:29 +00:00
|
|
|
if (error != 0)
|
|
|
|
return (error);
|
|
|
|
|
|
|
|
return (0);
|
|
|
|
}
|
|
|
|
#endif
|