a113b17f10
On machines with SMAP, fueword executes two serializing instructions, which can be seen in microbenchmarks. As a measure to restore microbenchmark numbers, only read the word on the attempt to deliver a signal in ast(). If the word is set, the signal is not delivered and the word is kept, preventing interruption of interruptible sleeps by signals until userspace calls sigfastblock(UNBLOCK), which clears the word. This way, the spurious EINTR that userspace can see while in a critical section is on the first interruptible sleep, if a signal is pending, and on signal posting. It is believed that this is not important for rtld and libthr critical sections. It might be visible to application code, e.g. to the callback of dl_iterate_phdr(3), but again the belief is that the non-compliance is acceptable. Most important is that the retry of the sleeping syscall is not interrupted unless an additional signal is posted. For now I added the knob kern.sigfastblock_fetch_always to enable the word read on syscall entry, to be able to diagnose possible issues due to spurious EINTR. While there, do some code restructuring to have all sigfastblock() handling located in kern_sig.c. Reviewed by: jeff Discussed with: mjg Tested by: pho Sponsored by: The FreeBSD Foundation Differential revision: https://reviews.freebsd.org/D23622
251 lines
7.4 KiB
C
251 lines
7.4 KiB
C
/*-
|
|
* SPDX-License-Identifier: BSD-4-Clause
|
|
*
|
|
* Copyright (C) 1994, David Greenman
|
|
* Copyright (c) 1990, 1993
|
|
* The Regents of the University of California. All rights reserved.
|
|
* Copyright (C) 2010 Konstantin Belousov <kib@freebsd.org>
|
|
*
|
|
* This code is derived from software contributed to Berkeley by
|
|
* the University of Utah, and William Jolitz.
|
|
*
|
|
* Redistribution and use in source and binary forms, with or without
|
|
* modification, are permitted provided that the following conditions
|
|
* are met:
|
|
* 1. Redistributions of source code must retain the above copyright
|
|
* notice, this list of conditions and the following disclaimer.
|
|
* 2. Redistributions in binary form must reproduce the above copyright
|
|
* notice, this list of conditions and the following disclaimer in the
|
|
* documentation and/or other materials provided with the distribution.
|
|
* 3. All advertising materials mentioning features or use of this software
|
|
* must display the following acknowledgement:
|
|
* This product includes software developed by the University of
|
|
* California, Berkeley and its contributors.
|
|
* 4. Neither the name of the University nor the names of its contributors
|
|
* may be used to endorse or promote products derived from this software
|
|
* without specific prior written permission.
|
|
*
|
|
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
|
|
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
|
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
|
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
|
|
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
|
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
|
|
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
|
|
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
|
|
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
|
|
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
|
|
* SUCH DAMAGE.
|
|
*
|
|
* from: @(#)trap.c 7.4 (Berkeley) 5/13/91
|
|
*/
|
|
|
|
#include "opt_capsicum.h"
|
|
#include "opt_ktrace.h"
|
|
|
|
__FBSDID("$FreeBSD$");
|
|
|
|
#include <sys/capsicum.h>
|
|
#include <sys/ktr.h>
|
|
#include <sys/vmmeter.h>
|
|
#ifdef KTRACE
|
|
#include <sys/uio.h>
|
|
#include <sys/ktrace.h>
|
|
#endif
|
|
#include <security/audit/audit.h>
|
|
|
|
static inline void
|
|
syscallenter(struct thread *td)
|
|
{
|
|
struct proc *p;
|
|
struct syscall_args *sa;
|
|
int error, traced;
|
|
|
|
VM_CNT_INC(v_syscall);
|
|
p = td->td_proc;
|
|
sa = &td->td_sa;
|
|
|
|
td->td_pticks = 0;
|
|
if (__predict_false(td->td_cowgen != p->p_cowgen))
|
|
thread_cow_update(td);
|
|
traced = (p->p_flag & P_TRACED) != 0;
|
|
if (__predict_false(traced || td->td_dbgflags & TDB_USERWR)) {
|
|
PROC_LOCK(p);
|
|
td->td_dbgflags &= ~TDB_USERWR;
|
|
if (traced)
|
|
td->td_dbgflags |= TDB_SCE;
|
|
PROC_UNLOCK(p);
|
|
}
|
|
error = (p->p_sysent->sv_fetch_syscall_args)(td);
|
|
#ifdef KTRACE
|
|
if (KTRPOINT(td, KTR_SYSCALL))
|
|
ktrsyscall(sa->code, sa->narg, sa->args);
|
|
#endif
|
|
KTR_START4(KTR_SYSC, "syscall", syscallname(p, sa->code),
|
|
(uintptr_t)td, "pid:%d", td->td_proc->p_pid, "arg0:%p", sa->args[0],
|
|
"arg1:%p", sa->args[1], "arg2:%p", sa->args[2]);
|
|
|
|
if (__predict_false(error != 0)) {
|
|
td->td_errno = error;
|
|
goto retval;
|
|
}
|
|
|
|
STOPEVENT(p, S_SCE, sa->narg);
|
|
if (__predict_false((p->p_flag & P_TRACED) != 0)) {
|
|
PROC_LOCK(p);
|
|
if (p->p_ptevents & PTRACE_SCE)
|
|
ptracestop((td), SIGTRAP, NULL);
|
|
PROC_UNLOCK(p);
|
|
}
|
|
if (__predict_false((td->td_dbgflags & TDB_USERWR) != 0)) {
|
|
/*
|
|
* Reread syscall number and arguments if debugger
|
|
* modified registers or memory.
|
|
*/
|
|
error = (p->p_sysent->sv_fetch_syscall_args)(td);
|
|
#ifdef KTRACE
|
|
if (KTRPOINT(td, KTR_SYSCALL))
|
|
ktrsyscall(sa->code, sa->narg, sa->args);
|
|
#endif
|
|
if (error != 0) {
|
|
td->td_errno = error;
|
|
goto retval;
|
|
}
|
|
}
|
|
|
|
#ifdef CAPABILITY_MODE
|
|
/*
|
|
* In capability mode, we only allow access to system calls
|
|
* flagged with SYF_CAPENABLED.
|
|
*/
|
|
if (__predict_false(IN_CAPABILITY_MODE(td) &&
|
|
!(sa->callp->sy_flags & SYF_CAPENABLED))) {
|
|
td->td_errno = error = ECAPMODE;
|
|
goto retval;
|
|
}
|
|
#endif
|
|
|
|
error = syscall_thread_enter(td, sa->callp);
|
|
if (error != 0) {
|
|
td->td_errno = error;
|
|
goto retval;
|
|
}
|
|
|
|
/*
|
|
* Fetch fast sigblock value at the time of syscall entry to
|
|
* handle sleepqueue primitives which might call cursig().
|
|
*/
|
|
if (__predict_false(sigfastblock_fetch_always))
|
|
(void)sigfastblock_fetch(td);
|
|
|
|
/* Let system calls set td_errno directly. */
|
|
td->td_pflags &= ~TDP_NERRNO;
|
|
|
|
if (__predict_false(systrace_enabled || AUDIT_SYSCALL_ENTER(sa->code, td))) {
|
|
#ifdef KDTRACE_HOOKS
|
|
/* Give the syscall:::entry DTrace probe a chance to fire. */
|
|
if (__predict_false(sa->callp->sy_entry != 0))
|
|
(*systrace_probe_func)(sa, SYSTRACE_ENTRY, 0);
|
|
#endif
|
|
error = (sa->callp->sy_call)(td, sa->args);
|
|
/* Save the latest error return value. */
|
|
if (__predict_false((td->td_pflags & TDP_NERRNO) == 0))
|
|
td->td_errno = error;
|
|
AUDIT_SYSCALL_EXIT(error, td);
|
|
#ifdef KDTRACE_HOOKS
|
|
/* Give the syscall:::return DTrace probe a chance to fire. */
|
|
if (__predict_false(sa->callp->sy_return != 0))
|
|
(*systrace_probe_func)(sa, SYSTRACE_RETURN,
|
|
error ? -1 : td->td_retval[0]);
|
|
#endif
|
|
} else {
|
|
error = (sa->callp->sy_call)(td, sa->args);
|
|
/* Save the latest error return value. */
|
|
if (__predict_false((td->td_pflags & TDP_NERRNO) == 0))
|
|
td->td_errno = error;
|
|
}
|
|
syscall_thread_exit(td, sa->callp);
|
|
|
|
retval:
|
|
KTR_STOP4(KTR_SYSC, "syscall", syscallname(p, sa->code),
|
|
(uintptr_t)td, "pid:%d", td->td_proc->p_pid, "error:%d", error,
|
|
"retval0:%#lx", td->td_retval[0], "retval1:%#lx",
|
|
td->td_retval[1]);
|
|
if (__predict_false(traced)) {
|
|
PROC_LOCK(p);
|
|
td->td_dbgflags &= ~TDB_SCE;
|
|
PROC_UNLOCK(p);
|
|
}
|
|
(p->p_sysent->sv_set_syscall_retval)(td, error);
|
|
}
|
|
|
|
static inline void
|
|
syscallret(struct thread *td)
|
|
{
|
|
struct proc *p;
|
|
struct syscall_args *sa;
|
|
ksiginfo_t ksi;
|
|
int traced;
|
|
|
|
KASSERT((td->td_pflags & TDP_FORKING) == 0,
|
|
("fork() did not clear TDP_FORKING upon completion"));
|
|
|
|
p = td->td_proc;
|
|
sa = &td->td_sa;
|
|
if (__predict_false(td->td_errno == ENOTCAPABLE ||
|
|
td->td_errno == ECAPMODE)) {
|
|
if ((trap_enotcap ||
|
|
(p->p_flag2 & P2_TRAPCAP) != 0) && IN_CAPABILITY_MODE(td)) {
|
|
ksiginfo_init_trap(&ksi);
|
|
ksi.ksi_signo = SIGTRAP;
|
|
ksi.ksi_errno = td->td_errno;
|
|
ksi.ksi_code = TRAP_CAP;
|
|
trapsignal(td, &ksi);
|
|
}
|
|
}
|
|
|
|
/*
|
|
* Handle reschedule and other end-of-syscall issues
|
|
*/
|
|
userret(td, td->td_frame);
|
|
|
|
#ifdef KTRACE
|
|
if (KTRPOINT(td, KTR_SYSRET)) {
|
|
ktrsysret(sa->code, td->td_errno, td->td_retval[0]);
|
|
}
|
|
#endif
|
|
|
|
traced = 0;
|
|
if (__predict_false(p->p_flag & P_TRACED)) {
|
|
traced = 1;
|
|
PROC_LOCK(p);
|
|
td->td_dbgflags |= TDB_SCX;
|
|
PROC_UNLOCK(p);
|
|
}
|
|
/*
|
|
* This works because errno is findable through the
|
|
* register set. If we ever support an emulation where this
|
|
* is not the case, this code will need to be revisited.
|
|
*/
|
|
STOPEVENT(p, S_SCX, sa->code);
|
|
if (__predict_false(traced ||
|
|
(td->td_dbgflags & (TDB_EXEC | TDB_FORK)) != 0)) {
|
|
PROC_LOCK(p);
|
|
/*
|
|
* If tracing the execed process, trap to the debugger
|
|
* so that breakpoints can be set before the program
|
|
* executes. If debugger requested tracing of syscall
|
|
* returns, do it now too.
|
|
*/
|
|
if (traced &&
|
|
((td->td_dbgflags & (TDB_FORK | TDB_EXEC)) != 0 ||
|
|
(p->p_ptevents & PTRACE_SCX) != 0))
|
|
ptracestop(td, SIGTRAP, NULL);
|
|
td->td_dbgflags &= ~(TDB_SCX | TDB_EXEC | TDB_FORK);
|
|
PROC_UNLOCK(p);
|
|
}
|
|
|
|
if (__predict_false(td->td_pflags & TDP_RFPPWAIT))
|
|
fork_rfppwait(td);
|
|
}
|