/*-
 * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
 *
 * Copyright (c) 1993, David Greenman
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");

#include <sys/param.h>
#include <sys/exec.h>
#include <sys/imgact.h>
#include <sys/imgact_aout.h>
#include <sys/kernel.h>
#include <sys/limits.h>
#include <sys/lock.h>
#include <sys/malloc.h>
#include <sys/mutex.h>
#include <sys/proc.h>
#include <sys/racct.h>
#include <sys/resourcevar.h>
#include <sys/signalvar.h>
#include <sys/syscall.h>
#include <sys/sysent.h>
#include <sys/systm.h>
#include <sys/vnode.h>

#include <machine/frame.h>
#include <machine/md_var.h>

#include <vm/vm.h>
#include <vm/pmap.h>
#include <vm/vm_map.h>
#include <vm/vm_object.h>
#include <vm/vm_param.h>

#ifdef __amd64__
#include <compat/freebsd32/freebsd32_signal.h>
#include <compat/freebsd32/freebsd32_util.h>
#include <compat/freebsd32/freebsd32_proto.h>
#include <compat/freebsd32/freebsd32_syscall.h>
#include <compat/ia32/ia32_signal.h>
#endif

static int	exec_aout_imgact(struct image_params *imgp);
static int	aout_fixup(uintptr_t *stack_base, struct image_params *imgp);
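
/*
 * Note (editorial): the top of the 32-bit a.out user stack sits 4 MB
 * below the historic 3 GB i386 KERNBASE
 * (0xc0000000 - 0x00400000 == 0xbfc00000).
 */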
#define	AOUT32_USRSTACK	0xbfc00000

#if defined(__i386__)

#define	AOUT32_PS_STRINGS	(AOUT32_USRSTACK - sizeof(struct ps_strings))

struct sysentvec aout_sysvec = {
	.sv_size	= SYS_MAXSYSCALL,
	.sv_table	= sysent,
	.sv_transtrap	= NULL,
	.sv_fixup	= aout_fixup,
	.sv_sendsig	= sendsig,
	.sv_sigcode	= sigcode,
	.sv_szsigcode	= &szsigcode,
	.sv_name	= "FreeBSD a.out",
	.sv_coredump	= NULL,
	.sv_imgact_try	= NULL,
	.sv_minsigstksz	= MINSIGSTKSZ,
	.sv_minuser	= VM_MIN_ADDRESS,
	.sv_maxuser	= AOUT32_USRSTACK,
	.sv_usrstack	= AOUT32_USRSTACK,
	.sv_psstrings	= AOUT32_PS_STRINGS,
	.sv_psstringssz	= sizeof(struct ps_strings),
	.sv_stackprot	= VM_PROT_ALL,
	.sv_copyout_strings = exec_copyout_strings,
	.sv_setregs	= exec_setregs,
	.sv_fixlimit	= NULL,
	.sv_maxssiz	= NULL,
	.sv_flags	= SV_ABI_FREEBSD | SV_AOUT | SV_IA32 | SV_ILP32,
	.sv_set_syscall_retval = cpu_set_syscall_retval,
	.sv_fetch_syscall_args = cpu_fetch_syscall_args,
	.sv_syscallnames = syscallnames,
	.sv_schedtail	= NULL,
	.sv_thread_detach = NULL,
	.sv_trap	= NULL,
	.sv_onexec_old	= exec_onexec_old,
	.sv_onexit	= exit_onexit,
	.sv_set_fork_retval = x86_set_fork_retval,
};

#elif defined(__amd64__)

#include "vdso_ia32_offsets.h"
extern const char _binary_elf_vdso32_so_1_start[];
extern const char _binary_elf_vdso32_so_1_end[];
extern char _binary_elf_vdso32_so_1_size;

#define	AOUT32_PS_STRINGS \
    (AOUT32_USRSTACK - sizeof(struct freebsd32_ps_strings))
#define	AOUT32_MINUSER	FREEBSD32_MINUSER

extern const char *freebsd32_syscallnames[];
extern u_long ia32_maxssiz;

static int aout_szsigcode;

struct sysentvec aout_sysvec = {
	.sv_size	= FREEBSD32_SYS_MAXSYSCALL,
	.sv_table	= freebsd32_sysent,
	.sv_transtrap	= NULL,
	.sv_fixup	= aout_fixup,
	.sv_sendsig	= ia32_sendsig,
	.sv_sigcode	= _binary_elf_vdso32_so_1_start,
	.sv_szsigcode	= &aout_szsigcode,
	.sv_name	= "FreeBSD a.out",
	.sv_coredump	= NULL,
	.sv_imgact_try	= NULL,
	.sv_minsigstksz	= MINSIGSTKSZ,
	.sv_minuser	= AOUT32_MINUSER,
	.sv_maxuser	= AOUT32_USRSTACK,
	.sv_usrstack	= AOUT32_USRSTACK,
	.sv_psstrings	= AOUT32_PS_STRINGS,
	.sv_psstringssz	= sizeof(struct freebsd32_ps_strings),
	.sv_stackprot	= VM_PROT_ALL,
	.sv_copyout_strings = freebsd32_copyout_strings,
	.sv_setregs	= ia32_setregs,
	.sv_fixlimit	= ia32_fixlimit,
	.sv_maxssiz	= &ia32_maxssiz,
	.sv_flags	= SV_ABI_FREEBSD | SV_AOUT | SV_IA32 | SV_ILP32,
	.sv_set_syscall_retval = ia32_set_syscall_retval,
	.sv_fetch_syscall_args = ia32_fetch_syscall_args,
	.sv_syscallnames = freebsd32_syscallnames,
	.sv_onexec_old	= exec_onexec_old,
	.sv_onexit	= exit_onexit,
	.sv_set_fork_retval = x86_set_fork_retval,
};

static void
aout_sysent(void *arg __unused)
{
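	/*
	 * Note (editorial): the blob length is the address of the
	 * linker-provided size symbol; a pointer-to-integer cast cannot
	 * portably appear in a static initializer, so it is computed
	 * here at boot instead.
	 */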
	aout_szsigcode = (int)(uintptr_t)&_binary_elf_vdso32_so_1_size;
}
SYSINIT(aout_sysent, SI_SUB_EXEC, SI_ORDER_ANY, aout_sysent, NULL);
#else
#error "Only ia32 arch is supported"
#endif

static int
aout_fixup(uintptr_t *stack_base, struct image_params *imgp)
{
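	/*
	 * Note (editorial): push a 32-bit argc below the argv/env
	 * pointer arrays that exec already copied out onto the new
	 * process stack.
	 */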
	*stack_base -= sizeof(uint32_t);
	if (suword32((void *)*stack_base, imgp->args->argc) != 0)
		return (EFAULT);
	return (0);
}

static int
exec_aout_imgact(struct image_params *imgp)
{
	const struct exec *a_out;
	struct vmspace *vmspace;
	vm_map_t map;
	vm_object_t object;
	vm_offset_t text_end, data_end;
	unsigned long virtual_offset;
	unsigned long file_offset;
	unsigned long bss_size;
	int error;

	a_out = (const struct exec *)imgp->image_header;

	/*
	 * Linux and *BSD binaries look very much alike,
	 * only the machine id is different:
	 * 0x64 for Linux, 0x86 for *BSD, 0x00 for BSDI.
	 * NetBSD is in network byte order... ugh.
	 */
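	/*
	 * Note (editorial): a_midmag packs flags:6, mid:10, magic:16.
	 * For example, a FreeBSD/i386 ZMAGIC binary has mid 0x86
	 * (MID_I386) and magic 0413 (0x10b), so a_midmag reads
	 * 0x0086010b in host byte order.
	 */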
	if (((a_out->a_midmag >> 16) & 0xff) != 0x86 &&
	    ((a_out->a_midmag >> 16) & 0xff) != 0 &&
	    ((((int)ntohl(a_out->a_midmag)) >> 16) & 0xff) != 0x86)
		return (-1);

	/*
	 * Set file/virtual offset based on a.out variant.
	 * We do two cases: host byte order and network byte order
	 * (for NetBSD compatibility)
	 */
	switch ((int)(a_out->a_midmag & 0xffff)) {
	case ZMAGIC:
		virtual_offset = 0;
		if (a_out->a_text) {
			file_offset = PAGE_SIZE;
		} else {
			/* Bill's "screwball mode" */
			file_offset = 0;
		}
		break;
	case QMAGIC:
		virtual_offset = PAGE_SIZE;
		file_offset = 0;
		/* Pass PS_STRINGS for BSD/OS binaries only. */
		if (N_GETMID(*a_out) == MID_ZERO)
			imgp->ps_strings = (void *)aout_sysvec.sv_psstrings;
		break;
	default:
		/* NetBSD compatibility */
		switch ((int)(ntohl(a_out->a_midmag) & 0xffff)) {
		case ZMAGIC:
		case QMAGIC:
			virtual_offset = PAGE_SIZE;
			file_offset = 0;
			break;
		default:
			return (-1);
		}
	}
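
	/*
	 * Note (editorial): a_bss is a byte count; the anonymous bss
	 * mapping established below needs whole pages.
	 */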
	bss_size = roundup(a_out->a_bss, PAGE_SIZE);

	/*
	 * Check various fields in header for validity/bounds.
	 */
	if (/* entry point must lie within the text region */
	    a_out->a_entry < virtual_offset ||
	    a_out->a_entry >= virtual_offset + a_out->a_text ||

	    /* text and data size must each be page rounded */
	    a_out->a_text & PAGE_MASK || a_out->a_data & PAGE_MASK
#ifdef __amd64__
	    ||
	    /* overflows */
	    virtual_offset + a_out->a_text + a_out->a_data + bss_size > UINT_MAX
#endif
	    )
		return (-1);

	/* text + data can't exceed file size */
	if (a_out->a_data + a_out->a_text > imgp->attr->va_size)
		return (EFAULT);

	/*
	 * text/data/bss must not exceed limits
	 */
	PROC_LOCK(imgp->proc);
	if (/* text can't exceed maximum text size */
	    a_out->a_text > maxtsiz ||

	    /* data + bss can't exceed rlimit */
	    a_out->a_data + bss_size > lim_cur_proc(imgp->proc, RLIMIT_DATA) ||
	    racct_set(imgp->proc, RACCT_DATA, a_out->a_data + bss_size) != 0) {
		PROC_UNLOCK(imgp->proc);
		return (ENOMEM);
	}
	PROC_UNLOCK(imgp->proc);

	/*
	 * Avoid a possible deadlock if the current address space is destroyed
	 * and that address space maps the locked vnode.  In the common case,
	 * the locked vnode's v_usecount is decremented but remains greater
	 * than zero.  Consequently, the vnode lock is not needed by vrele().
	 * However, in cases where the vnode lock is external, such as nullfs,
	 * v_usecount may become zero.
	 */
	VOP_UNLOCK(imgp->vp);

	/*
	 * Destroy old process VM and create a new one (with a new stack)
	 */
	error = exec_new_vmspace(imgp, &aout_sysvec);

	vn_lock(imgp->vp, LK_SHARED | LK_RETRY);
	if (error)
		return (error);

	/*
	 * The vm space can be changed by exec_new_vmspace
	 */
	vmspace = imgp->proc->p_vmspace;

	object = imgp->object;
	map = &vmspace->vm_map;
	vm_map_lock(map);
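	/*
	 * Note (editorial): the object reference taken here is donated
	 * to the text map entry created below; it is released by hand
	 * if the insert fails.
	 */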
	vm_object_reference(object);

	text_end = virtual_offset + a_out->a_text;
	error = vm_map_insert(map, object,
	    file_offset,
	    virtual_offset, text_end,
	    VM_PROT_READ | VM_PROT_EXECUTE, VM_PROT_ALL,
	    MAP_COPY_ON_WRITE | MAP_PREFAULT | MAP_VN_EXEC);
	if (error) {
		vm_map_unlock(map);
		vm_object_deallocate(object);
		return (error);
	}
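	/*
	 * Note (editorial): account an executable text reference on the
	 * vnode so writers are refused while the text is mapped;
	 * MAP_VN_EXEC arranges for the reference to be returned when
	 * the map entry is removed.
	 */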
	VOP_SET_TEXT_CHECKED(imgp->vp);
	data_end = text_end + a_out->a_data;
	if (a_out->a_data) {
		vm_object_reference(object);
		error = vm_map_insert(map, object,
		    file_offset + a_out->a_text,
		    text_end, data_end,
		    VM_PROT_ALL, VM_PROT_ALL,
		    MAP_COPY_ON_WRITE | MAP_PREFAULT | MAP_VN_EXEC);
		if (error) {
			vm_map_unlock(map);
			vm_object_deallocate(object);
			return (error);
		}
		VOP_SET_TEXT_CHECKED(imgp->vp);
	}

	if (bss_size) {
		error = vm_map_insert(map, NULL, 0,
		    data_end, data_end + bss_size,
		    VM_PROT_ALL, VM_PROT_ALL, 0);
		if (error) {
			vm_map_unlock(map);
			return (error);
		}
	}
	vm_map_unlock(map);

	/* Fill in process VM information */
	vmspace->vm_tsize = a_out->a_text >> PAGE_SHIFT;
	vmspace->vm_dsize = (a_out->a_data + bss_size) >> PAGE_SHIFT;
	vmspace->vm_taddr = (caddr_t) (uintptr_t) virtual_offset;
	vmspace->vm_daddr = (caddr_t) (uintptr_t)
	    (virtual_offset + a_out->a_text);

	error = exec_map_stack(imgp);
	if (error != 0)
		return (error);

	/* Fill in image_params */
	imgp->interpreted = 0;
	imgp->entry_addr = a_out->a_entry;

	imgp->proc->p_sysent = &aout_sysvec;

	return (0);
}

/*
 * Tell kern_execve.c about it, with a little help from the linker.
 */
static struct execsw aout_execsw = {
	.ex_imgact = exec_aout_imgact,
	.ex_name = "a.out"
};
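
/*
 * Note (editorial): EXEC_SET() adds aout_execsw to the execsw linker
 * set, which kern_execve iterates when probing image activators.
 */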
EXEC_SET(aout, aout_execsw);
|