freebsd-skq/sys/kern/imgact_aout.c
Konstantin Belousov 78022527bb Switch to use shared vnode locks for text files during image activation.
kern_execve() locks text vnode exclusive to be able to set and clear
VV_TEXT flag. VV_TEXT is mutually exclusive with the v_writecount > 0
condition.

The change removes VV_TEXT, replacing it with the condition
v_writecount <= -1, and puts v_writecount under the vnode interlock.
Each text reference decrements v_writecount.  To clear the text
reference when the segment is unmapped, it is recorded in the
vm_map_entry backed by the text file as MAP_ENTRY_VN_TEXT flag, and
v_writecount is incremented on the map entry removal

The operations like VOP_ADD_WRITECOUNT() and VOP_SET_TEXT() check that
v_writecount does not contradict the desired change.  vn_writecheck()
is now racy and its use was eliminated everywhere except access.
Atomic check for writeability and increment of v_writecount is
performed by the VOP.  vn_truncate() now increments v_writecount
around VOP_SETATTR() call, lack of which is arguably a bug on its own.

nullfs bypasses v_writecount to the lower vnode always, so nullfs
vnode has its own v_writecount correct, and lower vnode gets all
references, since object->handle is always lower vnode.

On the text vnode' vm object dealloc, the v_writecount value is reset
to zero, and deadfs vop_unset_text short-circuit the operation.
Reclamation of lowervp always reclaims all nullfs vnodes referencing
lowervp first, so no stray references are left.

Reviewed by:	markj, trasz
Tested by:	mjg, pho
Sponsored by:	The FreeBSD Foundation
MFC after:	1 month
Differential revision:	https://reviews.freebsd.org/D19923
2019-05-05 11:20:43 +00:00

347 lines
9.5 KiB
C

/*-
* SPDX-License-Identifier: BSD-2-Clause-FreeBSD
*
* Copyright (c) 1993, David Greenman
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*/
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
#include <sys/param.h>
#include <sys/exec.h>
#include <sys/imgact.h>
#include <sys/imgact_aout.h>
#include <sys/kernel.h>
#include <sys/limits.h>
#include <sys/lock.h>
#include <sys/malloc.h>
#include <sys/mutex.h>
#include <sys/proc.h>
#include <sys/racct.h>
#include <sys/resourcevar.h>
#include <sys/signalvar.h>
#include <sys/syscall.h>
#include <sys/sysent.h>
#include <sys/systm.h>
#include <sys/vnode.h>
#include <machine/frame.h>
#include <machine/md_var.h>
#include <vm/vm.h>
#include <vm/pmap.h>
#include <vm/vm_map.h>
#include <vm/vm_object.h>
#include <vm/vm_param.h>
#ifdef __amd64__
#include <compat/freebsd32/freebsd32_signal.h>
#include <compat/freebsd32/freebsd32_util.h>
#include <compat/freebsd32/freebsd32_proto.h>
#include <compat/freebsd32/freebsd32_syscall.h>
#include <compat/ia32/ia32_signal.h>
#endif
static int exec_aout_imgact(struct image_params *imgp);
static int aout_fixup(register_t **stack_base, struct image_params *imgp);
#define AOUT32_USRSTACK 0xbfc00000
#if defined(__i386__)
#define AOUT32_PS_STRINGS (AOUT32_USRSTACK - sizeof(struct ps_strings))
struct sysentvec aout_sysvec = {
.sv_size = SYS_MAXSYSCALL,
.sv_table = sysent,
.sv_errsize = 0,
.sv_errtbl = NULL,
.sv_transtrap = NULL,
.sv_fixup = aout_fixup,
.sv_sendsig = sendsig,
.sv_sigcode = sigcode,
.sv_szsigcode = &szsigcode,
.sv_name = "FreeBSD a.out",
.sv_coredump = NULL,
.sv_imgact_try = NULL,
.sv_minsigstksz = MINSIGSTKSZ,
.sv_minuser = VM_MIN_ADDRESS,
.sv_maxuser = AOUT32_USRSTACK,
.sv_usrstack = AOUT32_USRSTACK,
.sv_psstrings = AOUT32_PS_STRINGS,
.sv_stackprot = VM_PROT_ALL,
.sv_copyout_strings = exec_copyout_strings,
.sv_setregs = exec_setregs,
.sv_fixlimit = NULL,
.sv_maxssiz = NULL,
.sv_flags = SV_ABI_FREEBSD | SV_AOUT | SV_IA32 | SV_ILP32,
.sv_set_syscall_retval = cpu_set_syscall_retval,
.sv_fetch_syscall_args = cpu_fetch_syscall_args,
.sv_syscallnames = syscallnames,
.sv_schedtail = NULL,
.sv_thread_detach = NULL,
.sv_trap = NULL,
};
#elif defined(__amd64__)
#define AOUT32_PS_STRINGS \
(AOUT32_USRSTACK - sizeof(struct freebsd32_ps_strings))
#define AOUT32_MINUSER FREEBSD32_MINUSER
extern const char *freebsd32_syscallnames[];
extern u_long ia32_maxssiz;
struct sysentvec aout_sysvec = {
.sv_size = FREEBSD32_SYS_MAXSYSCALL,
.sv_table = freebsd32_sysent,
.sv_errsize = 0,
.sv_errtbl = NULL,
.sv_transtrap = NULL,
.sv_fixup = aout_fixup,
.sv_sendsig = ia32_sendsig,
.sv_sigcode = ia32_sigcode,
.sv_szsigcode = &sz_ia32_sigcode,
.sv_name = "FreeBSD a.out",
.sv_coredump = NULL,
.sv_imgact_try = NULL,
.sv_minsigstksz = MINSIGSTKSZ,
.sv_minuser = AOUT32_MINUSER,
.sv_maxuser = AOUT32_USRSTACK,
.sv_usrstack = AOUT32_USRSTACK,
.sv_psstrings = AOUT32_PS_STRINGS,
.sv_stackprot = VM_PROT_ALL,
.sv_copyout_strings = freebsd32_copyout_strings,
.sv_setregs = ia32_setregs,
.sv_fixlimit = ia32_fixlimit,
.sv_maxssiz = &ia32_maxssiz,
.sv_flags = SV_ABI_FREEBSD | SV_AOUT | SV_IA32 | SV_ILP32,
.sv_set_syscall_retval = ia32_set_syscall_retval,
.sv_fetch_syscall_args = ia32_fetch_syscall_args,
.sv_syscallnames = freebsd32_syscallnames,
};
#else
#error "Port me"
#endif
static int
aout_fixup(register_t **stack_base, struct image_params *imgp)
{
*(char **)stack_base -= sizeof(uint32_t);
return (suword32(*stack_base, imgp->args->argc));
}
static int
exec_aout_imgact(struct image_params *imgp)
{
const struct exec *a_out = (const struct exec *) imgp->image_header;
struct vmspace *vmspace;
vm_map_t map;
vm_object_t object;
vm_offset_t text_end, data_end;
unsigned long virtual_offset;
unsigned long file_offset;
unsigned long bss_size;
int error;
/*
* Linux and *BSD binaries look very much alike,
* only the machine id is different:
* 0x64 for Linux, 0x86 for *BSD, 0x00 for BSDI.
* NetBSD is in network byte order.. ugh.
*/
if (((a_out->a_midmag >> 16) & 0xff) != 0x86 &&
((a_out->a_midmag >> 16) & 0xff) != 0 &&
((((int)ntohl(a_out->a_midmag)) >> 16) & 0xff) != 0x86)
return -1;
/*
* Set file/virtual offset based on a.out variant.
* We do two cases: host byte order and network byte order
* (for NetBSD compatibility)
*/
switch ((int)(a_out->a_midmag & 0xffff)) {
case ZMAGIC:
virtual_offset = 0;
if (a_out->a_text) {
file_offset = PAGE_SIZE;
} else {
/* Bill's "screwball mode" */
file_offset = 0;
}
break;
case QMAGIC:
virtual_offset = PAGE_SIZE;
file_offset = 0;
/* Pass PS_STRINGS for BSD/OS binaries only. */
if (N_GETMID(*a_out) == MID_ZERO)
imgp->ps_strings = aout_sysvec.sv_psstrings;
break;
default:
/* NetBSD compatibility */
switch ((int)(ntohl(a_out->a_midmag) & 0xffff)) {
case ZMAGIC:
case QMAGIC:
virtual_offset = PAGE_SIZE;
file_offset = 0;
break;
default:
return (-1);
}
}
bss_size = roundup(a_out->a_bss, PAGE_SIZE);
/*
* Check various fields in header for validity/bounds.
*/
if (/* entry point must lay with text region */
a_out->a_entry < virtual_offset ||
a_out->a_entry >= virtual_offset + a_out->a_text ||
/* text and data size must each be page rounded */
a_out->a_text & PAGE_MASK || a_out->a_data & PAGE_MASK
#ifdef __amd64__
||
/* overflows */
virtual_offset + a_out->a_text + a_out->a_data + bss_size > UINT_MAX
#endif
)
return (-1);
/* text + data can't exceed file size */
if (a_out->a_data + a_out->a_text > imgp->attr->va_size)
return (EFAULT);
/*
* text/data/bss must not exceed limits
*/
PROC_LOCK(imgp->proc);
if (/* text can't exceed maximum text size */
a_out->a_text > maxtsiz ||
/* data + bss can't exceed rlimit */
a_out->a_data + bss_size > lim_cur_proc(imgp->proc, RLIMIT_DATA) ||
racct_set(imgp->proc, RACCT_DATA, a_out->a_data + bss_size) != 0) {
PROC_UNLOCK(imgp->proc);
return (ENOMEM);
}
PROC_UNLOCK(imgp->proc);
/*
* Avoid a possible deadlock if the current address space is destroyed
* and that address space maps the locked vnode. In the common case,
* the locked vnode's v_usecount is decremented but remains greater
* than zero. Consequently, the vnode lock is not needed by vrele().
* However, in cases where the vnode lock is external, such as nullfs,
* v_usecount may become zero.
*/
VOP_UNLOCK(imgp->vp, 0);
/*
* Destroy old process VM and create a new one (with a new stack)
*/
error = exec_new_vmspace(imgp, &aout_sysvec);
vn_lock(imgp->vp, LK_SHARED | LK_RETRY);
if (error)
return (error);
/*
* The vm space can be changed by exec_new_vmspace
*/
vmspace = imgp->proc->p_vmspace;
object = imgp->object;
map = &vmspace->vm_map;
vm_map_lock(map);
vm_object_reference(object);
text_end = virtual_offset + a_out->a_text;
error = vm_map_insert(map, object,
file_offset,
virtual_offset, text_end,
VM_PROT_READ | VM_PROT_EXECUTE, VM_PROT_ALL,
MAP_COPY_ON_WRITE | MAP_PREFAULT | MAP_VN_EXEC);
if (error) {
vm_map_unlock(map);
vm_object_deallocate(object);
return (error);
}
VOP_SET_TEXT_CHECKED(imgp->vp);
data_end = text_end + a_out->a_data;
if (a_out->a_data) {
vm_object_reference(object);
error = vm_map_insert(map, object,
file_offset + a_out->a_text,
text_end, data_end,
VM_PROT_ALL, VM_PROT_ALL,
MAP_COPY_ON_WRITE | MAP_PREFAULT | MAP_VN_EXEC);
if (error) {
vm_map_unlock(map);
vm_object_deallocate(object);
return (error);
}
VOP_SET_TEXT_CHECKED(imgp->vp);
}
if (bss_size) {
error = vm_map_insert(map, NULL, 0,
data_end, data_end + bss_size,
VM_PROT_ALL, VM_PROT_ALL, 0);
if (error) {
vm_map_unlock(map);
return (error);
}
}
vm_map_unlock(map);
/* Fill in process VM information */
vmspace->vm_tsize = a_out->a_text >> PAGE_SHIFT;
vmspace->vm_dsize = (a_out->a_data + bss_size) >> PAGE_SHIFT;
vmspace->vm_taddr = (caddr_t) (uintptr_t) virtual_offset;
vmspace->vm_daddr = (caddr_t) (uintptr_t)
(virtual_offset + a_out->a_text);
/* Fill in image_params */
imgp->interpreted = 0;
imgp->entry_addr = a_out->a_entry;
imgp->proc->p_sysent = &aout_sysvec;
return (0);
}
/*
* Tell kern_execve.c about it, with a little help from the linker.
*/
static struct execsw aout_execsw = {
.ex_imgact = exec_aout_imgact,
.ex_name = "a.out"
};
EXEC_SET(aout, aout_execsw);