1996-03-10 08:42:54 +00:00
|
|
|
/*-
|
2017-11-27 15:20:12 +00:00
|
|
|
* SPDX-License-Identifier: BSD-3-Clause
|
|
|
|
*
|
2017-03-30 18:21:36 +00:00
|
|
|
* Copyright (c) 2017 Dell EMC
|
2018-07-30 07:01:00 +00:00
|
|
|
* Copyright (c) 2000-2001, 2003 David O'Brien
|
2012-01-15 13:23:18 +00:00
|
|
|
* Copyright (c) 1995-1996 Søren Schmidt
|
1996-03-10 22:37:34 +00:00
|
|
|
* Copyright (c) 1996 Peter Wemm
|
1996-03-10 08:42:54 +00:00
|
|
|
* All rights reserved.
|
|
|
|
*
|
|
|
|
* Redistribution and use in source and binary forms, with or without
|
|
|
|
* modification, are permitted provided that the following conditions
|
|
|
|
* are met:
|
|
|
|
* 1. Redistributions of source code must retain the above copyright
|
|
|
|
* notice, this list of conditions and the following disclaimer
|
|
|
|
* in this position and unchanged.
|
|
|
|
* 2. Redistributions in binary form must reproduce the above copyright
|
|
|
|
* notice, this list of conditions and the following disclaimer in the
|
|
|
|
* documentation and/or other materials provided with the distribution.
|
|
|
|
* 3. The name of the author may not be used to endorse or promote products
|
2002-06-02 20:05:59 +00:00
|
|
|
* derived from this software without specific prior written permission
|
1996-03-10 08:42:54 +00:00
|
|
|
*
|
|
|
|
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
|
|
|
|
* IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
|
|
|
|
* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
|
|
|
|
* IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
|
|
|
|
* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
|
|
|
|
* NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
|
|
|
* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
|
|
|
* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
|
|
|
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
|
|
|
|
* THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|
|
|
*/
|
|
|
|
|
2003-06-11 00:56:59 +00:00
|
|
|
#include <sys/cdefs.h>
|
|
|
|
__FBSDID("$FreeBSD$");
|
|
|
|
|
2011-06-30 10:56:02 +00:00
|
|
|
#include "opt_capsicum.h"
|
2005-06-30 07:49:22 +00:00
|
|
|
|
1996-03-10 08:42:54 +00:00
|
|
|
#include <sys/param.h>
|
2014-03-16 10:55:57 +00:00
|
|
|
#include <sys/capsicum.h>
|
2018-01-08 21:27:41 +00:00
|
|
|
#include <sys/compressor.h>
|
1996-03-10 08:42:54 +00:00
|
|
|
#include <sys/exec.h>
|
1998-09-14 22:46:08 +00:00
|
|
|
#include <sys/fcntl.h>
|
1996-03-10 08:42:54 +00:00
|
|
|
#include <sys/imgact.h>
|
|
|
|
#include <sys/imgact_elf.h>
|
2015-02-27 16:28:55 +00:00
|
|
|
#include <sys/jail.h>
|
1996-03-10 08:42:54 +00:00
|
|
|
#include <sys/kernel.h>
|
2001-03-28 09:17:56 +00:00
|
|
|
#include <sys/lock.h>
|
1996-03-10 08:42:54 +00:00
|
|
|
#include <sys/malloc.h>
|
2005-09-15 15:03:48 +00:00
|
|
|
#include <sys/mount.h>
|
1998-09-14 22:46:08 +00:00
|
|
|
#include <sys/mman.h>
|
1996-05-01 02:43:13 +00:00
|
|
|
#include <sys/namei.h>
|
1998-09-14 22:46:08 +00:00
|
|
|
#include <sys/pioctl.h>
|
1996-05-01 02:43:13 +00:00
|
|
|
#include <sys/proc.h>
|
1998-09-14 22:46:08 +00:00
|
|
|
#include <sys/procfs.h>
|
2017-03-30 18:21:36 +00:00
|
|
|
#include <sys/ptrace.h>
|
2011-04-05 20:23:59 +00:00
|
|
|
#include <sys/racct.h>
|
1998-09-14 22:46:08 +00:00
|
|
|
#include <sys/resourcevar.h>
|
2013-03-09 02:32:23 +00:00
|
|
|
#include <sys/rwlock.h>
|
2013-04-14 19:59:38 +00:00
|
|
|
#include <sys/sbuf.h>
|
2005-12-16 18:34:14 +00:00
|
|
|
#include <sys/sf_buf.h>
|
2010-08-17 08:55:45 +00:00
|
|
|
#include <sys/smp.h>
|
2000-09-10 13:54:52 +00:00
|
|
|
#include <sys/systm.h>
|
1996-03-10 08:42:54 +00:00
|
|
|
#include <sys/signalvar.h>
|
1998-09-14 22:46:08 +00:00
|
|
|
#include <sys/stat.h>
|
2001-03-28 11:52:56 +00:00
|
|
|
#include <sys/sx.h>
|
1998-09-14 22:46:08 +00:00
|
|
|
#include <sys/syscall.h>
|
1996-03-10 08:42:54 +00:00
|
|
|
#include <sys/sysctl.h>
|
1998-09-14 22:46:08 +00:00
|
|
|
#include <sys/sysent.h>
|
1996-05-01 02:43:13 +00:00
|
|
|
#include <sys/vnode.h>
|
2010-03-02 06:58:58 +00:00
|
|
|
#include <sys/syslog.h>
|
|
|
|
#include <sys/eventhandler.h>
|
2013-04-16 19:19:14 +00:00
|
|
|
#include <sys/user.h>
|
2010-03-02 06:58:58 +00:00
|
|
|
|
1996-03-10 08:42:54 +00:00
|
|
|
#include <vm/vm.h>
|
|
|
|
#include <vm/vm_kern.h>
|
|
|
|
#include <vm/vm_param.h>
|
|
|
|
#include <vm/pmap.h>
|
|
|
|
#include <vm/vm_map.h>
|
1998-09-16 02:04:05 +00:00
|
|
|
#include <vm/vm_object.h>
|
1996-03-10 08:42:54 +00:00
|
|
|
#include <vm/vm_extern.h>
|
|
|
|
|
1998-10-18 15:55:12 +00:00
|
|
|
#include <machine/elf.h>
|
1996-03-10 08:42:54 +00:00
|
|
|
#include <machine/md_var.h>
|
|
|
|
|
2013-05-01 14:59:16 +00:00
|
|
|
/*
 * Granularity, in bytes, to which a note's n_namesz is rounded up when
 * stepping from the note header to its descriptor (see the
 * *_trans_osrel() helpers in this file).
 */
#define ELF_NOTE_ROUNDSIZE 4
|
2000-04-18 02:39:26 +00:00
|
|
|
/*
 * Index into e_ident[] where get_brandinfo() looks for the legacy brand
 * string that is compared against a brand's compat_3_brand.
 */
#define OLD_EI_BRAND 8
|
|
|
|
|
2002-07-20 02:56:12 +00:00
|
|
|
static int __elfN(check_header)(const Elf_Ehdr *hdr);
|
2009-03-13 16:40:51 +00:00
|
|
|
static Elf_Brandinfo *__elfN(get_brandinfo)(struct image_params *imgp,
|
2019-03-28 21:43:01 +00:00
|
|
|
const char *interp, int32_t *osrel, uint32_t *fctl0);
|
2002-07-20 02:56:12 +00:00
|
|
|
static int __elfN(load_file)(struct proc *p, const char *file, u_long *addr,
|
2019-03-01 16:16:38 +00:00
|
|
|
u_long *entry);
|
2017-03-07 13:36:43 +00:00
|
|
|
static int __elfN(load_section)(struct image_params *imgp, vm_ooffset_t offset,
|
2019-03-01 16:16:38 +00:00
|
|
|
caddr_t vmaddr, size_t memsz, size_t filsz, vm_prot_t prot);
|
2002-07-20 02:56:12 +00:00
|
|
|
static int __CONCAT(exec_, __elfN(imgact))(struct image_params *imgp);
|
2018-03-13 16:40:29 +00:00
|
|
|
static bool __elfN(freebsd_trans_osrel)(const Elf_Note *note,
|
Fix handling of .note.ABI-tag section for GNU systems [1].
Handle GNU/Linux according to LSB Core Specification 4.0,
Chapter 11. Object Format, 11.8. ABI note tag.
Also check the first word of desc, not only name, according to
glibc abi-tags specification to distinguish between Linux and
kFreeBSD.
Add explicit handling for Debian GNU/kFreeBSD, which runs
on our kernels as well [2].
In {amd64,i386}/trap.c, when checking osrel of the current process,
also check the ABI to not change the signal behaviour for Linux
binary processes, now that we save an osrel version for all three
from the lists above in struct proc [2].
These changes make it possible to run FreeBSD, Debian GNU/kFreeBSD
and Linux binaries on the same machine again for at least i386 and
amd64, and no longer break kFreeBSD which was detected as GNU(/Linux).
PR: kern/135468
Submitted by: dchagin [1] (initial patch)
Suggested by: kib [2]
Tested by: Petr Salinger (Petr.Salinger seznam.cz) for kFreeBSD
Reviewed by: kib
MFC after: 3 days
2009-08-24 16:19:47 +00:00
|
|
|
int32_t *osrel);
|
2018-03-13 16:40:29 +00:00
|
|
|
static bool kfreebsd_trans_osrel(const Elf_Note *note, int32_t *osrel);
|
2009-03-13 16:40:51 +00:00
|
|
|
static boolean_t __elfN(check_note)(struct image_params *imgp,
|
2018-11-23 23:33:55 +00:00
|
|
|
Elf_Brandnote *checknote, int32_t *osrel, uint32_t *fctl0);
|
2011-01-08 16:02:14 +00:00
|
|
|
static vm_prot_t __elfN(trans_prot)(Elf_Word);
|
|
|
|
static Elf_Word __elfN(untrans_prot)(vm_prot_t);
|
1996-03-10 08:42:54 +00:00
|
|
|
|
2003-01-04 22:07:48 +00:00
|
|
|
SYSCTL_NODE(_kern, OID_AUTO, __CONCAT(elf, __ELF_WORD_SIZE), CTLFLAG_RW, 0,
|
|
|
|
"");
|
|
|
|
|
2015-03-09 03:50:53 +00:00
|
|
|
#define CORE_BUF_SIZE (16 * 1024)
|
2010-03-02 06:58:58 +00:00
|
|
|
|
2003-01-05 03:48:14 +00:00
|
|
|
int __elfN(fallback_brand) = -1;
|
|
|
|
SYSCTL_INT(__CONCAT(_kern_elf, __ELF_WORD_SIZE), OID_AUTO,
|
2014-06-28 03:56:17 +00:00
|
|
|
fallback_brand, CTLFLAG_RWTUN, &__elfN(fallback_brand), 0,
|
2003-01-04 22:07:48 +00:00
|
|
|
__XSTRING(__CONCAT(ELF, __ELF_WORD_SIZE)) " brand of last resort");
|
|
|
|
|
2002-12-21 01:15:39 +00:00
|
|
|
static int elf_legacy_coredump = 0;
|
2003-01-04 22:07:48 +00:00
|
|
|
SYSCTL_INT(_debug, OID_AUTO, __elfN(legacy_coredump), CTLFLAG_RW,
|
2016-07-05 14:46:06 +00:00
|
|
|
&elf_legacy_coredump, 0,
|
|
|
|
"include all and only RW pages in core dumps");
|
1996-03-10 08:42:54 +00:00
|
|
|
|
2012-01-30 07:56:00 +00:00
|
|
|
int __elfN(nxstack) =
|
2015-12-07 12:20:26 +00:00
|
|
|
#if defined(__amd64__) || defined(__powerpc64__) /* both 64 and 32 bit */ || \
|
2018-11-07 18:32:02 +00:00
|
|
|
(defined(__arm__) && __ARM_ARCH >= 7) || defined(__aarch64__) || \
|
|
|
|
defined(__riscv)
|
2012-01-30 07:56:00 +00:00
|
|
|
1;
|
|
|
|
#else
|
|
|
|
0;
|
|
|
|
#endif
|
2011-01-08 16:30:59 +00:00
|
|
|
SYSCTL_INT(__CONCAT(_kern_elf, __ELF_WORD_SIZE), OID_AUTO,
|
|
|
|
nxstack, CTLFLAG_RW, &__elfN(nxstack), 0,
|
|
|
|
__XSTRING(__CONCAT(ELF, __ELF_WORD_SIZE)) ": enable non-executable stack");
|
|
|
|
|
2019-02-07 02:17:34 +00:00
|
|
|
#if __ELF_WORD_SIZE == 32 && (defined(__amd64__) || defined(__i386__))
|
2011-10-15 12:35:18 +00:00
|
|
|
int i386_read_exec = 0;
|
|
|
|
SYSCTL_INT(_kern_elf32, OID_AUTO, read_exec, CTLFLAG_RW, &i386_read_exec, 0,
|
|
|
|
"enable execution from readable segments");
|
|
|
|
#endif
|
|
|
|
|
2019-09-21 18:00:23 +00:00
|
|
|
/*
 * Value behind the pie_base sysctl ("PIE load base without
 * randomization"); defaults to ET_DYN_LOAD_ADDR and is updated by
 * sysctl_pie_base().
 */
static u_long __elfN(pie_base) = ET_DYN_LOAD_ADDR;
|
2019-09-21 20:03:17 +00:00
|
|
|
static int
|
|
|
|
sysctl_pie_base(SYSCTL_HANDLER_ARGS)
|
|
|
|
{
|
|
|
|
u_long val;
|
|
|
|
int error;
|
|
|
|
|
|
|
|
val = __elfN(pie_base);
|
|
|
|
error = sysctl_handle_long(oidp, &val, 0, req);
|
|
|
|
if (error != 0 || req->newptr == NULL)
|
|
|
|
return (error);
|
|
|
|
if ((val & PAGE_MASK) != 0)
|
|
|
|
return (EINVAL);
|
|
|
|
__elfN(pie_base) = val;
|
|
|
|
return (0);
|
|
|
|
}
|
|
|
|
SYSCTL_PROC(__CONCAT(_kern_elf, __ELF_WORD_SIZE), OID_AUTO, pie_base,
|
|
|
|
CTLTYPE_ULONG | CTLFLAG_MPSAFE | CTLFLAG_RW, NULL, 0,
|
|
|
|
sysctl_pie_base, "LU",
|
2019-09-21 18:00:23 +00:00
|
|
|
"PIE load base without randomization");
|
|
|
|
|
Implement Address Space Layout Randomization (ASLR)
With this change, randomization can be enabled for all non-fixed
mappings. It means that the base address for the mapping is selected
with a guaranteed amount of entropy (bits). If the mapping was
requested to be superpage aligned, the randomization honours the
superpage attributes.
Although the value of ASLR is diminishing over time as exploit authors
work out simple ASLR bypass techniques, it eliminates the trivial
exploitation of certain vulnerabilities, at least in theory. This
implementation is relatively small and happens at the correct
architectural level. Also, it is not expected to introduce
regressions in existing cases when turned off (default for now), or
cause any significant maintenance burden.
The randomization is done on a best-effort basis - that is, the
allocator falls back to a first fit strategy if fragmentation prevents
entropy injection. It is trivial to implement a strong mode where
failure to guarantee the requested amount of entropy results in
mapping request failure, but I do not consider that to be usable.
I have not fine-tuned the amount of entropy injected right now. It is
only a quantitative change that will not change the implementation. The
current amount is controlled by aslr_pages_rnd.
To not spoil coalescing optimizations, to reduce the page table
fragmentation inherent to ASLR, and to keep the transient superpage
promotion for the malloced memory, locality clustering is implemented
for anonymous private mappings, which are automatically grouped until
fragmentation kicks in. The initial location for the anon group range
is, of course, randomized. This is controlled by vm.cluster_anon,
enabled by default.
The default mode keeps the sbrk area unpopulated by other mappings,
but this can be turned off, which gives much more breathing bits on
architectures with small address space, such as i386. This is tied
with the question of following an application's hint about the mmap(2)
base address. Testing shows that ignoring the hint does not affect the
function of common applications, but I would expect more demanding
code could break. By default sbrk is preserved and mmap hints are
satisfied, which can be changed by using the
kern.elf{32,64}.aslr.honor_sbrk sysctl.
ASLR is enabled on per-ABI basis, and currently it is only allowed on
FreeBSD native i386 and amd64 (including compat 32bit) ABIs. Support
for additional architectures will be added after further testing.
Both per-process and per-image controls are implemented:
- procctl(2) adds PROC_ASLR_CTL/PROC_ASLR_STATUS;
- NT_FREEBSD_FCTL_ASLR_DISABLE feature control note bit makes it possible
to force ASLR off for the given binary. (A tool to edit the feature
control note is in development.)
Global controls are:
- kern.elf{32,64}.aslr.enable - for non-fixed mappings done by mmap(2);
- kern.elf{32,64}.aslr.pie_enable - for PIE image activation mappings;
- kern.elf{32,64}.aslr.honor_sbrk - allow to use sbrk area for mmap(2);
- vm.cluster_anon - enables anon mapping clustering.
PR: 208580 (exp runs)
Exp-runs done by: antoine
Reviewed by: markj (previous version)
Discussed with: emaste
Tested by: pho
MFC after: 1 month
Sponsored by: The FreeBSD Foundation
Differential revision: https://reviews.freebsd.org/D5603
2019-02-10 17:19:45 +00:00
|
|
|
SYSCTL_NODE(__CONCAT(_kern_elf, __ELF_WORD_SIZE), OID_AUTO, aslr, CTLFLAG_RW, 0,
|
|
|
|
"");
|
|
|
|
#define ASLR_NODE_OID __CONCAT(__CONCAT(_kern_elf, __ELF_WORD_SIZE), _aslr)
|
|
|
|
|
|
|
|
static int __elfN(aslr_enabled) = 0;
|
|
|
|
SYSCTL_INT(ASLR_NODE_OID, OID_AUTO, enable, CTLFLAG_RWTUN,
|
|
|
|
&__elfN(aslr_enabled), 0,
|
|
|
|
__XSTRING(__CONCAT(ELF, __ELF_WORD_SIZE))
|
|
|
|
": enable address map randomization");
|
|
|
|
|
|
|
|
static int __elfN(pie_aslr_enabled) = 0;
|
|
|
|
SYSCTL_INT(ASLR_NODE_OID, OID_AUTO, pie_enable, CTLFLAG_RWTUN,
|
|
|
|
&__elfN(pie_aslr_enabled), 0,
|
|
|
|
__XSTRING(__CONCAT(ELF, __ELF_WORD_SIZE))
|
|
|
|
": enable address map randomization for PIE binaries");
|
|
|
|
|
|
|
|
static int __elfN(aslr_honor_sbrk) = 1;
|
|
|
|
SYSCTL_INT(ASLR_NODE_OID, OID_AUTO, honor_sbrk, CTLFLAG_RW,
|
|
|
|
&__elfN(aslr_honor_sbrk), 0,
|
|
|
|
__XSTRING(__CONCAT(ELF, __ELF_WORD_SIZE)) ": assume sbrk is used");
|
|
|
|
|
2019-07-31 20:23:10 +00:00
|
|
|
static int __elfN(aslr_stack_gap) = 3;
|
|
|
|
SYSCTL_INT(ASLR_NODE_OID, OID_AUTO, stack_gap, CTLFLAG_RW,
|
|
|
|
&__elfN(aslr_stack_gap), 0,
|
|
|
|
__XSTRING(__CONCAT(ELF, __ELF_WORD_SIZE))
|
|
|
|
": maximum percentage of main stack to waste on a random gap");
|
|
|
|
|
2002-07-20 02:56:12 +00:00
|
|
|
/*
 * Table of registered brands.  A NULL element is a free slot:
 * insert_brand_entry() fills the first NULL slot it finds and
 * remove_brand_entry() resets the matching slot back to NULL.
 */
static Elf_Brandinfo *elf_brand_list[MAX_BRANDS];
|
1996-03-10 08:42:54 +00:00
|
|
|
|
2019-03-23 13:41:14 +00:00
|
|
|
/*
 * True when address `a' is unchanged by rounding it down (rounddown2) to
 * sizeof(t), i.e. when it already sits on a sizeof(t) boundary.
 */
#define aligned(a, t) (rounddown2((u_long)(a), sizeof(t)) == (u_long)(a))
|
2007-12-04 12:21:27 +00:00
|
|
|
|
2009-03-13 16:40:51 +00:00
|
|
|
/*
 * Vendor name used by the FreeBSD brand note; its sizeof (terminating NUL
 * included) is what the note's n_namesz is initialised to.
 */
static const char FREEBSD_ABI_VENDOR[] = "FreeBSD";
|
|
|
|
|
|
|
|
Elf_Brandnote __elfN(freebsd_brandnote) = {
|
|
|
|
.hdr.n_namesz = sizeof(FREEBSD_ABI_VENDOR),
|
|
|
|
.hdr.n_descsz = sizeof(int32_t),
|
2015-12-07 18:43:27 +00:00
|
|
|
.hdr.n_type = NT_FREEBSD_ABI_TAG,
|
2009-03-13 16:40:51 +00:00
|
|
|
.vendor = FREEBSD_ABI_VENDOR,
|
Fix handling of .note.ABI-tag section for GNU systems [1].
Handle GNU/Linux according to LSB Core Specification 4.0,
Chapter 11. Object Format, 11.8. ABI note tag.
Also check the first word of desc, not only name, according to
glibc abi-tags specification to distinguish between Linux and
kFreeBSD.
Add explicit handling for Debian GNU/kFreeBSD, which runs
on our kernels as well [2].
In {amd64,i386}/trap.c, when checking osrel of the current process,
also check the ABI to not change the signal behaviour for Linux
binary processes, now that we save an osrel version for all three
from the lists above in struct proc [2].
These changes make it possible to run FreeBSD, Debian GNU/kFreeBSD
and Linux binaries on the same machine again for at least i386 and
amd64, and no longer break kFreeBSD which was detected as GNU(/Linux).
PR: kern/135468
Submitted by: dchagin [1] (initial patch)
Suggested by: kib [2]
Tested by: Petr Salinger (Petr.Salinger seznam.cz) for kFreeBSD
Reviewed by: kib
MFC after: 3 days
2009-08-24 16:19:47 +00:00
|
|
|
.flags = BN_TRANSLATE_OSREL,
|
|
|
|
.trans_osrel = __elfN(freebsd_trans_osrel)
|
2009-03-13 16:40:51 +00:00
|
|
|
};
|
|
|
|
|
2018-03-13 16:40:29 +00:00
|
|
|
static bool
|
Fix handling of .note.ABI-tag section for GNU systems [1].
Handle GNU/Linux according to LSB Core Specification 4.0,
Chapter 11. Object Format, 11.8. ABI note tag.
Also check the first word of desc, not only name, according to
glibc abi-tags specification to distinguish between Linux and
kFreeBSD.
Add explicit handling for Debian GNU/kFreeBSD, which runs
on our kernels as well [2].
In {amd64,i386}/trap.c, when checking osrel of the current process,
also check the ABI to not change the signal behaviour for Linux
binary processes, now that we save an osrel version for all three
from the lists above in struct proc [2].
These changes make it possible to run FreeBSD, Debian GNU/kFreeBSD
and Linux binaries on the same machine again for at least i386 and
amd64, and no longer break kFreeBSD which was detected as GNU(/Linux).
PR: kern/135468
Submitted by: dchagin [1] (initial patch)
Suggested by: kib [2]
Tested by: Petr Salinger (Petr.Salinger seznam.cz) for kFreeBSD
Reviewed by: kib
MFC after: 3 days
2009-08-24 16:19:47 +00:00
|
|
|
__elfN(freebsd_trans_osrel)(const Elf_Note *note, int32_t *osrel)
|
|
|
|
{
|
|
|
|
uintptr_t p;
|
|
|
|
|
|
|
|
p = (uintptr_t)(note + 1);
|
2013-05-01 14:59:16 +00:00
|
|
|
p += roundup2(note->n_namesz, ELF_NOTE_ROUNDSIZE);
|
Fix handling of .note.ABI-tag section for GNU systems [1].
Handle GNU/Linux according to LSB Core Specification 4.0,
Chapter 11. Object Format, 11.8. ABI note tag.
Also check the first word of desc, not only name, according to
glibc abi-tags specification to distinguish between Linux and
kFreeBSD.
Add explicit handling for Debian GNU/kFreeBSD, which runs
on our kernels as well [2].
In {amd64,i386}/trap.c, when checking osrel of the current process,
also check the ABI to not change the signal behaviour for Linux
binary processes, now that we save an osrel version for all three
from the lists above in struct proc [2].
These changes make it possible to run FreeBSD, Debian GNU/kFreeBSD
and Linux binaries on the same machine again for at least i386 and
amd64, and no longer break kFreeBSD which was detected as GNU(/Linux).
PR: kern/135468
Submitted by: dchagin [1] (initial patch)
Suggested by: kib [2]
Tested by: Petr Salinger (Petr.Salinger seznam.cz) for kFreeBSD
Reviewed by: kib
MFC after: 3 days
2009-08-24 16:19:47 +00:00
|
|
|
*osrel = *(const int32_t *)(p);
|
|
|
|
|
2018-03-13 16:40:29 +00:00
|
|
|
return (true);
|
Fix handling of .note.ABI-tag section for GNU systems [1].
Handle GNU/Linux according to LSB Core Specification 4.0,
Chapter 11. Object Format, 11.8. ABI note tag.
Also check the first word of desc, not only name, according to
glibc abi-tags specification to distinguish between Linux and
kFreeBSD.
Add explicit handling for Debian GNU/kFreeBSD, which runs
on our kernels as well [2].
In {amd64,i386}/trap.c, when checking osrel of the current process,
also check the ABI to not change the signal behaviour for Linux
binary processes, now that we save an osrel version for all three
from the lists above in struct proc [2].
These changes make it possible to run FreeBSD, Debian GNU/kFreeBSD
and Linux binaries on the same machine again for at least i386 and
amd64, and no longer break kFreeBSD which was detected as GNU(/Linux).
PR: kern/135468
Submitted by: dchagin [1] (initial patch)
Suggested by: kib [2]
Tested by: Petr Salinger (Petr.Salinger seznam.cz) for kFreeBSD
Reviewed by: kib
MFC after: 3 days
2009-08-24 16:19:47 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
/*
 * Vendor name used by the kFreeBSD brand note; its sizeof (terminating
 * NUL included) is what the note's n_namesz is initialised to.
 */
static const char GNU_ABI_VENDOR[] = "GNU";
|
|
|
|
/*
 * Value the first descriptor word of a "GNU" ABI note must hold for
 * kfreebsd_trans_osrel() to accept the note.
 */
static int GNU_KFREEBSD_ABI_DESC = 3;
|
|
|
|
|
|
|
|
Elf_Brandnote __elfN(kfreebsd_brandnote) = {
|
|
|
|
.hdr.n_namesz = sizeof(GNU_ABI_VENDOR),
|
|
|
|
.hdr.n_descsz = 16, /* XXX at least 16 */
|
|
|
|
.hdr.n_type = 1,
|
|
|
|
.vendor = GNU_ABI_VENDOR,
|
|
|
|
.flags = BN_TRANSLATE_OSREL,
|
|
|
|
.trans_osrel = kfreebsd_trans_osrel
|
|
|
|
};
|
|
|
|
|
2018-03-13 16:40:29 +00:00
|
|
|
static bool
|
Fix handling of .note.ABI-tag section for GNU systems [1].
Handle GNU/Linux according to LSB Core Specification 4.0,
Chapter 11. Object Format, 11.8. ABI note tag.
Also check the first word of desc, not only name, according to
glibc abi-tags specification to distinguish between Linux and
kFreeBSD.
Add explicit handling for Debian GNU/kFreeBSD, which runs
on our kernels as well [2].
In {amd64,i386}/trap.c, when checking osrel of the current process,
also check the ABI to not change the signal behaviour for Linux
binary processes, now that we save an osrel version for all three
from the lists above in struct proc [2].
These changes make it possible to run FreeBSD, Debian GNU/kFreeBSD
and Linux binaries on the same machine again for at least i386 and
amd64, and no longer break kFreeBSD which was detected as GNU(/Linux).
PR: kern/135468
Submitted by: dchagin [1] (initial patch)
Suggested by: kib [2]
Tested by: Petr Salinger (Petr.Salinger seznam.cz) for kFreeBSD
Reviewed by: kib
MFC after: 3 days
2009-08-24 16:19:47 +00:00
|
|
|
kfreebsd_trans_osrel(const Elf_Note *note, int32_t *osrel)
|
|
|
|
{
|
|
|
|
const Elf32_Word *desc;
|
|
|
|
uintptr_t p;
|
|
|
|
|
|
|
|
p = (uintptr_t)(note + 1);
|
2013-05-01 14:59:16 +00:00
|
|
|
p += roundup2(note->n_namesz, ELF_NOTE_ROUNDSIZE);
|
Fix handling of .note.ABI-tag section for GNU systems [1].
Handle GNU/Linux according to LSB Core Specification 4.0,
Chapter 11. Object Format, 11.8. ABI note tag.
Also check the first word of desc, not only name, according to
glibc abi-tags specification to distinguish between Linux and
kFreeBSD.
Add explicit handling for Debian GNU/kFreeBSD, which runs
on our kernels as well [2].
In {amd64,i386}/trap.c, when checking osrel of the current process,
also check the ABI to not change the signal behaviour for Linux
binary processes, now that we save an osrel version for all three
from the lists above in struct proc [2].
These changes make it possible to run FreeBSD, Debian GNU/kFreeBSD
and Linux binaries on the same machine again for at least i386 and
amd64, and no longer break kFreeBSD which was detected as GNU(/Linux).
PR: kern/135468
Submitted by: dchagin [1] (initial patch)
Suggested by: kib [2]
Tested by: Petr Salinger (Petr.Salinger seznam.cz) for kFreeBSD
Reviewed by: kib
MFC after: 3 days
2009-08-24 16:19:47 +00:00
|
|
|
|
|
|
|
desc = (const Elf32_Word *)p;
|
|
|
|
if (desc[0] != GNU_KFREEBSD_ABI_DESC)
|
2018-03-13 16:40:29 +00:00
|
|
|
return (false);
|
Fix handling of .note.ABI-tag section for GNU systems [1].
Handle GNU/Linux according to LSB Core Specification 4.0,
Chapter 11. Object Format, 11.8. ABI note tag.
Also check the first word of desc, not only name, according to
glibc abi-tags specification to distinguish between Linux and
kFreeBSD.
Add explicit handling for Debian GNU/kFreeBSD, which runs
on our kernels as well [2].
In {amd64,i386}/trap.c, when checking osrel of the current process,
also check the ABI to not change the signal behaviour for Linux
binary processes, now that we save an osrel version for all three
from the lists above in struct proc [2].
These changes make it possible to run FreeBSD, Debian GNU/kFreeBSD
and Linux binaries on the same machine again for at least i386 and
amd64, and no longer break kFreeBSD which was detected as GNU(/Linux).
PR: kern/135468
Submitted by: dchagin [1] (initial patch)
Suggested by: kib [2]
Tested by: Petr Salinger (Petr.Salinger seznam.cz) for kFreeBSD
Reviewed by: kib
MFC after: 3 days
2009-08-24 16:19:47 +00:00
|
|
|
|
|
|
|
/*
|
|
|
|
* Debian GNU/kFreeBSD embed the earliest compatible kernel version
|
|
|
|
* (__FreeBSD_version: <major><two digit minor>Rxx) in the LSB way.
|
|
|
|
*/
|
|
|
|
*osrel = desc[1] * 100000 + desc[2] * 1000 + desc[3];
|
|
|
|
|
2018-03-13 16:40:29 +00:00
|
|
|
return (true);
|
Fix handling of .note.ABI-tag section for GNU systems [1].
Handle GNU/Linux according to LSB Core Specification 4.0,
Chapter 11. Object Format, 11.8. ABI note tag.
Also check the first word of desc, not only name, according to
glibc abi-tags specification to distinguish between Linux and
kFreeBSD.
Add explicit handling for Debian GNU/kFreeBSD, which runs
on our kernels as well [2].
In {amd64,i386}/trap.c, when checking osrel of the current process,
also check the ABI to not change the signal behaviour for Linux
binary processes, now that we save an osrel version for all three
from the lists above in struct proc [2].
These changes make it possible to run FreeBSD, Debian GNU/kFreeBSD
and Linux binaries on the same machine again for at least i386 and
amd64, and no longer break kFreeBSD which was detected as GNU(/Linux).
PR: kern/135468
Submitted by: dchagin [1] (initial patch)
Suggested by: kib [2]
Tested by: Petr Salinger (Petr.Salinger seznam.cz) for kFreeBSD
Reviewed by: kib
MFC after: 3 days
2009-08-24 16:19:47 +00:00
|
|
|
}
|
|
|
|
|
1996-03-10 08:42:54 +00:00
|
|
|
int
|
2002-07-20 02:56:12 +00:00
|
|
|
__elfN(insert_brand_entry)(Elf_Brandinfo *entry)
|
1996-03-10 08:42:54 +00:00
|
|
|
{
|
|
|
|
int i;
|
|
|
|
|
2002-08-24 22:55:16 +00:00
|
|
|
for (i = 0; i < MAX_BRANDS; i++) {
|
1996-10-16 17:51:08 +00:00
|
|
|
if (elf_brand_list[i] == NULL) {
|
|
|
|
elf_brand_list[i] = entry;
|
1996-03-10 08:42:54 +00:00
|
|
|
break;
|
|
|
|
}
|
|
|
|
}
|
2009-10-03 10:50:00 +00:00
|
|
|
if (i == MAX_BRANDS) {
|
|
|
|
printf("WARNING: %s: could not insert brandinfo entry: %p\n",
|
|
|
|
__func__, entry);
|
2002-08-24 22:01:40 +00:00
|
|
|
return (-1);
|
2009-10-03 10:50:00 +00:00
|
|
|
}
|
2002-08-24 22:01:40 +00:00
|
|
|
return (0);
|
1996-03-10 08:42:54 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
int
|
2002-07-20 02:56:12 +00:00
|
|
|
__elfN(remove_brand_entry)(Elf_Brandinfo *entry)
|
1996-03-10 08:42:54 +00:00
|
|
|
{
|
|
|
|
int i;
|
|
|
|
|
2002-08-24 22:55:16 +00:00
|
|
|
for (i = 0; i < MAX_BRANDS; i++) {
|
1996-10-16 17:51:08 +00:00
|
|
|
if (elf_brand_list[i] == entry) {
|
|
|
|
elf_brand_list[i] = NULL;
|
1996-03-10 08:42:54 +00:00
|
|
|
break;
|
|
|
|
}
|
|
|
|
}
|
1996-10-16 17:51:08 +00:00
|
|
|
if (i == MAX_BRANDS)
|
2002-08-24 22:01:40 +00:00
|
|
|
return (-1);
|
|
|
|
return (0);
|
1996-03-10 08:42:54 +00:00
|
|
|
}
|
|
|
|
|
1999-02-04 12:42:39 +00:00
|
|
|
int
|
2002-07-20 02:56:12 +00:00
|
|
|
__elfN(brand_inuse)(Elf_Brandinfo *entry)
|
1999-02-04 12:42:39 +00:00
|
|
|
{
|
|
|
|
struct proc *p;
|
2000-11-22 07:42:04 +00:00
|
|
|
int rval = FALSE;
|
1999-02-04 12:42:39 +00:00
|
|
|
|
2001-03-28 11:52:56 +00:00
|
|
|
sx_slock(&allproc_lock);
|
2007-01-17 14:58:53 +00:00
|
|
|
FOREACH_PROC_IN_SYSTEM(p) {
|
2000-11-22 07:42:04 +00:00
|
|
|
if (p->p_sysent == entry->sysvec) {
|
|
|
|
rval = TRUE;
|
|
|
|
break;
|
|
|
|
}
|
1999-02-04 12:42:39 +00:00
|
|
|
}
|
2001-03-28 11:52:56 +00:00
|
|
|
sx_sunlock(&allproc_lock);
|
1999-02-04 12:42:39 +00:00
|
|
|
|
2000-11-22 07:42:04 +00:00
|
|
|
return (rval);
|
1999-02-04 12:42:39 +00:00
|
|
|
}
|
|
|
|
|
2002-09-02 04:50:57 +00:00
|
|
|
static Elf_Brandinfo *
|
2009-03-13 16:40:51 +00:00
|
|
|
__elfN(get_brandinfo)(struct image_params *imgp, const char *interp,
|
2019-03-28 21:43:01 +00:00
|
|
|
int32_t *osrel, uint32_t *fctl0)
|
2002-09-02 04:50:57 +00:00
|
|
|
{
|
2009-03-13 16:40:51 +00:00
|
|
|
const Elf_Ehdr *hdr = (const Elf_Ehdr *)imgp->image_header;
|
2016-02-04 20:55:49 +00:00
|
|
|
Elf_Brandinfo *bi, *bi_m;
|
2009-03-13 16:40:51 +00:00
|
|
|
boolean_t ret;
|
2019-03-28 21:43:01 +00:00
|
|
|
int i, interp_name_len;
|
|
|
|
|
2019-03-30 16:58:51 +00:00
|
|
|
interp_name_len = interp != NULL ? strlen(interp) + 1 : 0;
|
2002-09-02 04:50:57 +00:00
|
|
|
|
|
|
|
/*
|
2009-03-13 16:40:51 +00:00
|
|
|
* We support four types of branding -- (1) the ELF EI_OSABI field
|
2002-09-02 04:50:57 +00:00
|
|
|
* that SCO added to the ELF spec, (2) FreeBSD 3.x's traditional string
|
2009-03-13 16:40:51 +00:00
|
|
|
* branding w/in the ELF header, (3) path of the `interp_path'
|
|
|
|
* field, and (4) the ".note.ABI-tag" ELF section.
|
2002-09-02 04:50:57 +00:00
|
|
|
*/
|
|
|
|
|
2009-03-13 16:40:51 +00:00
|
|
|
/* Look for an ".note.ABI-tag" ELF section */
|
2016-02-04 20:55:49 +00:00
|
|
|
bi_m = NULL;
|
2009-03-13 16:40:51 +00:00
|
|
|
for (i = 0; i < MAX_BRANDS; i++) {
|
|
|
|
bi = elf_brand_list[i];
|
2009-08-30 14:38:17 +00:00
|
|
|
if (bi == NULL)
|
|
|
|
continue;
|
2017-03-22 22:28:13 +00:00
|
|
|
if (interp != NULL && (bi->flags & BI_BRAND_ONLY_STATIC) != 0)
|
2017-03-22 22:23:01 +00:00
|
|
|
continue;
|
2009-08-30 14:38:17 +00:00
|
|
|
if (hdr->e_machine == bi->machine && (bi->flags &
|
|
|
|
(BI_BRAND_NOTE|BI_BRAND_NOTE_MANDATORY)) != 0) {
|
2018-11-23 23:33:55 +00:00
|
|
|
ret = __elfN(check_note)(imgp, bi->brand_note, osrel,
|
|
|
|
fctl0);
|
2015-12-01 17:00:31 +00:00
|
|
|
/* Give brand a chance to veto check_note's guess */
|
|
|
|
if (ret && bi->header_supported)
|
|
|
|
ret = bi->header_supported(imgp);
|
2016-02-04 20:55:49 +00:00
|
|
|
/*
|
|
|
|
* If note checker claimed the binary, but the
|
|
|
|
* interpreter path in the image does not
|
|
|
|
* match default one for the brand, try to
|
|
|
|
* search for other brands with the same
|
|
|
|
* interpreter. Either there is better brand
|
|
|
|
* with the right interpreter, or, failing
|
|
|
|
* this, we return first brand which accepted
|
|
|
|
* our note and, optionally, header.
|
|
|
|
*/
|
2017-03-30 04:21:02 +00:00
|
|
|
if (ret && bi_m == NULL && interp != NULL &&
|
|
|
|
(bi->interp_path == NULL ||
|
|
|
|
(strlen(bi->interp_path) + 1 != interp_name_len ||
|
|
|
|
strncmp(interp, bi->interp_path, interp_name_len)
|
|
|
|
!= 0))) {
|
2016-02-04 20:55:49 +00:00
|
|
|
bi_m = bi;
|
|
|
|
ret = 0;
|
|
|
|
}
|
2009-03-13 16:40:51 +00:00
|
|
|
if (ret)
|
|
|
|
return (bi);
|
|
|
|
}
|
|
|
|
}
|
2016-02-04 20:55:49 +00:00
|
|
|
if (bi_m != NULL)
|
|
|
|
return (bi_m);
|
2009-03-13 16:40:51 +00:00
|
|
|
|
2002-09-02 04:50:57 +00:00
|
|
|
/* If the executable has a brand, search for it in the brand list. */
|
|
|
|
for (i = 0; i < MAX_BRANDS; i++) {
|
|
|
|
bi = elf_brand_list[i];
|
2017-03-22 22:23:01 +00:00
|
|
|
if (bi == NULL || (bi->flags & BI_BRAND_NOTE_MANDATORY) != 0 ||
|
2017-03-22 22:28:13 +00:00
|
|
|
(interp != NULL && (bi->flags & BI_BRAND_ONLY_STATIC) != 0))
|
2009-08-30 14:38:17 +00:00
|
|
|
continue;
|
|
|
|
if (hdr->e_machine == bi->machine &&
|
2002-09-02 04:50:57 +00:00
|
|
|
(hdr->e_ident[EI_OSABI] == bi->brand ||
|
2017-03-23 14:09:45 +00:00
|
|
|
(bi->compat_3_brand != NULL &&
|
2017-03-07 13:37:35 +00:00
|
|
|
strcmp((const char *)&hdr->e_ident[OLD_EI_BRAND],
|
2017-03-23 14:09:45 +00:00
|
|
|
bi->compat_3_brand) == 0))) {
|
2015-11-18 17:03:22 +00:00
|
|
|
/* Looks good, but give brand a chance to veto */
|
2018-02-05 23:27:42 +00:00
|
|
|
if (bi->header_supported == NULL ||
|
2017-03-07 13:38:25 +00:00
|
|
|
bi->header_supported(imgp)) {
|
|
|
|
/*
|
|
|
|
* Again, prefer strictly matching
|
|
|
|
* interpreter path.
|
|
|
|
*/
|
2017-03-22 22:06:48 +00:00
|
|
|
if (interp_name_len == 0 &&
|
|
|
|
bi->interp_path == NULL)
|
|
|
|
return (bi);
|
|
|
|
if (bi->interp_path != NULL &&
|
|
|
|
strlen(bi->interp_path) + 1 ==
|
2017-03-07 13:38:25 +00:00
|
|
|
interp_name_len && strncmp(interp,
|
|
|
|
bi->interp_path, interp_name_len) == 0)
|
|
|
|
return (bi);
|
|
|
|
if (bi_m == NULL)
|
|
|
|
bi_m = bi;
|
|
|
|
}
|
2015-11-18 17:03:22 +00:00
|
|
|
}
|
2002-09-02 04:50:57 +00:00
|
|
|
}
|
2017-03-07 13:38:25 +00:00
|
|
|
if (bi_m != NULL)
|
|
|
|
return (bi_m);
|
2002-09-02 04:50:57 +00:00
|
|
|
|
2014-08-18 02:44:56 +00:00
|
|
|
/* No known brand, see if the header is recognized by any brand */
|
|
|
|
for (i = 0; i < MAX_BRANDS; i++) {
|
|
|
|
bi = elf_brand_list[i];
|
|
|
|
if (bi == NULL || bi->flags & BI_BRAND_NOTE_MANDATORY ||
|
|
|
|
bi->header_supported == NULL)
|
|
|
|
continue;
|
|
|
|
if (hdr->e_machine == bi->machine) {
|
|
|
|
ret = bi->header_supported(imgp);
|
|
|
|
if (ret)
|
|
|
|
return (bi);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2002-09-02 04:50:57 +00:00
|
|
|
/* Lacking a known brand, search for a recognized interpreter. */
|
|
|
|
if (interp != NULL) {
|
|
|
|
for (i = 0; i < MAX_BRANDS; i++) {
|
|
|
|
bi = elf_brand_list[i];
|
2017-03-22 22:28:13 +00:00
|
|
|
if (bi == NULL || (bi->flags &
|
|
|
|
(BI_BRAND_NOTE_MANDATORY | BI_BRAND_ONLY_STATIC))
|
|
|
|
!= 0)
|
2009-08-30 14:38:17 +00:00
|
|
|
continue;
|
|
|
|
if (hdr->e_machine == bi->machine &&
|
2017-03-30 04:21:02 +00:00
|
|
|
bi->interp_path != NULL &&
|
2012-07-19 11:15:53 +00:00
|
|
|
/* ELF image p_filesz includes terminating zero */
|
|
|
|
strlen(bi->interp_path) + 1 == interp_name_len &&
|
|
|
|
strncmp(interp, bi->interp_path, interp_name_len)
|
2018-02-05 23:27:42 +00:00
|
|
|
== 0 && (bi->header_supported == NULL ||
|
|
|
|
bi->header_supported(imgp)))
|
2002-09-02 04:50:57 +00:00
|
|
|
return (bi);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
/* Lacking a recognized interpreter, try the default brand */
|
|
|
|
for (i = 0; i < MAX_BRANDS; i++) {
|
|
|
|
bi = elf_brand_list[i];
|
2017-03-22 22:23:01 +00:00
|
|
|
if (bi == NULL || (bi->flags & BI_BRAND_NOTE_MANDATORY) != 0 ||
|
2017-03-22 22:28:13 +00:00
|
|
|
(interp != NULL && (bi->flags & BI_BRAND_ONLY_STATIC) != 0))
|
2009-08-30 14:38:17 +00:00
|
|
|
continue;
|
|
|
|
if (hdr->e_machine == bi->machine &&
|
2018-02-05 23:27:42 +00:00
|
|
|
__elfN(fallback_brand) == bi->brand &&
|
|
|
|
(bi->header_supported == NULL ||
|
|
|
|
bi->header_supported(imgp)))
|
2002-09-02 04:50:57 +00:00
|
|
|
return (bi);
|
|
|
|
}
|
|
|
|
return (NULL);
|
|
|
|
}
|
|
|
|
|
1996-03-10 08:42:54 +00:00
|
|
|
static int
|
2002-07-20 02:56:12 +00:00
|
|
|
__elfN(check_header)(const Elf_Ehdr *hdr)
|
1996-03-10 08:42:54 +00:00
|
|
|
{
|
2002-09-08 02:17:44 +00:00
|
|
|
Elf_Brandinfo *bi;
|
2002-07-20 02:56:12 +00:00
|
|
|
int i;
|
|
|
|
|
1998-10-18 15:55:12 +00:00
|
|
|
if (!IS_ELF(*hdr) ||
|
|
|
|
hdr->e_ident[EI_CLASS] != ELF_TARG_CLASS ||
|
|
|
|
hdr->e_ident[EI_DATA] != ELF_TARG_DATA ||
|
2004-03-18 16:33:05 +00:00
|
|
|
hdr->e_ident[EI_VERSION] != EV_CURRENT ||
|
|
|
|
hdr->e_phentsize != sizeof(Elf_Phdr) ||
|
|
|
|
hdr->e_version != ELF_TARG_VER)
|
2002-08-24 22:01:40 +00:00
|
|
|
return (ENOEXEC);
|
1996-03-10 08:42:54 +00:00
|
|
|
|
2002-07-20 02:56:12 +00:00
|
|
|
/*
|
|
|
|
* Make sure we have at least one brand for this machine.
|
|
|
|
*/
|
|
|
|
|
2002-08-24 22:55:16 +00:00
|
|
|
for (i = 0; i < MAX_BRANDS; i++) {
|
2002-09-08 02:17:44 +00:00
|
|
|
bi = elf_brand_list[i];
|
|
|
|
if (bi != NULL && bi->machine == hdr->e_machine)
|
2002-07-20 02:56:12 +00:00
|
|
|
break;
|
|
|
|
}
|
|
|
|
if (i == MAX_BRANDS)
|
2002-08-24 22:01:40 +00:00
|
|
|
return (ENOEXEC);
|
1996-03-10 08:42:54 +00:00
|
|
|
|
2002-08-24 22:01:40 +00:00
|
|
|
return (0);
|
1996-03-10 08:42:54 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
static int
|
2002-07-20 02:56:12 +00:00
|
|
|
__elfN(map_partial)(vm_map_t map, vm_object_t object, vm_ooffset_t offset,
|
2005-12-20 23:42:18 +00:00
|
|
|
vm_offset_t start, vm_offset_t end, vm_prot_t prot)
|
2002-07-20 02:56:12 +00:00
|
|
|
{
|
2005-12-16 18:34:14 +00:00
|
|
|
struct sf_buf *sf;
|
|
|
|
int error;
|
2002-07-20 02:56:12 +00:00
|
|
|
vm_offset_t off;
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Create the page if it doesn't exist yet. Ignore errors.
|
|
|
|
*/
|
2017-03-06 14:09:54 +00:00
|
|
|
vm_map_fixed(map, NULL, 0, trunc_page(start), round_page(end) -
|
|
|
|
trunc_page(start), VM_PROT_ALL, VM_PROT_ALL, MAP_CHECK_EXCL);
|
2002-07-20 02:56:12 +00:00
|
|
|
|
|
|
|
/*
|
|
|
|
* Find the page from the underlying object.
|
|
|
|
*/
|
2017-03-05 23:59:04 +00:00
|
|
|
if (object != NULL) {
|
2005-12-16 18:34:14 +00:00
|
|
|
sf = vm_imgact_map_page(object, offset);
|
|
|
|
if (sf == NULL)
|
|
|
|
return (KERN_FAILURE);
|
2002-07-20 02:56:12 +00:00
|
|
|
off = offset - trunc_page(offset);
|
2005-12-16 18:34:14 +00:00
|
|
|
error = copyout((caddr_t)sf_buf_kva(sf) + off, (caddr_t)start,
|
2002-08-25 20:48:45 +00:00
|
|
|
end - start);
|
2013-08-05 08:55:35 +00:00
|
|
|
vm_imgact_unmap_page(sf);
|
2017-03-02 17:35:13 +00:00
|
|
|
if (error != 0)
|
2002-08-24 22:01:40 +00:00
|
|
|
return (KERN_FAILURE);
|
2002-07-20 02:56:12 +00:00
|
|
|
}
|
|
|
|
|
2002-08-24 22:01:40 +00:00
|
|
|
return (KERN_SUCCESS);
|
2002-07-20 02:56:12 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
static int
|
2017-03-01 10:22:07 +00:00
|
|
|
__elfN(map_insert)(struct image_params *imgp, vm_map_t map, vm_object_t object,
|
|
|
|
vm_ooffset_t offset, vm_offset_t start, vm_offset_t end, vm_prot_t prot,
|
|
|
|
int cow)
|
2002-07-20 02:56:12 +00:00
|
|
|
{
|
2005-12-16 18:34:14 +00:00
|
|
|
struct sf_buf *sf;
|
|
|
|
vm_offset_t off;
|
2003-05-31 19:55:05 +00:00
|
|
|
vm_size_t sz;
|
2017-03-01 10:22:07 +00:00
|
|
|
int error, locked, rv;
|
2002-07-20 02:56:12 +00:00
|
|
|
|
|
|
|
if (start != trunc_page(start)) {
|
2002-08-25 22:36:52 +00:00
|
|
|
rv = __elfN(map_partial)(map, object, offset, start,
|
2005-12-20 23:42:18 +00:00
|
|
|
round_page(start), prot);
|
2017-03-05 23:59:04 +00:00
|
|
|
if (rv != KERN_SUCCESS)
|
2002-08-24 22:01:40 +00:00
|
|
|
return (rv);
|
2002-07-20 02:56:12 +00:00
|
|
|
offset += round_page(start) - start;
|
|
|
|
start = round_page(start);
|
|
|
|
}
|
|
|
|
if (end != round_page(end)) {
|
2002-08-25 22:36:52 +00:00
|
|
|
rv = __elfN(map_partial)(map, object, offset +
|
2005-12-20 23:42:18 +00:00
|
|
|
trunc_page(end) - start, trunc_page(end), end, prot);
|
2017-03-05 23:59:04 +00:00
|
|
|
if (rv != KERN_SUCCESS)
|
2002-08-24 22:01:40 +00:00
|
|
|
return (rv);
|
2002-07-20 02:56:12 +00:00
|
|
|
end = trunc_page(end);
|
|
|
|
}
|
2017-03-11 18:57:13 +00:00
|
|
|
if (start >= end)
|
|
|
|
return (KERN_SUCCESS);
|
|
|
|
if ((offset & PAGE_MASK) != 0) {
|
|
|
|
/*
|
|
|
|
* The mapping is not page aligned. This means that we have
|
|
|
|
* to copy the data.
|
|
|
|
*/
|
|
|
|
rv = vm_map_fixed(map, NULL, 0, start, end - start,
|
|
|
|
prot | VM_PROT_WRITE, VM_PROT_ALL, MAP_CHECK_EXCL);
|
|
|
|
if (rv != KERN_SUCCESS)
|
|
|
|
return (rv);
|
|
|
|
if (object == NULL)
|
|
|
|
return (KERN_SUCCESS);
|
|
|
|
for (; start < end; start += sz) {
|
|
|
|
sf = vm_imgact_map_page(object, offset);
|
|
|
|
if (sf == NULL)
|
|
|
|
return (KERN_FAILURE);
|
|
|
|
off = offset - trunc_page(offset);
|
|
|
|
sz = end - start;
|
|
|
|
if (sz > PAGE_SIZE - off)
|
|
|
|
sz = PAGE_SIZE - off;
|
|
|
|
error = copyout((caddr_t)sf_buf_kva(sf) + off,
|
|
|
|
(caddr_t)start, sz);
|
|
|
|
vm_imgact_unmap_page(sf);
|
|
|
|
if (error != 0)
|
|
|
|
return (KERN_FAILURE);
|
|
|
|
offset += sz;
|
2002-07-20 02:56:12 +00:00
|
|
|
}
|
|
|
|
} else {
|
2017-03-11 18:57:13 +00:00
|
|
|
vm_object_reference(object);
|
|
|
|
rv = vm_map_fixed(map, object, offset, start, end - start,
|
Switch to use shared vnode locks for text files during image activation.
kern_execve() locks text vnode exclusive to be able to set and clear
VV_TEXT flag. VV_TEXT is mutually exclusive with the v_writecount > 0
condition.
The change removes VV_TEXT, replacing it with the condition
v_writecount <= -1, and puts v_writecount under the vnode interlock.
Each text reference decrements v_writecount. To clear the text
reference when the segment is unmapped, it is recorded in the
vm_map_entry backed by the text file as MAP_ENTRY_VN_TEXT flag, and
v_writecount is incremented on the map entry removal
The operations like VOP_ADD_WRITECOUNT() and VOP_SET_TEXT() check that
v_writecount does not contradict the desired change. vn_writecheck()
is now racy and its use was eliminated everywhere except access.
Atomic check for writeability and increment of v_writecount is
performed by the VOP. vn_truncate() now increments v_writecount
around VOP_SETATTR() call, lack of which is arguably a bug on its own.
nullfs bypasses v_writecount to the lower vnode always, so nullfs
vnode has its own v_writecount correct, and lower vnode gets all
references, since object->handle is always lower vnode.
On the text vnode' vm object dealloc, the v_writecount value is reset
to zero, and deadfs vop_unset_text short-circuit the operation.
Reclamation of lowervp always reclaims all nullfs vnodes referencing
lowervp first, so no stray references are left.
Reviewed by: markj, trasz
Tested by: mjg, pho
Sponsored by: The FreeBSD Foundation
MFC after: 1 month
Differential revision: https://reviews.freebsd.org/D19923
2019-05-05 11:20:43 +00:00
|
|
|
prot, VM_PROT_ALL, cow | MAP_CHECK_EXCL |
|
|
|
|
(object != NULL ? MAP_VN_EXEC : 0));
|
2017-03-11 18:57:13 +00:00
|
|
|
if (rv != KERN_SUCCESS) {
|
|
|
|
locked = VOP_ISLOCKED(imgp->vp);
|
|
|
|
VOP_UNLOCK(imgp->vp, 0);
|
|
|
|
vm_object_deallocate(object);
|
|
|
|
vn_lock(imgp->vp, locked | LK_RETRY);
|
|
|
|
return (rv);
|
Switch to use shared vnode locks for text files during image activation.
kern_execve() locks text vnode exclusive to be able to set and clear
VV_TEXT flag. VV_TEXT is mutually exclusive with the v_writecount > 0
condition.
The change removes VV_TEXT, replacing it with the condition
v_writecount <= -1, and puts v_writecount under the vnode interlock.
Each text reference decrements v_writecount. To clear the text
reference when the segment is unmapped, it is recorded in the
vm_map_entry backed by the text file as MAP_ENTRY_VN_TEXT flag, and
v_writecount is incremented on the map entry removal
The operations like VOP_ADD_WRITECOUNT() and VOP_SET_TEXT() check that
v_writecount does not contradict the desired change. vn_writecheck()
is now racy and its use was eliminated everywhere except access.
Atomic check for writeability and increment of v_writecount is
performed by the VOP. vn_truncate() now increments v_writecount
around VOP_SETATTR() call, lack of which is arguably a bug on its own.
nullfs bypasses v_writecount to the lower vnode always, so nullfs
vnode has its own v_writecount correct, and lower vnode gets all
references, since object->handle is always lower vnode.
On the text vnode' vm object dealloc, the v_writecount value is reset
to zero, and deadfs vop_unset_text short-circuit the operation.
Reclamation of lowervp always reclaims all nullfs vnodes referencing
lowervp first, so no stray references are left.
Reviewed by: markj, trasz
Tested by: mjg, pho
Sponsored by: The FreeBSD Foundation
MFC after: 1 month
Differential revision: https://reviews.freebsd.org/D19923
2019-05-05 11:20:43 +00:00
|
|
|
} else if (object != NULL) {
|
|
|
|
MPASS(imgp->vp->v_object == object);
|
|
|
|
VOP_SET_TEXT_CHECKED(imgp->vp);
|
2017-03-11 18:57:13 +00:00
|
|
|
}
|
2002-07-20 02:56:12 +00:00
|
|
|
}
|
2017-03-11 18:57:13 +00:00
|
|
|
return (KERN_SUCCESS);
|
2002-07-20 02:56:12 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
static int
|
2017-03-07 13:36:43 +00:00
|
|
|
__elfN(load_section)(struct image_params *imgp, vm_ooffset_t offset,
|
2019-03-01 16:16:38 +00:00
|
|
|
caddr_t vmaddr, size_t memsz, size_t filsz, vm_prot_t prot)
|
1996-03-10 08:42:54 +00:00
|
|
|
{
|
2005-12-16 18:34:14 +00:00
|
|
|
struct sf_buf *sf;
|
1996-03-10 08:42:54 +00:00
|
|
|
size_t map_len;
|
2012-01-17 00:27:32 +00:00
|
|
|
vm_map_t map;
|
|
|
|
vm_object_t object;
|
2019-07-24 15:18:05 +00:00
|
|
|
vm_offset_t map_addr;
|
2002-12-16 19:24:43 +00:00
|
|
|
int error, rv, cow;
|
1996-03-10 08:42:54 +00:00
|
|
|
size_t copy_len;
|
2017-03-07 13:36:43 +00:00
|
|
|
vm_ooffset_t file_addr;
|
1998-10-18 15:55:12 +00:00
|
|
|
|
2000-07-23 06:49:46 +00:00
|
|
|
/*
|
|
|
|
* It's necessary to fail if the filsz + offset taken from the
|
|
|
|
* header is greater than the actual file pager object's size.
|
|
|
|
* If we were to allow this, then the vm_map_find() below would
|
|
|
|
* walk right off the end of the file object and into the ether.
|
|
|
|
*
|
|
|
|
* While I'm here, might as well check for something else that
|
|
|
|
* is invalid: filsz cannot be greater than memsz.
|
|
|
|
*/
|
2017-03-12 13:51:13 +00:00
|
|
|
if ((filsz != 0 && (off_t)filsz + offset > imgp->attr->va_size) ||
|
|
|
|
filsz > memsz) {
|
2000-07-23 06:49:46 +00:00
|
|
|
uprintf("elf_load_section: truncated ELF file\n");
|
|
|
|
return (ENOEXEC);
|
|
|
|
}
|
|
|
|
|
2012-01-17 00:27:32 +00:00
|
|
|
object = imgp->object;
|
|
|
|
map = &imgp->proc->p_vmspace->vm_map;
|
2019-03-23 13:41:14 +00:00
|
|
|
map_addr = trunc_page((vm_offset_t)vmaddr);
|
|
|
|
file_addr = trunc_page(offset);
|
1996-03-10 08:42:54 +00:00
|
|
|
|
1998-10-18 15:55:12 +00:00
|
|
|
/*
|
|
|
|
* We have two choices. We can either clear the data in the last page
|
|
|
|
* of an oversized mapping, or we can start the anon mapping a page
|
|
|
|
* early and copy the initialized data into that first page. We
|
2017-03-05 23:59:04 +00:00
|
|
|
* choose the second.
|
1998-10-18 15:55:12 +00:00
|
|
|
*/
|
2017-03-12 13:51:13 +00:00
|
|
|
if (filsz == 0)
|
|
|
|
map_len = 0;
|
|
|
|
else if (memsz > filsz)
|
2019-03-23 13:41:14 +00:00
|
|
|
map_len = trunc_page(offset + filsz) - file_addr;
|
1996-03-10 08:42:54 +00:00
|
|
|
else
|
2019-03-23 13:41:14 +00:00
|
|
|
map_len = round_page(offset + filsz) - file_addr;
|
1998-10-18 15:55:12 +00:00
|
|
|
|
|
|
|
if (map_len != 0) {
|
2002-12-16 19:24:43 +00:00
|
|
|
/* cow flags: don't dump readonly sections in core */
|
|
|
|
cow = MAP_COPY_ON_WRITE | MAP_PREFAULT |
|
|
|
|
(prot & VM_PROT_WRITE ? 0 : MAP_DISABLE_COREDUMP);
|
|
|
|
|
Switch to use shared vnode locks for text files during image activation.
kern_execve() locks text vnode exclusive to be able to set and clear
VV_TEXT flag. VV_TEXT is mutually exclusive with the v_writecount > 0
condition.
The change removes VV_TEXT, replacing it with the condition
v_writecount <= -1, and puts v_writecount under the vnode interlock.
Each text reference decrements v_writecount. To clear the text
reference when the segment is unmapped, it is recorded in the
vm_map_entry backed by the text file as MAP_ENTRY_VN_TEXT flag, and
v_writecount is incremented on the map entry removal
The operations like VOP_ADD_WRITECOUNT() and VOP_SET_TEXT() check that
v_writecount does not contradict the desired change. vn_writecheck()
is now racy and its use was eliminated everywhere except access.
Atomic check for writeability and increment of v_writecount is
performed by the VOP. vn_truncate() now increments v_writecount
around VOP_SETATTR() call, lack of which is arguably a bug on its own.
nullfs bypasses v_writecount to the lower vnode always, so nullfs
vnode has its own v_writecount correct, and lower vnode gets all
references, since object->handle is always lower vnode.
On the text vnode' vm object dealloc, the v_writecount value is reset
to zero, and deadfs vop_unset_text short-circuit the operation.
Reclamation of lowervp always reclaims all nullfs vnodes referencing
lowervp first, so no stray references are left.
Reviewed by: markj, trasz
Tested by: mjg, pho
Sponsored by: The FreeBSD Foundation
MFC after: 1 month
Differential revision: https://reviews.freebsd.org/D19923
2019-05-05 11:20:43 +00:00
|
|
|
rv = __elfN(map_insert)(imgp, map, object, file_addr,
|
|
|
|
map_addr, map_addr + map_len, prot, cow);
|
2006-01-21 20:11:49 +00:00
|
|
|
if (rv != KERN_SUCCESS)
|
2002-08-24 22:01:40 +00:00
|
|
|
return (EINVAL);
|
1998-10-18 15:55:12 +00:00
|
|
|
|
|
|
|
/* we can stop now if we've covered it all */
|
2017-03-12 13:49:42 +00:00
|
|
|
if (memsz == filsz)
|
2002-08-24 22:01:40 +00:00
|
|
|
return (0);
|
1998-10-18 15:55:12 +00:00
|
|
|
}
|
1996-03-10 08:42:54 +00:00
|
|
|
|
|
|
|
|
|
|
|
/*
|
1998-10-18 15:55:12 +00:00
|
|
|
* We have to get the remaining bit of the file into the first part
|
|
|
|
* of the oversized map segment. This is normally because the .data
|
|
|
|
* segment in the file is extended to provide bss. It's a neat idea
|
|
|
|
* to try and save a page, but it's a pain in the behind to implement.
|
1996-03-10 08:42:54 +00:00
|
|
|
*/
|
2019-03-23 13:41:14 +00:00
|
|
|
copy_len = filsz == 0 ? 0 : (offset + filsz) - trunc_page(offset +
|
|
|
|
filsz);
|
|
|
|
map_addr = trunc_page((vm_offset_t)vmaddr + filsz);
|
|
|
|
map_len = round_page((vm_offset_t)vmaddr + memsz) - map_addr;
|
1996-03-10 08:42:54 +00:00
|
|
|
|
1998-10-18 15:55:12 +00:00
|
|
|
/* This had damn well better be true! */
|
2002-07-20 02:56:12 +00:00
|
|
|
if (map_len != 0) {
|
2017-03-01 10:22:07 +00:00
|
|
|
rv = __elfN(map_insert)(imgp, map, NULL, 0, map_addr,
|
2017-03-18 23:37:00 +00:00
|
|
|
map_addr + map_len, prot, 0);
|
2017-03-12 13:49:42 +00:00
|
|
|
if (rv != KERN_SUCCESS)
|
2002-08-24 22:01:40 +00:00
|
|
|
return (EINVAL);
|
1996-03-10 22:37:34 +00:00
|
|
|
}
|
1996-03-10 08:42:54 +00:00
|
|
|
|
1998-10-18 15:55:12 +00:00
|
|
|
if (copy_len != 0) {
|
2005-12-16 18:34:14 +00:00
|
|
|
sf = vm_imgact_map_page(object, offset + filsz);
|
|
|
|
if (sf == NULL)
|
|
|
|
return (EIO);
|
1998-10-18 15:55:12 +00:00
|
|
|
|
|
|
|
/* send the page fragment to user space */
|
2019-07-24 15:18:05 +00:00
|
|
|
error = copyout((caddr_t)sf_buf_kva(sf), (caddr_t)map_addr,
|
|
|
|
copy_len);
|
2013-08-05 08:55:35 +00:00
|
|
|
vm_imgact_unmap_page(sf);
|
2017-03-12 13:49:42 +00:00
|
|
|
if (error != 0)
|
1998-10-18 15:55:12 +00:00
|
|
|
return (error);
|
|
|
|
}
|
1996-03-10 08:42:54 +00:00
|
|
|
|
|
|
|
/*
|
2017-03-18 23:37:00 +00:00
|
|
|
* Remove write access to the page if it was only granted by map_insert
|
|
|
|
* to allow copyout.
|
1996-03-10 08:42:54 +00:00
|
|
|
*/
|
2017-03-18 23:37:00 +00:00
|
|
|
if ((prot & VM_PROT_WRITE) == 0)
|
|
|
|
vm_map_protect(map, trunc_page(map_addr), round_page(map_addr +
|
|
|
|
map_len), prot, FALSE);
|
1996-03-10 22:37:34 +00:00
|
|
|
|
2005-12-20 23:42:18 +00:00
|
|
|
return (0);
|
1996-03-10 08:42:54 +00:00
|
|
|
}
|
|
|
|
|
2019-04-09 15:24:38 +00:00
|
|
|
static int
|
|
|
|
__elfN(load_sections)(struct image_params *imgp, const Elf_Ehdr *hdr,
|
|
|
|
const Elf_Phdr *phdr, u_long rbase, u_long *base_addrp)
|
|
|
|
{
|
|
|
|
vm_prot_t prot;
|
|
|
|
u_long base_addr;
|
|
|
|
bool first;
|
|
|
|
int error, i;
|
|
|
|
|
2019-04-10 10:21:14 +00:00
|
|
|
ASSERT_VOP_LOCKED(imgp->vp, __func__);
|
|
|
|
|
2019-04-09 15:24:38 +00:00
|
|
|
base_addr = 0;
|
|
|
|
first = true;
|
|
|
|
|
|
|
|
for (i = 0; i < hdr->e_phnum; i++) {
|
|
|
|
if (phdr[i].p_type != PT_LOAD || phdr[i].p_memsz == 0)
|
|
|
|
continue;
|
|
|
|
|
|
|
|
/* Loadable segment */
|
|
|
|
prot = __elfN(trans_prot)(phdr[i].p_flags);
|
|
|
|
error = __elfN(load_section)(imgp, phdr[i].p_offset,
|
|
|
|
(caddr_t)(uintptr_t)phdr[i].p_vaddr + rbase,
|
|
|
|
phdr[i].p_memsz, phdr[i].p_filesz, prot);
|
|
|
|
if (error != 0)
|
|
|
|
return (error);
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Establish the base address if this is the first segment.
|
|
|
|
*/
|
|
|
|
if (first) {
|
|
|
|
base_addr = trunc_page(phdr[i].p_vaddr + rbase);
|
|
|
|
first = false;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
if (base_addrp != NULL)
|
|
|
|
*base_addrp = base_addr;
|
|
|
|
|
|
|
|
return (0);
|
|
|
|
}
|
|
|
|
|
1999-02-20 23:52:34 +00:00
|
|
|
/*
|
|
|
|
* Load the file "file" into memory. It may be either a shared object
|
|
|
|
* or an executable.
|
|
|
|
*
|
|
|
|
* The "addr" reference parameter is in/out. On entry, it specifies
|
|
|
|
* the address where a shared object should be loaded. If the file is
|
|
|
|
* an executable, this value is ignored. On exit, "addr" specifies
|
|
|
|
* where the file was actually loaded.
|
|
|
|
*
|
|
|
|
* The "entry" reference parameter is out only. On exit, it specifies
|
|
|
|
* the entry point for the loaded file.
|
|
|
|
*/
|
1996-03-10 08:42:54 +00:00
|
|
|
static int
|
2002-07-20 02:56:12 +00:00
|
|
|
__elfN(load_file)(struct proc *p, const char *file, u_long *addr,
|
2019-03-01 16:16:38 +00:00
|
|
|
u_long *entry)
|
1996-03-10 08:42:54 +00:00
|
|
|
{
|
2001-08-16 16:14:26 +00:00
|
|
|
struct {
|
|
|
|
struct nameidata nd;
|
|
|
|
struct vattr attr;
|
|
|
|
struct image_params image_params;
|
|
|
|
} *tempdata;
|
1999-01-27 21:50:00 +00:00
|
|
|
const Elf_Ehdr *hdr = NULL;
|
|
|
|
const Elf_Phdr *phdr = NULL;
|
2001-08-16 16:14:26 +00:00
|
|
|
struct nameidata *nd;
|
|
|
|
struct vattr *attr;
|
|
|
|
struct image_params *imgp;
|
Switch to use shared vnode locks for text files during image activation.
kern_execve() locks text vnode exclusive to be able to set and clear
VV_TEXT flag. VV_TEXT is mutually exclusive with the v_writecount > 0
condition.
The change removes VV_TEXT, replacing it with the condition
v_writecount <= -1, and puts v_writecount under the vnode interlock.
Each text reference decrements v_writecount. To clear the text
reference when the segment is unmapped, it is recorded in the
vm_map_entry backed by the text file as MAP_ENTRY_VN_TEXT flag, and
v_writecount is incremented on the map entry removal
The operations like VOP_ADD_WRITECOUNT() and VOP_SET_TEXT() check that
v_writecount does not contradict the desired change. vn_writecheck()
is now racy and its use was eliminated everywhere except access.
Atomic check for writeability and increment of v_writecount is
performed by the VOP. vn_truncate() now increments v_writecount
around VOP_SETATTR() call, lack of which is arguably a bug on its own.
nullfs bypasses v_writecount to the lower vnode always, so nullfs
vnode has its own v_writecount correct, and lower vnode gets all
references, since object->handle is always lower vnode.
On the text vnode' vm object dealloc, the v_writecount value is reset
to zero, and deadfs vop_unset_text short-circuit the operation.
Reclamation of lowervp always reclaims all nullfs vnodes referencing
lowervp first, so no stray references are left.
Reviewed by: markj, trasz
Tested by: mjg, pho
Sponsored by: The FreeBSD Foundation
MFC after: 1 month
Differential revision: https://reviews.freebsd.org/D19923
2019-05-05 11:20:43 +00:00
|
|
|
u_long rbase;
|
1999-02-20 23:52:34 +00:00
|
|
|
u_long base_addr = 0;
|
2019-04-09 15:24:38 +00:00
|
|
|
int error;
|
1996-03-10 08:42:54 +00:00
|
|
|
|
2011-06-30 10:56:02 +00:00
|
|
|
#ifdef CAPABILITY_MODE
|
|
|
|
/*
|
|
|
|
* XXXJA: This check can go away once we are sufficiently confident
|
|
|
|
* that the checks in namei() are correct.
|
|
|
|
*/
|
|
|
|
if (IN_CAPABILITY_MODE(curthread))
|
|
|
|
return (ECAPMODE);
|
|
|
|
#endif
|
|
|
|
|
2019-09-07 16:03:26 +00:00
|
|
|
tempdata = malloc(sizeof(*tempdata), M_TEMP, M_WAITOK | M_ZERO);
|
2001-08-16 16:14:26 +00:00
|
|
|
nd = &tempdata->nd;
|
|
|
|
attr = &tempdata->attr;
|
|
|
|
imgp = &tempdata->image_params;
|
|
|
|
|
1998-03-02 05:47:58 +00:00
|
|
|
/*
|
|
|
|
* Initialize part of the common data
|
|
|
|
*/
|
|
|
|
imgp->proc = p;
|
2001-08-16 16:14:26 +00:00
|
|
|
imgp->attr = attr;
|
1998-03-02 05:47:58 +00:00
|
|
|
|
2019-08-03 01:02:52 +00:00
|
|
|
NDINIT(nd, LOOKUP, ISOPEN | FOLLOW | LOCKSHARED | LOCKLEAF,
|
|
|
|
UIO_SYSSPACE, file, curthread);
|
2001-08-16 16:14:26 +00:00
|
|
|
if ((error = namei(nd)) != 0) {
|
|
|
|
nd->ni_vp = NULL;
|
1996-03-10 08:42:54 +00:00
|
|
|
goto fail;
|
|
|
|
}
|
2001-08-16 16:14:26 +00:00
|
|
|
NDFREE(nd, NDF_ONLY_PNBUF);
|
|
|
|
imgp->vp = nd->ni_vp;
|
1998-03-02 05:47:58 +00:00
|
|
|
|
1996-03-10 08:42:54 +00:00
|
|
|
/*
|
|
|
|
* Check permissions, modes, uid, etc on the file, and "open" it.
|
|
|
|
*/
|
1998-03-02 05:47:58 +00:00
|
|
|
error = exec_check_permissions(imgp);
|
2005-12-21 18:58:40 +00:00
|
|
|
if (error)
|
1998-03-02 05:47:58 +00:00
|
|
|
goto fail;
|
1996-03-10 08:42:54 +00:00
|
|
|
|
1998-03-02 05:47:58 +00:00
|
|
|
error = exec_map_first_page(imgp);
|
2005-12-21 18:58:40 +00:00
|
|
|
if (error)
|
|
|
|
goto fail;
|
|
|
|
|
2005-01-25 00:40:01 +00:00
|
|
|
imgp->object = nd->ni_vp->v_object;
|
1996-03-10 08:42:54 +00:00
|
|
|
|
1999-01-27 21:50:00 +00:00
|
|
|
hdr = (const Elf_Ehdr *)imgp->image_header;
|
2002-07-20 02:56:12 +00:00
|
|
|
if ((error = __elfN(check_header)(hdr)) != 0)
|
1996-03-10 08:42:54 +00:00
|
|
|
goto fail;
|
1999-02-20 23:52:34 +00:00
|
|
|
if (hdr->e_type == ET_DYN)
|
|
|
|
rbase = *addr;
|
|
|
|
else if (hdr->e_type == ET_EXEC)
|
|
|
|
rbase = 0;
|
|
|
|
else {
|
|
|
|
error = ENOEXEC;
|
|
|
|
goto fail;
|
|
|
|
}
|
1996-03-10 08:42:54 +00:00
|
|
|
|
2004-03-18 16:33:05 +00:00
|
|
|
/* Only support headers that fit within first page for now */
|
1998-10-18 15:55:12 +00:00
|
|
|
if ((hdr->e_phoff > PAGE_SIZE) ||
|
2013-03-13 22:01:31 +00:00
|
|
|
(u_int)hdr->e_phentsize * hdr->e_phnum > PAGE_SIZE - hdr->e_phoff) {
|
1998-03-02 05:47:58 +00:00
|
|
|
error = ENOEXEC;
|
|
|
|
goto fail;
|
|
|
|
}
|
|
|
|
|
1999-01-27 21:50:00 +00:00
|
|
|
phdr = (const Elf_Phdr *)(imgp->image_header + hdr->e_phoff);
|
2007-12-04 12:21:27 +00:00
|
|
|
if (!aligned(phdr, Elf_Addr)) {
|
|
|
|
error = ENOEXEC;
|
|
|
|
goto fail;
|
|
|
|
}
|
1996-03-10 08:42:54 +00:00
|
|
|
|
2019-04-09 15:24:38 +00:00
|
|
|
error = __elfN(load_sections)(imgp, hdr, phdr, rbase, &base_addr);
|
|
|
|
if (error != 0)
|
|
|
|
goto fail;
|
|
|
|
|
1999-02-20 23:52:34 +00:00
|
|
|
*addr = base_addr;
|
2002-08-24 22:55:16 +00:00
|
|
|
*entry = (unsigned long)hdr->e_entry + rbase;
|
1996-03-10 08:42:54 +00:00
|
|
|
|
|
|
|
fail:
|
1998-03-02 05:47:58 +00:00
|
|
|
if (imgp->firstpage)
|
|
|
|
exec_unmap_first_page(imgp);
|
2002-07-06 07:00:01 +00:00
|
|
|
|
Switch to use shared vnode locks for text files during image activation.
kern_execve() locks text vnode exclusive to be able to set and clear
VV_TEXT flag. VV_TEXT is mutually exclusive with the v_writecount > 0
condition.
The change removes VV_TEXT, replacing it with the condition
v_writecount <= -1, and puts v_writecount under the vnode interlock.
Each text reference decrements v_writecount. To clear the text
reference when the segment is unmapped, it is recorded in the
vm_map_entry backed by the text file as MAP_ENTRY_VN_TEXT flag, and
v_writecount is incremented on the map entry removal
The operations like VOP_ADD_WRITECOUNT() and VOP_SET_TEXT() check that
v_writecount does not contradict the desired change. vn_writecheck()
is now racy and its use was eliminated everywhere except access.
Atomic check for writeability and increment of v_writecount is
performed by the VOP. vn_truncate() now increments v_writecount
around VOP_SETATTR() call, lack of which is arguably a bug on its own.
nullfs bypasses v_writecount to the lower vnode always, so nullfs
vnode has its own v_writecount correct, and lower vnode gets all
references, since object->handle is always lower vnode.
On the text vnode' vm object dealloc, the v_writecount value is reset
to zero, and deadfs vop_unset_text short-circuit the operation.
Reclamation of lowervp always reclaims all nullfs vnodes referencing
lowervp first, so no stray references are left.
Reviewed by: markj, trasz
Tested by: mjg, pho
Sponsored by: The FreeBSD Foundation
MFC after: 1 month
Differential revision: https://reviews.freebsd.org/D19923
2019-05-05 11:20:43 +00:00
|
|
|
if (nd->ni_vp) {
|
|
|
|
if (imgp->textset)
|
|
|
|
VOP_UNSET_TEXT_CHECKED(nd->ni_vp);
|
2005-12-21 18:58:40 +00:00
|
|
|
vput(nd->ni_vp);
|
Switch to use shared vnode locks for text files during image activation.
kern_execve() locks text vnode exclusive to be able to set and clear
VV_TEXT flag. VV_TEXT is mutually exclusive with the v_writecount > 0
condition.
The change removes VV_TEXT, replacing it with the condition
v_writecount <= -1, and puts v_writecount under the vnode interlock.
Each text reference decrements v_writecount. To clear the text
reference when the segment is unmapped, it is recorded in the
vm_map_entry backed by the text file as MAP_ENTRY_VN_TEXT flag, and
v_writecount is incremented on the map entry removal
The operations like VOP_ADD_WRITECOUNT() and VOP_SET_TEXT() check that
v_writecount does not contradict the desired change. vn_writecheck()
is now racy and its use was eliminated everywhere except access.
Atomic check for writeability and increment of v_writecount is
performed by the VOP. vn_truncate() now increments v_writecount
around VOP_SETATTR() call, lack of which is arguably a bug on its own.
nullfs bypasses v_writecount to the lower vnode always, so nullfs
vnode has its own v_writecount correct, and lower vnode gets all
references, since object->handle is always lower vnode.
On the text vnode' vm object dealloc, the v_writecount value is reset
to zero, and deadfs vop_unset_text short-circuit the operation.
Reclamation of lowervp always reclaims all nullfs vnodes referencing
lowervp first, so no stray references are left.
Reviewed by: markj, trasz
Tested by: mjg, pho
Sponsored by: The FreeBSD Foundation
MFC after: 1 month
Differential revision: https://reviews.freebsd.org/D19923
2019-05-05 11:20:43 +00:00
|
|
|
}
|
2001-08-16 16:14:26 +00:00
|
|
|
free(tempdata, M_TEMP);
|
1996-03-10 08:42:54 +00:00
|
|
|
|
2002-08-24 22:01:40 +00:00
|
|
|
return (error);
|
1996-03-10 08:42:54 +00:00
|
|
|
}
|
|
|
|
|
Implement Address Space Layout Randomization (ASLR)
With this change, randomization can be enabled for all non-fixed
mappings. It means that the base address for the mapping is selected
with a guaranteed amount of entropy (bits). If the mapping was
requested to be superpage aligned, the randomization honours the
superpage attributes.
Although the value of ASLR is diminishing over time as exploit authors
work out simple ASLR bypass techniques, it eliminates the trivial
exploitation of certain vulnerabilities, at least in theory. This
implementation is relatively small and happens at the correct
architectural level. Also, it is not expected to introduce
regressions in existing cases when turned off (default for now), or
cause any significant maintenance burden.
The randomization is done on a best-effort basis - that is, the
allocator falls back to a first fit strategy if fragmentation prevents
entropy injection. It is trivial to implement a strong mode where
failure to guarantee the requested amount of entropy results in
mapping request failure, but I do not consider that to be usable.
I have not fine-tuned the amount of entropy injected right now. It is
only a quantitative change that will not change the implementation. The
current amount is controlled by aslr_pages_rnd.
To not spoil coalescing optimizations, to reduce the page table
fragmentation inherent to ASLR, and to keep the transient superpage
promotion for the malloced memory, locality clustering is implemented
for anonymous private mappings, which are automatically grouped until
fragmentation kicks in. The initial location for the anon group range
is, of course, randomized. This is controlled by vm.cluster_anon,
enabled by default.
The default mode keeps the sbrk area unpopulated by other mappings,
but this can be turned off, which gives much more breathing bits on
architectures with small address space, such as i386. This is tied
with the question of following an application's hint about the mmap(2)
base address. Testing shows that ignoring the hint does not affect the
function of common applications, but I would expect more demanding
code could break. By default sbrk is preserved and mmap hints are
satisfied, which can be changed by using the
kern.elf{32,64}.aslr.honor_sbrk sysctl.
ASLR is enabled on per-ABI basis, and currently it is only allowed on
FreeBSD native i386 and amd64 (including compat 32bit) ABIs. Support
for additional architectures will be added after further testing.
Both per-process and per-image controls are implemented:
- procctl(2) adds PROC_ASLR_CTL/PROC_ASLR_STATUS;
- NT_FREEBSD_FCTL_ASLR_DISABLE feature control note bit makes it possible
to force ASLR off for the given binary. (A tool to edit the feature
control note is in development.)
Global controls are:
- kern.elf{32,64}.aslr.enable - for non-fixed mappings done by mmap(2);
- kern.elf{32,64}.aslr.pie_enable - for PIE image activation mappings;
- kern.elf{32,64}.aslr.honor_sbrk - allow to use sbrk area for mmap(2);
- vm.cluster_anon - enables anon mapping clustering.
PR: 208580 (exp runs)
Exp-runs done by: antoine
Reviewed by: markj (previous version)
Discussed with: emaste
Tested by: pho
MFC after: 1 month
Sponsored by: The FreeBSD Foundation
Differential revision: https://reviews.freebsd.org/D5603
2019-02-10 17:19:45 +00:00
|
|
|
static u_long
|
|
|
|
__CONCAT(rnd_, __elfN(base))(vm_map_t map __unused, u_long minv, u_long maxv,
|
|
|
|
u_int align)
|
|
|
|
{
|
|
|
|
u_long rbase, res;
|
|
|
|
|
|
|
|
MPASS(vm_map_min(map) <= minv);
|
|
|
|
MPASS(maxv <= vm_map_max(map));
|
|
|
|
MPASS(minv < maxv);
|
|
|
|
MPASS(minv + align < maxv);
|
|
|
|
arc4rand(&rbase, sizeof(rbase), 0);
|
|
|
|
res = roundup(minv, (u_long)align) + rbase % (maxv - minv);
|
|
|
|
res &= ~((u_long)align - 1);
|
|
|
|
if (res >= maxv)
|
|
|
|
res -= align;
|
|
|
|
KASSERT(res >= minv,
|
|
|
|
("res %#lx < minv %#lx, maxv %#lx rbase %#lx",
|
|
|
|
res, minv, maxv, rbase));
|
|
|
|
KASSERT(res < maxv,
|
|
|
|
("res %#lx > maxv %#lx, minv %#lx rbase %#lx",
|
|
|
|
res, maxv, minv, rbase));
|
|
|
|
return (res);
|
|
|
|
}
|
|
|
|
|
2019-03-26 15:35:49 +00:00
|
|
|
static int
|
|
|
|
__elfN(enforce_limits)(struct image_params *imgp, const Elf_Ehdr *hdr,
|
|
|
|
const Elf_Phdr *phdr, u_long et_dyn_addr)
|
|
|
|
{
|
|
|
|
struct vmspace *vmspace;
|
|
|
|
const char *err_str;
|
|
|
|
u_long text_size, data_size, total_size, text_addr, data_addr;
|
|
|
|
u_long seg_size, seg_addr;
|
|
|
|
int i;
|
|
|
|
|
|
|
|
err_str = NULL;
|
|
|
|
text_size = data_size = total_size = text_addr = data_addr = 0;
|
|
|
|
|
|
|
|
for (i = 0; i < hdr->e_phnum; i++) {
|
|
|
|
if (phdr[i].p_type != PT_LOAD || phdr[i].p_memsz == 0)
|
|
|
|
continue;
|
|
|
|
|
|
|
|
seg_addr = trunc_page(phdr[i].p_vaddr + et_dyn_addr);
|
|
|
|
seg_size = round_page(phdr[i].p_memsz +
|
|
|
|
phdr[i].p_vaddr + et_dyn_addr - seg_addr);
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Make the largest executable segment the official
|
|
|
|
* text segment and all others data.
|
|
|
|
*
|
|
|
|
* Note that obreak() assumes that data_addr + data_size == end
|
|
|
|
* of data load area, and the ELF file format expects segments
|
|
|
|
* to be sorted by address. If multiple data segments exist,
|
|
|
|
* the last one will be used.
|
|
|
|
*/
|
|
|
|
|
|
|
|
if ((phdr[i].p_flags & PF_X) != 0 && text_size < seg_size) {
|
|
|
|
text_size = seg_size;
|
|
|
|
text_addr = seg_addr;
|
|
|
|
} else {
|
|
|
|
data_size = seg_size;
|
|
|
|
data_addr = seg_addr;
|
|
|
|
}
|
|
|
|
total_size += seg_size;
|
|
|
|
}
|
|
|
|
|
|
|
|
if (data_addr == 0 && data_size == 0) {
|
|
|
|
data_addr = text_addr;
|
|
|
|
data_size = text_size;
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Check limits. It should be safe to check the
|
|
|
|
* limits after loading the segments since we do
|
|
|
|
* not actually fault in all the segments pages.
|
|
|
|
*/
|
|
|
|
PROC_LOCK(imgp->proc);
|
|
|
|
if (data_size > lim_cur_proc(imgp->proc, RLIMIT_DATA))
|
|
|
|
err_str = "Data segment size exceeds process limit";
|
|
|
|
else if (text_size > maxtsiz)
|
|
|
|
err_str = "Text segment size exceeds system limit";
|
|
|
|
else if (total_size > lim_cur_proc(imgp->proc, RLIMIT_VMEM))
|
|
|
|
err_str = "Total segment size exceeds process limit";
|
|
|
|
else if (racct_set(imgp->proc, RACCT_DATA, data_size) != 0)
|
|
|
|
err_str = "Data segment size exceeds resource limit";
|
|
|
|
else if (racct_set(imgp->proc, RACCT_VMEM, total_size) != 0)
|
|
|
|
err_str = "Total segment size exceeds resource limit";
|
|
|
|
PROC_UNLOCK(imgp->proc);
|
|
|
|
if (err_str != NULL) {
|
|
|
|
uprintf("%s\n", err_str);
|
|
|
|
return (ENOMEM);
|
|
|
|
}
|
|
|
|
|
|
|
|
vmspace = imgp->proc->p_vmspace;
|
|
|
|
vmspace->vm_tsize = text_size >> PAGE_SHIFT;
|
|
|
|
vmspace->vm_taddr = (caddr_t)(uintptr_t)text_addr;
|
|
|
|
vmspace->vm_dsize = data_size >> PAGE_SHIFT;
|
|
|
|
vmspace->vm_daddr = (caddr_t)(uintptr_t)data_addr;
|
|
|
|
|
|
|
|
return (0);
|
|
|
|
}
|
|
|
|
|
2019-03-28 21:43:01 +00:00
|
|
|
static int
|
|
|
|
__elfN(get_interp)(struct image_params *imgp, const Elf_Phdr *phdr,
|
|
|
|
char **interpp, bool *free_interpp)
|
|
|
|
{
|
|
|
|
struct thread *td;
|
|
|
|
char *interp;
|
|
|
|
int error, interp_name_len;
|
|
|
|
|
|
|
|
KASSERT(phdr->p_type == PT_INTERP,
|
|
|
|
("%s: p_type %u != PT_INTERP", __func__, phdr->p_type));
|
2019-04-10 10:21:14 +00:00
|
|
|
ASSERT_VOP_LOCKED(imgp->vp, __func__);
|
2019-03-28 21:43:01 +00:00
|
|
|
|
|
|
|
td = curthread;
|
|
|
|
|
|
|
|
/* Path to interpreter */
|
|
|
|
if (phdr->p_filesz < 2 || phdr->p_filesz > MAXPATHLEN) {
|
|
|
|
uprintf("Invalid PT_INTERP\n");
|
|
|
|
return (ENOEXEC);
|
|
|
|
}
|
|
|
|
|
|
|
|
interp_name_len = phdr->p_filesz;
|
|
|
|
if (phdr->p_offset > PAGE_SIZE ||
|
|
|
|
interp_name_len > PAGE_SIZE - phdr->p_offset) {
|
2019-05-16 13:03:54 +00:00
|
|
|
/*
|
2019-05-17 21:18:11 +00:00
|
|
|
* The vnode lock might be needed by the pagedaemon to
|
2019-05-16 13:03:54 +00:00
|
|
|
* clean pages owned by the vnode. Do not allow sleep
|
|
|
|
* waiting for memory with the vnode locked, instead
|
|
|
|
* try non-sleepable allocation first, and if it
|
|
|
|
* fails, go to the slow path were we drop the lock
|
2019-05-17 21:18:11 +00:00
|
|
|
* and do M_WAITOK. A text reference prevents
|
|
|
|
* modifications to the vnode content.
|
2019-05-16 13:03:54 +00:00
|
|
|
*/
|
2019-05-05 11:04:01 +00:00
|
|
|
interp = malloc(interp_name_len + 1, M_TEMP, M_NOWAIT);
|
|
|
|
if (interp == NULL) {
|
|
|
|
VOP_UNLOCK(imgp->vp, 0);
|
|
|
|
interp = malloc(interp_name_len + 1, M_TEMP, M_WAITOK);
|
Switch to use shared vnode locks for text files during image activation.
kern_execve() locks text vnode exclusive to be able to set and clear
VV_TEXT flag. VV_TEXT is mutually exclusive with the v_writecount > 0
condition.
The change removes VV_TEXT, replacing it with the condition
v_writecount <= -1, and puts v_writecount under the vnode interlock.
Each text reference decrements v_writecount. To clear the text
reference when the segment is unmapped, it is recorded in the
vm_map_entry backed by the text file as MAP_ENTRY_VN_TEXT flag, and
v_writecount is incremented on the map entry removal
The operations like VOP_ADD_WRITECOUNT() and VOP_SET_TEXT() check that
v_writecount does not contradict the desired change. vn_writecheck()
is now racy and its use was eliminated everywhere except access.
Atomic check for writeability and increment of v_writecount is
performed by the VOP. vn_truncate() now increments v_writecount
around VOP_SETATTR() call, lack of which is arguably a bug on its own.
nullfs bypasses v_writecount to the lower vnode always, so nullfs
vnode has its own v_writecount correct, and lower vnode gets all
references, since object->handle is always lower vnode.
On the text vnode' vm object dealloc, the v_writecount value is reset
to zero, and deadfs vop_unset_text short-circuit the operation.
Reclamation of lowervp always reclaims all nullfs vnodes referencing
lowervp first, so no stray references are left.
Reviewed by: markj, trasz
Tested by: mjg, pho
Sponsored by: The FreeBSD Foundation
MFC after: 1 month
Differential revision: https://reviews.freebsd.org/D19923
2019-05-05 11:20:43 +00:00
|
|
|
vn_lock(imgp->vp, LK_SHARED | LK_RETRY);
|
2019-05-05 11:04:01 +00:00
|
|
|
}
|
2019-05-16 13:03:54 +00:00
|
|
|
|
2019-03-28 21:43:01 +00:00
|
|
|
error = vn_rdwr(UIO_READ, imgp->vp, interp,
|
|
|
|
interp_name_len, phdr->p_offset,
|
|
|
|
UIO_SYSSPACE, IO_NODELOCKED, td->td_ucred,
|
|
|
|
NOCRED, NULL, td);
|
|
|
|
if (error != 0) {
|
|
|
|
free(interp, M_TEMP);
|
|
|
|
uprintf("i/o error PT_INTERP %d\n", error);
|
|
|
|
return (error);
|
|
|
|
}
|
|
|
|
interp[interp_name_len] = '\0';
|
|
|
|
|
|
|
|
*interpp = interp;
|
|
|
|
*free_interpp = true;
|
|
|
|
return (0);
|
|
|
|
}
|
|
|
|
|
|
|
|
interp = __DECONST(char *, imgp->image_header) + phdr->p_offset;
|
|
|
|
if (interp[interp_name_len - 1] != '\0') {
|
|
|
|
uprintf("Invalid PT_INTERP\n");
|
|
|
|
return (ENOEXEC);
|
|
|
|
}
|
|
|
|
|
|
|
|
*interpp = interp;
|
|
|
|
*free_interpp = false;
|
|
|
|
return (0);
|
|
|
|
}
|
|
|
|
|
2019-04-08 14:31:07 +00:00
|
|
|
static int
|
|
|
|
__elfN(load_interp)(struct image_params *imgp, const Elf_Brandinfo *brand_info,
|
|
|
|
const char *interp, u_long *addr, u_long *entry)
|
|
|
|
{
|
|
|
|
char *path;
|
|
|
|
int error;
|
|
|
|
|
|
|
|
if (brand_info->emul_path != NULL &&
|
|
|
|
brand_info->emul_path[0] != '\0') {
|
|
|
|
path = malloc(MAXPATHLEN, M_TEMP, M_WAITOK);
|
|
|
|
snprintf(path, MAXPATHLEN, "%s%s",
|
|
|
|
brand_info->emul_path, interp);
|
|
|
|
error = __elfN(load_file)(imgp->proc, path, addr, entry);
|
|
|
|
free(path, M_TEMP);
|
|
|
|
if (error == 0)
|
|
|
|
return (0);
|
|
|
|
}
|
|
|
|
|
|
|
|
if (brand_info->interp_newpath != NULL &&
|
|
|
|
(brand_info->interp_path == NULL ||
|
|
|
|
strcmp(interp, brand_info->interp_path) == 0)) {
|
|
|
|
error = __elfN(load_file)(imgp->proc,
|
|
|
|
brand_info->interp_newpath, addr, entry);
|
|
|
|
if (error == 0)
|
|
|
|
return (0);
|
|
|
|
}
|
|
|
|
|
|
|
|
error = __elfN(load_file)(imgp->proc, interp, addr, entry);
|
|
|
|
if (error == 0)
|
|
|
|
return (0);
|
|
|
|
|
|
|
|
uprintf("ELF interpreter %s not found, error %d\n", interp, error);
|
|
|
|
return (error);
|
|
|
|
}
|
|
|
|
|
Implement Address Space Layout Randomization (ASLR)
With this change, randomization can be enabled for all non-fixed
mappings. It means that the base address for the mapping is selected
with a guaranteed amount of entropy (bits). If the mapping was
requested to be superpage aligned, the randomization honours the
superpage attributes.
Although the value of ASLR is diminishing over time as exploit authors
work out simple ASLR bypass techniques, it eliminates the trivial
exploitation of certain vulnerabilities, at least in theory. This
implementation is relatively small and happens at the correct
architectural level. Also, it is not expected to introduce
regressions in existing cases when turned off (default for now), or
cause any significant maintenance burden.
The randomization is done on a best-effort basis - that is, the
allocator falls back to a first fit strategy if fragmentation prevents
entropy injection. It is trivial to implement a strong mode where
failure to guarantee the requested amount of entropy results in
mapping request failure, but I do not consider that to be usable.
I have not fine-tuned the amount of entropy injected right now. It is
only a quantitative change that will not change the implementation. The
current amount is controlled by aslr_pages_rnd.
To not spoil coalescing optimizations, to reduce the page table
fragmentation inherent to ASLR, and to keep the transient superpage
promotion for the malloced memory, locality clustering is implemented
for anonymous private mappings, which are automatically grouped until
fragmentation kicks in. The initial location for the anon group range
is, of course, randomized. This is controlled by vm.cluster_anon,
enabled by default.
The default mode keeps the sbrk area unpopulated by other mappings,
but this can be turned off, which gives much more breathing bits on
architectures with small address space, such as i386. This is tied
with the question of following an application's hint about the mmap(2)
base address. Testing shows that ignoring the hint does not affect the
function of common applications, but I would expect more demanding
code could break. By default sbrk is preserved and mmap hints are
satisfied, which can be changed by using the
kern.elf{32,64}.aslr.honor_sbrk sysctl.
ASLR is enabled on per-ABI basis, and currently it is only allowed on
FreeBSD native i386 and amd64 (including compat 32bit) ABIs. Support
for additional architectures will be added after further testing.
Both per-process and per-image controls are implemented:
- procctl(2) adds PROC_ASLR_CTL/PROC_ASLR_STATUS;
- NT_FREEBSD_FCTL_ASLR_DISABLE feature control note bit makes it possible
to force ASLR off for the given binary. (A tool to edit the feature
control note is in development.)
Global controls are:
- kern.elf{32,64}.aslr.enable - for non-fixed mappings done by mmap(2);
- kern.elf{32,64}.aslr.pie_enable - for PIE image activation mappings;
- kern.elf{32,64}.aslr.honor_sbrk - allow to use sbrk area for mmap(2);
- vm.cluster_anon - enables anon mapping clustering.
PR: 208580 (exp runs)
Exp-runs done by: antoine
Reviewed by: markj (previous version)
Discussed with: emaste
Tested by: pho
MFC after: 1 month
Sponsored by: The FreeBSD Foundation
Differential revision: https://reviews.freebsd.org/D5603
2019-02-10 17:19:45 +00:00
|
|
|
/*
 * Impossible et_dyn_addr initial value indicating that the real base
 * must be calculated later with some randomization applied.
 *
 * NOTE(review): presumably "impossible" because a genuine load base
 * would never be the odd value 1; confirm against the code that
 * compares et_dyn_addr with this sentinel.
 */
|
|
|
|
#define ET_DYN_ADDR_RAND 1
|
|
|
|
|
1998-02-09 06:11:36 +00:00
|
|
|
static int
|
2002-07-20 02:56:12 +00:00
|
|
|
__CONCAT(exec_, __elfN(imgact))(struct image_params *imgp)
|
1996-03-10 08:42:54 +00:00
|
|
|
{
|
2015-10-14 18:27:35 +00:00
|
|
|
struct thread *td;
|
|
|
|
const Elf_Ehdr *hdr;
|
2009-03-13 16:40:51 +00:00
|
|
|
const Elf_Phdr *phdr;
|
2006-01-21 20:11:49 +00:00
|
|
|
Elf_Auxargs *elf_auxargs;
|
1997-04-13 01:48:35 +00:00
|
|
|
struct vmspace *vmspace;
|
Implement Address Space Layout Randomization (ASLR)
With this change, randomization can be enabled for all non-fixed
mappings. It means that the base address for the mapping is selected
with a guaranteed amount of entropy (bits). If the mapping was
requested to be superpage aligned, the randomization honours the
superpage attributes.
Although the value of ASLR is diminishing over time as exploit authors
work out simple ASLR bypass techniques, it eliminates the trivial
exploitation of certain vulnerabilities, at least in theory. This
implementation is relatively small and happens at the correct
architectural level. Also, it is not expected to introduce
regressions in existing cases when turned off (default for now), or
cause any significant maintenance burden.
The randomization is done on a best-effort basis - that is, the
allocator falls back to a first fit strategy if fragmentation prevents
entropy injection. It is trivial to implement a strong mode where
failure to guarantee the requested amount of entropy results in
mapping request failure, but I do not consider that to be usable.
I have not fine-tuned the amount of entropy injected right now. It is
only a quantitative change that will not change the implementation. The
current amount is controlled by aslr_pages_rnd.
To not spoil coalescing optimizations, to reduce the page table
fragmentation inherent to ASLR, and to keep the transient superpage
promotion for the malloced memory, locality clustering is implemented
for anonymous private mappings, which are automatically grouped until
fragmentation kicks in. The initial location for the anon group range
is, of course, randomized. This is controlled by vm.cluster_anon,
enabled by default.
The default mode keeps the sbrk area unpopulated by other mappings,
but this can be turned off, which gives much more breathing bits on
architectures with small address space, such as i386. This is tied
with the question of following an application's hint about the mmap(2)
base address. Testing shows that ignoring the hint does not affect the
function of common applications, but I would expect more demanding
code could break. By default sbrk is preserved and mmap hints are
satisfied, which can be changed by using the
kern.elf{32,64}.aslr.honor_sbrk sysctl.
ASLR is enabled on per-ABI basis, and currently it is only allowed on
FreeBSD native i386 and amd64 (including compat 32bit) ABIs. Support
for additional architectures will be added after further testing.
Both per-process and per-image controls are implemented:
- procctl(2) adds PROC_ASLR_CTL/PROC_ASLR_STATUS;
- NT_FREEBSD_FCTL_ASLR_DISABLE feature control note bit makes it possible
to force ASLR off for the given binary. (A tool to edit the feature
control note is in development.)
Global controls are:
- kern.elf{32,64}.aslr.enable - for non-fixed mappings done by mmap(2);
- kern.elf{32,64}.aslr.pie_enable - for PIE image activation mappings;
- kern.elf{32,64}.aslr.honor_sbrk - allow to use sbrk area for mmap(2);
- vm.cluster_anon - enables anon mapping clustering.
PR: 208580 (exp runs)
Exp-runs done by: antoine
Reviewed by: markj (previous version)
Discussed with: emaste
Tested by: pho
MFC after: 1 month
Sponsored by: The FreeBSD Foundation
Differential revision: https://reviews.freebsd.org/D5603
2019-02-10 17:19:45 +00:00
|
|
|
vm_map_t map;
|
2019-04-08 14:31:07 +00:00
|
|
|
char *interp;
|
1998-10-11 19:22:07 +00:00
|
|
|
Elf_Brandinfo *brand_info;
|
2002-09-02 04:50:57 +00:00
|
|
|
struct sysentvec *sv;
|
2019-03-26 15:35:49 +00:00
|
|
|
u_long addr, baddr, et_dyn_addr, entry, proghdr;
|
Implement Address Space Layout Randomization (ASLR)
With this change, randomization can be enabled for all non-fixed
mappings. It means that the base address for the mapping is selected
with a guaranteed amount of entropy (bits). If the mapping was
requested to be superpage aligned, the randomization honours the
superpage attributes.
Although the value of ASLR is diminishing over time as exploit authors
work out simple ASLR bypass techniques, it eliminates the trivial
exploitation of certain vulnerabilities, at least in theory. This
implementation is relatively small and happens at the correct
architectural level. Also, it is not expected to introduce
regressions in existing cases when turned off (default for now), or
cause any significant maintenance burden.
The randomization is done on a best-effort basis - that is, the
allocator falls back to a first fit strategy if fragmentation prevents
entropy injection. It is trivial to implement a strong mode where
failure to guarantee the requested amount of entropy results in
mapping request failure, but I do not consider that to be usable.
I have not fine-tuned the amount of entropy injected right now. It is
only a quantitative change that will not change the implementation. The
current amount is controlled by aslr_pages_rnd.
To not spoil coalescing optimizations, to reduce the page table
fragmentation inherent to ASLR, and to keep the transient superpage
promotion for the malloced memory, locality clustering is implemented
for anonymous private mappings, which are automatically grouped until
fragmentation kicks in. The initial location for the anon group range
is, of course, randomized. This is controlled by vm.cluster_anon,
enabled by default.
The default mode keeps the sbrk area unpopulated by other mappings,
but this can be turned off, which gives much more breathing bits on
architectures with small address space, such as i386. This is tied
with the question of following an application's hint about the mmap(2)
base address. Testing shows that ignoring the hint does not affect the
function of common applications, but I would expect more demanding
code could break. By default sbrk is preserved and mmap hints are
satisfied, which can be changed by using the
kern.elf{32,64}.aslr.honor_sbrk sysctl.
ASLR is enabled on per-ABI basis, and currently it is only allowed on
FreeBSD native i386 and amd64 (including compat 32bit) ABIs. Support
for additional architectures will be added after further testing.
Both per-process and per-image controls are implemented:
- procctl(2) adds PROC_ASLR_CTL/PROC_ASLR_STATUS;
- NT_FREEBSD_FCTL_ASLR_DISABLE feature control note bit makes it possible
to force ASLR off for the given binary. (A tool to edit the feature
control note is in development.)
Global controls are:
- kern.elf{32,64}.aslr.enable - for non-fixed mappings done by mmap(2);
- kern.elf{32,64}.aslr.pie_enable - for PIE image activation mappings;
- kern.elf{32,64}.aslr.honor_sbrk - allow to use sbrk area for mmap(2);
- vm.cluster_anon - enables anon mapping clustering.
PR: 208580 (exp runs)
Exp-runs done by: antoine
Reviewed by: markj (previous version)
Discussed with: emaste
Tested by: pho
MFC after: 1 month
Sponsored by: The FreeBSD Foundation
Differential revision: https://reviews.freebsd.org/D5603
2019-02-10 17:19:45 +00:00
|
|
|
u_long maxalign, mapsz, maxv, maxv1;
|
2018-11-23 23:33:55 +00:00
|
|
|
uint32_t fctl0;
|
2015-10-14 18:27:35 +00:00
|
|
|
int32_t osrel;
|
2019-03-28 21:43:01 +00:00
|
|
|
bool free_interp;
|
2019-04-08 14:31:07 +00:00
|
|
|
int error, i, n;
|
2015-10-14 18:27:35 +00:00
|
|
|
|
|
|
|
hdr = (const Elf_Ehdr *)imgp->image_header;
|
1996-03-10 08:42:54 +00:00
|
|
|
|
|
|
|
/*
|
|
|
|
* Do we have a valid ELF header ?
|
2005-12-26 21:23:57 +00:00
|
|
|
*
|
|
|
|
* Only allow ET_EXEC & ET_DYN here, reject ET_DYN later
|
|
|
|
* if particular brand doesn't support it.
|
1996-03-10 08:42:54 +00:00
|
|
|
*/
|
2005-12-26 21:23:57 +00:00
|
|
|
if (__elfN(check_header)(hdr) != 0 ||
|
|
|
|
(hdr->e_type != ET_EXEC && hdr->e_type != ET_DYN))
|
2002-08-24 22:01:40 +00:00
|
|
|
return (-1);
|
1996-03-10 08:42:54 +00:00
|
|
|
|
|
|
|
/*
|
|
|
|
* From here on down, we return an errno, not -1, as we've
|
|
|
|
* detected an ELF file.
|
|
|
|
*/
|
|
|
|
|
|
|
|
if ((hdr->e_phoff > PAGE_SIZE) ||
|
2013-03-13 22:01:31 +00:00
|
|
|
(u_int)hdr->e_phentsize * hdr->e_phnum > PAGE_SIZE - hdr->e_phoff) {
|
1998-03-02 05:47:58 +00:00
|
|
|
/* Only support headers in first page for now */
|
2015-06-08 16:07:07 +00:00
|
|
|
uprintf("Program headers not in the first page\n");
|
2002-08-24 22:01:40 +00:00
|
|
|
return (ENOEXEC);
|
1996-03-10 08:42:54 +00:00
|
|
|
}
|
2015-06-08 16:07:07 +00:00
|
|
|
phdr = (const Elf_Phdr *)(imgp->image_header + hdr->e_phoff);
|
|
|
|
if (!aligned(phdr, Elf_Addr)) {
|
|
|
|
uprintf("Unaligned program headers\n");
|
2007-12-04 12:21:27 +00:00
|
|
|
return (ENOEXEC);
|
2015-06-08 16:07:07 +00:00
|
|
|
}
|
2015-10-14 18:27:35 +00:00
|
|
|
|
|
|
|
n = error = 0;
|
2009-10-18 12:57:48 +00:00
|
|
|
baddr = 0;
|
2015-10-14 18:27:35 +00:00
|
|
|
osrel = 0;
|
2018-11-23 23:33:55 +00:00
|
|
|
fctl0 = 0;
|
2015-10-14 18:27:35 +00:00
|
|
|
entry = proghdr = 0;
|
2019-04-08 14:31:07 +00:00
|
|
|
interp = NULL;
|
2019-03-28 21:43:01 +00:00
|
|
|
free_interp = false;
|
2015-10-14 18:27:35 +00:00
|
|
|
td = curthread;
|
Implement Address Space Layout Randomization (ASLR)
With this change, randomization can be enabled for all non-fixed
mappings. It means that the base address for the mapping is selected
with a guaranteed amount of entropy (bits). If the mapping was
requested to be superpage aligned, the randomization honours the
superpage attributes.
Although the value of ASLR is diminishing over time as exploit authors
work out simple ASLR bypass techniques, it eliminates the trivial
exploitation of certain vulnerabilities, at least in theory. This
implementation is relatively small and happens at the correct
architectural level. Also, it is not expected to introduce
regressions in existing cases when turned off (default for now), or
cause any significant maintenance burden.
The randomization is done on a best-effort basis - that is, the
allocator falls back to a first fit strategy if fragmentation prevents
entropy injection. It is trivial to implement a strong mode where
failure to guarantee the requested amount of entropy results in
mapping request failure, but I do not consider that to be usable.
I have not fine-tuned the amount of entropy injected right now. It is
only a quantitative change that will not change the implementation. The
current amount is controlled by aslr_pages_rnd.
To not spoil coalescing optimizations, to reduce the page table
fragmentation inherent to ASLR, and to keep the transient superpage
promotion for the malloced memory, locality clustering is implemented
for anonymous private mappings, which are automatically grouped until
fragmentation kicks in. The initial location for the anon group range
is, of course, randomized. This is controlled by vm.cluster_anon,
enabled by default.
The default mode keeps the sbrk area unpopulated by other mappings,
but this can be turned off, which gives much more breathing bits on
architectures with small address space, such as i386. This is tied
with the question of following an application's hint about the mmap(2)
base address. Testing shows that ignoring the hint does not affect the
function of common applications, but I would expect more demanding
code could break. By default sbrk is preserved and mmap hints are
satisfied, which can be changed by using the
kern.elf{32,64}.aslr.honor_sbrk sysctl.
ASLR is enabled on per-ABI basis, and currently it is only allowed on
FreeBSD native i386 and amd64 (including compat 32bit) ABIs. Support
for additional architectures will be added after further testing.
Both per-process and per-image controls are implemented:
- procctl(2) adds PROC_ASLR_CTL/PROC_ASLR_STATUS;
- NT_FREEBSD_FCTL_ASLR_DISABLE feature control note bit makes it possible
to force ASLR off for the given binary. (A tool to edit the feature
control note is in development.)
Global controls are:
- kern.elf{32,64}.aslr.enable - for non-fixed mappings done by mmap(2);
- kern.elf{32,64}.aslr.pie_enable - for PIE image activation mappings;
- kern.elf{32,64}.aslr.honor_sbrk - allow to use sbrk area for mmap(2);
- vm.cluster_anon - enables anon mapping clustering.
PR: 208580 (exp runs)
Exp-runs done by: antoine
Reviewed by: markj (previous version)
Discussed with: emaste
Tested by: pho
MFC after: 1 month
Sponsored by: The FreeBSD Foundation
Differential revision: https://reviews.freebsd.org/D5603
2019-02-10 17:19:45 +00:00
|
|
|
maxalign = PAGE_SIZE;
|
|
|
|
mapsz = 0;
|
2015-10-14 18:27:35 +00:00
|
|
|
|
2002-09-02 04:50:57 +00:00
|
|
|
for (i = 0; i < hdr->e_phnum; i++) {
|
2011-01-08 16:30:59 +00:00
|
|
|
switch (phdr[i].p_type) {
|
|
|
|
case PT_LOAD:
|
2009-10-18 12:57:48 +00:00
|
|
|
if (n == 0)
|
|
|
|
baddr = phdr[i].p_vaddr;
|
Implement Address Space Layout Randomization (ASLR)
With this change, randomization can be enabled for all non-fixed
mappings. It means that the base address for the mapping is selected
with a guaranteed amount of entropy (bits). If the mapping was
requested to be superpage aligned, the randomization honours the
superpage attributes.
Although the value of ASLR is diminishing over time as exploit authors
work out simple ASLR bypass techniques, it eliminates the trivial
exploitation of certain vulnerabilities, at least in theory. This
implementation is relatively small and happens at the correct
architectural level. Also, it is not expected to introduce
regressions in existing cases when turned off (default for now), or
cause any significant maintenance burden.
The randomization is done on a best-effort basis - that is, the
allocator falls back to a first fit strategy if fragmentation prevents
entropy injection. It is trivial to implement a strong mode where
failure to guarantee the requested amount of entropy results in
mapping request failure, but I do not consider that to be usable.
I have not fine-tuned the amount of entropy injected right now. It is
only a quantitative change that will not change the implementation. The
current amount is controlled by aslr_pages_rnd.
To not spoil coalescing optimizations, to reduce the page table
fragmentation inherent to ASLR, and to keep the transient superpage
promotion for the malloced memory, locality clustering is implemented
for anonymous private mappings, which are automatically grouped until
fragmentation kicks in. The initial location for the anon group range
is, of course, randomized. This is controlled by vm.cluster_anon,
enabled by default.
The default mode keeps the sbrk area unpopulated by other mappings,
but this can be turned off, which gives much more breathing bits on
architectures with small address space, such as i386. This is tied
with the question of following an application's hint about the mmap(2)
base address. Testing shows that ignoring the hint does not affect the
function of common applications, but I would expect more demanding
code could break. By default sbrk is preserved and mmap hints are
satisfied, which can be changed by using the
kern.elf{32,64}.aslr.honor_sbrk sysctl.
ASLR is enabled on per-ABI basis, and currently it is only allowed on
FreeBSD native i386 and amd64 (including compat 32bit) ABIs. Support
for additional architectures will be added after further testing.
Both per-process and per-image controls are implemented:
- procctl(2) adds PROC_ASLR_CTL/PROC_ASLR_STATUS;
- NT_FREEBSD_FCTL_ASLR_DISABLE feature control note bit makes it possible
to force ASLR off for the given binary. (A tool to edit the feature
control note is in development.)
Global controls are:
- kern.elf{32,64}.aslr.enable - for non-fixed mappings done by mmap(2);
- kern.elf{32,64}.aslr.pie_enable - for PIE image activation mappings;
- kern.elf{32,64}.aslr.honor_sbrk - allow to use sbrk area for mmap(2);
- vm.cluster_anon - enables anon mapping clustering.
PR: 208580 (exp runs)
Exp-runs done by: antoine
Reviewed by: markj (previous version)
Discussed with: emaste
Tested by: pho
MFC after: 1 month
Sponsored by: The FreeBSD Foundation
Differential revision: https://reviews.freebsd.org/D5603
2019-02-10 17:19:45 +00:00
|
|
|
if (phdr[i].p_align > maxalign)
|
|
|
|
maxalign = phdr[i].p_align;
|
|
|
|
mapsz += phdr[i].p_memsz;
|
2009-10-18 12:57:48 +00:00
|
|
|
n++;
|
2019-04-09 15:24:38 +00:00
|
|
|
|
|
|
|
/*
|
|
|
|
* If this segment contains the program headers,
|
|
|
|
* remember their virtual address for the AT_PHDR
|
|
|
|
* aux entry. Static binaries don't usually include
|
|
|
|
* a PT_PHDR entry.
|
|
|
|
*/
|
|
|
|
if (phdr[i].p_offset == 0 &&
|
|
|
|
hdr->e_phoff + hdr->e_phnum * hdr->e_phentsize
|
|
|
|
<= phdr[i].p_filesz)
|
|
|
|
proghdr = phdr[i].p_vaddr + hdr->e_phoff;
|
2011-01-08 16:30:59 +00:00
|
|
|
break;
|
|
|
|
case PT_INTERP:
|
2006-01-21 20:11:49 +00:00
|
|
|
/* Path to interpreter */
|
2015-12-24 00:58:11 +00:00
|
|
|
if (interp != NULL) {
|
|
|
|
uprintf("Multiple PT_INTERP headers\n");
|
|
|
|
error = ENOEXEC;
|
|
|
|
goto ret;
|
|
|
|
}
|
2019-03-28 21:43:01 +00:00
|
|
|
error = __elfN(get_interp)(imgp, &phdr[i], &interp,
|
|
|
|
&free_interp);
|
|
|
|
if (error != 0)
|
|
|
|
goto ret;
|
2011-01-08 16:30:59 +00:00
|
|
|
break;
|
|
|
|
case PT_GNU_STACK:
|
|
|
|
if (__elfN(nxstack))
|
|
|
|
imgp->stack_prot =
|
|
|
|
__elfN(trans_prot)(phdr[i].p_flags);
|
2015-04-15 08:13:53 +00:00
|
|
|
imgp->stack_sz = phdr[i].p_memsz;
|
2011-01-08 16:30:59 +00:00
|
|
|
break;
|
2019-04-09 15:24:38 +00:00
|
|
|
case PT_PHDR: /* Program header table info */
|
|
|
|
proghdr = phdr[i].p_vaddr;
|
|
|
|
break;
|
2002-07-20 02:56:12 +00:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2019-03-28 21:43:01 +00:00
|
|
|
brand_info = __elfN(get_brandinfo)(imgp, interp, &osrel, &fctl0);
|
2002-09-02 04:50:57 +00:00
|
|
|
if (brand_info == NULL) {
|
|
|
|
uprintf("ELF binary type \"%u\" not known.\n",
|
|
|
|
hdr->e_ident[EI_OSABI]);
|
2015-10-14 18:27:35 +00:00
|
|
|
error = ENOEXEC;
|
|
|
|
goto ret;
|
2002-07-20 02:56:12 +00:00
|
|
|
}
|
Implement Address Space Layout Randomization (ASLR)
With this change, randomization can be enabled for all non-fixed
mappings. It means that the base address for the mapping is selected
with a guaranteed amount of entropy (bits). If the mapping was
requested to be superpage aligned, the randomization honours the
superpage attributes.
Although the value of ASLR is diminishing over time as exploit authors
work out simple ASLR bypass techniques, it eliminates the trivial
exploitation of certain vulnerabilities, at least in theory. This
implementation is relatively small and happens at the correct
architectural level. Also, it is not expected to introduce
regressions in existing cases when turned off (default for now), or
cause any significant maintenance burden.
The randomization is done on a best-effort basis - that is, the
allocator falls back to a first fit strategy if fragmentation prevents
entropy injection. It is trivial to implement a strong mode where
failure to guarantee the requested amount of entropy results in
mapping request failure, but I do not consider that to be usable.
I have not fine-tuned the amount of entropy injected right now. It is
only a quantitative change that will not change the implementation. The
current amount is controlled by aslr_pages_rnd.
To not spoil coalescing optimizations, to reduce the page table
fragmentation inherent to ASLR, and to keep the transient superpage
promotion for the malloced memory, locality clustering is implemented
for anonymous private mappings, which are automatically grouped until
fragmentation kicks in. The initial location for the anon group range
is, of course, randomized. This is controlled by vm.cluster_anon,
enabled by default.
The default mode keeps the sbrk area unpopulated by other mappings,
but this can be turned off, which gives much more breathing bits on
architectures with small address space, such as i386. This is tied
with the question of following an application's hint about the mmap(2)
base address. Testing shows that ignoring the hint does not affect the
function of common applications, but I would expect more demanding
code could break. By default sbrk is preserved and mmap hints are
satisfied, which can be changed by using the
kern.elf{32,64}.aslr.honor_sbrk sysctl.
ASLR is enabled on per-ABI basis, and currently it is only allowed on
FreeBSD native i386 and amd64 (including compat 32bit) ABIs. Support
for additional architectures will be added after further testing.
Both per-process and per-image controls are implemented:
- procctl(2) adds PROC_ASLR_CTL/PROC_ASLR_STATUS;
- NT_FREEBSD_FCTL_ASLR_DISABLE feature control note bit makes it possible
to force ASLR off for the given binary. (A tool to edit the feature
control note is in development.)
Global controls are:
- kern.elf{32,64}.aslr.enable - for non-fixed mappings done by mmap(2);
- kern.elf{32,64}.aslr.pie_enable - for PIE image activation mappings;
- kern.elf{32,64}.aslr.honor_sbrk - allow to use sbrk area for mmap(2);
- vm.cluster_anon - enables anon mapping clustering.
PR: 208580 (exp runs)
Exp-runs done by: antoine
Reviewed by: markj (previous version)
Discussed with: emaste
Tested by: pho
MFC after: 1 month
Sponsored by: The FreeBSD Foundation
Differential revision: https://reviews.freebsd.org/D5603
2019-02-10 17:19:45 +00:00
|
|
|
sv = brand_info->sysvec;
|
2017-01-24 22:46:43 +00:00
|
|
|
et_dyn_addr = 0;
|
2009-10-10 15:33:01 +00:00
|
|
|
if (hdr->e_type == ET_DYN) {
|
2015-06-08 16:07:07 +00:00
|
|
|
if ((brand_info->flags & BI_CAN_EXEC_DYN) == 0) {
|
|
|
|
uprintf("Cannot execute shared object\n");
|
2015-10-14 18:27:35 +00:00
|
|
|
error = ENOEXEC;
|
|
|
|
goto ret;
|
2015-06-08 16:07:07 +00:00
|
|
|
}
|
2009-10-18 12:57:48 +00:00
|
|
|
/*
|
|
|
|
* Honour the base load address from the dso if it is
|
|
|
|
* non-zero for some reason.
|
|
|
|
*/
|
Implement Address Space Layout Randomization (ASLR)
With this change, randomization can be enabled for all non-fixed
mappings. It means that the base address for the mapping is selected
with a guaranteed amount of entropy (bits). If the mapping was
requested to be superpage aligned, the randomization honours the
superpage attributes.
Although the value of ASLR is diminishing over time as exploit authors
work out simple ASLR bypass techniques, it eliminates the trivial
exploitation of certain vulnerabilities, at least in theory. This
implementation is relatively small and happens at the correct
architectural level. Also, it is not expected to introduce
regressions in existing cases when turned off (default for now), or
cause any significant maintenance burden.
The randomization is done on a best-effort basis - that is, the
allocator falls back to a first fit strategy if fragmentation prevents
entropy injection. It is trivial to implement a strong mode where
failure to guarantee the requested amount of entropy results in
mapping request failure, but I do not consider that to be usable.
I have not fine-tuned the amount of entropy injected right now. It is
only a quantitative change that will not change the implementation. The
current amount is controlled by aslr_pages_rnd.
To not spoil coalescing optimizations, to reduce the page table
fragmentation inherent to ASLR, and to keep the transient superpage
promotion for the malloced memory, locality clustering is implemented
for anonymous private mappings, which are automatically grouped until
fragmentation kicks in. The initial location for the anon group range
is, of course, randomized. This is controlled by vm.cluster_anon,
enabled by default.
The default mode keeps the sbrk area unpopulated by other mappings,
but this can be turned off, which gives much more breathing bits on
architectures with small address space, such as i386. This is tied
with the question of following an application's hint about the mmap(2)
base address. Testing shows that ignoring the hint does not affect the
function of common applications, but I would expect more demanding
code could break. By default sbrk is preserved and mmap hints are
satisfied, which can be changed by using the
kern.elf{32,64}.aslr.honor_sbrk sysctl.
ASLR is enabled on per-ABI basis, and currently it is only allowed on
FreeBSD native i386 and amd64 (including compat 32bit) ABIs. Support
for additional architectures will be added after further testing.
Both per-process and per-image controls are implemented:
- procctl(2) adds PROC_ASLR_CTL/PROC_ASLR_STATUS;
- NT_FREEBSD_FCTL_ASLR_DISABLE feature control note bit makes it possible
to force ASLR off for the given binary. (A tool to edit the feature
control note is in development.)
Global controls are:
- kern.elf{32,64}.aslr.enable - for non-fixed mappings done by mmap(2);
- kern.elf{32,64}.aslr.pie_enable - for PIE image activation mappings;
- kern.elf{32,64}.aslr.honor_sbrk - allow to use sbrk area for mmap(2);
- vm.cluster_anon - enables anon mapping clustering.
PR: 208580 (exp runs)
Exp-runs done by: antoine
Reviewed by: markj (previous version)
Discussed with: emaste
Tested by: pho
MFC after: 1 month
Sponsored by: The FreeBSD Foundation
Differential revision: https://reviews.freebsd.org/D5603
2019-02-10 17:19:45 +00:00
|
|
|
if (baddr == 0) {
|
|
|
|
if ((sv->sv_flags & SV_ASLR) == 0 ||
|
|
|
|
(fctl0 & NT_FREEBSD_FCTL_ASLR_DISABLE) != 0)
|
2019-09-21 18:00:23 +00:00
|
|
|
et_dyn_addr = __elfN(pie_base);
|
Implement Address Space Layout Randomization (ASLR)
With this change, randomization can be enabled for all non-fixed
mappings. It means that the base address for the mapping is selected
with a guaranteed amount of entropy (bits). If the mapping was
requested to be superpage aligned, the randomization honours the
superpage attributes.
Although the value of ASLR is diminishing over time as exploit authors
work out simple ASLR bypass techniques, it eliminates the trivial
exploitation of certain vulnerabilities, at least in theory. This
implementation is relatively small and happens at the correct
architectural level. Also, it is not expected to introduce
regressions in existing cases when turned off (default for now), or
cause any significant maintenance burden.
The randomization is done on a best-effort basis - that is, the
allocator falls back to a first fit strategy if fragmentation prevents
entropy injection. It is trivial to implement a strong mode where
failure to guarantee the requested amount of entropy results in
mapping request failure, but I do not consider that to be usable.
I have not fine-tuned the amount of entropy injected right now. It is
only a quantitative change that will not change the implementation. The
current amount is controlled by aslr_pages_rnd.
To not spoil coalescing optimizations, to reduce the page table
fragmentation inherent to ASLR, and to keep the transient superpage
promotion for the malloced memory, locality clustering is implemented
for anonymous private mappings, which are automatically grouped until
fragmentation kicks in. The initial location for the anon group range
is, of course, randomized. This is controlled by vm.cluster_anon,
enabled by default.
The default mode keeps the sbrk area unpopulated by other mappings,
but this can be turned off, which gives much more breathing bits on
architectures with small address space, such as i386. This is tied
with the question of following an application's hint about the mmap(2)
base address. Testing shows that ignoring the hint does not affect the
function of common applications, but I would expect more demanding
code could break. By default sbrk is preserved and mmap hints are
satisfied, which can be changed by using the
kern.elf{32,64}.aslr.honor_sbrk sysctl.
ASLR is enabled on per-ABI basis, and currently it is only allowed on
FreeBSD native i386 and amd64 (including compat 32bit) ABIs. Support
for additional architectures will be added after further testing.
Both per-process and per-image controls are implemented:
- procctl(2) adds PROC_ASLR_CTL/PROC_ASLR_STATUS;
- NT_FREEBSD_FCTL_ASLR_DISABLE feature control note bit makes it possible
to force ASLR off for the given binary. (A tool to edit the feature
control note is in development.)
Global controls are:
- kern.elf{32,64}.aslr.enable - for non-fixed mappings done by mmap(2);
- kern.elf{32,64}.aslr.pie_enable - for PIE image activation mappings;
- kern.elf{32,64}.aslr.honor_sbrk - allow to use sbrk area for mmap(2);
- vm.cluster_anon - enables anon mapping clustering.
PR: 208580 (exp runs)
Exp-runs done by: antoine
Reviewed by: markj (previous version)
Discussed with: emaste
Tested by: pho
MFC after: 1 month
Sponsored by: The FreeBSD Foundation
Differential revision: https://reviews.freebsd.org/D5603
2019-02-10 17:19:45 +00:00
|
|
|
else if ((__elfN(pie_aslr_enabled) &&
|
|
|
|
(imgp->proc->p_flag2 & P2_ASLR_DISABLE) == 0) ||
|
|
|
|
(imgp->proc->p_flag2 & P2_ASLR_ENABLE) != 0)
|
|
|
|
et_dyn_addr = ET_DYN_ADDR_RAND;
|
|
|
|
else
|
2019-09-21 18:00:23 +00:00
|
|
|
et_dyn_addr = __elfN(pie_base);
|
Implement Address Space Layout Randomization (ASLR)
With this change, randomization can be enabled for all non-fixed
mappings. It means that the base address for the mapping is selected
with a guaranteed amount of entropy (bits). If the mapping was
requested to be superpage aligned, the randomization honours the
superpage attributes.
Although the value of ASLR is diminishing over time as exploit authors
work out simple ASLR bypass techniques, it eliminates the trivial
exploitation of certain vulnerabilities, at least in theory. This
implementation is relatively small and happens at the correct
architectural level. Also, it is not expected to introduce
regressions in existing cases when turned off (default for now), or
cause any significant maintenance burden.
The randomization is done on a best-effort basis - that is, the
allocator falls back to a first fit strategy if fragmentation prevents
entropy injection. It is trivial to implement a strong mode where
failure to guarantee the requested amount of entropy results in
mapping request failure, but I do not consider that to be usable.
I have not fine-tuned the amount of entropy injected right now. It is
only a quantitative change that will not change the implementation. The
current amount is controlled by aslr_pages_rnd.
To not spoil coalescing optimizations, to reduce the page table
fragmentation inherent to ASLR, and to keep the transient superpage
promotion for the malloced memory, locality clustering is implemented
for anonymous private mappings, which are automatically grouped until
fragmentation kicks in. The initial location for the anon group range
is, of course, randomized. This is controlled by vm.cluster_anon,
enabled by default.
The default mode keeps the sbrk area unpopulated by other mappings,
but this can be turned off, which gives much more breathing bits on
architectures with small address space, such as i386. This is tied
with the question of following an application's hint about the mmap(2)
base address. Testing shows that ignoring the hint does not affect the
function of common applications, but I would expect more demanding
code could break. By default sbrk is preserved and mmap hints are
satisfied, which can be changed by using the
kern.elf{32,64}.aslr.honor_sbrk sysctl.
ASLR is enabled on per-ABI basis, and currently it is only allowed on
FreeBSD native i386 and amd64 (including compat 32bit) ABIs. Support
for additional architectures will be added after further testing.
Both per-process and per-image controls are implemented:
- procctl(2) adds PROC_ASLR_CTL/PROC_ASLR_STATUS;
- NT_FREEBSD_FCTL_ASLR_DISABLE feature control note bit makes it possible
to force ASLR off for the given binary. (A tool to edit the feature
control note is in development.)
Global controls are:
- kern.elf{32,64}.aslr.enable - for non-fixed mappings done by mmap(2);
- kern.elf{32,64}.aslr.pie_enable - for PIE image activation mappings;
- kern.elf{32,64}.aslr.honor_sbrk - allow to use sbrk area for mmap(2);
- vm.cluster_anon - enables anon mapping clustering.
PR: 208580 (exp runs)
Exp-runs done by: antoine
Reviewed by: markj (previous version)
Discussed with: emaste
Tested by: pho
MFC after: 1 month
Sponsored by: The FreeBSD Foundation
Differential revision: https://reviews.freebsd.org/D5603
2019-02-10 17:19:45 +00:00
|
|
|
}
|
2017-01-24 22:46:43 +00:00
|
|
|
}
|
2002-09-02 04:50:57 +00:00
|
|
|
|
2005-12-24 04:57:50 +00:00
|
|
|
/*
|
|
|
|
* Avoid a possible deadlock if the current address space is destroyed
|
|
|
|
* and that address space maps the locked vnode. In the common case,
|
|
|
|
* the locked vnode's v_usecount is decremented but remains greater
|
|
|
|
* than zero. Consequently, the vnode lock is not needed by vrele().
|
|
|
|
* However, in cases where the vnode lock is external, such as nullfs,
|
|
|
|
* v_usecount may become zero.
|
2012-01-17 16:20:50 +00:00
|
|
|
*
|
|
|
|
* The VV_TEXT flag prevents modifications to the executable while
|
|
|
|
* the vnode is unlocked.
|
2005-12-24 04:57:50 +00:00
|
|
|
*/
|
2008-01-13 14:44:15 +00:00
|
|
|
VOP_UNLOCK(imgp->vp, 0);
|
2005-12-24 04:57:50 +00:00
|
|
|
|
Implement Address Space Layout Randomization (ASLR)
With this change, randomization can be enabled for all non-fixed
mappings. It means that the base address for the mapping is selected
with a guaranteed amount of entropy (bits). If the mapping was
requested to be superpage aligned, the randomization honours the
superpage attributes.
Although the value of ASLR is diminishing over time as exploit authors
work out simple ASLR bypass techniques, it eliminates the trivial
exploitation of certain vulnerabilities, at least in theory. This
implementation is relatively small and happens at the correct
architectural level. Also, it is not expected to introduce
regressions in existing cases when turned off (default for now), or
cause any significant maintenance burden.
The randomization is done on a best-effort basis - that is, the
allocator falls back to a first fit strategy if fragmentation prevents
entropy injection. It is trivial to implement a strong mode where
failure to guarantee the requested amount of entropy results in
mapping request failure, but I do not consider that to be usable.
I have not fine-tuned the amount of entropy injected right now. It is
only a quantitative change that will not change the implementation. The
current amount is controlled by aslr_pages_rnd.
To not spoil coalescing optimizations, to reduce the page table
fragmentation inherent to ASLR, and to keep the transient superpage
promotion for the malloced memory, locality clustering is implemented
for anonymous private mappings, which are automatically grouped until
fragmentation kicks in. The initial location for the anon group range
is, of course, randomized. This is controlled by vm.cluster_anon,
enabled by default.
The default mode keeps the sbrk area unpopulated by other mappings,
but this can be turned off, which gives much more breathing bits on
architectures with small address space, such as i386. This is tied
with the question of following an application's hint about the mmap(2)
base address. Testing shows that ignoring the hint does not affect the
function of common applications, but I would expect more demanding
code could break. By default sbrk is preserved and mmap hints are
satisfied, which can be changed by using the
kern.elf{32,64}.aslr.honor_sbrk sysctl.
ASLR is enabled on per-ABI basis, and currently it is only allowed on
FreeBSD native i386 and amd64 (including compat 32bit) ABIs. Support
for additional architectures will be added after further testing.
Both per-process and per-image controls are implemented:
- procctl(2) adds PROC_ASLR_CTL/PROC_ASLR_STATUS;
- NT_FREEBSD_FCTL_ASLR_DISABLE feature control note bit makes it possible
to force ASLR off for the given binary. (A tool to edit the feature
control note is in development.)
Global controls are:
- kern.elf{32,64}.aslr.enable - for non-fixed mappings done by mmap(2);
- kern.elf{32,64}.aslr.pie_enable - for PIE image activation mappings;
- kern.elf{32,64}.aslr.honor_sbrk - allow to use sbrk area for mmap(2);
- vm.cluster_anon - enables anon mapping clustering.
PR: 208580 (exp runs)
Exp-runs done by: antoine
Reviewed by: markj (previous version)
Discussed with: emaste
Tested by: pho
MFC after: 1 month
Sponsored by: The FreeBSD Foundation
Differential revision: https://reviews.freebsd.org/D5603
2019-02-10 17:19:45 +00:00
|
|
|
/*
|
|
|
|
* Decide whether to enable randomization of user mappings.
|
|
|
|
* First, reset user preferences for the setid binaries.
|
|
|
|
* Then, account for the support of the randomization by the
|
|
|
|
* ABI, by user preferences, and make special treatment for
|
|
|
|
* PIE binaries.
|
|
|
|
*/
|
|
|
|
if (imgp->credential_setid) {
|
|
|
|
PROC_LOCK(imgp->proc);
|
|
|
|
imgp->proc->p_flag2 &= ~(P2_ASLR_ENABLE | P2_ASLR_DISABLE);
|
|
|
|
PROC_UNLOCK(imgp->proc);
|
|
|
|
}
|
|
|
|
if ((sv->sv_flags & SV_ASLR) == 0 ||
|
|
|
|
(imgp->proc->p_flag2 & P2_ASLR_DISABLE) != 0 ||
|
|
|
|
(fctl0 & NT_FREEBSD_FCTL_ASLR_DISABLE) != 0) {
|
|
|
|
KASSERT(et_dyn_addr != ET_DYN_ADDR_RAND,
|
|
|
|
("et_dyn_addr == RAND and !ASLR"));
|
|
|
|
} else if ((imgp->proc->p_flag2 & P2_ASLR_ENABLE) != 0 ||
|
|
|
|
(__elfN(aslr_enabled) && hdr->e_type == ET_EXEC) ||
|
|
|
|
et_dyn_addr == ET_DYN_ADDR_RAND) {
|
|
|
|
imgp->map_flags |= MAP_ASLR;
|
|
|
|
/*
|
|
|
|
* If user does not care about sbrk, utilize the bss
|
|
|
|
* grow region for mappings as well. We can select
|
|
|
|
* the base for the image anywhere and still not suffer
|
|
|
|
* from the fragmentation.
|
|
|
|
*/
|
|
|
|
if (!__elfN(aslr_honor_sbrk) ||
|
|
|
|
(imgp->proc->p_flag2 & P2_ASLR_IGNSTART) != 0)
|
|
|
|
imgp->map_flags |= MAP_ASLR_IGNSTART;
|
|
|
|
}
|
|
|
|
|
2007-11-05 11:36:16 +00:00
|
|
|
error = exec_new_vmspace(imgp, sv);
|
Implement Address Space Layout Randomization (ASLR)
With this change, randomization can be enabled for all non-fixed
mappings. It means that the base address for the mapping is selected
with a guaranteed amount of entropy (bits). If the mapping was
requested to be superpage aligned, the randomization honours the
superpage attributes.
Although the value of ASLR is diminishing over time as exploit authors
work out simple ASLR bypass techniques, it eliminates the trivial
exploitation of certain vulnerabilities, at least in theory. This
implementation is relatively small and happens at the correct
architectural level. Also, it is not expected to introduce
regressions in existing cases when turned off (default for now), or
cause any significant maintenance burden.
The randomization is done on a best-effort basis - that is, the
allocator falls back to a first fit strategy if fragmentation prevents
entropy injection. It is trivial to implement a strong mode where
failure to guarantee the requested amount of entropy results in
mapping request failure, but I do not consider that to be usable.
I have not fine-tuned the amount of entropy injected right now. It is
only a quantitative change that will not change the implementation. The
current amount is controlled by aslr_pages_rnd.
To not spoil coalescing optimizations, to reduce the page table
fragmentation inherent to ASLR, and to keep the transient superpage
promotion for the malloced memory, locality clustering is implemented
for anonymous private mappings, which are automatically grouped until
fragmentation kicks in. The initial location for the anon group range
is, of course, randomized. This is controlled by vm.cluster_anon,
enabled by default.
The default mode keeps the sbrk area unpopulated by other mappings,
but this can be turned off, which gives much more breathing bits on
architectures with small address space, such as i386. This is tied
with the question of following an application's hint about the mmap(2)
base address. Testing shows that ignoring the hint does not affect the
function of common applications, but I would expect more demanding
code could break. By default sbrk is preserved and mmap hints are
satisfied, which can be changed by using the
kern.elf{32,64}.aslr.honor_sbrk sysctl.
ASLR is enabled on per-ABI basis, and currently it is only allowed on
FreeBSD native i386 and amd64 (including compat 32bit) ABIs. Support
for additional architectures will be added after further testing.
Both per-process and per-image controls are implemented:
- procctl(2) adds PROC_ASLR_CTL/PROC_ASLR_STATUS;
- NT_FREEBSD_FCTL_ASLR_DISABLE feature control note bit makes it possible
to force ASLR off for the given binary. (A tool to edit the feature
control note is in development.)
Global controls are:
- kern.elf{32,64}.aslr.enable - for non-fixed mappings done by mmap(2);
- kern.elf{32,64}.aslr.pie_enable - for PIE image activation mappings;
- kern.elf{32,64}.aslr.honor_sbrk - allow to use sbrk area for mmap(2);
- vm.cluster_anon - enables anon mapping clustering.
PR: 208580 (exp runs)
Exp-runs done by: antoine
Reviewed by: markj (previous version)
Discussed with: emaste
Tested by: pho
MFC after: 1 month
Sponsored by: The FreeBSD Foundation
Differential revision: https://reviews.freebsd.org/D5603
2019-02-10 17:19:45 +00:00
|
|
|
vmspace = imgp->proc->p_vmspace;
|
|
|
|
map = &vmspace->vm_map;
|
|
|
|
|
2007-05-14 22:40:04 +00:00
|
|
|
imgp->proc->p_sysent = sv;
|
1996-03-10 08:42:54 +00:00
|
|
|
|
Implement Address Space Layout Randomization (ASLR)
With this change, randomization can be enabled for all non-fixed
mappings. It means that the base address for the mapping is selected
with a guaranteed amount of entropy (bits). If the mapping was
requested to be superpage aligned, the randomization honours the
superpage attributes.
Although the value of ASLR is diminishing over time as exploit authors
work out simple ASLR bypass techniques, it eliminates the trivial
exploitation of certain vulnerabilities, at least in theory. This
implementation is relatively small and happens at the correct
architectural level. Also, it is not expected to introduce
regressions in existing cases when turned off (default for now), or
cause any significant maintenance burden.
The randomization is done on a best-effort basis - that is, the
allocator falls back to a first fit strategy if fragmentation prevents
entropy injection. It is trivial to implement a strong mode where
failure to guarantee the requested amount of entropy results in
mapping request failure, but I do not consider that to be usable.
I have not fine-tuned the amount of entropy injected right now. It is
only a quantitative change that will not change the implementation. The
current amount is controlled by aslr_pages_rnd.
To not spoil coalescing optimizations, to reduce the page table
fragmentation inherent to ASLR, and to keep the transient superpage
promotion for the malloced memory, locality clustering is implemented
for anonymous private mappings, which are automatically grouped until
fragmentation kicks in. The initial location for the anon group range
is, of course, randomized. This is controlled by vm.cluster_anon,
enabled by default.
The default mode keeps the sbrk area unpopulated by other mappings,
but this can be turned off, which gives much more breathing bits on
architectures with small address space, such as i386. This is tied
with the question of following an application's hint about the mmap(2)
base address. Testing shows that ignoring the hint does not affect the
function of common applications, but I would expect more demanding
code could break. By default sbrk is preserved and mmap hints are
satisfied, which can be changed by using the
kern.elf{32,64}.aslr.honor_sbrk sysctl.
ASLR is enabled on per-ABI basis, and currently it is only allowed on
FreeBSD native i386 and amd64 (including compat 32bit) ABIs. Support
for additional architectures will be added after further testing.
Both per-process and per-image controls are implemented:
- procctl(2) adds PROC_ASLR_CTL/PROC_ASLR_STATUS;
- NT_FREEBSD_FCTL_ASLR_DISABLE feature control note bit makes it possible
to force ASLR off for the given binary. (A tool to edit the feature
control note is in development.)
Global controls are:
- kern.elf{32,64}.aslr.enable - for non-fixed mappings done by mmap(2);
- kern.elf{32,64}.aslr.pie_enable - for PIE image activation mappings;
- kern.elf{32,64}.aslr.honor_sbrk - allow to use sbrk area for mmap(2);
- vm.cluster_anon - enables anon mapping clustering.
PR: 208580 (exp runs)
Exp-runs done by: antoine
Reviewed by: markj (previous version)
Discussed with: emaste
Tested by: pho
MFC after: 1 month
Sponsored by: The FreeBSD Foundation
Differential revision: https://reviews.freebsd.org/D5603
2019-02-10 17:19:45 +00:00
|
|
|
maxv = vm_map_max(map) - lim_max(td, RLIMIT_STACK);
|
|
|
|
if (et_dyn_addr == ET_DYN_ADDR_RAND) {
|
|
|
|
KASSERT((map->flags & MAP_ASLR) != 0,
|
|
|
|
("ET_DYN_ADDR_RAND but !MAP_ASLR"));
|
|
|
|
et_dyn_addr = __CONCAT(rnd_, __elfN(base))(map,
|
|
|
|
vm_map_min(map) + mapsz + lim_max(td, RLIMIT_DATA),
|
|
|
|
/* reserve half of the address space to interpreter */
|
|
|
|
maxv / 2, 1UL << flsl(maxalign));
|
|
|
|
}
|
|
|
|
|
Switch to use shared vnode locks for text files during image activation.
kern_execve() locks text vnode exclusive to be able to set and clear
VV_TEXT flag. VV_TEXT is mutually exclusive with the v_writecount > 0
condition.
The change removes VV_TEXT, replacing it with the condition
v_writecount <= -1, and puts v_writecount under the vnode interlock.
Each text reference decrements v_writecount. To clear the text
reference when the segment is unmapped, it is recorded in the
vm_map_entry backed by the text file as MAP_ENTRY_VN_TEXT flag, and
v_writecount is incremented on the map entry removal.
The operations like VOP_ADD_WRITECOUNT() and VOP_SET_TEXT() check that
v_writecount does not contradict the desired change. vn_writecheck()
is now racy and its use was eliminated everywhere except access.
Atomic check for writeability and increment of v_writecount is
performed by the VOP. vn_truncate() now increments v_writecount
around VOP_SETATTR() call, lack of which is arguably a bug on its own.
nullfs bypasses v_writecount to the lower vnode always, so nullfs
vnode has its own v_writecount correct, and lower vnode gets all
references, since object->handle is always lower vnode.
On the text vnode's vm object dealloc, the v_writecount value is reset
to zero, and deadfs vop_unset_text short-circuits the operation.
Reclamation of lowervp always reclaims all nullfs vnodes referencing
lowervp first, so no stray references are left.
Reviewed by: markj, trasz
Tested by: mjg, pho
Sponsored by: The FreeBSD Foundation
MFC after: 1 month
Differential revision: https://reviews.freebsd.org/D19923
2019-05-05 11:20:43 +00:00
|
|
|
vn_lock(imgp->vp, LK_SHARED | LK_RETRY);
|
2015-10-14 18:27:35 +00:00
|
|
|
if (error != 0)
|
|
|
|
goto ret;
|
2005-12-24 04:57:50 +00:00
|
|
|
|
2019-04-09 15:24:38 +00:00
|
|
|
error = __elfN(load_sections)(imgp, hdr, phdr, et_dyn_addr, NULL);
|
|
|
|
if (error != 0)
|
|
|
|
goto ret;
|
2010-03-25 14:21:22 +00:00
|
|
|
|
2019-03-26 15:35:49 +00:00
|
|
|
error = __elfN(enforce_limits)(imgp, hdr, phdr, et_dyn_addr);
|
|
|
|
if (error != 0)
|
2015-10-14 18:27:35 +00:00
|
|
|
goto ret;
|
1996-03-10 08:42:54 +00:00
|
|
|
|
2019-03-26 15:35:49 +00:00
|
|
|
entry = (u_long)hdr->e_entry + et_dyn_addr;
|
1996-03-10 08:42:54 +00:00
|
|
|
|
2003-09-25 01:10:26 +00:00
|
|
|
/*
|
|
|
|
* We load the dynamic linker where a userland call
|
|
|
|
* to mmap(0, ...) would put it. The rationale behind this
|
|
|
|
* calculation is that it leaves room for the heap to grow to
|
|
|
|
* its maximum allowed size.
|
|
|
|
*/
|
2015-10-14 18:27:35 +00:00
|
|
|
addr = round_page((vm_offset_t)vmspace->vm_daddr + lim_max(td,
|
2012-01-17 00:27:32 +00:00
|
|
|
RLIMIT_DATA));
|
Implement Address Space Layout Randomization (ASLR)
With this change, randomization can be enabled for all non-fixed
mappings. It means that the base address for the mapping is selected
with a guaranteed amount of entropy (bits). If the mapping was
requested to be superpage aligned, the randomization honours the
superpage attributes.
Although the value of ASLR is diminishing over time as exploit authors
work out simple ASLR bypass techniques, it eliminates the trivial
exploitation of certain vulnerabilities, at least in theory. This
implementation is relatively small and happens at the correct
architectural level. Also, it is not expected to introduce
regressions in existing cases when turned off (default for now), or
cause any significant maintenance burden.
The randomization is done on a best-effort basis - that is, the
allocator falls back to a first fit strategy if fragmentation prevents
entropy injection. It is trivial to implement a strong mode where
failure to guarantee the requested amount of entropy results in
mapping request failure, but I do not consider that to be usable.
I have not fine-tuned the amount of entropy injected right now. It is
only a quantitative change that will not change the implementation. The
current amount is controlled by aslr_pages_rnd.
To not spoil coalescing optimizations, to reduce the page table
fragmentation inherent to ASLR, and to keep the transient superpage
promotion for the malloced memory, locality clustering is implemented
for anonymous private mappings, which are automatically grouped until
fragmentation kicks in. The initial location for the anon group range
is, of course, randomized. This is controlled by vm.cluster_anon,
enabled by default.
The default mode keeps the sbrk area unpopulated by other mappings,
but this can be turned off, which gives much more breathing bits on
architectures with small address space, such as i386. This is tied
with the question of following an application's hint about the mmap(2)
base address. Testing shows that ignoring the hint does not affect the
function of common applications, but I would expect more demanding
code could break. By default sbrk is preserved and mmap hints are
satisfied, which can be changed by using the
kern.elf{32,64}.aslr.honor_sbrk sysctl.
ASLR is enabled on per-ABI basis, and currently it is only allowed on
FreeBSD native i386 and amd64 (including compat 32bit) ABIs. Support
for additional architectures will be added after further testing.
Both per-process and per-image controls are implemented:
- procctl(2) adds PROC_ASLR_CTL/PROC_ASLR_STATUS;
- NT_FREEBSD_FCTL_ASLR_DISABLE feature control note bit makes it possible
to force ASLR off for the given binary. (A tool to edit the feature
control note is in development.)
Global controls are:
- kern.elf{32,64}.aslr.enable - for non-fixed mappings done by mmap(2);
- kern.elf{32,64}.aslr.pie_enable - for PIE image activation mappings;
- kern.elf{32,64}.aslr.honor_sbrk - allow to use sbrk area for mmap(2);
- vm.cluster_anon - enables anon mapping clustering.
PR: 208580 (exp runs)
Exp-runs done by: antoine
Reviewed by: markj (previous version)
Discussed with: emaste
Tested by: pho
MFC after: 1 month
Sponsored by: The FreeBSD Foundation
Differential revision: https://reviews.freebsd.org/D5603
2019-02-10 17:19:45 +00:00
|
|
|
if ((map->flags & MAP_ASLR) != 0) {
|
|
|
|
maxv1 = maxv / 2 + addr / 2;
|
|
|
|
MPASS(maxv1 >= addr); /* No overflow */
|
|
|
|
map->anon_loc = __CONCAT(rnd_, __elfN(base))(map, addr, maxv1,
|
|
|
|
MAXPAGESIZES > 1 ? pagesizes[1] : pagesizes[0]);
|
|
|
|
} else {
|
|
|
|
map->anon_loc = addr;
|
|
|
|
}
|
1996-03-10 08:42:54 +00:00
|
|
|
|
1996-10-16 17:51:08 +00:00
|
|
|
imgp->entry_addr = entry;
|
|
|
|
|
2003-12-23 02:42:39 +00:00
|
|
|
if (interp != NULL) {
|
2008-01-13 14:44:15 +00:00
|
|
|
VOP_UNLOCK(imgp->vp, 0);
|
Implement Address Space Layout Randomization (ASLR)
With this change, randomization can be enabled for all non-fixed
mappings. It means that the base address for the mapping is selected
with a guaranteed amount of entropy (bits). If the mapping was
requested to be superpage aligned, the randomization honours the
superpage attributes.
Although the value of ASLR is diminishing over time as exploit authors
work out simple ASLR bypass techniques, it eliminates the trivial
exploitation of certain vulnerabilities, at least in theory. This
implementation is relatively small and happens at the correct
architectural level. Also, it is not expected to introduce
regressions in existing cases when turned off (default for now), or
cause any significant maintenance burden.
The randomization is done on a best-effort basis - that is, the
allocator falls back to a first fit strategy if fragmentation prevents
entropy injection. It is trivial to implement a strong mode where
failure to guarantee the requested amount of entropy results in
mapping request failure, but I do not consider that to be usable.
I have not fine-tuned the amount of entropy injected right now. It is
only a quantitative change that will not change the implementation. The
current amount is controlled by aslr_pages_rnd.
To not spoil coalescing optimizations, to reduce the page table
fragmentation inherent to ASLR, and to keep the transient superpage
promotion for the malloced memory, locality clustering is implemented
for anonymous private mappings, which are automatically grouped until
fragmentation kicks in. The initial location for the anon group range
is, of course, randomized. This is controlled by vm.cluster_anon,
enabled by default.
The default mode keeps the sbrk area unpopulated by other mappings,
but this can be turned off, which gives much more breathing bits on
architectures with small address space, such as i386. This is tied
with the question of following an application's hint about the mmap(2)
base address. Testing shows that ignoring the hint does not affect the
function of common applications, but I would expect more demanding
code could break. By default sbrk is preserved and mmap hints are
satisfied, which can be changed by using the
kern.elf{32,64}.aslr.honor_sbrk sysctl.
ASLR is enabled on per-ABI basis, and currently it is only allowed on
FreeBSD native i386 and amd64 (including compat 32bit) ABIs. Support
for additional architectures will be added after further testing.
Both per-process and per-image controls are implemented:
- procctl(2) adds PROC_ASLR_CTL/PROC_ASLR_STATUS;
- NT_FREEBSD_FCTL_ASLR_DISABLE feature control note bit makes it possible
to force ASLR off for the given binary. (A tool to edit the feature
control note is in development.)
Global controls are:
- kern.elf{32,64}.aslr.enable - for non-fixed mappings done by mmap(2);
- kern.elf{32,64}.aslr.pie_enable - for PIE image activation mappings;
- kern.elf{32,64}.aslr.honor_sbrk - allow to use sbrk area for mmap(2);
- vm.cluster_anon - enables anon mapping clustering.
PR: 208580 (exp runs)
Exp-runs done by: antoine
Reviewed by: markj (previous version)
Discussed with: emaste
Tested by: pho
MFC after: 1 month
Sponsored by: The FreeBSD Foundation
Differential revision: https://reviews.freebsd.org/D5603
2019-02-10 17:19:45 +00:00
|
|
|
if ((map->flags & MAP_ASLR) != 0) {
|
|
|
|
/* Assume that interpreter fits into 1/4 of AS */
|
|
|
|
maxv1 = maxv / 2 + addr / 2;
|
|
|
|
MPASS(maxv1 >= addr); /* No overflow */
|
|
|
|
addr = __CONCAT(rnd_, __elfN(base))(map, addr,
|
|
|
|
maxv1, PAGE_SIZE);
|
|
|
|
}
|
2019-04-08 14:31:07 +00:00
|
|
|
error = __elfN(load_interp)(imgp, brand_info, interp, &addr,
|
|
|
|
&imgp->entry_addr);
|
Switch to use shared vnode locks for text files during image activation.
kern_execve() locks text vnode exclusive to be able to set and clear
VV_TEXT flag. VV_TEXT is mutually exclusive with the v_writecount > 0
condition.
The change removes VV_TEXT, replacing it with the condition
v_writecount <= -1, and puts v_writecount under the vnode interlock.
Each text reference decrements v_writecount. To clear the text
reference when the segment is unmapped, it is recorded in the
vm_map_entry backed by the text file as MAP_ENTRY_VN_TEXT flag, and
v_writecount is incremented on the map entry removal
The operations like VOP_ADD_WRITECOUNT() and VOP_SET_TEXT() check that
v_writecount does not contradict the desired change. vn_writecheck()
is now racy and its use was eliminated everywhere except access.
Atomic check for writeability and increment of v_writecount is
performed by the VOP. vn_truncate() now increments v_writecount
around VOP_SETATTR() call, lack of which is arguably a bug on its own.
nullfs bypasses v_writecount to the lower vnode always, so nullfs
vnode has its own v_writecount correct, and lower vnode gets all
references, since object->handle is always lower vnode.
On the text vnode's vm object dealloc, the v_writecount value is reset
to zero, and deadfs vop_unset_text short-circuits the operation.
Reclamation of lowervp always reclaims all nullfs vnodes referencing
lowervp first, so no stray references are left.
Reviewed by: markj, trasz
Tested by: mjg, pho
Sponsored by: The FreeBSD Foundation
MFC after: 1 month
Differential revision: https://reviews.freebsd.org/D19923
2019-05-05 11:20:43 +00:00
|
|
|
vn_lock(imgp->vp, LK_SHARED | LK_RETRY);
|
2019-04-08 14:31:07 +00:00
|
|
|
if (error != 0)
|
2015-10-14 18:27:35 +00:00
|
|
|
goto ret;
|
2009-01-25 12:07:43 +00:00
|
|
|
} else
|
2009-10-18 12:57:48 +00:00
|
|
|
addr = et_dyn_addr;
|
1996-10-16 17:51:08 +00:00
|
|
|
|
1996-03-10 08:42:54 +00:00
|
|
|
/*
|
|
|
|
* Construct auxargs table (used by the fixup routine)
|
|
|
|
*/
|
2019-05-05 11:04:01 +00:00
|
|
|
elf_auxargs = malloc(sizeof(Elf_Auxargs), M_TEMP, M_NOWAIT);
|
|
|
|
if (elf_auxargs == NULL) {
|
|
|
|
VOP_UNLOCK(imgp->vp, 0);
|
|
|
|
elf_auxargs = malloc(sizeof(Elf_Auxargs), M_TEMP, M_WAITOK);
|
Switch to use shared vnode locks for text files during image activation.
kern_execve() locks text vnode exclusive to be able to set and clear
VV_TEXT flag. VV_TEXT is mutually exclusive with the v_writecount > 0
condition.
The change removes VV_TEXT, replacing it with the condition
v_writecount <= -1, and puts v_writecount under the vnode interlock.
Each text reference decrements v_writecount. To clear the text
reference when the segment is unmapped, it is recorded in the
vm_map_entry backed by the text file as MAP_ENTRY_VN_TEXT flag, and
v_writecount is incremented on the map entry removal
The operations like VOP_ADD_WRITECOUNT() and VOP_SET_TEXT() check that
v_writecount does not contradict the desired change. vn_writecheck()
is now racy and its use was eliminated everywhere except access.
Atomic check for writeability and increment of v_writecount is
performed by the VOP. vn_truncate() now increments v_writecount
around VOP_SETATTR() call, lack of which is arguably a bug on its own.
nullfs bypasses v_writecount to the lower vnode always, so nullfs
vnode has its own v_writecount correct, and lower vnode gets all
references, since object->handle is always lower vnode.
On the text vnode's vm object dealloc, the v_writecount value is reset
to zero, and deadfs vop_unset_text short-circuits the operation.
Reclamation of lowervp always reclaims all nullfs vnodes referencing
lowervp first, so no stray references are left.
Reviewed by: markj, trasz
Tested by: mjg, pho
Sponsored by: The FreeBSD Foundation
MFC after: 1 month
Differential revision: https://reviews.freebsd.org/D19923
2019-05-05 11:20:43 +00:00
|
|
|
vn_lock(imgp->vp, LK_SHARED | LK_RETRY);
|
2019-05-05 11:04:01 +00:00
|
|
|
}
|
1996-03-10 08:42:54 +00:00
|
|
|
elf_auxargs->execfd = -1;
|
2019-04-09 15:24:38 +00:00
|
|
|
elf_auxargs->phdr = proghdr + et_dyn_addr;
|
1996-03-10 08:42:54 +00:00
|
|
|
elf_auxargs->phent = hdr->e_phentsize;
|
|
|
|
elf_auxargs->phnum = hdr->e_phnum;
|
|
|
|
elf_auxargs->pagesz = PAGE_SIZE;
|
|
|
|
elf_auxargs->base = addr;
|
|
|
|
elf_auxargs->flags = 0;
|
|
|
|
elf_auxargs->entry = entry;
|
2015-05-22 20:50:35 +00:00
|
|
|
elf_auxargs->hdr_eflags = hdr->e_flags;
|
1996-03-10 08:42:54 +00:00
|
|
|
|
|
|
|
imgp->auxargs = elf_auxargs;
|
|
|
|
imgp->interpreted = 0;
|
2010-03-25 14:31:26 +00:00
|
|
|
imgp->reloc_base = addr;
|
2009-03-13 16:40:51 +00:00
|
|
|
imgp->proc->p_osrel = osrel;
|
2018-11-23 23:33:55 +00:00
|
|
|
imgp->proc->p_fctl0 = fctl0;
|
2017-02-07 20:34:03 +00:00
|
|
|
imgp->proc->p_elf_machine = hdr->e_machine;
|
|
|
|
imgp->proc->p_elf_flags = hdr->e_flags;
|
2007-12-04 12:28:07 +00:00
|
|
|
|
2017-01-24 22:46:43 +00:00
|
|
|
ret:
|
2019-03-28 21:43:01 +00:00
|
|
|
if (free_interp)
|
|
|
|
free(interp, M_TEMP);
|
2002-08-24 22:01:40 +00:00
|
|
|
return (error);
|
1996-03-10 08:42:54 +00:00
|
|
|
}
|
|
|
|
|
2003-01-04 22:07:48 +00:00
|
|
|
#define suword __CONCAT(suword, __ELF_WORD_SIZE)
|
2002-07-20 02:56:12 +00:00
|
|
|
|
|
|
|
int
|
|
|
|
__elfN(freebsd_fixup)(register_t **stack_base, struct image_params *imgp)
|
1996-03-10 08:42:54 +00:00
|
|
|
{
|
1998-06-07 17:13:14 +00:00
|
|
|
Elf_Auxargs *args = (Elf_Auxargs *)imgp->auxargs;
|
2018-05-24 16:25:18 +00:00
|
|
|
Elf_Auxinfo *argarray, *pos;
|
|
|
|
Elf_Addr *base, *auxbase;
|
|
|
|
int error;
|
1996-03-10 08:42:54 +00:00
|
|
|
|
2003-01-04 22:07:48 +00:00
|
|
|
base = (Elf_Addr *)*stack_base;
|
2018-05-24 16:25:18 +00:00
|
|
|
auxbase = base + imgp->args->argc + 1 + imgp->args->envc + 1;
|
|
|
|
argarray = pos = malloc(AT_COUNT * sizeof(*pos), M_TEMP,
|
|
|
|
M_WAITOK | M_ZERO);
|
1996-03-10 08:42:54 +00:00
|
|
|
|
2008-12-17 16:25:20 +00:00
|
|
|
if (args->execfd != -1)
|
1996-03-10 08:42:54 +00:00
|
|
|
AUXARGS_ENTRY(pos, AT_EXECFD, args->execfd);
|
|
|
|
AUXARGS_ENTRY(pos, AT_PHDR, args->phdr);
|
|
|
|
AUXARGS_ENTRY(pos, AT_PHENT, args->phent);
|
|
|
|
AUXARGS_ENTRY(pos, AT_PHNUM, args->phnum);
|
|
|
|
AUXARGS_ENTRY(pos, AT_PAGESZ, args->pagesz);
|
|
|
|
AUXARGS_ENTRY(pos, AT_FLAGS, args->flags);
|
|
|
|
AUXARGS_ENTRY(pos, AT_ENTRY, args->entry);
|
|
|
|
AUXARGS_ENTRY(pos, AT_BASE, args->base);
|
2015-05-22 20:50:35 +00:00
|
|
|
AUXARGS_ENTRY(pos, AT_EHDRFLAGS, args->hdr_eflags);
|
2009-03-17 12:53:28 +00:00
|
|
|
if (imgp->execpathp != 0)
|
|
|
|
AUXARGS_ENTRY(pos, AT_EXECPATH, imgp->execpathp);
|
2015-02-27 16:28:55 +00:00
|
|
|
AUXARGS_ENTRY(pos, AT_OSRELDATE,
|
|
|
|
imgp->proc->p_ucred->cr_prison->pr_osreldate);
|
2010-08-17 08:55:45 +00:00
|
|
|
if (imgp->canary != 0) {
|
|
|
|
AUXARGS_ENTRY(pos, AT_CANARY, imgp->canary);
|
|
|
|
AUXARGS_ENTRY(pos, AT_CANARYLEN, imgp->canarylen);
|
|
|
|
}
|
|
|
|
AUXARGS_ENTRY(pos, AT_NCPUS, mp_ncpus);
|
|
|
|
if (imgp->pagesizes != 0) {
|
|
|
|
AUXARGS_ENTRY(pos, AT_PAGESIZES, imgp->pagesizes);
|
|
|
|
AUXARGS_ENTRY(pos, AT_PAGESIZESLEN, imgp->pagesizeslen);
|
|
|
|
}
|
2012-06-22 07:06:40 +00:00
|
|
|
if (imgp->sysent->sv_timekeep_base != 0) {
|
|
|
|
AUXARGS_ENTRY(pos, AT_TIMEKEEP,
|
|
|
|
imgp->sysent->sv_timekeep_base);
|
|
|
|
}
|
2011-01-08 18:41:19 +00:00
|
|
|
AUXARGS_ENTRY(pos, AT_STACKPROT, imgp->sysent->sv_shared_page_obj
|
|
|
|
!= NULL && imgp->stack_prot != 0 ? imgp->stack_prot :
|
|
|
|
imgp->sysent->sv_stackprot);
|
2017-09-14 14:26:55 +00:00
|
|
|
if (imgp->sysent->sv_hwcap != NULL)
|
|
|
|
AUXARGS_ENTRY(pos, AT_HWCAP, *imgp->sysent->sv_hwcap);
|
2017-10-21 12:05:01 +00:00
|
|
|
if (imgp->sysent->sv_hwcap2 != NULL)
|
|
|
|
AUXARGS_ENTRY(pos, AT_HWCAP2, *imgp->sysent->sv_hwcap2);
|
1996-03-10 08:42:54 +00:00
|
|
|
AUXARGS_ENTRY(pos, AT_NULL, 0);
|
|
|
|
|
|
|
|
free(imgp->auxargs, M_TEMP);
|
|
|
|
imgp->auxargs = NULL;
|
2018-05-29 17:49:03 +00:00
|
|
|
KASSERT(pos - argarray <= AT_COUNT, ("Too many auxargs"));
|
2018-05-24 16:25:18 +00:00
|
|
|
|
|
|
|
error = copyout(argarray, auxbase, sizeof(*argarray) * AT_COUNT);
|
|
|
|
free(argarray, M_TEMP);
|
|
|
|
if (error != 0)
|
|
|
|
return (error);
|
1996-03-10 08:42:54 +00:00
|
|
|
|
2002-07-20 02:56:12 +00:00
|
|
|
base--;
|
2018-05-24 16:25:18 +00:00
|
|
|
if (suword(base, imgp->args->argc) == -1)
|
|
|
|
return (EFAULT);
|
2002-07-20 02:56:12 +00:00
|
|
|
*stack_base = (register_t *)base;
|
2002-08-24 22:01:40 +00:00
|
|
|
return (0);
|
2002-07-20 02:56:12 +00:00
|
|
|
}
|
1996-03-10 08:42:54 +00:00
|
|
|
|
1998-09-14 22:46:08 +00:00
|
|
|
/*
|
|
|
|
* Code for generating ELF core dumps.
|
|
|
|
*/
|
|
|
|
|
2002-03-19 21:25:46 +00:00
|
|
|
typedef void (*segment_callback)(vm_map_entry_t, void *);
|
1998-09-16 02:04:05 +00:00
|
|
|
|
|
|
|
/* Closure for cb_put_phdr(). */
|
|
|
|
struct phdr_closure {
|
|
|
|
Elf_Phdr *phdr; /* Program header to fill in */
|
|
|
|
Elf_Off offset; /* Offset of segment in core file */
|
|
|
|
};
|
|
|
|
|
|
|
|
/*
 * Closure for cb_size_segment().
 *
 * Accumulates totals over every map entry that each_dumpable_segment()
 * hands to the callback.  Those are the dumpable segments, which are not
 * necessarily writable ones.
 */
struct sseg_closure {
	int count;		/* Count of dumpable segments. */
	size_t size;		/* Total size of all dumpable segments. */
};
|
|
|
|
|
2013-04-14 19:59:38 +00:00
|
|
|
typedef void (*outfunc_t)(void *, struct sbuf *, size_t *);
|
|
|
|
|
|
|
|
struct note_info {
|
|
|
|
int type; /* Note type. */
|
|
|
|
outfunc_t outfunc; /* Output function. */
|
|
|
|
void *outarg; /* Argument for the output function. */
|
|
|
|
size_t outsize; /* Output size. */
|
|
|
|
TAILQ_ENTRY(note_info) link; /* Link to the next note info. */
|
|
|
|
};
|
|
|
|
|
|
|
|
TAILQ_HEAD(note_info_list, note_info);
|
|
|
|
|
2015-03-09 03:50:53 +00:00
|
|
|
/* Coredump output parameters. */
|
|
|
|
struct coredump_params {
|
|
|
|
off_t offset;
|
|
|
|
struct ucred *active_cred;
|
|
|
|
struct ucred *file_cred;
|
|
|
|
struct thread *td;
|
|
|
|
struct vnode *vp;
|
2018-01-08 21:27:41 +00:00
|
|
|
struct compressor *comp;
|
2015-03-09 03:50:53 +00:00
|
|
|
};
|
|
|
|
|
2018-01-08 21:27:41 +00:00
|
|
|
extern int compress_user_cores;
|
|
|
|
extern int compress_user_cores_level;
|
|
|
|
|
2002-03-19 21:25:46 +00:00
|
|
|
static void cb_put_phdr(vm_map_entry_t, void *);
|
|
|
|
static void cb_size_segment(vm_map_entry_t, void *);
|
2017-01-20 13:39:07 +00:00
|
|
|
static int core_write(struct coredump_params *, const void *, size_t, off_t,
|
2015-03-09 03:50:53 +00:00
|
|
|
enum uio_seg);
|
2016-07-20 22:51:33 +00:00
|
|
|
static void each_dumpable_segment(struct thread *, segment_callback, void *);
|
2015-03-09 03:50:53 +00:00
|
|
|
static int __elfN(corehdr)(struct coredump_params *, int, void *, size_t,
|
|
|
|
struct note_info_list *, size_t);
|
2013-04-14 19:59:38 +00:00
|
|
|
static void __elfN(prepare_notes)(struct thread *, struct note_info_list *,
|
|
|
|
size_t *);
|
|
|
|
static void __elfN(puthdr)(struct thread *, void *, size_t, int, size_t);
|
|
|
|
static void __elfN(putnote)(struct note_info *, struct sbuf *);
|
|
|
|
static size_t register_note(struct note_info_list *, int, outfunc_t, void *);
|
|
|
|
static int sbuf_drain_core_output(void *, const char *, int);
|
|
|
|
|
|
|
|
static void __elfN(note_fpregset)(void *, struct sbuf *, size_t *);
|
|
|
|
static void __elfN(note_prpsinfo)(void *, struct sbuf *, size_t *);
|
|
|
|
static void __elfN(note_prstatus)(void *, struct sbuf *, size_t *);
|
|
|
|
static void __elfN(note_threadmd)(void *, struct sbuf *, size_t *);
|
|
|
|
static void __elfN(note_thrmisc)(void *, struct sbuf *, size_t *);
|
2017-03-30 18:21:36 +00:00
|
|
|
static void __elfN(note_ptlwpinfo)(void *, struct sbuf *, size_t *);
|
2013-04-16 19:19:14 +00:00
|
|
|
static void __elfN(note_procstat_auxv)(void *, struct sbuf *, size_t *);
|
|
|
|
static void __elfN(note_procstat_proc)(void *, struct sbuf *, size_t *);
|
|
|
|
static void __elfN(note_procstat_psstrings)(void *, struct sbuf *, size_t *);
|
|
|
|
static void note_procstat_files(void *, struct sbuf *, size_t *);
|
|
|
|
static void note_procstat_groups(void *, struct sbuf *, size_t *);
|
|
|
|
static void note_procstat_osrel(void *, struct sbuf *, size_t *);
|
|
|
|
static void note_procstat_rlimit(void *, struct sbuf *, size_t *);
|
|
|
|
static void note_procstat_umask(void *, struct sbuf *, size_t *);
|
|
|
|
static void note_procstat_vmmap(void *, struct sbuf *, size_t *);
|
1998-09-14 22:46:08 +00:00
|
|
|
|
2015-03-09 03:50:53 +00:00
|
|
|
/*
|
|
|
|
* Write out a core segment to the compression stream.
|
|
|
|
*/
|
2010-03-02 06:58:58 +00:00
|
|
|
static int
|
2015-03-09 03:50:53 +00:00
|
|
|
compress_chunk(struct coredump_params *p, char *base, char *buf, u_int len)
|
|
|
|
{
|
|
|
|
u_int chunk_len;
|
2010-03-02 06:58:58 +00:00
|
|
|
int error;
|
2015-03-09 03:50:53 +00:00
|
|
|
|
|
|
|
while (len > 0) {
|
|
|
|
chunk_len = MIN(len, CORE_BUF_SIZE);
|
2017-01-20 13:39:07 +00:00
|
|
|
|
|
|
|
/*
|
|
|
|
* We can get EFAULT error here.
|
|
|
|
* In that case zero out the current chunk of the segment.
|
|
|
|
*/
|
|
|
|
error = copyin(base, buf, chunk_len);
|
|
|
|
if (error != 0)
|
|
|
|
bzero(buf, chunk_len);
|
2018-01-08 21:27:41 +00:00
|
|
|
error = compressor_write(p->comp, buf, chunk_len);
|
2015-03-09 03:50:53 +00:00
|
|
|
if (error != 0)
|
|
|
|
break;
|
|
|
|
base += chunk_len;
|
|
|
|
len -= chunk_len;
|
2010-03-02 06:58:58 +00:00
|
|
|
}
|
|
|
|
return (error);
|
|
|
|
}
|
|
|
|
|
2015-03-09 03:50:53 +00:00
|
|
|
static int
|
2018-01-08 21:27:41 +00:00
|
|
|
core_compressed_write(void *base, size_t len, off_t offset, void *arg)
|
2015-03-09 03:50:53 +00:00
|
|
|
{
|
|
|
|
|
|
|
|
return (core_write((struct coredump_params *)arg, base, len, offset,
|
|
|
|
UIO_SYSSPACE));
|
|
|
|
}
|
|
|
|
|
|
|
|
static int
|
2017-01-20 13:39:07 +00:00
|
|
|
core_write(struct coredump_params *p, const void *base, size_t len,
|
|
|
|
off_t offset, enum uio_seg seg)
|
2015-03-09 03:50:53 +00:00
|
|
|
{
|
|
|
|
|
2017-01-20 13:39:07 +00:00
|
|
|
return (vn_rdwr_inchunks(UIO_WRITE, p->vp, __DECONST(void *, base),
|
|
|
|
len, offset, seg, IO_UNIT | IO_DIRECT | IO_RANGELOCKED,
|
2015-03-09 03:50:53 +00:00
|
|
|
p->active_cred, p->file_cred, NULL, p->td));
|
|
|
|
}
|
|
|
|
|
|
|
|
static int
|
|
|
|
core_output(void *base, size_t len, off_t offset, struct coredump_params *p,
|
|
|
|
void *tmpbuf)
|
|
|
|
{
|
2017-01-20 13:39:07 +00:00
|
|
|
int error;
|
2015-03-09 03:50:53 +00:00
|
|
|
|
2018-01-08 21:27:41 +00:00
|
|
|
if (p->comp != NULL)
|
2015-03-09 03:50:53 +00:00
|
|
|
return (compress_chunk(p, base, tmpbuf, len));
|
2018-01-08 21:27:41 +00:00
|
|
|
|
2017-01-20 13:39:07 +00:00
|
|
|
/*
|
|
|
|
* EFAULT is a non-fatal error that we can get, for example,
|
|
|
|
* if the segment is backed by a file but extends beyond its
|
|
|
|
* end.
|
|
|
|
*/
|
|
|
|
error = core_write(p, base, len, offset, UIO_USERSPACE);
|
|
|
|
if (error == EFAULT) {
|
|
|
|
log(LOG_WARNING, "Failed to fully fault in a core file segment "
|
|
|
|
"at VA %p with size 0x%zx to be written at offset 0x%jx "
|
|
|
|
"for process %s\n", base, len, offset, curproc->p_comm);
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Write a "real" zero byte at the end of the target region
|
|
|
|
* in the case this is the last segment.
|
|
|
|
* The intermediate space will be implicitly zero-filled.
|
|
|
|
*/
|
|
|
|
error = core_write(p, zero_region, 1, offset + len - 1,
|
|
|
|
UIO_SYSSPACE);
|
|
|
|
}
|
|
|
|
return (error);
|
2015-03-09 03:50:53 +00:00
|
|
|
}
|
2013-04-14 19:59:38 +00:00
|
|
|
|
|
|
|
/*
|
|
|
|
* Drain into a core file.
|
|
|
|
*/
|
|
|
|
static int
|
|
|
|
sbuf_drain_core_output(void *arg, const char *data, int len)
|
|
|
|
{
|
2015-03-09 03:50:53 +00:00
|
|
|
struct coredump_params *p;
|
2013-04-16 19:19:14 +00:00
|
|
|
int error, locked;
|
2013-04-14 19:59:38 +00:00
|
|
|
|
2015-03-09 03:50:53 +00:00
|
|
|
p = (struct coredump_params *)arg;
|
2013-04-16 19:19:14 +00:00
|
|
|
|
|
|
|
/*
|
|
|
|
* Some kern_proc out routines that print to this sbuf may
|
|
|
|
* call us with the process lock held. Draining with the
|
|
|
|
* non-sleepable lock held is unsafe. The lock is needed for
|
|
|
|
* those routines when dumping a live process. In our case we
|
|
|
|
* can safely release the lock before draining and acquire
|
|
|
|
* again after.
|
|
|
|
*/
|
|
|
|
locked = PROC_LOCKED(p->td->td_proc);
|
|
|
|
if (locked)
|
|
|
|
PROC_UNLOCK(p->td->td_proc);
|
2018-01-08 21:27:41 +00:00
|
|
|
if (p->comp != NULL)
|
|
|
|
error = compressor_write(p->comp, __DECONST(char *, data), len);
|
2013-04-14 19:59:38 +00:00
|
|
|
else
|
2015-03-09 03:50:53 +00:00
|
|
|
error = core_write(p, __DECONST(void *, data), len, p->offset,
|
|
|
|
UIO_SYSSPACE);
|
2013-04-16 19:19:14 +00:00
|
|
|
if (locked)
|
|
|
|
PROC_LOCK(p->td->td_proc);
|
2013-04-14 19:59:38 +00:00
|
|
|
if (error != 0)
|
|
|
|
return (-error);
|
|
|
|
p->offset += len;
|
|
|
|
return (len);
|
|
|
|
}
|
|
|
|
|
1998-09-14 22:46:08 +00:00
|
|
|
int
|
2010-03-02 06:58:58 +00:00
|
|
|
__elfN(coredump)(struct thread *td, struct vnode *vp, off_t limit, int flags)
|
1999-09-01 00:29:56 +00:00
|
|
|
{
|
2004-06-26 18:58:22 +00:00
|
|
|
struct ucred *cred = td->td_ucred;
|
1999-09-01 00:29:56 +00:00
|
|
|
int error = 0;
|
1998-09-16 02:04:05 +00:00
|
|
|
struct sseg_closure seginfo;
|
2013-04-14 19:59:38 +00:00
|
|
|
struct note_info_list notelst;
|
2015-03-09 03:50:53 +00:00
|
|
|
struct coredump_params params;
|
2013-04-14 19:59:38 +00:00
|
|
|
struct note_info *ninfo;
|
2015-03-09 03:50:53 +00:00
|
|
|
void *hdr, *tmpbuf;
|
2013-04-14 19:59:38 +00:00
|
|
|
size_t hdrsize, notesz, coresize;
|
1998-09-14 22:46:08 +00:00
|
|
|
|
2010-03-02 06:58:58 +00:00
|
|
|
hdr = NULL;
|
2015-07-14 18:24:05 +00:00
|
|
|
tmpbuf = NULL;
|
2013-04-14 19:59:38 +00:00
|
|
|
TAILQ_INIT(¬elst);
|
2010-03-02 06:58:58 +00:00
|
|
|
|
1998-09-16 02:04:05 +00:00
|
|
|
/* Size the program segments. */
|
|
|
|
seginfo.count = 0;
|
|
|
|
seginfo.size = 0;
|
2016-07-20 22:51:33 +00:00
|
|
|
each_dumpable_segment(td, cb_size_segment, &seginfo);
|
1998-09-16 02:04:05 +00:00
|
|
|
|
|
|
|
/*
|
2013-04-14 19:59:38 +00:00
|
|
|
* Collect info about the core file header area.
|
1998-09-16 02:04:05 +00:00
|
|
|
*/
|
2013-04-14 19:59:38 +00:00
|
|
|
hdrsize = sizeof(Elf_Ehdr) + sizeof(Elf_Phdr) * (1 + seginfo.count);
|
2016-07-20 16:59:36 +00:00
|
|
|
if (seginfo.count + 1 >= PN_XNUM)
|
|
|
|
hdrsize += sizeof(Elf_Shdr);
|
2013-04-14 19:59:38 +00:00
|
|
|
__elfN(prepare_notes)(td, ¬elst, ¬esz);
|
|
|
|
coresize = round_page(hdrsize + notesz) + seginfo.size;
|
1998-09-16 02:04:05 +00:00
|
|
|
|
2015-07-14 18:24:05 +00:00
|
|
|
/* Set up core dump parameters. */
|
|
|
|
params.offset = 0;
|
|
|
|
params.active_cred = cred;
|
|
|
|
params.file_cred = NOCRED;
|
|
|
|
params.td = td;
|
|
|
|
params.vp = vp;
|
2018-01-08 21:27:41 +00:00
|
|
|
params.comp = NULL;
|
2015-07-14 18:24:05 +00:00
|
|
|
|
2011-07-06 20:06:44 +00:00
|
|
|
#ifdef RACCT
|
2015-04-29 10:23:02 +00:00
|
|
|
if (racct_enable) {
|
|
|
|
PROC_LOCK(td->td_proc);
|
|
|
|
error = racct_add(td->td_proc, RACCT_CORE, coresize);
|
|
|
|
PROC_UNLOCK(td->td_proc);
|
|
|
|
if (error != 0) {
|
|
|
|
error = EFAULT;
|
|
|
|
goto done;
|
|
|
|
}
|
2011-04-05 20:23:59 +00:00
|
|
|
}
|
2011-07-06 20:06:44 +00:00
|
|
|
#endif
|
2013-04-14 19:59:38 +00:00
|
|
|
if (coresize >= limit) {
|
2010-04-30 03:13:24 +00:00
|
|
|
error = EFAULT;
|
|
|
|
goto done;
|
|
|
|
}
|
1998-09-16 02:04:05 +00:00
|
|
|
|
2015-03-09 03:50:53 +00:00
|
|
|
/* Create a compression stream if necessary. */
|
2018-01-08 21:27:41 +00:00
|
|
|
if (compress_user_cores != 0) {
|
|
|
|
params.comp = compressor_init(core_compressed_write,
|
|
|
|
compress_user_cores, CORE_BUF_SIZE,
|
|
|
|
compress_user_cores_level, ¶ms);
|
|
|
|
if (params.comp == NULL) {
|
2015-03-09 03:50:53 +00:00
|
|
|
error = EFAULT;
|
|
|
|
goto done;
|
|
|
|
}
|
|
|
|
tmpbuf = malloc(CORE_BUF_SIZE, M_TEMP, M_WAITOK | M_ZERO);
|
|
|
|
}
|
|
|
|
|
1998-09-16 02:04:05 +00:00
|
|
|
/*
|
|
|
|
* Allocate memory for building the header, fill it up,
|
2013-04-14 19:59:38 +00:00
|
|
|
* and write it out following the notes.
|
1998-09-16 02:04:05 +00:00
|
|
|
*/
|
2003-02-19 05:47:46 +00:00
|
|
|
hdr = malloc(hdrsize, M_TEMP, M_WAITOK);
|
2015-03-09 03:50:53 +00:00
|
|
|
error = __elfN(corehdr)(¶ms, seginfo.count, hdr, hdrsize, ¬elst,
|
|
|
|
notesz);
|
1998-09-16 02:04:05 +00:00
|
|
|
|
|
|
|
/* Write the contents of all of the writable segments. */
|
|
|
|
if (error == 0) {
|
|
|
|
Elf_Phdr *php;
|
2004-06-05 02:00:12 +00:00
|
|
|
off_t offset;
|
1998-09-16 02:04:05 +00:00
|
|
|
int i;
|
|
|
|
|
|
|
|
php = (Elf_Phdr *)((char *)hdr + sizeof(Elf_Ehdr)) + 1;
|
2013-04-14 19:59:38 +00:00
|
|
|
offset = round_page(hdrsize + notesz);
|
2002-08-24 22:55:16 +00:00
|
|
|
for (i = 0; i < seginfo.count; i++) {
|
2015-03-09 03:50:53 +00:00
|
|
|
error = core_output((caddr_t)(uintptr_t)php->p_vaddr,
|
|
|
|
php->p_filesz, offset, ¶ms, tmpbuf);
|
2004-06-05 02:00:12 +00:00
|
|
|
if (error != 0)
|
|
|
|
break;
|
1998-09-16 02:04:05 +00:00
|
|
|
offset += php->p_filesz;
|
|
|
|
php++;
|
|
|
|
}
|
2018-01-08 21:27:41 +00:00
|
|
|
if (error == 0 && params.comp != NULL)
|
|
|
|
error = compressor_flush(params.comp);
|
1998-09-16 02:04:05 +00:00
|
|
|
}
|
2010-03-02 06:58:58 +00:00
|
|
|
if (error) {
|
|
|
|
log(LOG_WARNING,
|
|
|
|
"Failed to write core file for process %s (error %d)\n",
|
|
|
|
curproc->p_comm, error);
|
|
|
|
}
|
|
|
|
|
|
|
|
done:
|
2018-01-08 21:27:41 +00:00
|
|
|
free(tmpbuf, M_TEMP);
|
|
|
|
if (params.comp != NULL)
|
|
|
|
compressor_fini(params.comp);
|
2013-04-14 19:59:38 +00:00
|
|
|
while ((ninfo = TAILQ_FIRST(¬elst)) != NULL) {
|
|
|
|
TAILQ_REMOVE(¬elst, ninfo, link);
|
|
|
|
free(ninfo, M_TEMP);
|
|
|
|
}
|
|
|
|
if (hdr != NULL)
|
|
|
|
free(hdr, M_TEMP);
|
2002-07-20 02:56:12 +00:00
|
|
|
|
2002-08-24 22:01:40 +00:00
|
|
|
return (error);
|
1998-09-14 22:46:08 +00:00
|
|
|
}
|
|
|
|
|
1998-09-16 02:04:05 +00:00
|
|
|
/*
|
2016-07-20 22:51:33 +00:00
|
|
|
* A callback for each_dumpable_segment() to write out the segment's
|
1998-09-16 02:04:05 +00:00
|
|
|
* program header entry.
|
|
|
|
*/
|
|
|
|
static void
|
2018-03-12 15:45:50 +00:00
|
|
|
cb_put_phdr(vm_map_entry_t entry, void *closure)
|
1998-09-16 02:04:05 +00:00
|
|
|
{
|
|
|
|
struct phdr_closure *phc = (struct phdr_closure *)closure;
|
|
|
|
Elf_Phdr *phdr = phc->phdr;
|
|
|
|
|
|
|
|
phc->offset = round_page(phc->offset);
|
|
|
|
|
|
|
|
phdr->p_type = PT_LOAD;
|
|
|
|
phdr->p_offset = phc->offset;
|
|
|
|
phdr->p_vaddr = entry->start;
|
|
|
|
phdr->p_paddr = 0;
|
|
|
|
phdr->p_filesz = phdr->p_memsz = entry->end - entry->start;
|
|
|
|
phdr->p_align = PAGE_SIZE;
|
2011-01-08 16:02:14 +00:00
|
|
|
phdr->p_flags = __elfN(untrans_prot)(entry->protection);
|
1998-09-16 02:04:05 +00:00
|
|
|
|
|
|
|
phc->offset += phdr->p_filesz;
|
|
|
|
phc->phdr++;
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
2016-07-20 22:51:33 +00:00
|
|
|
* A callback for each_dumpable_segment() to gather information about
|
1998-09-16 02:04:05 +00:00
|
|
|
* the number of segments and their total size.
|
|
|
|
*/
|
|
|
|
static void
|
2016-07-20 22:46:56 +00:00
|
|
|
cb_size_segment(vm_map_entry_t entry, void *closure)
|
1998-09-16 02:04:05 +00:00
|
|
|
{
|
|
|
|
struct sseg_closure *ssc = (struct sseg_closure *)closure;
|
|
|
|
|
|
|
|
ssc->count++;
|
|
|
|
ssc->size += entry->end - entry->start;
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* For each writable segment in the process's memory map, call the given
|
|
|
|
* function with a pointer to the map entry and some arbitrary
|
|
|
|
* caller-supplied data.
|
|
|
|
*/
|
|
|
|
static void
|
2016-07-20 22:51:33 +00:00
|
|
|
each_dumpable_segment(struct thread *td, segment_callback func, void *closure)
|
1998-09-16 02:04:05 +00:00
|
|
|
{
|
2004-06-26 18:58:22 +00:00
|
|
|
struct proc *p = td->td_proc;
|
1998-09-16 02:04:05 +00:00
|
|
|
vm_map_t map = &p->p_vmspace->vm_map;
|
|
|
|
vm_map_entry_t entry;
|
2006-11-19 23:38:59 +00:00
|
|
|
vm_object_t backing_object, object;
|
|
|
|
boolean_t ignore_entry;
|
1998-09-16 02:04:05 +00:00
|
|
|
|
2006-11-19 23:38:59 +00:00
|
|
|
vm_map_lock_read(map);
|
2019-10-08 07:14:21 +00:00
|
|
|
VM_MAP_ENTRY_FOREACH(entry, map) {
|
2002-12-16 19:24:43 +00:00
|
|
|
/*
|
|
|
|
* Don't dump inaccessible mappings, deal with legacy
|
|
|
|
* coredump mode.
|
|
|
|
*
|
|
|
|
* Note that read-only segments related to the elf binary
|
|
|
|
* are marked MAP_ENTRY_NOCOREDUMP now so we no longer
|
|
|
|
* need to arbitrarily ignore such segments.
|
|
|
|
*/
|
|
|
|
if (elf_legacy_coredump) {
|
|
|
|
if ((entry->protection & VM_PROT_RW) != VM_PROT_RW)
|
|
|
|
continue;
|
|
|
|
} else {
|
|
|
|
if ((entry->protection & VM_PROT_ALL) == 0)
|
|
|
|
continue;
|
|
|
|
}
|
1998-09-16 02:04:05 +00:00
|
|
|
|
2000-02-28 04:10:35 +00:00
|
|
|
/*
|
2002-12-16 19:24:43 +00:00
|
|
|
* Dont include memory segment in the coredump if
|
|
|
|
* MAP_NOCORE is set in mmap(2) or MADV_NOCORE in
|
|
|
|
* madvise(2). Do not dump submaps (i.e. parts of the
|
|
|
|
* kernel map).
|
|
|
|
*/
|
|
|
|
if (entry->eflags & (MAP_ENTRY_NOCOREDUMP|MAP_ENTRY_IS_SUB_MAP))
|
2000-02-28 04:10:35 +00:00
|
|
|
continue;
|
|
|
|
|
2006-11-19 23:38:59 +00:00
|
|
|
if ((object = entry->object.vm_object) == NULL)
|
1998-09-16 02:04:05 +00:00
|
|
|
continue;
|
|
|
|
|
|
|
|
/* Ignore memory-mapped devices and such things. */
|
2013-04-08 19:58:32 +00:00
|
|
|
VM_OBJECT_RLOCK(object);
|
2006-11-19 23:38:59 +00:00
|
|
|
while ((backing_object = object->backing_object) != NULL) {
|
2013-04-08 19:58:32 +00:00
|
|
|
VM_OBJECT_RLOCK(backing_object);
|
|
|
|
VM_OBJECT_RUNLOCK(object);
|
2006-11-19 23:38:59 +00:00
|
|
|
object = backing_object;
|
|
|
|
}
|
|
|
|
ignore_entry = object->type != OBJT_DEFAULT &&
|
2015-02-14 17:12:31 +00:00
|
|
|
object->type != OBJT_SWAP && object->type != OBJT_VNODE &&
|
|
|
|
object->type != OBJT_PHYS;
|
2013-04-08 19:58:32 +00:00
|
|
|
VM_OBJECT_RUNLOCK(object);
|
2006-11-19 23:38:59 +00:00
|
|
|
if (ignore_entry)
|
1998-09-16 02:04:05 +00:00
|
|
|
continue;
|
|
|
|
|
|
|
|
(*func)(entry, closure);
|
|
|
|
}
|
2006-11-19 23:38:59 +00:00
|
|
|
vm_map_unlock_read(map);
|
1998-09-16 02:04:05 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Write the core file header to the file, including padding up to
|
|
|
|
* the page boundary.
|
|
|
|
*/
|
1998-09-14 22:46:08 +00:00
|
|
|
static int
|
2015-03-09 03:50:53 +00:00
|
|
|
__elfN(corehdr)(struct coredump_params *p, int numsegs, void *hdr,
|
|
|
|
size_t hdrsize, struct note_info_list *notelst, size_t notesz)
|
1998-09-14 22:46:08 +00:00
|
|
|
{
|
2013-04-14 19:59:38 +00:00
|
|
|
struct note_info *ninfo;
|
|
|
|
struct sbuf *sb;
|
|
|
|
int error;
|
1998-09-14 22:46:08 +00:00
|
|
|
|
|
|
|
/* Fill in the header. */
|
1998-09-16 02:04:05 +00:00
|
|
|
bzero(hdr, hdrsize);
|
2015-03-09 03:50:53 +00:00
|
|
|
__elfN(puthdr)(p->td, hdr, hdrsize, numsegs, notesz);
|
2013-04-14 19:59:38 +00:00
|
|
|
|
|
|
|
sb = sbuf_new(NULL, NULL, CORE_BUF_SIZE, SBUF_FIXEDLEN);
|
2015-03-09 03:50:53 +00:00
|
|
|
sbuf_set_drain(sb, sbuf_drain_core_output, p);
|
2013-04-14 19:59:38 +00:00
|
|
|
sbuf_start_section(sb, NULL);
|
|
|
|
sbuf_bcat(sb, hdr, hdrsize);
|
|
|
|
TAILQ_FOREACH(ninfo, notelst, link)
|
|
|
|
__elfN(putnote)(ninfo, sb);
|
|
|
|
/* Align up to a page boundary for the program segments. */
|
|
|
|
sbuf_end_section(sb, -1, PAGE_SIZE, 0);
|
|
|
|
error = sbuf_finish(sb);
|
|
|
|
sbuf_delete(sb);
|
|
|
|
|
|
|
|
return (error);
|
|
|
|
}
|
|
|
|
|
|
|
|
static void
|
|
|
|
__elfN(prepare_notes)(struct thread *td, struct note_info_list *list,
|
|
|
|
size_t *sizep)
|
|
|
|
{
|
|
|
|
struct proc *p;
|
|
|
|
struct thread *thr;
|
|
|
|
size_t size;
|
|
|
|
|
|
|
|
p = td->td_proc;
|
|
|
|
size = 0;
|
|
|
|
|
|
|
|
size += register_note(list, NT_PRPSINFO, __elfN(note_prpsinfo), p);
|
|
|
|
|
|
|
|
/*
|
|
|
|
* To have the debugger select the right thread (LWP) as the initial
|
|
|
|
* thread, we dump the state of the thread passed to us in td first.
|
|
|
|
* This is the thread that causes the core dump and thus likely to
|
|
|
|
* be the right thread one wants to have selected in the debugger.
|
|
|
|
*/
|
|
|
|
thr = td;
|
|
|
|
while (thr != NULL) {
|
|
|
|
size += register_note(list, NT_PRSTATUS,
|
|
|
|
__elfN(note_prstatus), thr);
|
|
|
|
size += register_note(list, NT_FPREGSET,
|
|
|
|
__elfN(note_fpregset), thr);
|
|
|
|
size += register_note(list, NT_THRMISC,
|
|
|
|
__elfN(note_thrmisc), thr);
|
2017-03-30 18:21:36 +00:00
|
|
|
size += register_note(list, NT_PTLWPINFO,
|
|
|
|
__elfN(note_ptlwpinfo), thr);
|
2013-04-14 19:59:38 +00:00
|
|
|
size += register_note(list, -1,
|
|
|
|
__elfN(note_threadmd), thr);
|
|
|
|
|
|
|
|
thr = (thr == td) ? TAILQ_FIRST(&p->p_threads) :
|
|
|
|
TAILQ_NEXT(thr, td_plist);
|
|
|
|
if (thr == td)
|
|
|
|
thr = TAILQ_NEXT(thr, td_plist);
|
|
|
|
}
|
|
|
|
|
2013-04-16 19:19:14 +00:00
|
|
|
size += register_note(list, NT_PROCSTAT_PROC,
|
|
|
|
__elfN(note_procstat_proc), p);
|
|
|
|
size += register_note(list, NT_PROCSTAT_FILES,
|
|
|
|
note_procstat_files, p);
|
|
|
|
size += register_note(list, NT_PROCSTAT_VMMAP,
|
|
|
|
note_procstat_vmmap, p);
|
|
|
|
size += register_note(list, NT_PROCSTAT_GROUPS,
|
|
|
|
note_procstat_groups, p);
|
|
|
|
size += register_note(list, NT_PROCSTAT_UMASK,
|
|
|
|
note_procstat_umask, p);
|
|
|
|
size += register_note(list, NT_PROCSTAT_RLIMIT,
|
|
|
|
note_procstat_rlimit, p);
|
|
|
|
size += register_note(list, NT_PROCSTAT_OSREL,
|
|
|
|
note_procstat_osrel, p);
|
|
|
|
size += register_note(list, NT_PROCSTAT_PSSTRINGS,
|
|
|
|
__elfN(note_procstat_psstrings), p);
|
|
|
|
size += register_note(list, NT_PROCSTAT_AUXV,
|
|
|
|
__elfN(note_procstat_auxv), p);
|
|
|
|
|
2013-04-14 19:59:38 +00:00
|
|
|
*sizep = size;
|
|
|
|
}
|
|
|
|
|
|
|
|
static void
|
|
|
|
__elfN(puthdr)(struct thread *td, void *hdr, size_t hdrsize, int numsegs,
|
|
|
|
size_t notesz)
|
|
|
|
{
|
|
|
|
Elf_Ehdr *ehdr;
|
|
|
|
Elf_Phdr *phdr;
|
2016-07-20 16:59:36 +00:00
|
|
|
Elf_Shdr *shdr;
|
2013-04-14 19:59:38 +00:00
|
|
|
struct phdr_closure phc;
|
|
|
|
|
|
|
|
ehdr = (Elf_Ehdr *)hdr;
|
|
|
|
|
|
|
|
ehdr->e_ident[EI_MAG0] = ELFMAG0;
|
|
|
|
ehdr->e_ident[EI_MAG1] = ELFMAG1;
|
|
|
|
ehdr->e_ident[EI_MAG2] = ELFMAG2;
|
|
|
|
ehdr->e_ident[EI_MAG3] = ELFMAG3;
|
|
|
|
ehdr->e_ident[EI_CLASS] = ELF_CLASS;
|
|
|
|
ehdr->e_ident[EI_DATA] = ELF_DATA;
|
|
|
|
ehdr->e_ident[EI_VERSION] = EV_CURRENT;
|
|
|
|
ehdr->e_ident[EI_OSABI] = ELFOSABI_FREEBSD;
|
|
|
|
ehdr->e_ident[EI_ABIVERSION] = 0;
|
|
|
|
ehdr->e_ident[EI_PAD] = 0;
|
|
|
|
ehdr->e_type = ET_CORE;
|
2017-02-07 20:34:03 +00:00
|
|
|
ehdr->e_machine = td->td_proc->p_elf_machine;
|
2013-04-14 19:59:38 +00:00
|
|
|
ehdr->e_version = EV_CURRENT;
|
|
|
|
ehdr->e_entry = 0;
|
|
|
|
ehdr->e_phoff = sizeof(Elf_Ehdr);
|
2017-02-07 20:34:03 +00:00
|
|
|
ehdr->e_flags = td->td_proc->p_elf_flags;
|
2013-04-14 19:59:38 +00:00
|
|
|
ehdr->e_ehsize = sizeof(Elf_Ehdr);
|
|
|
|
ehdr->e_phentsize = sizeof(Elf_Phdr);
|
|
|
|
ehdr->e_shentsize = sizeof(Elf_Shdr);
|
|
|
|
ehdr->e_shstrndx = SHN_UNDEF;
|
2016-07-20 16:59:36 +00:00
|
|
|
if (numsegs + 1 < PN_XNUM) {
|
|
|
|
ehdr->e_phnum = numsegs + 1;
|
|
|
|
ehdr->e_shnum = 0;
|
|
|
|
} else {
|
|
|
|
ehdr->e_phnum = PN_XNUM;
|
|
|
|
ehdr->e_shnum = 1;
|
|
|
|
|
|
|
|
ehdr->e_shoff = ehdr->e_phoff +
|
|
|
|
(numsegs + 1) * ehdr->e_phentsize;
|
|
|
|
KASSERT(ehdr->e_shoff == hdrsize - sizeof(Elf_Shdr),
|
|
|
|
("e_shoff: %zu, hdrsize - shdr: %zu",
|
2016-07-20 18:11:22 +00:00
|
|
|
(size_t)ehdr->e_shoff, hdrsize - sizeof(Elf_Shdr)));
|
2016-07-20 16:59:36 +00:00
|
|
|
|
|
|
|
shdr = (Elf_Shdr *)((char *)hdr + ehdr->e_shoff);
|
|
|
|
memset(shdr, 0, sizeof(*shdr));
|
|
|
|
/*
|
|
|
|
* A special first section is used to hold large segment and
|
|
|
|
* section counts. This was proposed by Sun Microsystems in
|
|
|
|
* Solaris and has been adopted by Linux; the standard ELF
|
|
|
|
* tools are already familiar with the technique.
|
|
|
|
*
|
|
|
|
* See table 7-7 of the Solaris "Linker and Libraries Guide"
|
|
|
|
* (or 12-7 depending on the version of the document) for more
|
|
|
|
* details.
|
|
|
|
*/
|
|
|
|
shdr->sh_type = SHT_NULL;
|
|
|
|
shdr->sh_size = ehdr->e_shnum;
|
|
|
|
shdr->sh_link = ehdr->e_shstrndx;
|
|
|
|
shdr->sh_info = numsegs + 1;
|
|
|
|
}
|
2013-04-14 19:59:38 +00:00
|
|
|
|
|
|
|
/*
|
|
|
|
* Fill in the program header entries.
|
|
|
|
*/
|
2016-07-20 16:59:36 +00:00
|
|
|
phdr = (Elf_Phdr *)((char *)hdr + ehdr->e_phoff);
|
2013-04-14 19:59:38 +00:00
|
|
|
|
|
|
|
/* The note segement. */
|
|
|
|
phdr->p_type = PT_NOTE;
|
|
|
|
phdr->p_offset = hdrsize;
|
|
|
|
phdr->p_vaddr = 0;
|
|
|
|
phdr->p_paddr = 0;
|
|
|
|
phdr->p_filesz = notesz;
|
|
|
|
phdr->p_memsz = 0;
|
|
|
|
phdr->p_flags = PF_R;
|
2013-05-01 14:59:16 +00:00
|
|
|
phdr->p_align = ELF_NOTE_ROUNDSIZE;
|
2013-04-14 19:59:38 +00:00
|
|
|
phdr++;
|
|
|
|
|
|
|
|
/* All the writable segments from the program. */
|
|
|
|
phc.phdr = phdr;
|
|
|
|
phc.offset = round_page(hdrsize + notesz);
|
2016-07-20 22:51:33 +00:00
|
|
|
each_dumpable_segment(td, cb_put_phdr, &phc);
|
2013-04-14 19:59:38 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
static size_t
|
|
|
|
register_note(struct note_info_list *list, int type, outfunc_t out, void *arg)
|
|
|
|
{
|
|
|
|
struct note_info *ninfo;
|
|
|
|
size_t size, notesize;
|
|
|
|
|
|
|
|
size = 0;
|
|
|
|
out(arg, NULL, &size);
|
|
|
|
ninfo = malloc(sizeof(*ninfo), M_TEMP, M_ZERO | M_WAITOK);
|
|
|
|
ninfo->type = type;
|
|
|
|
ninfo->outfunc = out;
|
|
|
|
ninfo->outarg = arg;
|
|
|
|
ninfo->outsize = size;
|
|
|
|
TAILQ_INSERT_TAIL(list, ninfo, link);
|
|
|
|
|
|
|
|
if (type == -1)
|
|
|
|
return (size);
|
|
|
|
|
|
|
|
notesize = sizeof(Elf_Note) + /* note header */
|
2014-11-21 20:53:17 +00:00
|
|
|
roundup2(sizeof(FREEBSD_ABI_VENDOR), ELF_NOTE_ROUNDSIZE) +
|
|
|
|
/* note name */
|
|
|
|
roundup2(size, ELF_NOTE_ROUNDSIZE); /* note description */
|
|
|
|
|
|
|
|
return (notesize);
|
|
|
|
}
|
|
|
|
|
|
|
|
static size_t
|
|
|
|
append_note_data(const void *src, void *dst, size_t len)
|
|
|
|
{
|
|
|
|
size_t padded_len;
|
|
|
|
|
|
|
|
padded_len = roundup2(len, ELF_NOTE_ROUNDSIZE);
|
|
|
|
if (dst != NULL) {
|
|
|
|
bcopy(src, dst, len);
|
|
|
|
bzero((char *)dst + len, padded_len - len);
|
|
|
|
}
|
|
|
|
return (padded_len);
|
|
|
|
}
|
|
|
|
|
|
|
|
size_t
|
|
|
|
__elfN(populate_note)(int type, void *src, void *dst, size_t size, void **descp)
|
|
|
|
{
|
|
|
|
Elf_Note *note;
|
|
|
|
char *buf;
|
|
|
|
size_t notesize;
|
|
|
|
|
|
|
|
buf = dst;
|
|
|
|
if (buf != NULL) {
|
|
|
|
note = (Elf_Note *)buf;
|
|
|
|
note->n_namesz = sizeof(FREEBSD_ABI_VENDOR);
|
|
|
|
note->n_descsz = size;
|
|
|
|
note->n_type = type;
|
|
|
|
buf += sizeof(*note);
|
|
|
|
buf += append_note_data(FREEBSD_ABI_VENDOR, buf,
|
|
|
|
sizeof(FREEBSD_ABI_VENDOR));
|
|
|
|
append_note_data(src, buf, size);
|
|
|
|
if (descp != NULL)
|
|
|
|
*descp = buf;
|
|
|
|
}
|
|
|
|
|
|
|
|
notesize = sizeof(Elf_Note) + /* note header */
|
|
|
|
roundup2(sizeof(FREEBSD_ABI_VENDOR), ELF_NOTE_ROUNDSIZE) +
|
|
|
|
/* note name */
|
2013-05-01 14:59:16 +00:00
|
|
|
roundup2(size, ELF_NOTE_ROUNDSIZE); /* note description */
|
2013-04-14 19:59:38 +00:00
|
|
|
|
|
|
|
return (notesize);
|
|
|
|
}
|
|
|
|
|
|
|
|
static void
|
|
|
|
__elfN(putnote)(struct note_info *ninfo, struct sbuf *sb)
|
|
|
|
{
|
|
|
|
Elf_Note note;
|
Detect badly behaved coredump note helpers
Coredump notes depend on being able to invoke dump routines twice; once
in a dry-run mode to get the size of the note, and another to actually
emit the note to the corefile.
When a note helper emits a different length section the second time
around than the length it requested the first time, the kernel produces
a corrupt coredump.
NT_PROCSTAT_FILES output length, when packing kinfo structs, is tied to
the length of filenames corresponding to vnodes in the process' fd table
via vn_fullpath. As vnodes may move around during dump, this is racy.
So:
- Detect badly behaved notes in putnote() and pad underfilled notes.
- Add a fail point, debug.fail_point.fill_kinfo_vnode__random_path to
exercise the NT_PROCSTAT_FILES corruption. It simply picks random
lengths to expand or truncate paths to in fo_fill_kinfo_vnode().
- Add a sysctl, kern.coredump_pack_fileinfo, to allow users to
disable kinfo packing for PROCSTAT_FILES notes. This should avoid
both FILES note corruption and truncation, even if filenames change,
at the cost of about 1 kiB in padding bloat per open fd. Document
the new sysctl in core.5.
- Fix note_procstat_files to self-limit in the 2nd pass. Since
sometimes this will result in a short write, pad up to our advertised
size. This addresses note corruption, at the risk of sometimes
truncating the last several fd info entries.
- Fix NT_PROCSTAT_FILES consumers libutil and libprocstat to grok the
zero padding.
With suggestions from: bjk, jhb, kib, wblock
Approved by: markj (mentor)
Relnotes: yes
Sponsored by: EMC / Isilon Storage Division
Differential Revision: https://reviews.freebsd.org/D3548
2015-09-03 20:32:10 +00:00
|
|
|
ssize_t old_len, sect_len;
|
|
|
|
size_t new_len, descsz, i;
|
2013-04-14 19:59:38 +00:00
|
|
|
|
|
|
|
if (ninfo->type == -1) {
|
|
|
|
ninfo->outfunc(ninfo->outarg, sb, &ninfo->outsize);
|
|
|
|
return;
|
2010-03-02 06:58:58 +00:00
|
|
|
}
|
2013-04-14 19:59:38 +00:00
|
|
|
|
2014-11-21 20:53:17 +00:00
|
|
|
note.n_namesz = sizeof(FREEBSD_ABI_VENDOR);
|
2013-04-14 19:59:38 +00:00
|
|
|
note.n_descsz = ninfo->outsize;
|
|
|
|
note.n_type = ninfo->type;
|
|
|
|
|
|
|
|
sbuf_bcat(sb, ¬e, sizeof(note));
|
|
|
|
sbuf_start_section(sb, &old_len);
|
2014-11-21 20:53:17 +00:00
|
|
|
sbuf_bcat(sb, FREEBSD_ABI_VENDOR, sizeof(FREEBSD_ABI_VENDOR));
|
2013-05-01 14:59:16 +00:00
|
|
|
sbuf_end_section(sb, old_len, ELF_NOTE_ROUNDSIZE, 0);
|
2013-04-14 19:59:38 +00:00
|
|
|
if (note.n_descsz == 0)
|
|
|
|
return;
|
|
|
|
sbuf_start_section(sb, &old_len);
|
|
|
|
ninfo->outfunc(ninfo->outarg, sb, &ninfo->outsize);
|
Detect badly behaved coredump note helpers
Coredump notes depend on being able to invoke dump routines twice; once
in a dry-run mode to get the size of the note, and another to actually
emit the note to the corefile.
When a note helper emits a different length section the second time
around than the length it requested the first time, the kernel produces
a corrupt coredump.
NT_PROCSTAT_FILES output length, when packing kinfo structs, is tied to
the length of filenames corresponding to vnodes in the process' fd table
via vn_fullpath. As vnodes may move around during dump, this is racy.
So:
- Detect badly behaved notes in putnote() and pad underfilled notes.
- Add a fail point, debug.fail_point.fill_kinfo_vnode__random_path to
exercise the NT_PROCSTAT_FILES corruption. It simply picks random
lengths to expand or truncate paths to in fo_fill_kinfo_vnode().
- Add a sysctl, kern.coredump_pack_fileinfo, to allow users to
disable kinfo packing for PROCSTAT_FILES notes. This should avoid
both FILES note corruption and truncation, even if filenames change,
at the cost of about 1 kiB in padding bloat per open fd. Document
the new sysctl in core.5.
- Fix note_procstat_files to self-limit in the 2nd pass. Since
sometimes this will result in a short write, pad up to our advertised
size. This addresses note corruption, at the risk of sometimes
truncating the last several fd info entries.
- Fix NT_PROCSTAT_FILES consumers libutil and libprocstat to grok the
zero padding.
With suggestions from: bjk, jhb, kib, wblock
Approved by: markj (mentor)
Relnotes: yes
Sponsored by: EMC / Isilon Storage Division
Differential Revision: https://reviews.freebsd.org/D3548
2015-09-03 20:32:10 +00:00
|
|
|
sect_len = sbuf_end_section(sb, old_len, ELF_NOTE_ROUNDSIZE, 0);
|
|
|
|
if (sect_len < 0)
|
|
|
|
return;
|
|
|
|
|
|
|
|
new_len = (size_t)sect_len;
|
|
|
|
descsz = roundup(note.n_descsz, ELF_NOTE_ROUNDSIZE);
|
|
|
|
if (new_len < descsz) {
|
|
|
|
/*
|
|
|
|
* It is expected that individual note emitters will correctly
|
|
|
|
* predict their expected output size and fill up to that size
|
|
|
|
* themselves, padding in a format-specific way if needed.
|
|
|
|
* However, in case they don't, just do it here with zeros.
|
|
|
|
*/
|
|
|
|
for (i = 0; i < descsz - new_len; i++)
|
|
|
|
sbuf_putc(sb, 0);
|
|
|
|
} else if (new_len > descsz) {
|
|
|
|
/*
|
|
|
|
* We can't always truncate sb -- we may have drained some
|
|
|
|
* of it already.
|
|
|
|
*/
|
|
|
|
KASSERT(new_len == descsz, ("%s: Note type %u changed as we "
|
|
|
|
"read it (%zu > %zu). Since it is longer than "
|
|
|
|
"expected, this coredump's notes are corrupt. THIS "
|
|
|
|
"IS A BUG in the note_procstat routine for type %u.\n",
|
|
|
|
__func__, (unsigned)note.n_type, new_len, descsz,
|
|
|
|
(unsigned)note.n_type));
|
|
|
|
}
|
1998-09-15 22:23:12 +00:00
|
|
|
}
|
|
|
|
|
2013-04-14 19:59:38 +00:00
|
|
|
/*
|
|
|
|
* Miscellaneous note out functions.
|
|
|
|
*/
|
|
|
|
|
2010-03-11 14:49:06 +00:00
|
|
|
#if defined(COMPAT_FREEBSD32) && __ELF_WORD_SIZE == 32
|
|
|
|
#include <compat/freebsd32/freebsd32.h>
|
2017-06-29 21:31:13 +00:00
|
|
|
#include <compat/freebsd32/freebsd32_signal.h>
|
2010-03-11 14:49:06 +00:00
|
|
|
|
2005-06-30 07:49:22 +00:00
|
|
|
typedef struct prstatus32 elf_prstatus_t;
|
|
|
|
typedef struct prpsinfo32 elf_prpsinfo_t;
|
|
|
|
typedef struct fpreg32 elf_prfpregset_t;
|
|
|
|
typedef struct fpreg32 elf_fpregset_t;
|
|
|
|
typedef struct reg32 elf_gregset_t;
|
Add the ability for GDB to printout the thread name along with other
thread specific informations.
In order to do that, and in order to avoid KBI breakage with existing
infrastructure the following semantic is implemented:
- For live programs, a new member to the PT_LWPINFO is added (pl_tdname)
- For cores, a new ELF note is added (NT_THRMISC) that can be used for
storing thread specific, miscellaneous, informations. Right now it is
just popluated with a thread name.
GDB, then, retrieves the correct informations from the corefile via the
BFD interface, as it groks the ELF notes and create appropriate
pseudo-sections.
Sponsored by: Sandvine Incorporated
Tested by: gianni
Discussed with: dim, kan, kib
MFC after: 2 weeks
2010-11-22 14:42:13 +00:00
|
|
|
typedef struct thrmisc32 elf_thrmisc_t;
|
2013-04-16 19:19:14 +00:00
|
|
|
#define ELF_KERN_PROC_MASK KERN_PROC_MASK32
|
|
|
|
typedef struct kinfo_proc32 elf_kinfo_proc_t;
|
|
|
|
typedef uint32_t elf_ps_strings_t;
|
2005-06-30 07:49:22 +00:00
|
|
|
#else
|
|
|
|
typedef prstatus_t elf_prstatus_t;
|
|
|
|
typedef prpsinfo_t elf_prpsinfo_t;
|
|
|
|
typedef prfpregset_t elf_prfpregset_t;
|
|
|
|
typedef prfpregset_t elf_fpregset_t;
|
|
|
|
typedef gregset_t elf_gregset_t;
|
Add the ability for GDB to printout the thread name along with other
thread specific informations.
In order to do that, and in order to avoid KBI breakage with existing
infrastructure the following semantic is implemented:
- For live programs, a new member to the PT_LWPINFO is added (pl_tdname)
- For cores, a new ELF note is added (NT_THRMISC) that can be used for
storing thread specific, miscellaneous, informations. Right now it is
just popluated with a thread name.
GDB, then, retrieves the correct informations from the corefile via the
BFD interface, as it groks the ELF notes and create appropriate
pseudo-sections.
Sponsored by: Sandvine Incorporated
Tested by: gianni
Discussed with: dim, kan, kib
MFC after: 2 weeks
2010-11-22 14:42:13 +00:00
|
|
|
typedef thrmisc_t elf_thrmisc_t;
|
2013-04-16 19:19:14 +00:00
|
|
|
#define ELF_KERN_PROC_MASK 0
|
|
|
|
typedef struct kinfo_proc elf_kinfo_proc_t;
|
|
|
|
typedef vm_offset_t elf_ps_strings_t;
|
2005-06-30 07:49:22 +00:00
|
|
|
#endif
|
|
|
|
|
1998-09-14 22:46:08 +00:00
|
|
|
static void
|
2013-04-14 19:59:38 +00:00
|
|
|
__elfN(note_prpsinfo)(void *arg, struct sbuf *sb, size_t *sizep)
|
1998-09-14 22:46:08 +00:00
|
|
|
{
|
2016-07-14 23:20:05 +00:00
|
|
|
struct sbuf sbarg;
|
|
|
|
size_t len;
|
|
|
|
char *cp, *end;
|
2004-06-26 18:58:22 +00:00
|
|
|
struct proc *p;
|
2013-04-14 19:59:38 +00:00
|
|
|
elf_prpsinfo_t *psinfo;
|
2016-07-14 23:20:05 +00:00
|
|
|
int error;
|
2004-04-03 20:25:41 +00:00
|
|
|
|
2013-04-14 19:59:38 +00:00
|
|
|
p = (struct proc *)arg;
|
|
|
|
if (sb != NULL) {
|
|
|
|
KASSERT(*sizep == sizeof(*psinfo), ("invalid size"));
|
|
|
|
psinfo = malloc(sizeof(*psinfo), M_TEMP, M_ZERO | M_WAITOK);
|
2004-04-03 20:25:41 +00:00
|
|
|
psinfo->pr_version = PRPSINFO_VERSION;
|
2005-06-30 07:49:22 +00:00
|
|
|
psinfo->pr_psinfosz = sizeof(elf_prpsinfo_t);
|
2008-05-15 03:07:34 +00:00
|
|
|
strlcpy(psinfo->pr_fname, p->p_comm, sizeof(psinfo->pr_fname));
|
2016-07-14 23:20:05 +00:00
|
|
|
PROC_LOCK(p);
|
|
|
|
if (p->p_args != NULL) {
|
|
|
|
len = sizeof(psinfo->pr_psargs) - 1;
|
|
|
|
if (len > p->p_args->ar_length)
|
|
|
|
len = p->p_args->ar_length;
|
|
|
|
memcpy(psinfo->pr_psargs, p->p_args->ar_args, len);
|
|
|
|
PROC_UNLOCK(p);
|
|
|
|
error = 0;
|
|
|
|
} else {
|
|
|
|
_PHOLD(p);
|
|
|
|
PROC_UNLOCK(p);
|
|
|
|
sbuf_new(&sbarg, psinfo->pr_psargs,
|
|
|
|
sizeof(psinfo->pr_psargs), SBUF_FIXEDLEN);
|
|
|
|
error = proc_getargv(curthread, p, &sbarg);
|
|
|
|
PRELE(p);
|
|
|
|
if (sbuf_finish(&sbarg) == 0)
|
|
|
|
len = sbuf_len(&sbarg) - 1;
|
|
|
|
else
|
|
|
|
len = sizeof(psinfo->pr_psargs) - 1;
|
|
|
|
sbuf_delete(&sbarg);
|
|
|
|
}
|
|
|
|
if (error || len == 0)
|
|
|
|
strlcpy(psinfo->pr_psargs, p->p_comm,
|
|
|
|
sizeof(psinfo->pr_psargs));
|
|
|
|
else {
|
|
|
|
KASSERT(len < sizeof(psinfo->pr_psargs),
|
|
|
|
("len is too long: %zu vs %zu", len,
|
|
|
|
sizeof(psinfo->pr_psargs)));
|
|
|
|
cp = psinfo->pr_psargs;
|
|
|
|
end = cp + len - 1;
|
|
|
|
for (;;) {
|
|
|
|
cp = memchr(cp, '\0', end - cp);
|
|
|
|
if (cp == NULL)
|
|
|
|
break;
|
|
|
|
*cp = ' ';
|
|
|
|
}
|
|
|
|
}
|
2016-07-18 15:14:23 +00:00
|
|
|
psinfo->pr_pid = p->p_pid;
|
2013-04-14 19:59:38 +00:00
|
|
|
sbuf_bcat(sb, psinfo, sizeof(*psinfo));
|
|
|
|
free(psinfo, M_TEMP);
|
2004-04-03 20:25:41 +00:00
|
|
|
}
|
2013-04-14 19:59:38 +00:00
|
|
|
*sizep = sizeof(*psinfo);
|
|
|
|
}
|
2004-04-03 20:25:41 +00:00
|
|
|
|
2013-04-14 19:59:38 +00:00
|
|
|
static void
|
|
|
|
__elfN(note_prstatus)(void *arg, struct sbuf *sb, size_t *sizep)
|
|
|
|
{
|
|
|
|
struct thread *td;
|
|
|
|
elf_prstatus_t *status;
|
|
|
|
|
|
|
|
td = (struct thread *)arg;
|
|
|
|
if (sb != NULL) {
|
|
|
|
KASSERT(*sizep == sizeof(*status), ("invalid size"));
|
|
|
|
status = malloc(sizeof(*status), M_TEMP, M_ZERO | M_WAITOK);
|
|
|
|
status->pr_version = PRSTATUS_VERSION;
|
|
|
|
status->pr_statussz = sizeof(elf_prstatus_t);
|
|
|
|
status->pr_gregsetsz = sizeof(elf_gregset_t);
|
|
|
|
status->pr_fpregsetsz = sizeof(elf_fpregset_t);
|
|
|
|
status->pr_osreldate = osreldate;
|
|
|
|
status->pr_cursig = td->td_proc->p_sig;
|
|
|
|
status->pr_pid = td->td_tid;
|
2010-03-11 14:49:06 +00:00
|
|
|
#if defined(COMPAT_FREEBSD32) && __ELF_WORD_SIZE == 32
|
2013-04-14 19:59:38 +00:00
|
|
|
fill_regs32(td, &status->pr_reg);
|
2005-06-30 07:49:22 +00:00
|
|
|
#else
|
2013-04-14 19:59:38 +00:00
|
|
|
fill_regs(td, &status->pr_reg);
|
2005-06-30 07:49:22 +00:00
|
|
|
#endif
|
2013-04-14 19:59:38 +00:00
|
|
|
sbuf_bcat(sb, status, sizeof(*status));
|
|
|
|
free(status, M_TEMP);
|
2004-06-26 18:58:22 +00:00
|
|
|
}
|
2013-04-14 19:59:38 +00:00
|
|
|
*sizep = sizeof(*status);
|
|
|
|
}
|
2004-04-03 20:25:41 +00:00
|
|
|
|
2013-04-14 19:59:38 +00:00
|
|
|
static void
|
|
|
|
__elfN(note_fpregset)(void *arg, struct sbuf *sb, size_t *sizep)
|
|
|
|
{
|
|
|
|
struct thread *td;
|
|
|
|
elf_prfpregset_t *fpregset;
|
1998-09-14 22:46:08 +00:00
|
|
|
|
2013-04-14 19:59:38 +00:00
|
|
|
td = (struct thread *)arg;
|
|
|
|
if (sb != NULL) {
|
|
|
|
KASSERT(*sizep == sizeof(*fpregset), ("invalid size"));
|
|
|
|
fpregset = malloc(sizeof(*fpregset), M_TEMP, M_ZERO | M_WAITOK);
|
2010-03-11 14:49:06 +00:00
|
|
|
#if defined(COMPAT_FREEBSD32) && __ELF_WORD_SIZE == 32
|
2013-04-14 19:59:38 +00:00
|
|
|
fill_fpregs32(td, fpregset);
|
2005-06-30 07:49:22 +00:00
|
|
|
#else
|
2013-04-14 19:59:38 +00:00
|
|
|
fill_fpregs(td, fpregset);
|
2005-06-30 07:49:22 +00:00
|
|
|
#endif
|
2013-04-14 19:59:38 +00:00
|
|
|
sbuf_bcat(sb, fpregset, sizeof(*fpregset));
|
|
|
|
free(fpregset, M_TEMP);
|
1998-09-14 22:46:08 +00:00
|
|
|
}
|
2013-04-14 19:59:38 +00:00
|
|
|
*sizep = sizeof(*fpregset);
|
1998-09-14 22:46:08 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
static void
|
2013-04-14 19:59:38 +00:00
|
|
|
__elfN(note_thrmisc)(void *arg, struct sbuf *sb, size_t *sizep)
|
1998-09-14 22:46:08 +00:00
|
|
|
{
|
2013-04-14 19:59:38 +00:00
|
|
|
struct thread *td;
|
|
|
|
elf_thrmisc_t thrmisc;
|
|
|
|
|
|
|
|
td = (struct thread *)arg;
|
|
|
|
if (sb != NULL) {
|
|
|
|
KASSERT(*sizep == sizeof(thrmisc), ("invalid size"));
|
|
|
|
bzero(&thrmisc._pad, sizeof(thrmisc._pad));
|
|
|
|
strcpy(thrmisc.pr_tname, td->td_name);
|
|
|
|
sbuf_bcat(sb, &thrmisc, sizeof(thrmisc));
|
|
|
|
}
|
|
|
|
*sizep = sizeof(thrmisc);
|
|
|
|
}
|
1998-09-14 22:46:08 +00:00
|
|
|
|
2017-03-30 18:21:36 +00:00
|
|
|
static void
|
|
|
|
__elfN(note_ptlwpinfo)(void *arg, struct sbuf *sb, size_t *sizep)
|
|
|
|
{
|
|
|
|
struct thread *td;
|
|
|
|
size_t size;
|
|
|
|
int structsize;
|
2017-06-29 21:31:13 +00:00
|
|
|
#if defined(COMPAT_FREEBSD32) && __ELF_WORD_SIZE == 32
|
|
|
|
struct ptrace_lwpinfo32 pl;
|
|
|
|
#else
|
2017-03-30 18:21:36 +00:00
|
|
|
struct ptrace_lwpinfo pl;
|
2017-06-29 21:31:13 +00:00
|
|
|
#endif
|
2017-03-30 18:21:36 +00:00
|
|
|
|
|
|
|
td = (struct thread *)arg;
|
2017-06-29 21:31:13 +00:00
|
|
|
size = sizeof(structsize) + sizeof(pl);
|
2017-03-30 18:21:36 +00:00
|
|
|
if (sb != NULL) {
|
|
|
|
KASSERT(*sizep == size, ("invalid size"));
|
2017-06-29 21:31:13 +00:00
|
|
|
structsize = sizeof(pl);
|
2017-03-30 18:21:36 +00:00
|
|
|
sbuf_bcat(sb, &structsize, sizeof(structsize));
|
|
|
|
bzero(&pl, sizeof(pl));
|
|
|
|
pl.pl_lwpid = td->td_tid;
|
|
|
|
pl.pl_event = PL_EVENT_NONE;
|
|
|
|
pl.pl_sigmask = td->td_sigmask;
|
|
|
|
pl.pl_siglist = td->td_siglist;
|
|
|
|
if (td->td_si.si_signo != 0) {
|
|
|
|
pl.pl_event = PL_EVENT_SIGNAL;
|
|
|
|
pl.pl_flags |= PL_FLAG_SI;
|
2017-06-29 21:31:13 +00:00
|
|
|
#if defined(COMPAT_FREEBSD32) && __ELF_WORD_SIZE == 32
|
|
|
|
siginfo_to_siginfo32(&td->td_si, &pl.pl_siginfo);
|
|
|
|
#else
|
2017-03-30 18:21:36 +00:00
|
|
|
pl.pl_siginfo = td->td_si;
|
2017-06-29 21:31:13 +00:00
|
|
|
#endif
|
2017-03-30 18:21:36 +00:00
|
|
|
}
|
|
|
|
strcpy(pl.pl_tdname, td->td_name);
|
|
|
|
/* XXX TODO: supply more information in struct ptrace_lwpinfo*/
|
2017-06-29 21:31:13 +00:00
|
|
|
sbuf_bcat(sb, &pl, sizeof(pl));
|
2017-03-30 18:21:36 +00:00
|
|
|
}
|
|
|
|
*sizep = size;
|
|
|
|
}
|
|
|
|
|
2013-04-14 19:59:38 +00:00
|
|
|
/*
|
|
|
|
* Allow for MD specific notes, as well as any MD
|
|
|
|
* specific preparations for writing MI notes.
|
|
|
|
*/
|
|
|
|
static void
|
|
|
|
__elfN(note_threadmd)(void *arg, struct sbuf *sb, size_t *sizep)
|
|
|
|
{
|
|
|
|
struct thread *td;
|
|
|
|
void *buf;
|
|
|
|
size_t size;
|
|
|
|
|
|
|
|
td = (struct thread *)arg;
|
|
|
|
size = *sizep;
|
|
|
|
if (size != 0 && sb != NULL)
|
|
|
|
buf = malloc(size, M_TEMP, M_ZERO | M_WAITOK);
|
2014-04-14 21:02:20 +00:00
|
|
|
else
|
|
|
|
buf = NULL;
|
2013-04-14 19:59:38 +00:00
|
|
|
size = 0;
|
|
|
|
__elfN(dump_thread)(td, buf, &size);
|
2014-11-22 18:15:02 +00:00
|
|
|
KASSERT(sb == NULL || *sizep == size, ("invalid size"));
|
2014-04-14 21:02:20 +00:00
|
|
|
if (size != 0 && sb != NULL)
|
2013-04-14 19:59:38 +00:00
|
|
|
sbuf_bcat(sb, buf, size);
|
2014-04-14 21:02:20 +00:00
|
|
|
free(buf, M_TEMP);
|
2013-04-14 19:59:38 +00:00
|
|
|
*sizep = size;
|
1998-09-14 22:46:08 +00:00
|
|
|
}
|
|
|
|
|
2013-04-16 19:19:14 +00:00
|
|
|
#ifdef KINFO_PROC_SIZE
|
|
|
|
CTASSERT(sizeof(struct kinfo_proc) == KINFO_PROC_SIZE);
|
|
|
|
#endif
|
|
|
|
|
|
|
|
static void
|
|
|
|
__elfN(note_procstat_proc)(void *arg, struct sbuf *sb, size_t *sizep)
|
|
|
|
{
|
|
|
|
struct proc *p;
|
|
|
|
size_t size;
|
|
|
|
int structsize;
|
|
|
|
|
|
|
|
p = (struct proc *)arg;
|
|
|
|
size = sizeof(structsize) + p->p_numthreads *
|
|
|
|
sizeof(elf_kinfo_proc_t);
|
|
|
|
|
|
|
|
if (sb != NULL) {
|
|
|
|
KASSERT(*sizep == size, ("invalid size"));
|
|
|
|
structsize = sizeof(elf_kinfo_proc_t);
|
|
|
|
sbuf_bcat(sb, &structsize, sizeof(structsize));
|
|
|
|
PROC_LOCK(p);
|
|
|
|
kern_proc_out(p, sb, ELF_KERN_PROC_MASK);
|
|
|
|
}
|
|
|
|
*sizep = size;
|
|
|
|
}
|
|
|
|
|
|
|
|
#ifdef KINFO_FILE_SIZE
|
|
|
|
CTASSERT(sizeof(struct kinfo_file) == KINFO_FILE_SIZE);
|
|
|
|
#endif
|
|
|
|
|
|
|
|
static void
|
|
|
|
note_procstat_files(void *arg, struct sbuf *sb, size_t *sizep)
|
|
|
|
{
|
|
|
|
struct proc *p;
|
Detect badly behaved coredump note helpers
Coredump notes depend on being able to invoke dump routines twice; once
in a dry-run mode to get the size of the note, and another to actually
emit the note to the corefile.
When a note helper emits a different length section the second time
around than the length it requested the first time, the kernel produces
a corrupt coredump.
NT_PROCSTAT_FILES output length, when packing kinfo structs, is tied to
the length of filenames corresponding to vnodes in the process' fd table
via vn_fullpath. As vnodes may move around during dump, this is racy.
So:
- Detect badly behaved notes in putnote() and pad underfilled notes.
- Add a fail point, debug.fail_point.fill_kinfo_vnode__random_path to
exercise the NT_PROCSTAT_FILES corruption. It simply picks random
lengths to expand or truncate paths to in fo_fill_kinfo_vnode().
- Add a sysctl, kern.coredump_pack_fileinfo, to allow users to
disable kinfo packing for PROCSTAT_FILES notes. This should avoid
both FILES note corruption and truncation, even if filenames change,
at the cost of about 1 kiB in padding bloat per open fd. Document
the new sysctl in core.5.
- Fix note_procstat_files to self-limit in the 2nd pass. Since
sometimes this will result in a short write, pad up to our advertised
size. This addresses note corruption, at the risk of sometimes
truncating the last several fd info entries.
- Fix NT_PROCSTAT_FILES consumers libutil and libprocstat to grok the
zero padding.
With suggestions from: bjk, jhb, kib, wblock
Approved by: markj (mentor)
Relnotes: yes
Sponsored by: EMC / Isilon Storage Division
Differential Revision: https://reviews.freebsd.org/D3548
2015-09-03 20:32:10 +00:00
|
|
|
size_t size, sect_sz, i;
|
|
|
|
ssize_t start_len, sect_len;
|
|
|
|
int structsize, filedesc_flags;
|
|
|
|
|
2015-09-07 16:44:28 +00:00
|
|
|
if (coredump_pack_fileinfo)
|
Detect badly behaved coredump note helpers
Coredump notes depend on being able to invoke dump routines twice; once
in a dry-run mode to get the size of the note, and another to actually
emit the note to the corefile.
When a note helper emits a different length section the second time
around than the length it requested the first time, the kernel produces
a corrupt coredump.
NT_PROCSTAT_FILES output length, when packing kinfo structs, is tied to
the length of filenames corresponding to vnodes in the process' fd table
via vn_fullpath. As vnodes may move around during dump, this is racy.
So:
- Detect badly behaved notes in putnote() and pad underfilled notes.
- Add a fail point, debug.fail_point.fill_kinfo_vnode__random_path to
exercise the NT_PROCSTAT_FILES corruption. It simply picks random
lengths to expand or truncate paths to in fo_fill_kinfo_vnode().
- Add a sysctl, kern.coredump_pack_fileinfo, to allow users to
disable kinfo packing for PROCSTAT_FILES notes. This should avoid
both FILES note corruption and truncation, even if filenames change,
at the cost of about 1 kiB in padding bloat per open fd. Document
the new sysctl in core.5.
- Fix note_procstat_files to self-limit in the 2nd pass. Since
sometimes this will result in a short write, pad up to our advertised
size. This addresses note corruption, at the risk of sometimes
truncating the last several fd info entries.
- Fix NT_PROCSTAT_FILES consumers libutil and libprocstat to grok the
zero padding.
With suggestions from: bjk, jhb, kib, wblock
Approved by: markj (mentor)
Relnotes: yes
Sponsored by: EMC / Isilon Storage Division
Differential Revision: https://reviews.freebsd.org/D3548
2015-09-03 20:32:10 +00:00
|
|
|
filedesc_flags = KERN_FILEDESC_PACK_KINFO;
|
|
|
|
else
|
|
|
|
filedesc_flags = 0;
|
2013-04-16 19:19:14 +00:00
|
|
|
|
|
|
|
p = (struct proc *)arg;
|
Detect badly behaved coredump note helpers
Coredump notes depend on being able to invoke dump routines twice; once
in a dry-run mode to get the size of the note, and another to actually
emit the note to the corefile.
When a note helper emits a different length section the second time
around than the length it requested the first time, the kernel produces
a corrupt coredump.
NT_PROCSTAT_FILES output length, when packing kinfo structs, is tied to
the length of filenames corresponding to vnodes in the process' fd table
via vn_fullpath. As vnodes may move around during dump, this is racy.
So:
- Detect badly behaved notes in putnote() and pad underfilled notes.
- Add a fail point, debug.fail_point.fill_kinfo_vnode__random_path to
exercise the NT_PROCSTAT_FILES corruption. It simply picks random
lengths to expand or truncate paths to in fo_fill_kinfo_vnode().
- Add a sysctl, kern.coredump_pack_fileinfo, to allow users to
disable kinfo packing for PROCSTAT_FILES notes. This should avoid
both FILES note corruption and truncation, even if filenames change,
at the cost of about 1 kiB in padding bloat per open fd. Document
the new sysctl in core.5.
- Fix note_procstat_files to self-limit in the 2nd pass. Since
sometimes this will result in a short write, pad up to our advertised
size. This addresses note corruption, at the risk of sometimes
truncating the last several fd info entries.
- Fix NT_PROCSTAT_FILES consumers libutil and libprocstat to grok the
zero padding.
With suggestions from: bjk, jhb, kib, wblock
Approved by: markj (mentor)
Relnotes: yes
Sponsored by: EMC / Isilon Storage Division
Differential Revision: https://reviews.freebsd.org/D3548
2015-09-03 20:32:10 +00:00
|
|
|
structsize = sizeof(struct kinfo_file);
|
2013-04-16 19:19:14 +00:00
|
|
|
if (sb == NULL) {
|
|
|
|
size = 0;
|
|
|
|
sb = sbuf_new(NULL, NULL, 128, SBUF_FIXEDLEN);
|
Optimize kern.geom.conf* sysctls.
On large systems those sysctls may generate megabytes of output. Before
this change sbuf(9) code was resizing buffer by 4KB each time many times,
generating tons of TLB shootdowns. Unfortunately in this case existing
sbuf_new_for_sysctl() mechanism, supposed to help with this issue, is not
applicable, since all the sbuf writes are done in different kernel thread.
This change improves situation in two ways:
- on first sysctl call, not providing any output buffer, it sets special
sbuf drain function, just counting the data and so not needing big buffer;
- on second sysctl call it uses as initial buffer size value saved on
previous call, so that in most cases there will be no reallocation, unless
GEOM topology changed significantly.
MFC after: 1 week
Sponsored by: iXsystems, Inc.
2019-06-18 21:05:10 +00:00
|
|
|
sbuf_set_drain(sb, sbuf_count_drain, &size);
|
2013-04-16 19:19:14 +00:00
|
|
|
sbuf_bcat(sb, &structsize, sizeof(structsize));
|
|
|
|
PROC_LOCK(p);
|
Detect badly behaved coredump note helpers
Coredump notes depend on being able to invoke dump routines twice; once
in a dry-run mode to get the size of the note, and another to actually
emit the note to the corefile.
When a note helper emits a different length section the second time
around than the length it requested the first time, the kernel produces
a corrupt coredump.
NT_PROCSTAT_FILES output length, when packing kinfo structs, is tied to
the length of filenames corresponding to vnodes in the process' fd table
via vn_fullpath. As vnodes may move around during dump, this is racy.
So:
- Detect badly behaved notes in putnote() and pad underfilled notes.
- Add a fail point, debug.fail_point.fill_kinfo_vnode__random_path to
exercise the NT_PROCSTAT_FILES corruption. It simply picks random
lengths to expand or truncate paths to in fo_fill_kinfo_vnode().
- Add a sysctl, kern.coredump_pack_fileinfo, to allow users to
disable kinfo packing for PROCSTAT_FILES notes. This should avoid
both FILES note corruption and truncation, even if filenames change,
at the cost of about 1 kiB in padding bloat per open fd. Document
the new sysctl in core.5.
- Fix note_procstat_files to self-limit in the 2nd pass. Since
sometimes this will result in a short write, pad up to our advertised
size. This addresses note corruption, at the risk of sometimes
truncating the last several fd info entries.
- Fix NT_PROCSTAT_FILES consumers libutil and libprocstat to grok the
zero padding.
With suggestions from: bjk, jhb, kib, wblock
Approved by: markj (mentor)
Relnotes: yes
Sponsored by: EMC / Isilon Storage Division
Differential Revision: https://reviews.freebsd.org/D3548
2015-09-03 20:32:10 +00:00
|
|
|
kern_proc_filedesc_out(p, sb, -1, filedesc_flags);
|
2013-04-16 19:19:14 +00:00
|
|
|
sbuf_finish(sb);
|
|
|
|
sbuf_delete(sb);
|
|
|
|
*sizep = size;
|
|
|
|
} else {
|
Detect badly behaved coredump note helpers
Coredump notes depend on being able to invoke dump routines twice; once
in a dry-run mode to get the size of the note, and another to actually
emit the note to the corefile.
When a note helper emits a different length section the second time
around than the length it requested the first time, the kernel produces
a corrupt coredump.
NT_PROCSTAT_FILES output length, when packing kinfo structs, is tied to
the length of filenames corresponding to vnodes in the process' fd table
via vn_fullpath. As vnodes may move around during dump, this is racy.
So:
- Detect badly behaved notes in putnote() and pad underfilled notes.
- Add a fail point, debug.fail_point.fill_kinfo_vnode__random_path to
exercise the NT_PROCSTAT_FILES corruption. It simply picks random
lengths to expand or truncate paths to in fo_fill_kinfo_vnode().
- Add a sysctl, kern.coredump_pack_fileinfo, to allow users to
disable kinfo packing for PROCSTAT_FILES notes. This should avoid
both FILES note corruption and truncation, even if filenames change,
at the cost of about 1 kiB in padding bloat per open fd. Document
the new sysctl in core.5.
- Fix note_procstat_files to self-limit in the 2nd pass. Since
sometimes this will result in a short write, pad up to our advertised
size. This addresses note corruption, at the risk of sometimes
truncating the last several fd info entries.
- Fix NT_PROCSTAT_FILES consumers libutil and libprocstat to grok the
zero padding.
With suggestions from: bjk, jhb, kib, wblock
Approved by: markj (mentor)
Relnotes: yes
Sponsored by: EMC / Isilon Storage Division
Differential Revision: https://reviews.freebsd.org/D3548
2015-09-03 20:32:10 +00:00
|
|
|
sbuf_start_section(sb, &start_len);
|
|
|
|
|
2013-04-16 19:19:14 +00:00
|
|
|
sbuf_bcat(sb, &structsize, sizeof(structsize));
|
|
|
|
PROC_LOCK(p);
|
Detect badly behaved coredump note helpers
Coredump notes depend on being able to invoke dump routines twice; once
in a dry-run mode to get the size of the note, and another to actually
emit the note to the corefile.
When a note helper emits a different length section the second time
around than the length it requested the first time, the kernel produces
a corrupt coredump.
NT_PROCSTAT_FILES output length, when packing kinfo structs, is tied to
the length of filenames corresponding to vnodes in the process' fd table
via vn_fullpath. As vnodes may move around during dump, this is racy.
So:
- Detect badly behaved notes in putnote() and pad underfilled notes.
- Add a fail point, debug.fail_point.fill_kinfo_vnode__random_path to
exercise the NT_PROCSTAT_FILES corruption. It simply picks random
lengths to expand or truncate paths to in fo_fill_kinfo_vnode().
- Add a sysctl, kern.coredump_pack_fileinfo, to allow users to
disable kinfo packing for PROCSTAT_FILES notes. This should avoid
both FILES note corruption and truncation, even if filenames change,
at the cost of about 1 kiB in padding bloat per open fd. Document
the new sysctl in core.5.
- Fix note_procstat_files to self-limit in the 2nd pass. Since
sometimes this will result in a short write, pad up to our advertised
size. This addresses note corruption, at the risk of sometimes
truncating the last several fd info entries.
- Fix NT_PROCSTAT_FILES consumers libutil and libprocstat to grok the
zero padding.
With suggestions from: bjk, jhb, kib, wblock
Approved by: markj (mentor)
Relnotes: yes
Sponsored by: EMC / Isilon Storage Division
Differential Revision: https://reviews.freebsd.org/D3548
2015-09-03 20:32:10 +00:00
|
|
|
kern_proc_filedesc_out(p, sb, *sizep - sizeof(structsize),
|
|
|
|
filedesc_flags);
|
|
|
|
|
|
|
|
sect_len = sbuf_end_section(sb, start_len, 0, 0);
|
|
|
|
if (sect_len < 0)
|
|
|
|
return;
|
|
|
|
sect_sz = sect_len;
|
|
|
|
|
|
|
|
KASSERT(sect_sz <= *sizep,
|
|
|
|
("kern_proc_filedesc_out did not respect maxlen; "
|
|
|
|
"requested %zu, got %zu", *sizep - sizeof(structsize),
|
|
|
|
sect_sz - sizeof(structsize)));
|
|
|
|
|
|
|
|
for (i = 0; i < *sizep - sect_sz && sb->s_error == 0; i++)
|
|
|
|
sbuf_putc(sb, 0);
|
2013-04-16 19:19:14 +00:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
 * Compile-time check: when KINFO_VMENTRY_SIZE is defined, struct
 * kinfo_vmentry must be exactly that many bytes.
 */
#ifdef KINFO_VMENTRY_SIZE
|
|
|
|
CTASSERT(sizeof(struct kinfo_vmentry) == KINFO_VMENTRY_SIZE);
|
|
|
|
#endif
|
|
|
|
|
|
|
|
static void
|
|
|
|
note_procstat_vmmap(void *arg, struct sbuf *sb, size_t *sizep)
|
|
|
|
{
|
|
|
|
struct proc *p;
|
|
|
|
size_t size;
|
Fix core corruption caused by race in note_procstat_vmmap
This fix is spiritually similar to r287442 and was discovered thanks to
the KASSERT added in that revision.
NT_PROCSTAT_VMMAP output length, when packing kinfo structs, is tied to
the length of filenames corresponding to vnodes in the process' vm map
via vn_fullpath. As vnodes may move during coredump, this is racy.
We do not remove the race, only prevent it from causing coredump
corruption.
- Add a sysctl, kern.coredump_pack_vmmapinfo, to allow users to disable
kinfo packing for PROCSTAT_VMMAP notes. This avoids VMMAP corruption
and truncation, even if names change, at the cost of up to PATH_MAX
bytes per mapped object. The new sysctl is documented in core.5.
- Fix note_procstat_vmmap to self-limit in the second pass. This
addresses corruption, at the cost of sometimes producing a truncated
result.
- Fix PROCSTAT_VMMAP consumers libutil (and libprocstat, via copy-paste)
to grok the new zero padding.
Reported by: pho (https://people.freebsd.org/~pho/stress/log/datamove4-2.txt)
Relnotes: yes
Sponsored by: EMC / Isilon Storage Division
Differential Revision: https://reviews.freebsd.org/D3824
2015-10-06 18:07:00 +00:00
|
|
|
int structsize, vmmap_flags;
|
|
|
|
|
|
|
|
if (coredump_pack_vmmapinfo)
|
|
|
|
vmmap_flags = KERN_VMMAP_PACK_KINFO;
|
|
|
|
else
|
|
|
|
vmmap_flags = 0;
|
2013-04-16 19:19:14 +00:00
|
|
|
|
|
|
|
p = (struct proc *)arg;
|
Fix core corruption caused by race in note_procstat_vmmap
This fix is spiritually similar to r287442 and was discovered thanks to
the KASSERT added in that revision.
NT_PROCSTAT_VMMAP output length, when packing kinfo structs, is tied to
the length of filenames corresponding to vnodes in the process' vm map
via vn_fullpath. As vnodes may move during coredump, this is racy.
We do not remove the race, only prevent it from causing coredump
corruption.
- Add a sysctl, kern.coredump_pack_vmmapinfo, to allow users to disable
kinfo packing for PROCSTAT_VMMAP notes. This avoids VMMAP corruption
and truncation, even if names change, at the cost of up to PATH_MAX
bytes per mapped object. The new sysctl is documented in core.5.
- Fix note_procstat_vmmap to self-limit in the second pass. This
addresses corruption, at the cost of sometimes producing a truncated
result.
- Fix PROCSTAT_VMMAP consumers libutil (and libprocstat, via copy-paste)
to grok the new zero padding.
Reported by: pho (https://people.freebsd.org/~pho/stress/log/datamove4-2.txt)
Relnotes: yes
Sponsored by: EMC / Isilon Storage Division
Differential Revision: https://reviews.freebsd.org/D3824
2015-10-06 18:07:00 +00:00
|
|
|
structsize = sizeof(struct kinfo_vmentry);
|
2013-04-16 19:19:14 +00:00
|
|
|
if (sb == NULL) {
|
|
|
|
size = 0;
|
|
|
|
sb = sbuf_new(NULL, NULL, 128, SBUF_FIXEDLEN);
|
Optimize kern.geom.conf* sysctls.
On large systems those sysctls may generate megabytes of output. Before
this change sbuf(9) code was resizing buffer by 4KB each time many times,
generating tons of TLB shootdowns. Unfortunately in this case existing
sbuf_new_for_sysctl() mechanism, supposed to help with this issue, is not
applicable, since all the sbuf writes are done in different kernel thread.
This change improves situation in two ways:
- on first sysctl call, not providing any output buffer, it sets special
sbuf drain function, just counting the data and so not needing big buffer;
- on second sysctl call it uses as initial buffer size value saved on
previous call, so that in most cases there will be no reallocation, unless
GEOM topology changed significantly.
MFC after: 1 week
Sponsored by: iXsystems, Inc.
2019-06-18 21:05:10 +00:00
|
|
|
sbuf_set_drain(sb, sbuf_count_drain, &size);
|
2013-04-16 19:19:14 +00:00
|
|
|
sbuf_bcat(sb, &structsize, sizeof(structsize));
|
|
|
|
PROC_LOCK(p);
|
Fix core corruption caused by race in note_procstat_vmmap
This fix is spiritually similar to r287442 and was discovered thanks to
the KASSERT added in that revision.
NT_PROCSTAT_VMMAP output length, when packing kinfo structs, is tied to
the length of filenames corresponding to vnodes in the process' vm map
via vn_fullpath. As vnodes may move during coredump, this is racy.
We do not remove the race, only prevent it from causing coredump
corruption.
- Add a sysctl, kern.coredump_pack_vmmapinfo, to allow users to disable
kinfo packing for PROCSTAT_VMMAP notes. This avoids VMMAP corruption
and truncation, even if names change, at the cost of up to PATH_MAX
bytes per mapped object. The new sysctl is documented in core.5.
- Fix note_procstat_vmmap to self-limit in the second pass. This
addresses corruption, at the cost of sometimes producing a truncated
result.
- Fix PROCSTAT_VMMAP consumers libutil (and libprocstat, via copy-paste)
to grok the new zero padding.
Reported by: pho (https://people.freebsd.org/~pho/stress/log/datamove4-2.txt)
Relnotes: yes
Sponsored by: EMC / Isilon Storage Division
Differential Revision: https://reviews.freebsd.org/D3824
2015-10-06 18:07:00 +00:00
|
|
|
kern_proc_vmmap_out(p, sb, -1, vmmap_flags);
|
2013-04-16 19:19:14 +00:00
|
|
|
sbuf_finish(sb);
|
|
|
|
sbuf_delete(sb);
|
|
|
|
*sizep = size;
|
|
|
|
} else {
|
|
|
|
sbuf_bcat(sb, &structsize, sizeof(structsize));
|
|
|
|
PROC_LOCK(p);
|
Fix core corruption caused by race in note_procstat_vmmap
This fix is spiritually similar to r287442 and was discovered thanks to
the KASSERT added in that revision.
NT_PROCSTAT_VMMAP output length, when packing kinfo structs, is tied to
the length of filenames corresponding to vnodes in the process' vm map
via vn_fullpath. As vnodes may move during coredump, this is racy.
We do not remove the race, only prevent it from causing coredump
corruption.
- Add a sysctl, kern.coredump_pack_vmmapinfo, to allow users to disable
kinfo packing for PROCSTAT_VMMAP notes. This avoids VMMAP corruption
and truncation, even if names change, at the cost of up to PATH_MAX
bytes per mapped object. The new sysctl is documented in core.5.
- Fix note_procstat_vmmap to self-limit in the second pass. This
addresses corruption, at the cost of sometimes producing a truncated
result.
- Fix PROCSTAT_VMMAP consumers libutil (and libprocstat, via copy-paste)
to grok the new zero padding.
Reported by: pho (https://people.freebsd.org/~pho/stress/log/datamove4-2.txt)
Relnotes: yes
Sponsored by: EMC / Isilon Storage Division
Differential Revision: https://reviews.freebsd.org/D3824
2015-10-06 18:07:00 +00:00
|
|
|
kern_proc_vmmap_out(p, sb, *sizep - sizeof(structsize),
|
|
|
|
vmmap_flags);
|
2013-04-16 19:19:14 +00:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
static void
|
|
|
|
note_procstat_groups(void *arg, struct sbuf *sb, size_t *sizep)
|
|
|
|
{
|
|
|
|
struct proc *p;
|
|
|
|
size_t size;
|
|
|
|
int structsize;
|
|
|
|
|
|
|
|
p = (struct proc *)arg;
|
|
|
|
size = sizeof(structsize) + p->p_ucred->cr_ngroups * sizeof(gid_t);
|
|
|
|
if (sb != NULL) {
|
|
|
|
KASSERT(*sizep == size, ("invalid size"));
|
|
|
|
structsize = sizeof(gid_t);
|
|
|
|
sbuf_bcat(sb, &structsize, sizeof(structsize));
|
|
|
|
sbuf_bcat(sb, p->p_ucred->cr_groups, p->p_ucred->cr_ngroups *
|
|
|
|
sizeof(gid_t));
|
|
|
|
}
|
|
|
|
*sizep = size;
|
|
|
|
}
|
|
|
|
|
|
|
|
static void
|
|
|
|
note_procstat_umask(void *arg, struct sbuf *sb, size_t *sizep)
|
|
|
|
{
|
|
|
|
struct proc *p;
|
|
|
|
size_t size;
|
|
|
|
int structsize;
|
|
|
|
|
|
|
|
p = (struct proc *)arg;
|
|
|
|
size = sizeof(structsize) + sizeof(p->p_fd->fd_cmask);
|
|
|
|
if (sb != NULL) {
|
|
|
|
KASSERT(*sizep == size, ("invalid size"));
|
|
|
|
structsize = sizeof(p->p_fd->fd_cmask);
|
|
|
|
sbuf_bcat(sb, &structsize, sizeof(structsize));
|
|
|
|
sbuf_bcat(sb, &p->p_fd->fd_cmask, sizeof(p->p_fd->fd_cmask));
|
|
|
|
}
|
|
|
|
*sizep = size;
|
|
|
|
}
|
|
|
|
|
|
|
|
static void
|
|
|
|
note_procstat_rlimit(void *arg, struct sbuf *sb, size_t *sizep)
|
|
|
|
{
|
|
|
|
struct proc *p;
|
|
|
|
struct rlimit rlim[RLIM_NLIMITS];
|
|
|
|
size_t size;
|
|
|
|
int structsize, i;
|
|
|
|
|
|
|
|
p = (struct proc *)arg;
|
|
|
|
size = sizeof(structsize) + sizeof(rlim);
|
|
|
|
if (sb != NULL) {
|
|
|
|
KASSERT(*sizep == size, ("invalid size"));
|
|
|
|
structsize = sizeof(rlim);
|
|
|
|
sbuf_bcat(sb, &structsize, sizeof(structsize));
|
|
|
|
PROC_LOCK(p);
|
|
|
|
for (i = 0; i < RLIM_NLIMITS; i++)
|
2015-06-10 10:48:12 +00:00
|
|
|
lim_rlimit_proc(p, i, &rlim[i]);
|
2013-04-16 19:19:14 +00:00
|
|
|
PROC_UNLOCK(p);
|
|
|
|
sbuf_bcat(sb, rlim, sizeof(rlim));
|
|
|
|
}
|
|
|
|
*sizep = size;
|
|
|
|
}
|
|
|
|
|
|
|
|
static void
|
|
|
|
note_procstat_osrel(void *arg, struct sbuf *sb, size_t *sizep)
|
|
|
|
{
|
|
|
|
struct proc *p;
|
|
|
|
size_t size;
|
|
|
|
int structsize;
|
|
|
|
|
|
|
|
p = (struct proc *)arg;
|
|
|
|
size = sizeof(structsize) + sizeof(p->p_osrel);
|
|
|
|
if (sb != NULL) {
|
|
|
|
KASSERT(*sizep == size, ("invalid size"));
|
|
|
|
structsize = sizeof(p->p_osrel);
|
|
|
|
sbuf_bcat(sb, &structsize, sizeof(structsize));
|
|
|
|
sbuf_bcat(sb, &p->p_osrel, sizeof(p->p_osrel));
|
|
|
|
}
|
|
|
|
*sizep = size;
|
|
|
|
}
|
|
|
|
|
|
|
|
static void
|
|
|
|
__elfN(note_procstat_psstrings)(void *arg, struct sbuf *sb, size_t *sizep)
|
|
|
|
{
|
|
|
|
struct proc *p;
|
|
|
|
elf_ps_strings_t ps_strings;
|
|
|
|
size_t size;
|
|
|
|
int structsize;
|
|
|
|
|
|
|
|
p = (struct proc *)arg;
|
|
|
|
size = sizeof(structsize) + sizeof(ps_strings);
|
|
|
|
if (sb != NULL) {
|
|
|
|
KASSERT(*sizep == size, ("invalid size"));
|
|
|
|
structsize = sizeof(ps_strings);
|
|
|
|
#if defined(COMPAT_FREEBSD32) && __ELF_WORD_SIZE == 32
|
|
|
|
ps_strings = PTROUT(p->p_sysent->sv_psstrings);
|
|
|
|
#else
|
|
|
|
ps_strings = p->p_sysent->sv_psstrings;
|
|
|
|
#endif
|
|
|
|
sbuf_bcat(sb, &structsize, sizeof(structsize));
|
|
|
|
sbuf_bcat(sb, &ps_strings, sizeof(ps_strings));
|
|
|
|
}
|
|
|
|
*sizep = size;
|
|
|
|
}
|
|
|
|
|
|
|
|
static void
|
|
|
|
__elfN(note_procstat_auxv)(void *arg, struct sbuf *sb, size_t *sizep)
|
|
|
|
{
|
|
|
|
struct proc *p;
|
|
|
|
size_t size;
|
|
|
|
int structsize;
|
|
|
|
|
|
|
|
p = (struct proc *)arg;
|
|
|
|
if (sb == NULL) {
|
|
|
|
size = 0;
|
|
|
|
sb = sbuf_new(NULL, NULL, 128, SBUF_FIXEDLEN);
|
Optimize kern.geom.conf* sysctls.
On large systems those sysctls may generate megabytes of output. Before
this change sbuf(9) code was resizing buffer by 4KB each time many times,
generating tons of TLB shootdowns. Unfortunately in this case existing
sbuf_new_for_sysctl() mechanism, supposed to help with this issue, is not
applicable, since all the sbuf writes are done in different kernel thread.
This change improves situation in two ways:
- on first sysctl call, not providing any output buffer, it sets special
sbuf drain function, just counting the data and so not needing big buffer;
- on second sysctl call it uses as initial buffer size value saved on
previous call, so that in most cases there will be no reallocation, unless
GEOM topology changed significantly.
MFC after: 1 week
Sponsored by: iXsystems, Inc.
2019-06-18 21:05:10 +00:00
|
|
|
sbuf_set_drain(sb, sbuf_count_drain, &size);
|
2013-04-16 19:19:14 +00:00
|
|
|
sbuf_bcat(sb, &structsize, sizeof(structsize));
|
|
|
|
PHOLD(p);
|
|
|
|
proc_getauxv(curthread, p, sb);
|
|
|
|
PRELE(p);
|
|
|
|
sbuf_finish(sb);
|
|
|
|
sbuf_delete(sb);
|
|
|
|
*sizep = size;
|
|
|
|
} else {
|
|
|
|
structsize = sizeof(Elf_Auxinfo);
|
|
|
|
sbuf_bcat(sb, &structsize, sizeof(structsize));
|
|
|
|
PHOLD(p);
|
|
|
|
proc_getauxv(curthread, p, sb);
|
|
|
|
PRELE(p);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2009-03-13 16:40:51 +00:00
|
|
|
static boolean_t
|
2018-11-23 23:29:14 +00:00
|
|
|
__elfN(parse_notes)(struct image_params *imgp, Elf_Note *checknote,
|
|
|
|
const char *note_vendor, const Elf_Phdr *pnote,
|
|
|
|
boolean_t (*cb)(const Elf_Note *, void *, boolean_t *), void *cb_arg)
|
2009-03-13 16:40:51 +00:00
|
|
|
{
|
2009-03-22 13:42:41 +00:00
|
|
|
const Elf_Note *note, *note0, *note_end;
|
2009-03-13 16:40:51 +00:00
|
|
|
const char *note_name;
|
2015-10-14 18:27:35 +00:00
|
|
|
char *buf;
|
|
|
|
int i, error;
|
|
|
|
boolean_t res;
|
2009-03-13 16:40:51 +00:00
|
|
|
|
2015-10-14 18:27:35 +00:00
|
|
|
/* We need some limit, might as well use PAGE_SIZE. */
|
|
|
|
if (pnote == NULL || pnote->p_filesz > PAGE_SIZE)
|
2009-03-13 16:40:51 +00:00
|
|
|
return (FALSE);
|
2015-10-14 18:27:35 +00:00
|
|
|
ASSERT_VOP_LOCKED(imgp->vp, "parse_notes");
|
|
|
|
if (pnote->p_offset > PAGE_SIZE ||
|
|
|
|
pnote->p_filesz > PAGE_SIZE - pnote->p_offset) {
|
2019-05-05 11:04:01 +00:00
|
|
|
buf = malloc(pnote->p_filesz, M_TEMP, M_NOWAIT);
|
|
|
|
if (buf == NULL) {
|
|
|
|
VOP_UNLOCK(imgp->vp, 0);
|
|
|
|
buf = malloc(pnote->p_filesz, M_TEMP, M_WAITOK);
|
Switch to use shared vnode locks for text files during image activation.
kern_execve() locks text vnode exclusive to be able to set and clear
VV_TEXT flag. VV_TEXT is mutually exclusive with the v_writecount > 0
condition.
The change removes VV_TEXT, replacing it with the condition
v_writecount <= -1, and puts v_writecount under the vnode interlock.
Each text reference decrements v_writecount. To clear the text
reference when the segment is unmapped, it is recorded in the
vm_map_entry backed by the text file as MAP_ENTRY_VN_TEXT flag, and
v_writecount is incremented on the map entry removal
The operations like VOP_ADD_WRITECOUNT() and VOP_SET_TEXT() check that
v_writecount does not contradict the desired change. vn_writecheck()
is now racy and its use was eliminated everywhere except access.
Atomic check for writeability and increment of v_writecount is
performed by the VOP. vn_truncate() now increments v_writecount
around VOP_SETATTR() call, lack of which is arguably a bug on its own.
nullfs bypasses v_writecount to the lower vnode always, so nullfs
vnode has its own v_writecount correct, and lower vnode gets all
references, since object->handle is always lower vnode.
On the text vnode' vm object dealloc, the v_writecount value is reset
to zero, and deadfs vop_unset_text short-circuit the operation.
Reclamation of lowervp always reclaims all nullfs vnodes referencing
lowervp first, so no stray references are left.
Reviewed by: markj, trasz
Tested by: mjg, pho
Sponsored by: The FreeBSD Foundation
MFC after: 1 month
Differential revision: https://reviews.freebsd.org/D19923
2019-05-05 11:20:43 +00:00
|
|
|
vn_lock(imgp->vp, LK_SHARED | LK_RETRY);
|
2019-05-05 11:04:01 +00:00
|
|
|
}
|
2015-10-14 18:27:35 +00:00
|
|
|
error = vn_rdwr(UIO_READ, imgp->vp, buf, pnote->p_filesz,
|
|
|
|
pnote->p_offset, UIO_SYSSPACE, IO_NODELOCKED,
|
|
|
|
curthread->td_ucred, NOCRED, NULL, curthread);
|
|
|
|
if (error != 0) {
|
|
|
|
uprintf("i/o error PT_NOTE\n");
|
2018-11-23 23:16:01 +00:00
|
|
|
goto retf;
|
2015-10-14 18:27:35 +00:00
|
|
|
}
|
|
|
|
note = note0 = (const Elf_Note *)buf;
|
|
|
|
note_end = (const Elf_Note *)(buf + pnote->p_filesz);
|
|
|
|
} else {
|
|
|
|
note = note0 = (const Elf_Note *)(imgp->image_header +
|
|
|
|
pnote->p_offset);
|
|
|
|
note_end = (const Elf_Note *)(imgp->image_header +
|
|
|
|
pnote->p_offset + pnote->p_filesz);
|
|
|
|
buf = NULL;
|
|
|
|
}
|
2009-03-22 13:42:41 +00:00
|
|
|
for (i = 0; i < 100 && note >= note0 && note < note_end; i++) {
|
2012-07-19 11:15:53 +00:00
|
|
|
if (!aligned(note, Elf32_Addr) || (const char *)note_end -
|
2015-10-14 18:27:35 +00:00
|
|
|
(const char *)note < sizeof(Elf_Note)) {
|
2018-11-23 23:16:01 +00:00
|
|
|
goto retf;
|
2015-10-14 18:27:35 +00:00
|
|
|
}
|
2018-11-23 23:29:14 +00:00
|
|
|
if (note->n_namesz != checknote->n_namesz ||
|
|
|
|
note->n_descsz != checknote->n_descsz ||
|
|
|
|
note->n_type != checknote->n_type)
|
2009-03-13 16:40:51 +00:00
|
|
|
goto nextnote;
|
|
|
|
note_name = (const char *)(note + 1);
|
2018-11-23 23:29:14 +00:00
|
|
|
if (note_name + checknote->n_namesz >=
|
|
|
|
(const char *)note_end || strncmp(note_vendor,
|
|
|
|
note_name, checknote->n_namesz) != 0)
|
2009-03-13 16:40:51 +00:00
|
|
|
goto nextnote;
|
|
|
|
|
2018-11-23 23:29:14 +00:00
|
|
|
if (cb(note, cb_arg, &res))
|
2015-10-14 18:27:35 +00:00
|
|
|
goto ret;
|
2009-03-13 16:40:51 +00:00
|
|
|
nextnote:
|
|
|
|
note = (const Elf_Note *)((const char *)(note + 1) +
|
2013-05-01 14:59:16 +00:00
|
|
|
roundup2(note->n_namesz, ELF_NOTE_ROUNDSIZE) +
|
|
|
|
roundup2(note->n_descsz, ELF_NOTE_ROUNDSIZE));
|
2009-03-13 16:40:51 +00:00
|
|
|
}
|
2018-11-23 23:16:01 +00:00
|
|
|
retf:
|
2015-10-14 18:27:35 +00:00
|
|
|
res = FALSE;
|
|
|
|
ret:
|
|
|
|
free(buf, M_TEMP);
|
|
|
|
return (res);
|
2009-03-13 16:40:51 +00:00
|
|
|
}
|
|
|
|
|
2018-11-23 23:29:14 +00:00
|
|
|
/*
 * Argument bundle for brandnote_cb(): the brand note description being
 * matched and the location that receives the osrel value.
 */
struct brandnote_cb_arg {
|
|
|
|
Elf_Brandnote *brandnote;
|
|
|
|
int32_t *osrel;
|
|
|
|
};
|
|
|
|
|
|
|
|
static boolean_t
|
|
|
|
brandnote_cb(const Elf_Note *note, void *arg0, boolean_t *res)
|
|
|
|
{
|
|
|
|
struct brandnote_cb_arg *arg;
|
|
|
|
|
|
|
|
arg = arg0;
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Fetch the osreldate for binary from the ELF OSABI-note if
|
|
|
|
* necessary.
|
|
|
|
*/
|
|
|
|
*res = (arg->brandnote->flags & BN_TRANSLATE_OSREL) != 0 &&
|
|
|
|
arg->brandnote->trans_osrel != NULL ?
|
|
|
|
arg->brandnote->trans_osrel(note, arg->osrel) : TRUE;
|
|
|
|
|
|
|
|
return (TRUE);
|
|
|
|
}
|
|
|
|
|
2018-11-23 23:33:55 +00:00
|
|
|
/*
 * Note header template for the feature control note: name sized like
 * FREEBSD_ABI_VENDOR (including its NUL), a uint32_t descriptor, and
 * type NT_FREEBSD_FEATURE_CTL.
 */
static Elf_Note fctl_note = {
|
|
|
|
.n_namesz = sizeof(FREEBSD_ABI_VENDOR),
|
|
|
|
.n_descsz = sizeof(uint32_t),
|
|
|
|
.n_type = NT_FREEBSD_FEATURE_CTL,
|
|
|
|
};
|
|
|
|
|
|
|
|
/*
 * Argument for note_fctl_cb(): where to store the first word of the
 * feature control note descriptor.
 */
struct fctl_cb_arg {
|
|
|
|
uint32_t *fctl0;
|
|
|
|
};
|
|
|
|
|
|
|
|
static boolean_t
|
|
|
|
note_fctl_cb(const Elf_Note *note, void *arg0, boolean_t *res)
|
|
|
|
{
|
|
|
|
struct fctl_cb_arg *arg;
|
|
|
|
const Elf32_Word *desc;
|
|
|
|
uintptr_t p;
|
|
|
|
|
|
|
|
arg = arg0;
|
|
|
|
p = (uintptr_t)(note + 1);
|
|
|
|
p += roundup2(note->n_namesz, ELF_NOTE_ROUNDSIZE);
|
|
|
|
desc = (const Elf32_Word *)p;
|
|
|
|
*arg->fctl0 = desc[0];
|
|
|
|
return (TRUE);
|
|
|
|
}
|
|
|
|
|
2012-03-11 19:38:49 +00:00
|
|
|
/*
|
2018-11-23 23:33:55 +00:00
|
|
|
* Try to find the appropriate ABI-note section for checknote, fetch
|
|
|
|
* the osreldate and feature control flags for binary from the ELF
|
|
|
|
* OSABI-note. Only the first page of the image is searched, the same
|
|
|
|
* as for headers.
|
2012-03-11 19:38:49 +00:00
|
|
|
*/
|
|
|
|
static boolean_t
|
2018-11-23 23:29:14 +00:00
|
|
|
__elfN(check_note)(struct image_params *imgp, Elf_Brandnote *brandnote,
|
2018-11-23 23:33:55 +00:00
|
|
|
int32_t *osrel, uint32_t *fctl0)
|
2012-03-11 19:38:49 +00:00
|
|
|
{
|
|
|
|
const Elf_Phdr *phdr;
|
|
|
|
const Elf_Ehdr *hdr;
|
2018-11-23 23:29:14 +00:00
|
|
|
struct brandnote_cb_arg b_arg;
|
2018-11-23 23:33:55 +00:00
|
|
|
struct fctl_cb_arg f_arg;
|
|
|
|
int i, j;
|
2012-03-11 19:38:49 +00:00
|
|
|
|
|
|
|
hdr = (const Elf_Ehdr *)imgp->image_header;
|
|
|
|
phdr = (const Elf_Phdr *)(imgp->image_header + hdr->e_phoff);
|
2018-11-23 23:29:14 +00:00
|
|
|
b_arg.brandnote = brandnote;
|
|
|
|
b_arg.osrel = osrel;
|
2018-11-23 23:33:55 +00:00
|
|
|
f_arg.fctl0 = fctl0;
|
2012-03-11 19:38:49 +00:00
|
|
|
|
|
|
|
for (i = 0; i < hdr->e_phnum; i++) {
|
2018-11-23 23:29:14 +00:00
|
|
|
if (phdr[i].p_type == PT_NOTE && __elfN(parse_notes)(imgp,
|
|
|
|
&brandnote->hdr, brandnote->vendor, &phdr[i], brandnote_cb,
|
|
|
|
&b_arg)) {
|
2018-11-23 23:33:55 +00:00
|
|
|
for (j = 0; j < hdr->e_phnum; j++) {
|
|
|
|
if (phdr[j].p_type == PT_NOTE &&
|
|
|
|
__elfN(parse_notes)(imgp, &fctl_note,
|
|
|
|
FREEBSD_ABI_VENDOR, &phdr[j],
|
|
|
|
note_fctl_cb, &f_arg))
|
|
|
|
break;
|
|
|
|
}
|
2012-03-11 19:38:49 +00:00
|
|
|
return (TRUE);
|
2018-11-23 23:29:14 +00:00
|
|
|
}
|
2012-03-11 19:38:49 +00:00
|
|
|
}
|
|
|
|
return (FALSE);
|
|
|
|
|
|
|
|
}
|
|
|
|
|
1996-03-10 08:42:54 +00:00
|
|
|
/*
 * Exec switch entry for this ELF word size: ex_imgact names the image
 * activator and ex_name the "ELF<word size>" string.  EXEC_SET() tells
 * kern_execve.c about it, with a little help from the linker.
 */
|
2003-01-04 22:07:48 +00:00
|
|
|
static struct execsw __elfN(execsw) = {
|
2018-03-13 13:09:10 +00:00
|
|
|
.ex_imgact = __CONCAT(exec_, __elfN(imgact)),
|
|
|
|
.ex_name = __XSTRING(__CONCAT(ELF, __ELF_WORD_SIZE))
|
2003-01-04 22:07:48 +00:00
|
|
|
};
|
|
|
|
EXEC_SET(__CONCAT(elf, __ELF_WORD_SIZE), __elfN(execsw));
|
2010-03-02 06:58:58 +00:00
|
|
|
|
2011-01-08 16:02:14 +00:00
|
|
|
static vm_prot_t
|
|
|
|
__elfN(trans_prot)(Elf_Word flags)
|
|
|
|
{
|
|
|
|
vm_prot_t prot;
|
|
|
|
|
|
|
|
prot = 0;
|
|
|
|
if (flags & PF_X)
|
|
|
|
prot |= VM_PROT_EXECUTE;
|
|
|
|
if (flags & PF_W)
|
|
|
|
prot |= VM_PROT_WRITE;
|
|
|
|
if (flags & PF_R)
|
|
|
|
prot |= VM_PROT_READ;
|
2019-02-07 02:17:34 +00:00
|
|
|
#if __ELF_WORD_SIZE == 32 && (defined(__amd64__) || defined(__i386__))
|
2011-10-15 12:35:18 +00:00
|
|
|
if (i386_read_exec && (flags & PF_R))
|
2011-10-13 16:16:46 +00:00
|
|
|
prot |= VM_PROT_EXECUTE;
|
|
|
|
#endif
|
2011-01-08 16:02:14 +00:00
|
|
|
return (prot);
|
|
|
|
}
|
|
|
|
|
|
|
|
static Elf_Word
|
|
|
|
__elfN(untrans_prot)(vm_prot_t prot)
|
|
|
|
{
|
|
|
|
Elf_Word flags;
|
|
|
|
|
|
|
|
flags = 0;
|
|
|
|
if (prot & VM_PROT_EXECUTE)
|
|
|
|
flags |= PF_X;
|
|
|
|
if (prot & VM_PROT_READ)
|
|
|
|
flags |= PF_R;
|
|
|
|
if (prot & VM_PROT_WRITE)
|
|
|
|
flags |= PF_W;
|
|
|
|
return (flags);
|
|
|
|
}
|
2019-07-31 20:23:10 +00:00
|
|
|
|
|
|
|
void
|
|
|
|
__elfN(stackgap)(struct image_params *imgp, u_long *stack_base)
|
|
|
|
{
|
|
|
|
u_long range, rbase, gap;
|
|
|
|
int pct;
|
|
|
|
|
|
|
|
if ((imgp->map_flags & MAP_ASLR) == 0)
|
|
|
|
return;
|
|
|
|
pct = __elfN(aslr_stack_gap);
|
|
|
|
if (pct == 0)
|
|
|
|
return;
|
|
|
|
if (pct > 50)
|
|
|
|
pct = 50;
|
|
|
|
range = imgp->eff_stack_sz * pct / 100;
|
|
|
|
arc4rand(&rbase, sizeof(rbase), 0);
|
|
|
|
gap = rbase % range;
|
|
|
|
gap &= ~(sizeof(u_long) - 1);
|
|
|
|
*stack_base -= gap;
|
|
|
|
}
|