54a3a11421
Historically we have not distinguished between kernel wirings and user wirings for accounting purposes. User wirings (via mlock(2)) were subject to a global limit on the number of wired pages, so if large swaths of physical memory were wired by the kernel, as happens with the ZFS ARC among other things, the limit could be exceeded, causing user wirings to fail. The change adds a new counter, v_user_wire_count, which counts the number of virtual pages wired by user processes via mlock(2) and mlockall(2). Only user-wired pages are subject to the system-wide limit, which helps provide some safety against deadlocks. In particular, while sources of kernel wirings typically support some backpressure mechanism, there is no way to reclaim user-wired pages short of killing the wiring process. The limit is exported as vm.max_user_wired, renamed from vm.max_wired, and changed from u_int to u_long. The choice to count virtual user-wired pages rather than physical pages was made for simplicity. There are mechanisms that can cause user-wired mappings to be destroyed while maintaining a wiring of the backing physical page; these make it difficult to accurately track user wirings at the physical page layer. The change also closes some holes which allowed user wirings to succeed even when they would cause the system limit to be exceeded. For instance, mmap() may now fail with ENOMEM in a process that has called mlockall(MCL_FUTURE) if the new mapping would cause the user wiring limit to be exceeded. Note that bhyve -S is subject to the user wiring limit, which defaults to 1/3 of physical RAM. Users that wish to exceed the limit must tune vm.max_user_wired. Reviewed by: kib, ngie (mlock() test changes) Tested by: pho (earlier version) MFC after: 45 days Sponsored by: Netflix Differential Revision: https://reviews.freebsd.org/D19908
250 lines
6.7 KiB
C
250 lines
6.7 KiB
C
/*-
|
|
* SPDX-License-Identifier: BSD-3-Clause
|
|
*
|
|
* Copyright (c) 1988 University of Utah.
|
|
* Copyright (c) 1991, 1993
|
|
* The Regents of the University of California. All rights reserved.
|
|
*
|
|
* This code is derived from software contributed to Berkeley by
|
|
* the Systems Programming Group of the University of Utah Computer
|
|
* Science Department.
|
|
*
|
|
* Redistribution and use in source and binary forms, with or without
|
|
* modification, are permitted provided that the following conditions
|
|
* are met:
|
|
* 1. Redistributions of source code must retain the above copyright
|
|
* notice, this list of conditions and the following disclaimer.
|
|
* 2. Redistributions in binary form must reproduce the above copyright
|
|
* notice, this list of conditions and the following disclaimer in the
|
|
* documentation and/or other materials provided with the distribution.
|
|
* 3. Neither the name of the University nor the names of its contributors
|
|
* may be used to endorse or promote products derived from this software
|
|
* without specific prior written permission.
|
|
*
|
|
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
|
|
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
|
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
|
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
|
|
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
|
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
|
|
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
|
|
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
|
|
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
|
|
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
|
|
* SUCH DAMAGE.
|
|
*
|
|
* from: Utah $Hdr: vm_unix.c 1.1 89/11/07$
|
|
*
|
|
* @(#)vm_unix.c 8.1 (Berkeley) 6/11/93
|
|
*/
|
|
|
|
/*
|
|
* Traditional sbrk/grow interface to VM
|
|
*/
|
|
|
|
#include <sys/cdefs.h>
|
|
__FBSDID("$FreeBSD$");
|
|
|
|
#include <sys/param.h>
|
|
#include <sys/lock.h>
|
|
#include <sys/mutex.h>
|
|
#include <sys/proc.h>
|
|
#include <sys/racct.h>
|
|
#include <sys/resourcevar.h>
|
|
#include <sys/syscallsubr.h>
|
|
#include <sys/sysent.h>
|
|
#include <sys/sysproto.h>
|
|
#include <sys/systm.h>
|
|
#if defined(__amd64__) || defined(__i386__) /* for i386_read_exec */
|
|
#include <machine/md_var.h>
|
|
#endif
|
|
|
|
#include <vm/vm.h>
|
|
#include <vm/vm_param.h>
|
|
#include <vm/pmap.h>
|
|
#include <vm/vm_map.h>
|
|
|
|
#ifndef _SYS_SYSPROTO_H_
/*
 * Argument structure for the break system call.  This fallback
 * declaration is only compiled when _SYS_SYSPROTO_H_ has not already
 * been defined by an earlier include.
 */
struct break_args {
	char	*nsize;		/* requested new break address */
};
#endif
|
|
int
|
|
sys_break(struct thread *td, struct break_args *uap)
|
|
{
|
|
#if !defined(__aarch64__) && !defined(__riscv)
|
|
uintptr_t addr;
|
|
int error;
|
|
|
|
addr = (uintptr_t)uap->nsize;
|
|
error = kern_break(td, &addr);
|
|
if (error == 0)
|
|
td->td_retval[0] = addr;
|
|
return (error);
|
|
#else /* defined(__aarch64__) || defined(__riscv) */
|
|
return (ENOSYS);
|
|
#endif /* defined(__aarch64__) || defined(__riscv) */
|
|
}
|
|
|
|
int
|
|
kern_break(struct thread *td, uintptr_t *addr)
|
|
{
|
|
struct vmspace *vm = td->td_proc->p_vmspace;
|
|
vm_map_t map = &vm->vm_map;
|
|
vm_offset_t new, old, base;
|
|
rlim_t datalim, lmemlim, vmemlim;
|
|
int prot, rv;
|
|
int error = 0;
|
|
|
|
datalim = lim_cur(td, RLIMIT_DATA);
|
|
lmemlim = lim_cur(td, RLIMIT_MEMLOCK);
|
|
vmemlim = lim_cur(td, RLIMIT_VMEM);
|
|
|
|
new = round_page(*addr);
|
|
vm_map_lock(map);
|
|
|
|
base = round_page((vm_offset_t) vm->vm_daddr);
|
|
old = base + ctob(vm->vm_dsize);
|
|
if (new > base) {
|
|
/*
|
|
* Check the resource limit, but allow a process to reduce
|
|
* its usage, even if it remains over the limit.
|
|
*/
|
|
if (new - base > datalim && new > old) {
|
|
error = ENOMEM;
|
|
goto done;
|
|
}
|
|
if (new > vm_map_max(map)) {
|
|
error = ENOMEM;
|
|
goto done;
|
|
}
|
|
} else if (new < base) {
|
|
/*
|
|
* Simply return the current break address without
|
|
* modifying any state. This is an ad-hoc interface
|
|
* used by libc to determine the initial break address,
|
|
* avoiding a dependency on magic features in the system
|
|
* linker.
|
|
*/
|
|
new = old;
|
|
goto done;
|
|
}
|
|
|
|
if (new > old) {
|
|
if (!old_mlock && map->flags & MAP_WIREFUTURE) {
|
|
if (ptoa(pmap_wired_count(map->pmap)) +
|
|
(new - old) > lmemlim) {
|
|
error = ENOMEM;
|
|
goto done;
|
|
}
|
|
}
|
|
if (map->size + (new - old) > vmemlim) {
|
|
error = ENOMEM;
|
|
goto done;
|
|
}
|
|
#ifdef RACCT
|
|
if (racct_enable) {
|
|
PROC_LOCK(td->td_proc);
|
|
error = racct_set(td->td_proc, RACCT_DATA, new - base);
|
|
if (error != 0) {
|
|
PROC_UNLOCK(td->td_proc);
|
|
error = ENOMEM;
|
|
goto done;
|
|
}
|
|
error = racct_set(td->td_proc, RACCT_VMEM,
|
|
map->size + (new - old));
|
|
if (error != 0) {
|
|
racct_set_force(td->td_proc, RACCT_DATA,
|
|
old - base);
|
|
PROC_UNLOCK(td->td_proc);
|
|
error = ENOMEM;
|
|
goto done;
|
|
}
|
|
if (!old_mlock && map->flags & MAP_WIREFUTURE) {
|
|
error = racct_set(td->td_proc, RACCT_MEMLOCK,
|
|
ptoa(pmap_wired_count(map->pmap)) +
|
|
(new - old));
|
|
if (error != 0) {
|
|
racct_set_force(td->td_proc, RACCT_DATA,
|
|
old - base);
|
|
racct_set_force(td->td_proc, RACCT_VMEM,
|
|
map->size);
|
|
PROC_UNLOCK(td->td_proc);
|
|
error = ENOMEM;
|
|
goto done;
|
|
}
|
|
}
|
|
PROC_UNLOCK(td->td_proc);
|
|
}
|
|
#endif
|
|
prot = VM_PROT_RW;
|
|
#if (defined(COMPAT_FREEBSD32) && defined(__amd64__)) || defined(__i386__)
|
|
if (i386_read_exec && SV_PROC_FLAG(td->td_proc, SV_ILP32))
|
|
prot |= VM_PROT_EXECUTE;
|
|
#endif
|
|
rv = vm_map_insert(map, NULL, 0, old, new, prot, VM_PROT_ALL,
|
|
0);
|
|
if (rv == KERN_SUCCESS && (map->flags & MAP_WIREFUTURE) != 0) {
|
|
rv = vm_map_wire_locked(map, old, new,
|
|
VM_MAP_WIRE_USER | VM_MAP_WIRE_NOHOLES);
|
|
if (rv != KERN_SUCCESS)
|
|
vm_map_delete(map, old, new);
|
|
}
|
|
if (rv != KERN_SUCCESS) {
|
|
#ifdef RACCT
|
|
if (racct_enable) {
|
|
PROC_LOCK(td->td_proc);
|
|
racct_set_force(td->td_proc,
|
|
RACCT_DATA, old - base);
|
|
racct_set_force(td->td_proc,
|
|
RACCT_VMEM, map->size);
|
|
if (!old_mlock && map->flags & MAP_WIREFUTURE) {
|
|
racct_set_force(td->td_proc,
|
|
RACCT_MEMLOCK,
|
|
ptoa(pmap_wired_count(map->pmap)));
|
|
}
|
|
PROC_UNLOCK(td->td_proc);
|
|
}
|
|
#endif
|
|
error = ENOMEM;
|
|
goto done;
|
|
}
|
|
vm->vm_dsize += btoc(new - old);
|
|
} else if (new < old) {
|
|
rv = vm_map_delete(map, new, old);
|
|
if (rv != KERN_SUCCESS) {
|
|
error = ENOMEM;
|
|
goto done;
|
|
}
|
|
vm->vm_dsize -= btoc(old - new);
|
|
#ifdef RACCT
|
|
if (racct_enable) {
|
|
PROC_LOCK(td->td_proc);
|
|
racct_set_force(td->td_proc, RACCT_DATA, new - base);
|
|
racct_set_force(td->td_proc, RACCT_VMEM, map->size);
|
|
if (!old_mlock && map->flags & MAP_WIREFUTURE) {
|
|
racct_set_force(td->td_proc, RACCT_MEMLOCK,
|
|
ptoa(pmap_wired_count(map->pmap)));
|
|
}
|
|
PROC_UNLOCK(td->td_proc);
|
|
}
|
|
#endif
|
|
}
|
|
done:
|
|
vm_map_unlock(map);
|
|
|
|
if (error == 0)
|
|
*addr = new;
|
|
|
|
return (error);
|
|
}
|
|
|
|
#ifdef COMPAT_FREEBSD11
/*
 * Compatibility stub, built only when COMPAT_FREEBSD11 is defined.
 * Both arguments are ignored and the call always fails with EINVAL.
 */
int
freebsd11_vadvise(struct thread *td, struct freebsd11_vadvise_args *uap)
{

	return (EINVAL);
}
#endif /* COMPAT_FREEBSD11 */
|