freebsd-skq/sys/vm/vm_unix.c
markj 094736f08f Provide separate accounting for user-wired pages.
Historically we have not distinguished between kernel wirings and user
wirings for accounting purposes.  User wirings (via mlock(2)) were
subject to a global limit on the number of wired pages, so if large
swaths of physical memory were wired by the kernel, as happens with
the ZFS ARC among other things, the limit could be exceeded, causing
user wirings to fail.

The change adds a new counter, v_user_wire_count, which counts the
number of virtual pages wired by user processes via mlock(2) and
mlockall(2).  Only user-wired pages are subject to the system-wide
limit which helps provide some safety against deadlocks.  In
particular, while sources of kernel wirings typically support some
backpressure mechanism, there is no way to reclaim user-wired pages
shorting of killing the wiring process.  The limit is exported as
vm.max_user_wired, renamed from vm.max_wired, and changed from u_int
to u_long.

The choice to count virtual user-wired pages rather than physical
pages was done for simplicity.  There are mechanisms that can cause
user-wired mappings to be destroyed while maintaining a wiring of
the backing physical page; these make it difficult to accurately
track user wirings at the physical page layer.

The change also closes some holes which allowed user wirings to succeed
even when they would cause the system limit to be exceeded.  For
instance, mmap() may now fail with ENOMEM in a process that has called
mlockall(MCL_FUTURE) if the new mapping would cause the user wiring
limit to be exceeded.

Note that bhyve -S is subject to the user wiring limit, which defaults
to 1/3 of physical RAM.  Users that wish to exceed the limit must tune
vm.max_user_wired.

Reviewed by:	kib, ngie (mlock() test changes)
Tested by:	pho (earlier version)
MFC after:	45 days
Sponsored by:	Netflix
Differential Revision:	https://reviews.freebsd.org/D19908
2019-05-13 16:38:48 +00:00

250 lines
6.7 KiB
C

/*-
* SPDX-License-Identifier: BSD-3-Clause
*
* Copyright (c) 1988 University of Utah.
* Copyright (c) 1991, 1993
* The Regents of the University of California. All rights reserved.
*
* This code is derived from software contributed to Berkeley by
* the Systems Programming Group of the University of Utah Computer
* Science Department.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* from: Utah $Hdr: vm_unix.c 1.1 89/11/07$
*
* @(#)vm_unix.c 8.1 (Berkeley) 6/11/93
*/
/*
* Traditional sbrk/grow interface to VM
*/
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
#include <sys/param.h>
#include <sys/lock.h>
#include <sys/mutex.h>
#include <sys/proc.h>
#include <sys/racct.h>
#include <sys/resourcevar.h>
#include <sys/syscallsubr.h>
#include <sys/sysent.h>
#include <sys/sysproto.h>
#include <sys/systm.h>
#if defined(__amd64__) || defined(__i386__) /* for i386_read_exec */
#include <machine/md_var.h>
#endif
#include <vm/vm.h>
#include <vm/vm_param.h>
#include <vm/pmap.h>
#include <vm/vm_map.h>
#ifndef _SYS_SYSPROTO_H_
struct break_args {
char *nsize;
};
#endif
int
sys_break(struct thread *td, struct break_args *uap)
{
#if !defined(__aarch64__) && !defined(__riscv)
uintptr_t addr;
int error;
addr = (uintptr_t)uap->nsize;
error = kern_break(td, &addr);
if (error == 0)
td->td_retval[0] = addr;
return (error);
#else /* defined(__aarch64__) || defined(__riscv) */
return (ENOSYS);
#endif /* defined(__aarch64__) || defined(__riscv) */
}
int
kern_break(struct thread *td, uintptr_t *addr)
{
struct vmspace *vm = td->td_proc->p_vmspace;
vm_map_t map = &vm->vm_map;
vm_offset_t new, old, base;
rlim_t datalim, lmemlim, vmemlim;
int prot, rv;
int error = 0;
datalim = lim_cur(td, RLIMIT_DATA);
lmemlim = lim_cur(td, RLIMIT_MEMLOCK);
vmemlim = lim_cur(td, RLIMIT_VMEM);
new = round_page(*addr);
vm_map_lock(map);
base = round_page((vm_offset_t) vm->vm_daddr);
old = base + ctob(vm->vm_dsize);
if (new > base) {
/*
* Check the resource limit, but allow a process to reduce
* its usage, even if it remains over the limit.
*/
if (new - base > datalim && new > old) {
error = ENOMEM;
goto done;
}
if (new > vm_map_max(map)) {
error = ENOMEM;
goto done;
}
} else if (new < base) {
/*
* Simply return the current break address without
* modifying any state. This is an ad-hoc interface
* used by libc to determine the initial break address,
* avoiding a dependency on magic features in the system
* linker.
*/
new = old;
goto done;
}
if (new > old) {
if (!old_mlock && map->flags & MAP_WIREFUTURE) {
if (ptoa(pmap_wired_count(map->pmap)) +
(new - old) > lmemlim) {
error = ENOMEM;
goto done;
}
}
if (map->size + (new - old) > vmemlim) {
error = ENOMEM;
goto done;
}
#ifdef RACCT
if (racct_enable) {
PROC_LOCK(td->td_proc);
error = racct_set(td->td_proc, RACCT_DATA, new - base);
if (error != 0) {
PROC_UNLOCK(td->td_proc);
error = ENOMEM;
goto done;
}
error = racct_set(td->td_proc, RACCT_VMEM,
map->size + (new - old));
if (error != 0) {
racct_set_force(td->td_proc, RACCT_DATA,
old - base);
PROC_UNLOCK(td->td_proc);
error = ENOMEM;
goto done;
}
if (!old_mlock && map->flags & MAP_WIREFUTURE) {
error = racct_set(td->td_proc, RACCT_MEMLOCK,
ptoa(pmap_wired_count(map->pmap)) +
(new - old));
if (error != 0) {
racct_set_force(td->td_proc, RACCT_DATA,
old - base);
racct_set_force(td->td_proc, RACCT_VMEM,
map->size);
PROC_UNLOCK(td->td_proc);
error = ENOMEM;
goto done;
}
}
PROC_UNLOCK(td->td_proc);
}
#endif
prot = VM_PROT_RW;
#if (defined(COMPAT_FREEBSD32) && defined(__amd64__)) || defined(__i386__)
if (i386_read_exec && SV_PROC_FLAG(td->td_proc, SV_ILP32))
prot |= VM_PROT_EXECUTE;
#endif
rv = vm_map_insert(map, NULL, 0, old, new, prot, VM_PROT_ALL,
0);
if (rv == KERN_SUCCESS && (map->flags & MAP_WIREFUTURE) != 0) {
rv = vm_map_wire_locked(map, old, new,
VM_MAP_WIRE_USER | VM_MAP_WIRE_NOHOLES);
if (rv != KERN_SUCCESS)
vm_map_delete(map, old, new);
}
if (rv != KERN_SUCCESS) {
#ifdef RACCT
if (racct_enable) {
PROC_LOCK(td->td_proc);
racct_set_force(td->td_proc,
RACCT_DATA, old - base);
racct_set_force(td->td_proc,
RACCT_VMEM, map->size);
if (!old_mlock && map->flags & MAP_WIREFUTURE) {
racct_set_force(td->td_proc,
RACCT_MEMLOCK,
ptoa(pmap_wired_count(map->pmap)));
}
PROC_UNLOCK(td->td_proc);
}
#endif
error = ENOMEM;
goto done;
}
vm->vm_dsize += btoc(new - old);
} else if (new < old) {
rv = vm_map_delete(map, new, old);
if (rv != KERN_SUCCESS) {
error = ENOMEM;
goto done;
}
vm->vm_dsize -= btoc(old - new);
#ifdef RACCT
if (racct_enable) {
PROC_LOCK(td->td_proc);
racct_set_force(td->td_proc, RACCT_DATA, new - base);
racct_set_force(td->td_proc, RACCT_VMEM, map->size);
if (!old_mlock && map->flags & MAP_WIREFUTURE) {
racct_set_force(td->td_proc, RACCT_MEMLOCK,
ptoa(pmap_wired_count(map->pmap)));
}
PROC_UNLOCK(td->td_proc);
}
#endif
}
done:
vm_map_unlock(map);
if (error == 0)
*addr = new;
return (error);
}
#ifdef COMPAT_FREEBSD11
int
freebsd11_vadvise(struct thread *td, struct freebsd11_vadvise_args *uap)
{
return (EINVAL);
}
#endif