freebsd-dev/sys/vm/vm_unix.c

/*-
 * SPDX-License-Identifier: BSD-3-Clause
 *
 * Copyright (c) 1988 University of Utah.
 * Copyright (c) 1991, 1993
 *	The Regents of the University of California.  All rights reserved.
 *
 * This code is derived from software contributed to Berkeley by
 * the Systems Programming Group of the University of Utah Computer
 * Science Department.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 * from: Utah $Hdr: vm_unix.c 1.1 89/11/07$
 *
 *	@(#)vm_unix.c	8.1 (Berkeley) 6/11/93
 */

/*
 * Traditional sbrk/grow interface to VM
 */
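
/*
 * Userland usage sketch (illustration only; not part of this file's
 * interface): libc builds its brk()/sbrk() interfaces on top of break(2).
 * Passing an address below the data segment base makes kern_break() report
 * the current break without changing it (see the comment in kern_break()),
 * which is how the initial break can be discovered.  A minimal sbrk-style
 * wrapper, assuming a raw syscall stub named __sys_break() purely for
 * illustration (the real libc symbol may differ) that returns the break
 * established by the kernel or (char *)-1 on error, could look roughly
 * like:
 *
 *	static char *curbrk;
 *
 *	void *
 *	sbrk_sketch(intptr_t incr)
 *	{
 *		char *old;
 *
 *		if (curbrk == NULL)
 *			curbrk = __sys_break(NULL);
 *		old = curbrk;
 *		if (__sys_break(old + incr) == (char *)-1)
 *			return ((void *)-1);
 *		curbrk = old + incr;
 *		return (old);
 *	}
 */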
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");

#include <sys/param.h>
#include <sys/lock.h>
#include <sys/mutex.h>
#include <sys/proc.h>
#include <sys/racct.h>
#include <sys/resourcevar.h>
#include <sys/syscallsubr.h>
#include <sys/sysent.h>
#include <sys/sysproto.h>
#include <sys/systm.h>
#if defined(__amd64__) || defined(__i386__) /* for i386_read_exec */
#include <machine/md_var.h>
#endif

#include <vm/vm.h>
#include <vm/vm_param.h>
#include <vm/pmap.h>
#include <vm/vm_map.h>

#ifndef _SYS_SYSPROTO_H_
struct break_args {
	char *nsize;
};
#endif

int
sys_break(struct thread *td, struct break_args *uap)
{
#if !defined(__aarch64__) && !defined(__riscv)
	uintptr_t addr;
	int error;

	addr = (uintptr_t)uap->nsize;
	error = kern_break(td, &addr);
	if (error == 0)
		td->td_retval[0] = addr;
	return (error);
#else /* defined(__aarch64__) || defined(__riscv) */
	return (ENOSYS);
#endif /* defined(__aarch64__) || defined(__riscv) */
}

int
kern_break(struct thread *td, uintptr_t *addr)
{
	struct vmspace *vm = td->td_proc->p_vmspace;
	vm_map_t map = &vm->vm_map;
	vm_offset_t new, old, base;
	rlim_t datalim, lmemlim, vmemlim;
	int prot, rv;
	int error = 0;

	datalim = lim_cur(td, RLIMIT_DATA);
	lmemlim = lim_cur(td, RLIMIT_MEMLOCK);
	vmemlim = lim_cur(td, RLIMIT_VMEM);

	new = round_page(*addr);
	vm_map_lock(map);

	base = round_page((vm_offset_t) vm->vm_daddr);
	old = base + ctob(vm->vm_dsize);
	if (new > base) {
		/*
		 * Check the resource limit, but allow a process to reduce
		 * its usage, even if it remains over the limit.
		 */
		if (new - base > datalim && new > old) {
			error = ENOMEM;
			goto done;
		}
		if (new > vm_map_max(map)) {
			error = ENOMEM;
			goto done;
		}
	} else if (new < base) {
		/*
		 * Simply return the current break address without
		 * modifying any state.  This is an ad-hoc interface
		 * used by libc to determine the initial break address,
		 * avoiding a dependency on magic features in the system
		 * linker.
		 */
		new = old;
		goto done;
	}

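	/*
	 * From here on, new is at or above the data segment base:
	 * new > old grows the segment by mapping [old, new), new < old
	 * shrinks it by unmapping [new, old), and new == old leaves it
	 * unchanged.
	 */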
	if (new > old) {
		if (!old_mlock && map->flags & MAP_WIREFUTURE) {
			if (ptoa(pmap_wired_count(map->pmap)) +
			    (new - old) > lmemlim) {
				error = ENOMEM;
				goto done;
			}
		}
		if (map->size + (new - old) > vmemlim) {
			error = ENOMEM;
			goto done;
		}
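		/*
		 * If resource accounting (racct) is enabled, charge the
		 * process for its new data segment and address space sizes
		 * and, where MAP_WIREFUTURE applies, for the pages that are
		 * about to be wired; on any failure, undo the charges
		 * already made and fail with ENOMEM.
		 */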
#ifdef RACCT
		if (racct_enable) {
			PROC_LOCK(td->td_proc);
			error = racct_set(td->td_proc, RACCT_DATA, new - base);
			if (error != 0) {
				PROC_UNLOCK(td->td_proc);
				error = ENOMEM;
				goto done;
			}
			error = racct_set(td->td_proc, RACCT_VMEM,
			    map->size + (new - old));
			if (error != 0) {
				racct_set_force(td->td_proc, RACCT_DATA,
				    old - base);
				PROC_UNLOCK(td->td_proc);
				error = ENOMEM;
				goto done;
			}
			if (!old_mlock && map->flags & MAP_WIREFUTURE) {
				error = racct_set(td->td_proc, RACCT_MEMLOCK,
				    ptoa(pmap_wired_count(map->pmap)) +
				    (new - old));
				if (error != 0) {
					racct_set_force(td->td_proc, RACCT_DATA,
					    old - base);
					racct_set_force(td->td_proc, RACCT_VMEM,
					    map->size);
					PROC_UNLOCK(td->td_proc);
					error = ENOMEM;
					goto done;
				}
			}
			PROC_UNLOCK(td->td_proc);
		}
#endif
		prot = VM_PROT_RW;
#if (defined(COMPAT_FREEBSD32) && defined(__amd64__)) || defined(__i386__)
		if (i386_read_exec && SV_PROC_FLAG(td->td_proc, SV_ILP32))
			prot |= VM_PROT_EXECUTE;
#endif
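		/*
		 * Map the new range [old, new) with the chosen protection.
		 * If the map has MAP_WIREFUTURE set (e.g. after
		 * mlockall(MCL_FUTURE)), wire the new pages immediately and
		 * back the mapping out again if wiring fails.
		 */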
		rv = vm_map_insert(map, NULL, 0, old, new, prot, VM_PROT_ALL,
		    0);
		if (rv == KERN_SUCCESS && (map->flags & MAP_WIREFUTURE) != 0) {
			rv = vm_map_wire_locked(map, old, new,
			    VM_MAP_WIRE_USER | VM_MAP_WIRE_NOHOLES);
			if (rv != KERN_SUCCESS)
				(void)vm_map_delete(map, old, new);
		}
		if (rv != KERN_SUCCESS) {
#ifdef RACCT
			if (racct_enable) {
				PROC_LOCK(td->td_proc);
				racct_set_force(td->td_proc,
				    RACCT_DATA, old - base);
				racct_set_force(td->td_proc,
				    RACCT_VMEM, map->size);
				if (!old_mlock && map->flags & MAP_WIREFUTURE) {
					racct_set_force(td->td_proc,
					    RACCT_MEMLOCK,
					    ptoa(pmap_wired_count(map->pmap)));
				}
				PROC_UNLOCK(td->td_proc);
			}
#endif
			error = ENOMEM;
			goto done;
		}
		vm->vm_dsize += btoc(new - old);
	} else if (new < old) {
		rv = vm_map_delete(map, new, old);
		if (rv != KERN_SUCCESS) {
			error = ENOMEM;
			goto done;
		}
		vm->vm_dsize -= btoc(old - new);
#ifdef RACCT
		if (racct_enable) {
			PROC_LOCK(td->td_proc);
			racct_set_force(td->td_proc, RACCT_DATA, new - base);
			racct_set_force(td->td_proc, RACCT_VMEM, map->size);
			if (!old_mlock && map->flags & MAP_WIREFUTURE) {
				racct_set_force(td->td_proc, RACCT_MEMLOCK,
				    ptoa(pmap_wired_count(map->pmap)));
			}
			PROC_UNLOCK(td->td_proc);
		}
#endif
	}

done:
	vm_map_unlock(map);

	if (error == 0)
		*addr = new;

	return (error);
}

#ifdef COMPAT_FREEBSD11
int
freebsd11_vadvise(struct thread *td, struct freebsd11_vadvise_args *uap)
{

	return (EINVAL);
}
#endif