Add the mlockall() and munlockall() system calls.

 - All those diffs to syscalls.master for each architecture *are*
   necessary. This needed clarification: stub code generation for
   mlockall() and munlockall() had been disabled, which would prevent
   applications from linking to this API (suggested by mux).
 - Giant has been quashed. It is no longer held by the code, as
   the required locking has been pushed down within vm_map.c.
 - Callers must specify VM_MAP_WIRE_HOLESOK or VM_MAP_WIRE_NOHOLES
   to express their intention explicitly.
 - Inspected at the level of the vmstat, top and vm pager sysctl stats.
   A test harness confirms that paging-in activity occurs correctly.
 - The RES size for a process may appear to be greater than its SIZE.
   This is believed to be due to mappings of the same shared library
   page being wired twice. Further exploration is needed.
 - Believed to back out of allocations and locks correctly
   (tested with WITNESS, MUTEX_PROFILING, INVARIANTS and DIAGNOSTIC).
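
   A minimal userland sketch of the new interface (illustrative only,
   not part of this commit; MCL_* and the prototypes come from the
   sys/mman.h hunk below):

    #include <sys/mman.h>
    #include <err.h>

    int
    main(void)
    {
            /* Illustrative sketch: wire every currently mapped page. */
            if (mlockall(MCL_CURRENT) == -1)
                    err(1, "mlockall");

            /* ... latency-sensitive work runs without page-ins ... */

            /* Unwire everything; this also clears any MCL_FUTURE state. */
            if (munlockall() == -1)
                    err(1, "munlockall");
            return (0);
    }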

PR:             kern/43426, standards/54223
Reviewed by:    jake, alc
Approved by:    jake (mentor)
MFC after:	2 weeks
Author: Bruce M Simpson
Date:   2003-08-11 07:14:08 +00:00
Commit: abd498aa71 (parent 3014050b19)
Notes:  svn2git 2020-12-20 02:59:44 +00:00
        svn path=/head/; revision=118771
17 changed files with 180 additions and 37 deletions

--- a/lib/libc/alpha/sys/Makefile.inc
+++ b/lib/libc/alpha/sys/Makefile.inc
@@ -5,7 +5,7 @@ MDASM+= Ovfork.S brk.S cerror.S exect.S fork.S pipe.S ptrace.S \
 # Don't generate default code for these syscalls:
 NOASM=  break.o exit.o ftruncate.o getdomainname.o getlogin.o \
-        lseek.o mlockall.o mmap.o munlockall.o openbsd_poll.o pread.o \
+        lseek.o mmap.o openbsd_poll.o pread.o \
         pwrite.o setdomainname.o sstk.o truncate.o uname.o vfork.o yield.o
 PSEUDO= _getlogin.o _exit.o

--- a/lib/libc/amd64/sys/Makefile.inc
+++ b/lib/libc/amd64/sys/Makefile.inc
@@ -6,7 +6,7 @@ MDASM= vfork.S brk.S cerror.S exect.S pipe.S ptrace.S reboot.S sbrk.S \
 # Don't generate default code for these syscalls:
 NOASM=  break.o exit.o ftruncate.o getdomainname.o getlogin.o \
-        lseek.o mlockall.o mmap.o munlockall.o openbsd_poll.o pread.o \
+        lseek.o mmap.o openbsd_poll.o pread.o \
         pwrite.o setdomainname.o sstk.o truncate.o uname.o vfork.o yield.o
 PSEUDO= _getlogin.o _exit.o

--- a/lib/libc/i386/sys/Makefile.inc
+++ b/lib/libc/i386/sys/Makefile.inc
@@ -9,7 +9,7 @@ MDASM= Ovfork.S brk.S cerror.S exect.S pipe.S ptrace.S reboot.S sbrk.S \
 # Don't generate default code for these syscalls:
 NOASM=  break.o exit.o ftruncate.o getdomainname.o getlogin.o \
-        lseek.o mlockall.o mmap.o munlockall.o openbsd_poll.o pread.o \
+        lseek.o mmap.o openbsd_poll.o pread.o \
         pwrite.o setdomainname.o sstk.o truncate.o uname.o vfork.o yield.o
 PSEUDO= _getlogin.o _exit.o

--- a/lib/libc/ia64/sys/Makefile.inc
+++ b/lib/libc/ia64/sys/Makefile.inc
@@ -5,7 +5,7 @@ MDASM+= Ovfork.S brk.S cerror.S exect.S fork.S getcontext.S pipe.S ptrace.S \
 # Don't generate default code for these syscalls:
 NOASM=  break.o exit.o ftruncate.o getdomainname.o getlogin.o \
-        lseek.o mlockall.o mmap.o munlockall.o openbsd_poll.o pread.o \
+        lseek.o mmap.o openbsd_poll.o pread.o \
         pwrite.o setdomainname.o sstk.o truncate.o uname.o vfork.o yield.o
 PSEUDO= _getlogin.o _exit.o

--- a/lib/libc/powerpc/sys/Makefile.inc
+++ b/lib/libc/powerpc/sys/Makefile.inc
@@ -4,7 +4,7 @@ MDASM+= brk.S cerror.S exect.S pipe.S ptrace.S sbrk.S setlogin.S
 # Don't generate default code for these syscalls:
 NOASM=  break.o exit.o ftruncate.o getdomainname.o getlogin.o \
-        lseek.o mlockall.o mmap.o munlockall.o openbsd_poll.o pread.o \
+        lseek.o mmap.o openbsd_poll.o pread.o \
         pwrite.o setdomainname.o sstk.o truncate.o uname.o yield.o
 PSEUDO= _getlogin.o _exit.o

--- a/lib/libc/sparc64/sys/Makefile.inc
+++ b/lib/libc/sparc64/sys/Makefile.inc
@@ -16,7 +16,7 @@ MDASM+= brk.S cerror.S exect.S pipe.S ptrace.S sbrk.S setlogin.S sigaction.S
 # Don't generate default code for these syscalls:
 NOASM=  break.o exit.o ftruncate.o getdomainname.o getlogin.o \
-        lseek.o mlockall.o mmap.o munlockall.o openbsd_poll.o pread.o \
+        lseek.o mmap.o openbsd_poll.o pread.o \
         pwrite.o setdomainname.o sstk.o truncate.o uname.o yield.o
 PSEUDO= _getlogin.o _exit.o

--- a/lib/libc/sys/Makefile.inc
+++ b/lib/libc/sys/Makefile.inc
@@ -69,8 +69,8 @@ MAN+= _exit.2 accept.2 access.2 acct.2 adjtime.2 \
         kldfind.2 kldfirstmod.2 kldload.2 kldnext.2 kldstat.2 kldsym.2 \
         kldunload.2 kqueue.2 kse.2 ktrace.2 link.2 lio_listio.2 listen.2 \
         lseek.2 \
-        madvise.2 mincore.2 minherit.2 mkdir.2 mkfifo.2 mknod.2 mlock.2 mmap.2 \
-        modfind.2 modnext.2 modstat.2 mount.2 \
+        madvise.2 mincore.2 minherit.2 mkdir.2 mkfifo.2 mknod.2 mlock.2 \
+        mlockall.2 mmap.2 modfind.2 modnext.2 modstat.2 mount.2 \
         mprotect.2 msync.2 munmap.2 nanosleep.2 ntp_adjtime.2 ntp_gettime.2 \
         nfssvc.2 open.2 pathconf.2 pipe.2 poll.2 profil.2 ptrace.2 quotactl.2 \
         read.2 readlink.2 reboot.2 recv.2 rename.2 revoke.2 rfork.2 rmdir.2 \
@@ -121,6 +121,7 @@ MLINKS+=kse.2 kse_create.2 kse.2 kse_exit.2 kse.2 kse_release.2 \
         kse.2 kse_wakeup.2 kse.2 kse_thr_interrupt.2
 MLINKS+=madvise.2 posix_madvise.2
 MLINKS+=mlock.2 munlock.2
+MLINKS+=mlockall.2 munlockall.2
 MLINKS+=modnext.2 modfnext.2
 MLINKS+=mount.2 unmount.2
 MLINKS+=pathconf.2 fpathconf.2

--- a/sys/kern/link_elf.c
+++ b/sys/kern/link_elf.c
@@ -745,7 +745,7 @@ link_elf_load_file(linker_class_t cls, const char* filename,
                vm_map_wire(kernel_map,
                            (vm_offset_t) segbase,
                            (vm_offset_t) segbase + segs[i]->p_memsz,
-                           FALSE);
+                           VM_MAP_WIRE_SYSTEM|VM_MAP_WIRE_NOHOLES);
 #endif
        }

--- a/sys/kern/link_elf_obj.c
+++ b/sys/kern/link_elf_obj.c
@@ -745,7 +745,7 @@ link_elf_load_file(linker_class_t cls, const char* filename,
                vm_map_wire(kernel_map,
                            (vm_offset_t) segbase,
                            (vm_offset_t) segbase + segs[i]->p_memsz,
-                           FALSE);
+                           VM_MAP_WIRE_SYSTEM|VM_MAP_WIRE_NOHOLES);
 #endif
        }

--- a/sys/sys/mman.h
+++ b/sys/sys/mman.h
@@ -188,6 +188,8 @@ int munmap(void *, size_t);
 int    posix_madvise(void *, size_t, int);
 #endif
 #if __POSIX_VISIBLE >= 199309
+int    mlockall(int);
+int    munlockall(void);
 int    shm_open(const char *, int, mode_t);
 int    shm_unlink(const char *);
 #endif

--- a/sys/vm/vm_contig.c
+++ b/sys/vm/vm_contig.c
@@ -266,7 +266,8 @@ contigmalloc1(
                tmp_addr += PAGE_SIZE;
        }
        VM_OBJECT_UNLOCK(kernel_object);
-       vm_map_wire(map, addr, addr + size, FALSE);
+       vm_map_wire(map, addr, addr + size,
+           VM_MAP_WIRE_SYSTEM|VM_MAP_WIRE_NOHOLES);
        splx(s);
        return ((void *)addr);

--- a/sys/vm/vm_glue.c
+++ b/sys/vm/vm_glue.c
@@ -189,7 +189,8 @@ vslock(addr, len)
 {
        vm_map_wire(&curproc->p_vmspace->vm_map, trunc_page((vm_offset_t)addr),
-           round_page((vm_offset_t)addr + len), FALSE);
+           round_page((vm_offset_t)addr + len),
+           VM_MAP_WIRE_SYSTEM|VM_MAP_WIRE_NOHOLES);
 }
 
 /*
@@ -203,7 +204,8 @@ vsunlock(addr, len)
        vm_map_unwire(&curproc->p_vmspace->vm_map,
            trunc_page((vm_offset_t)addr),
-           round_page((vm_offset_t)addr + len), FALSE);
+           round_page((vm_offset_t)addr + len),
+           VM_MAP_WIRE_SYSTEM|VM_MAP_WIRE_NOHOLES);
 }
 
 /*

--- a/sys/vm/vm_kern.c
+++ b/sys/vm/vm_kern.c
@@ -216,7 +216,8 @@ kmem_alloc(map, size)
        /*
         * And finally, mark the data as non-pageable.
         */
-       (void) vm_map_wire(map, addr, addr + size, FALSE);
+       (void) vm_map_wire(map, addr, addr + size,
+           VM_MAP_WIRE_SYSTEM|VM_MAP_WIRE_NOHOLES);
 
        return (addr);
 }

--- a/sys/vm/vm_map.c
+++ b/sys/vm/vm_map.c
@@ -1604,19 +1604,24 @@ vm_map_inherit(vm_map_t map, vm_offset_t start, vm_offset_t end,
  */
 int
 vm_map_unwire(vm_map_t map, vm_offset_t start, vm_offset_t end,
-    boolean_t user_unwire)
+    int flags)
 {
        vm_map_entry_t entry, first_entry, tmp_entry;
        vm_offset_t saved_start;
        unsigned int last_timestamp;
        int rv;
-       boolean_t need_wakeup, result;
+       boolean_t need_wakeup, result, user_unwire;
 
+       user_unwire = (flags & VM_MAP_WIRE_USER) ? TRUE : FALSE;
        vm_map_lock(map);
        VM_MAP_RANGE_CHECK(map, start, end);
        if (!vm_map_lookup_entry(map, start, &first_entry)) {
-               vm_map_unlock(map);
-               return (KERN_INVALID_ADDRESS);
+               if (flags & VM_MAP_WIRE_HOLESOK)
+                       first_entry = map->header.next;
+               else {
+                       vm_map_unlock(map);
+                       return (KERN_INVALID_ADDRESS);
+               }
        }
        last_timestamp = map->timestamp;
        entry = first_entry;
@@ -1672,9 +1677,11 @@ vm_map_unwire(vm_map_t map, vm_offset_t start, vm_offset_t end,
                entry->eflags |= MAP_ENTRY_IN_TRANSITION;
                /*
                 * Check the map for holes in the specified region.
+                * If VM_MAP_WIRE_HOLESOK was specified, skip this check.
                 */
-               if (entry->end < end && (entry->next == &map->header ||
-                   entry->next->start > entry->end)) {
+               if (((flags & VM_MAP_WIRE_HOLESOK) == 0) &&
+                   (entry->end < end && (entry->next == &map->header ||
+                   entry->next->start > entry->end))) {
                        end = entry->end;
                        rv = KERN_INVALID_ADDRESS;
                        goto done;
@@ -1733,19 +1740,24 @@ vm_map_unwire(vm_map_t map, vm_offset_t start, vm_offset_t end,
  */
 int
 vm_map_wire(vm_map_t map, vm_offset_t start, vm_offset_t end,
-    boolean_t user_wire)
+    int flags)
 {
        vm_map_entry_t entry, first_entry, tmp_entry;
        vm_offset_t saved_end, saved_start;
        unsigned int last_timestamp;
        int rv;
-       boolean_t need_wakeup, result;
+       boolean_t need_wakeup, result, user_wire;
 
+       user_wire = (flags & VM_MAP_WIRE_USER) ? TRUE : FALSE;
        vm_map_lock(map);
        VM_MAP_RANGE_CHECK(map, start, end);
        if (!vm_map_lookup_entry(map, start, &first_entry)) {
-               vm_map_unlock(map);
-               return (KERN_INVALID_ADDRESS);
+               if (flags & VM_MAP_WIRE_HOLESOK)
+                       first_entry = map->header.next;
+               else {
+                       vm_map_unlock(map);
+                       return (KERN_INVALID_ADDRESS);
+               }
        }
        last_timestamp = map->timestamp;
        entry = first_entry;
@@ -1856,9 +1868,11 @@ vm_map_wire(vm_map_t map, vm_offset_t start, vm_offset_t end,
                }
                /*
                 * Check the map for holes in the specified region.
+                * If VM_MAP_WIRE_HOLESOK was specified, skip this check.
                 */
-               if (entry->end < end && (entry->next == &map->header ||
-                   entry->next->start > entry->end)) {
+               if (((flags & VM_MAP_WIRE_HOLESOK) == 0) &&
+                   (entry->end < end && (entry->next == &map->header ||
+                   entry->next->start > entry->end))) {
                        end = entry->end;
                        rv = KERN_INVALID_ADDRESS;
                        goto done;
@@ -2394,6 +2408,10 @@ vmspace_fork(struct vmspace *vm1)
        new_map = &vm2->vm_map; /* XXX */
        new_map->timestamp = 1;
 
+       /* Do not inherit the MAP_WIREFUTURE property. */
+       if ((new_map->flags & MAP_WIREFUTURE) == MAP_WIREFUTURE)
+               new_map->flags &= ~MAP_WIREFUTURE;
+
        old_entry = old_map->header.next;
        while (old_entry != &old_map->header) {
@@ -2704,6 +2722,15 @@ vm_map_growstack (struct proc *p, vm_offset_t addr)
        }
 
        vm_map_unlock(map);
+
+       /*
+        * Heed the MAP_WIREFUTURE flag if it was set for this process.
+        */
+       if (rv == KERN_SUCCESS && (map->flags & MAP_WIREFUTURE))
+               vm_map_wire(map, addr, stack_entry->start,
+                   (p->p_flag & P_SYSTEM ?
+                   VM_MAP_WIRE_SYSTEM|VM_MAP_WIRE_NOHOLES :
+                   VM_MAP_WIRE_USER|VM_MAP_WIRE_NOHOLES));
+
        return (rv);
 }

--- a/sys/vm/vm_map.h
+++ b/sys/vm/vm_map.h
@@ -81,6 +81,7 @@
  *     vm_map_entry_t  an entry in an address map.
  */
 
+typedef u_char vm_flags_t;
 typedef u_int vm_eflags_t;
 
 /*
@@ -171,6 +172,7 @@ struct vm_map {
        u_char needs_wakeup;
        u_char system_map;              /* Am I a system map? */
        u_char infork;                  /* Am I in fork processing? */
+       vm_flags_t flags;               /* flags for this vm_map */
        vm_map_entry_t root;            /* Root of a binary search tree */
        unsigned int timestamp;         /* Version number */
        vm_map_entry_t first_free;      /* First free space hint */
@@ -179,6 +181,11 @@ struct vm_map {
 #define        max_offset      header.end      /* (c) */
 };
 
+/*
+ * vm_flags_t values
+ */
+#define MAP_WIREFUTURE 0x01    /* wire all future pages */
+
 #ifdef _KERNEL
 static __inline vm_offset_t
 vm_map_max(vm_map_t map)
@@ -197,6 +204,12 @@ vm_map_pmap(vm_map_t map)
 {
        return (map->pmap);
 }
+
+static __inline void
+vm_map_modflags(vm_map_t map, vm_flags_t set, vm_flags_t clear)
+{
+       map->flags = (map->flags | set) & ~clear;
+}
 #endif /* _KERNEL */
 
 /*
@@ -296,6 +309,15 @@ long vmspace_resident_count(struct vmspace *vmspace);
 #define VM_FAULT_WIRE_MASK     (VM_FAULT_CHANGE_WIRING|VM_FAULT_USER_WIRE)
 #define        VM_FAULT_DIRTY          8       /* Dirty the page */
 
+/*
+ * vm_map_wire and vm_map_unwire option flags
+ */
+#define VM_MAP_WIRE_SYSTEM     0       /* wiring in a kernel map */
+#define VM_MAP_WIRE_USER       1       /* wiring in a user map */
+
+#define VM_MAP_WIRE_NOHOLES    0       /* region must not have holes */
+#define VM_MAP_WIRE_HOLESOK    2       /* region may have holes */
+
 #ifdef _KERNEL
 boolean_t vm_map_check_protection (vm_map_t, vm_offset_t, vm_offset_t, vm_prot_t);
 vm_map_t vm_map_create(pmap_t, vm_offset_t, vm_offset_t);
@@ -322,9 +344,9 @@ void vm_init2 (void);
 int vm_map_stack (vm_map_t, vm_offset_t, vm_size_t, vm_prot_t, vm_prot_t, int);
 int vm_map_growstack (struct proc *p, vm_offset_t addr);
 int vm_map_unwire(vm_map_t map, vm_offset_t start, vm_offset_t end,
-    boolean_t user_unwire);
+    int flags);
 int vm_map_wire(vm_map_t map, vm_offset_t start, vm_offset_t end,
-    boolean_t user_wire);
+    int flags);
 int vmspace_swap_count (struct vmspace *vmspace);
 #endif                         /* _KERNEL */
 #endif                         /* _VM_MAP_ */
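
Since VM_MAP_WIRE_SYSTEM and VM_MAP_WIRE_NOHOLES are defined as 0, a
caller picks exactly one flag from each pair and the wiring code decodes
them by masking, as vm_map_wire() and vm_map_unwire() do above. A
standalone sketch of that decoding (flag values copied from this hunk;
the printf harness is illustrative only):

    #include <stdio.h>

    /* Values copied from the vm_map.h hunk above. */
    #define VM_MAP_WIRE_SYSTEM  0   /* wiring in a kernel map */
    #define VM_MAP_WIRE_USER    1   /* wiring in a user map */
    #define VM_MAP_WIRE_NOHOLES 0   /* region must not have holes */
    #define VM_MAP_WIRE_HOLESOK 2   /* region may have holes */

    int
    main(void)
    {
            int flags = VM_MAP_WIRE_USER | VM_MAP_WIRE_HOLESOK;

            /* Mirrors the masking vm_map_wire()/vm_map_unwire() now do. */
            printf("user wiring: %d\n", (flags & VM_MAP_WIRE_USER) != 0);
            printf("holes ok:    %d\n", (flags & VM_MAP_WIRE_HOLESOK) != 0);
            return (0);
    }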

--- a/sys/vm/vm_mmap.c
+++ b/sys/vm/vm_mmap.c
@@ -1046,7 +1046,7 @@ mlock(td, uap)
 #endif
 
        error = vm_map_wire(&td->td_proc->p_vmspace->vm_map, addr,
-           addr + size, TRUE);
+           addr + size, VM_MAP_WIRE_USER|VM_MAP_WIRE_NOHOLES);
        return (error == KERN_SUCCESS ? 0 : ENOMEM);
 }
 
@@ -1064,14 +1064,54 @@ mlockall(td, uap)
        struct thread *td;
        struct mlockall_args *uap;
 {
-       /* mtx_lock(&Giant); */
-       /* mtx_unlock(&Giant); */
-       return 0;
+       vm_map_t map;
+       int error;
+
+       map = &td->td_proc->p_vmspace->vm_map;
+       error = 0;
+
+       if ((uap->how == 0) || ((uap->how & ~(MCL_CURRENT|MCL_FUTURE)) != 0))
+               return (EINVAL);
+
+#ifdef pmap_wired_count
+       /*
+        * If wiring all pages in the process would cause it to exceed
+        * a hard resource limit, return ENOMEM.
+        */
+       if (map->size - ptoa(pmap_wired_count(vm_map_pmap(map))) >
+           td->td_proc->p_rlimit[RLIMIT_MEMLOCK].rlim_cur)
+               return (ENOMEM);
+#else
+       error = suser(td);
+       if (error)
+               return (error);
+#endif
+
+       if (uap->how & MCL_FUTURE) {
+               vm_map_lock(map);
+               vm_map_modflags(map, MAP_WIREFUTURE, 0);
+               vm_map_unlock(map);
+               error = 0;
+       }
+
+       if (uap->how & MCL_CURRENT) {
+               /*
+                * P1003.1-2001 mandates that all currently mapped pages
+                * will be memory resident and locked (wired) upon return
+                * from mlockall(). vm_map_wire() will wire pages, by
+                * calling vm_fault_wire() for each page in the region.
+                */
+               error = vm_map_wire(map, vm_map_min(map), vm_map_max(map),
+                   VM_MAP_WIRE_USER|VM_MAP_WIRE_HOLESOK);
+               error = (error == KERN_SUCCESS ? 0 : EAGAIN);
+       }
+
+       return (error);
 }
 
 #ifndef _SYS_SYSPROTO_H_
 struct munlockall_args {
-       int     how;
+       register_t dummy;
 };
 #endif
@@ -1083,9 +1123,26 @@ munlockall(td, uap)
        struct thread *td;
        struct munlockall_args *uap;
 {
-       /* mtx_lock(&Giant); */
-       /* mtx_unlock(&Giant); */
-       return 0;
+       vm_map_t map;
+       int error;
+
+       map = &td->td_proc->p_vmspace->vm_map;
+#ifndef pmap_wired_count
+       error = suser(td);
+       if (error)
+               return (error);
+#endif
+
+       /* Clear the MAP_WIREFUTURE flag from this vm_map. */
+       vm_map_lock(map);
+       vm_map_modflags(map, 0, MAP_WIREFUTURE);
+       vm_map_unlock(map);
+
+       /* Forcibly unwire all pages. */
+       error = vm_map_unwire(map, vm_map_min(map), vm_map_max(map),
+           VM_MAP_WIRE_USER|VM_MAP_WIRE_HOLESOK);
+
+       return (error);
 }
 
 #ifndef _SYS_SYSPROTO_H_
@@ -1125,7 +1182,7 @@ munlock(td, uap)
 #endif
 
        error = vm_map_unwire(&td->td_proc->p_vmspace->vm_map, addr,
-           addr + size, TRUE);
+           addr + size, VM_MAP_WIRE_USER|VM_MAP_WIRE_NOHOLES);
        return (error == KERN_SUCCESS ? 0 : ENOMEM);
 }
 
@@ -1282,6 +1339,15 @@ vm_mmap(vm_map_t map, vm_offset_t *addr, vm_size_t size, vm_prot_t prot,
                if (rv != KERN_SUCCESS)
                        (void) vm_map_remove(map, *addr, *addr + size);
        }
+
+       /*
+        * If the process has requested that all future mappings
+        * be wired, then heed this.
+        */
+       if ((rv == KERN_SUCCESS) && (map->flags & MAP_WIREFUTURE))
+               vm_map_wire(map, *addr, *addr + size,
+                   VM_MAP_WIRE_USER|VM_MAP_WIRE_NOHOLES);
+
        switch (rv) {
        case KERN_SUCCESS:
                return (0);

--- a/sys/vm/vm_unix.c
+++ b/sys/vm/vm_unix.c
@@ -79,7 +79,9 @@ obreak(td, uap)
        vm_offset_t new, old, base;
        int rv;
        int error = 0;
+       boolean_t do_map_wirefuture;
 
+       do_map_wirefuture = FALSE;
        new = round_page((vm_offset_t)uap->nsize);
        vm_map_lock(&vm->vm_map);
 
@@ -121,6 +123,20 @@ obreak(td, uap)
                        goto done;
                }
                vm->vm_dsize += btoc(new - old);
+               /*
+                * Handle the MAP_WIREFUTURE case for legacy applications,
+                * by marking the newly mapped range of pages as wired.
+                * We are not required to perform a corresponding
+                * vm_map_unwire() before vm_map_delete() below, as
+                * it will forcibly unwire the pages in the range.
+                *
+                * XXX If the pages cannot be wired, no error is returned.
+                */
+               if ((vm->vm_map.flags & MAP_WIREFUTURE) == MAP_WIREFUTURE) {
+                       if (bootverbose)
+                               printf("obreak: MAP_WIREFUTURE set\n");
+                       do_map_wirefuture = TRUE;
+               }
        } else if (new < old) {
                rv = vm_map_delete(&vm->vm_map, new, old);
                if (rv != KERN_SUCCESS) {
@@ -131,6 +147,11 @@ obreak(td, uap)
        }
 done:
        vm_map_unlock(&vm->vm_map);
+
+       if (do_map_wirefuture)
+               (void) vm_map_wire(&vm->vm_map, old, new,
+                   VM_MAP_WIRE_USER|VM_MAP_WIRE_NOHOLES);
+
        return (error);
 }
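
With the MAP_WIREFUTURE plumbing above, a process that calls
mlockall(MCL_FUTURE) gets later mappings wired on creation, whether they
come from mmap(), stack growth, or the obreak() path patched here. An
illustrative sketch (not part of this commit):

    #include <sys/mman.h>
    #include <err.h>
    #include <stdlib.h>

    int
    main(void)
    {
            char *p;

            /* Ask that all future mappings be wired as they appear. */
            if (mlockall(MCL_FUTURE) == -1)
                    err(1, "mlockall");

            /*
             * Heap growth goes through obreak()/vm_mmap(), so these
             * pages should come back resident without being touched.
             */
            if ((p = malloc(1024 * 1024)) == NULL)
                    err(1, "malloc");

            free(p);
            return (0);
    }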