/*- * Copyright (c) 1991 Regents of the University of California. * All rights reserved. * * This code is derived from software contributed to Berkeley by * The Mach Operating System project at Carnegie-Mellon University. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * from: @(#)vm_page.c 7.4 (Berkeley) 5/7/91 */ /*- * Copyright (c) 1987, 1990 Carnegie-Mellon University. * All rights reserved. 
*
 * Authors: Avadis Tevanian, Jr., Michael Wayne Young
 *
 * Permission to use, copy, modify and distribute this software and
 * its documentation is hereby granted, provided that both the copyright
 * notice and this permission notice appear in all copies of the
 * software, derivative works or modified versions, and any portions
 * thereof, and that both notices appear in supporting documentation.
 *
 * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
 * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
 * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
 *
 * Carnegie Mellon requests users of this software to return to
 *
 * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU
 * School of Computer Science
 * Carnegie Mellon University
 * Pittsburgh PA 15213-3890
 *
 * any improvements or extensions that they make and grant Carnegie the
 * rights to redistribute these changes.
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/lock.h>
#include <sys/malloc.h>
#include <sys/mutex.h>
#include <sys/proc.h>
#include <sys/kernel.h>
#include <sys/linker_set.h>
#include <sys/sysctl.h>
#include <sys/vmmeter.h>
#include <sys/vnode.h>

#include <vm/vm.h>
#include <vm/vm_param.h>
#include <vm/vm_kern.h>
#include <vm/pmap.h>
#include <vm/vm_map.h>
#include <vm/vm_object.h>
#include <vm/vm_page.h>
#include <vm/vm_pageout.h>
#include <vm/vm_pager.h>
#include <vm/vm_extern.h>

/*
 * Try to push a single page towards a reusable state.
 *
 * In order, the function:
 *  - try-locks the page's object and gives up with EAGAIN if that fails;
 *  - returns EBUSY if vm_page_sleep_if_busy() reports non-zero;
 *  - refreshes the dirty state and, for a page that is clean and not
 *    held, removes all of its mappings;
 *  - for a dirty page, refuses objects flagged OBJ_DEAD (EAGAIN), cleans
 *    vnode-backed objects with vm_object_page_clean() and flushes
 *    swap/default objects with vm_pageout_flush();
 *  - for a clean page with no holds, hands it to vm_page_cache().
 *
 * Both callers in this file invoke it with the page queues lock held; the
 * vnode path drops that lock around the vnode operations and retakes it
 * before returning.
 *
 * Returns 0, EAGAIN or EBUSY.  A return of 0 does not by itself mean the
 * page ended up free or cached (e.g. a clean but held page, or a dirty page
 * of some other object type, also yields 0), so callers must re-check the
 * page's queue.
 */
static int
vm_contig_launder_page(vm_page_t m)
{
	vm_object_t object;
	vm_page_t m_tmp;
	struct vnode *vp;
	struct mount *mp;

	object = m->object;
	/* Only try-lock: on contention report EAGAIN instead of blocking. */
	if (!VM_OBJECT_TRYLOCK(object))
		return (EAGAIN);
	if (vm_page_sleep_if_busy(m, TRUE, "vpctw0")) {
		VM_OBJECT_UNLOCK(object);
		/*
		 * NOTE(review): the queues lock is taken here without a
		 * matching unlock in this function, presumably because
		 * vm_page_sleep_if_busy() released it before sleeping --
		 * confirm against that helper.
		 */
		vm_page_lock_queues();
		return (EBUSY);
	}
	vm_page_test_dirty(m);
	if (m->dirty == 0 && m->hold_count == 0)
		pmap_remove_all(m);
	/*
	 * NOTE(review): m->dirty is tested again after pmap_remove_all(),
	 * presumably because removing the mappings can mark the page dirty
	 * -- confirm.
	 */
	if (m->dirty) {
		if ((object->flags & OBJ_DEAD) != 0) {
			VM_OBJECT_UNLOCK(object);
			return (EAGAIN);
		}
		if (object->type == OBJT_VNODE) {
			/*
			 * Drop the queues lock and the object lock (keeping a
			 * reference on the object) before taking the vnode
			 * lock, then clean the whole object synchronously.
			 */
			vm_page_unlock_queues();
			vp = object->handle;
			vm_object_reference_locked(object);
			VM_OBJECT_UNLOCK(object);
			(void) vn_start_write(vp, &mp, V_WAIT);
			vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, curthread);
			VM_OBJECT_LOCK(object);
			vm_object_page_clean(object, 0, 0, OBJPC_SYNC);
			VM_OBJECT_UNLOCK(object);
			VOP_UNLOCK(vp, 0, curthread);
			vm_object_deallocate(object);
			vn_finished_write(mp);
			/* Restore the queues lock the caller expects to hold. */
			vm_page_lock_queues();
			return (0);
		} else if (object->type == OBJT_SWAP ||
			   object->type == OBJT_DEFAULT) {
			/* Synchronously flush just this one page. */
			m_tmp = m;
			vm_pageout_flush(&m_tmp, 1, VM_PAGER_PUT_SYNC);
			VM_OBJECT_UNLOCK(object);
			return (0);
		}
		/* Dirty page of any other object type: nothing is done. */
	} else if (m->hold_count == 0)
		vm_page_cache(m);
	VM_OBJECT_UNLOCK(object);
	return (0);
}

/*
 * Walk the given page queue and try to launder pages one at a time.
 *
 * Returns TRUE as soon as vm_contig_launder_page() returns 0 for some page.
 * Returns FALSE if a page reports EBUSY or the end of the queue is reached.
 * Any other error (EAGAIN) just moves on to the next page.
 */
static int
vm_contig_launder(int queue)
{
	vm_page_t m, next;
	int error;

	for (m = TAILQ_FIRST(&vm_page_queues[queue].pl); m != NULL; m = next) {
		/*
		 * Fetch the successor before laundering; presumably the page
		 * may leave this queue as a result of the call below.
		 */
		next = TAILQ_NEXT(m, pageq);

		/* Skip marker pages */
		if ((m->flags & PG_MARKER) != 0)
			continue;

		KASSERT(VM_PAGE_INQUEUE2(m, queue),
		    ("vm_contig_launder: page %p's queue is not %d", m, queue));
		error = vm_contig_launder_page(m);
		if (error == 0)
			return (TRUE);
		if (error == EBUSY)
			return (FALSE);
	}
	return (FALSE);
}

/*
 * This interface is for merging with malloc() someday.
 * Even if we never implement compaction so that contiguous allocation
 * works after initialization time, malloc()'s data structures are good
 * for statistics and for allocations of less than a page.
*/
/*
 * Older contiguous allocator (selected by contigmalloc() when
 * vm_old_contigmalloc is non-zero).
 *
 * Scans vm_page_array upwards for a run of round_page(size) bytes of
 * physically contiguous free/cache pages whose first page lies in
 * [low, high), is aligned to "alignment" and does not cross a "boundary"
 * multiple.  The run is removed from the free queue, inserted into
 * kernel_object, mapped into "map" and wired.
 *
 * Up to three passes are made (pass = 2, 1, 0).  When a pass finds no
 * candidate and it is not the last pass, the inactive and then the active
 * queue are laundered before the next pass.
 *
 * Panics if the rounded size is 0 or if alignment/boundary is not a power
 * of two.  Returns the kernel virtual address of the mapping, or NULL.
 * The "type" argument is not referenced in this function.
 */
static void *
contigmalloc1(
	unsigned long size,	/* should be size_t here and for malloc() */
	struct malloc_type *type,
	int flags,
	vm_paddr_t low,
	vm_paddr_t high,
	unsigned long alignment,
	unsigned long boundary,
	vm_map_t map)
{
	int i, start;
	vm_paddr_t phys;
	vm_object_t object;
	vm_offset_t addr, tmp_addr;
	int pass, pqtype;
	int inactl, actl, inactmax, actmax;
	vm_page_t pga = vm_page_array;

	size = round_page(size);
	if (size == 0)
		panic("contigmalloc1: size must not be 0");
	if ((alignment & (alignment - 1)) != 0)
		panic("contigmalloc1: alignment must be a power of 2");
	if ((boundary & (boundary - 1)) != 0)
		panic("contigmalloc1: boundary must be a power of 2");

	/* "start" is not reset between passes; later passes resume from it. */
	start = 0;
	for (pass = 2; pass >= 0; pass--) {
		vm_page_lock_queues();
		/* Re-entry point when only the page queues lock is held. */
again0:
		mtx_lock(&vm_page_queue_free_mtx);
		/* Re-entry point when both locks are held. */
again:
		/*
		 * Find first page in array that is free, within range,
		 * aligned, and such that the boundary won't be crossed.
		 */
		for (i = start; i < cnt.v_page_count; i++) {
			phys = VM_PAGE_TO_PHYS(&pga[i]);
			pqtype = pga[i].queue - pga[i].pc;
			if (((pqtype == PQ_FREE) || (pqtype == PQ_CACHE)) &&
			    (phys >= low) && (phys < high) &&
			    ((phys & (alignment - 1)) == 0) &&
			    (((phys ^ (phys + size - 1)) & ~(boundary - 1)) == 0))
				break;
		}

		/*
		 * If the above failed or we will exceed the upper bound, fail.
		 */
		if ((i == cnt.v_page_count) ||
			((VM_PAGE_TO_PHYS(&pga[i]) + size) > high)) {
			mtx_unlock(&vm_page_queue_free_mtx);
			/*
			 * Instead of racing to empty the inactive/active
			 * queues, give up, even with more left to free,
			 * if we try more than the initial amount of pages.
			 *
			 * There's no point attempting this on the last pass.
			 */
			if (pass > 0) {
				inactl = actl = 0;
				inactmax = vm_page_queues[PQ_INACTIVE].lcnt;
				actmax = vm_page_queues[PQ_ACTIVE].lcnt;
again1:
				if (inactl < inactmax &&
				    vm_contig_launder(PQ_INACTIVE)) {
					inactl++;
					goto again1;
				}
				if (actl < actmax &&
				    vm_contig_launder(PQ_ACTIVE)) {
					actl++;
					goto again1;
				}
			}
			vm_page_unlock_queues();
			continue;
		}
		start = i;

		/*
		 * Check successive pages for contiguous and free.
		 */
		for (i = start + 1; i < (start + size / PAGE_SIZE); i++) {
			pqtype = pga[i].queue - pga[i].pc;
			if ((VM_PAGE_TO_PHYS(&pga[i]) !=
			    (VM_PAGE_TO_PHYS(&pga[i - 1]) + PAGE_SIZE)) ||
			    ((pqtype != PQ_FREE) && (pqtype != PQ_CACHE))) {
				start++;
				goto again;
			}
		}
		mtx_unlock(&vm_page_queue_free_mtx);
		/*
		 * With the free-queue mutex dropped, free every cache page in
		 * the run.  A held page, an object that cannot be try-locked
		 * or a busy page abandons this run and restarts the search
		 * one page further on.
		 */
		for (i = start; i < (start + size / PAGE_SIZE); i++) {
			vm_page_t m = &pga[i];

			if (VM_PAGE_INQUEUE1(m, PQ_CACHE)) {
				if (m->hold_count != 0) {
					start++;
					goto again0;
				}
				object = m->object;
				if (!VM_OBJECT_TRYLOCK(object)) {
					start++;
					goto again0;
				}
				if ((m->oflags & VPO_BUSY) || m->busy != 0) {
					VM_OBJECT_UNLOCK(object);
					start++;
					goto again0;
				}
				vm_page_free(m);
				VM_OBJECT_UNLOCK(object);
			}
		}
		mtx_lock(&vm_page_queue_free_mtx);
		/*
		 * Re-verify under the free-queue mutex that every page of the
		 * run is now on the free queue before claiming any of them.
		 */
		for (i = start; i < (start + size / PAGE_SIZE); i++) {
			pqtype = pga[i].queue - pga[i].pc;
			if (pqtype != PQ_FREE) {
				start++;
				goto again;
			}
		}
		/* Claim the run: dequeue each page and reset its state. */
		for (i = start; i < (start + size / PAGE_SIZE); i++) {
			vm_page_t m = &pga[i];
			vm_pageq_remove_nowakeup(m);
			m->valid = VM_PAGE_BITS_ALL;
			if (m->flags & PG_ZERO)
				vm_page_zero_count--;
			/* Don't clear the PG_ZERO flag, we'll need it later. */
			m->flags = PG_UNMANAGED | (m->flags & PG_ZERO);
			KASSERT(m->dirty == 0,
			    ("contigmalloc1: page %p was dirty", m));
			m->wire_count = 0;
			m->busy = 0;
		}
		mtx_unlock(&vm_page_queue_free_mtx);
		vm_page_unlock_queues();
		/*
		 * We've found a contiguous chunk that meets our requirements.
		 * Allocate kernel VM, unfree and assign the physical pages to
		 * it and return kernel VM pointer.
		 */
		vm_map_lock(map);
		if (vm_map_findspace(map, vm_map_min(map), size, &addr) !=
		    KERN_SUCCESS) {
			/*
			 * XXX We almost never run out of kernel virtual
			 * space, so we don't make the allocated memory
			 * above available.
			 */
			vm_map_unlock(map);
			return (NULL);
		}
		vm_object_reference(kernel_object);
		vm_map_insert(map, kernel_object, addr - VM_MIN_KERNEL_ADDRESS,
		    addr, addr + size, VM_PROT_ALL, VM_PROT_ALL, 0);
		vm_map_unlock(map);

		tmp_addr = addr;
		VM_OBJECT_LOCK(kernel_object);
		for (i = start; i < (start + size / PAGE_SIZE); i++) {
			vm_page_t m = &pga[i];
			vm_page_insert(m, kernel_object,
				OFF_TO_IDX(tmp_addr - VM_MIN_KERNEL_ADDRESS));
			/* Zero on request, unless the page is flagged PG_ZERO. */
			if ((flags & M_ZERO) && !(m->flags & PG_ZERO))
				pmap_zero_page(m);
			tmp_addr += PAGE_SIZE;
		}
		VM_OBJECT_UNLOCK(kernel_object);
		vm_map_wire(map, addr, addr + size,
		    VM_MAP_WIRE_SYSTEM|VM_MAP_WIRE_NOHOLES);

		return ((void *)addr);
	}
	return (NULL);
}

/*
 * Hand "count" consecutive vm_page structures starting at "m" to
 * vm_page_free_toq().  This variant takes no lock itself; the callers in
 * this file hold the page queues lock around it.
 */
static void
vm_page_release_contigl(vm_page_t m, vm_pindex_t count)
{
	while (count--) {
		vm_page_free_toq(m);
		m++;
	}
}

/*
 * Same as vm_page_release_contigl(), but takes and drops the page queues
 * lock around the release.
 */
static void
vm_page_release_contig(vm_page_t m, vm_pindex_t count)
{
	vm_page_lock_queues();
	vm_page_release_contigl(m, count);
	vm_page_unlock_queues();
}

/*
 * Take page "m" off the free queue and reset its state for use by the
 * contiguous allocator.
 *
 * Returns 0 on success, or EAGAIN (leaving the page untouched) if the page
 * is not on the free queue when checked under vm_page_queue_free_mtx.
 */
static int
vm_contig_unqueue_free(vm_page_t m)
{
	int error = 0;

	mtx_lock(&vm_page_queue_free_mtx);
	if ((m->queue - m->pc) == PQ_FREE)
		vm_pageq_remove_nowakeup(m);
	else
		error = EAGAIN;
	mtx_unlock(&vm_page_queue_free_mtx);
	if (error)
		return (error);
	m->valid = VM_PAGE_BITS_ALL;
	if (m->flags & PG_ZERO)
		vm_page_zero_count--;
	/* Don't clear the PG_ZERO flag; we'll need it later. */
	m->flags = PG_UNMANAGED | (m->flags & PG_ZERO);
	m->oflags = 0;
	KASSERT(m->dirty == 0,
	    ("contigmalloc2: page %p was dirty", m));
	m->wire_count = 0;
	m->busy = 0;
	return (error);
}

/*
 * Find and claim "npages" physically contiguous pages.
 *
 * The run must lie within [low, high], start on an "alignment" multiple and
 * must not cross a "boundary" multiple.  vm_page_array is scanned downwards.
 * Pass 0 accepts only free and cache pages; pass 1 additionally accepts
 * pages on the active and inactive queues and launders them with
 * vm_contig_launder_page().
 *
 * Panics if npages is 0 or if alignment/boundary is not a power of two.
 * Returns a pointer to the first vm_page of the run, with every page
 * already removed from the free queue, or NULL if both passes fail.
 *
 * NOTE(review): "np" and "start" are function-level statics shared by all
 * callers; the only serialization visible in this file is the page queues
 * lock taken below and Giant held by contigmalloc() -- confirm that is
 * sufficient for any other callers.
 */
vm_page_t
vm_page_alloc_contig(vm_pindex_t npages, vm_paddr_t low, vm_paddr_t high,
	    vm_offset_t alignment, vm_offset_t boundary)
{
	vm_object_t object;
	vm_offset_t size;
	vm_paddr_t phys;
	vm_page_t pga = vm_page_array;
	static vm_pindex_t np = 0;
	static vm_pindex_t start = 0;
	vm_pindex_t startl = 0;
	int i, pass, pqtype;

	size = npages << PAGE_SHIFT;
	if (size == 0)
		panic("vm_page_alloc_contig: size must not be 0");
	if ((alignment & (alignment - 1)) != 0)
		panic("vm_page_alloc_contig: alignment must be a power of 2");
	if ((boundary & (boundary - 1)) != 0)
		panic("vm_page_alloc_contig: boundary must be a power of 2");

	/*
	 * Two simple optimizations. First, don't scan high ordered pages
	 * if they are outside of the requested address range. Second, cache
	 * the starting page index across calls and reuse it instead of
	 * restarting the scan from the top. This is conditional on the
	 * requested number of pages being the same or greater than the
	 * cached amount.
	 */
	for (pass = 0; pass < 2; pass++) {
		vm_page_lock_queues();
		if ((np == 0) || (np > npages)) {
			if (atop(high) < vm_page_array_size)
				start = atop(high) - npages + 1;
			else
				start = vm_page_array_size - npages + 1;
		}
		/* Invalidate the cached size; it is set again only on success. */
		np = 0;
retry:
		/* Every (re)try begins one page below the previous position. */
		start--;
		/*
		 * Find last page in array that is free, within range,
		 * aligned, and such that the boundary won't be crossed.
		 */
		for (i = start; i >= 0; i--) {
			phys = VM_PAGE_TO_PHYS(&pga[i]);
			pqtype = pga[i].queue - pga[i].pc;
			if (pass == 0) {
				if (pqtype != PQ_FREE && pqtype != PQ_CACHE)
					continue;
			} else if (pqtype != PQ_FREE && pqtype != PQ_CACHE &&
				    pga[i].queue != PQ_ACTIVE &&
				    pga[i].queue != PQ_INACTIVE)
				continue;
			if (phys >= low && phys + size <= high &&
			    ((phys & (alignment - 1)) == 0) &&
			    ((phys ^ (phys + size - 1)) & ~(boundary - 1)) == 0)
				break;
		}
		/* There are no candidates at all. */
		if (i < 0) {
			vm_page_unlock_queues();
			continue;
		}
		start = i;
		/*
		 * Check successive pages for contiguous and free.
		 */
		for (i = start + npages - 1; i > start; i--) {
			pqtype = pga[i].queue - pga[i].pc;
			if (VM_PAGE_TO_PHYS(&pga[i]) !=
			    VM_PAGE_TO_PHYS(&pga[i - 1]) + PAGE_SIZE) {
				start = i - npages + 1;
				goto retry;
			}
			if (pass == 0) {
				if (pqtype != PQ_FREE && pqtype != PQ_CACHE) {
					start = i - npages + 1;
					goto retry;
				}
			} else if (pqtype != PQ_FREE && pqtype != PQ_CACHE &&
				    pga[i].queue != PQ_ACTIVE &&
				    pga[i].queue != PQ_INACTIVE) {
				start = i - npages + 1;
				goto retry;
			}
		}
		/*
		 * Claim the run from its highest page down to "start".  Pages
		 * above index i have already been taken off the free queue.
		 */
		for (i = start + npages - 1; i >= start; i--) {
			vm_page_t m = &pga[i];

retry_page:
			pqtype = m->queue - m->pc;
			if (pass != 0 && pqtype != PQ_FREE &&
			    pqtype != PQ_CACHE) {
				if (m->queue == PQ_ACTIVE ||
				    m->queue == PQ_INACTIVE) {
					if (vm_contig_launder_page(m) != 0)
						goto cleanup_freed;
					/*
					 * A zero return does not guarantee the
					 * page moved; re-read its queue.
					 */
					pqtype = m->queue - m->pc;
					if (pqtype != PQ_FREE &&
					    pqtype != PQ_CACHE)
						goto cleanup_freed;
				} else {
					/*
					 * Shared failure path, also entered by
					 * goto from elsewhere in this loop:
					 * give back the pages above index i
					 * that were already claimed, then
					 * resume the search below page i.
					 */
cleanup_freed:
					vm_page_release_contigl(&pga[i + 1],
					    start + npages - 1 - i);
					start = i - npages + 1;
					goto retry;
				}
			}
			if (pqtype == PQ_CACHE) {
				if (m->hold_count != 0)
					goto cleanup_freed;
				object = m->object;
				if (!VM_OBJECT_TRYLOCK(object))
					goto cleanup_freed;
				if ((m->oflags & VPO_BUSY) || m->busy != 0) {
					VM_OBJECT_UNLOCK(object);
					goto cleanup_freed;
				}
				vm_page_free(m);
				VM_OBJECT_UNLOCK(object);
			}
			/*
			 * There is no good API for freeing a page
			 * directly to PQ_NONE on our behalf, so spin.
			 */
			if (vm_contig_unqueue_free(m) != 0)
				goto retry_page;
		}
		/*
		 * We've found a contiguous chunk that meets our requirements.
		 */
		np = npages;
		startl = start;
		vm_page_unlock_queues();
		return (&pga[startl]);
	}
	return (NULL);
}

/*
 * Map the "npages" pages starting at "m" into kernel_map.
 *
 * Finds kernel virtual address space for the run, inserts a kernel_object
 * mapping for it, inserts each page into kernel_object (zeroing it when
 * M_ZERO is requested and the page is not flagged PG_ZERO) and wires the
 * range.
 *
 * Returns the kernel virtual address, or NULL if no address space could be
 * found.  On NULL the pages are not released here; contigmalloc() does that.
 */
static void *
contigmalloc2(vm_page_t m, vm_pindex_t npages, int flags)
{
	vm_object_t object = kernel_object;
	vm_map_t map = kernel_map;
	vm_offset_t addr, tmp_addr;
	vm_pindex_t i;

	/*
	 * Allocate kernel VM, unfree and assign the physical pages to
	 * it and return kernel VM pointer.
	 */
	vm_map_lock(map);
	if (vm_map_findspace(map, vm_map_min(map), npages << PAGE_SHIFT, &addr)
	    != KERN_SUCCESS) {
		vm_map_unlock(map);
		return (NULL);
	}
	vm_object_reference(object);
	vm_map_insert(map, object, addr - VM_MIN_KERNEL_ADDRESS,
	    addr, addr + (npages << PAGE_SHIFT), VM_PROT_ALL, VM_PROT_ALL, 0);
	vm_map_unlock(map);
	tmp_addr = addr;
	VM_OBJECT_LOCK(object);
	for (i = 0; i < npages; i++) {
		vm_page_insert(&m[i], object,
		    OFF_TO_IDX(tmp_addr - VM_MIN_KERNEL_ADDRESS));
		if ((flags & M_ZERO) && !(m[i].flags & PG_ZERO))
			pmap_zero_page(&m[i]);
		tmp_addr += PAGE_SIZE;
	}
	VM_OBJECT_UNLOCK(object);
	vm_map_wire(map, addr, addr + (npages << PAGE_SHIFT),
	    VM_MAP_WIRE_SYSTEM | VM_MAP_WIRE_NOHOLES);
	return ((void *)addr);
}

/*
 * Selects which allocator contigmalloc() uses: non-zero picks
 * contigmalloc1(), zero (the default) picks vm_page_alloc_contig() plus
 * contigmalloc2().  Exposed as the read/write sysctl vm.old_contigmalloc
 * and as a tunable of the same name.
 */
static int vm_old_contigmalloc = 0;
SYSCTL_INT(_vm, OID_AUTO, old_contigmalloc, CTLFLAG_RW,
    &vm_old_contigmalloc, 0, "Use the old contigmalloc algorithm");
TUNABLE_INT("vm.old_contigmalloc", &vm_old_contigmalloc);

/*
 * Allocate "size" bytes (rounded up to whole pages) of physically
 * contiguous, wired kernel memory subject to the low/high/alignment/boundary
 * constraints, which are passed through to the selected allocator.
 *
 * The allocation runs with Giant held.  If the pages are obtained but cannot
 * be mapped, they are released again.  The result is reported to
 * malloc_type_allocated() with the rounded size, or 0 on failure.
 *
 * Returns the kernel virtual address, or NULL on failure.
 */
void *
contigmalloc(
	unsigned long size,	/* should be size_t here and for malloc() */
	struct malloc_type *type,
	int flags,
	vm_paddr_t low,
	vm_paddr_t high,
	unsigned long alignment,
	unsigned long boundary)
{
	void * ret;
	vm_page_t pages;
	vm_pindex_t npgs;

	npgs = round_page(size) >> PAGE_SHIFT;
	mtx_lock(&Giant);
	if (vm_old_contigmalloc) {
		ret = contigmalloc1(size, type, flags, low, high, alignment,
		    boundary, kernel_map);
	} else {
		pages = vm_page_alloc_contig(npgs, low, high,
		    alignment, boundary);
		if (pages == NULL) {
			ret = NULL;
		} else {
			ret = contigmalloc2(pages, npgs, flags);
			/* Mapping failed: give the physical pages back. */
			if (ret == NULL)
				vm_page_release_contig(pages, npgs);
		}
	}
	mtx_unlock(&Giant);
	malloc_type_allocated(type, ret == NULL ? 0 : npgs << PAGE_SHIFT);
	return (ret);
}

/*
 * Release memory obtained from contigmalloc().
 *
 * "addr" and "size" are passed unchanged to kmem_free() on kernel_map; the
 * malloc type statistics are credited with the page-rounded size.
 */
void
contigfree(void *addr, unsigned long size, struct malloc_type *type)
{
	vm_pindex_t npgs;

	npgs = round_page(size) >> PAGE_SHIFT;
	kmem_free(kernel_map, (vm_offset_t)addr, size);
	malloc_type_freed(type, npgs << PAGE_SHIFT);
}