From d1bce06c640268095219967823cfebfd141079a3 Mon Sep 17 00:00:00 2001 From: alc Date: Tue, 25 Sep 2007 06:25:06 +0000 Subject: [PATCH] Change the management of cached pages (PQ_CACHE) in two fundamental ways: (1) Cached pages are no longer kept in the object's resident page splay tree and memq. Instead, they are kept in a separate per-object splay tree of cached pages. However, access to this new per-object splay tree is synchronized by the _free_ page queues lock, not to be confused with the heavily contended page queues lock. Consequently, a cached page can be reclaimed by vm_page_alloc(9) without acquiring the object's lock or the page queues lock. This solves a problem independently reported by tegge@ and Isilon. Specifically, they observed the page daemon consuming a great deal of CPU time because of pages bouncing back and forth between the cache queue (PQ_CACHE) and the inactive queue (PQ_INACTIVE). The source of this problem turned out to be a deadlock avoidance strategy employed when selecting a cached page to reclaim in vm_page_select_cache(). However, the root cause was really that reclaiming a cached page required the acquisition of an object lock while the page queues lock was already held. Thus, this change addresses the problem at its root, by eliminating the need to acquire the object's lock. Moreover, keeping cached pages in the object's primary splay tree and memq was, in effect, optimizing for the uncommon case. Cached pages are reclaimed far, far more often than they are reactivated. Instead, this change makes reclamation cheaper, especially in terms of synchronization overhead, and reactivation more expensive, because reactivated pages will have to be reentered into the object's primary splay tree and memq. (2) Cached pages are now stored alongside free pages in the physical memory allocator's buddy queues, increasing the likelihood that large allocations of contiguous physical memory (i.e., superpages) will succeed. 
Finally, as a result of this change long-standing restrictions on when and where a cached page can be reclaimed and returned by vm_page_alloc(9) are eliminated. Specifically, calls to vm_page_alloc(9) specifying VM_ALLOC_INTERRUPT can now reclaim and return a formerly cached page. Consequently, a call to malloc(9) specifying M_NOWAIT is less likely to fail. Discussed with: many over the course of the summer, including jeff@, Justin Husted @ Isilon, peter@, tegge@ Tested by: an earlier version by kris@ Approved by: re (kensmith) --- sys/amd64/include/vmparam.h | 5 +- sys/arm/include/vmparam.h | 5 +- sys/i386/include/vmparam.h | 5 +- sys/ia64/include/vmparam.h | 5 +- sys/kern/kern_exec.c | 2 +- sys/kern/vfs_bio.c | 14 +- sys/powerpc/include/vmparam.h | 5 +- sys/sparc64/include/vmparam.h | 5 +- sys/sun4v/include/vmparam.h | 5 +- sys/sys/vmmeter.h | 4 +- sys/vm/vm_contig.c | 11 +- sys/vm/vm_fault.c | 29 +-- sys/vm/vm_map.c | 18 +- sys/vm/vm_object.c | 44 +++-- sys/vm/vm_object.h | 1 + sys/vm/vm_page.c | 361 +++++++++++++++++++++++++--------- sys/vm/vm_page.h | 26 ++- sys/vm/vm_pageout.c | 37 +--- sys/vm/vm_pageq.c | 27 --- sys/vm/vm_phys.c | 148 ++++++++++---- sys/vm/vm_phys.h | 3 + 21 files changed, 479 insertions(+), 281 deletions(-) diff --git a/sys/amd64/include/vmparam.h b/sys/amd64/include/vmparam.h index 9bafadeeeb7d..50962e5d0385 100644 --- a/sys/amd64/include/vmparam.h +++ b/sys/amd64/include/vmparam.h @@ -101,12 +101,13 @@ #define VM_PHYSSEG_MAX 31 /* - * Create two free page pools: VM_FREEPOOL_DEFAULT is the default pool + * Create three free page pools: VM_FREEPOOL_DEFAULT is the default pool * from which physical pages are allocated and VM_FREEPOOL_DIRECT is * the pool from which physical pages for page tables and small UMA * objects are allocated. 
*/ -#define VM_NFREEPOOL 2 +#define VM_NFREEPOOL 3 +#define VM_FREEPOOL_CACHE 2 #define VM_FREEPOOL_DEFAULT 0 #define VM_FREEPOOL_DIRECT 1 diff --git a/sys/arm/include/vmparam.h b/sys/arm/include/vmparam.h index 5185a485a181..35b8d41954c5 100644 --- a/sys/arm/include/vmparam.h +++ b/sys/arm/include/vmparam.h @@ -59,12 +59,13 @@ #define VM_PHYSSEG_DENSE /* - * Create two free page pools: VM_FREEPOOL_DEFAULT is the default pool + * Create three free page pools: VM_FREEPOOL_DEFAULT is the default pool * from which physical pages are allocated and VM_FREEPOOL_DIRECT is * the pool from which physical pages for small UMA objects are * allocated. */ -#define VM_NFREEPOOL 2 +#define VM_NFREEPOOL 3 +#define VM_FREEPOOL_CACHE 2 #define VM_FREEPOOL_DEFAULT 0 #define VM_FREEPOOL_DIRECT 1 diff --git a/sys/i386/include/vmparam.h b/sys/i386/include/vmparam.h index 8f97e1b53796..2aa2848a1100 100644 --- a/sys/i386/include/vmparam.h +++ b/sys/i386/include/vmparam.h @@ -93,12 +93,13 @@ #define VM_PHYSSEG_MAX 17 /* - * Create one free page pool. Since the i386 kernel virtual address + * Create two free page pools. Since the i386 kernel virtual address * space does not include a mapping onto the machine's entire physical * memory, VM_FREEPOOL_DIRECT is defined as an alias for the default * pool, VM_FREEPOOL_DEFAULT. */ -#define VM_NFREEPOOL 1 +#define VM_NFREEPOOL 2 +#define VM_FREEPOOL_CACHE 1 #define VM_FREEPOOL_DEFAULT 0 #define VM_FREEPOOL_DIRECT 0 diff --git a/sys/ia64/include/vmparam.h b/sys/ia64/include/vmparam.h index de047bfed45f..c7dac2c8e3d3 100644 --- a/sys/ia64/include/vmparam.h +++ b/sys/ia64/include/vmparam.h @@ -122,12 +122,13 @@ #define VM_PHYSSEG_MAX 49 /* - * Create two free page pools: VM_FREEPOOL_DEFAULT is the default pool + * Create three free page pools: VM_FREEPOOL_DEFAULT is the default pool * from which physical pages are allocated and VM_FREEPOOL_DIRECT is * the pool from which physical pages for small UMA objects are * allocated. 
*/ -#define VM_NFREEPOOL 2 +#define VM_NFREEPOOL 3 +#define VM_FREEPOOL_CACHE 2 #define VM_FREEPOOL_DEFAULT 0 #define VM_FREEPOOL_DIRECT 1 diff --git a/sys/kern/kern_exec.c b/sys/kern/kern_exec.c index b0c107c00616..b3884d063e3d 100644 --- a/sys/kern/kern_exec.c +++ b/sys/kern/kern_exec.c @@ -832,7 +832,7 @@ exec_map_first_page(imgp) vm_page_busy(ma[i]); } else { ma[i] = vm_page_alloc(object, i, - VM_ALLOC_NORMAL); + VM_ALLOC_NORMAL | VM_ALLOC_IFNOTCACHED); if (ma[i] == NULL) break; } diff --git a/sys/kern/vfs_bio.c b/sys/kern/vfs_bio.c index d66b45dce50a..bb457db5f82b 100644 --- a/sys/kern/vfs_bio.c +++ b/sys/kern/vfs_bio.c @@ -2898,7 +2898,8 @@ allocbuf(struct buf *bp, int size) VM_WAIT; VM_OBJECT_LOCK(obj); } else { - bp->b_flags &= ~B_CACHE; + if (m->valid == 0) + bp->b_flags &= ~B_CACHE; bp->b_pages[bp->b_npages] = m; ++bp->b_npages; } @@ -2916,20 +2917,13 @@ allocbuf(struct buf *bp, int size) * vm_fault->getpages->cluster_read->allocbuf * */ - vm_page_lock_queues(); if (vm_page_sleep_if_busy(m, FALSE, "pgtblk")) continue; /* - * We have a good page. Should we wakeup the - * page daemon? + * We have a good page. */ - if ((curproc != pageproc) && - (VM_PAGE_INQUEUE1(m, PQ_CACHE)) && - ((cnt.v_free_count + cnt.v_cache_count) < - (cnt.v_free_min + cnt.v_cache_min))) { - pagedaemon_wakeup(); - } + vm_page_lock_queues(); vm_page_wire(m); vm_page_unlock_queues(); bp->b_pages[bp->b_npages] = m; diff --git a/sys/powerpc/include/vmparam.h b/sys/powerpc/include/vmparam.h index 60e240cb9973..0f8298e05c55 100644 --- a/sys/powerpc/include/vmparam.h +++ b/sys/powerpc/include/vmparam.h @@ -110,12 +110,13 @@ struct pmap_physseg { #define VM_PHYSSEG_DENSE /* - * Create two free page pools: VM_FREEPOOL_DEFAULT is the default pool + * Create three free page pools: VM_FREEPOOL_DEFAULT is the default pool * from which physical pages are allocated and VM_FREEPOOL_DIRECT is * the pool from which physical pages for small UMA objects are * allocated. 
*/ -#define VM_NFREEPOOL 2 +#define VM_NFREEPOOL 3 +#define VM_FREEPOOL_CACHE 2 #define VM_FREEPOOL_DEFAULT 0 #define VM_FREEPOOL_DIRECT 1 diff --git a/sys/sparc64/include/vmparam.h b/sys/sparc64/include/vmparam.h index 5d83f60b1ac6..5609e8e9ec11 100644 --- a/sys/sparc64/include/vmparam.h +++ b/sys/sparc64/include/vmparam.h @@ -91,12 +91,13 @@ #define VM_PHYSSEG_MAX 64 /* - * Create two free page pools: VM_FREEPOOL_DEFAULT is the default pool + * Create three free page pools: VM_FREEPOOL_DEFAULT is the default pool * from which physical pages are allocated and VM_FREEPOOL_DIRECT is * the pool from which physical pages for small UMA objects are * allocated. */ -#define VM_NFREEPOOL 2 +#define VM_NFREEPOOL 3 +#define VM_FREEPOOL_CACHE 2 #define VM_FREEPOOL_DEFAULT 0 #define VM_FREEPOOL_DIRECT 1 diff --git a/sys/sun4v/include/vmparam.h b/sys/sun4v/include/vmparam.h index 42af1dfea9df..a994401eb33a 100644 --- a/sys/sun4v/include/vmparam.h +++ b/sys/sun4v/include/vmparam.h @@ -91,12 +91,13 @@ #define VM_PHYSSEG_MAX 64 /* - * Create two free page pools: VM_FREEPOOL_DEFAULT is the default pool + * Create three free page pools: VM_FREEPOOL_DEFAULT is the default pool * from which physical pages are allocated and VM_FREEPOOL_DIRECT is * the pool from which physical pages for small UMA objects are * allocated. 
*/ -#define VM_NFREEPOOL 2 +#define VM_NFREEPOOL 3 +#define VM_FREEPOOL_CACHE 2 #define VM_FREEPOOL_DEFAULT 0 #define VM_FREEPOOL_DIRECT 1 diff --git a/sys/sys/vmmeter.h b/sys/sys/vmmeter.h index bdd196464357..2b4794a44a8e 100644 --- a/sys/sys/vmmeter.h +++ b/sys/sys/vmmeter.h @@ -68,7 +68,7 @@ struct vmmeter { u_int v_vnodepgsin; /* (p) vnode_pager pages paged in */ u_int v_vnodepgsout; /* (p) vnode pager pages paged out */ u_int v_intrans; /* (p) intransit blocking page faults */ - u_int v_reactivated; /* (q) pages reactivated from free list */ + u_int v_reactivated; /* (f) pages reactivated from free list */ u_int v_pdwakeups; /* (f) times daemon has awaken from sleep */ u_int v_pdpages; /* (q) pages analyzed by daemon */ @@ -89,7 +89,7 @@ struct vmmeter { u_int v_active_count; /* (q) pages active */ u_int v_inactive_target; /* (c) pages desired inactive */ u_int v_inactive_count; /* (q) pages inactive */ - u_int v_cache_count; /* (q) pages on buffer cache queue */ + u_int v_cache_count; /* (f) pages on buffer cache queue */ u_int v_cache_min; /* (c) min pages desired on cache queue */ u_int v_cache_max; /* (c) max pages in cached obj */ u_int v_pageout_free_min; /* (c) min pages reserved for kernel */ diff --git a/sys/vm/vm_contig.c b/sys/vm/vm_contig.c index 955df304c102..b40a9514f1ba 100644 --- a/sys/vm/vm_contig.c +++ b/sys/vm/vm_contig.c @@ -231,8 +231,7 @@ contigmalloc( unsigned long boundary) { void * ret; - vm_object_t object; - vm_page_t m, m_next, pages; + vm_page_t pages; unsigned long npgs; int actl, actmax, inactl, inactmax, tries; @@ -258,14 +257,6 @@ again: actl++; goto again; } - TAILQ_FOREACH_SAFE(m, &vm_page_queues[PQ_CACHE].pl, - pageq, m_next) { - if (m->hold_count == 0 && - VM_OBJECT_TRYLOCK(object = m->object)) { - vm_page_free(m); - VM_OBJECT_UNLOCK(object); - } - } vm_page_unlock_queues(); tries++; goto retry; diff --git a/sys/vm/vm_fault.c b/sys/vm/vm_fault.c index 8b843dc9bddb..b26972e96f81 100644 --- a/sys/vm/vm_fault.c +++ 
b/sys/vm/vm_fault.c @@ -328,8 +328,6 @@ RetryFault:; */ fs.m = vm_page_lookup(fs.object, fs.pindex); if (fs.m != NULL) { - int queue; - /* * check for page-based copy on write. * We check fs.object == fs.first_object so @@ -398,20 +396,7 @@ RetryFault:; vm_object_deallocate(fs.first_object); goto RetryFault; } - queue = fs.m->queue; - - vm_pageq_remove_nowakeup(fs.m); - - if (VM_PAGE_RESOLVEQUEUE(fs.m, queue) == PQ_CACHE) { - cnt.v_reactivated++; - if (vm_page_count_severe()) { - vm_page_activate(fs.m); - vm_page_unlock_queues(); - unlock_and_deallocate(&fs); - VM_WAITPFAULT; - goto RetryFault; - } - } + vm_pageq_remove(fs.m); vm_page_unlock_queues(); /* @@ -446,6 +431,8 @@ RetryFault:; if (!vm_page_count_severe()) { fs.m = vm_page_alloc(fs.object, fs.pindex, (fs.vp || fs.object->backing_object)? VM_ALLOC_NORMAL: VM_ALLOC_ZERO); + if (fs.m != NULL && (fs.m->valid & VM_PAGE_BITS_ALL) == VM_PAGE_BITS_ALL) + break; /* reactivated cached page is already fully valid */ } if (fs.m == NULL) { unlock_and_deallocate(&fs); @@ -993,9 +980,7 @@ vm_fault_prefault(pmap_t pmap, vm_offset_t addra, vm_map_entry_t entry) (m->flags & PG_FICTITIOUS) == 0) { vm_page_lock_queues(); - if (!VM_PAGE_INQUEUE1(m, PQ_CACHE)) - pmap_enter_quick(pmap, addr, m, - entry->protection); + pmap_enter_quick(pmap, addr, m, entry->protection); vm_page_unlock_queues(); } VM_OBJECT_UNLOCK(lobject); @@ -1273,7 +1258,8 @@ vm_fault_additional_pages(m, rbehind, rahead, marray, reqpage) for (i = 0, tpindex = pindex - 1; tpindex >= startpindex && tpindex < pindex; i++, tpindex--) { - rtm = vm_page_alloc(object, tpindex, VM_ALLOC_NORMAL); + rtm = vm_page_alloc(object, tpindex, VM_ALLOC_NORMAL | + VM_ALLOC_IFNOTCACHED); if (rtm == NULL) { /* * Shift the allocated pages to the @@ -1311,7 +1297,8 @@ vm_fault_additional_pages(m, rbehind, rahead, marray, reqpage) for (; tpindex < endpindex; i++, tpindex++) { - rtm = vm_page_alloc(object, tpindex, VM_ALLOC_NORMAL); + rtm = vm_page_alloc(object, tpindex, VM_ALLOC_NORMAL | + VM_ALLOC_IFNOTCACHED); if (rtm == NULL) { break; } diff --git
a/sys/vm/vm_map.c b/sys/vm/vm_map.c index 905201ee46bb..cc6628b75451 100644 --- a/sys/vm/vm_map.c +++ b/sys/vm/vm_map.c @@ -1518,28 +1518,24 @@ vm_map_pmap_enter(vm_map_t map, vm_offset_t addr, vm_prot_t prot, start = addr + ptoa(tmpidx); p_start = p; } + } else if (p_start != NULL) { if (!are_queues_locked) { are_queues_locked = TRUE; vm_page_lock_queues(); } - if (VM_PAGE_INQUEUE1(p, PQ_CACHE)) { - if ((flags & MAP_PREFAULT_MADVISE) != 0) - vm_page_deactivate(p); - else if (p_start != NULL) { - pmap_enter_object(map->pmap, start, addr + - ptoa(tmpidx), p_start, prot); - p_start = NULL; - } - } - } else if (p_start != NULL) { pmap_enter_object(map->pmap, start, addr + ptoa(tmpidx), p_start, prot); p_start = NULL; } } - if (p_start != NULL) + if (p_start != NULL) { + if (!are_queues_locked) { + are_queues_locked = TRUE; + vm_page_lock_queues(); + } pmap_enter_object(map->pmap, start, addr + ptoa(psize), p_start, prot); + } if (are_queues_locked) vm_page_unlock_queues(); unlock_return: diff --git a/sys/vm/vm_object.c b/sys/vm/vm_object.c index 47416389f902..0d2d61c49e92 100644 --- a/sys/vm/vm_object.c +++ b/sys/vm/vm_object.c @@ -170,6 +170,9 @@ vm_object_zdtor(void *mem, int size, void *arg) KASSERT(TAILQ_EMPTY(&object->memq), ("object %p has resident pages", object)); + KASSERT(object->cache == NULL, + ("object %p has cached pages", + object)); KASSERT(object->paging_in_progress == 0, ("object %p paging_in_progress = %d", object, object->paging_in_progress)); @@ -217,6 +220,7 @@ _vm_object_allocate(objtype_t type, vm_pindex_t size, vm_object_t object) object->handle = NULL; object->backing_object = NULL; object->backing_object_offset = (vm_ooffset_t) 0; + object->cache = NULL; mtx_lock(&vm_object_list_mtx); TAILQ_INSERT_TAIL(&vm_object_list, object, object_list); @@ -648,6 +652,9 @@ vm_object_terminate(vm_object_t object) } vm_page_unlock_queues(); + if (__predict_false(object->cache != NULL)) + vm_page_cache_free(object); + /* * Let the pager know object is dead. 
*/ @@ -732,8 +739,7 @@ vm_object_page_clean(vm_object_t object, vm_pindex_t start, vm_pindex_t end, int while (tscan < tend) { curgeneration = object->generation; p = vm_page_lookup(object, tscan); - if (p == NULL || p->valid == 0 || - VM_PAGE_INQUEUE1(p, PQ_CACHE)) { + if (p == NULL || p->valid == 0) { if (--scanlimit == 0) break; ++tscan; @@ -821,8 +827,7 @@ again: pi = p->pindex; if ((p->oflags & VPO_CLEANCHK) == 0 || (pi < tstart) || (pi >= tend) || - (p->valid == 0) || - VM_PAGE_INQUEUE1(p, PQ_CACHE)) { + p->valid == 0) { p->oflags &= ~VPO_CLEANCHK; continue; } @@ -900,10 +905,6 @@ vm_object_page_collect_flush(vm_object_t object, vm_page_t p, int curgeneration, (tp->oflags & VPO_CLEANCHK) == 0) || (tp->busy != 0)) break; - if (VM_PAGE_INQUEUE1(tp, PQ_CACHE)) { - tp->oflags &= ~VPO_CLEANCHK; - break; - } vm_page_test_dirty(tp); if ((tp->dirty & tp->valid) == 0) { tp->oflags &= ~VPO_CLEANCHK; @@ -928,10 +929,6 @@ vm_object_page_collect_flush(vm_object_t object, vm_page_t p, int curgeneration, (tp->oflags & VPO_CLEANCHK) == 0) || (tp->busy != 0)) break; - if (VM_PAGE_INQUEUE1(tp, PQ_CACHE)) { - tp->oflags &= ~VPO_CLEANCHK; - break; - } vm_page_test_dirty(tp); if ((tp->dirty & tp->valid) == 0) { tp->oflags &= ~VPO_CLEANCHK; @@ -1104,6 +1101,12 @@ shadowlookup: } } m = vm_page_lookup(tobject, tpindex); + if (m == NULL && advise == MADV_WILLNEED) { + /* + * If the page is cached, reactivate it (NOBUSY: nothing here would unbusy it). + */ + m = vm_page_alloc(tobject, tpindex, VM_ALLOC_IFCACHED | VM_ALLOC_NOBUSY); + } if (m == NULL) { /* * There may be swap even if there is no backing page @@ -1356,6 +1359,13 @@ retry: * and new_object's locks are released and reacquired. */ swap_pager_copy(orig_object, new_object, offidxstart, 0); + + /* + * Transfer any cached pages from orig_object to new_object.
+ */ + if (__predict_false(orig_object->cache != NULL)) + vm_page_cache_transfer(orig_object, offidxstart, + new_object); } VM_OBJECT_UNLOCK(orig_object); TAILQ_FOREACH(m, &new_object->memq, listq) @@ -1390,8 +1400,8 @@ vm_object_backing_scan(vm_object_t object, int op) */ if (op & OBSC_TEST_ALL_SHADOWED) { /* - * We do not want to have to test for the existence of - * swap pages in the backing object. XXX but with the + * We do not want to have to test for the existence of cache + * or swap pages in the backing object. XXX but with the * new swapper this would be pretty easy to do. * * XXX what about anonymous MAP_SHARED memory that hasn't @@ -1664,6 +1674,12 @@ vm_object_collapse(vm_object_t object) backing_object, object, OFF_TO_IDX(object->backing_object_offset), TRUE); + + /* + * Free any cached pages from backing_object. + */ + if (__predict_false(backing_object->cache != NULL)) + vm_page_cache_free(backing_object); } /* * Object now shadows whatever backing_object did. diff --git a/sys/vm/vm_object.h b/sys/vm/vm_object.h index 5a3f3c764097..b0da10b01284 100644 --- a/sys/vm/vm_object.h +++ b/sys/vm/vm_object.h @@ -100,6 +100,7 @@ struct vm_object { struct vm_object *backing_object; /* object that I'm a shadow of */ vm_ooffset_t backing_object_offset;/* Offset in backing object */ TAILQ_ENTRY(vm_object) pager_object_list; /* list of all objects of this pager type */ + vm_page_t cache; /* root of the cache page splay tree */ void *handle; union { /* diff --git a/sys/vm/vm_page.c b/sys/vm/vm_page.c index 6504f604c1c7..36fee281a157 100644 --- a/sys/vm/vm_page.c +++ b/sys/vm/vm_page.c @@ -547,7 +547,7 @@ vm_page_sleep(vm_page_t m, const char *msg) void vm_page_dirty(vm_page_t m) { - KASSERT(VM_PAGE_GETKNOWNQUEUE1(m) != PQ_CACHE, + KASSERT((m->flags & PG_CACHED) == 0, ("vm_page_dirty: page in cache!")); KASSERT(!VM_PAGE_IS_FREE(m), ("vm_page_dirty: page is free!")); @@ -790,50 +790,165 @@ vm_page_rename(vm_page_t m, vm_object_t new_object, vm_pindex_t new_pindex) 
vm_page_remove(m); vm_page_insert(m, new_object, new_pindex); - if (VM_PAGE_INQUEUE1(m, PQ_CACHE)) - vm_page_deactivate(m); vm_page_dirty(m); } /* - * vm_page_select_cache: - * - * Move a page of the given color from the cache queue to the free - * queue. As pages might be found, but are not applicable, they are - * deactivated. - * - * This routine may not block. + * Convert all of the cached pages belonging to the given object + * into free pages. If the given object has cached pages and is + * backed by a vnode, reduce the vnode's hold count. */ -vm_page_t -vm_page_select_cache(void) +void +vm_page_cache_free(vm_object_t object) { - vm_object_t object; - vm_page_t m; - boolean_t was_trylocked; + vm_page_t m, root; + boolean_t empty; - mtx_assert(&vm_page_queue_mtx, MA_OWNED); - while ((m = TAILQ_FIRST(&vm_page_queues[PQ_CACHE].pl)) != NULL) { - KASSERT(m->dirty == 0, ("Found dirty cache page %p", m)); - KASSERT(!pmap_page_is_mapped(m), - ("Found mapped cache page %p", m)); - KASSERT((m->flags & PG_UNMANAGED) == 0, - ("Found unmanaged cache page %p", m)); - KASSERT(m->wire_count == 0, ("Found wired cache page %p", m)); - if (m->hold_count == 0 && (object = m->object, - (was_trylocked = VM_OBJECT_TRYLOCK(object)) || - VM_OBJECT_LOCKED(object))) { - KASSERT((m->oflags & VPO_BUSY) == 0 && m->busy == 0, - ("Found busy cache page %p", m)); - vm_page_free(m); - if (was_trylocked) - VM_OBJECT_UNLOCK(object); - break; + mtx_lock(&vm_page_queue_free_mtx); + empty = object->cache == NULL; + while ((m = object->cache) != NULL) { + if (m->left == NULL) + root = m->right; + else if (m->right == NULL) + root = m->left; + else { + root = vm_page_splay(m->pindex, m->left); + root->right = m->right; } - vm_page_deactivate(m); + m->object->cache = root; + m->object = NULL; + m->valid = 0; + /* Clear PG_CACHED and set PG_FREE. 
*/ + m->flags ^= PG_CACHED | PG_FREE; + KASSERT((m->flags & (PG_CACHED | PG_FREE)) == PG_FREE, + ("vm_page_cache_free: page %p has inconsistent flags", m)); + cnt.v_cache_count--; + cnt.v_free_count++; + } + mtx_unlock(&vm_page_queue_free_mtx); + if (object->type == OBJT_VNODE && !empty) + vdrop(object->handle); +} + +/* + * Returns the cached page that is associated with the given + * object and offset. If, however, none exists, returns NULL. + * + * The free page queue must be locked. + */ +static inline vm_page_t +vm_page_cache_lookup(vm_object_t object, vm_pindex_t pindex) +{ + vm_page_t m; + + mtx_assert(&vm_page_queue_free_mtx, MA_OWNED); + if ((m = object->cache) != NULL && m->pindex != pindex) { + m = vm_page_splay(pindex, m); + if ((object->cache = m)->pindex != pindex) + m = NULL; } return (m); } +/* + * Remove the given cached page from its containing object's + * collection of cached pages. + * + * The free page queue must be locked. + */ +void +vm_page_cache_remove(vm_page_t m) +{ + vm_object_t object; + vm_page_t root; + + mtx_assert(&vm_page_queue_free_mtx, MA_OWNED); + KASSERT((m->flags & PG_CACHED) != 0, + ("vm_page_cache_remove: page %p is not cached", m)); + object = m->object; + if (m != object->cache) { + root = vm_page_splay(m->pindex, object->cache); + KASSERT(root == m, + ("vm_page_cache_remove: page %p is not cached in object %p", + m, object)); + } + if (m->left == NULL) + root = m->right; + else if (m->right == NULL) + root = m->left; + else { + root = vm_page_splay(m->pindex, m->left); + root->right = m->right; + } + object->cache = root; + m->object = NULL; + cnt.v_cache_count--; +} + +/* + * Transfer all of the cached pages with offset greater than or + * equal to 'offidxstart' from the original object's cache to the + * new object's cache. Initially, the new object's cache must be + * empty. Offset 'offidxstart' in the original object must + * correspond to offset zero in the new object. + * + * The new object must be locked. 
+ */ +void +vm_page_cache_transfer(vm_object_t orig_object, vm_pindex_t offidxstart, + vm_object_t new_object) +{ + vm_page_t m, m_next; + + /* + * Insertion into an object's collection of cached pages + * requires the object to be locked. In contrast, removal does + * not. + */ + VM_OBJECT_LOCK_ASSERT(new_object, MA_OWNED); + KASSERT(new_object->cache == NULL, + ("vm_page_cache_transfer: object %p has cached pages", + new_object)); + mtx_lock(&vm_page_queue_free_mtx); + if ((m = orig_object->cache) != NULL) { + /* + * Transfer all of the pages with offset greater than or + * equal to 'offidxstart' from the original object's + * cache to the new object's cache. + */ + m = vm_page_splay(offidxstart, m); + if (m->pindex < offidxstart) { + orig_object->cache = m; + new_object->cache = m->right; + m->right = NULL; + } else { + orig_object->cache = m->left; + new_object->cache = m; + m->left = NULL; + } + KASSERT(new_object->cache == NULL || + new_object->type == OBJT_SWAP, + ("vm_page_cache_transfer: object %p's type is incompatible" + " with cached pages", new_object)); + + /* + * Update the object and offset of each page that was + * transferred to the new object's cache. + */ + while ((m = new_object->cache) != NULL) { + m_next = vm_page_splay(m->pindex, m->right); + m->object = new_object; + m->pindex -= offidxstart; + if (m_next == NULL) + break; + m->right = NULL; + m_next->left = m; + new_object->cache = m_next; + } + } + mtx_unlock(&vm_page_queue_free_mtx); +} + /* * vm_page_alloc: * @@ -847,15 +962,13 @@ vm_page_select_cache(void) * VM_ALLOC_ZERO zero page * * This routine may not block. - * - * Additional special handling is required when called from an - * interrupt (VM_ALLOC_INTERRUPT). We are not allowed to mess with - * the page cache in this case. 
*/ vm_page_t vm_page_alloc(vm_object_t object, vm_pindex_t pindex, int req) { - vm_page_t m = NULL; + struct vnode *vp = NULL; + vm_object_t m_object; + vm_page_t m; int flags, page_req; page_req = req & VM_ALLOC_CLASS_MASK; @@ -876,52 +989,32 @@ vm_page_alloc(vm_object_t object, vm_pindex_t pindex, int req) page_req = VM_ALLOC_SYSTEM; }; -loop: mtx_lock(&vm_page_queue_free_mtx); - if (cnt.v_free_count > cnt.v_free_reserved || + if (cnt.v_free_count + cnt.v_cache_count > cnt.v_free_reserved || (page_req == VM_ALLOC_SYSTEM && - cnt.v_cache_count == 0 && - cnt.v_free_count > cnt.v_interrupt_free_min) || - (page_req == VM_ALLOC_INTERRUPT && cnt.v_free_count > 0)) { + cnt.v_free_count + cnt.v_cache_count > cnt.v_interrupt_free_min) || + (page_req == VM_ALLOC_INTERRUPT && + cnt.v_free_count + cnt.v_cache_count > 0)) { /* * Allocate from the free queue if the number of free pages * exceeds the minimum for the request class. */ - m = vm_phys_alloc_pages(object != NULL ? - VM_FREEPOOL_DEFAULT : VM_FREEPOOL_DIRECT, 0); - } else if (page_req != VM_ALLOC_INTERRUPT) { - mtx_unlock(&vm_page_queue_free_mtx); - /* - * Allocatable from cache (non-interrupt only). On success, - * we must free the page and try again, thus ensuring that - * cnt.v_*_free_min counters are replenished. 
- */ - vm_page_lock_queues(); - if ((m = vm_page_select_cache()) == NULL) { - KASSERT(cnt.v_cache_count == 0, - ("vm_page_alloc: cache queue is missing %d pages", - cnt.v_cache_count)); - vm_page_unlock_queues(); - atomic_add_int(&vm_pageout_deficit, 1); - pagedaemon_wakeup(); - - if (page_req != VM_ALLOC_SYSTEM) - return (NULL); - - mtx_lock(&vm_page_queue_free_mtx); - if (cnt.v_free_count <= cnt.v_interrupt_free_min) { + if (object != NULL && + (m = vm_page_cache_lookup(object, pindex)) != NULL) { + if ((req & VM_ALLOC_IFNOTCACHED) != 0) { mtx_unlock(&vm_page_queue_free_mtx); return (NULL); } + vm_phys_unfree_page(m); + } else if ((req & VM_ALLOC_IFCACHED) != 0) { + mtx_unlock(&vm_page_queue_free_mtx); + return (NULL); + } else m = vm_phys_alloc_pages(object != NULL ? VM_FREEPOOL_DEFAULT : VM_FREEPOOL_DIRECT, 0); - } else { - vm_page_unlock_queues(); - goto loop; - } } else { /* - * Not allocatable from cache from interrupt, give up. + * Not allocatable, give up. */ mtx_unlock(&vm_page_queue_free_mtx); atomic_add_int(&vm_pageout_deficit, 1); @@ -937,8 +1030,24 @@ loop: m != NULL, ("vm_page_alloc(): missing page on free queue") ); - KASSERT(VM_PAGE_IS_FREE(m), - ("vm_page_alloc: page %p is not free", m)); + if ((m->flags & PG_CACHED) != 0) { + KASSERT(m->valid != 0, + ("vm_page_alloc: cached page %p is invalid", m)); + if (m->object == object && m->pindex == pindex) + cnt.v_reactivated++; + else + m->valid = 0; + m_object = m->object; + vm_page_cache_remove(m); + if (m_object->type == OBJT_VNODE && m_object->cache == NULL) + vp = m_object->handle; + } else { + KASSERT(VM_PAGE_IS_FREE(m), + ("vm_page_alloc: page %p is not free", m)); + KASSERT(m->valid == 0, + ("vm_page_alloc: free page %p is valid", m)); + cnt.v_free_count--; + } /* * Initialize structure. Only the PG_ZERO flag is inherited. 
@@ -964,7 +1073,6 @@ loop: m->hold_count = 0; m->act_count = 0; m->busy = 0; - m->valid = 0; KASSERT(m->dirty == 0, ("vm_page_alloc: free/cache page %p was dirty", m)); mtx_unlock(&vm_page_queue_free_mtx); @@ -973,6 +1081,15 @@ loop: else m->pindex = pindex; + /* + * The following call to vdrop() must come after the above call + * to vm_page_insert() in case both affect the same object and + * vnode. Otherwise, the affected vnode's hold count could + * temporarily become zero. + */ + if (vp != NULL) + vdrop(vp); + /* * Don't wakeup too often - wakeup the pageout daemon when * we would be nearly out of memory. @@ -1047,8 +1164,6 @@ vm_page_activate(vm_page_t m) mtx_assert(&vm_page_queue_mtx, MA_OWNED); if (VM_PAGE_GETKNOWNQUEUE2(m) != PQ_ACTIVE) { - if (VM_PAGE_INQUEUE1(m, PQ_CACHE)) - cnt.v_reactivated++; vm_pageq_remove(m); if (m->wire_count == 0 && (m->flags & PG_UNMANAGED) == 0) { if (m->act_count < ACT_INIT) @@ -1133,7 +1248,7 @@ vm_page_free_toq(vm_page_t m) * callback routine until after we've put the page on the * appropriate free queue. 
*/ - vm_pageq_remove_nowakeup(m); + vm_pageq_remove(m); vm_page_remove(m); /* @@ -1160,6 +1275,7 @@ vm_page_free_toq(vm_page_t m) } else { m->flags |= PG_FREE; mtx_lock(&vm_page_queue_free_mtx); + cnt.v_free_count++; if ((m->flags & PG_ZERO) != 0) { vm_phys_free_pages(m, 0); ++vm_page_zero_count; @@ -1279,8 +1395,6 @@ _vm_page_deactivate(vm_page_t m, int athead) if (VM_PAGE_INQUEUE2(m, PQ_INACTIVE)) return; if (m->wire_count == 0 && (m->flags & PG_UNMANAGED) == 0) { - if (VM_PAGE_INQUEUE1(m, PQ_CACHE)) - cnt.v_reactivated++; vm_page_flag_clear(m, PG_WINATCFLS); vm_pageq_remove(m); if (athead) @@ -1354,15 +1468,26 @@ vm_page_try_to_free(vm_page_t m) void vm_page_cache(vm_page_t m) { + vm_object_t object; + vm_page_t root; mtx_assert(&vm_page_queue_mtx, MA_OWNED); - VM_OBJECT_LOCK_ASSERT(m->object, MA_OWNED); + object = m->object; + VM_OBJECT_LOCK_ASSERT(object, MA_OWNED); if ((m->flags & PG_UNMANAGED) || (m->oflags & VPO_BUSY) || m->busy || m->hold_count || m->wire_count) { panic("vm_page_cache: attempting to cache busy page"); } - if (VM_PAGE_INQUEUE1(m, PQ_CACHE)) + if (m->valid == 0 || object->type == OBJT_DEFAULT) { + /* + * Hypothesis: A cache-eligible page belonging to a + * default object must be zero filled. + */ + vm_page_free(m); return; + } + KASSERT((m->flags & PG_CACHED) == 0, + ("vm_page_cache: page %p is already cached", m)); cnt.v_tcached++; /* @@ -1374,11 +1499,72 @@ vm_page_cache(vm_page_t m) panic("vm_page_cache: caching a dirty page, pindex: %ld", (long)m->pindex); } - vm_pageq_remove_nowakeup(m); - vm_pageq_enqueue(PQ_CACHE, m); + + /* + * Remove the page from the paging queues. + */ + vm_pageq_remove(m); + + /* + * Remove the page from the object's collection of resident + * pages.
+ */ + if (m != object->root) + vm_page_splay(m->pindex, object->root); + if (m->left == NULL) + root = m->right; + else { + root = vm_page_splay(m->pindex, m->left); + root->right = m->right; + } + object->root = root; + TAILQ_REMOVE(&object->memq, m, listq); + object->resident_page_count--; + object->generation++; + + /* + * Insert the page into the object's collection of cached pages + * and the physical memory allocator's cache/free page queues. + */ + vm_page_flag_set(m, PG_CACHED); + vm_page_flag_clear(m, PG_ZERO); mtx_lock(&vm_page_queue_free_mtx); + vm_phys_set_pool(VM_FREEPOOL_CACHE, m, 0); + cnt.v_cache_count++; + root = object->cache; + if (root == NULL) { + m->left = NULL; + m->right = NULL; + } else { + root = vm_page_splay(m->pindex, root); + if (m->pindex < root->pindex) { + m->left = root->left; + m->right = root; + root->left = NULL; + } else if (__predict_false(m->pindex == root->pindex)) + panic("vm_page_cache: offset already cached"); + else { + m->right = root->right; + m->left = root; + root->right = NULL; + } + } + object->cache = m; + vm_phys_free_pages(m, 0); vm_page_free_wakeup(); mtx_unlock(&vm_page_queue_free_mtx); + + /* + * Increment the vnode's hold count if this is the object's only + * cached page. Decrement the vnode's hold count if this was + * the object's only resident page. 
+ */ + if (object->type == OBJT_VNODE) { + if (root == NULL && object->resident_page_count != 0) + vhold(object->handle); + else if (root != NULL && object->resident_page_count == 0) + vdrop(object->handle); + } } /* @@ -1416,9 +1602,7 @@ vm_page_dontneed(vm_page_t m) * occassionally leave the page alone */ if ((dnw & 0x01F0) == 0 || - VM_PAGE_INQUEUE2(m, PQ_INACTIVE) || - VM_PAGE_INQUEUE1(m, PQ_CACHE) - ) { + VM_PAGE_INQUEUE2(m, PQ_INACTIVE)) { if (m->act_count >= ACT_INIT) --m->act_count; return; @@ -1482,7 +1666,8 @@ retrylookup: if ((allocflags & VM_ALLOC_RETRY) == 0) return (NULL); goto retrylookup; - } + } else if (m->valid != 0) + return (m); if (allocflags & VM_ALLOC_ZERO && (m->flags & PG_ZERO) == 0) pmap_zero_page(m); return (m); @@ -1813,7 +1998,7 @@ DB_SHOW_COMMAND(pageq, vm_page_print_pageq_info) db_printf("\n"); db_printf("PQ_CACHE:"); - db_printf(" %d", *vm_page_queues[PQ_CACHE].cnt); + db_printf(" %d", cnt.v_cache_count); db_printf("\n"); db_printf("PQ_ACTIVE: %d, PQ_INACTIVE: %d\n", diff --git a/sys/vm/vm_page.h b/sys/vm/vm_page.h index b06a19e1d843..3ed2f75c7166 100644 --- a/sys/vm/vm_page.h +++ b/sys/vm/vm_page.h @@ -160,27 +160,20 @@ CTASSERT(sizeof(u_long) >= 8); #define PQ_NONE 0 #define PQ_INACTIVE 1 #define PQ_ACTIVE 2 -#define PQ_CACHE 3 -#define PQ_HOLD 4 -#define PQ_COUNT 5 -#define PQ_MAXCOUNT 5 +#define PQ_HOLD 3 +#define PQ_COUNT 4 +#define PQ_MAXCOUNT 4 /* Returns the real queue a page is on. */ #define VM_PAGE_GETQUEUE(m) ((m)->queue) /* Returns the well known queue a page is on. */ -#define VM_PAGE_GETKNOWNQUEUE1(m) VM_PAGE_GETQUEUE(m) #define VM_PAGE_GETKNOWNQUEUE2(m) VM_PAGE_GETQUEUE(m) -/* Given the real queue number and a page color return the well know queue. */ -#define VM_PAGE_RESOLVEQUEUE(m, q) (q) - /* Returns true if the page is in the named well known queue. 
*/ -#define VM_PAGE_INQUEUE1(m, q) (VM_PAGE_GETKNOWNQUEUE1(m) == (q)) #define VM_PAGE_INQUEUE2(m, q) (VM_PAGE_GETKNOWNQUEUE2(m) == (q)) /* Sets the queue a page is on. */ -#define VM_PAGE_SETQUEUE1(m, q) (VM_PAGE_GETQUEUE(m) = (q)) #define VM_PAGE_SETQUEUE2(m, q) (VM_PAGE_GETQUEUE(m) = (q)) struct vpgqueues { @@ -201,6 +194,7 @@ extern struct mtx vm_page_queue_free_mtx; * pte mappings, nor can they be removed from their objects via * the object, and such pages are also not on any PQ queue. */ +#define PG_CACHED 0x0001 /* page is cached */ #define PG_FREE 0x0002 /* page is free */ #define PG_WINATCFLS 0x0004 /* flush dirty page on inactive q */ #define PG_FICTITIOUS 0x0008 /* physical page doesn't exist (O) */ @@ -230,9 +224,8 @@ extern struct mtx vm_page_queue_free_mtx; * Available for allocation now. * * cache - * Almost available for allocation. Still in an - * object, but clean and immediately freeable at - * non-interrupt times. + * Almost available for allocation. Still associated with + * an object, but clean and immediately freeable. 
* * hold * Will become free after a pending I/O operation @@ -302,6 +295,8 @@ extern struct mtx vm_page_queue_mtx; #define VM_ALLOC_RETRY 0x0080 /* vm_page_grab() only */ #define VM_ALLOC_NOOBJ 0x0100 /* No associated object */ #define VM_ALLOC_NOBUSY 0x0200 /* Do not busy the page */ +#define VM_ALLOC_IFCACHED 0x0400 /* Fail if the page is not cached */ +#define VM_ALLOC_IFNOTCACHED 0x0800 /* Fail if the page is cached */ void vm_page_flag_set(vm_page_t m, unsigned short bits); void vm_page_flag_clear(vm_page_t m, unsigned short bits); @@ -318,7 +313,6 @@ void vm_page_wakeup(vm_page_t m); void vm_pageq_init(void); void vm_pageq_enqueue(int queue, vm_page_t m); -void vm_pageq_remove_nowakeup(vm_page_t m); void vm_pageq_remove(vm_page_t m); void vm_pageq_requeue(vm_page_t m); @@ -326,6 +320,9 @@ void vm_page_activate (vm_page_t); vm_page_t vm_page_alloc (vm_object_t, vm_pindex_t, int); vm_page_t vm_page_grab (vm_object_t, vm_pindex_t, int); void vm_page_cache (register vm_page_t); +void vm_page_cache_free(vm_object_t); +void vm_page_cache_remove(vm_page_t); +void vm_page_cache_transfer(vm_object_t, vm_pindex_t, vm_object_t); int vm_page_try_to_cache (vm_page_t); int vm_page_try_to_free (vm_page_t); void vm_page_dontneed (register vm_page_t); @@ -334,7 +331,6 @@ void vm_page_insert (vm_page_t, vm_object_t, vm_pindex_t); vm_page_t vm_page_lookup (vm_object_t, vm_pindex_t); void vm_page_remove (vm_page_t); void vm_page_rename (vm_page_t, vm_object_t, vm_pindex_t); -vm_page_t vm_page_select_cache(void); void vm_page_sleep(vm_page_t m, const char *msg); vm_page_t vm_page_splay(vm_pindex_t, vm_page_t); vm_offset_t vm_page_startup(vm_offset_t vaddr); diff --git a/sys/vm/vm_pageout.c b/sys/vm/vm_pageout.c index 94788091c8de..2c267271b325 100644 --- a/sys/vm/vm_pageout.c +++ b/sys/vm/vm_pageout.c @@ -342,8 +342,7 @@ more: ib = 0; break; } - if (VM_PAGE_INQUEUE1(p, PQ_CACHE) || - (p->oflags & VPO_BUSY) || p->busy) { + if ((p->oflags & VPO_BUSY) || p->busy) { ib = 0; break; } 
@@ -372,8 +371,7 @@ more: if ((p = vm_page_lookup(object, pindex + is)) == NULL) break; - if (VM_PAGE_INQUEUE1(p, PQ_CACHE) || - (p->oflags & VPO_BUSY) || p->busy) { + if ((p->oflags & VPO_BUSY) || p->busy) { break; } vm_page_test_dirty(p); @@ -1139,37 +1137,6 @@ unlock_and_continue: VM_OBJECT_UNLOCK(object); m = next; } - - /* - * We try to maintain some *really* free pages, this allows interrupt - * code to be guaranteed space. Since both cache and free queues - * are considered basically 'free', moving pages from cache to free - * does not effect other calculations. - */ - while (cnt.v_free_count < cnt.v_free_reserved) { - TAILQ_FOREACH(m, &vm_page_queues[PQ_CACHE].pl, pageq) { - KASSERT(m->dirty == 0, - ("Found dirty cache page %p", m)); - KASSERT(!pmap_page_is_mapped(m), - ("Found mapped cache page %p", m)); - KASSERT((m->flags & PG_UNMANAGED) == 0, - ("Found unmanaged cache page %p", m)); - KASSERT(m->wire_count == 0, - ("Found wired cache page %p", m)); - if (m->hold_count == 0 && VM_OBJECT_TRYLOCK(object = - m->object)) { - KASSERT((m->oflags & VPO_BUSY) == 0 && - m->busy == 0, ("Found busy cache page %p", - m)); - vm_page_free(m); - VM_OBJECT_UNLOCK(object); - cnt.v_dfree++; - break; - } - } - if (m == NULL) - break; - } vm_page_unlock_queues(); #if !defined(NO_SWAPPING) /* diff --git a/sys/vm/vm_pageq.c b/sys/vm/vm_pageq.c index 5c10f62147ef..055bac513b1b 100644 --- a/sys/vm/vm_pageq.c +++ b/sys/vm/vm_pageq.c @@ -56,7 +56,6 @@ vm_pageq_init(void) { int i; - vm_page_queues[PQ_CACHE].cnt = &cnt.v_cache_count; vm_page_queues[PQ_INACTIVE].cnt = &cnt.v_inactive_count; vm_page_queues[PQ_ACTIVE].cnt = &cnt.v_active_count; vm_page_queues[PQ_HOLD].cnt = &cnt.v_active_count; @@ -93,28 +92,6 @@ vm_pageq_enqueue(int queue, vm_page_t m) ++*vpq->cnt; } -/* - * vm_pageq_remove_nowakeup: - * - * vm_page_unqueue() without any wakeup - * - * The queue containing the given page must be locked. - * This routine may not block. 
- */ -void -vm_pageq_remove_nowakeup(vm_page_t m) -{ - int queue = VM_PAGE_GETQUEUE(m); - struct vpgqueues *pq; - - if (queue != PQ_NONE) { - pq = &vm_page_queues[queue]; - VM_PAGE_SETQUEUE2(m, PQ_NONE); - TAILQ_REMOVE(&pq->pl, m, pageq); - (*pq->cnt)--; - } -} - /* * vm_pageq_remove: * @@ -134,9 +111,5 @@ vm_pageq_remove(vm_page_t m) pq = &vm_page_queues[queue]; TAILQ_REMOVE(&pq->pl, m, pageq); (*pq->cnt)--; - if (VM_PAGE_RESOLVEQUEUE(m, queue) == PQ_CACHE) { - if (vm_paging_needed()) - pagedaemon_wakeup(); - } } } diff --git a/sys/vm/vm_phys.c b/sys/vm/vm_phys.c index 81d597c5cc44..8efdf3df8405 100644 --- a/sys/vm/vm_phys.c +++ b/sys/vm/vm_phys.c @@ -44,6 +44,7 @@ __FBSDID("$FreeBSD$"); #include #include #include +#include #include @@ -89,7 +90,6 @@ SYSCTL_OID(_vm, OID_AUTO, phys_segs, CTLTYPE_STRING | CTLFLAG_RD, static void vm_phys_create_seg(vm_paddr_t start, vm_paddr_t end, int flind); static int vm_phys_paddr_to_segind(vm_paddr_t pa); -static void vm_phys_set_pool(int pool, vm_page_t m, int order); static void vm_phys_split_pages(vm_page_t m, int oind, struct vm_freelist *fl, int order); @@ -286,6 +286,7 @@ vm_phys_add_page(vm_paddr_t pa) m->pool = VM_FREEPOOL_DEFAULT; pmap_page_init(m); mtx_lock(&vm_page_queue_free_mtx); + cnt.v_free_count++; vm_phys_free_pages(m, 0); mtx_unlock(&vm_page_queue_free_mtx); } @@ -318,7 +319,6 @@ vm_phys_alloc_pages(int pool, int order) fl[oind].lcnt--; m->order = VM_NFREEORDER; vm_phys_split_pages(m, oind, fl, order); - cnt.v_free_count -= 1 << order; return (m); } } @@ -339,7 +339,6 @@ vm_phys_alloc_pages(int pool, int order) m->order = VM_NFREEORDER; vm_phys_set_pool(pool, m, oind); vm_phys_split_pages(m, oind, fl, order); - cnt.v_free_count -= 1 << order; return (m); } } @@ -428,7 +427,6 @@ vm_phys_free_pages(vm_page_t m, int order) mtx_assert(&vm_page_queue_free_mtx, MA_OWNED); pa = VM_PAGE_TO_PHYS(m); seg = &vm_phys_segs[m->segind]; - cnt.v_free_count += 1 << order; while (order < VM_NFREEORDER - 1) { pa_buddy = pa ^ (1 
<< (PAGE_SHIFT + order)); if (pa_buddy < seg->start || @@ -456,7 +454,7 @@ vm_phys_free_pages(vm_page_t m, int order) /* * Set the pool for a contiguous, power of two-sized set of physical pages. */ -static void +void vm_phys_set_pool(int pool, vm_page_t m, int order) { vm_page_t m_tmp; @@ -466,44 +464,113 @@ vm_phys_set_pool(int pool, vm_page_t m, int order) } /* - * Try to zero one or more physical pages. Used by an idle priority thread. + * Remove the given physical page "m" from the free lists. + * + * The free page queues must be locked. + */ +void +vm_phys_unfree_page(vm_page_t m) +{ + struct vm_freelist *fl; + struct vm_phys_seg *seg; + vm_paddr_t pa, pa_half; + vm_page_t m_set, m_tmp; + int order; + + mtx_assert(&vm_page_queue_free_mtx, MA_OWNED); + + /* + * First, find the contiguous, power of two-sized set of free + * physical pages containing the given physical page "m" and + * assign it to "m_set". + */ + seg = &vm_phys_segs[m->segind]; + for (m_set = m, order = 0; m_set->order == VM_NFREEORDER && + order < VM_NFREEORDER; ) { + order++; + pa = m->phys_addr & (~(vm_paddr_t)0 << (PAGE_SHIFT + order)); + KASSERT(pa >= seg->start && pa < seg->end, + ("vm_phys_unfree_page: paddr %#jx is not within segment %p", + (uintmax_t)pa, seg)); + m_set = &seg->first_page[atop(pa - seg->start)]; + } + KASSERT(m_set->order >= order, ("vm_phys_unfree_page: page %p's order" + " (%d) is less than expected (%d)", m_set, m_set->order, order)); + KASSERT(m_set->order < VM_NFREEORDER, + ("vm_phys_unfree_page: page %p has unexpected order %d", + m_set, m_set->order)); + KASSERT(order < VM_NFREEORDER, + ("vm_phys_unfree_page: order %d is out of range", order)); + + /* + * Next, remove "m_set" from the free lists. Finally, extract + * "m" from "m_set" using an iterative algorithm: While "m_set" + * is larger than a page, shrink "m_set" by returning the half + * of "m_set" that does not contain "m" to the free lists. 
+ */ + fl = (*seg->free_queues)[m_set->pool]; + order = m_set->order; + TAILQ_REMOVE(&fl[order].pl, m_set, pageq); + fl[order].lcnt--; + m_set->order = VM_NFREEORDER; + while (order > 0) { + order--; + pa_half = m_set->phys_addr ^ (1 << (PAGE_SHIFT + order)); + if (m->phys_addr < pa_half) + m_tmp = &seg->first_page[atop(pa_half - seg->start)]; + else { + m_tmp = m_set; + m_set = &seg->first_page[atop(pa_half - seg->start)]; + } + m_tmp->order = order; + TAILQ_INSERT_HEAD(&fl[order].pl, m_tmp, pageq); + fl[order].lcnt++; + } + KASSERT(m_set == m, ("vm_phys_unfree_page: fatal inconsistency")); +} + +/* + * Try to zero one physical page. Used by an idle priority thread. */ boolean_t vm_phys_zero_pages_idle(void) { - struct vm_freelist *fl; + static struct vm_freelist *fl = vm_phys_free_queues[0][0]; + static int flind, oind, pind; vm_page_t m, m_tmp; - int flind, pind, q, zeroed; mtx_assert(&vm_page_queue_free_mtx, MA_OWNED); - for (flind = 0; flind < vm_nfreelists; flind++) { - pind = VM_FREEPOOL_DEFAULT; - fl = vm_phys_free_queues[flind][pind]; - for (q = 0; q < VM_NFREEORDER; q++) { - m = TAILQ_FIRST(&fl[q].pl); - if (m != NULL && (m->flags & PG_ZERO) == 0) { - TAILQ_REMOVE(&fl[q].pl, m, pageq); - fl[q].lcnt--; - m->order = VM_NFREEORDER; - cnt.v_free_count -= 1 << q; - mtx_unlock(&vm_page_queue_free_mtx); - zeroed = 0; - for (m_tmp = m; m_tmp < &m[1 << q]; m_tmp++) { - if ((m_tmp->flags & PG_ZERO) == 0) { - pmap_zero_page_idle(m_tmp); - m_tmp->flags |= PG_ZERO; - zeroed++; - } + for (;;) { + TAILQ_FOREACH_REVERSE(m, &fl[oind].pl, pglist, pageq) { + for (m_tmp = m; m_tmp < &m[1 << oind]; m_tmp++) { + if ((m_tmp->flags & (PG_CACHED | PG_ZERO)) == 0) { + vm_phys_unfree_page(m_tmp); + cnt.v_free_count--; + mtx_unlock(&vm_page_queue_free_mtx); + pmap_zero_page_idle(m_tmp); + m_tmp->flags |= PG_ZERO; + mtx_lock(&vm_page_queue_free_mtx); + cnt.v_free_count++; + vm_phys_free_pages(m_tmp, 0); + vm_page_zero_count++; + cnt_prezero++; + return (TRUE); } - cnt_prezero += 
zeroed; - mtx_lock(&vm_page_queue_free_mtx); - vm_phys_free_pages(m, q); - vm_page_zero_count += zeroed; - return (TRUE); } } + oind++; + if (oind == VM_NFREEORDER) { + oind = 0; + pind++; + if (pind == VM_NFREEPOOL) { + pind = 0; + flind++; + if (flind == vm_nfreelists) + flind = 0; + } + fl = vm_phys_free_queues[flind][pind]; + } } - return (FALSE); } /* @@ -522,6 +589,7 @@ vm_phys_alloc_contig(unsigned long npages, vm_paddr_t low, vm_paddr_t high, { struct vm_freelist *fl; struct vm_phys_seg *seg; + vm_object_t m_object; vm_paddr_t pa, pa_last, size; vm_page_t m, m_ret; int flind, i, oind, order, pind; @@ -606,12 +674,19 @@ done: vm_phys_set_pool(VM_FREEPOOL_DEFAULT, m_ret, oind); fl = (*seg->free_queues)[m_ret->pool]; vm_phys_split_pages(m_ret, oind, fl, order); - cnt.v_free_count -= roundup2(npages, 1 << imin(oind, order)); for (i = 0; i < npages; i++) { m = &m_ret[i]; KASSERT(m->queue == PQ_NONE, ("vm_phys_alloc_contig: page %p has unexpected queue %d", m, m->queue)); + m_object = m->object; + if ((m->flags & PG_CACHED) != 0) + vm_page_cache_remove(m); + else { + KASSERT(VM_PAGE_IS_FREE(m), + ("vm_phys_alloc_contig: page %p is not free", m)); + cnt.v_free_count--; + } m->valid = VM_PAGE_BITS_ALL; if (m->flags & PG_ZERO) vm_page_zero_count--; @@ -622,6 +697,13 @@ done: ("vm_phys_alloc_contig: page %p was dirty", m)); m->wire_count = 0; m->busy = 0; + if (m_object != NULL && + m_object->type == OBJT_VNODE && + m_object->cache == NULL) { + mtx_unlock(&vm_page_queue_free_mtx); + vdrop(m_object->handle); + mtx_lock(&vm_page_queue_free_mtx); + } } for (; i < roundup2(npages, 1 << imin(oind, order)); i++) { m = &m_ret[i]; diff --git a/sys/vm/vm_phys.h b/sys/vm/vm_phys.h index 0debc0143e79..3e35f9b2cb18 100644 --- a/sys/vm/vm_phys.h +++ b/sys/vm/vm_phys.h @@ -1,5 +1,6 @@ /*- * Copyright (c) 2002-2006 Rice University + * Copyright (c) 2007 Alan L. Cox * All rights reserved. * * This software was developed for the FreeBSD Project by Alan L. 
Cox, @@ -45,6 +46,8 @@ vm_page_t vm_phys_alloc_pages(int pool, int order); vm_paddr_t vm_phys_bootstrap_alloc(vm_size_t size, unsigned long alignment); void vm_phys_free_pages(vm_page_t m, int order); void vm_phys_init(void); +void vm_phys_set_pool(int pool, vm_page_t m, int order); +void vm_phys_unfree_page(vm_page_t m); boolean_t vm_phys_zero_pages_idle(void); #endif /* !_VM_PHYS_H_ */