Change the management of cached pages (PQ_CACHE) in two fundamental
ways:

(1) Cached pages are no longer kept in the object's resident page
splay tree and memq.  Instead, they are kept in a separate per-object
splay tree of cached pages.  However, access to this new per-object
splay tree is synchronized by the _free_ page queues lock, not to be
confused with the heavily contended page queues lock.  Consequently, a
cached page can be reclaimed by vm_page_alloc(9) without acquiring the
object's lock or the page queues lock.
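
In rough outline, the new reclamation path in vm_page_alloc() looks like
the following sketch (condensed from the vm_page.c hunk below; the new
VM_ALLOC_IFCACHED/VM_ALLOC_IFNOTCACHED cases, counters, and wakeups are
omitted).  Note that neither the object's lock nor the page queues lock
appears:

	mtx_lock(&vm_page_queue_free_mtx);
	if (object != NULL &&
	    (m = vm_page_cache_lookup(object, pindex)) != NULL) {
		/*
		 * The object's own cached page: pull it out of the
		 * physical memory allocator's queues and reuse it.
		 */
		vm_phys_unfree_page(m);
	} else {
		/* Any free page, possibly one cached by another object. */
		m = vm_phys_alloc_pages(object != NULL ?
		    VM_FREEPOOL_DEFAULT : VM_FREEPOOL_DIRECT, 0);
	}
	if (m != NULL && (m->flags & PG_CACHED) != 0)
		vm_page_cache_remove(m);	/* per-object splay tree */
	mtx_unlock(&vm_page_queue_free_mtx);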

This solves a problem independently reported by tegge@ and Isilon.
Specifically, they observed the page daemon consuming a great deal of
CPU time because of pages bouncing back and forth between the cache
queue (PQ_CACHE) and the inactive queue (PQ_INACTIVE).  The source of
this problem turned out to be a deadlock avoidance strategy employed
when selecting a cached page to reclaim in vm_page_select_cache().
However, the root cause was really that reclaiming a cached page
required the acquisition of an object lock while the page queues lock
was already held.  Thus, this change addresses the problem at its
root, by eliminating the need to acquire the object's lock.
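
For reference, the deadlock-avoidance fallback being removed looks
roughly like this (condensed from the old vm_page_select_cache(), whose
deletion appears in the vm_page.c hunk below):

	mtx_assert(&vm_page_queue_mtx, MA_OWNED);
	while ((m = TAILQ_FIRST(&vm_page_queues[PQ_CACHE].pl)) != NULL) {
		if (m->hold_count == 0 && (object = m->object,
		    (was_trylocked = VM_OBJECT_TRYLOCK(object)) ||
		    VM_OBJECT_LOCKED(object))) {
			vm_page_free(m);
			if (was_trylocked)
				VM_OBJECT_UNLOCK(object);
			break;
		}
		/*
		 * The object trylock failed while the page queues lock
		 * was held, so fall back to deactivation: the page
		 * moves to PQ_INACTIVE, only for the page daemon to
		 * return it to PQ_CACHE later.
		 */
		vm_page_deactivate(m);
	}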

Moreover, keeping cached pages in the object's primary splay tree and
memq was, in effect, optimizing for the uncommon case.  Cached pages
are reclaimed far, far more often than they are reactivated.  Instead,
this change makes reclamation cheaper, especially in terms of
synchronization overhead, and reactivation more expensive, because
reactivated pages will have to be reentered into the object's primary
splay tree and memq.
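
Reactivation of a cached page is now routed through vm_page_alloc(9)
with the new VM_ALLOC_IFCACHED flag, as in the vm_object_madvise() hunk
below:

	m = vm_page_lookup(tobject, tpindex);
	if (m == NULL && advise == MADV_WILLNEED) {
		/*
		 * If the page is cached, reactivate it: vm_page_alloc()
		 * reclaims the cached page and reinserts it into the
		 * object's resident splay tree and memq.
		 */
		m = vm_page_alloc(tobject, tpindex, VM_ALLOC_IFCACHED);
	}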

(2) Cached pages are now stored alongside free pages in the physical
memory allocator's buddy queues, increasing the likelihood that large
allocations of contiguous physical memory (i.e., superpages) will
succeed.
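
Accordingly, vm_phys_alloc_contig() may now find cached pages inside the
run it allocates; condensed from the vm_phys.c hunk below, each such page
is simply detached from its object's cache splay tree on the spot:

	for (i = 0; i < npages; i++) {
		m = &m_ret[i];
		if ((m->flags & PG_CACHED) != 0) {
			/*
			 * A formerly cached page: detach it from its
			 * object's cache splay tree; the cached contents
			 * are discarded.
			 */
			vm_page_cache_remove(m);
		} else {
			/* An ordinary free page. */
			cnt.v_free_count--;
		}
		/* ... per-page initialization elided ... */
	}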

Finally, as a result of this change, long-standing restrictions on when
and where a cached page can be reclaimed and returned by
vm_page_alloc(9) are eliminated.  Specifically, calls to
vm_page_alloc(9) specifying VM_ALLOC_INTERRUPT can now reclaim and
return a formerly cached page.  Consequently, a call to malloc(9)
specifying M_NOWAIT is less likely to fail.
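
This is visible in the relaxed availability test in vm_page_alloc()
(condensed from the hunk below): cached pages now count toward every
allocation class, including VM_ALLOC_INTERRUPT:

	if (cnt.v_free_count + cnt.v_cache_count > cnt.v_free_reserved ||
	    (page_req == VM_ALLOC_SYSTEM &&
	    cnt.v_free_count + cnt.v_cache_count > cnt.v_interrupt_free_min) ||
	    (page_req == VM_ALLOC_INTERRUPT &&
	    cnt.v_free_count + cnt.v_cache_count > 0)) {
		/*
		 * Enough pages are available for this allocation class;
		 * a formerly cached page may be reclaimed to satisfy
		 * the request, even from an interrupt context.
		 */
	}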

Discussed with: many over the course of the summer, including jeff@,
   Justin Husted @ Isilon, peter@, tegge@
Tested by: an earlier version by kris@
Approved by: re (kensmith)
Alan Cox 2007-09-25 06:25:06 +00:00
commit 7bfda801a8 (parent 977b6507cb)
Notes: svn2git 2020-12-20 02:59:44 +00:00
svn path=/head/; revision=172317
21 changed files with 479 additions and 281 deletions


@ -101,12 +101,13 @@
#define VM_PHYSSEG_MAX 31
/*
* Create two free page pools: VM_FREEPOOL_DEFAULT is the default pool
* Create three free page pools: VM_FREEPOOL_DEFAULT is the default pool
* from which physical pages are allocated and VM_FREEPOOL_DIRECT is
* the pool from which physical pages for page tables and small UMA
* objects are allocated.
*/
#define VM_NFREEPOOL 2
#define VM_NFREEPOOL 3
#define VM_FREEPOOL_CACHE 2
#define VM_FREEPOOL_DEFAULT 0
#define VM_FREEPOOL_DIRECT 1


@ -59,12 +59,13 @@
#define VM_PHYSSEG_DENSE
/*
* Create two free page pools: VM_FREEPOOL_DEFAULT is the default pool
* Create three free page pools: VM_FREEPOOL_DEFAULT is the default pool
* from which physical pages are allocated and VM_FREEPOOL_DIRECT is
* the pool from which physical pages for small UMA objects are
* allocated.
*/
#define VM_NFREEPOOL 2
#define VM_NFREEPOOL 3
#define VM_FREEPOOL_CACHE 2
#define VM_FREEPOOL_DEFAULT 0
#define VM_FREEPOOL_DIRECT 1


@ -93,12 +93,13 @@
#define VM_PHYSSEG_MAX 17
/*
* Create one free page pool. Since the i386 kernel virtual address
* Create two free page pools. Since the i386 kernel virtual address
* space does not include a mapping onto the machine's entire physical
* memory, VM_FREEPOOL_DIRECT is defined as an alias for the default
* pool, VM_FREEPOOL_DEFAULT.
*/
#define VM_NFREEPOOL 1
#define VM_NFREEPOOL 2
#define VM_FREEPOOL_CACHE 1
#define VM_FREEPOOL_DEFAULT 0
#define VM_FREEPOOL_DIRECT 0


@ -122,12 +122,13 @@
#define VM_PHYSSEG_MAX 49
/*
* Create two free page pools: VM_FREEPOOL_DEFAULT is the default pool
* Create three free page pools: VM_FREEPOOL_DEFAULT is the default pool
* from which physical pages are allocated and VM_FREEPOOL_DIRECT is
* the pool from which physical pages for small UMA objects are
* allocated.
*/
#define VM_NFREEPOOL 2
#define VM_NFREEPOOL 3
#define VM_FREEPOOL_CACHE 2
#define VM_FREEPOOL_DEFAULT 0
#define VM_FREEPOOL_DIRECT 1


@ -832,7 +832,7 @@ exec_map_first_page(imgp)
vm_page_busy(ma[i]);
} else {
ma[i] = vm_page_alloc(object, i,
VM_ALLOC_NORMAL);
VM_ALLOC_NORMAL | VM_ALLOC_IFNOTCACHED);
if (ma[i] == NULL)
break;
}


@ -2898,7 +2898,8 @@ allocbuf(struct buf *bp, int size)
VM_WAIT;
VM_OBJECT_LOCK(obj);
} else {
bp->b_flags &= ~B_CACHE;
if (m->valid == 0)
bp->b_flags &= ~B_CACHE;
bp->b_pages[bp->b_npages] = m;
++bp->b_npages;
}
@ -2916,20 +2917,13 @@ allocbuf(struct buf *bp, int size)
* vm_fault->getpages->cluster_read->allocbuf
*
*/
vm_page_lock_queues();
if (vm_page_sleep_if_busy(m, FALSE, "pgtblk"))
continue;
/*
* We have a good page. Should we wakeup the
* page daemon?
* We have a good page.
*/
if ((curproc != pageproc) &&
(VM_PAGE_INQUEUE1(m, PQ_CACHE)) &&
((cnt.v_free_count + cnt.v_cache_count) <
(cnt.v_free_min + cnt.v_cache_min))) {
pagedaemon_wakeup();
}
vm_page_lock_queues();
vm_page_wire(m);
vm_page_unlock_queues();
bp->b_pages[bp->b_npages] = m;


@ -110,12 +110,13 @@ struct pmap_physseg {
#define VM_PHYSSEG_DENSE
/*
* Create two free page pools: VM_FREEPOOL_DEFAULT is the default pool
* Create three free page pools: VM_FREEPOOL_DEFAULT is the default pool
* from which physical pages are allocated and VM_FREEPOOL_DIRECT is
* the pool from which physical pages for small UMA objects are
* allocated.
*/
#define VM_NFREEPOOL 2
#define VM_NFREEPOOL 3
#define VM_FREEPOOL_CACHE 2
#define VM_FREEPOOL_DEFAULT 0
#define VM_FREEPOOL_DIRECT 1


@ -91,12 +91,13 @@
#define VM_PHYSSEG_MAX 64
/*
* Create two free page pools: VM_FREEPOOL_DEFAULT is the default pool
* Create three free page pools: VM_FREEPOOL_DEFAULT is the default pool
* from which physical pages are allocated and VM_FREEPOOL_DIRECT is
* the pool from which physical pages for small UMA objects are
* allocated.
*/
#define VM_NFREEPOOL 2
#define VM_NFREEPOOL 3
#define VM_FREEPOOL_CACHE 2
#define VM_FREEPOOL_DEFAULT 0
#define VM_FREEPOOL_DIRECT 1


@ -91,12 +91,13 @@
#define VM_PHYSSEG_MAX 64
/*
* Create two free page pools: VM_FREEPOOL_DEFAULT is the default pool
* Create three free page pools: VM_FREEPOOL_DEFAULT is the default pool
* from which physical pages are allocated and VM_FREEPOOL_DIRECT is
* the pool from which physical pages for small UMA objects are
* allocated.
*/
#define VM_NFREEPOOL 2
#define VM_NFREEPOOL 3
#define VM_FREEPOOL_CACHE 2
#define VM_FREEPOOL_DEFAULT 0
#define VM_FREEPOOL_DIRECT 1


@ -68,7 +68,7 @@ struct vmmeter {
u_int v_vnodepgsin; /* (p) vnode_pager pages paged in */
u_int v_vnodepgsout; /* (p) vnode pager pages paged out */
u_int v_intrans; /* (p) intransit blocking page faults */
u_int v_reactivated; /* (q) pages reactivated from free list */
u_int v_reactivated; /* (f) pages reactivated from free list */
u_int v_pdwakeups; /* (f) times daemon has awaken from sleep */
u_int v_pdpages; /* (q) pages analyzed by daemon */
@ -89,7 +89,7 @@ struct vmmeter {
u_int v_active_count; /* (q) pages active */
u_int v_inactive_target; /* (c) pages desired inactive */
u_int v_inactive_count; /* (q) pages inactive */
u_int v_cache_count; /* (q) pages on buffer cache queue */
u_int v_cache_count; /* (f) pages on buffer cache queue */
u_int v_cache_min; /* (c) min pages desired on cache queue */
u_int v_cache_max; /* (c) max pages in cached obj */
u_int v_pageout_free_min; /* (c) min pages reserved for kernel */


@ -231,8 +231,7 @@ contigmalloc(
unsigned long boundary)
{
void * ret;
vm_object_t object;
vm_page_t m, m_next, pages;
vm_page_t pages;
unsigned long npgs;
int actl, actmax, inactl, inactmax, tries;
@ -258,14 +257,6 @@ contigmalloc(
actl++;
goto again;
}
TAILQ_FOREACH_SAFE(m, &vm_page_queues[PQ_CACHE].pl,
pageq, m_next) {
if (m->hold_count == 0 &&
VM_OBJECT_TRYLOCK(object = m->object)) {
vm_page_free(m);
VM_OBJECT_UNLOCK(object);
}
}
vm_page_unlock_queues();
tries++;
goto retry;


@ -328,8 +328,6 @@ RetryFault:;
*/
fs.m = vm_page_lookup(fs.object, fs.pindex);
if (fs.m != NULL) {
int queue;
/*
* check for page-based copy on write.
* We check fs.object == fs.first_object so
@ -398,20 +396,7 @@ RetryFault:;
vm_object_deallocate(fs.first_object);
goto RetryFault;
}
queue = fs.m->queue;
vm_pageq_remove_nowakeup(fs.m);
if (VM_PAGE_RESOLVEQUEUE(fs.m, queue) == PQ_CACHE) {
cnt.v_reactivated++;
if (vm_page_count_severe()) {
vm_page_activate(fs.m);
vm_page_unlock_queues();
unlock_and_deallocate(&fs);
VM_WAITPFAULT;
goto RetryFault;
}
}
vm_pageq_remove(fs.m);
vm_page_unlock_queues();
/*
@ -446,6 +431,8 @@ RetryFault:;
if (!vm_page_count_severe()) {
fs.m = vm_page_alloc(fs.object, fs.pindex,
(fs.vp || fs.object->backing_object)? VM_ALLOC_NORMAL: VM_ALLOC_ZERO);
if ((fs.m->valid & VM_PAGE_BITS_ALL) == VM_PAGE_BITS_ALL)
break;
}
if (fs.m == NULL) {
unlock_and_deallocate(&fs);
@ -993,9 +980,7 @@ vm_fault_prefault(pmap_t pmap, vm_offset_t addra, vm_map_entry_t entry)
(m->flags & PG_FICTITIOUS) == 0) {
vm_page_lock_queues();
if (!VM_PAGE_INQUEUE1(m, PQ_CACHE))
pmap_enter_quick(pmap, addr, m,
entry->protection);
pmap_enter_quick(pmap, addr, m, entry->protection);
vm_page_unlock_queues();
}
VM_OBJECT_UNLOCK(lobject);
@ -1273,7 +1258,8 @@ vm_fault_additional_pages(m, rbehind, rahead, marray, reqpage)
for (i = 0, tpindex = pindex - 1; tpindex >= startpindex &&
tpindex < pindex; i++, tpindex--) {
rtm = vm_page_alloc(object, tpindex, VM_ALLOC_NORMAL);
rtm = vm_page_alloc(object, tpindex, VM_ALLOC_NORMAL |
VM_ALLOC_IFNOTCACHED);
if (rtm == NULL) {
/*
* Shift the allocated pages to the
@ -1311,7 +1297,8 @@ vm_fault_additional_pages(m, rbehind, rahead, marray, reqpage)
for (; tpindex < endpindex; i++, tpindex++) {
rtm = vm_page_alloc(object, tpindex, VM_ALLOC_NORMAL);
rtm = vm_page_alloc(object, tpindex, VM_ALLOC_NORMAL |
VM_ALLOC_IFNOTCACHED);
if (rtm == NULL) {
break;
}


@ -1518,28 +1518,24 @@ vm_map_pmap_enter(vm_map_t map, vm_offset_t addr, vm_prot_t prot,
start = addr + ptoa(tmpidx);
p_start = p;
}
} else if (p_start != NULL) {
if (!are_queues_locked) {
are_queues_locked = TRUE;
vm_page_lock_queues();
}
if (VM_PAGE_INQUEUE1(p, PQ_CACHE)) {
if ((flags & MAP_PREFAULT_MADVISE) != 0)
vm_page_deactivate(p);
else if (p_start != NULL) {
pmap_enter_object(map->pmap, start, addr +
ptoa(tmpidx), p_start, prot);
p_start = NULL;
}
}
} else if (p_start != NULL) {
pmap_enter_object(map->pmap, start, addr +
ptoa(tmpidx), p_start, prot);
p_start = NULL;
}
}
if (p_start != NULL)
if (p_start != NULL) {
if (!are_queues_locked) {
are_queues_locked = TRUE;
vm_page_lock_queues();
}
pmap_enter_object(map->pmap, start, addr + ptoa(psize),
p_start, prot);
}
if (are_queues_locked)
vm_page_unlock_queues();
unlock_return:


@ -170,6 +170,9 @@ vm_object_zdtor(void *mem, int size, void *arg)
KASSERT(TAILQ_EMPTY(&object->memq),
("object %p has resident pages",
object));
KASSERT(object->cache == NULL,
("object %p has cached pages",
object));
KASSERT(object->paging_in_progress == 0,
("object %p paging_in_progress = %d",
object, object->paging_in_progress));
@ -217,6 +220,7 @@ _vm_object_allocate(objtype_t type, vm_pindex_t size, vm_object_t object)
object->handle = NULL;
object->backing_object = NULL;
object->backing_object_offset = (vm_ooffset_t) 0;
object->cache = NULL;
mtx_lock(&vm_object_list_mtx);
TAILQ_INSERT_TAIL(&vm_object_list, object, object_list);
@ -648,6 +652,9 @@ vm_object_terminate(vm_object_t object)
}
vm_page_unlock_queues();
if (__predict_false(object->cache != NULL))
vm_page_cache_free(object);
/*
* Let the pager know object is dead.
*/
@ -732,8 +739,7 @@ vm_object_page_clean(vm_object_t object, vm_pindex_t start, vm_pindex_t end, int
while (tscan < tend) {
curgeneration = object->generation;
p = vm_page_lookup(object, tscan);
if (p == NULL || p->valid == 0 ||
VM_PAGE_INQUEUE1(p, PQ_CACHE)) {
if (p == NULL || p->valid == 0) {
if (--scanlimit == 0)
break;
++tscan;
@ -821,8 +827,7 @@ vm_object_page_clean(vm_object_t object, vm_pindex_t start, vm_pindex_t end, int
pi = p->pindex;
if ((p->oflags & VPO_CLEANCHK) == 0 ||
(pi < tstart) || (pi >= tend) ||
(p->valid == 0) ||
VM_PAGE_INQUEUE1(p, PQ_CACHE)) {
p->valid == 0) {
p->oflags &= ~VPO_CLEANCHK;
continue;
}
@ -900,10 +905,6 @@ vm_object_page_collect_flush(vm_object_t object, vm_page_t p, int curgeneration,
(tp->oflags & VPO_CLEANCHK) == 0) ||
(tp->busy != 0))
break;
if (VM_PAGE_INQUEUE1(tp, PQ_CACHE)) {
tp->oflags &= ~VPO_CLEANCHK;
break;
}
vm_page_test_dirty(tp);
if ((tp->dirty & tp->valid) == 0) {
tp->oflags &= ~VPO_CLEANCHK;
@ -928,10 +929,6 @@ vm_object_page_collect_flush(vm_object_t object, vm_page_t p, int curgeneration,
(tp->oflags & VPO_CLEANCHK) == 0) ||
(tp->busy != 0))
break;
if (VM_PAGE_INQUEUE1(tp, PQ_CACHE)) {
tp->oflags &= ~VPO_CLEANCHK;
break;
}
vm_page_test_dirty(tp);
if ((tp->dirty & tp->valid) == 0) {
tp->oflags &= ~VPO_CLEANCHK;
@ -1104,6 +1101,12 @@ vm_object_madvise(vm_object_t object, vm_pindex_t pindex, int count, int advise)
}
}
m = vm_page_lookup(tobject, tpindex);
if (m == NULL && advise == MADV_WILLNEED) {
/*
* If the page is cached, reactivate it.
*/
m = vm_page_alloc(tobject, tpindex, VM_ALLOC_IFCACHED);
}
if (m == NULL) {
/*
* There may be swap even if there is no backing page
@ -1356,6 +1359,13 @@ vm_object_split(vm_map_entry_t entry)
* and new_object's locks are released and reacquired.
*/
swap_pager_copy(orig_object, new_object, offidxstart, 0);
/*
* Transfer any cached pages from orig_object to new_object.
*/
if (__predict_false(orig_object->cache != NULL))
vm_page_cache_transfer(orig_object, offidxstart,
new_object);
}
VM_OBJECT_UNLOCK(orig_object);
TAILQ_FOREACH(m, &new_object->memq, listq)
@ -1390,8 +1400,8 @@ vm_object_backing_scan(vm_object_t object, int op)
*/
if (op & OBSC_TEST_ALL_SHADOWED) {
/*
* We do not want to have to test for the existence of
* swap pages in the backing object. XXX but with the
* We do not want to have to test for the existence of cache
* or swap pages in the backing object. XXX but with the
* new swapper this would be pretty easy to do.
*
* XXX what about anonymous MAP_SHARED memory that hasn't
@ -1664,6 +1674,12 @@ vm_object_collapse(vm_object_t object)
backing_object,
object,
OFF_TO_IDX(object->backing_object_offset), TRUE);
/*
* Free any cached pages from backing_object.
*/
if (__predict_false(backing_object->cache != NULL))
vm_page_cache_free(backing_object);
}
/*
* Object now shadows whatever backing_object did.


@ -100,6 +100,7 @@ struct vm_object {
struct vm_object *backing_object; /* object that I'm a shadow of */
vm_ooffset_t backing_object_offset;/* Offset in backing object */
TAILQ_ENTRY(vm_object) pager_object_list; /* list of all objects of this pager type */
vm_page_t cache; /* root of the cache page splay tree */
void *handle;
union {
/*


@ -547,7 +547,7 @@ vm_page_sleep(vm_page_t m, const char *msg)
void
vm_page_dirty(vm_page_t m)
{
KASSERT(VM_PAGE_GETKNOWNQUEUE1(m) != PQ_CACHE,
KASSERT((m->flags & PG_CACHED) == 0,
("vm_page_dirty: page in cache!"));
KASSERT(!VM_PAGE_IS_FREE(m),
("vm_page_dirty: page is free!"));
@ -790,50 +790,165 @@ vm_page_rename(vm_page_t m, vm_object_t new_object, vm_pindex_t new_pindex)
vm_page_remove(m);
vm_page_insert(m, new_object, new_pindex);
if (VM_PAGE_INQUEUE1(m, PQ_CACHE))
vm_page_deactivate(m);
vm_page_dirty(m);
}
/*
* vm_page_select_cache:
*
* Move a page of the given color from the cache queue to the free
* queue. As pages might be found, but are not applicable, they are
* deactivated.
*
* This routine may not block.
* Convert all of the cached pages belonging to the given object
* into free pages. If the given object has cached pages and is
* backed by a vnode, reduce the vnode's hold count.
*/
vm_page_t
vm_page_select_cache(void)
void
vm_page_cache_free(vm_object_t object)
{
vm_object_t object;
vm_page_t m;
boolean_t was_trylocked;
vm_page_t m, root;
boolean_t empty;
mtx_assert(&vm_page_queue_mtx, MA_OWNED);
while ((m = TAILQ_FIRST(&vm_page_queues[PQ_CACHE].pl)) != NULL) {
KASSERT(m->dirty == 0, ("Found dirty cache page %p", m));
KASSERT(!pmap_page_is_mapped(m),
("Found mapped cache page %p", m));
KASSERT((m->flags & PG_UNMANAGED) == 0,
("Found unmanaged cache page %p", m));
KASSERT(m->wire_count == 0, ("Found wired cache page %p", m));
if (m->hold_count == 0 && (object = m->object,
(was_trylocked = VM_OBJECT_TRYLOCK(object)) ||
VM_OBJECT_LOCKED(object))) {
KASSERT((m->oflags & VPO_BUSY) == 0 && m->busy == 0,
("Found busy cache page %p", m));
vm_page_free(m);
if (was_trylocked)
VM_OBJECT_UNLOCK(object);
break;
mtx_lock(&vm_page_queue_free_mtx);
empty = object->cache == NULL;
while ((m = object->cache) != NULL) {
if (m->left == NULL)
root = m->right;
else if (m->right == NULL)
root = m->left;
else {
root = vm_page_splay(m->pindex, m->left);
root->right = m->right;
}
vm_page_deactivate(m);
m->object->cache = root;
m->object = NULL;
m->valid = 0;
/* Clear PG_CACHED and set PG_FREE. */
m->flags ^= PG_CACHED | PG_FREE;
KASSERT((m->flags & (PG_CACHED | PG_FREE)) == PG_FREE,
("vm_page_cache_free: page %p has inconsistent flags", m));
cnt.v_cache_count--;
cnt.v_free_count++;
}
mtx_unlock(&vm_page_queue_free_mtx);
if (object->type == OBJT_VNODE && !empty)
vdrop(object->handle);
}
/*
* Returns the cached page that is associated with the given
* object and offset. If, however, none exists, returns NULL.
*
* The free page queue must be locked.
*/
static inline vm_page_t
vm_page_cache_lookup(vm_object_t object, vm_pindex_t pindex)
{
vm_page_t m;
mtx_assert(&vm_page_queue_free_mtx, MA_OWNED);
if ((m = object->cache) != NULL && m->pindex != pindex) {
m = vm_page_splay(pindex, m);
if ((object->cache = m)->pindex != pindex)
m = NULL;
}
return (m);
}
/*
* Remove the given cached page from its containing object's
* collection of cached pages.
*
* The free page queue must be locked.
*/
void
vm_page_cache_remove(vm_page_t m)
{
vm_object_t object;
vm_page_t root;
mtx_assert(&vm_page_queue_free_mtx, MA_OWNED);
KASSERT((m->flags & PG_CACHED) != 0,
("vm_page_cache_remove: page %p is not cached", m));
object = m->object;
if (m != object->cache) {
root = vm_page_splay(m->pindex, object->cache);
KASSERT(root == m,
("vm_page_cache_remove: page %p is not cached in object %p",
m, object));
}
if (m->left == NULL)
root = m->right;
else if (m->right == NULL)
root = m->left;
else {
root = vm_page_splay(m->pindex, m->left);
root->right = m->right;
}
object->cache = root;
m->object = NULL;
cnt.v_cache_count--;
}
/*
* Transfer all of the cached pages with offset greater than or
* equal to 'offidxstart' from the original object's cache to the
* new object's cache. Initially, the new object's cache must be
* empty. Offset 'offidxstart' in the original object must
* correspond to offset zero in the new object.
*
* The new object must be locked.
*/
void
vm_page_cache_transfer(vm_object_t orig_object, vm_pindex_t offidxstart,
vm_object_t new_object)
{
vm_page_t m, m_next;
/*
* Insertion into an object's collection of cached pages
* requires the object to be locked. In contrast, removal does
* not.
*/
VM_OBJECT_LOCK_ASSERT(new_object, MA_OWNED);
KASSERT(new_object->cache == NULL,
("vm_page_cache_transfer: object %p has cached pages",
new_object));
mtx_lock(&vm_page_queue_free_mtx);
if ((m = orig_object->cache) != NULL) {
/*
* Transfer all of the pages with offset greater than or
* equal to 'offidxstart' from the original object's
* cache to the new object's cache.
*/
m = vm_page_splay(offidxstart, m);
if (m->pindex < offidxstart) {
orig_object->cache = m;
new_object->cache = m->right;
m->right = NULL;
} else {
orig_object->cache = m->left;
new_object->cache = m;
m->left = NULL;
}
KASSERT(new_object->cache == NULL ||
new_object->type == OBJT_SWAP,
("vm_page_cache_transfer: object %p's type is incompatible"
" with cached pages", new_object));
/*
* Update the object and offset of each page that was
* transferred to the new object's cache.
*/
while ((m = new_object->cache) != NULL) {
m_next = vm_page_splay(m->pindex, m->right);
m->object = new_object;
m->pindex -= offidxstart;
if (m_next == NULL)
break;
m->right = NULL;
m_next->left = m;
new_object->cache = m_next;
}
}
mtx_unlock(&vm_page_queue_free_mtx);
}
/*
* vm_page_alloc:
*
@ -847,15 +962,13 @@ vm_page_select_cache(void)
* VM_ALLOC_ZERO zero page
*
* This routine may not block.
*
* Additional special handling is required when called from an
* interrupt (VM_ALLOC_INTERRUPT). We are not allowed to mess with
* the page cache in this case.
*/
vm_page_t
vm_page_alloc(vm_object_t object, vm_pindex_t pindex, int req)
{
vm_page_t m = NULL;
struct vnode *vp = NULL;
vm_object_t m_object;
vm_page_t m;
int flags, page_req;
page_req = req & VM_ALLOC_CLASS_MASK;
@ -876,52 +989,32 @@ vm_page_alloc(vm_object_t object, vm_pindex_t pindex, int req)
page_req = VM_ALLOC_SYSTEM;
};
loop:
mtx_lock(&vm_page_queue_free_mtx);
if (cnt.v_free_count > cnt.v_free_reserved ||
if (cnt.v_free_count + cnt.v_cache_count > cnt.v_free_reserved ||
(page_req == VM_ALLOC_SYSTEM &&
cnt.v_cache_count == 0 &&
cnt.v_free_count > cnt.v_interrupt_free_min) ||
(page_req == VM_ALLOC_INTERRUPT && cnt.v_free_count > 0)) {
cnt.v_free_count + cnt.v_cache_count > cnt.v_interrupt_free_min) ||
(page_req == VM_ALLOC_INTERRUPT &&
cnt.v_free_count + cnt.v_cache_count > 0)) {
/*
* Allocate from the free queue if the number of free pages
* exceeds the minimum for the request class.
*/
m = vm_phys_alloc_pages(object != NULL ?
VM_FREEPOOL_DEFAULT : VM_FREEPOOL_DIRECT, 0);
} else if (page_req != VM_ALLOC_INTERRUPT) {
mtx_unlock(&vm_page_queue_free_mtx);
/*
* Allocatable from cache (non-interrupt only). On success,
* we must free the page and try again, thus ensuring that
* cnt.v_*_free_min counters are replenished.
*/
vm_page_lock_queues();
if ((m = vm_page_select_cache()) == NULL) {
KASSERT(cnt.v_cache_count == 0,
("vm_page_alloc: cache queue is missing %d pages",
cnt.v_cache_count));
vm_page_unlock_queues();
atomic_add_int(&vm_pageout_deficit, 1);
pagedaemon_wakeup();
if (page_req != VM_ALLOC_SYSTEM)
return (NULL);
mtx_lock(&vm_page_queue_free_mtx);
if (cnt.v_free_count <= cnt.v_interrupt_free_min) {
if (object != NULL &&
(m = vm_page_cache_lookup(object, pindex)) != NULL) {
if ((req & VM_ALLOC_IFNOTCACHED) != 0) {
mtx_unlock(&vm_page_queue_free_mtx);
return (NULL);
}
vm_phys_unfree_page(m);
} else if ((req & VM_ALLOC_IFCACHED) != 0) {
mtx_unlock(&vm_page_queue_free_mtx);
return (NULL);
} else
m = vm_phys_alloc_pages(object != NULL ?
VM_FREEPOOL_DEFAULT : VM_FREEPOOL_DIRECT, 0);
} else {
vm_page_unlock_queues();
goto loop;
}
} else {
/*
* Not allocatable from cache from interrupt, give up.
* Not allocatable, give up.
*/
mtx_unlock(&vm_page_queue_free_mtx);
atomic_add_int(&vm_pageout_deficit, 1);
@ -937,8 +1030,24 @@ vm_page_alloc(vm_object_t object, vm_pindex_t pindex, int req)
m != NULL,
("vm_page_alloc(): missing page on free queue")
);
KASSERT(VM_PAGE_IS_FREE(m),
("vm_page_alloc: page %p is not free", m));
if ((m->flags & PG_CACHED) != 0) {
KASSERT(m->valid != 0,
("vm_page_alloc: cached page %p is invalid", m));
if (m->object == object && m->pindex == pindex)
cnt.v_reactivated++;
else
m->valid = 0;
m_object = m->object;
vm_page_cache_remove(m);
if (m_object->type == OBJT_VNODE && m_object->cache == NULL)
vp = m_object->handle;
} else {
KASSERT(VM_PAGE_IS_FREE(m),
("vm_page_alloc: page %p is not free", m));
KASSERT(m->valid == 0,
("vm_page_alloc: free page %p is valid", m));
cnt.v_free_count--;
}
/*
* Initialize structure. Only the PG_ZERO flag is inherited.
@ -964,7 +1073,6 @@ vm_page_alloc(vm_object_t object, vm_pindex_t pindex, int req)
m->hold_count = 0;
m->act_count = 0;
m->busy = 0;
m->valid = 0;
KASSERT(m->dirty == 0, ("vm_page_alloc: free/cache page %p was dirty", m));
mtx_unlock(&vm_page_queue_free_mtx);
@ -973,6 +1081,15 @@ vm_page_alloc(vm_object_t object, vm_pindex_t pindex, int req)
else
m->pindex = pindex;
/*
* The following call to vdrop() must come after the above call
* to vm_page_insert() in case both affect the same object and
* vnode. Otherwise, the affected vnode's hold count could
* temporarily become zero.
*/
if (vp != NULL)
vdrop(vp);
/*
* Don't wakeup too often - wakeup the pageout daemon when
* we would be nearly out of memory.
@ -1047,8 +1164,6 @@ vm_page_activate(vm_page_t m)
mtx_assert(&vm_page_queue_mtx, MA_OWNED);
if (VM_PAGE_GETKNOWNQUEUE2(m) != PQ_ACTIVE) {
if (VM_PAGE_INQUEUE1(m, PQ_CACHE))
cnt.v_reactivated++;
vm_pageq_remove(m);
if (m->wire_count == 0 && (m->flags & PG_UNMANAGED) == 0) {
if (m->act_count < ACT_INIT)
@ -1133,7 +1248,7 @@ vm_page_free_toq(vm_page_t m)
* callback routine until after we've put the page on the
* appropriate free queue.
*/
vm_pageq_remove_nowakeup(m);
vm_pageq_remove(m);
vm_page_remove(m);
/*
@ -1160,6 +1275,7 @@ vm_page_free_toq(vm_page_t m)
} else {
m->flags |= PG_FREE;
mtx_lock(&vm_page_queue_free_mtx);
cnt.v_free_count++;
if ((m->flags & PG_ZERO) != 0) {
vm_phys_free_pages(m, 0);
++vm_page_zero_count;
@ -1279,8 +1395,6 @@ _vm_page_deactivate(vm_page_t m, int athead)
if (VM_PAGE_INQUEUE2(m, PQ_INACTIVE))
return;
if (m->wire_count == 0 && (m->flags & PG_UNMANAGED) == 0) {
if (VM_PAGE_INQUEUE1(m, PQ_CACHE))
cnt.v_reactivated++;
vm_page_flag_clear(m, PG_WINATCFLS);
vm_pageq_remove(m);
if (athead)
@ -1354,15 +1468,26 @@ vm_page_try_to_free(vm_page_t m)
void
vm_page_cache(vm_page_t m)
{
vm_object_t object;
vm_page_t root;
mtx_assert(&vm_page_queue_mtx, MA_OWNED);
VM_OBJECT_LOCK_ASSERT(m->object, MA_OWNED);
object = m->object;
VM_OBJECT_LOCK_ASSERT(object, MA_OWNED);
if ((m->flags & PG_UNMANAGED) || (m->oflags & VPO_BUSY) || m->busy ||
m->hold_count || m->wire_count) {
panic("vm_page_cache: attempting to cache busy page");
}
if (VM_PAGE_INQUEUE1(m, PQ_CACHE))
if (m->valid == 0 || object->type == OBJT_DEFAULT) {
/*
* Hypothesis: A cache-elgible page belonging to a
* default object must be zero filled.
*/
vm_page_free(m);
return;
}
KASSERT((m->flags & PG_CACHED) == 0,
("vm_page_cache: page %p is already cached", m));
cnt.v_tcached++;
/*
@ -1374,11 +1499,72 @@ vm_page_cache(vm_page_t m)
panic("vm_page_cache: caching a dirty page, pindex: %ld",
(long)m->pindex);
}
vm_pageq_remove_nowakeup(m);
vm_pageq_enqueue(PQ_CACHE, m);
/*
* Remove the page from the paging queues.
*/
vm_pageq_remove(m);
/*
* Remove the page from the object's collection of resident
* pages.
*/
if (m != object->root)
vm_page_splay(m->pindex, object->root);
if (m->left == NULL)
root = m->right;
else {
root = vm_page_splay(m->pindex, m->left);
root->right = m->right;
}
object->root = root;
TAILQ_REMOVE(&object->memq, m, listq);
object->resident_page_count--;
object->generation++;
/*
* Insert the page into the object's collection of cached pages
* and the physical memory allocator's cache/free page queues.
*/
vm_page_flag_set(m, PG_CACHED);
vm_page_flag_clear(m, PG_ZERO);
mtx_lock(&vm_page_queue_free_mtx);
vm_phys_set_pool(VM_FREEPOOL_CACHE, m, 0);
cnt.v_cache_count++;
root = object->cache;
if (root == NULL) {
m->left = NULL;
m->right = NULL;
} else {
root = vm_page_splay(m->pindex, root);
if (m->pindex < root->pindex) {
m->left = root->left;
m->right = root;
root->left = NULL;
} else if (__predict_false(m->pindex == root->pindex))
panic("vm_page_cache: offset already cached");
else {
m->right = root->right;
m->left = root;
root->right = NULL;
}
}
object->cache = m;
vm_phys_free_pages(m, 0);
vm_page_free_wakeup();
mtx_unlock(&vm_page_queue_free_mtx);
/*
* Increment the vnode's hold count if this is the object's only
* cached page. Decrement the vnode's hold count if this was
* the object's only resident page.
*/
if (object->type == OBJT_VNODE) {
if (root == NULL && object->resident_page_count != 0)
vhold(object->handle);
else if (root != NULL && object->resident_page_count == 0)
vdrop(object->handle);
}
}
/*
@ -1416,9 +1602,7 @@ vm_page_dontneed(vm_page_t m)
* occassionally leave the page alone
*/
if ((dnw & 0x01F0) == 0 ||
VM_PAGE_INQUEUE2(m, PQ_INACTIVE) ||
VM_PAGE_INQUEUE1(m, PQ_CACHE)
) {
VM_PAGE_INQUEUE2(m, PQ_INACTIVE)) {
if (m->act_count >= ACT_INIT)
--m->act_count;
return;
@ -1482,7 +1666,8 @@ vm_page_grab(vm_object_t object, vm_pindex_t pindex, int allocflags)
if ((allocflags & VM_ALLOC_RETRY) == 0)
return (NULL);
goto retrylookup;
}
} else if (m->valid != 0)
return (m);
if (allocflags & VM_ALLOC_ZERO && (m->flags & PG_ZERO) == 0)
pmap_zero_page(m);
return (m);
@ -1813,7 +1998,7 @@ DB_SHOW_COMMAND(pageq, vm_page_print_pageq_info)
db_printf("\n");
db_printf("PQ_CACHE:");
db_printf(" %d", *vm_page_queues[PQ_CACHE].cnt);
db_printf(" %d", cnt.v_cache_count);
db_printf("\n");
db_printf("PQ_ACTIVE: %d, PQ_INACTIVE: %d\n",


@ -160,27 +160,20 @@ CTASSERT(sizeof(u_long) >= 8);
#define PQ_NONE 0
#define PQ_INACTIVE 1
#define PQ_ACTIVE 2
#define PQ_CACHE 3
#define PQ_HOLD 4
#define PQ_COUNT 5
#define PQ_MAXCOUNT 5
#define PQ_HOLD 3
#define PQ_COUNT 4
#define PQ_MAXCOUNT 4
/* Returns the real queue a page is on. */
#define VM_PAGE_GETQUEUE(m) ((m)->queue)
/* Returns the well known queue a page is on. */
#define VM_PAGE_GETKNOWNQUEUE1(m) VM_PAGE_GETQUEUE(m)
#define VM_PAGE_GETKNOWNQUEUE2(m) VM_PAGE_GETQUEUE(m)
/* Given the real queue number and a page color return the well know queue. */
#define VM_PAGE_RESOLVEQUEUE(m, q) (q)
/* Returns true if the page is in the named well known queue. */
#define VM_PAGE_INQUEUE1(m, q) (VM_PAGE_GETKNOWNQUEUE1(m) == (q))
#define VM_PAGE_INQUEUE2(m, q) (VM_PAGE_GETKNOWNQUEUE2(m) == (q))
/* Sets the queue a page is on. */
#define VM_PAGE_SETQUEUE1(m, q) (VM_PAGE_GETQUEUE(m) = (q))
#define VM_PAGE_SETQUEUE2(m, q) (VM_PAGE_GETQUEUE(m) = (q))
struct vpgqueues {
@ -201,6 +194,7 @@ extern struct mtx vm_page_queue_free_mtx;
* pte mappings, nor can they be removed from their objects via
* the object, and such pages are also not on any PQ queue.
*/
#define PG_CACHED 0x0001 /* page is cached */
#define PG_FREE 0x0002 /* page is free */
#define PG_WINATCFLS 0x0004 /* flush dirty page on inactive q */
#define PG_FICTITIOUS 0x0008 /* physical page doesn't exist (O) */
@ -230,9 +224,8 @@ extern struct mtx vm_page_queue_free_mtx;
* Available for allocation now.
*
* cache
* Almost available for allocation. Still in an
* object, but clean and immediately freeable at
* non-interrupt times.
* Almost available for allocation. Still associated with
* an object, but clean and immediately freeable.
*
* hold
* Will become free after a pending I/O operation
@ -302,6 +295,8 @@ extern struct mtx vm_page_queue_mtx;
#define VM_ALLOC_RETRY 0x0080 /* vm_page_grab() only */
#define VM_ALLOC_NOOBJ 0x0100 /* No associated object */
#define VM_ALLOC_NOBUSY 0x0200 /* Do not busy the page */
#define VM_ALLOC_IFCACHED 0x0400 /* Fail if the page is not cached */
#define VM_ALLOC_IFNOTCACHED 0x0800 /* Fail if the page is cached */
void vm_page_flag_set(vm_page_t m, unsigned short bits);
void vm_page_flag_clear(vm_page_t m, unsigned short bits);
@ -318,7 +313,6 @@ void vm_page_wakeup(vm_page_t m);
void vm_pageq_init(void);
void vm_pageq_enqueue(int queue, vm_page_t m);
void vm_pageq_remove_nowakeup(vm_page_t m);
void vm_pageq_remove(vm_page_t m);
void vm_pageq_requeue(vm_page_t m);
@ -326,6 +320,9 @@ void vm_page_activate (vm_page_t);
vm_page_t vm_page_alloc (vm_object_t, vm_pindex_t, int);
vm_page_t vm_page_grab (vm_object_t, vm_pindex_t, int);
void vm_page_cache (register vm_page_t);
void vm_page_cache_free(vm_object_t);
void vm_page_cache_remove(vm_page_t);
void vm_page_cache_transfer(vm_object_t, vm_pindex_t, vm_object_t);
int vm_page_try_to_cache (vm_page_t);
int vm_page_try_to_free (vm_page_t);
void vm_page_dontneed (register vm_page_t);
@ -334,7 +331,6 @@ void vm_page_insert (vm_page_t, vm_object_t, vm_pindex_t);
vm_page_t vm_page_lookup (vm_object_t, vm_pindex_t);
void vm_page_remove (vm_page_t);
void vm_page_rename (vm_page_t, vm_object_t, vm_pindex_t);
vm_page_t vm_page_select_cache(void);
void vm_page_sleep(vm_page_t m, const char *msg);
vm_page_t vm_page_splay(vm_pindex_t, vm_page_t);
vm_offset_t vm_page_startup(vm_offset_t vaddr);


@ -342,8 +342,7 @@ vm_pageout_clean(m)
ib = 0;
break;
}
if (VM_PAGE_INQUEUE1(p, PQ_CACHE) ||
(p->oflags & VPO_BUSY) || p->busy) {
if ((p->oflags & VPO_BUSY) || p->busy) {
ib = 0;
break;
}
@ -372,8 +371,7 @@ vm_pageout_clean(m)
if ((p = vm_page_lookup(object, pindex + is)) == NULL)
break;
if (VM_PAGE_INQUEUE1(p, PQ_CACHE) ||
(p->oflags & VPO_BUSY) || p->busy) {
if ((p->oflags & VPO_BUSY) || p->busy) {
break;
}
vm_page_test_dirty(p);
@ -1139,37 +1137,6 @@ vm_pageout_scan(int pass)
VM_OBJECT_UNLOCK(object);
m = next;
}
/*
* We try to maintain some *really* free pages, this allows interrupt
* code to be guaranteed space. Since both cache and free queues
* are considered basically 'free', moving pages from cache to free
* does not effect other calculations.
*/
while (cnt.v_free_count < cnt.v_free_reserved) {
TAILQ_FOREACH(m, &vm_page_queues[PQ_CACHE].pl, pageq) {
KASSERT(m->dirty == 0,
("Found dirty cache page %p", m));
KASSERT(!pmap_page_is_mapped(m),
("Found mapped cache page %p", m));
KASSERT((m->flags & PG_UNMANAGED) == 0,
("Found unmanaged cache page %p", m));
KASSERT(m->wire_count == 0,
("Found wired cache page %p", m));
if (m->hold_count == 0 && VM_OBJECT_TRYLOCK(object =
m->object)) {
KASSERT((m->oflags & VPO_BUSY) == 0 &&
m->busy == 0, ("Found busy cache page %p",
m));
vm_page_free(m);
VM_OBJECT_UNLOCK(object);
cnt.v_dfree++;
break;
}
}
if (m == NULL)
break;
}
vm_page_unlock_queues();
#if !defined(NO_SWAPPING)
/*


@ -56,7 +56,6 @@ vm_pageq_init(void)
{
int i;
vm_page_queues[PQ_CACHE].cnt = &cnt.v_cache_count;
vm_page_queues[PQ_INACTIVE].cnt = &cnt.v_inactive_count;
vm_page_queues[PQ_ACTIVE].cnt = &cnt.v_active_count;
vm_page_queues[PQ_HOLD].cnt = &cnt.v_active_count;
@ -93,28 +92,6 @@ vm_pageq_enqueue(int queue, vm_page_t m)
++*vpq->cnt;
}
/*
* vm_pageq_remove_nowakeup:
*
* vm_page_unqueue() without any wakeup
*
* The queue containing the given page must be locked.
* This routine may not block.
*/
void
vm_pageq_remove_nowakeup(vm_page_t m)
{
int queue = VM_PAGE_GETQUEUE(m);
struct vpgqueues *pq;
if (queue != PQ_NONE) {
pq = &vm_page_queues[queue];
VM_PAGE_SETQUEUE2(m, PQ_NONE);
TAILQ_REMOVE(&pq->pl, m, pageq);
(*pq->cnt)--;
}
}
/*
* vm_pageq_remove:
*
@ -134,9 +111,5 @@ vm_pageq_remove(vm_page_t m)
pq = &vm_page_queues[queue];
TAILQ_REMOVE(&pq->pl, m, pageq);
(*pq->cnt)--;
if (VM_PAGE_RESOLVEQUEUE(m, queue) == PQ_CACHE) {
if (vm_paging_needed())
pagedaemon_wakeup();
}
}
}


@ -44,6 +44,7 @@ __FBSDID("$FreeBSD$");
#include <sys/sbuf.h>
#include <sys/sysctl.h>
#include <sys/vmmeter.h>
#include <sys/vnode.h>
#include <ddb/ddb.h>
@ -89,7 +90,6 @@ SYSCTL_OID(_vm, OID_AUTO, phys_segs, CTLTYPE_STRING | CTLFLAG_RD,
static void vm_phys_create_seg(vm_paddr_t start, vm_paddr_t end, int flind);
static int vm_phys_paddr_to_segind(vm_paddr_t pa);
static void vm_phys_set_pool(int pool, vm_page_t m, int order);
static void vm_phys_split_pages(vm_page_t m, int oind, struct vm_freelist *fl,
int order);
@ -286,6 +286,7 @@ vm_phys_add_page(vm_paddr_t pa)
m->pool = VM_FREEPOOL_DEFAULT;
pmap_page_init(m);
mtx_lock(&vm_page_queue_free_mtx);
cnt.v_free_count++;
vm_phys_free_pages(m, 0);
mtx_unlock(&vm_page_queue_free_mtx);
}
@ -318,7 +319,6 @@ vm_phys_alloc_pages(int pool, int order)
fl[oind].lcnt--;
m->order = VM_NFREEORDER;
vm_phys_split_pages(m, oind, fl, order);
cnt.v_free_count -= 1 << order;
return (m);
}
}
@ -339,7 +339,6 @@ vm_phys_alloc_pages(int pool, int order)
m->order = VM_NFREEORDER;
vm_phys_set_pool(pool, m, oind);
vm_phys_split_pages(m, oind, fl, order);
cnt.v_free_count -= 1 << order;
return (m);
}
}
@ -428,7 +427,6 @@ vm_phys_free_pages(vm_page_t m, int order)
mtx_assert(&vm_page_queue_free_mtx, MA_OWNED);
pa = VM_PAGE_TO_PHYS(m);
seg = &vm_phys_segs[m->segind];
cnt.v_free_count += 1 << order;
while (order < VM_NFREEORDER - 1) {
pa_buddy = pa ^ (1 << (PAGE_SHIFT + order));
if (pa_buddy < seg->start ||
@ -456,7 +454,7 @@ vm_phys_free_pages(vm_page_t m, int order)
/*
* Set the pool for a contiguous, power of two-sized set of physical pages.
*/
static void
void
vm_phys_set_pool(int pool, vm_page_t m, int order)
{
vm_page_t m_tmp;
@ -466,44 +464,113 @@ vm_phys_set_pool(int pool, vm_page_t m, int order)
}
/*
* Try to zero one or more physical pages. Used by an idle priority thread.
* Remove the given physical page "m" from the free lists.
*
* The free page queues must be locked.
*/
void
vm_phys_unfree_page(vm_page_t m)
{
struct vm_freelist *fl;
struct vm_phys_seg *seg;
vm_paddr_t pa, pa_half;
vm_page_t m_set, m_tmp;
int order;
mtx_assert(&vm_page_queue_free_mtx, MA_OWNED);
/*
* First, find the contiguous, power of two-sized set of free
* physical pages containing the given physical page "m" and
* assign it to "m_set".
*/
seg = &vm_phys_segs[m->segind];
for (m_set = m, order = 0; m_set->order == VM_NFREEORDER &&
order < VM_NFREEORDER; ) {
order++;
pa = m->phys_addr & (~(vm_paddr_t)0 << (PAGE_SHIFT + order));
KASSERT(pa >= seg->start && pa < seg->end,
("vm_phys_unfree_page: paddr %#jx is not within segment %p",
(uintmax_t)pa, seg));
m_set = &seg->first_page[atop(pa - seg->start)];
}
KASSERT(m_set->order >= order, ("vm_phys_unfree_page: page %p's order"
" (%d) is less than expected (%d)", m_set, m_set->order, order));
KASSERT(m_set->order < VM_NFREEORDER,
("vm_phys_unfree_page: page %p has unexpected order %d",
m_set, m_set->order));
KASSERT(order < VM_NFREEORDER,
("vm_phys_unfree_page: order %d is out of range", order));
/*
* Next, remove "m_set" from the free lists. Finally, extract
* "m" from "m_set" using an iterative algorithm: While "m_set"
* is larger than a page, shrink "m_set" by returning the half
* of "m_set" that does not contain "m" to the free lists.
*/
fl = (*seg->free_queues)[m_set->pool];
order = m_set->order;
TAILQ_REMOVE(&fl[order].pl, m_set, pageq);
fl[order].lcnt--;
m_set->order = VM_NFREEORDER;
while (order > 0) {
order--;
pa_half = m_set->phys_addr ^ (1 << (PAGE_SHIFT + order));
if (m->phys_addr < pa_half)
m_tmp = &seg->first_page[atop(pa_half - seg->start)];
else {
m_tmp = m_set;
m_set = &seg->first_page[atop(pa_half - seg->start)];
}
m_tmp->order = order;
TAILQ_INSERT_HEAD(&fl[order].pl, m_tmp, pageq);
fl[order].lcnt++;
}
KASSERT(m_set == m, ("vm_phys_unfree_page: fatal inconsistency"));
}
/*
* Try to zero one physical page. Used by an idle priority thread.
*/
boolean_t
vm_phys_zero_pages_idle(void)
{
struct vm_freelist *fl;
static struct vm_freelist *fl = vm_phys_free_queues[0][0];
static int flind, oind, pind;
vm_page_t m, m_tmp;
int flind, pind, q, zeroed;
mtx_assert(&vm_page_queue_free_mtx, MA_OWNED);
for (flind = 0; flind < vm_nfreelists; flind++) {
pind = VM_FREEPOOL_DEFAULT;
fl = vm_phys_free_queues[flind][pind];
for (q = 0; q < VM_NFREEORDER; q++) {
m = TAILQ_FIRST(&fl[q].pl);
if (m != NULL && (m->flags & PG_ZERO) == 0) {
TAILQ_REMOVE(&fl[q].pl, m, pageq);
fl[q].lcnt--;
m->order = VM_NFREEORDER;
cnt.v_free_count -= 1 << q;
mtx_unlock(&vm_page_queue_free_mtx);
zeroed = 0;
for (m_tmp = m; m_tmp < &m[1 << q]; m_tmp++) {
if ((m_tmp->flags & PG_ZERO) == 0) {
pmap_zero_page_idle(m_tmp);
m_tmp->flags |= PG_ZERO;
zeroed++;
}
for (;;) {
TAILQ_FOREACH_REVERSE(m, &fl[oind].pl, pglist, pageq) {
for (m_tmp = m; m_tmp < &m[1 << oind]; m_tmp++) {
if ((m_tmp->flags & (PG_CACHED | PG_ZERO)) == 0) {
vm_phys_unfree_page(m_tmp);
cnt.v_free_count--;
mtx_unlock(&vm_page_queue_free_mtx);
pmap_zero_page_idle(m_tmp);
m_tmp->flags |= PG_ZERO;
mtx_lock(&vm_page_queue_free_mtx);
cnt.v_free_count++;
vm_phys_free_pages(m_tmp, 0);
vm_page_zero_count++;
cnt_prezero++;
return (TRUE);
}
cnt_prezero += zeroed;
mtx_lock(&vm_page_queue_free_mtx);
vm_phys_free_pages(m, q);
vm_page_zero_count += zeroed;
return (TRUE);
}
}
oind++;
if (oind == VM_NFREEORDER) {
oind = 0;
pind++;
if (pind == VM_NFREEPOOL) {
pind = 0;
flind++;
if (flind == vm_nfreelists)
flind = 0;
}
fl = vm_phys_free_queues[flind][pind];
}
}
return (FALSE);
}
/*
@ -522,6 +589,7 @@ vm_phys_alloc_contig(unsigned long npages, vm_paddr_t low, vm_paddr_t high,
{
struct vm_freelist *fl;
struct vm_phys_seg *seg;
vm_object_t m_object;
vm_paddr_t pa, pa_last, size;
vm_page_t m, m_ret;
int flind, i, oind, order, pind;
@ -606,12 +674,19 @@ vm_phys_alloc_contig(unsigned long npages, vm_paddr_t low, vm_paddr_t high,
vm_phys_set_pool(VM_FREEPOOL_DEFAULT, m_ret, oind);
fl = (*seg->free_queues)[m_ret->pool];
vm_phys_split_pages(m_ret, oind, fl, order);
cnt.v_free_count -= roundup2(npages, 1 << imin(oind, order));
for (i = 0; i < npages; i++) {
m = &m_ret[i];
KASSERT(m->queue == PQ_NONE,
("vm_phys_alloc_contig: page %p has unexpected queue %d",
m, m->queue));
m_object = m->object;
if ((m->flags & PG_CACHED) != 0)
vm_page_cache_remove(m);
else {
KASSERT(VM_PAGE_IS_FREE(m),
("vm_phys_alloc_contig: page %p is not free", m));
cnt.v_free_count--;
}
m->valid = VM_PAGE_BITS_ALL;
if (m->flags & PG_ZERO)
vm_page_zero_count--;
@ -622,6 +697,13 @@ vm_phys_alloc_contig(unsigned long npages, vm_paddr_t low, vm_paddr_t high,
("vm_phys_alloc_contig: page %p was dirty", m));
m->wire_count = 0;
m->busy = 0;
if (m_object != NULL &&
m_object->type == OBJT_VNODE &&
m_object->cache == NULL) {
mtx_unlock(&vm_page_queue_free_mtx);
vdrop(m_object->handle);
mtx_lock(&vm_page_queue_free_mtx);
}
}
for (; i < roundup2(npages, 1 << imin(oind, order)); i++) {
m = &m_ret[i];


@ -1,5 +1,6 @@
/*-
* Copyright (c) 2002-2006 Rice University
* Copyright (c) 2007 Alan L. Cox <alc@cs.rice.edu>
* All rights reserved.
*
* This software was developed for the FreeBSD Project by Alan L. Cox,
@ -45,6 +46,8 @@ vm_page_t vm_phys_alloc_pages(int pool, int order);
vm_paddr_t vm_phys_bootstrap_alloc(vm_size_t size, unsigned long alignment);
void vm_phys_free_pages(vm_page_t m, int order);
void vm_phys_init(void);
void vm_phys_set_pool(int pool, vm_page_t m, int order);
void vm_phys_unfree_page(vm_page_t m);
boolean_t vm_phys_zero_pages_idle(void);
#endif /* !_VM_PHYS_H_ */