Implement a low-memory deadlock solution.

Removed most of the hacks that previously tried to deal with low-memory
    situations.

    The new code is based on the concept that I/O must be able to function
    in a low-memory situation.  All major modules related to I/O (except
    networking) have been adjusted to allow allocation out of the system
    reserve memory pool.  These modules now detect a low-memory situation
    but, rather than block, continue to operate and then return resources
    to the memory pool instead of caching them or leaving them wired.

    Code has been added to stall in a low-memory situation prior to a vnode
    being locked.

    Thus situations where a process blocks in a low-memory condition while
    holding a locked vnode have been reduced to near nothing.  Not only will
    I/O continue to operate, but many prior deadlock conditions simply no
    longer exist.
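
    As a rough illustration of the release-instead-of-cache pattern (a
    sketch distilled from the bqrelse()/brelse() hunks further below, not
    additional code in this commit; the helper name is hypothetical and
    the kernel headers already used by vfs_bio.c are assumed):

        static void
        buf_release_lowmem(struct buf *bp)
        {
                if (vm_page_count_severe()) {
                        /*
                         * Too low on memory: free the buffer now so the
                         * wired pages making up its backing store are
                         * returned to the VM page queues instead of being
                         * cached on a clean queue.
                         */
                        brelse(bp);
                } else {
                        /* Normal case: requeue the buffer for reuse. */
                        bqrelse(bp);
                }
        }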

Implement a number of VFS/BIO fixes

	(found by Ian): in the bogus-page replacement code in biodone(), the
        loop was not properly incrementing its loop variables prior to a
        continue statement.  We do not believe this code can be hit anyway,
        but we aren't taking any chances.  We'll turn the whole section into
        a panic (as it already is in brelse()) after the release is rolled.

	In biodone(), the foff calculation was incorrectly clamped to the
        iosize, causing the wrong foff to be calculated for pages in the
        case of an I/O error or of biodone() being called without initiating
        I/O.  The problem always caused a panic before; now it doesn't.  It
        is mainly an issue with NFS.
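
	In outline, the fixed loop in bufdone() (paraphrased from the hunk
        further below; names are as in that code) derives each page's resid
        from the next page boundary and always advances foff by a whole
        page, including on the early-continue path:

        for (i = 0; i < bp->b_npages; i++) {
                int resid;

                /* Bytes of this I/O that fall within the current page. */
                resid = ((foff + PAGE_SIZE) & ~(off_t)PAGE_MASK) - foff;
                if (resid > iosize)
                        resid = iosize;

                /* ... per-page completion or error handling ... */

                /* Always step to the next page boundary, never by a clamped resid. */
                foff = (foff + PAGE_SIZE) & ~(off_t)PAGE_MASK;
                iosize -= resid;
        }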

	Fixed casts for ~PAGE_MASK.  This code worked properly before only
        because the calculations use signed arithmetic.  It is better to
        properly extend PAGE_MASK first before inverting it for the 64-bit
        masking op.
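
	As a standalone illustration (not part of the patch; a 4K PAGE_MASK
        and an int64_t stand-in for off_t are assumed):

        #include <stdio.h>
        #include <stdint.h>

        #define PAGE_MASK 0xfff         /* 4K pages, as an int */

        int
        main(void)
        {
                int64_t foff = 0x123456789aLL;  /* an offset above 4GB */

                /*
                 * ~PAGE_MASK is the int 0xfffff000, i.e. -4096, which
                 * sign-extends to 0xfffffffffffff000 when promoted, so
                 * this happens to produce the right answer.
                 */
                int64_t sloppy = foff & ~PAGE_MASK;

                /* Extend to 64 bits first, then invert: correct by design. */
                int64_t clean = foff & ~(int64_t)PAGE_MASK;

                /* Had PAGE_MASK been unsigned, the high bits would be lost. */
                int64_t broken = foff & ~(uint32_t)PAGE_MASK;

                printf("%jx %jx %jx\n", (uintmax_t)sloppy, (uintmax_t)clean,
                    (uintmax_t)broken);
                return (0);
        }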

	In brelse(), the bogus_page fixup code was improperly throwing
        away the original contents of 'm' when it did the j-loop to
        fix the bogus pages.  The result was that it would potentially
        invalidate parts of the *WRONG* page(!), leading to corruption.
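
	Schematically, the fix (condensed from the brelse() hunk further
        below) uses a scratch variable for the inner loop so that 'm', which
        still refers to page i, is not clobbered:

        for (j = i; j < bp->b_npages; j++) {
                vm_page_t mtmp = bp->b_pages[j];

                if (mtmp == bogus_page) {
                        mtmp = vm_page_lookup(obj, poff + j);
                        if (mtmp == NULL)
                                panic("brelse: page missing");
                        bp->b_pages[j] = mtmp;
                }
        }
        /* 'm' itself may have been a bogus entry; reload it from the array. */
        m = bp->b_pages[i];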

	There may still be cases where a background bitmap write is
        being duplicated, causing potential corruption.  We have identified
        a potentially serious bug related to this, but the fix is still TBD.
        Instead, this patch contains a KASSERT to detect the problem and
        panic the machine rather than continue to corrupt the filesystem.
        The problem does not occur very often; it is very hard to reproduce,
        and it may or may not be the cause of the corruption people have
        reported.
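
	The guard appears to be the KASSERT added in vfs_backgroundwritedone()
        (see the vfs_bio.c hunks below): if BX_BKGRDINPROG is no longer set
        on the original buffer when the background write completes, the
        buffer was released and re-instantiated illegally, and the machine
        panics:

        KASSERT((origbp->b_xflags & BX_BKGRDINPROG),
            ("backgroundwritedone: lost buffer2"));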

Review by: (VFS/BIO: mckusick, Ian Dowse <iedowse@maths.tcd.ie>)
Testing by: (VM/Deadlock) Paul Saab <ps@yahoo-inc.com>
Author:  Matthew Dillon  2000-11-18 23:06:26 +00:00
Commit:  936524aa02  (parent ef0646f9d8)
Notes:   svn2git 2020-12-20 02:59:44 +00:00
         svn path=/head/; revision=68885

14 changed files with 315 additions and 185 deletions

View File

@ -597,8 +597,14 @@ bwrite(struct buf * bp)
* If this buffer is marked for background writing and we
* do not have to wait for it, make a copy and write the
* copy so as to leave this buffer ready for further use.
*
* This optimization eats a lot of memory. If we have a page
* or buffer shortfall we can't do it.
*/
if ((bp->b_xflags & BX_BKGRDWRITE) && (bp->b_flags & B_ASYNC)) {
if ((bp->b_xflags & BX_BKGRDWRITE) &&
(bp->b_flags & B_ASYNC) &&
!vm_page_count_severe() &&
!buf_dirty_count_severe()) {
if (bp->b_iodone != NULL) {
printf("bp->b_iodone = %p\n", bp->b_iodone);
panic("bwrite: need chained iodone");
@ -682,7 +688,10 @@ vfs_backgroundwritedone(bp)
/*
* Clear the BX_BKGRDINPROG flag in the original buffer
* and awaken it if it is waiting for the write to complete.
* If BX_BKGRDINPROG is not set in the original buffer it must
* have been released and re-instantiated - which is not legal.
*/
KASSERT((origbp->b_xflags & BX_BKGRDINPROG), ("backgroundwritedone: lost buffer2"));
origbp->b_xflags &= ~BX_BKGRDINPROG;
if (origbp->b_xflags & BX_BKGRDWAIT) {
origbp->b_xflags &= ~BX_BKGRDWAIT;
@ -902,6 +911,15 @@ bwillwrite(void)
}
}
/*
* Return true if we have too many dirty buffers.
*/
int
buf_dirty_count_severe(void)
{
return(numdirtybuffers >= hidirtybuffers);
}
/*
* brelse:
*
@ -964,10 +982,14 @@ brelse(struct buf * bp)
*
* We still allow the B_INVAL case to call vfs_vmio_release(), even
* if B_DELWRI is set.
*
* If B_DELWRI is not set we may have to set B_RELBUF if we are low
* on pages to return pages to the VM page queues.
*/
if (bp->b_flags & B_DELWRI)
bp->b_flags &= ~B_RELBUF;
else if (vm_page_count_severe() && !(bp->b_xflags & BX_BKGRDINPROG))
bp->b_flags |= B_RELBUF;
/*
* VMIO buffer rundown. It is not very necessary to keep a VMIO buffer
@ -989,8 +1011,7 @@ brelse(struct buf * bp)
if ((bp->b_flags & B_VMIO)
&& !(bp->b_vp->v_tag == VT_NFS &&
!vn_isdisk(bp->b_vp, NULL) &&
(bp->b_flags & B_DELWRI) &&
(bp->b_xflags & BX_BKGRDINPROG))
(bp->b_flags & B_DELWRI))
) {
int i, j, resid;
@ -1017,32 +1038,40 @@ brelse(struct buf * bp)
*
* See man buf(9) for more information
*/
resid = bp->b_bufsize;
foff = bp->b_offset;
for (i = 0; i < bp->b_npages; i++) {
int had_bogus = 0;
m = bp->b_pages[i];
vm_page_flag_clear(m, PG_ZERO);
if (m == bogus_page) {
/*
* If we hit a bogus page, fixup *all* the bogus pages
* now.
*/
if (m == bogus_page) {
VOP_GETVOBJECT(vp, &obj);
poff = OFF_TO_IDX(bp->b_offset);
had_bogus = 1;
for (j = i; j < bp->b_npages; j++) {
m = bp->b_pages[j];
if (m == bogus_page) {
m = vm_page_lookup(obj, poff + j);
if (!m) {
vm_page_t mtmp;
mtmp = bp->b_pages[j];
if (mtmp == bogus_page) {
mtmp = vm_page_lookup(obj, poff + j);
if (!mtmp) {
panic("brelse: page missing\n");
}
bp->b_pages[j] = m;
bp->b_pages[j] = mtmp;
}
}
if ((bp->b_flags & B_INVAL) == 0) {
pmap_qenter(trunc_page((vm_offset_t)bp->b_data), bp->b_pages, bp->b_npages);
}
m = bp->b_pages[i];
}
if ((bp->b_flags & B_NOCACHE) || (bp->b_ioflags & BIO_ERROR)) {
int poffset = foff & PAGE_MASK;
@ -1051,9 +1080,11 @@ brelse(struct buf * bp)
KASSERT(presid >= 0, ("brelse: extra page"));
vm_page_set_invalid(m, poffset, presid);
if (had_bogus)
printf("avoided corruption bug in bogus_page/brelse code\n");
}
resid -= PAGE_SIZE - (foff & PAGE_MASK);
foff = (foff + PAGE_SIZE) & ~PAGE_MASK;
foff = (foff + PAGE_SIZE) & ~(off_t)PAGE_MASK;
}
if (bp->b_flags & (B_INVAL | B_RELBUF))
@ -1171,7 +1202,7 @@ brelse(struct buf * bp)
/*
* Release a buffer back to the appropriate queue but do not try to free
* it.
* it. The buffer is expected to be used again soon.
*
* bqrelse() is used by bdwrite() to requeue a delayed write, and used by
* biodone() to requeue an async I/O on completion. It is also used when
@ -1203,6 +1234,15 @@ bqrelse(struct buf * bp)
} else if (bp->b_flags & B_DELWRI) {
bp->b_qindex = QUEUE_DIRTY;
TAILQ_INSERT_TAIL(&bufqueues[QUEUE_DIRTY], bp, b_freelist);
} else if (vm_page_count_severe()) {
/*
* We are too low on memory, we have to try to free the
* buffer (most importantly: the wired pages making up its
* backing store) *now*.
*/
splx(s);
brelse(bp);
return;
} else {
bp->b_qindex = QUEUE_CLEAN;
TAILQ_INSERT_TAIL(&bufqueues[QUEUE_CLEAN], bp, b_freelist);
@ -1264,6 +1304,8 @@ vfs_vmio_release(bp)
vm_page_busy(m);
vm_page_protect(m, VM_PROT_NONE);
vm_page_free(m);
} else if (vm_page_count_severe()) {
vm_page_try_to_cache(m);
}
}
}
@ -1419,15 +1461,15 @@ getnewbuf(int slpflag, int slptimeo, int size, int maxsize)
struct buf *nbp;
int defrag = 0;
int nqindex;
int isspecial;
static int flushingbufs;
if (curproc != idleproc &&
(curproc->p_flag & (P_COWINPROGRESS|P_BUFEXHAUST)) == 0)
isspecial = 0;
else
isspecial = 1;
/*
* We can't afford to block since we might be holding a vnode lock,
* which may prevent system daemons from running. We deal with
* low-memory situations by proactively returning memory and running
* async I/O rather then sync I/O.
*/
++getnewbufcalls;
--getnewbufrestarts;
restart:
@ -1445,42 +1487,28 @@ getnewbuf(int slpflag, int slptimeo, int size, int maxsize)
* However, there are a number of cases (defragging, reusing, ...)
* where we cannot backup.
*/
nqindex = QUEUE_EMPTYKVA;
nbp = TAILQ_FIRST(&bufqueues[QUEUE_EMPTYKVA]);
if (isspecial == 0 && numfreebuffers < lofreebuffers) {
if (nbp == NULL) {
/*
* This will cause an immediate failure
* If no EMPTYKVA buffers and we are either
* defragging or reusing, locate a CLEAN buffer
* to free or reuse. If bufspace useage is low
* skip this step so we can allocate a new buffer.
*/
nqindex = QUEUE_CLEAN;
nbp = NULL;
} else {
if (defrag || bufspace >= lobufspace) {
nqindex = QUEUE_CLEAN;
nbp = TAILQ_FIRST(&bufqueues[QUEUE_CLEAN]);
}
/*
* Locate a buffer which already has KVA assigned. First
* try EMPTYKVA buffers.
* Nada. If we are allowed to allocate an EMPTY
* buffer, go get one.
*/
nqindex = QUEUE_EMPTYKVA;
nbp = TAILQ_FIRST(&bufqueues[QUEUE_EMPTYKVA]);
if (nbp == NULL) {
/*
* If no EMPTYKVA buffers and we are either
* defragging or reusing, locate a CLEAN buffer
* to free or reuse. If bufspace useage is low
* skip this step so we can allocate a new buffer.
*/
if (defrag || bufspace >= lobufspace) {
nqindex = QUEUE_CLEAN;
nbp = TAILQ_FIRST(&bufqueues[QUEUE_CLEAN]);
}
/*
* Nada. If we are allowed to allocate an EMPTY
* buffer, go get one.
*/
if (nbp == NULL && defrag == 0 &&
(isspecial || bufspace < hibufspace)) {
nqindex = QUEUE_EMPTY;
nbp = TAILQ_FIRST(&bufqueues[QUEUE_EMPTY]);
}
if (nbp == NULL && defrag == 0 && bufspace < hibufspace) {
nqindex = QUEUE_EMPTY;
nbp = TAILQ_FIRST(&bufqueues[QUEUE_EMPTY]);
}
}
@ -1610,26 +1638,16 @@ getnewbuf(int slpflag, int slptimeo, int size, int maxsize)
goto restart;
}
/*
* If we are a normal process then deal with bufspace
* hysteresis. A normal process tries to keep bufspace
* between lobufspace and hibufspace. Note: if we encounter
* a buffer with b_kvasize == 0 then it means we started
* our scan on the EMPTY list and should allocate a new
* buffer.
*/
if (isspecial == 0) {
if (bufspace > hibufspace)
flushingbufs = 1;
if (flushingbufs && bp->b_kvasize != 0) {
bp->b_flags |= B_INVAL;
bfreekva(bp);
brelse(bp);
goto restart;
}
if (bufspace < lobufspace)
flushingbufs = 0;
if (bufspace >= hibufspace)
flushingbufs = 1;
if (flushingbufs && bp->b_kvasize != 0) {
bp->b_flags |= B_INVAL;
bfreekva(bp);
brelse(bp);
goto restart;
}
if (bufspace < lobufspace)
flushingbufs = 0;
break;
}
@ -1705,6 +1723,7 @@ getnewbuf(int slpflag, int slptimeo, int size, int maxsize)
return(bp);
}
#if 0
/*
* waitfreebuffers:
*
@ -1723,6 +1742,8 @@ waitfreebuffers(int slpflag, int slptimeo)
}
}
#endif
/*
* buf_daemon:
*
@ -2073,8 +2094,12 @@ getblk(struct vnode * vp, daddr_t blkno, int size, int slpflag, int slptimeo)
* If this check ever becomes a bottleneck it may be better to
* move it into the else, when gbincore() fails. At the moment
* it isn't a problem.
*
* XXX remove if 0 sections (clean this up after its proven)
*/
#if 0
if (curproc == idleproc || (curproc->p_flag & P_BUFEXHAUST)) {
#endif
if (numfreebuffers == 0) {
if (curproc == idleproc)
return NULL;
@ -2082,9 +2107,11 @@ getblk(struct vnode * vp, daddr_t blkno, int size, int slpflag, int slptimeo)
tsleep(&needsbuffer, (PRIBIO + 4) | slpflag, "newbuf",
slptimeo);
}
#if 0
} else if (numfreebuffers < lofreebuffers) {
waitfreebuffers(slpflag, slptimeo);
}
#endif
if ((bp = gbincore(vp, blkno))) {
/*
@ -2468,7 +2495,13 @@ allocbuf(struct buf *bp, int size)
pi = OFF_TO_IDX(bp->b_offset) + bp->b_npages;
if ((m = vm_page_lookup(obj, pi)) == NULL) {
m = vm_page_alloc(obj, pi, VM_ALLOC_NORMAL);
/*
* note: must allocate system pages
* since blocking here could intefere
* with paging I/O, no matter which
* process we are.
*/
m = vm_page_alloc(obj, pi, VM_ALLOC_SYSTEM);
if (m == NULL) {
VM_WAIT;
vm_pageout_deficit += desiredpages - bp->b_npages;
@ -2671,7 +2704,7 @@ bufdone(struct buf *bp)
buf_complete(bp);
if (bp->b_flags & B_VMIO) {
int i, resid;
int i;
vm_ooffset_t foff;
vm_page_t m;
vm_object_t obj;
@ -2722,16 +2755,29 @@ bufdone(struct buf *bp)
for (i = 0; i < bp->b_npages; i++) {
int bogusflag = 0;
int resid;
resid = ((foff + PAGE_SIZE) & ~(off_t)PAGE_MASK) - foff;
if (resid > iosize)
resid = iosize;
/*
* cleanup bogus pages, restoring the originals
*/
m = bp->b_pages[i];
if (m == bogus_page) {
bogusflag = 1;
m = vm_page_lookup(obj, OFF_TO_IDX(foff));
if (!m) {
panic("biodone: page disappeared!");
#if defined(VFS_BIO_DEBUG)
printf("biodone: page disappeared\n");
#endif
vm_object_pip_subtract(obj, 1);
bp->b_flags &= ~B_CACHE;
foff = (foff + PAGE_SIZE) &
~(off_t)PAGE_MASK;
iosize -= resid;
continue;
}
bp->b_pages[i] = m;
@ -2744,9 +2790,6 @@ bufdone(struct buf *bp)
(unsigned long)foff, m->pindex);
}
#endif
resid = IDX_TO_OFF(m->pindex + 1) - foff;
if (resid > iosize)
resid = iosize;
/*
* In the write case, the valid and clean bits are
@ -2784,7 +2827,7 @@ bufdone(struct buf *bp)
}
vm_page_io_finish(m);
vm_object_pip_subtract(obj, 1);
foff += resid;
foff = (foff + PAGE_SIZE) & ~(off_t)PAGE_MASK;
iosize -= resid;
}
if (obj)
@ -2862,7 +2905,7 @@ vfs_page_set_valid(struct buf *bp, vm_ooffset_t off, int pageno, vm_page_t m)
* of the buffer.
*/
soff = off;
eoff = (off + PAGE_SIZE) & ~PAGE_MASK;
eoff = (off + PAGE_SIZE) & ~(off_t)PAGE_MASK;
if (eoff > bp->b_offset + bp->b_bcount)
eoff = bp->b_offset + bp->b_bcount;
@ -2948,7 +2991,7 @@ vfs_busy_pages(struct buf * bp, int clear_modify)
bp->b_pages[i] = bogus_page;
bogus++;
}
foff = (foff + PAGE_SIZE) & ~PAGE_MASK;
foff = (foff + PAGE_SIZE) & ~(off_t)PAGE_MASK;
}
if (bogus)
pmap_qenter(trunc_page((vm_offset_t)bp->b_data), bp->b_pages, bp->b_npages);
@ -2976,7 +3019,7 @@ vfs_clean_pages(struct buf * bp)
("vfs_clean_pages: no buffer offset"));
for (i = 0; i < bp->b_npages; i++) {
vm_page_t m = bp->b_pages[i];
vm_ooffset_t noff = (foff + PAGE_SIZE) & ~PAGE_MASK;
vm_ooffset_t noff = (foff + PAGE_SIZE) & ~(off_t)PAGE_MASK;
vm_ooffset_t eoff = noff;
if (eoff > bp->b_offset + bp->b_bufsize)
@ -3104,9 +3147,14 @@ vm_hold_load_pages(struct buf * bp, vm_offset_t from, vm_offset_t to)
tryagain:
/*
* note: must allocate system pages since blocking here
* could intefere with paging I/O, no matter which
* process we are.
*/
p = vm_page_alloc(kernel_object,
((pg - VM_MIN_KERNEL_ADDRESS) >> PAGE_SHIFT),
VM_ALLOC_NORMAL);
VM_ALLOC_SYSTEM);
if (!p) {
vm_pageout_deficit += (to - from) >> PAGE_SHIFT;
VM_WAIT;

View File

@ -48,6 +48,7 @@
#include <sys/malloc.h>
#include <sys/mount.h>
#include <sys/resourcevar.h>
#include <sys/vmmeter.h>
#include <vm/vm.h>
#include <vm/vm_object.h>
#include <vm/vm_page.h>
@ -665,6 +666,11 @@ cluster_write(bp, filesize, seqcount)
cluster_wbuild_wb(vp, lblocksize, vp->v_cstart, vp->v_clen + 1);
vp->v_clen = 0;
vp->v_cstart = lbn + 1;
} else if (vm_page_count_severe()) {
/*
* We are low on memory, get it going NOW
*/
bawrite(bp);
} else {
/*
* In the middle of a cluster, so just delay the I/O for now.

View File

@ -1438,10 +1438,14 @@ vget(vp, flags, p)
if ((flags & LK_INTERLOCK) == 0)
mtx_enter(&vp->v_interlock, MTX_DEF);
if (vp->v_flag & VXLOCK) {
vp->v_flag |= VXWANT;
mtx_exit(&vp->v_interlock, MTX_DEF);
tsleep((caddr_t)vp, PINOD, "vget", 0);
return (ENOENT);
if (vp->v_vxproc == curproc) {
printf("VXLOCK interlock avoided\n");
} else {
vp->v_flag |= VXWANT;
mtx_exit(&vp->v_interlock, MTX_DEF);
tsleep((caddr_t)vp, PINOD, "vget", 0);
return (ENOENT);
}
}
vp->v_usecount++;
@ -1731,6 +1735,7 @@ vclean(vp, flags, p)
if (vp->v_flag & VXLOCK)
panic("vclean: deadlock");
vp->v_flag |= VXLOCK;
vp->v_vxproc = curproc;
/*
* Even if the count is zero, the VOP_INACTIVE routine may still
* have the object locked while it cleans it out. The VOP_LOCK
@ -1807,6 +1812,7 @@ vclean(vp, flags, p)
vn_pollgone(vp);
vp->v_tag = VT_NON;
vp->v_flag &= ~VXLOCK;
vp->v_vxproc = NULL;
if (vp->v_flag & VXWANT) {
vp->v_flag &= ~VXWANT;
wakeup((caddr_t) vp);

View File

@ -1438,10 +1438,14 @@ vget(vp, flags, p)
if ((flags & LK_INTERLOCK) == 0)
mtx_enter(&vp->v_interlock, MTX_DEF);
if (vp->v_flag & VXLOCK) {
vp->v_flag |= VXWANT;
mtx_exit(&vp->v_interlock, MTX_DEF);
tsleep((caddr_t)vp, PINOD, "vget", 0);
return (ENOENT);
if (vp->v_vxproc == curproc) {
printf("VXLOCK interlock avoided\n");
} else {
vp->v_flag |= VXWANT;
mtx_exit(&vp->v_interlock, MTX_DEF);
tsleep((caddr_t)vp, PINOD, "vget", 0);
return (ENOENT);
}
}
vp->v_usecount++;
@ -1731,6 +1735,7 @@ vclean(vp, flags, p)
if (vp->v_flag & VXLOCK)
panic("vclean: deadlock");
vp->v_flag |= VXLOCK;
vp->v_vxproc = curproc;
/*
* Even if the count is zero, the VOP_INACTIVE routine may still
* have the object locked while it cleans it out. The VOP_LOCK
@ -1807,6 +1812,7 @@ vclean(vp, flags, p)
vn_pollgone(vp);
vp->v_tag = VT_NON;
vp->v_flag &= ~VXLOCK;
vp->v_vxproc = NULL;
if (vp->v_flag & VXWANT) {
vp->v_flag &= ~VXWANT;
wakeup((caddr_t) vp);

View File

@ -642,12 +642,14 @@ debug_vn_lock(vp, flags, p, filename, line)
do {
if ((flags & LK_INTERLOCK) == 0)
mtx_enter(&vp->v_interlock, MTX_DEF);
if (vp->v_flag & VXLOCK) {
if ((vp->v_flag & VXLOCK) && vp->v_vxproc != curproc) {
vp->v_flag |= VXWANT;
mtx_exit(&vp->v_interlock, MTX_DEF);
tsleep((caddr_t)vp, PINOD, "vn_lock", 0);
error = ENOENT;
} else {
if (vp->v_vxproc != NULL)
printf("VXLOCK interlock avoided in vn_lock\n");
#ifdef DEBUG_LOCKS
vp->filename = filename;
vp->line = line;

View File

@ -494,6 +494,7 @@ struct uio;
caddr_t bufhashinit __P((caddr_t));
void bufinit __P((void));
void bwillwrite __P((void));
int buf_dirty_count_severe __P((void));
void bremfree __P((struct buf *));
int bread __P((struct vnode *, daddr_t, int,
struct ucred *, struct buf **));

View File

@ -129,6 +129,7 @@ struct vnode {
short vpi_events; /* what they are looking for */
short vpi_revents; /* what has happened */
} v_pollinfo;
struct proc *v_vxproc; /* proc owning VXLOCK */
#ifdef DEBUG_LOCKS
const char *filename; /* Source file doing locking */
int line; /* Line number doing locking */

View File

@ -45,6 +45,7 @@
#include <sys/vnode.h>
#include <sys/malloc.h>
#include <sys/resourcevar.h>
#include <sys/vmmeter.h>
#include <sys/stat.h>
#include <vm/vm.h>
@ -111,6 +112,8 @@ ffs_update(vp, waitfor)
ino_to_fsbo(fs, ip->i_number)) = ip->i_din;
if (waitfor && !DOINGASYNC(vp)) {
return (bwrite(bp));
} else if (vm_page_count_severe() || buf_dirty_count_severe()) {
return (bwrite(bp));
} else {
if (bp->b_bufsize == fs->fs_bsize)
bp->b_flags |= B_CLUSTEROK;

View File

@ -91,6 +91,8 @@ MALLOC_DEFINE(M_DIRADD, "diradd","New directory entry");
MALLOC_DEFINE(M_MKDIR, "mkdir","New directory");
MALLOC_DEFINE(M_DIRREM, "dirrem","Directory entry deleted");
#define M_SOFTDEP_FLAGS (M_WAITOK | M_USE_RESERVE)
#define D_PAGEDEP 0
#define D_INODEDEP 1
#define D_NEWBLK 2
@ -802,7 +804,7 @@ pagedep_lookup(ip, lbn, flags, pagedeppp)
goto top;
}
MALLOC(pagedep, struct pagedep *, sizeof(struct pagedep), M_PAGEDEP,
M_WAITOK);
M_SOFTDEP_FLAGS);
bzero(pagedep, sizeof(struct pagedep));
pagedep->pd_list.wk_type = D_PAGEDEP;
pagedep->pd_mnt = mp;
@ -879,7 +881,7 @@ inodedep_lookup(fs, inum, flags, inodedeppp)
}
num_inodedep += 1;
MALLOC(inodedep, struct inodedep *, sizeof(struct inodedep),
M_INODEDEP, M_WAITOK);
M_INODEDEP, M_SOFTDEP_FLAGS);
inodedep->id_list.wk_type = D_INODEDEP;
inodedep->id_fs = fs;
inodedep->id_ino = inum;
@ -941,7 +943,7 @@ newblk_lookup(fs, newblkno, flags, newblkpp)
if (sema_get(&newblk_in_progress, 0) == 0)
goto top;
MALLOC(newblk, struct newblk *, sizeof(struct newblk),
M_NEWBLK, M_WAITOK);
M_NEWBLK, M_SOFTDEP_FLAGS);
newblk->nb_state = 0;
newblk->nb_fs = fs;
newblk->nb_newblkno = newblkno;
@ -1127,7 +1129,7 @@ bmsafemap_lookup(bp)
return (WK_BMSAFEMAP(wk));
FREE_LOCK(&lk);
MALLOC(bmsafemap, struct bmsafemap *, sizeof(struct bmsafemap),
M_BMSAFEMAP, M_WAITOK);
M_BMSAFEMAP, M_SOFTDEP_FLAGS);
bmsafemap->sm_list.wk_type = D_BMSAFEMAP;
bmsafemap->sm_list.wk_state = 0;
bmsafemap->sm_buf = bp;
@ -1187,7 +1189,7 @@ softdep_setup_allocdirect(ip, lbn, newblkno, oldblkno, newsize, oldsize, bp)
struct newblk *newblk;
MALLOC(adp, struct allocdirect *, sizeof(struct allocdirect),
M_ALLOCDIRECT, M_WAITOK);
M_ALLOCDIRECT, M_SOFTDEP_FLAGS);
bzero(adp, sizeof(struct allocdirect));
adp->ad_list.wk_type = D_ALLOCDIRECT;
adp->ad_lbn = lbn;
@ -1339,7 +1341,7 @@ newfreefrag(ip, blkno, size)
if (fragnum(fs, blkno) + numfrags(fs, size) > fs->fs_frag)
panic("newfreefrag: frag size");
MALLOC(freefrag, struct freefrag *, sizeof(struct freefrag),
M_FREEFRAG, M_WAITOK);
M_FREEFRAG, M_SOFTDEP_FLAGS);
freefrag->ff_list.wk_type = D_FREEFRAG;
freefrag->ff_state = ip->i_uid & ~ONWORKLIST; /* XXX - used below */
freefrag->ff_inum = ip->i_number;
@ -1408,7 +1410,7 @@ newallocindir(ip, ptrno, newblkno, oldblkno)
struct allocindir *aip;
MALLOC(aip, struct allocindir *, sizeof(struct allocindir),
M_ALLOCINDIR, M_WAITOK);
M_ALLOCINDIR, M_SOFTDEP_FLAGS);
bzero(aip, sizeof(struct allocindir));
aip->ai_list.wk_type = D_ALLOCINDIR;
aip->ai_state = ATTACHED;
@ -1561,7 +1563,7 @@ setup_allocindir_phase2(bp, ip, aip)
if (indirdep)
break;
MALLOC(newindirdep, struct indirdep *, sizeof(struct indirdep),
M_INDIRDEP, M_WAITOK);
M_INDIRDEP, M_SOFTDEP_FLAGS);
newindirdep->ir_list.wk_type = D_INDIRDEP;
newindirdep->ir_state = ATTACHED;
LIST_INIT(&newindirdep->ir_deplisthd);
@ -1623,7 +1625,7 @@ softdep_setup_freeblocks(ip, length)
if (length != 0)
panic("softde_setup_freeblocks: non-zero length");
MALLOC(freeblks, struct freeblks *, sizeof(struct freeblks),
M_FREEBLKS, M_WAITOK);
M_FREEBLKS, M_SOFTDEP_FLAGS);
bzero(freeblks, sizeof(struct freeblks));
freeblks->fb_list.wk_type = D_FREEBLKS;
freeblks->fb_uid = ip->i_uid;
@ -1870,7 +1872,7 @@ softdep_freefile(pvp, ino, mode)
* This sets up the inode de-allocation dependency.
*/
MALLOC(freefile, struct freefile *, sizeof(struct freefile),
M_FREEFILE, M_WAITOK);
M_FREEFILE, M_SOFTDEP_FLAGS);
freefile->fx_list.wk_type = D_FREEFILE;
freefile->fx_list.wk_state = 0;
freefile->fx_mode = mode;
@ -2186,7 +2188,7 @@ softdep_setup_directory_add(bp, dp, diroffset, newinum, newdirbp)
fs = dp->i_fs;
lbn = lblkno(fs, diroffset);
offset = blkoff(fs, diroffset);
MALLOC(dap, struct diradd *, sizeof(struct diradd), M_DIRADD, M_WAITOK);
MALLOC(dap, struct diradd *, sizeof(struct diradd), M_DIRADD, M_SOFTDEP_FLAGS);
bzero(dap, sizeof(struct diradd));
dap->da_list.wk_type = D_DIRADD;
dap->da_offset = offset;
@ -2198,12 +2200,12 @@ softdep_setup_directory_add(bp, dp, diroffset, newinum, newdirbp)
} else {
dap->da_state |= MKDIR_BODY | MKDIR_PARENT;
MALLOC(mkdir1, struct mkdir *, sizeof(struct mkdir), M_MKDIR,
M_WAITOK);
M_SOFTDEP_FLAGS);
mkdir1->md_list.wk_type = D_MKDIR;
mkdir1->md_state = MKDIR_BODY;
mkdir1->md_diradd = dap;
MALLOC(mkdir2, struct mkdir *, sizeof(struct mkdir), M_MKDIR,
M_WAITOK);
M_SOFTDEP_FLAGS);
mkdir2->md_list.wk_type = D_MKDIR;
mkdir2->md_state = MKDIR_PARENT;
mkdir2->md_diradd = dap;
@ -2438,7 +2440,7 @@ newdirrem(bp, dp, ip, isrmdir, prevdirremp)
(void) request_cleanup(FLUSH_REMOVE, 0);
num_dirrem += 1;
MALLOC(dirrem, struct dirrem *, sizeof(struct dirrem),
M_DIRREM, M_WAITOK);
M_DIRREM, M_SOFTDEP_FLAGS);
bzero(dirrem, sizeof(struct dirrem));
dirrem->dm_list.wk_type = D_DIRREM;
dirrem->dm_state = isrmdir ? RMDIR : 0;
@ -2535,7 +2537,7 @@ softdep_setup_directory_change(bp, dp, ip, newinum, isrmdir)
*/
if (newinum != WINO) {
MALLOC(dap, struct diradd *, sizeof(struct diradd),
M_DIRADD, M_WAITOK);
M_DIRADD, M_SOFTDEP_FLAGS);
bzero(dap, sizeof(struct diradd));
dap->da_list.wk_type = D_DIRADD;
dap->da_state = DIRCHG | ATTACHED | DEPCOMPLETE;
@ -2841,7 +2843,7 @@ softdep_disk_io_initiation(bp)
* Replace up-to-date version with safe version.
*/
MALLOC(indirdep->ir_saveddata, caddr_t, bp->b_bcount,
M_INDIRDEP, M_WAITOK);
M_INDIRDEP, M_SOFTDEP_FLAGS);
ACQUIRE_LOCK(&lk);
indirdep->ir_state &= ~ATTACHED;
indirdep->ir_state |= UNDONE;
@ -2942,7 +2944,7 @@ initiate_write_inodeblock(inodedep, bp)
if (inodedep->id_savedino != NULL)
panic("initiate_write_inodeblock: already doing I/O");
MALLOC(inodedep->id_savedino, struct dinode *,
sizeof(struct dinode), M_INODEDEP, M_WAITOK);
sizeof(struct dinode), M_INODEDEP, M_SOFTDEP_FLAGS);
*inodedep->id_savedino = *dp;
bzero((caddr_t)dp, sizeof(struct dinode));
return;

View File

@ -48,6 +48,7 @@
#include <vm/vm_map.h>
#include <vm/vnode_pager.h>
#include <sys/event.h>
#include <sys/vmmeter.h>
#define VN_KNOTE(vp, b) \
KNOTE((struct klist *)&vp->v_pollinfo.vpi_selinfo.si_note, (b))
@ -501,6 +502,9 @@ WRITE(ap)
} else {
bawrite(bp);
}
} else if (vm_page_count_severe() || buf_dirty_count_severe()) {
bp->b_flags |= B_CLUSTEROK;
bawrite(bp);
} else {
bp->b_flags |= B_CLUSTEROK;
bdwrite(bp);

View File

@ -80,6 +80,7 @@
#include <sys/sysctl.h>
#include <sys/blist.h>
#include <sys/lock.h>
#include <sys/vmmeter.h>
#ifndef MAX_PAGEOUT_CLUSTER
#define MAX_PAGEOUT_CLUSTER 16
@ -1619,10 +1620,11 @@ swp_pager_async_iodone(bp)
* status, then finish the I/O ( which decrements the
* busy count and possibly wakes waiter's up ).
*/
vm_page_protect(m, VM_PROT_READ);
pmap_clear_modify(m);
vm_page_undirty(m);
vm_page_io_finish(m);
if (!vm_page_count_severe() || !vm_page_try_to_cache(m))
vm_page_protect(m, VM_PROT_READ);
}
}

View File

@ -860,7 +860,7 @@ vm_page_alloc(object, pindex, page_req)
* Don't wakeup too often - wakeup the pageout daemon when
* we would be nearly out of memory.
*/
if (vm_paging_needed() || cnt.v_free_count < cnt.v_pageout_free_min)
if (vm_paging_needed())
pagedaemon_wakeup();
splx(s);
@ -882,10 +882,10 @@ vm_wait()
s = splvm();
if (curproc == pageproc) {
vm_pageout_pages_needed = 1;
tsleep(&vm_pageout_pages_needed, PSWP, "vmwait", 0);
tsleep(&vm_pageout_pages_needed, PSWP, "VMWait", 0);
} else {
if (!vm_pages_needed) {
vm_pages_needed++;
vm_pages_needed = 1;
wakeup(&vm_pages_needed);
}
tsleep(&cnt.v_free_count, PVM, "vmwait", 0);
@ -1030,7 +1030,8 @@ vm_page_free_wakeup()
* if pageout daemon needs pages, then tell it that there are
* some free.
*/
if (vm_pageout_pages_needed) {
if (vm_pageout_pages_needed &&
cnt.v_cache_count + cnt.v_free_count >= cnt.v_pageout_free_min) {
wakeup(&vm_pageout_pages_needed);
vm_pageout_pages_needed = 0;
}
@ -1039,9 +1040,9 @@ vm_page_free_wakeup()
* high water mark. And wakeup scheduler process if we have
* lots of memory. this process will swapin processes.
*/
if (vm_pages_needed && vm_page_count_min()) {
wakeup(&cnt.v_free_count);
if (vm_pages_needed && !vm_page_count_min()) {
vm_pages_needed = 0;
wakeup(&cnt.v_free_count);
}
}
@ -1240,6 +1241,9 @@ vm_page_wire(m)
* processes. This optimization causes one-time-use metadata to be
* reused more quickly.
*
* BUT, if we are in a low-memory situation we have no choice but to
* put clean pages on the cache queue.
*
* A number of routines use vm_page_unwire() to guarantee that the page
* will go into either the inactive or active queues, and will NEVER
* be placed in the cache - for example, just after dirtying a page.
@ -1325,6 +1329,25 @@ vm_page_deactivate(vm_page_t m)
_vm_page_deactivate(m, 0);
}
/*
* vm_page_try_to_cache:
*
* Returns 0 on failure, 1 on success
*/
int
vm_page_try_to_cache(vm_page_t m)
{
if (m->dirty || m->hold_count || m->busy || m->wire_count ||
(m->flags & (PG_BUSY|PG_UNMANAGED))) {
return(0);
}
vm_page_test_dirty(m);
if (m->dirty)
return(0);
vm_page_cache(m);
return(1);
}
/*
* vm_page_cache
*

View File

@ -251,6 +251,7 @@ extern struct vpgqueues vm_page_queues[PQ_COUNT];
#define PG_SWAPINPROG 0x0200 /* swap I/O in progress on page */
#define PG_NOSYNC 0x0400 /* do not collect for syncer */
#define PG_UNMANAGED 0x0800 /* No PV management for page */
#define PG_MARKER 0x1000 /* special queue marker page */
/*
* Misc constants.
@ -403,6 +404,7 @@ void vm_page_activate __P((vm_page_t));
vm_page_t vm_page_alloc __P((vm_object_t, vm_pindex_t, int));
vm_page_t vm_page_grab __P((vm_object_t, vm_pindex_t, int));
void vm_page_cache __P((register vm_page_t));
int vm_page_try_to_cache __P((vm_page_t));
void vm_page_dontneed __P((register vm_page_t));
static __inline void vm_page_copy __P((vm_page_t, vm_page_t));
static __inline void vm_page_free __P((vm_page_t));

View File

@ -146,6 +146,7 @@ static int defer_swap_pageouts=0;
static int disable_swap_pageouts=0;
static int max_page_launder=100;
static int vm_pageout_actcmp=0;
#if defined(NO_SWAPPING)
static int vm_swap_enabled=0;
static int vm_swap_idle_enabled=0;
@ -189,6 +190,8 @@ SYSCTL_INT(_vm, OID_AUTO, disable_swapspace_pageouts,
SYSCTL_INT(_vm, OID_AUTO, max_page_launder,
CTLFLAG_RW, &max_page_launder, 0, "Maximum number of pages to clean per pass");
SYSCTL_INT(_vm, OID_AUTO, vm_pageout_actcmp,
CTLFLAG_RD, &vm_pageout_actcmp, 0, "pagedaemon agressiveness");
#define VM_PAGEOUT_PAGE_COUNT 16
@ -372,6 +375,7 @@ vm_pageout_flush(mc, count, flags)
*/
for (i = 0; i < count; i++) {
KASSERT(mc[i]->valid == VM_PAGE_BITS_ALL && mc[i]->dirty == VM_PAGE_BITS_ALL, ("vm_pageout_flush page %p index %d/%d: partially dirty page", mc[i], i, count));
vm_page_io_start(mc[i]);
vm_page_protect(mc[i], VM_PROT_READ);
}
@ -424,6 +428,8 @@ vm_pageout_flush(mc, count, flags)
if (pageout_status[i] != VM_PAGER_PEND) {
vm_object_pip_wakeup(object);
vm_page_io_finish(mt);
if (!vm_page_count_severe() || !vm_page_try_to_cache(mt))
vm_page_protect(mt, VM_PROT_READ);
}
}
return numpagedout;
@ -621,10 +627,10 @@ static int
vm_pageout_scan()
{
vm_page_t m, next;
struct vm_page marker;
int page_shortage, maxscan, pcount;
int addl_page_shortage, addl_page_shortage_init;
int maxlaunder;
int launder_loop = 0;
struct proc *p, *bigproc;
vm_offset_t size, bigsize;
vm_object_t object;
@ -646,33 +652,37 @@ vm_pageout_scan()
/*
* Calculate the number of pages we want to either free or move
* to the cache.
* to the cache. Be more agressive if we aren't making our target.
*/
page_shortage = vm_paging_target() + addl_page_shortage_init;
page_shortage = vm_paging_target() +
addl_page_shortage_init + vm_pageout_actcmp;
/*
* Figure out what to do with dirty pages when they are encountered.
* Assume that 1/3 of the pages on the inactive list are clean. If
* we think we can reach our target, disable laundering (do not
* clean any dirty pages). If we miss the target we will loop back
* up and do a laundering run.
* Figure out how agressively we should flush dirty pages.
*/
{
int factor = vm_pageout_actcmp;
if (cnt.v_inactive_count / 3 > page_shortage) {
maxlaunder = 0;
launder_loop = 0;
} else {
maxlaunder =
(cnt.v_inactive_target > max_page_launder) ?
max_page_launder : cnt.v_inactive_target;
launder_loop = 1;
maxlaunder = cnt.v_inactive_target / 3 + factor;
if (maxlaunder > max_page_launder + factor)
maxlaunder = max_page_launder + factor;
}
/*
* Initialize our marker
*/
bzero(&marker, sizeof(marker));
marker.flags = PG_BUSY | PG_FICTITIOUS | PG_MARKER;
marker.queue = PQ_INACTIVE;
marker.wire_count = 1;
/*
* Start scanning the inactive queue for pages we can move to the
* cache or free. The scan will stop when the target is reached or
* we have scanned the entire inactive queue.
* we have scanned the entire inactive queue. Note that m->act_count
* is not used to form decisions for the inactive queue, only for the
* active queue.
*/
rescan0:
@ -690,6 +700,12 @@ vm_pageout_scan()
next = TAILQ_NEXT(m, pageq);
/*
* skip marker pages
*/
if (m->flags & PG_MARKER)
continue;
if (m->hold_count) {
s = splvm();
TAILQ_REMOVE(&vm_page_queues[PQ_INACTIVE].pl, m, pageq);
@ -766,7 +782,8 @@ vm_pageout_scan()
--page_shortage;
/*
* Clean pages can be placed onto the cache queue.
* Clean pages can be placed onto the cache queue. This
* effectively frees them.
*/
} else if (m->dirty == 0) {
vm_page_cache(m);
@ -777,7 +794,6 @@ vm_pageout_scan()
* only a limited number of pages per pagedaemon pass.
*/
} else if (maxlaunder > 0) {
int written;
int swap_pageouts_ok;
struct vnode *vp = NULL;
struct mount *mp;
@ -805,29 +821,6 @@ vm_pageout_scan()
continue;
}
/*
* For now we protect against potential memory
* deadlocks by requiring significant memory to be
* free if the object is not OBJT_DEFAULT or OBJT_SWAP.
* We do not 'trust' any other object type to operate
* with low memory, not even OBJT_DEVICE. The VM
* allocator will special case allocations done by
* the pageout daemon so the check below actually
* does have some hysteresis in it. It isn't the best
* solution, though.
*/
if (object->type != OBJT_DEFAULT &&
object->type != OBJT_SWAP &&
cnt.v_free_count < cnt.v_free_reserved) {
s = splvm();
TAILQ_REMOVE(&vm_page_queues[PQ_INACTIVE].pl, m, pageq);
TAILQ_INSERT_TAIL(&vm_page_queues[PQ_INACTIVE].pl, m,
pageq);
splx(s);
continue;
}
/*
* Presumably we have sufficient free memory to do
* the more sophisticated checks and locking required
@ -879,10 +872,15 @@ vm_pageout_scan()
}
/*
* The page might have been moved to another queue
* during potential blocking in vget() above.
* The page might have been moved to another
* queue during potential blocking in vget()
* above. The page might have been freed and
* reused for another vnode. The object might
* have been reused for another vnode.
*/
if (m->queue != PQ_INACTIVE) {
if (m->queue != PQ_INACTIVE ||
m->object != object ||
object->handle != vp) {
if (object->flags & OBJ_MIGHTBEDIRTY)
vnodes_skipped++;
vput(vp);
@ -891,9 +889,10 @@ vm_pageout_scan()
}
/*
* The page may have been busied during the blocking in
* vput(); We don't move the page back onto the end of
* the queue so that statistics are more correct if we don't.
* The page may have been busied during the
* blocking in vput(); We don't move the
* page back onto the end of the queue so that
* statistics are more correct if we don't.
*/
if (m->busy || (m->flags & PG_BUSY)) {
vput(vp);
@ -921,42 +920,57 @@ vm_pageout_scan()
* If a page is dirty, then it is either being washed
* (but not yet cleaned) or it is still in the
* laundry. If it is still in the laundry, then we
* start the cleaning operation.
* start the cleaning operation. maxlaunder nominally
* counts I/O cost (seeks) rather then bytes.
*
* This operation may cluster, invalidating the 'next'
* pointer. To prevent an inordinate number of
* restarts we use our marker to remember our place.
*/
written = vm_pageout_clean(m);
s = splvm();
TAILQ_INSERT_AFTER(&vm_page_queues[PQ_INACTIVE].pl, m, &marker, pageq);
splx(s);
if (vm_pageout_clean(m) != 0)
--maxlaunder;
s = splvm();
next = TAILQ_NEXT(&marker, pageq);
TAILQ_REMOVE(&vm_page_queues[PQ_INACTIVE].pl, &marker, pageq);
splx(s);
if (vp) {
vput(vp);
vn_finished_write(mp);
}
maxlaunder -= written;
}
}
/*
* If we still have a page shortage and we didn't launder anything,
* run the inactive scan again and launder something this time.
* If we were not able to meet our target, increase actcmp
*/
if (launder_loop == 0 && page_shortage > 0) {
launder_loop = 1;
maxlaunder =
(cnt.v_inactive_target > max_page_launder) ?
max_page_launder : cnt.v_inactive_target;
goto rescan0;
if (vm_page_count_min()) {
if (vm_pageout_actcmp < ACT_MAX / 2)
vm_pageout_actcmp += ACT_ADVANCE;
} else {
if (vm_pageout_actcmp < ACT_DECLINE)
vm_pageout_actcmp = 0;
else
vm_pageout_actcmp -= ACT_DECLINE;
}
/*
* Compute the page shortage from the point of view of having to
* move pages from the active queue to the inactive queue.
* Compute the number of pages we want to try to move from the
* active queue to the inactive queue.
*/
page_shortage = (cnt.v_inactive_target + cnt.v_cache_min) -
(cnt.v_free_count + cnt.v_inactive_count + cnt.v_cache_count);
page_shortage = vm_paging_target() +
cnt.v_inactive_target - cnt.v_inactive_count;
page_shortage += addl_page_shortage;
page_shortage += vm_pageout_actcmp;
/*
* Scan the active queue for things we can deactivate
* Scan the active queue for things we can deactivate. We nominally
* track the per-page activity counter and use it to locate
* deactivation candidates.
*/
pcount = cnt.v_active_count;
@ -1026,7 +1040,8 @@ vm_pageout_scan()
} else {
m->act_count -= min(m->act_count, ACT_DECLINE);
if (vm_pageout_algorithm_lru ||
(m->object->ref_count == 0) || (m->act_count == 0)) {
(m->object->ref_count == 0) ||
(m->act_count <= vm_pageout_actcmp)) {
page_shortage--;
if (m->object->ref_count == 0) {
vm_page_protect(m, VM_PROT_NONE);
@ -1111,7 +1126,7 @@ vm_pageout_scan()
* make sure that we have swap space -- if we are low on memory and
* swap -- then kill the biggest process.
*/
if ((vm_swap_size == 0 || swap_pager_full) && vm_page_count_min()) {
if ((vm_swap_size < 64 || swap_pager_full) && vm_page_count_min()) {
bigproc = NULL;
bigsize = 0;
for (p = allproc.lh_first; p != 0; p = p->p_list.le_next) {
@ -1349,20 +1364,31 @@ vm_pageout()
int error;
int s = splvm();
if (vm_pages_needed && vm_page_count_min()) {
/*
* If we have enough free memory, wakeup waiters. Do
* not clear vm_pages_needed until we reach our target,
* otherwise we may be woken up over and over again and
* waste a lot of cpu.
*/
if (vm_pages_needed && !vm_page_count_min()) {
if (vm_paging_needed() <= 0)
vm_pages_needed = 0;
wakeup(&cnt.v_free_count);
}
if (vm_pages_needed) {
/*
* Still not done, sleep a bit and go again
*/
vm_pages_needed = 0;
tsleep(&vm_pages_needed, PVM, "psleep", hz/2);
} else {
/*
* Good enough, sleep & handle stats
*/
vm_pages_needed = 0;
error = tsleep(&vm_pages_needed,
PVM, "psleep", vm_pageout_stats_interval * hz);
if (error && !vm_pages_needed) {
if (vm_pageout_actcmp > 0)
--vm_pageout_actcmp;
splx(s);
vm_pageout_page_stats();
continue;
@ -1371,11 +1397,9 @@ vm_pageout()
if (vm_pages_needed)
cnt.v_pdwakeups++;
vm_pages_needed = 0;
splx(s);
vm_pageout_scan();
vm_pageout_deficit = 0;
wakeup(&cnt.v_free_count);
}
}