From 245df27cee60dfce76e5f6f5e133e16e70279af8 Mon Sep 17 00:00:00 2001
From: Matthew Dillon
Date: Fri, 26 Oct 2001 00:08:05 +0000
Subject: [PATCH] Implement kern.maxvnodes. Adjusting kern.maxvnodes now actually has a real effect.

Optimize vfs_msync(). Avoid having to continually drop and re-obtain
mutexes when scanning the vnode list. Improves looping case by 500%.

Optimize ffs_sync(). Avoid having to continually drop and re-obtain
mutexes when scanning the vnode list. This makes a couple of assumptions,
which I believe are ok, in regard to vnode stability when the mount
list mutex is held. Improves looping case by 500%.

(more optimization work is needed on top of these fixes)

MFC after:	1 week
---
 sys/kern/vfs_subr.c      | 112 ++++++++++++++++++++++++++-------------
 sys/sys/vnode.h          |   5 ++
 sys/ufs/ffs/ffs_vfsops.c |  38 +++++++------
 sys/vm/vm_fault.c        |   3 +-
 sys/vm/vm_object.c       |  41 +++++++++++++-
 sys/vm/vm_object.h       |   1 +
 sys/vm/vm_page.c         |   2 +-
 7 files changed, 145 insertions(+), 57 deletions(-)

diff --git a/sys/kern/vfs_subr.c b/sys/kern/vfs_subr.c
index fde8ce2a2699..d690ab356fee 100644
--- a/sys/kern/vfs_subr.c
+++ b/sys/kern/vfs_subr.c
@@ -518,6 +518,49 @@ vattr_null(vap)
 	vap->va_vaflags = 0;
 }
 
+/*
+ * This routine is called when we have too many vnodes. It attempts
+ * to free vnodes and will potentially free vnodes that still
+ * have VM backing store (VM backing store is typically the cause
+ * of a vnode blowout so we want to do this). Therefore, this operation
+ * is not considered cheap.
+ *
+ * A number of conditions may prevent a vnode from being reclaimed.
+ * The buffer cache may have references on the vnode, a directory
+ * vnode may still have references due to the namei cache representing
+ * underlying files, or the vnode may be in active use. It is not
+ * desirable to reuse such vnodes. These conditions may cause the
+ * number of vnodes to reach some minimum value regardless of what
+ * you set kern.maxvnodes to. Do not set kern.maxvnodes too low.
+ */
+static void
+vlrureclaim(struct mount *mp, int count)
+{
+	struct vnode *vp;
+
+	mtx_lock(&mntvnode_mtx);
+	while (count && (vp = TAILQ_FIRST(&mp->mnt_nvnodelist)) != NULL) {
+		TAILQ_REMOVE(&mp->mnt_nvnodelist, vp, v_nmntvnodes);
+		TAILQ_INSERT_TAIL(&mp->mnt_nvnodelist, vp, v_nmntvnodes);
+
+		if (vp->v_type != VNON &&
+		    vp->v_type != VBAD &&
+		    VMIGHTFREE(vp) &&		/* critical path opt */
+		    mtx_trylock(&vp->v_interlock)
+		) {
+			mtx_unlock(&mntvnode_mtx);
+			if (VMIGHTFREE(vp)) {
+				vgonel(vp, curthread);
+			} else {
+				mtx_unlock(&vp->v_interlock);
+			}
+			mtx_lock(&mntvnode_mtx);
+		}
+		--count;
+	}
+	mtx_unlock(&mntvnode_mtx);
+}
+
 /*
  * Routines having to do with the management of the vnode table.
  */
@@ -532,25 +575,33 @@ getnewvnode(tag, mp, vops, vpp)
 	vop_t **vops;
 	struct vnode **vpp;
 {
-	int s, count;
+	int s;
 	struct thread *td = curthread;	/* XXX */
 	struct vnode *vp = NULL;
 	struct mount *vnmp;
 	vm_object_t object;
 
+	s = splbio();
 	/*
-	 * We take the least recently used vnode from the freelist
-	 * if we can get it and it has no cached pages, and no
-	 * namecache entries are relative to it.
-	 * Otherwise we allocate a new vnode
+	 * Try to reuse vnodes if we hit the max. This situation only
+	 * occurs in certain large-memory (2G+) situations. For the
+	 * algorithm to be stable we have to try to reuse at least 2.
+	 * No hysteresis should be necessary.
+	 */
+	if (numvnodes - freevnodes > desiredvnodes)
+		vlrureclaim(mp, 2);
+
+	/*
+	 * Attempt to reuse a vnode already on the free list, allocating
+	 * a new vnode if we can't find one or if we have not reached a
+	 * good minimum for good LRU performance.
 	 */
-	s = splbio();
 	mtx_lock(&vnode_free_list_mtx);
-	if (freevnodes < wantfreevnodes) {
-		vp = NULL;
-	} else if (numvnodes >= minvnodes) {
+	if (freevnodes >= wantfreevnodes && numvnodes >= minvnodes) {
+		int count;
+
 		for (count = 0; count < freevnodes; count++) {
 			vp = TAILQ_FIRST(&vnode_free_list);
 			if (vp == NULL || vp->v_usecount)
@@ -2408,22 +2459,20 @@ vfs_msync(struct mount *mp, int flags)
 {
 	struct vnode *vp, *nvp;
 	struct vm_object *obj;
-	int anyio, tries;
+	int tries;
 
 	GIANT_REQUIRED;
 
 	tries = 5;
-loop:
-	anyio = 0;
 	mtx_lock(&mntvnode_mtx);
+loop:
 	for (vp = TAILQ_FIRST(&mp->mnt_nvnodelist); vp != NULL; vp = nvp) {
-
-		nvp = TAILQ_NEXT(vp, v_nmntvnodes);
-
 		if (vp->v_mount != mp) {
-			mtx_unlock(&mntvnode_mtx);
-			goto loop;
+			if (--tries > 0)
+				goto loop;
+			break;
 		}
+		nvp = TAILQ_NEXT(vp, v_nmntvnodes);
 		if (vp->v_flag & VXLOCK)	/* XXX: what if MNT_WAIT? */
 			continue;
 
@@ -2431,36 +2480,27 @@ vfs_msync(struct mount *mp, int flags)
 		if (vp->v_flag & VNOSYNC)	/* unlinked, skip it */
 			continue;
 
-		if (flags != MNT_WAIT) {
-			if (VOP_GETVOBJECT(vp, &obj) != 0 ||
-			    (obj->flags & OBJ_MIGHTBEDIRTY) == 0)
-				continue;
-			if (VOP_ISLOCKED(vp, NULL))
-				continue;
-		}
-
-		mtx_unlock(&mntvnode_mtx);
-		mtx_lock(&vp->v_interlock);
-		if (VOP_GETVOBJECT(vp, &obj) == 0 &&
-		    (obj->flags & OBJ_MIGHTBEDIRTY)) {
+		if ((vp->v_flag & VOBJDIRTY) &&
+		    (flags == MNT_WAIT || VOP_ISLOCKED(vp, NULL) == 0)) {
+			mtx_unlock(&mntvnode_mtx);
 			if (!vget(vp,
-    LK_INTERLOCK | LK_EXCLUSIVE | LK_RETRY | LK_NOOBJ, curthread)) {
+			    LK_EXCLUSIVE | LK_RETRY | LK_NOOBJ, curthread)) {
 				if (VOP_GETVOBJECT(vp, &obj) == 0) {
 					vm_object_page_clean(obj, 0, 0,
 					    flags == MNT_WAIT ?
 					    OBJPC_SYNC : OBJPC_NOSYNC);
-					anyio = 1;
 				}
 				vput(vp);
 			}
-		} else {
-			mtx_unlock(&vp->v_interlock);
+			mtx_lock(&mntvnode_mtx);
+			if (TAILQ_NEXT(vp, v_nmntvnodes) != nvp) {
+				if (--tries > 0)
+					goto loop;
+				break;
+			}
 		}
-		mtx_lock(&mntvnode_mtx);
 	}
 	mtx_unlock(&mntvnode_mtx);
-	if (anyio && (--tries > 0))
-		goto loop;
 }
 
 /*
diff --git a/sys/sys/vnode.h b/sys/sys/vnode.h
index 5986579a7875..0d78fcbab6f8 100644
--- a/sys/sys/vnode.h
+++ b/sys/sys/vnode.h
@@ -175,6 +175,7 @@ struct vnode {
 /*	open for business	0x100000 */
 #define	VONWORKLST	0x200000	/* On syncer work-list */
 #define	VMOUNT		0x400000	/* Mount in progress */
+#define	VOBJDIRTY	0x800000	/* object might be dirty */
 
 /*
  * Vnode attributes. A field value of VNOVAL represents a field whose value
@@ -311,6 +312,10 @@ extern void	(*lease_updatetime) __P((int deltat));
 	(!(vp)->v_object || \
 	 !((vp)->v_object->ref_count || (vp)->v_object->resident_page_count)))
 
+#define VMIGHTFREE(vp) \
+	(!((vp)->v_flag & (VFREE|VDOOMED)) && \
+	 !(vp)->v_holdcnt && !(vp)->v_usecount)
+
 #define VSHOULDBUSY(vp)	\
 	(((vp)->v_flag & VFREE) && \
 	 ((vp)->v_holdcnt || (vp)->v_usecount))
diff --git a/sys/ufs/ffs/ffs_vfsops.c b/sys/ufs/ffs/ffs_vfsops.c
index ad4c24d2f9cd..d080fbb99a63 100644
--- a/sys/ufs/ffs/ffs_vfsops.c
+++ b/sys/ufs/ffs/ffs_vfsops.c
@@ -1001,10 +1001,10 @@ ffs_sync(mp, waitfor, cred, td)
 	 * Write back each (modified) inode.
 	 */
 	wait = 0;
-	lockreq = LK_EXCLUSIVE | LK_NOWAIT | LK_INTERLOCK;
+	lockreq = LK_EXCLUSIVE | LK_NOWAIT;
 	if (waitfor == MNT_WAIT) {
 		wait = 1;
-		lockreq = LK_EXCLUSIVE | LK_INTERLOCK;
+		lockreq = LK_EXCLUSIVE;
 	}
 	mtx_lock(&mntvnode_mtx);
 loop:
@@ -1015,34 +1015,40 @@ ffs_sync(mp, waitfor, cred, td)
 		 */
 		if (vp->v_mount != mp)
 			goto loop;
-		nvp = TAILQ_NEXT(vp, v_nmntvnodes);
-		mtx_unlock(&mntvnode_mtx);
-		mtx_lock(&vp->v_interlock);
+		/*
+		 * Depend on the mntvnode_slock to keep things stable enough
+		 * for a quick test. Since there might be hundreds of
+		 * thousands of vnodes, we cannot afford even a subroutine
+		 * call unless there's a good chance that we have work to do.
+		 */
+		nvp = TAILQ_NEXT(vp, v_nmntvnodes);
 		ip = VTOI(vp);
 		if (vp->v_type == VNON || ((ip->i_flag &
-		(IN_ACCESS | IN_CHANGE | IN_MODIFIED | IN_UPDATE)) == 0 &&
-		TAILQ_EMPTY(&vp->v_dirtyblkhd))) {
-			mtx_unlock(&vp->v_interlock);
-			mtx_lock(&mntvnode_mtx);
+		    (IN_ACCESS | IN_CHANGE | IN_MODIFIED | IN_UPDATE)) == 0 &&
+		    TAILQ_EMPTY(&vp->v_dirtyblkhd))) {
 			continue;
 		}
 		if (vp->v_type != VCHR) {
+			mtx_unlock(&mntvnode_mtx);
 			if ((error = vget(vp, lockreq, td)) != 0) {
 				mtx_lock(&mntvnode_mtx);
 				if (error == ENOENT)
 					goto loop;
-				continue;
+			} else {
+				if ((error = VOP_FSYNC(vp, cred, waitfor, td)) != 0)
+					allerror = error;
+				VOP_UNLOCK(vp, 0, td);
+				vrele(vp);
+				mtx_lock(&mntvnode_mtx);
 			}
-			if ((error = VOP_FSYNC(vp, cred, waitfor, td)) != 0)
-				allerror = error;
-			VOP_UNLOCK(vp, 0, td);
-			vrele(vp);
 		} else {
-			mtx_unlock(&vp->v_interlock);
+			mtx_unlock(&mntvnode_mtx);
 			UFS_UPDATE(vp, wait);
+			mtx_lock(&mntvnode_mtx);
 		}
-		mtx_lock(&mntvnode_mtx);
+		if (TAILQ_NEXT(vp, v_nmntvnodes) != nvp)
+			goto loop;
 	}
 	mtx_unlock(&mntvnode_mtx);
 	/*
diff --git a/sys/vm/vm_fault.c b/sys/vm/vm_fault.c
index 350735387142..8814ae5ae2a9 100644
--- a/sys/vm/vm_fault.c
+++ b/sys/vm/vm_fault.c
@@ -806,8 +806,7 @@ RetryFault:;
 
 	if (prot & VM_PROT_WRITE) {
 		vm_page_flag_set(fs.m, PG_WRITEABLE);
-		vm_object_set_flag(fs.m->object,
-		    OBJ_WRITEABLE|OBJ_MIGHTBEDIRTY);
+		vm_object_set_writeable_dirty(fs.m->object);
 
 		/*
 		 * If the fault is a write, we know that this page is being
diff --git a/sys/vm/vm_object.c b/sys/vm/vm_object.c
index 1d5a9892fbc2..b7613ebadd88 100644
--- a/sys/vm/vm_object.c
+++ b/sys/vm/vm_object.c
@@ -321,8 +321,11 @@ vm_object_reference(vm_object_t object)
 	if (object == NULL)
 		return;
 
+#if 0
+	/* object can be re-referenced during final cleaning */
 	KASSERT(!(object->flags & OBJ_DEAD),
 	    ("vm_object_reference: attempting to reference dead obj"));
+#endif
 
 	object->ref_count++;
 	if (object->type == OBJT_VNODE) {
@@ -454,8 +457,13 @@ vm_object_deallocate(vm_object_t object)
 				temp->generation++;
 				object->backing_object = NULL;
 			}
-			vm_object_terminate(object);
-			/* unlocks and deallocates object */
+			/*
+			 * Don't double-terminate, we could be in a termination
+			 * recursion due to the terminate having to sync data
+			 * to disk.
+			 */
+			if ((object->flags & OBJ_DEAD) == 0)
+				vm_object_terminate(object);
 			object = temp;
 		}
 	}
@@ -627,7 +635,17 @@ vm_object_page_clean(vm_object_t object, vm_pindex_t start, vm_pindex_t end, int
 	}
 
 	if (clearobjflags && (tstart == 0) && (tend == object->size)) {
+		struct vnode *vp;
+
 		vm_object_clear_flag(object, OBJ_WRITEABLE|OBJ_MIGHTBEDIRTY);
+		if (object->type == OBJT_VNODE &&
+		    (vp = (struct vnode *)object->handle) != NULL) {
+			if (vp->v_flag & VOBJDIRTY) {
+				mtx_lock(&vp->v_interlock);
+				vp->v_flag &= ~VOBJDIRTY;
+				mtx_unlock(&vp->v_interlock);
+			}
+		}
 	}
 
 rescan:
@@ -1357,6 +1375,8 @@ vm_object_collapse(vm_object_t object)
 			 * and no object references within it, all that is
 			 * necessary is to dispose of it.
 			 */
+			KASSERT(backing_object->ref_count == 1, ("backing_object %p was somehow re-referenced during collapse!", backing_object));
+			KASSERT(TAILQ_FIRST(&backing_object->memq) == NULL, ("backing_object %p somehow has left over pages during collapse!", backing_object));
 
 			TAILQ_REMOVE(
 			    &vm_object_list,
@@ -1684,6 +1704,23 @@ vm_object_in_map(vm_object_t object)
 	return 0;
 }
 
+void
+vm_object_set_writeable_dirty(vm_object_t object)
+{
+	struct vnode *vp;
+
+	vm_object_set_flag(object, OBJ_WRITEABLE|OBJ_MIGHTBEDIRTY);
+	if (object->type == OBJT_VNODE &&
+	    (vp = (struct vnode *)object->handle) != NULL) {
+		if ((vp->v_flag & VOBJDIRTY) == 0) {
+			mtx_lock(&vp->v_interlock);
+			vp->v_flag |= VOBJDIRTY;
+			mtx_unlock(&vp->v_interlock);
+		}
+	}
+}
+
+
 DB_SHOW_COMMAND(vmochk, vm_object_check)
 {
 	vm_object_t object;
diff --git a/sys/vm/vm_object.h b/sys/vm/vm_object.h
index c126cb6ecd42..1256e850c063 100644
--- a/sys/vm/vm_object.h
+++ b/sys/vm/vm_object.h
@@ -184,6 +184,7 @@ void vm_object_collapse (vm_object_t);
 void vm_object_deallocate (vm_object_t);
 void vm_object_terminate (vm_object_t);
 void vm_object_vndeallocate (vm_object_t);
+void vm_object_set_writeable_dirty (vm_object_t);
 void vm_object_init (void);
 void vm_object_page_clean (vm_object_t, vm_pindex_t, vm_pindex_t, boolean_t);
 void vm_object_page_remove (vm_object_t, vm_pindex_t, vm_pindex_t, boolean_t);
diff --git a/sys/vm/vm_page.c b/sys/vm/vm_page.c
index 61d821ff3d63..0eb06fc06cef 100644
--- a/sys/vm/vm_page.c
+++ b/sys/vm/vm_page.c
@@ -609,7 +609,7 @@ vm_page_insert(vm_page_t m, vm_object_t object, vm_pindex_t pindex)
 	 * update the object's OBJ_WRITEABLE and OBJ_MIGHTBEDIRTY flags.
 	 */
 	if (m->flags & PG_WRITEABLE)
-		vm_object_set_flag(object, OBJ_WRITEABLE|OBJ_MIGHTBEDIRTY);
+		vm_object_set_writeable_dirty(object);
 }
 
 /*
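
Note on the locking pattern used above: the vfs_msync() and ffs_sync() rewrites keep mntvnode_mtx held across the whole scan, record the successor of the current vnode, drop the mutex only when there is real work to do, and after re-locking compare TAILQ_NEXT() against the recorded successor, restarting the scan a bounded number of times if the list changed underneath. The standalone userland sketch below models that pattern with pthreads and sys/queue.h; it is illustrative only, and the names (node, nodelist, list_mtx, process_node, scan_list) are invented rather than taken from the kernel sources.

/*
 * scan_revalidate.c -- illustrative sketch of the bounded-retry list scan
 * used by this patch in vfs_msync()/ffs_sync(), modeled in userland.
 * Build: cc -o scan_revalidate scan_revalidate.c -lpthread
 */
#include <sys/queue.h>
#include <pthread.h>
#include <stdio.h>

struct node {
	int id;
	int dirty;
	TAILQ_ENTRY(node) entries;
};

static TAILQ_HEAD(, node) nodelist = TAILQ_HEAD_INITIALIZER(nodelist);
static pthread_mutex_t list_mtx = PTHREAD_MUTEX_INITIALIZER;

/* Expensive per-node work, done without the list lock held. */
static void
process_node(struct node *np)
{
	printf("flushing node %d\n", np->id);
	np->dirty = 0;
}

static void
scan_list(void)
{
	struct node *np, *nnp;
	int tries = 5;

	pthread_mutex_lock(&list_mtx);
loop:
	for (np = TAILQ_FIRST(&nodelist); np != NULL; np = nnp) {
		nnp = TAILQ_NEXT(np, entries);	/* remember the successor */
		if (!np->dirty)
			continue;		/* cheap test under the lock */
		pthread_mutex_unlock(&list_mtx);
		process_node(np);
		pthread_mutex_lock(&list_mtx);
		/*
		 * If the successor changed while the lock was dropped,
		 * the list was modified under us; restart a bounded
		 * number of times rather than chase a stale pointer.
		 */
		if (TAILQ_NEXT(np, entries) != nnp) {
			if (--tries > 0)
				goto loop;
			break;
		}
	}
	pthread_mutex_unlock(&list_mtx);
}

int
main(void)
{
	struct node nodes[3] = { { 0, 1 }, { 1, 0 }, { 2, 1 } };
	int i;

	for (i = 0; i < 3; i++)
		TAILQ_INSERT_TAIL(&nodelist, &nodes[i], entries);
	scan_list();
	return (0);
}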
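
Note on tuning: with this change kern.maxvnodes actually bounds the number of vnodes, subject to the floor described in the vlrureclaim() comment. The knob can be read or set with sysctl(8), e.g. "sysctl kern.maxvnodes=100000" (the value is only an example), or programmatically via sysctlbyname(3). The sketch below is not part of the patch; the file name and command-line interface are made up for illustration.

/*
 * maxvnodes_tune.c -- illustrative userland sketch: read kern.maxvnodes
 * and optionally set a new value (setting requires root).
 * Build: cc -o maxvnodes_tune maxvnodes_tune.c
 */
#include <sys/types.h>
#include <sys/sysctl.h>
#include <stdio.h>
#include <stdlib.h>

int
main(int argc, char **argv)
{
	int maxvnodes;
	size_t len = sizeof(maxvnodes);

	/* Read the current limit. */
	if (sysctlbyname("kern.maxvnodes", &maxvnodes, &len, NULL, 0) == -1) {
		perror("sysctlbyname(kern.maxvnodes)");
		return (1);
	}
	printf("kern.maxvnodes = %d\n", maxvnodes);

	/* Optionally set a new limit given as the first argument. */
	if (argc > 1) {
		int newmax = atoi(argv[1]);

		if (sysctlbyname("kern.maxvnodes", NULL, NULL,
		    &newmax, sizeof(newmax)) == -1) {
			perror("sysctlbyname(set kern.maxvnodes)");
			return (1);
		}
		printf("kern.maxvnodes set to %d\n", newmax);
	}
	return (0);
}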