From 50d99cdfecd92f5323a18aa791a5b1cb9d8b7191 Mon Sep 17 00:00:00 2001
From: semenu
Date: Thu, 13 Jun 2002 21:49:09 +0000
Subject: [PATCH] Fix a race during null node creation between re-looking up
 the hash and adding the vnode to the hash. The fix is to use an atomic
 hash-lookup-and-add-if-not-found operation (a sketch of this pattern follows
 the diff). The odd thing is that this race cannot actually happen at the
 moment, because the lowervp vnode is now locked exclusively during the whole
 process of null node creation; this should be thought of as a step toward
 shared lookups.

Also remove the vp->v_mount checks when looking for a match in the hash, as
they are a vestige. Also add comments and make cosmetic changes.
---
 sys/fs/nullfs/null.h        |   2 +-
 sys/fs/nullfs/null_subr.c   | 180 +++++++++++++++++-------------------
 sys/fs/nullfs/null_vfsops.c |   6 +-
 sys/fs/nullfs/null_vnops.c  |  19 ++--
 4 files changed, 102 insertions(+), 105 deletions(-)

diff --git a/sys/fs/nullfs/null.h b/sys/fs/nullfs/null.h
index 718deaa05bbd..3cb3716e9adb 100644
--- a/sys/fs/nullfs/null.h
+++ b/sys/fs/nullfs/null.h
@@ -59,7 +59,7 @@ struct null_node {
 
 int nullfs_init(struct vfsconf *vfsp);
 int nullfs_uninit(struct vfsconf *vfsp);
-int null_node_create(struct mount *mp, struct vnode *target, struct vnode **vpp);
+int null_nodeget(struct mount *mp, struct vnode *target, struct vnode **vpp);
 void null_hashrem(struct null_node *xp);
 
 int null_bypass(struct vop_generic_args *ap);
diff --git a/sys/fs/nullfs/null_subr.c b/sys/fs/nullfs/null_subr.c
index db1bdf5784c7..0de6c1f4d0ca 100644
--- a/sys/fs/nullfs/null_subr.c
+++ b/sys/fs/nullfs/null_subr.c
@@ -71,10 +71,8 @@ struct mtx null_hashmtx;
 static MALLOC_DEFINE(M_NULLFSHASH, "NULLFS hash", "NULLFS hash table");
 MALLOC_DEFINE(M_NULLFSNODE, "NULLFS node", "NULLFS vnode private part");
 
-static int null_node_alloc(struct mount *mp, struct vnode *lowervp,
-    struct vnode **vpp);
-static struct vnode *
-    null_node_find(struct mount *mp, struct vnode *lowervp);
+static struct vnode * null_hashget(struct vnode *);
+static struct vnode * null_hashins(struct null_node *);
 
 /*
  * Initialise cache headers
@@ -105,8 +103,7 @@ nullfs_uninit(vfsp)
  * Lower vnode should be locked on entry and will be left locked on exit.
  */
 static struct vnode *
-null_node_find(mp, lowervp)
-	struct mount *mp;
+null_hashget(lowervp)
 	struct vnode *lowervp;
 {
 	struct thread *td = curthread;	/* XXX */
@@ -124,7 +121,7 @@ null_node_find(mp, lowervp)
 loop:
 	mtx_lock(&null_hashmtx);
 	LIST_FOREACH(a, hd, null_hash) {
-		if (a->null_lowervp == lowervp && NULLTOV(a)->v_mount == mp) {
+		if (a->null_lowervp == lowervp) {
 			vp = NULLTOV(a);
 			mtx_lock(&vp->v_interlock);
 			mtx_unlock(&null_hashmtx);
@@ -140,28 +137,75 @@ null_node_find(mp, lowervp)
 		}
 	}
 	mtx_unlock(&null_hashmtx);
-
-	return NULLVP;
+	return (NULLVP);
 }
 
+/*
+ * Act like null_hashget, but add the passed null_node to the hash if no
+ * existing node is found.
+ */
+static struct vnode *
+null_hashins(xp)
+	struct null_node *xp;
+{
+	struct thread *td = curthread;	/* XXX */
+	struct null_node_hashhead *hd;
+	struct null_node *oxp;
+	struct vnode *ovp;
+
+	hd = NULL_NHASH(xp->null_lowervp);
+loop:
+	mtx_lock(&null_hashmtx);
+	LIST_FOREACH(oxp, hd, null_hash) {
+		if (oxp->null_lowervp == xp->null_lowervp) {
+			ovp = NULLTOV(oxp);
+			mtx_lock(&ovp->v_interlock);
+			mtx_unlock(&null_hashmtx);
+			if (vget(ovp, LK_EXCLUSIVE | LK_THISLAYER | LK_INTERLOCK, td))
+				goto loop;
+
+			return (ovp);
+		}
+	}
+	LIST_INSERT_HEAD(hd, xp, null_hash);
+	mtx_unlock(&null_hashmtx);
+	return (NULLVP);
+}
 
 /*
- * Make a new null_node node.
- * Vp is the alias vnode, lofsvp is the lower vnode.
- * Maintain a reference to (lowervp).
+ * Make a new nullfs node, or get an existing one.
+ * Vp is the alias vnode, lowervp is the lower vnode.
+ *
+ * The lowervp is assumed to be locked and to have a "spare" reference.
+ * This routine vrele()s lowervp if the nullfs node was taken from the hash;
+ * otherwise it "transfers" the caller's "spare" reference to the created nullfs vnode.
 */
-static int
-null_node_alloc(mp, lowervp, vpp)
+int
+null_nodeget(mp, lowervp, vpp)
 	struct mount *mp;
 	struct vnode *lowervp;
 	struct vnode **vpp;
 {
 	struct thread *td = curthread;	/* XXX */
-	struct null_node_hashhead *hd;
 	struct null_node *xp;
-	struct vnode *othervp, *vp;
+	struct vnode *vp;
 	int error;
 
+	/* Look in the hash first. */
+	*vpp = null_hashget(lowervp);
+	if (*vpp != NULL) {
+		vrele(lowervp);
+		return (0);
+	}
+
+	/*
+	 * We do not serialize vnode creation; instead we will check for
+	 * duplicates later, when adding the new vnode to the hash.
+	 *
+	 * Note that a duplicate can only appear in the hash if the lowervp
+	 * is locked LK_SHARED.
+	 */
+
 	/*
 	 * Do the MALLOC before the getnewvnode since doing so afterward
 	 * might cause a bogus v_data pointer to get dereferenced
@@ -170,12 +214,11 @@
 	MALLOC(xp, struct null_node *, sizeof(struct null_node),
 	    M_NULLFSNODE, M_WAITOK);
 
-	error = getnewvnode(VT_NULL, mp, null_vnodeop_p, vpp);
+	error = getnewvnode(VT_NULL, mp, null_vnodeop_p, &vp);
 	if (error) {
 		FREE(xp, M_NULLFSNODE);
 		return (error);
 	}
-	vp = *vpp;
 
 	xp->null_vnode = vp;
 	xp->null_lowervp = lowervp;
@@ -186,19 +229,6 @@
 	/* Though v_lock is inited by getnewvnode(), we want our own wmesg */
 	lockinit(&vp->v_lock, PVFS, "nunode", VLKTIMEOUT, LK_NOPAUSE);
 
-	/*
-	 * Before we insert our new node onto the hash chains,
-	 * check to see if someone else has beaten us to it.
-	 * (We could have slept in MALLOC.)
-	 */
-	othervp = null_node_find(mp, lowervp);
-	if (othervp) {
-		xp->null_lowervp = NULL;
-		vrele(vp);
-		*vpp = othervp;
-		return 0;
-	};
-
 	/*
 	 * From NetBSD:
 	 * Now lock the new node. We rely on the fact that we were passed
@@ -211,78 +241,38 @@
 	vp->v_vnlock = lowervp->v_vnlock;
 	error = VOP_LOCK(vp, LK_EXCLUSIVE | LK_THISLAYER, td);
 	if (error)
-		panic("null_node_alloc: can't lock new vnode\n");
+		panic("null_nodeget: can't lock new vnode\n");
 
-	VREF(lowervp);
-	hd = NULL_NHASH(lowervp);
-	mtx_lock(&null_hashmtx);
-	LIST_INSERT_HEAD(hd, xp, null_hash);
-	mtx_unlock(&null_hashmtx);
-	return 0;
-}
-
-
-/*
- * Try to find an existing null_node vnode refering to the given underlying
- * vnode (which should be locked). If no vnode found, create a new null_node
- * vnode which contains a reference to the lower vnode.
- */
-int
-null_node_create(mp, lowervp, newvpp)
-	struct mount *mp;
-	struct vnode *lowervp;
-	struct vnode **newvpp;
-{
-	struct vnode *aliasvp;
-
-	aliasvp = null_node_find(mp, lowervp);
-	if (aliasvp) {
-		/*
-		 * null_node_find has taken another reference
-		 * to the alias vnode.
-		 */
-		vrele(lowervp);
-#ifdef NULLFS_DEBUG
-		vprint("null_node_create: exists", aliasvp);
-#endif
-	} else {
-		int error;
-
-		/*
-		 * Get new vnode.
-		 */
-		NULLFSDEBUG("null_node_create: create new alias vnode\n");
-
-		/*
-		 * Make new vnode reference the null_node.
-		 */
-		error = null_node_alloc(mp, lowervp, &aliasvp);
-		if (error)
-			return error;
-
-		/*
-		 * aliasvp is already VREF'd by getnewvnode()
-		 */
+	/*
+	 * Atomically insert our new node into the hash, or vget the existing
+	 * vnode if someone else has beaten us to it.
+	 */
+	*vpp = null_hashins(xp);
+	if (*vpp != NULL) {
+		vrele(lowervp);
+		VOP_UNLOCK(vp, LK_THISLAYER, td);
+		vp->v_vnlock = NULL;
+		xp->null_lowervp = NULL;
+		vrele(vp);
+		return (0);
 	}
-#ifdef DIAGNOSTIC
-	if (lowervp->v_usecount < 1) {
-		/* Should never happen... */
-		vprint ("null_node_create: alias ", aliasvp);
-		vprint ("null_node_create: lower ", lowervp);
-		panic ("null_node_create: lower has 0 usecount.");
-	};
-#endif
 
+	/*
+	 * XXX We take an extra vref just to work around UFS's XXX:
+	 * UFS can vrele() the vnode in VOP_CLOSE() in some cases. Luckily,
+	 * this can only happen if v_usecount == 1. To work around it, we
+	 * just don't let v_usecount be 1; it will be 2 or more.
+	 */
+	VREF(lowervp);
 
-#ifdef NULLFS_DEBUG
-	vprint("null_node_create: alias", aliasvp);
-	vprint("null_node_create: lower", lowervp);
-#endif
+	*vpp = vp;
 
-	*newvpp = aliasvp;
 	return (0);
 }
 
+/*
+ * Remove the node from the hash.
+ */
 void
 null_hashrem(xp)
 	struct null_node *xp;
diff --git a/sys/fs/nullfs/null_vfsops.c b/sys/fs/nullfs/null_vfsops.c
index a9ec22e45bfa..a218bf08ba6a 100644
--- a/sys/fs/nullfs/null_vfsops.c
+++ b/sys/fs/nullfs/null_vfsops.c
@@ -169,7 +169,7 @@ nullfs_mount(mp, ndp, td)
 	 * Save reference. Each mount also holds
 	 * a reference on the root vnode.
 	 */
-	error = null_node_create(mp, lowerrootvp, &vp);
+	error = null_nodeget(mp, lowerrootvp, &vp);
 	/*
 	 * Unlock the node (either the lower or the alias)
 	 */
@@ -354,7 +354,7 @@ nullfs_vget(mp, ino, flags, vpp)
 	if (error)
 		return (error);
 
-	return (null_node_create(mp, *vpp, vpp));
+	return (null_nodeget(mp, *vpp, vpp));
 }
 
 static int
@@ -368,7 +368,7 @@ nullfs_fhtovp(mp, fidp, vpp)
 	if (error)
 		return (error);
 
-	return (null_node_create(mp, *vpp, vpp));
+	return (null_nodeget(mp, *vpp, vpp));
 }
 
 static int
diff --git a/sys/fs/nullfs/null_vnops.c b/sys/fs/nullfs/null_vnops.c
index f3201e254021..2b141be83732 100644
--- a/sys/fs/nullfs/null_vnops.c
+++ b/sys/fs/nullfs/null_vnops.c
@@ -346,7 +346,7 @@ null_bypass(ap)
 		vppp = VOPARG_OFFSETTO(struct vnode***,
 				 descp->vdesc_vpp_offset,ap);
 		if (*vppp)
-			error = null_node_create(old_vps[0]->v_mount, **vppp, *vppp);
+			error = null_nodeget(old_vps[0]->v_mount, **vppp, *vppp);
 	}
 
  out:
@@ -400,9 +400,12 @@ null_lookup(ap)
 			VREF(dvp);
 			vrele(lvp);
 		} else {
-			error = null_node_create(dvp->v_mount, lvp, &vp);
-			if (error == 0)
-				*ap->a_vpp = vp;
+			error = null_nodeget(dvp->v_mount, lvp, &vp);
+			if (error) {
+				/* XXX Cleanup needed... */
+				panic("null_nodeget failed");
+			}
+			*ap->a_vpp = vp;
 		}
 	}
 	return (error);
@@ -706,6 +709,11 @@ null_islocked(ap)
  * There is no way to tell that someone issued remove/rmdir operation
  * on the underlying filesystem. For now we just have to release lowevrp
  * as soon as possible.
+ *
+ * Note, we can't release any resources nor remove the vnode from the hash
+ * before the appropriate VXLOCK stuff is done, because another process can
+ * find this vnode in the hash during inactivation and may be sitting in
+ * vget(), waiting for null_inactive to unlock it. Thus we do it all in VOP_RECLAIM.
 */
static int
null_inactive(ap)
@@ -729,8 +737,7 @@ null_inactive(ap)
 }
 
 /*
- * We can free memory in null_inactive, but we do this
- * here. (Possible to guard vp->v_data to point somewhere)
+ * Now, the VXLOCK is in force and we're free to destroy the null vnode.
 */
static int
null_reclaim(ap)
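
The locking pattern that null_hashins() implements, searching the chain and
inserting the new node under one continuously held mutex, is what makes the
lookup-and-add atomic: no second creator can slip in between the failed lookup
and the insertion. The minimal user-space sketch below illustrates the idea;
the names node, hashtab, hash_insert, and NHASH are hypothetical stand-ins for
the kernel's null_node, hash chains, and null_hashins, and the kernel version
must additionally drop the mutex and vget() the found vnode, retrying the
whole lookup if vget() fails.

	/*
	 * Minimal user-space sketch of the atomic lookup-and-add-if-not-found
	 * pattern, assuming a pthread mutex in place of null_hashmtx.
	 * All names here are illustrative, not kernel APIs.
	 */
	#include <pthread.h>
	#include <stdint.h>
	#include <stdio.h>
	#include <stdlib.h>

	#define NHASH	64
	#define HASH(k)	((k) % NHASH)

	struct node {
		uintptr_t	key;		/* stands in for null_lowervp */
		struct node	*next;		/* hash chain, like null_hash */
	};

	static struct node *hashtab[NHASH];
	static pthread_mutex_t hashmtx = PTHREAD_MUTEX_INITIALIZER;

	/*
	 * Insert xp unless a node with the same key already exists.
	 * Return NULL if xp was inserted, or the pre-existing node
	 * (the analogue of null_hashins returning a vget'ed vnode).
	 */
	static struct node *
	hash_insert(struct node *xp)
	{
		struct node *oxp;

		pthread_mutex_lock(&hashmtx);
		for (oxp = hashtab[HASH(xp->key)]; oxp != NULL; oxp = oxp->next) {
			if (oxp->key == xp->key) {
				/* Lost the race: someone else inserted first. */
				pthread_mutex_unlock(&hashmtx);
				return (oxp);
			}
		}
		/* Not found: add while the lock is still held. */
		xp->next = hashtab[HASH(xp->key)];
		hashtab[HASH(xp->key)] = xp;
		pthread_mutex_unlock(&hashmtx);
		return (NULL);
	}

	int
	main(void)
	{
		struct node *a = calloc(1, sizeof(*a));
		struct node *b = calloc(1, sizeof(*b));

		a->key = b->key = 42;
		printf("first:  %s\n", hash_insert(a) == NULL ? "added" : "duplicate");
		printf("second: %s\n", hash_insert(b) == NULL ? "added" : "duplicate");
		return (0);
	}

Contrast this with the old null_node_alloc(), which dropped the hash mutex
after its null_node_find() check and re-took it for LIST_INSERT_HEAD(): that
is exactly the window the commit message describes, harmless today only
because lowervp's exclusive lock already serializes creators.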
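
The "spare" reference contract described in null_nodeget()'s new header
comment can also be modeled with plain reference counts. The toy below is
only a model under simplifying assumptions: nodeget, struct obj, and struct
alias are made-up names, a one-slot cache stands in for the hash, and the
extra VREF taken for the UFS workaround is omitted. It shows that both paths
leave the accounting balanced: a hash hit releases the caller's spare
reference, while a miss transfers it to the newly created alias.

	/*
	 * Toy model of null_nodeget()'s reference contract, assuming simple
	 * integer refcounts in place of v_usecount. All names hypothetical.
	 */
	#include <stdio.h>

	struct obj { int refs; };		/* stands in for the lower vnode */
	struct alias { struct obj *lower; };	/* stands in for the null node  */

	static struct alias cached;		/* pretend one-slot "hash" */
	static int have_cached;

	/* The caller passes in one "spare" reference on lower. */
	static struct alias *
	nodeget(struct obj *lower)
	{
		if (have_cached && cached.lower == lower) {
			lower->refs--;		/* hash hit: vrele() the spare ref */
			return (&cached);
		}
		cached.lower = lower;		/* miss: the new alias keeps the ref */
		have_cached = 1;
		return (&cached);
	}

	int
	main(void)
	{
		struct obj lower = { 1 };	/* caller holds one spare reference */

		nodeget(&lower);		/* miss: ref transferred to alias */
		printf("after miss: refs = %d\n", lower.refs);	/* prints 1 */

		lower.refs++;			/* caller takes a new spare ref */
		nodeget(&lower);		/* hit: spare ref is released */
		printf("after hit:  refs = %d\n", lower.refs);	/* prints 1 */
		return (0);
	}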