zfs: overhaul zfs-vfs glue for vnode life-cycle management

* There is no need for the delayed destruction of znodes via taskqueue
now that we do not need to fear recursion from getnewvnode into
zfs_inactive and zfs_freebsd_reclaim.  This makes the znode/vnode state
machine a bit simpler.

* More complete porting of zfs_inactive from the Solaris VFS model to
the FreeBSD vop_inactive and vop_reclaim model.  All destructive actions
are done in zfs_freebsd_reclaim.
This allows the zfs_zget logic to be simplified.

* Allow zfs_zget to return a doomed vnode if the current thread already
has an exclusive lock on the vnode.

* Clean up Solaris-isms like bailing out of reclaim/inactive on certain
values of v_usecount (aka v_count) or directly messing with this counter.

* Do not clear z_vnode while the znode is still accessible.
z_vnode should be cleared only after zfs_znode_dmu_fini; otherwise
zfs_zget may get an effectively half-deconstructed znode.
This allows the zfs_zget logic to be simplified further (see the
life-cycle sketch below).
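
For reference, a condensed sketch of the resulting zfs_freebsd_reclaim
(asserts and the old-style argument declaration elided; the full change
is in the diff below):

	static int
	zfs_freebsd_reclaim(struct vop_reclaim_args *ap)
	{
		vnode_t *vp = ap->a_vp;
		znode_t *zp = VTOZ(vp);
		zfsvfs_t *zfsvfs = zp->z_zfsvfs;

		/* Destroy the vm object and flush associated pages. */
		vnode_destroy_vobject(vp);

		/*
		 * z_teardown_inactive_lock protects against a race with
		 * zfs_znode_dmu_fini in zfsvfs_teardown during a force
		 * unmount.
		 */
		rw_enter(&zfsvfs->z_teardown_inactive_lock, RW_READER);
		if (zp->z_sa_hdl == NULL)
			zfs_znode_free(zp);	/* clears z_vnode last */
		else
			zfs_zinactive(zp);	/* dmu_fini, then znode_free */
		rw_exit(&zfsvfs->z_teardown_inactive_lock);

		vp->v_data = NULL;
		return (0);
	}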

The above changes fix at least two known/reported problems:

o An indefinite wait in the following code path:
vgone -> VOP_RECLAIM -> zfs_freebsd_reclaim -> vnode_destroy_vobject ->
put_pages -> zfs_write -> zil_commit -> zfs_zget
This happened because vgone marks a vnode as VI_DOOMED before calling
VOP_RECLAIM, but zfs_zget would not return a doomed vnode under any
circumstances.  The essence of the fix is sketched after this list.
The fix in this change is not complete: it does not cover a deadlock
between two threads doing VOP_RECLAIM, where one thread is in zil_commit
trying to zfs_zget a znode/vnode that is being reclaimed by the other
thread, which in turn is blocked trying to enter zil_commit.  This type
of deadlock has not been reported so far.

o An indefinite wait in the unmount path caused by a znode "falling
through the cracks" in inactive+reclaim.  This would happen if the znode
was unlinked while its vnode was still active; see the fast-path sketch
after this list.
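
The essence of the fix for the first problem, adapted from the zfs_zget
change in the diff below: a doomed vnode is handed back only when the
calling thread itself holds the vnode's exclusive lock (i.e. it is the
reclaiming thread); every other caller drops its hold and retries:

	locked = VOP_ISLOCKED(vp);
	VI_LOCK(vp);
	if ((vp->v_iflag & VI_DOOMED) != 0 && locked != LK_EXCLUSIVE) {
		/*
		 * Doomed and not exclusively locked by us: another
		 * thread is reclaiming this vnode.  Drop the hold and
		 * retry the lookup instead of waiting on it forever.
		 */
		VI_UNLOCK(vp);
		VN_RELE(vp);
		goto again;
	}
	VI_UNLOCK(vp);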
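
For the second problem, zfs_inactive (diff below) gains a fast path that
recycles the vnode of an unlinked file right away, so such a znode can
no longer linger between inactive and reclaim:

	mutex_enter(&zp->z_lock);
	if (zp->z_unlinked) {
		/* Fast path to recycle a vnode of a removed file. */
		mutex_exit(&zp->z_lock);
		rw_exit(&zfsvfs->z_teardown_inactive_lock);
		vrecycle(vp);
		return;
	}
	mutex_exit(&zp->z_lock);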

To Do: pass a locking-flags parameter to zfs_zget, so that the zfs-vfs
glue code doesn't have to re-lock a vnode but can ask for the proper
locking from the very start.  This would also allow higher-level code to
obtain a doomed vnode when that is expected/requested, or to avoid
blocking when blocking is not allowed (see the zil_commit example above).

ffs_vgetf seems like a good source of inspiration.
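
A purely hypothetical sketch of such an interface, modeled on ffs_vgetf
(the names zfs_zgetf and ZGET_DOOMED_OK are illustrative, not part of
this change):

	/*
	 * Hypothetical interface, not part of this change: "lkflags"
	 * would carry the vn_lock() flags (e.g. LK_EXCLUSIVE) so the
	 * vnode is returned properly locked, and a ZGET_DOOMED_OK bit
	 * in "zflags" would let callers such as the ZIL accept a
	 * doomed vnode instead of blocking on its reclamation.
	 */
	int zfs_zgetf(zfsvfs_t *zfsvfs, uint64_t obj_num, int lkflags,
	    znode_t **zpp, int zflags);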

Tested by:	Willem Jan Withagen <wjw@digiware.nl>
MFC after:	6 weeks
commit 4ff1c77d22 (parent 7ca5310ea3)
Author:	Andriy Gapon
Date:	2012-11-25 15:33:26 +00:00

4 changed files with 59 additions and 138 deletions
sys/cddl/contrib/opensolaris/uts/common/fs/zfs

@@ -207,8 +207,6 @@ typedef struct znode {
list_node_t z_link_node; /* all znodes in fs link */
sa_handle_t *z_sa_hdl; /* handle to sa data */
boolean_t z_is_sa; /* are we native sa? */
/* FreeBSD-specific field. */
struct task z_task;
} znode_t;

@@ -1847,18 +1847,6 @@ zfsvfs_teardown(zfsvfs_t *zfsvfs, boolean_t unmounting)
zfsvfs->z_unmounted = B_TRUE;
rrw_exit(&zfsvfs->z_teardown_lock, FTAG);
rw_exit(&zfsvfs->z_teardown_inactive_lock);
#ifdef __FreeBSD__
/*
* Some znodes might not be fully reclaimed, wait for them.
*/
mutex_enter(&zfsvfs->z_znodes_lock);
while (list_head(&zfsvfs->z_all_znodes) != NULL) {
msleep(zfsvfs, &zfsvfs->z_znodes_lock, 0,
"zteardown", 0);
}
mutex_exit(&zfsvfs->z_znodes_lock);
#endif
}
/*

@@ -4566,15 +4566,23 @@ zfs_inactive(vnode_t *vp, cred_t *cr, caller_context_t *ct)
* The fs has been unmounted, or we did a
* suspend/resume and this file no longer exists.
*/
VI_LOCK(vp);
ASSERT(vp->v_count <= 1);
vp->v_count = 0;
VI_UNLOCK(vp);
vrecycle(vp);
rw_exit(&zfsvfs->z_teardown_inactive_lock);
vrecycle(vp);
return;
}
mutex_enter(&zp->z_lock);
if (zp->z_unlinked) {
/*
* Fast path to recycle a vnode of a removed file.
*/
mutex_exit(&zp->z_lock);
rw_exit(&zfsvfs->z_teardown_inactive_lock);
vrecycle(vp);
return;
}
mutex_exit(&zp->z_lock);
if (zp->z_atime_dirty && zp->z_unlinked == 0) {
dmu_tx_t *tx = dmu_tx_create(zfsvfs->z_os);
@@ -4592,8 +4600,6 @@ zfs_inactive(vnode_t *vp, cred_t *cr, caller_context_t *ct)
dmu_tx_commit(tx);
}
}
zfs_zinactive(zp);
rw_exit(&zfsvfs->z_teardown_inactive_lock);
}
@@ -6196,28 +6202,6 @@ zfs_freebsd_inactive(ap)
return (0);
}
static void
zfs_reclaim_complete(void *arg, int pending)
{
znode_t *zp = arg;
zfsvfs_t *zfsvfs = zp->z_zfsvfs;
rw_enter(&zfsvfs->z_teardown_inactive_lock, RW_READER);
if (zp->z_sa_hdl != NULL) {
ZFS_OBJ_HOLD_ENTER(zfsvfs, zp->z_id);
zfs_znode_dmu_fini(zp);
ZFS_OBJ_HOLD_EXIT(zfsvfs, zp->z_id);
}
zfs_znode_free(zp);
rw_exit(&zfsvfs->z_teardown_inactive_lock);
/*
* If the file system is being unmounted, there is a process waiting
* for us, wake it up.
*/
if (zfsvfs->z_unmounted)
wakeup_one(zfsvfs);
}
static int
zfs_freebsd_reclaim(ap)
struct vop_reclaim_args /* {
@@ -6228,53 +6212,26 @@ zfs_freebsd_reclaim(ap)
vnode_t *vp = ap->a_vp;
znode_t *zp = VTOZ(vp);
zfsvfs_t *zfsvfs = zp->z_zfsvfs;
boolean_t rlocked;
rlocked = rw_tryenter(&zfsvfs->z_teardown_inactive_lock, RW_READER);
int refcnt;
ASSERT(zp != NULL);
/*
* Destroy the vm object and flush associated pages.
*/
/* Destroy the vm object and flush associated pages. */
vnode_destroy_vobject(vp);
mutex_enter(&zp->z_lock);
zp->z_vnode = NULL;
mutex_exit(&zp->z_lock);
if (zp->z_unlinked) {
; /* Do nothing. */
} else if (!rlocked) {
TASK_INIT(&zp->z_task, 0, zfs_reclaim_complete, zp);
taskqueue_enqueue(taskqueue_thread, &zp->z_task);
} else if (zp->z_sa_hdl == NULL) {
/*
* z_teardown_inactive_lock protects from a race with
* zfs_znode_dmu_fini in zfsvfs_teardown during
* force unmount.
*/
rw_enter(&zfsvfs->z_teardown_inactive_lock, RW_READER);
if (zp->z_sa_hdl == NULL)
zfs_znode_free(zp);
} else /* if (!zp->z_unlinked && zp->z_dbuf != NULL) */ {
int locked;
else
zfs_zinactive(zp);
rw_exit(&zfsvfs->z_teardown_inactive_lock);
locked = MUTEX_HELD(ZFS_OBJ_MUTEX(zfsvfs, zp->z_id)) ? 2 :
ZFS_OBJ_HOLD_TRYENTER(zfsvfs, zp->z_id);
if (locked == 0) {
/*
* Lock can't be obtained due to deadlock possibility,
* so defer znode destruction.
*/
TASK_INIT(&zp->z_task, 0, zfs_reclaim_complete, zp);
taskqueue_enqueue(taskqueue_thread, &zp->z_task);
} else {
zfs_znode_dmu_fini(zp);
if (locked == 1)
ZFS_OBJ_HOLD_EXIT(zfsvfs, zp->z_id);
zfs_znode_free(zp);
}
}
VI_LOCK(vp);
vp->v_data = NULL;
ASSERT(vp->v_holdcnt >= 1);
VI_UNLOCK(vp);
if (rlocked)
rw_exit(&zfsvfs->z_teardown_inactive_lock);
return (0);
}

@@ -1147,14 +1147,16 @@ zfs_zget(zfsvfs_t *zfsvfs, uint64_t obj_num, znode_t **zpp)
dmu_object_info_t doi;
dmu_buf_t *db;
znode_t *zp;
int err;
vnode_t *vp;
sa_handle_t *hdl;
int first = 1;
*zpp = NULL;
struct thread *td;
int locked;
int err;
td = curthread;
getnewvnode_reserve(1);
again:
*zpp = NULL;
ZFS_OBJ_HOLD_ENTER(zfsvfs, obj_num);
err = sa_buf_hold(zfsvfs->z_os, obj_num, NULL, &db);
@@ -1193,48 +1195,38 @@ again:
if (zp->z_unlinked) {
err = ENOENT;
} else {
vnode_t *vp;
int dying = 0;
vp = ZTOV(zp);
if (vp == NULL)
dying = 1;
else {
VN_HOLD(vp);
if ((vp->v_iflag & VI_DOOMED) != 0) {
dying = 1;
/*
* Don't VN_RELE() vnode here, because
* it can call vn_lock() which creates
* LOR between vnode lock and znode
* lock. We will VN_RELE() the vnode
* after dropping znode lock.
*/
}
}
if (dying) {
if (first) {
ZFS_LOG(1, "dying znode detected (zp=%p)", zp);
first = 0;
}
/*
* znode is dying so we can't reuse it, we must
* wait until destruction is completed.
*/
sa_buf_rele(db, NULL);
mutex_exit(&zp->z_lock);
ZFS_OBJ_HOLD_EXIT(zfsvfs, obj_num);
if (vp != NULL)
VN_RELE(vp);
tsleep(zp, 0, "zcollide", 1);
goto again;
}
*zpp = zp;
err = 0;
}
sa_buf_rele(db, NULL);
/* Don't let the vnode disappear after ZFS_OBJ_HOLD_EXIT. */
if (err == 0)
VN_HOLD(vp);
mutex_exit(&zp->z_lock);
ZFS_OBJ_HOLD_EXIT(zfsvfs, obj_num);
if (err == 0) {
locked = VOP_ISLOCKED(vp);
VI_LOCK(vp);
if ((vp->v_iflag & VI_DOOMED) != 0 &&
locked != LK_EXCLUSIVE) {
/*
* The vnode is doomed and this thread doesn't
* hold the exclusive lock on it, so the vnode
* must be being reclaimed by another thread.
* Otherwise the doomed vnode is being reclaimed
* by this thread and zfs_zget is called from
* ZIL internals.
*/
VI_UNLOCK(vp);
VN_RELE(vp);
goto again;
}
VI_UNLOCK(vp);
}
getnewvnode_drop_reserve();
return (err);
}
@@ -1400,7 +1392,6 @@ zfs_znode_delete(znode_t *zp, dmu_tx_t *tx)
void
zfs_zinactive(znode_t *zp)
{
vnode_t *vp = ZTOV(zp);
zfsvfs_t *zfsvfs = zp->z_zfsvfs;
uint64_t z_id = zp->z_id;
@@ -1412,19 +1403,6 @@ zfs_zinactive(znode_t *zp)
ZFS_OBJ_HOLD_ENTER(zfsvfs, z_id);
mutex_enter(&zp->z_lock);
VI_LOCK(vp);
if (vp->v_count > 0) {
/*
* If the hold count is greater than zero, somebody has
* obtained a new reference on this znode while we were
* processing it here, so we are done.
*/
VI_UNLOCK(vp);
mutex_exit(&zp->z_lock);
ZFS_OBJ_HOLD_EXIT(zfsvfs, z_id);
return;
}
VI_UNLOCK(vp);
/*
* If this was the last reference to a file with no links,
@@ -1433,14 +1411,14 @@ zfs_zinactive(znode_t *zp)
if (zp->z_unlinked) {
mutex_exit(&zp->z_lock);
ZFS_OBJ_HOLD_EXIT(zfsvfs, z_id);
ASSERT(vp->v_count == 0);
vrecycle(vp);
zfs_rmnode(zp);
return;
}
mutex_exit(&zp->z_lock);
zfs_znode_dmu_fini(zp);
ZFS_OBJ_HOLD_EXIT(zfsvfs, z_id);
zfs_znode_free(zp);
}
void
@@ -1448,8 +1426,8 @@ zfs_znode_free(znode_t *zp)
{
zfsvfs_t *zfsvfs = zp->z_zfsvfs;
ASSERT(ZTOV(zp) == NULL);
ASSERT(zp->z_sa_hdl == NULL);
zp->z_vnode = NULL;
mutex_enter(&zfsvfs->z_znodes_lock);
POINTER_INVALIDATE(&zp->z_zfsvfs);
list_remove(&zfsvfs->z_all_znodes, zp);