Break recursion involving getnewvnode and zfs_rmnode.

When we're at our vnode limit, getnewvnode will call into the vnode LRU
cache to free up vnodes. If the vnode we try to recycle is a ZFS vnode, we
eventually end up in zfs_rmnode. If the ZFS vnode we're recycling has
extended attributes, zfs_rmnode will call zfs_zget, which will attempt to
allocate another vnode. If the next vnode we try to recycle is also a ZFS
vnode with extended attributes, we can recurse further. This recursion is
unbounded and can overflow the stack.

To avoid this, restructure zfs_rmnode to simply add the extended attribute
directory's object ID to the unlinked set, which does not require
allocating a vnode. We then schedule a task (via taskqueue(9); see the
sketch below) that calls zfs_unlinked_drain, which does the work of
properly marking the vnodes for unlinking. zfs_unlinked_drain is also
called on mount, so anything left over is cleaned up there.

Reviewed by:	avg, mav
Sponsored by:	iXsystems, Inc.
Differential Revision:	https://reviews.freebsd.org/D15342
Benno Rice 2018-06-07 18:59:32 +00:00
parent 52cf896593
commit b3b11d6400
Notes: svn2git 2020-12-20 02:59:44 +00:00
svn path=/head/; revision=334810
3 changed files with 66 additions and 13 deletions
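
For reference, below is a minimal, standalone sketch of the taskqueue(9)
deferral pattern the commit message refers to. It is illustrative only: the
example_* names are hypothetical, and the commit itself enqueues its task on
the ZFS system_taskq rather than the stock taskqueue_thread used here.

#include <sys/param.h>
#include <sys/kernel.h>
#include <sys/taskqueue.h>

static struct task example_task;	/* hypothetical name, for illustration */

static void
example_task_fn(void *context, int pending __unused)
{
	/* Deferred work runs here, on a taskqueue thread with its own stack. */
}

static void
example_init(void *ctx)
{
	/* Initialize once; the same task can be re-enqueued any number of times. */
	TASK_INIT(&example_task, 0, example_task_fn, ctx);
}

static void
example_defer(void)
{
	/*
	 * Enqueueing an already-pending task does not queue it again, so
	 * repeated calls before the task runs result in a single execution.
	 */
	taskqueue_enqueue(taskqueue_thread, &example_task);
}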


@ -85,6 +85,9 @@ struct zfsvfs {
sa_attr_type_t *z_attr_table; /* SA attr mapping->id */
#define ZFS_OBJ_MTX_SZ 64
kmutex_t z_hold_mtx[ZFS_OBJ_MTX_SZ]; /* znode hold locks */
#if defined(__FreeBSD__)
struct task z_unlinked_drain_task;
#endif
};
/*


@ -281,6 +281,7 @@ zfs_unlinked_drain(zfsvfs_t *zfsvfs)
zap_attribute_t zap;
dmu_object_info_t doi;
znode_t *zp;
dmu_tx_t *tx;
int error;
/*
@ -318,6 +319,19 @@ zfs_unlinked_drain(zfsvfs_t *zfsvfs)
vn_lock(ZTOV(zp), LK_EXCLUSIVE | LK_RETRY);
zp->z_unlinked = B_TRUE;
#if defined(__FreeBSD__)
/*
* zfs_rmnode no longer zeroes the link count of an extended
* attribute directory that it defers to the unlinked set, so
* make sure it is set to zero here.
*/
zp->z_links = 0;
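/* Persist the zeroed link count with a small standalone transaction. */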
tx = dmu_tx_create(zfsvfs->z_os);
dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
VERIFY(0 == dmu_tx_assign(tx, TXG_WAIT));
VERIFY(0 == sa_update(zp->z_sa_hdl, SA_ZPL_LINKS(zfsvfs),
&zp->z_links, sizeof (zp->z_links), tx));
dmu_tx_commit(tx);
#endif
vput(ZTOV(zp));
}
zap_cursor_fini(&zc);
@ -393,7 +407,6 @@ zfs_rmnode(znode_t *zp)
{
zfsvfs_t *zfsvfs = zp->z_zfsvfs;
objset_t *os = zfsvfs->z_os;
znode_t *xzp = NULL;
dmu_tx_t *tx;
uint64_t acl_obj;
uint64_t xattr_obj;
@ -443,11 +456,8 @@ zfs_rmnode(znode_t *zp)
*/
error = sa_lookup(zp->z_sa_hdl, SA_ZPL_XATTR(zfsvfs),
&xattr_obj, sizeof (xattr_obj));
if (error == 0 && xattr_obj) {
error = zfs_zget(zfsvfs, xattr_obj, &xzp);
ASSERT3S(error, ==, 0);
vn_lock(ZTOV(xzp), LK_EXCLUSIVE | LK_RETRY);
}
if (error)
xattr_obj = 0;
acl_obj = zfs_external_acl(zp);
@ -457,10 +467,8 @@ zfs_rmnode(znode_t *zp)
tx = dmu_tx_create(os);
dmu_tx_hold_free(tx, zp->z_id, 0, DMU_OBJECT_END);
dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL);
if (xzp) {
if (xattr_obj)
dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, TRUE, NULL);
dmu_tx_hold_sa(tx, xzp->z_sa_hdl, B_FALSE);
}
if (acl_obj)
dmu_tx_hold_free(tx, acl_obj, 0, DMU_OBJECT_END);
@ -475,9 +483,25 @@ zfs_rmnode(znode_t *zp)
dmu_tx_abort(tx);
zfs_znode_dmu_fini(zp);
zfs_znode_free(zp);
goto out;
return;
}
#if defined(__FreeBSD__)
/*
* FreeBSD's implementation of zfs_zget requires a vnode to back it.
* This means we could end up calling into getnewvnode while already
* in zfs_rmnode as a result of a prior getnewvnode call trying to
* clear vnodes out of the cache. If this repeats we can recurse
* deeply enough to overflow the stack. To avoid this, skip the
* zfs_zget call on the xattr znode and instead simply add it to the
* unlinked set and schedule a call to zfs_unlinked_drain.
*/
if (xattr_obj) {
/* Add extended attribute directory to the unlinked set. */
VERIFY3U(0, ==,
zap_add_int(os, zfsvfs->z_unlinkedobj, xattr_obj, tx));
}
#else
if (xzp) {
ASSERT(error == 0);
xzp->z_unlinked = B_TRUE; /* mark xzp for deletion */
@ -486,6 +510,7 @@ zfs_rmnode(znode_t *zp)
&xzp->z_links, sizeof (xzp->z_links), tx));
zfs_unlinked_add(xzp, tx);
}
#endif
/* Remove this znode from the unlinked set */
VERIFY3U(0, ==,
@ -494,9 +519,16 @@ zfs_rmnode(znode_t *zp)
zfs_znode_delete(zp, tx);
dmu_tx_commit(tx);
out:
if (xzp)
vput(ZTOV(xzp));
if (xattr_obj) {
/*
* We're using the FreeBSD taskqueue API here instead of
* the Solaris taskq API since the FreeBSD API allows for a
* task to be enqueued multiple times but executed once.
*/
taskqueue_enqueue(system_taskq->tq_queue,
&zfsvfs->z_unlinked_drain_task);
}
}
static uint64_t


@ -972,6 +972,15 @@ zfsvfs_init(zfsvfs_t *zfsvfs, objset_t *os)
return (0);
}
#if defined(__FreeBSD__)
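/*
* taskqueue(9) callback that drains the unlinked set outside of the
* vnode-reclaim path.
*/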
static void
zfsvfs_task_unlinked_drain(void *context, int pending __unused)
{
zfs_unlinked_drain((zfsvfs_t *)context);
}
#endif
int
zfsvfs_create(const char *osname, zfsvfs_t **zfvp)
{
@ -1023,6 +1032,10 @@ zfsvfs_create_impl(zfsvfs_t **zfvp, zfsvfs_t *zfsvfs, objset_t *os)
mutex_init(&zfsvfs->z_lock, NULL, MUTEX_DEFAULT, NULL);
list_create(&zfsvfs->z_all_znodes, sizeof (znode_t),
offsetof(znode_t, z_link_node));
#if defined(__FreeBSD__)
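/*
* Set up the deferred unlinked-drain task; zfs_rmnode enqueues it when
* it adds an extended attribute directory to the unlinked set.
*/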
TASK_INIT(&zfsvfs->z_unlinked_drain_task, 0,
zfsvfs_task_unlinked_drain, zfsvfs);
#endif
#ifdef DIAGNOSTIC
rrm_init(&zfsvfs->z_teardown_lock, B_TRUE);
#else
@ -2015,6 +2028,11 @@ zfs_umount(vfs_t *vfsp, int fflag)
}
#endif
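/*
* Make sure the deferred unlinked-drain task is neither queued nor
* still running before the zfsvfs is torn down; taskqueue_cancel()
* returns non-zero while the task is executing, so drain and retry
* until it has completed.
*/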
while (taskqueue_cancel(system_taskq->tq_queue,
&zfsvfs->z_unlinked_drain_task, NULL) != 0)
taskqueue_drain(system_taskq->tq_queue,
&zfsvfs->z_unlinked_drain_task);
VERIFY(zfsvfs_teardown(zfsvfs, B_TRUE) == 0);
os = zfsvfs->z_os;