Handle LoR in flush_pagedep_deps().

When operating in SU or SU+J mode, ffs_syncvnode() might need to
instantiate another vnode by inode number while holding the syncing
vnode's lock.  Typically this other vnode is the parent of our vnode,
but due to renames occurring right before fsync (or during fsync when
we drop the syncing vnode lock, see below) it might no longer be the
parent.

Moreover, the called function flush_pagedep_deps() needs to lock the
other vnode while holding the lock of the vnode that owns the buffer
whose dependencies are being flushed.  This creates another instance
of the same lock-order reversal (LoR) that was fixed in
softdep_sync().
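
In outline, the avoided deadlock and the resulting unlock/relock dance
(a minimal sketch only; the real helper introduced below must also
requeue the buffer's unfinished dependencies before dropping it):

	/* Holding the vp lock and the bp lock: never sleep on pvp's lock. */
	error = ffs_vgetf(mp, inum, LK_EXCLUSIVE | LK_NOWAIT, &pvp,
	    FFSV_FORCEINSMQ);
	if (error != 0) {
		BUF_UNLOCK(bp);		/* drop the buffer lock first */
		VOP_UNLOCK(vp);		/* then the vnode lock */
		/* Only now is it safe to block on the other vnode's lock. */
		error = ffs_vgetf(mp, inum, LK_EXCLUSIVE, &pvp,
		    FFSV_FORCEINSMQ);
		vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
		/* Locks were dropped: the caller must see ERELOOKUP. */
	}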

Put the generic code for safe relocking into a new SU helper,
get_parent_vp(), and use it in flush_pagedep_deps().  The code for
safely relocking two vnodes with undefined lock order was extracted
into a new vn helper, vn_lock_pair().
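
vn_lock_pair() itself is not shown in this page's hunks; a minimal
sketch of the ordering-free acquisition it encapsulates (simplified,
omitting the arguments that say which vnode is already locked):

	for (;;) {
		vn_lock(vp1, LK_EXCLUSIVE | LK_RETRY);
		if (vn_lock(vp2, LK_EXCLUSIVE | LK_NOWAIT) == 0)
			break;		/* got both locks */
		/* Would deadlock: drop vp1 and retry in the other order. */
		VOP_UNLOCK(vp1);
		vn_lock(vp2, LK_EXCLUSIVE | LK_RETRY);
		if (vn_lock(vp1, LK_EXCLUSIVE | LK_NOWAIT) == 0)
			break;
		VOP_UNLOCK(vp2);
	}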

Due to the call sequence
     ffs_syncvnode()->softdep_sync_buf()->flush_pagedep_deps(),
ffs_syncvnode() now indicates with ERELOOKUP that the passed vnode was
unlocked in the process, and can return ENOENT if the passed vnode was
reclaimed.  All callers of the function were inspected.
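
Callers that cannot restart a whole syscall simply redo the sync, as
the ffs_sync() hunk below does; in outline:

	do {
		error = ffs_syncvnode(vp, MNT_WAIT, 0);
	} while (error == ERELOOKUP);	/* locks were dropped; sync again */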

Because UFS namei lookups store auxiliary information about the
directory entry in the in-memory directory inode, and this information
is then used by the UFS code that creates/removes the directory entry
in the actual mutating VOPs, it is critical that the directory vnode
lock is not dropped between the lookup and the VOP.  For
softdep_prelink(), which ensures that a later link/unlink operation
can proceed without overflowing the journal, the calls were moved to a
point where it is still safe to abort the VOP, because no mutations
have been applied yet.  There, ERELOOKUP causes a restart of the whole
VFS operation (typically a VFS syscall) at top level, including the
re-lookup of the involved paths.  [Note that we already do the same
restart for failing calls to vn_start_write(), so formally this patch
does not introduce new behavior.]
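
A schematic of the top-level restart (a condensed illustration, not
code copied from the syscall layer):

	for (;;) {
		error = namei(&nd);	/* (re-)lookup the involved paths */
		if (error != 0)
			return (error);
		error = VOP_REMOVE(nd.ni_dvp, nd.ni_vp, &nd.ni_cnd);
		/* unlock and release the namei references here */
		if (error != ERELOOKUP)
			return (error);
		/* Locks were dropped inside the VOP; look it all up again. */
	}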

Similarly, unsafe calls to fsync in the snapshot creation code were
plugged.  A possible view of these failures is that it does not make
sense to continue creating a snapshot if the snapshot vnode was
reclaimed due to a forced unmount.

It is possible for the relock/ERELOOKUP situation to occur in
ffs_truncate() called from ufs_inactive().  In that case, dropping the
vnode lock is not safe.  Detect the situation with VI_DOINGINACT and
reschedule inactivation by setting VI_OWEINACT.  ufs_inactive()
rechecks VI_OWEINACT and avoids reclaiming the vnode if truncation
failed this way.

In ffs_truncate(), allocation of the EOF block for partial truncation
is re-done after the vnode is synced, since we cannot leave the buffer
locked across ffs_syncvnode().

In collaboration with:	pho
Reviewed by:	mckusick (previous version), markj
Tested by:	markj (syzkaller), pho
Sponsored by:	The FreeBSD Foundation
Differential revision:	https://reviews.freebsd.org/D26136
Author:	Konstantin Belousov
Date:	2020-11-14 05:30:10 +00:00
Commit:	8a1509e442 (parent 738ea0010b)
Notes:	svn2git 2020-12-20 02:59:44 +00:00
	svn path=/head/; revision=367672

9 changed files with 494 additions and 106 deletions

sys/ufs/ffs/ffs_extern.h

@@ -173,6 +173,9 @@ void softdep_load_inodeblock(struct inode *);
void softdep_freefile(struct vnode *, ino_t, int);
int softdep_request_cleanup(struct fs *, struct vnode *,
struct ucred *, int);
int softdep_prerename(struct vnode *, struct vnode *, struct vnode *,
struct vnode *);
int softdep_prelink(struct vnode *, struct vnode *, int);
void softdep_setup_freeblocks(struct inode *, off_t, int);
void softdep_setup_inomapdep(struct buf *, struct inode *, ino_t, int);
void softdep_setup_blkmapdep(struct buf *, struct mount *, ufs2_daddr_t,

sys/ufs/ffs/ffs_inode.c

@@ -462,6 +462,8 @@ ffs_truncate(vp, length, flags, cred)
error = UFS_BALLOC(vp, length - 1, 1, cred, flags, &bp);
if (error)
return (error);
ffs_inode_bwrite(vp, bp, flags);
/*
* When we are doing soft updates and the UFS_BALLOC
* above fills in a direct block hole with a full sized
@@ -474,6 +476,10 @@ ffs_truncate(vp, length, flags, cred)
fragroundup(fs, blkoff(fs, length)) < fs->fs_bsize &&
(error = ffs_syncvnode(vp, MNT_WAIT, 0)) != 0)
return (error);
error = UFS_BALLOC(vp, length - 1, 1, cred, flags, &bp);
if (error)
return (error);
ip->i_size = length;
DIP_SET(ip, i_size, length);
size = blksize(fs, ip, lbn);

sys/ufs/ffs/ffs_snapshot.c

@@ -301,6 +301,8 @@ ffs_snapshot(mp, snapfile)
NDFREE(&nd, NDF_ONLY_PNBUF);
vn_finished_write(wrtmp);
vrele(nd.ni_dvp);
if (error == ERELOOKUP)
goto restart;
return (error);
}
vp = nd.ni_vp;
@@ -368,8 +370,12 @@ ffs_snapshot(mp, snapfile)
if (error)
goto out;
bawrite(nbp);
-		if (cg % 10 == 0)
-			ffs_syncvnode(vp, MNT_WAIT, 0);
+		if (cg % 10 == 0) {
+			error = ffs_syncvnode(vp, MNT_WAIT, 0);
+			/* vp possibly reclaimed if unlocked */
+			if (error != 0)
+				goto out;
+		}
}
/*
* Copy all the cylinder group maps. Although the
@@ -391,8 +397,8 @@
goto out;
error = cgaccount(cg, vp, nbp, 1);
bawrite(nbp);
-		if (cg % 10 == 0)
-			ffs_syncvnode(vp, MNT_WAIT, 0);
+		if (cg % 10 == 0 && error == 0)
+			error = ffs_syncvnode(vp, MNT_WAIT, 0);
if (error)
goto out;
}

sys/ufs/ffs/ffs_softdep.c

@@ -609,6 +609,27 @@ softdep_freework(wkhd)
panic("softdep_freework called");
}
int
softdep_prerename(fdvp, fvp, tdvp, tvp)
struct vnode *fdvp;
struct vnode *fvp;
struct vnode *tdvp;
struct vnode *tvp;
{
panic("softdep_prerename called");
}
int
softdep_prelink(dvp, vp, will_direnter)
struct vnode *dvp;
struct vnode *vp;
int will_direnter;
{
panic("softdep_prelink called");
}
#else
FEATURE(softupdates, "FFS soft-updates support");
@@ -748,7 +769,7 @@ static void unlinked_inodedep(struct mount *, struct inodedep *);
static void clear_unlinked_inodedep(struct inodedep *);
static struct inodedep *first_unlinked_inodedep(struct ufsmount *);
static int flush_pagedep_deps(struct vnode *, struct mount *,
-		    struct diraddhd *);
+		    struct diraddhd *, struct buf *);
static int free_pagedep(struct pagedep *);
static int flush_newblk_dep(struct vnode *, struct mount *, ufs_lbn_t);
static int flush_inodedep_deps(struct vnode *, struct mount *, ino_t);
@@ -925,7 +946,6 @@ static void journal_unmount(struct ufsmount *);
static int journal_space(struct ufsmount *, int);
static void journal_suspend(struct ufsmount *);
static int journal_unsuspend(struct ufsmount *ump);
-static void softdep_prelink(struct vnode *, struct vnode *);
static void add_to_journal(struct worklist *);
static void remove_from_journal(struct worklist *);
static bool softdep_excess_items(struct ufsmount *, int);
@@ -1389,6 +1409,136 @@ SYSCTL_INT(_debug_softdep, OID_AUTO, print_threads, CTLFLAG_RW,
/* List of all filesystems mounted with soft updates */
static TAILQ_HEAD(, mount_softdeps) softdepmounts;
/*
* This function fetches inode inum on mount point mp. We already
* hold a locked vnode vp, and might have a locked buffer bp belonging
* to vp.
* We must not block on acquiring the new inode lock as we will get
* into a lock-order reversal with the buffer lock and possibly get a
* deadlock. Thus if we cannot instantiate the requested vnode
* without sleeping on its lock, we must unlock the vnode and the
* buffer before blocking on the vnode lock.  We return
* ERELOOKUP if we have had to unlock either the vnode or the buffer so
* that the caller can reassess its state.
*
* Top-level VFS code (for syscalls and other consumers, e.g. callers
* of VOP_FSYNC() in the syncer) checks for ERELOOKUP and restarts at
* a safe point.
*
* Since callers expect to operate on a fully constructed vnode, we
* also recheck v_data after relocking, and return ENOENT if it is NULL.
*
* If unlocking bp, we must unroll the dequeueing of its unfinished
* dependencies, and clear the scan flag, before unlocking.  If
* unlocking vp while it is under deactivation, we re-queue the
* deactivation.
*/
static int
get_parent_vp(struct vnode *vp, struct mount *mp, ino_t inum, struct buf *bp,
struct diraddhd *diraddhdp, struct diraddhd *unfinishedp,
struct vnode **rvp)
{
struct vnode *pvp;
struct diradd *dap;
int error;
bool bplocked;
ASSERT_VOP_ELOCKED(vp, "child vnode must be locked");
for (bplocked = true, pvp = NULL;;) {
error = ffs_vgetf(mp, inum, LK_EXCLUSIVE | LK_NOWAIT, &pvp,
FFSV_FORCEINSMQ);
if (error == 0) {
/*
* Since we could have unlocked vp, the inode
* number could no longer indicate a
* constructed node. In this case, we must
* restart the syscall.
*/
if (VTOI(pvp)->i_mode == 0 || !bplocked) {
if (VTOI(pvp)->i_mode == 0)
vgone(pvp);
vput(pvp);
error = ERELOOKUP;
goto out;
}
error = 0;
goto out1;
}
if (bp != NULL && bplocked) {
/*
* Requeue unfinished dependencies before
* unlocking buffer, which could make
* diraddhdp invalid.
*/
ACQUIRE_LOCK(VFSTOUFS(mp));
while ((dap = LIST_FIRST(unfinishedp)) != NULL) {
LIST_REMOVE(dap, da_pdlist);
LIST_INSERT_HEAD(diraddhdp, dap, da_pdlist);
}
FREE_LOCK(VFSTOUFS(mp));
bp->b_vflags &= ~BV_SCANNED;
BUF_NOREC(bp);
BUF_UNLOCK(bp);
bplocked = false;
}
/*
* Do not drop vnode lock while inactivating. This
* would result in leaks of the VI flags and
* reclaiming of non-truncated vnode. Instead,
* re-schedule inactivation hoping that we would be
* able to sync inode later.
*/
if ((vp->v_iflag & VI_DOINGINACT) != 0) {
VI_LOCK(vp);
vp->v_iflag |= VI_OWEINACT;
VI_UNLOCK(vp);
return (ERELOOKUP);
}
VOP_UNLOCK(vp);
error = ffs_vgetf(mp, inum, LK_EXCLUSIVE, &pvp,
FFSV_FORCEINSMQ);
if (error != 0) {
MPASS(error != ERELOOKUP);
vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
break;
}
if (VTOI(pvp)->i_mode == 0) {
vgone(pvp);
vput(pvp);
pvp = NULL;
vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
error = ERELOOKUP;
break;
}
error = vn_lock(vp, LK_EXCLUSIVE | LK_NOWAIT);
if (error == 0)
break;
vput(pvp);
pvp = NULL;
vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
if (vp->v_data == NULL) {
error = ENOENT;
break;
}
}
if (bp != NULL) {
MPASS(!bplocked);
error = ERELOOKUP;
}
if (error != 0 && pvp != NULL) {
vput(pvp);
pvp = NULL;
}
out1:
*rvp = pvp;
out:
ASSERT_VOP_ELOCKED(vp, "child vnode must be locked on return");
return (error);
}
/*
* This function cleans the worklist for a filesystem.
* Each filesystem running with soft dependencies gets its own
@@ -3096,40 +3246,84 @@ softdep_prealloc(vp, waitok)
}
/*
- * Before adjusting a link count on a vnode verify that we have sufficient
- * journal space.  If not, process operations that depend on the currently
- * locked pair of vnodes to try to flush space as the syncer, buf daemon,
- * and softdep flush threads can not acquire these locks to reclaim space.
+ * Try hard to sync all data and metadata for the vnode, and workitems
+ * flushing which might conflict with the vnode lock.  This is a
+ * helper for softdep_prerename().
 */
-static void
-softdep_prelink(dvp, vp)
-	struct vnode *dvp;
+static int
+softdep_prerename_vnode(ump, vp)
+	struct ufsmount *ump;
	struct vnode *vp;
{
-	struct ufsmount *ump;
+	int error;

-	ump = VFSTOUFS(dvp->v_mount);
-	LOCK_OWNED(ump);
-	/*
-	 * Nothing to do if we have sufficient journal space.
-	 * If we currently hold the snapshot lock, we must avoid
-	 * handling other resources that could cause deadlock.
-	 */
-	if (journal_space(ump, 0) || (vp && IS_SNAPSHOT(VTOI(vp))))
-		return;
-	stat_journal_low++;
-	FREE_LOCK(ump);
-	if (vp)
-		ffs_syncvnode(vp, MNT_NOWAIT, 0);
-	ffs_syncvnode(dvp, MNT_WAIT, 0);
+	ASSERT_VOP_ELOCKED(vp, "prehandle");
+	if (vp->v_data == NULL)
+		return (0);
+	error = VOP_FSYNC(vp, MNT_WAIT, curthread);
+	if (error != 0)
+		return (error);
	ACQUIRE_LOCK(ump);
-	/* Process vp before dvp as it may create .. removes. */
-	if (vp) {
-		process_removes(vp);
-		process_truncates(vp);
-	}
+	process_removes(vp);
+	process_truncates(vp);
+	FREE_LOCK(ump);
+	return (0);
+}
+
+/*
+ * Must be called from VOP_RENAME() after all vnodes are locked.
+ * Ensures that there is enough journal space for rename.  It is
+ * sufficiently different from softdep_prelink() by having to handle
+ * four vnodes.
+ */
+int
+softdep_prerename(fdvp, fvp, tdvp, tvp)
+	struct vnode *fdvp;
+	struct vnode *fvp;
+	struct vnode *tdvp;
+	struct vnode *tvp;
+{
+	struct ufsmount *ump;
+	int error;
+
+	ump = VFSTOUFS(fdvp->v_mount);
+	if (journal_space(ump, 0))
+		return (0);
+	VOP_UNLOCK(tdvp);
+	VOP_UNLOCK(fvp);
+	if (tvp != NULL && tvp != tdvp)
+		VOP_UNLOCK(tvp);
+	error = softdep_prerename_vnode(ump, fdvp);
+	VOP_UNLOCK(fdvp);
+	if (error != 0)
+		return (error);
+	VOP_LOCK(fvp, LK_EXCLUSIVE | LK_RETRY);
+	error = softdep_prerename_vnode(ump, fvp);
+	VOP_UNLOCK(fvp);
+	if (error != 0)
+		return (error);
+	if (tdvp != fdvp) {
+		VOP_LOCK(tdvp, LK_EXCLUSIVE | LK_RETRY);
+		error = softdep_prerename_vnode(ump, tdvp);
+		VOP_UNLOCK(tdvp);
+		if (error != 0)
+			return (error);
+	}
-	process_removes(dvp);
-	process_truncates(dvp);
+	if (tvp != fvp && tvp != NULL) {
+		VOP_LOCK(tvp, LK_EXCLUSIVE | LK_RETRY);
+		error = softdep_prerename_vnode(ump, tvp);
+		VOP_UNLOCK(tvp);
+		if (error != 0)
+			return (error);
+	}
+	ACQUIRE_LOCK(ump);
	softdep_speedup(ump);
	process_worklist_item(UFSTOVFS(ump), 2, LK_NOWAIT);
	if (journal_space(ump, 0) == 0) {
@@ -3137,6 +3331,121 @@ softdep_prelink(dvp, vp)
if (journal_space(ump, 1) == 0)
journal_suspend(ump);
}
FREE_LOCK(ump);
return (ERELOOKUP);
}
/*
* Before adjusting a link count on a vnode verify that we have sufficient
* journal space. If not, process operations that depend on the currently
* locked pair of vnodes to try to flush space as the syncer, buf daemon,
* and softdep flush threads can not acquire these locks to reclaim space.
*
* Returns 0 if all owned locks are still valid and were not dropped
* in the process; otherwise it returns either an error from the sync,
* or ERELOOKUP if any of the locks were re-acquired.  In the latter
* case, the state of the vnodes cannot be relied upon and our VFS
* syscall must be restarted at top level from the lookup.
*/
int
softdep_prelink(dvp, vp, will_direnter)
struct vnode *dvp;
struct vnode *vp;
int will_direnter;
{
struct ufsmount *ump;
int error, error1;
ASSERT_VOP_ELOCKED(dvp, "prelink dvp");
if (vp != NULL)
ASSERT_VOP_ELOCKED(vp, "prelink vp");
ump = VFSTOUFS(dvp->v_mount);
/*
* Nothing to do if we have sufficient journal space.
* If we currently hold the snapshot lock, we must avoid
* handling other resources that could cause deadlock.
*
* will_direnter == 1: In case we allocated a directory block in
* an indirect block, we must prevent holes in the directory
* created if directory entries are written out of order. To
* accomplish this we fsync when we extend a directory into
* indirects. During rename it's not safe to drop the tvp
* lock so sync must be delayed until it is.
*
* This synchronous step could be removed if fsck and the
* kernel were taught to fill in sparse directories rather
* than panic.
*/
if (journal_space(ump, 0) || (vp != NULL && IS_SNAPSHOT(VTOI(vp)))) {
error = 0;
if (will_direnter && (vp == NULL || !IS_SNAPSHOT(VTOI(vp)))) {
if (vp != NULL)
VOP_UNLOCK(vp);
error = ffs_syncvnode(dvp, MNT_WAIT, 0);
if (vp != NULL) {
error1 = vn_lock(vp, LK_EXCLUSIVE | LK_NOWAIT);
if (error1 != 0) {
vn_lock_pair(dvp, true, vp, false);
if (error == 0)
error = ERELOOKUP;
} else if (vp->v_data == NULL) {
error = ERELOOKUP;
}
}
}
return (error);
}
stat_journal_low++;
if (vp != NULL) {
VOP_UNLOCK(dvp);
ffs_syncvnode(vp, MNT_NOWAIT, 0);
vn_lock_pair(dvp, false, vp, true);
if (dvp->v_data == NULL)
return (ERELOOKUP);
}
if (vp != NULL)
VOP_UNLOCK(vp);
ffs_syncvnode(dvp, MNT_WAIT, 0);
VOP_UNLOCK(dvp);
/* Process vp before dvp as it may create .. removes. */
if (vp != NULL) {
vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
if (vp->v_data == NULL) {
vn_lock_pair(dvp, false, vp, true);
return (ERELOOKUP);
}
ACQUIRE_LOCK(ump);
process_removes(vp);
process_truncates(vp);
FREE_LOCK(ump);
VOP_UNLOCK(vp);
}
vn_lock(dvp, LK_EXCLUSIVE | LK_RETRY);
if (dvp->v_data == NULL) {
vn_lock_pair(dvp, true, vp, false);
return (ERELOOKUP);
}
ACQUIRE_LOCK(ump);
process_removes(dvp);
process_truncates(dvp);
VOP_UNLOCK(dvp);
softdep_speedup(ump);
process_worklist_item(UFSTOVFS(ump), 2, LK_NOWAIT);
if (journal_space(ump, 0) == 0) {
softdep_speedup(ump);
if (journal_space(ump, 1) == 0)
journal_suspend(ump);
}
FREE_LOCK(ump);
vn_lock_pair(dvp, false, vp, false);
return (ERELOOKUP);
}
static void
@@ -4742,7 +5051,6 @@ softdep_setup_create(dp, ip)
KASSERT(jaddref != NULL && jaddref->ja_parent == dp->i_number,
("softdep_setup_create: No addref structure present."));
}
-	softdep_prelink(dvp, NULL);
FREE_LOCK(ITOUMP(dp));
}
@@ -4777,7 +5085,6 @@ softdep_setup_dotdot_link(dp, ip)
if (jaddref)
TAILQ_INSERT_TAIL(&inodedep->id_inoreflst, &jaddref->ja_ref,
if_deps);
-	softdep_prelink(dvp, ITOV(ip));
FREE_LOCK(ITOUMP(dp));
}
@@ -4808,7 +5115,6 @@ softdep_setup_link(dp, ip)
if (jaddref)
TAILQ_INSERT_TAIL(&inodedep->id_inoreflst, &jaddref->ja_ref,
if_deps);
-	softdep_prelink(dvp, ITOV(ip));
FREE_LOCK(ITOUMP(dp));
}
@@ -4858,7 +5164,6 @@ softdep_setup_mkdir(dp, ip)
if (DOINGSUJ(dvp))
TAILQ_INSERT_TAIL(&inodedep->id_inoreflst,
&dotdotaddref->ja_ref, if_deps);
-	softdep_prelink(ITOV(dp), NULL);
FREE_LOCK(ITOUMP(dp));
}
@@ -4879,7 +5184,6 @@ softdep_setup_rmdir(dp, ip)
ACQUIRE_LOCK(ITOUMP(dp));
(void) inodedep_lookup_ip(ip);
(void) inodedep_lookup_ip(dp);
-	softdep_prelink(dvp, ITOV(ip));
FREE_LOCK(ITOUMP(dp));
}
@@ -4900,7 +5204,6 @@ softdep_setup_unlink(dp, ip)
ACQUIRE_LOCK(ITOUMP(dp));
(void) inodedep_lookup_ip(ip);
(void) inodedep_lookup_ip(dp);
-	softdep_prelink(dvp, ITOV(ip));
FREE_LOCK(ITOUMP(dp));
}
@@ -12622,25 +12925,12 @@ softdep_fsync(vp)
* for details on possible races.
*/
FREE_LOCK(ump);
-	if (ffs_vgetf(mp, parentino, LK_NOWAIT | LK_EXCLUSIVE, &pvp,
-	    FFSV_FORCEINSMQ)) {
-		/*
-		 * Unmount cannot proceed after unlock because
-		 * caller must have called vn_start_write().
-		 */
-		VOP_UNLOCK(vp);
-		error = ffs_vgetf(mp, parentino, LK_EXCLUSIVE,
-		    &pvp, FFSV_FORCEINSMQ);
-		MPASS(VTOI(pvp)->i_mode != 0);
-		vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
-		if (VN_IS_DOOMED(vp)) {
-			if (error == 0)
-				vput(pvp);
-			error = ENOENT;
-		}
-		if (error != 0)
-			return (error);
-	}
+	error = get_parent_vp(vp, mp, parentino, NULL, NULL, NULL,
+	    &pvp);
+	if (error == ERELOOKUP)
+		error = 0;
+	if (error != 0)
+		return (error);
/*
* All MKDIR_PARENT dependencies and all the NEWBLOCK pagedeps
* that are contained in direct blocks will be resolved by
@@ -12964,9 +13254,11 @@ softdep_sync_buf(struct vnode *vp, struct buf *bp, int waitfor)
for (i = 0; i < DAHASHSZ; i++) {
if (LIST_FIRST(&pagedep->pd_diraddhd[i]) == 0)
continue;
-			if ((error = flush_pagedep_deps(vp, wk->wk_mp,
-			    &pagedep->pd_diraddhd[i]))) {
-				BUF_NOREC(bp);
+			error = flush_pagedep_deps(vp, wk->wk_mp,
+			    &pagedep->pd_diraddhd[i], bp);
+			if (error != 0) {
+				if (error != ERELOOKUP)
+					BUF_NOREC(bp);
goto out_unlock;
}
}
@@ -13200,10 +13492,11 @@ flush_newblk_dep(vp, mp, lbn)
* Eliminate a pagedep dependency by flushing out all its diradd dependencies.
*/
static int
-flush_pagedep_deps(pvp, mp, diraddhdp)
+flush_pagedep_deps(pvp, mp, diraddhdp, locked_bp)
	struct vnode *pvp;
	struct mount *mp;
	struct diraddhd *diraddhdp;
+	struct buf *locked_bp;
{
struct inodedep *inodedep;
struct inoref *inoref;
@@ -13270,10 +13563,10 @@ flush_pagedep_deps(pvp, mp, diraddhdp)
}
if (dap->da_state & MKDIR_BODY) {
FREE_LOCK(ump);
-		if ((error = ffs_vgetf(mp, inum, LK_EXCLUSIVE, &vp,
-		    FFSV_FORCEINSMQ)))
+		error = get_parent_vp(pvp, mp, inum, locked_bp,
+		    diraddhdp, &unfinished, &vp);
+		if (error != 0)
			break;
-		MPASS(VTOI(vp)->i_mode != 0);
error = flush_newblk_dep(vp, mp, 0);
/*
* If we still have the dependency we might need to
@@ -13335,10 +13628,10 @@
*/
if (dap == LIST_FIRST(diraddhdp)) {
FREE_LOCK(ump);
-		if ((error = ffs_vgetf(mp, inum, LK_EXCLUSIVE, &vp,
-		    FFSV_FORCEINSMQ)))
+		error = get_parent_vp(pvp, mp, inum, locked_bp,
+		    diraddhdp, &unfinished, &vp);
+		if (error != 0)
			break;
-		MPASS(VTOI(vp)->i_mode != 0);
error = ffs_update(vp, 1);
vput(vp);
if (error)

sys/ufs/ffs/ffs_vfsops.c

@@ -1861,8 +1861,14 @@ ffs_sync(mp, waitfor)
#ifdef QUOTA
qsyncvp(vp);
#endif
-		if ((error = ffs_syncvnode(vp, waitfor, 0)) != 0)
-			allerror = error;
+		for (;;) {
+			error = ffs_syncvnode(vp, waitfor, 0);
+			if (error == ERELOOKUP)
+				continue;
+			if (error != 0)
+				allerror = error;
+			break;
+		}
vput(vp);
}
/*

sys/ufs/ffs/ffs_vnops.c

@@ -253,7 +253,7 @@ ffs_syncvnode(struct vnode *vp, int waitfor, int flags)
struct buf *bp, *nbp;
ufs_lbn_t lbn;
int error, passes;
-	bool still_dirty, wait;
+	bool still_dirty, unlocked, wait;
ip = VTOI(vp);
ip->i_flag &= ~IN_NEEDSYNC;
@@ -277,6 +277,7 @@ ffs_syncvnode(struct vnode *vp, int waitfor, int flags)
error = 0;
passes = 0;
wait = false; /* Always do an async pass first. */
unlocked = false;
lbn = lblkno(ITOFS(ip), (ip->i_size + ITOFS(ip)->fs_bsize - 1));
BO_LOCK(bo);
loop:
@@ -325,6 +326,26 @@ ffs_syncvnode(struct vnode *vp, int waitfor, int flags)
if (!LIST_EMPTY(&bp->b_dep) &&
(error = softdep_sync_buf(vp, bp,
wait ? MNT_WAIT : MNT_NOWAIT)) != 0) {
/*
* Lock order conflict, buffer was already unlocked,
* and vnode possibly unlocked.
*/
if (error == ERELOOKUP) {
if (vp->v_data == NULL)
return (EBADF);
unlocked = true;
if (DOINGSOFTDEP(vp) && waitfor == MNT_WAIT &&
(error = softdep_sync_metadata(vp)) != 0) {
if (ffs_fsfail_cleanup(ump, error))
error = 0;
return (unlocked && error == 0 ?
ERELOOKUP : error);
}
/* Re-evaluate inode size */
lbn = lblkno(ITOFS(ip), (ip->i_size +
ITOFS(ip)->fs_bsize - 1));
goto next;
}
/* I/O error. */
if (error != EBUSY) {
BUF_UNLOCK(bp);
@@ -361,9 +382,11 @@ ffs_syncvnode(struct vnode *vp, int waitfor, int flags)
if (waitfor != MNT_WAIT) {
BO_UNLOCK(bo);
if ((flags & NO_INO_UPDT) != 0)
-			return (0);
-		else
-			return (ffs_update(vp, 0));
+			return (unlocked ? ERELOOKUP : 0);
+		error = ffs_update(vp, 0);
+		if (error == 0 && unlocked)
+			error = ERELOOKUP;
+		return (error);
}
/* Drain IO to see if we're done. */
bufobj_wwait(bo, 0, 0);
@@ -419,6 +442,8 @@ ffs_syncvnode(struct vnode *vp, int waitfor, int flags)
} else if ((ip->i_flags & (IN_SIZEMOD | IN_IBLKDATA)) != 0) {
error = ffs_update(vp, 1);
}
if (error == 0 && unlocked)
error = ERELOOKUP;
return (error);
}

sys/ufs/ufs/ufs_inode.c

@@ -166,7 +166,8 @@ ufs_inactive(ap)
isize += ip->i_din2->di_extsize;
if (ip->i_effnlink <= 0 && isize && !UFS_RDONLY(ip))
error = UFS_TRUNCATE(vp, (off_t)0, IO_EXT | IO_NORMAL, NOCRED);
-	if (ip->i_nlink <= 0 && ip->i_mode && !UFS_RDONLY(ip)) {
+	if (ip->i_nlink <= 0 && ip->i_mode != 0 && !UFS_RDONLY(ip) &&
+	    (vp->v_iflag & VI_OWEINACT) == 0) {
#ifdef QUOTA
if (!getinoquota(ip))
(void)chkiq(ip, -1, NOCRED, FORCE);
@@ -207,10 +208,12 @@
* If we are done with the inode, reclaim it
* so that it can be reused immediately.
*/
-	if (ip->i_mode == 0)
+	if (ip->i_mode == 0 && (vp->v_iflag & VI_OWEINACT) == 0)
vrecycle(vp);
if (mp != NULL)
vn_finished_secondary_write(mp);
if (error == ERELOOKUP)
error = 0;
return (error);
}

sys/ufs/ufs/ufs_lookup.c

@@ -961,27 +961,7 @@ ufs_direnter(dvp, tvp, dirp, cnp, newdirbp, isrename)
bdwrite(bp);
if ((dp->i_flag & IN_NEEDSYNC) == 0)
return (UFS_UPDATE(dvp, 0));
-		/*
-		 * We have just allocated a directory block in an
-		 * indirect block.  We must prevent holes in the
-		 * directory created if directory entries are
-		 * written out of order.  To accomplish this we
-		 * fsync when we extend a directory into indirects.
-		 * During rename it's not safe to drop the tvp lock
-		 * so sync must be delayed until it is.
-		 *
-		 * This synchronous step could be removed if fsck and
-		 * the kernel were taught to fill in sparse
-		 * directories rather than panic.
-		 */
-		if (isrename)
-			return (0);
-		if (tvp != NULL)
-			VOP_UNLOCK(tvp);
-		(void) VOP_FSYNC(dvp, MNT_WAIT, td);
-		if (tvp != NULL)
-			vn_lock(tvp, LK_EXCLUSIVE | LK_RETRY);
-		return (error);
+		return (0);
}
if (DOINGASYNC(dvp)) {
bdwrite(bp);

sys/ufs/ufs/ufs_vnops.c

@@ -1006,10 +1006,16 @@ ufs_remove(ap)
td = curthread;
ip = VTOI(vp);
if ((ip->i_flags & (NOUNLINK | IMMUTABLE | APPEND)) ||
-	    (VTOI(dvp)->i_flags & APPEND)) {
-		error = EPERM;
-		goto out;
-	}
+	    (VTOI(dvp)->i_flags & APPEND))
+		return (EPERM);
+
+	if (DOINGSOFTDEP(dvp)) {
+		error = softdep_prelink(dvp, vp, true);
+		if (error != 0) {
+			MPASS(error == ERELOOKUP);
+			return (error);
+		}
}
#ifdef UFS_GJOURNAL
ufs_gjournal_orphan(vp);
#endif
@@ -1030,7 +1036,6 @@
(void) VOP_FSYNC(dvp, MNT_WAIT, td);
vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
}
-out:
return (error);
}
@@ -1067,6 +1072,15 @@ ufs_link(ap)
if ((cnp->cn_flags & HASBUF) == 0)
panic("ufs_link: no name");
#endif
if (DOINGSOFTDEP(tdvp)) {
error = softdep_prelink(tdvp, vp, true);
if (error != 0) {
MPASS(error == ERELOOKUP);
return (error);
}
}
if (VTOI(tdvp)->i_effnlink < 2) {
print_bad_link_count("ufs_link", tdvp);
error = EINVAL;
@@ -1089,6 +1103,7 @@
error = EPERM;
goto out;
}
ip->i_effnlink++;
ip->i_nlink++;
DIP_SET(ip, i_nlink, ip->i_nlink);
@@ -1129,6 +1144,15 @@ ufs_whiteout(ap)
struct direct newdir;
int error = 0;
if (DOINGSOFTDEP(dvp) && (ap->a_flags == CREATE ||
ap->a_flags == DELETE)) {
error = softdep_prelink(dvp, NULL, true);
if (error != 0) {
MPASS(error == ERELOOKUP);
return (error);
}
}
switch (ap->a_flags) {
case LOOKUP:
/* 4.4 format directories support whiteout operations */
@@ -1338,6 +1362,18 @@ ufs_rename(ap)
goto relock;
}
}
if (DOINGSOFTDEP(fdvp)) {
error = softdep_prerename(fdvp, fvp, tdvp, tvp);
if (error != 0) {
if (error == ERELOOKUP) {
atomic_add_int(&rename_restarts, 1);
goto relock;
}
goto releout;
}
}
fdp = VTOI(fdvp);
fip = VTOI(fvp);
tdp = VTOI(tdvp);
@@ -1649,8 +1685,10 @@
* are no longer needed.
*/
if (error == 0 && endoff != 0) {
-		error = UFS_TRUNCATE(tdvp, endoff, IO_NORMAL |
-		    (DOINGASYNC(tdvp) ? 0 : IO_SYNC), tcnp->cn_cred);
+		do {
+			error = UFS_TRUNCATE(tdvp, endoff, IO_NORMAL |
+			    (DOINGASYNC(tdvp) ? 0 : IO_SYNC), tcnp->cn_cred);
+		} while (error == ERELOOKUP);
if (error != 0 && !ffs_fsfail_cleanup(VFSTOUFS(mp), error))
vn_printf(tdvp,
"ufs_rename: failed to truncate, error %d\n",
@@ -1668,8 +1706,11 @@
*/
error = 0;
}
-	if (error == 0 && tdp->i_flag & IN_NEEDSYNC)
-		error = VOP_FSYNC(tdvp, MNT_WAIT, td);
+	if (error == 0 && tdp->i_flag & IN_NEEDSYNC) {
+		do {
+			error = VOP_FSYNC(tdvp, MNT_WAIT, td);
+		} while (error == ERELOOKUP);
+	}
vput(tdvp);
return (error);
@@ -1918,6 +1959,7 @@ ufs_mkdir(ap)
}
dmode = vap->va_mode & 0777;
dmode |= IFDIR;
/*
* Must simulate part of ufs_makeinode here to acquire the inode,
* but not have it entered in the parent directory. The entry is
@@ -1928,6 +1970,15 @@
error = EINVAL;
goto out;
}
if (DOINGSOFTDEP(dvp)) {
error = softdep_prelink(dvp, NULL, true);
if (error != 0) {
MPASS(error == ERELOOKUP);
return (error);
}
}
error = UFS_VALLOC(dvp, dmode, cnp->cn_cred, &tvp);
if (error)
goto out;
@@ -2184,6 +2235,14 @@ ufs_rmdir(ap)
error = EINVAL;
goto out;
}
if (DOINGSOFTDEP(dvp)) {
error = softdep_prelink(dvp, vp, false);
if (error != 0) {
MPASS(error == ERELOOKUP);
return (error);
}
}
#ifdef UFS_GJOURNAL
ufs_gjournal_orphan(vp);
#endif
@@ -2703,6 +2762,13 @@ ufs_makeinode(mode, dvp, vpp, cnp, callfunc)
print_bad_link_count(callfunc, dvp);
return (EINVAL);
}
if (DOINGSOFTDEP(dvp)) {
error = softdep_prelink(dvp, NULL, true);
if (error != 0) {
MPASS(error == ERELOOKUP);
return (error);
}
}
error = UFS_VALLOC(dvp, mode, cnp->cn_cred, &tvp);
if (error)
return (error);