This patch corrects the first round of panics and hangs reported

with the new snapshot code.

Update addaliasu to correctly implement the semantics of the old
checkalias function. When a device vnode first comes into existence,
check to see if an anonymous vnode for the same device was created
at boot time by bdevvp(). If so, adopt the bdevvp vnode rather than
creating a new vnode for the device. This corrects a problem which
caused the kernel to panic when taking a snapshot of the root
filesystem.

Change the calling convention of vn_write_suspend_wait() to be the
same as vn_start_write().

Split out softdep_flushworklist() from softdep_flushfiles() so that
it can be used to clear the work queue when suspending filesystem
operations.

Access to buffers becomes recursive so that snapshots can recursively
traverse their indirect blocks using ffs_copyonwrite() when checking
for the need for copy on write when flushing one of their own indirect
blocks. This eliminates a deadlock between the syncer daemon and a
process taking a snapshot.

Ensure that softdep_process_worklist() can never block because of a
snapshot being taken. This eliminates a problem with buffer starvation.

Cleanup change in ffs_sync() which did not synchronously wait when
MNT_WAIT was specified. The result was an unclean filesystem panic
when doing forcible unmount with heavy filesystem I/O in progress.

Return a zero'ed block when reading a block that was not in use at
the time that a snapshot was taken. Normally, these blocks should
never be read. However, the readahead code will occasionally read
them which can cause unexpected behavior.

Clean up the debugging code that ensures that no blocks be written
on a filesystem while it is suspended. Snapshots must explicitly
label the blocks that they are writing during the suspension so that
they do not cause a `write on suspended filesystem' panic.

Reorganize ffs_copyonwrite() to eliminate a deadlock and also to
prevent a race condition that would permit the same block to be
copied twice. This change eliminates an unexpected soft updates
inconsistency in fsck caused by the double allocation.

Use bqrelse rather than brelse for buffers that will be needed
soon again by the snapshot code. This improves snapshot performance.
This commit is contained in:
Kirk McKusick 2000-07-24 05:28:33 +00:00
parent 3adc8b3d1d
commit 9b97113391
Notes: svn2git 2020-12-20 02:59:44 +00:00
svn path=/head/; revision=63788
24 changed files with 298 additions and 103 deletions

View File

@ -855,7 +855,8 @@ cd9660_vget_internal(mp, ino, vpp, relocated, isodir)
case VCHR:
case VBLK:
vp->v_op = cd9660_specop_p;
addaliasu(vp, ip->inode.iso_rdev);
vp = addaliasu(vp, ip->inode.iso_rdev);
ip->i_vnode = vp;
break;
default:
break;

View File

@ -421,9 +421,11 @@ spec_strategy(ap)
bp = ap->a_bp;
vp = ap->a_vp;
if ((bp->b_iocmd == BIO_WRITE)) {
if (vp->v_mount != NULL &&
(vp->v_mount->mnt_kern_flag & MNTK_SUSPENDED) != 0)
if ((bp->b_flags & B_VALIDSUSPWRT) == 0 &&
bp->b_vp != NULL && bp->b_vp->v_mount != NULL &&
(bp->b_vp->v_mount->mnt_kern_flag & MNTK_SUSPENDED) != 0)
panic("spec_strategy: bad I/O");
bp->b_flags &= ~B_VALIDSUSPWRT;
if (LIST_FIRST(&bp->b_dep) != NULL)
buf_start(bp);
if ((vp->v_flag & VCOPYONWRITE) &&

View File

@ -147,7 +147,18 @@ ufs_bmaparray(vp, bn, bnp, ap, nump, runp, runb)
num = *nump;
if (num == 0) {
*bnp = blkptrtodb(ump, ip->i_db[bn]);
if (*bnp == 0) {
/*
* Since this is FFS independent code, we are out of
* scope for the definitions of BLK_NOCOPY and
* BLK_SNAP, but we do know that they will fall in
* the range 1..um_seqinc, so we use that test and
* return a request for a zeroed out buffer if attempts
* are made to read a BLK_NOCOPY or BLK_SNAP block.
*/
if ((ip->i_flags & SF_SNAPSHOT) &&
ip->i_db[bn] > 0 && ip->i_db[bn] < ump->um_seqinc) {
*bnp = -1;
} else if (*bnp == 0) {
if (ip->i_flags & SF_SNAPSHOT)
*bnp = blkptrtodb(ump, bn * ump->um_seqinc);
else
@ -230,6 +241,17 @@ ufs_bmaparray(vp, bn, bnp, ap, nump, runp, runb)
if (bp)
bqrelse(bp);
/*
* Since this is FFS independent code, we are out of scope for the
* definitions of BLK_NOCOPY and BLK_SNAP, but we do know that they
* will fall in the range 1..um_seqinc, so we use that test and
* return a request for a zeroed out buffer if attempts are made
* to read a BLK_NOCOPY or BLK_SNAP block.
*/
if ((ip->i_flags & SF_SNAPSHOT) && daddr > 0 && daddr < ump->um_seqinc){
*bnp = -1;
return (0);
}
*bnp = blkptrtodb(ump, daddr);
if (*bnp == 0) {
if (ip->i_flags & SF_SNAPSHOT)

View File

@ -147,7 +147,18 @@ ufs_bmaparray(vp, bn, bnp, ap, nump, runp, runb)
num = *nump;
if (num == 0) {
*bnp = blkptrtodb(ump, ip->i_db[bn]);
if (*bnp == 0) {
/*
* Since this is FFS independent code, we are out of
* scope for the definitions of BLK_NOCOPY and
* BLK_SNAP, but we do know that they will fall in
* the range 1..um_seqinc, so we use that test and
* return a request for a zeroed out buffer if attempts
* are made to read a BLK_NOCOPY or BLK_SNAP block.
*/
if ((ip->i_flags & SF_SNAPSHOT) &&
ip->i_db[bn] > 0 && ip->i_db[bn] < ump->um_seqinc) {
*bnp = -1;
} else if (*bnp == 0) {
if (ip->i_flags & SF_SNAPSHOT)
*bnp = blkptrtodb(ump, bn * ump->um_seqinc);
else
@ -230,6 +241,17 @@ ufs_bmaparray(vp, bn, bnp, ap, nump, runp, runb)
if (bp)
bqrelse(bp);
/*
* Since this is FFS independent code, we are out of scope for the
* definitions of BLK_NOCOPY and BLK_SNAP, but we do know that they
* will fall in the range 1..um_seqinc, so we use that test and
* return a request for a zeroed out buffer if attempts are made
* to read a BLK_NOCOPY or BLK_SNAP block.
*/
if ((ip->i_flags & SF_SNAPSHOT) && daddr > 0 && daddr < ump->um_seqinc){
*bnp = -1;
return (0);
}
*bnp = blkptrtodb(ump, daddr);
if (*bnp == 0) {
if (ip->i_flags & SF_SNAPSHOT)

View File

@ -855,7 +855,8 @@ cd9660_vget_internal(mp, ino, vpp, relocated, isodir)
case VCHR:
case VBLK:
vp->v_op = cd9660_specop_p;
addaliasu(vp, ip->inode.iso_rdev);
vp = addaliasu(vp, ip->inode.iso_rdev);
ip->i_vnode = vp;
break;
default:
break;

View File

@ -1296,15 +1296,45 @@ bdevvp(dev, vpp)
* how many users there are is inadequate; the v_usecount for
* the vnodes need to be accumulated. vcount() does that.
*/
void
struct vnode *
addaliasu(nvp, nvp_rdev)
struct vnode *nvp;
udev_t nvp_rdev;
{
struct vnode *ovp;
vop_t **ops;
dev_t dev;
if (nvp->v_type != VBLK && nvp->v_type != VCHR)
panic("addaliasu on non-special vnode");
addalias(nvp, udev2dev(nvp_rdev, nvp->v_type == VBLK ? 1 : 0));
dev = udev2dev(nvp_rdev, nvp->v_type == VBLK ? 1 : 0);
/*
* Check to see if we have a bdevvp vnode with no associated
* filesystem. If so, we want to associate the filesystem of
* the newly instigated vnode with the bdevvp vnode and
* discard the newly created vnode rather than leaving the
* bdevvp vnode lying around with no associated filesystem.
*/
if (vfinddev(dev, nvp->v_type, &ovp) == 0 || ovp->v_data != NULL) {
addalias(nvp, dev);
return (nvp);
}
/*
* Discard unneeded vnode, but save its node specific data.
* Note that if there is a lock, it is carried over in the
* node specific data to the replacement vnode.
*/
vref(ovp);
ovp->v_data = nvp->v_data;
ovp->v_tag = nvp->v_tag;
nvp->v_data = NULL;
ops = nvp->v_op;
nvp->v_op = ovp->v_op;
ovp->v_op = ops;
insmntque(ovp, nvp->v_mount);
vrele(nvp);
vgone(nvp);
return (ovp);
}
void
@ -1648,7 +1678,7 @@ vclean(vp, flags, p)
*/
if (flags & DOCLOSE) {
if (TAILQ_FIRST(&vp->v_dirtyblkhd) != NULL)
(void) vn_write_suspend_wait(vp, V_WAIT);
(void) vn_write_suspend_wait(vp, NULL, V_WAIT);
if (vinvalbuf(vp, V_SAVE, NOCRED, p, 0, 0) != 0)
vinvalbuf(vp, 0, NOCRED, p, 0, 0);
}

View File

@ -1296,15 +1296,45 @@ bdevvp(dev, vpp)
* how many users there are is inadequate; the v_usecount for
* the vnodes need to be accumulated. vcount() does that.
*/
void
struct vnode *
addaliasu(nvp, nvp_rdev)
struct vnode *nvp;
udev_t nvp_rdev;
{
struct vnode *ovp;
vop_t **ops;
dev_t dev;
if (nvp->v_type != VBLK && nvp->v_type != VCHR)
panic("addaliasu on non-special vnode");
addalias(nvp, udev2dev(nvp_rdev, nvp->v_type == VBLK ? 1 : 0));
dev = udev2dev(nvp_rdev, nvp->v_type == VBLK ? 1 : 0);
/*
* Check to see if we have a bdevvp vnode with no associated
* filesystem. If so, we want to associate the filesystem of
* the newly instigated vnode with the bdevvp vnode and
* discard the newly created vnode rather than leaving the
* bdevvp vnode lying around with no associated filesystem.
*/
if (vfinddev(dev, nvp->v_type, &ovp) == 0 || ovp->v_data != NULL) {
addalias(nvp, dev);
return (nvp);
}
/*
* Discard unneeded vnode, but save its node specific data.
* Note that if there is a lock, it is carried over in the
* node specific data to the replacement vnode.
*/
vref(ovp);
ovp->v_data = nvp->v_data;
ovp->v_tag = nvp->v_tag;
nvp->v_data = NULL;
ops = nvp->v_op;
nvp->v_op = ovp->v_op;
ovp->v_op = ops;
insmntque(ovp, nvp->v_mount);
vrele(nvp);
vgone(nvp);
return (ovp);
}
void
@ -1648,7 +1678,7 @@ vclean(vp, flags, p)
*/
if (flags & DOCLOSE) {
if (TAILQ_FIRST(&vp->v_dirtyblkhd) != NULL)
(void) vn_write_suspend_wait(vp, V_WAIT);
(void) vn_write_suspend_wait(vp, NULL, V_WAIT);
if (vinvalbuf(vp, V_SAVE, NOCRED, p, 0, 0) != 0)
vinvalbuf(vp, 0, NOCRED, p, 0, 0);
}

View File

@ -729,17 +729,19 @@ vn_start_write(vp, mpp, flags)
* time, these operations are halted until the suspension is over.
*/
int
vn_write_suspend_wait(vp, flags)
vn_write_suspend_wait(vp, mp, flags)
struct vnode *vp;
struct mount *mp;
int flags;
{
struct mount *mp;
int error;
if ((error = VOP_GETWRITEMOUNT(vp, &mp)) != 0) {
if (error != EOPNOTSUPP)
return (error);
return (0);
if (vp != NULL) {
if ((error = VOP_GETWRITEMOUNT(vp, &mp)) != 0) {
if (error != EOPNOTSUPP)
return (error);
return (0);
}
}
/*
* If we are not suspended or have not yet reached suspended

View File

@ -421,9 +421,11 @@ spec_strategy(ap)
bp = ap->a_bp;
vp = ap->a_vp;
if ((bp->b_iocmd == BIO_WRITE)) {
if (vp->v_mount != NULL &&
(vp->v_mount->mnt_kern_flag & MNTK_SUSPENDED) != 0)
if ((bp->b_flags & B_VALIDSUSPWRT) == 0 &&
bp->b_vp != NULL && bp->b_vp->v_mount != NULL &&
(bp->b_vp->v_mount->mnt_kern_flag & MNTK_SUSPENDED) != 0)
panic("spec_strategy: bad I/O");
bp->b_flags &= ~B_VALIDSUSPWRT;
if (LIST_FIRST(&bp->b_dep) != NULL)
buf_start(bp);
if ((vp->v_flag & VCOPYONWRITE) &&

View File

@ -1282,7 +1282,8 @@ nfs_loadattrcache(vpp, mdp, dposp, vaper)
}
if (vp->v_type == VCHR || vp->v_type == VBLK) {
vp->v_op = spec_nfsv2nodeop_p;
addaliasu(vp, rdev);
vp = addaliasu(vp, rdev);
np->n_vnode = vp;
}
np->n_mtime = mtime.tv_sec;
}

View File

@ -1282,7 +1282,8 @@ nfs_loadattrcache(vpp, mdp, dposp, vaper)
}
if (vp->v_type == VCHR || vp->v_type == VBLK) {
vp->v_op = spec_nfsv2nodeop_p;
addaliasu(vp, rdev);
vp = addaliasu(vp, rdev);
np->n_vnode = vp;
}
np->n_mtime = mtime.tv_sec;
}

View File

@ -1282,7 +1282,8 @@ nfs_loadattrcache(vpp, mdp, dposp, vaper)
}
if (vp->v_type == VCHR || vp->v_type == VBLK) {
vp->v_op = spec_nfsv2nodeop_p;
addaliasu(vp, rdev);
vp = addaliasu(vp, rdev);
np->n_vnode = vp;
}
np->n_mtime = mtime.tv_sec;
}

View File

@ -1282,7 +1282,8 @@ nfs_loadattrcache(vpp, mdp, dposp, vaper)
}
if (vp->v_type == VCHR || vp->v_type == VBLK) {
vp->v_op = spec_nfsv2nodeop_p;
addaliasu(vp, rdev);
vp = addaliasu(vp, rdev);
np->n_vnode = vp;
}
np->n_mtime = mtime.tv_sec;
}

View File

@ -183,7 +183,7 @@ struct buf {
#define B_UNUSED0 0x00000008 /* Old B_BAD */
#define B_DEFERRED 0x00000010 /* Skipped over for cleaning */
#define B_CACHE 0x00000020 /* Bread found us in the cache. */
#define B_UNUSED40 0x00000040 /* Old B_CALL */
#define B_VALIDSUSPWRT 0x00000040 /* Valid write during suspension. */
#define B_DELWRI 0x00000080 /* Delay I/O until buffer reused. */
#define B_DONE 0x00000200 /* I/O completed. */
#define B_EINTR 0x00000400 /* I/O was interrupted */
@ -237,7 +237,7 @@ extern char *buf_wmesg; /* Default buffer lock message */
* Initialize a lock.
*/
#define BUF_LOCKINIT(bp) \
lockinit(&(bp)->b_lock, PRIBIO + 4, buf_wmesg, 0, 0)
lockinit(&(bp)->b_lock, PRIBIO + 4, buf_wmesg, 0, LK_CANRECURSE)
/*
*
* Get a lock sleeping non-interruptably until it becomes available.
@ -467,6 +467,7 @@ buf_countdeps(struct buf *bp, int i)
#define B_CLRBUF 0x01 /* Request allocated buffer be cleared. */
#define B_SYNC 0x02 /* Do all allocations synchronously. */
#define B_METAONLY 0x04 /* Return indirect block buffer. */
#define B_NOWAIT 0x08 /* do not sleep to await lock */
#ifdef _KERNEL
extern int nbuf; /* The number of buffer headers */

View File

@ -537,7 +537,7 @@ struct vop_bwrite_args;
extern int (*lease_check_hook) __P((struct vop_lease_args *));
void addalias __P((struct vnode *vp, dev_t nvp_rdev));
void addaliasu __P((struct vnode *vp, udev_t nvp_rdev));
struct vnode *addaliasu __P((struct vnode *vp, udev_t nvp_rdev));
int bdevvp __P((dev_t dev, struct vnode **vpp));
/* cache_* may belong in namei.h. */
void cache_enter __P((struct vnode *dvp, struct vnode *vp,
@ -593,7 +593,8 @@ int vn_rdwr __P((enum uio_rw rw, struct vnode *vp, caddr_t base,
int vn_stat __P((struct vnode *vp, struct stat *sb, struct proc *p));
int vn_start_write __P((struct vnode *vp, struct mount **mpp, int flags));
dev_t vn_todev __P((struct vnode *vp));
int vn_write_suspend_wait __P((struct vnode *vp, int flags));
int vn_write_suspend_wait __P((struct vnode *vp, struct mount *mp,
int flags));
int vn_writechk __P((struct vnode *vp));
int vfs_cache_lookup __P((struct vop_lookup_args *ap));
int vfs_object_create __P((struct vnode *vp, struct proc *p,

View File

@ -116,6 +116,7 @@ extern vop_t **ffs_fifoop_p;
void softdep_initialize __P((void));
int softdep_mount __P((struct vnode *, struct mount *, struct fs *,
struct ucred *));
int softdep_flushworklist __P((struct mount *, int *, struct proc *));
int softdep_flushfiles __P((struct mount *, int, struct proc *));
void softdep_update_inodeblock __P((struct inode *, struct buf *, int));
void softdep_load_inodeblock __P((struct inode *));

View File

@ -30,7 +30,7 @@
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* @(#)ffs_snapshot.c 8.10 (McKusick) 7/11/00
* @(#)ffs_snapshot.c 8.11 (McKusick) 7/23/00
* $FreeBSD$
*/
@ -290,6 +290,7 @@ ffs_snapshot(mp, snapfile)
if (fs->fs_cgsize < fs->fs_bsize)
bzero(&nbp->b_data[fs->fs_cgsize],
fs->fs_bsize - fs->fs_cgsize);
nbp->b_flags |= B_VALIDSUSPWRT;
bawrite(nbp);
base = cg * fs->fs_fpg / fs->fs_frag;
if (base + len > numblks)
@ -311,6 +312,7 @@ ffs_snapshot(mp, snapfile)
indiroff = (base + loc - NDADDR) % NINDIR(fs);
for ( ; loc < len; loc++, indiroff++) {
if (indiroff >= NINDIR(fs)) {
ibp->b_flags |= B_VALIDSUSPWRT;
bawrite(ibp);
error = VOP_BALLOC(vp,
lblktosize(fs, (off_t)(base + loc)),
@ -325,7 +327,8 @@ ffs_snapshot(mp, snapfile)
continue;
((ufs_daddr_t *)(ibp->b_data))[indiroff] = BLK_NOCOPY;
}
brelse(bp);
bqrelse(bp);
ibp->b_flags |= B_VALIDSUSPWRT;
bdwrite(ibp);
}
/*
@ -340,6 +343,7 @@ ffs_snapshot(mp, snapfile)
if (fs->fs_sbsize < fs->fs_bsize)
bzero(&nbp->b_data[fs->fs_sbsize],
fs->fs_bsize - fs->fs_sbsize);
nbp->b_flags |= B_VALIDSUSPWRT;
bawrite(nbp);
blkno = fragstoblks(fs, fs->fs_csaddr);
len = howmany(fs->fs_cssize, fs->fs_bsize) - 1;
@ -354,6 +358,7 @@ ffs_snapshot(mp, snapfile)
size = fs->fs_cssize % fs->fs_bsize;
}
bcopy(fs->fs_csp[loc], nbp->b_data, size);
nbp->b_flags |= B_VALIDSUSPWRT;
bawrite(nbp);
}
/*
@ -366,6 +371,7 @@ ffs_snapshot(mp, snapfile)
if (error)
goto out1;
readblock(nbp, inoblks[loc]);
nbp->b_flags |= B_VALIDSUSPWRT;
bdwrite(nbp);
}
/*
@ -410,6 +416,7 @@ ffs_snapshot(mp, snapfile)
dip->di_blocks = 0;
dip->di_flags &= ~(SF_IMMUTABLE | SF_SNAPSHOT);
bzero(&dip->di_db[0], (NDADDR + NIADDR) * sizeof(ufs_daddr_t));
nbp->b_flags |= B_VALIDSUSPWRT;
bdwrite(nbp);
}
/*
@ -422,7 +429,7 @@ ffs_snapshot(mp, snapfile)
if (error)
goto out1;
copyblkno = fragstoblks(fs, dbtofsb(fs, ibp->b_blkno));
brelse(ibp);
bqrelse(ibp);
error = VOP_BALLOC(vp, lblktosize(fs, (off_t)copyblkno),
fs->fs_bsize, p->p_ucred, 0, &nbp);
if (error)
@ -434,7 +441,8 @@ ffs_snapshot(mp, snapfile)
goto out1;
}
bcopy(ibp->b_data, nbp->b_data, fs->fs_bsize);
brelse(ibp);
bqrelse(ibp);
nbp->b_flags |= B_VALIDSUSPWRT;
bawrite(nbp);
}
/*
@ -518,7 +526,7 @@ indiracct(snapvp, cancelvp, level, blkno, lbn, rlbn, remblks, blksperindir)
} else {
MALLOC(bap, ufs_daddr_t *, fs->fs_bsize, M_DEVBUF, M_WAITOK);
bcopy(bp->b_data, (caddr_t)bap, fs->fs_bsize);
brelse(bp);
bqrelse(bp);
}
error = snapacct(snapvp, &bap[0], &bap[last]);
if (error || level == 0)
@ -539,7 +547,7 @@ indiracct(snapvp, cancelvp, level, blkno, lbn, rlbn, remblks, blksperindir)
}
out:
if (snapvp != cancelvp)
brelse(bp);
bqrelse(bp);
else
FREE(bap, M_DEVBUF);
return (error);
@ -578,8 +586,10 @@ snapacct(vp, oldblkp, lastblkp)
if (*blkp != 0)
panic("snapacct: bad block");
*blkp = BLK_SNAP;
if (lbn >= NDADDR)
if (lbn >= NDADDR) {
ibp->b_flags |= B_VALIDSUSPWRT;
bdwrite(ibp);
}
}
return (0);
}
@ -732,7 +742,7 @@ ffs_snapblkfree(freeip, bno, size)
default:
case BLK_NOCOPY:
if (lbn >= NDADDR)
brelse(ibp);
bqrelse(ibp);
continue;
/*
* No previous snapshot claimed the block, so it will be
@ -787,7 +797,7 @@ ffs_snapblkfree(freeip, bno, size)
return (1);
}
if (lbn >= NDADDR)
brelse(ibp);
bqrelse(ibp);
/*
* Allocate the block into which to do the copy. Note that this
* allocation will never require any additional allocations for
@ -933,40 +943,57 @@ ffs_copyonwrite(ap)
if (bp->b_vp == vp)
continue;
/*
* Check to see if block needs to be copied.
* Check to see if block needs to be copied. We have to
* be able to do the VOP_BALLOC without blocking, otherwise
* we may get in a deadlock with another process also
* trying to allocate. If we find ourselves unable to
* get the buffer lock, we unlock the snapshot vnode,
* sleep briefly, and try again.
*/
retry:
vn_lock(vp, LK_SHARED | LK_RETRY, p);
if (lbn < NDADDR) {
blkno = ip->i_db[lbn];
} else {
vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, p);
p->p_flag |= P_COWINPROGRESS;
error = VOP_BALLOC(vp, lblktosize(fs, (off_t)lbn),
fs->fs_bsize, KERNCRED, B_METAONLY, &ibp);
fs->fs_bsize, KERNCRED, B_METAONLY | B_NOWAIT, &ibp);
p->p_flag &= ~P_COWINPROGRESS;
VOP_UNLOCK(vp, 0, p);
if (error)
break;
if (error) {
VOP_UNLOCK(vp, 0, p);
if (error != EWOULDBLOCK)
break;
tsleep(vp, p->p_usrpri, "nap", 1);
goto retry;
}
indiroff = (lbn - NDADDR) % NINDIR(fs);
blkno = ((ufs_daddr_t *)(ibp->b_data))[indiroff];
brelse(ibp);
bqrelse(ibp);
}
#ifdef DIAGNOSTIC
if (blkno == BLK_SNAP && bp->b_lblkno >= 0)
panic("ffs_copyonwrite: bad copy block");
#endif
if (blkno != 0)
if (blkno != 0) {
VOP_UNLOCK(vp, 0, p);
continue;
}
/*
* Allocate the block into which to do the copy. Note that this
* allocation will never require any additional allocations for
* the snapshot inode.
*/
vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, p);
p->p_flag |= P_COWINPROGRESS;
error = VOP_BALLOC(vp, lblktosize(fs, (off_t)lbn),
fs->fs_bsize, KERNCRED, 0, &cbp);
fs->fs_bsize, KERNCRED, B_NOWAIT, &cbp);
p->p_flag &= ~P_COWINPROGRESS;
VOP_UNLOCK(vp, 0, p);
if (error) {
if (error != EWOULDBLOCK)
break;
tsleep(vp, p->p_usrpri, "nap", 1);
goto retry;
}
#ifdef DEBUG
if (snapdebug) {
printf("Copyonwrite: snapino %d lbn %d for ",
@ -979,8 +1006,6 @@ ffs_copyonwrite(ap)
cbp->b_blkno);
}
#endif
if (error)
break;
/*
* If we have already read the old block contents, then
* simply copy them to the new block.

View File

@ -548,41 +548,45 @@ softdep_process_worklist(matchmnt)
case D_DIRREM:
/* removal of a directory entry */
mp = WK_DIRREM(wk)->dm_mnt;
if (vn_write_suspend_wait(NULL, mp, V_NOWAIT))
panic("%s: dirrem on suspended filesystem",
"softdep_process_worklist");
if (mp == matchmnt)
matchcnt += 1;
vn_start_write(NULL, &mp, V_WAIT);
handle_workitem_remove(WK_DIRREM(wk));
vn_finished_write(mp);
break;
case D_FREEBLKS:
/* releasing blocks and/or fragments from a file */
mp = WK_FREEBLKS(wk)->fb_mnt;
if (vn_write_suspend_wait(NULL, mp, V_NOWAIT))
panic("%s: freeblks on suspended filesystem",
"softdep_process_worklist");
if (mp == matchmnt)
matchcnt += 1;
vn_start_write(NULL, &mp, V_WAIT);
handle_workitem_freeblocks(WK_FREEBLKS(wk));
vn_finished_write(mp);
break;
case D_FREEFRAG:
/* releasing a fragment when replaced as a file grows */
mp = WK_FREEFRAG(wk)->ff_mnt;
if (vn_write_suspend_wait(NULL, mp, V_NOWAIT))
panic("%s: freefrag on suspended filesystem",
"softdep_process_worklist");
if (mp == matchmnt)
matchcnt += 1;
vn_start_write(NULL, &mp, V_WAIT);
handle_workitem_freefrag(WK_FREEFRAG(wk));
vn_finished_write(mp);
break;
case D_FREEFILE:
/* releasing an inode when its link count drops to 0 */
mp = WK_FREEFILE(wk)->fx_mnt;
if (vn_write_suspend_wait(NULL, mp, V_NOWAIT))
panic("%s: freefile on suspended filesystem",
"softdep_process_worklist");
if (mp == matchmnt)
matchcnt += 1;
vn_start_write(NULL, &mp, V_WAIT);
handle_workitem_freefile(WK_FREEFILE(wk));
vn_finished_write(mp);
break;
default:
@ -646,13 +650,13 @@ softdep_move_dependencies(oldbp, newbp)
* Purge the work list of all items associated with a particular mount point.
*/
int
softdep_flushfiles(oldmnt, flags, p)
softdep_flushworklist(oldmnt, countp, p)
struct mount *oldmnt;
int flags;
int *countp;
struct proc *p;
{
struct vnode *devvp;
int error, loopcnt;
int count, error = 0;
/*
* Await our turn to clear out the queue.
@ -660,32 +664,16 @@ softdep_flushfiles(oldmnt, flags, p)
while (softdep_worklist_busy)
tsleep(&lbolt, PRIBIO, "softflush", 0);
softdep_worklist_busy = 1;
if ((error = ffs_flushfiles(oldmnt, flags, p)) != 0) {
softdep_worklist_busy = 0;
return (error);
}
/*
* Alternately flush the block device associated with the mount
* point and process any dependencies that the flushing
* creates. In theory, this loop can happen at most twice,
* but we give it a few extra just to be sure.
* creates. We continue until no more worklist dependencies
* are found.
*/
*countp = 0;
devvp = VFSTOUFS(oldmnt)->um_devvp;
for (loopcnt = 10; loopcnt > 0; ) {
if (softdep_process_worklist(oldmnt) == 0) {
loopcnt--;
/*
* Do another flush in case any vnodes were brought in
* as part of the cleanup operations.
*/
if ((error = ffs_flushfiles(oldmnt, flags, p)) != 0)
break;
/*
* If we still found nothing to do, we are really done.
*/
if (softdep_process_worklist(oldmnt) == 0)
break;
}
while ((count = softdep_process_worklist(oldmnt)) > 0) {
*countp += count;
vn_lock(devvp, LK_EXCLUSIVE | LK_RETRY, p);
error = VOP_FSYNC(devvp, p->p_ucred, MNT_WAIT, p);
VOP_UNLOCK(devvp, 0, p);
@ -693,6 +681,37 @@ softdep_flushfiles(oldmnt, flags, p)
break;
}
softdep_worklist_busy = 0;
return (error);
}
/*
* Flush all vnodes and worklist items associated with a specified mount point.
*/
int
softdep_flushfiles(oldmnt, flags, p)
struct mount *oldmnt;
int flags;
struct proc *p;
{
int error, count, loopcnt;
/*
* Alternately flush the vnodes associated with the mount
* point and process any dependencies that the flushing
* creates. In theory, this loop can happen at most twice,
* but we give it a few extra just to be sure.
*/
for (loopcnt = 10; loopcnt > 0; loopcnt--) {
/*
* Do another flush in case any vnodes were brought in
* as part of the cleanup operations.
*/
if ((error = ffs_flushfiles(oldmnt, flags, p)) != 0)
break;
if ((error = softdep_flushworklist(oldmnt, &count, p)) != 0 ||
count == 0)
break;
}
/*
* If we are unmounting then it is an error to fail. If we
* are simply trying to downgrade to read-only, then filesystem
@ -4432,8 +4451,8 @@ clear_remove(p)
mp = pagedep->pd_mnt;
ino = pagedep->pd_ino;
FREE_LOCK(&lk);
if (vn_start_write(NULL, &mp, V_WAIT | PCATCH) != 0)
return;
if (vn_start_write(NULL, &mp, V_NOWAIT) != 0)
continue;
if ((error = VFS_VGET(mp, ino, &vp)) != 0) {
softdep_error("clear_remove: vget", error);
vn_finished_write(mp);
@ -4503,8 +4522,8 @@ clear_inodedeps(p)
if (inodedep_lookup(fs, ino, 0, &inodedep) == 0)
continue;
FREE_LOCK(&lk);
if (vn_start_write(NULL, &mp, V_WAIT | PCATCH) != 0)
return;
if (vn_start_write(NULL, &mp, V_NOWAIT) != 0)
continue;
if ((error = VFS_VGET(mp, ino, &vp)) != 0) {
softdep_error("clear_inodedeps: vget", error);
vn_finished_write(mp);

View File

@ -908,7 +908,7 @@ ffs_sync(mp, waitfor, cred, p)
struct inode *ip;
struct ufsmount *ump = VFSTOUFS(mp);
struct fs *fs;
int error, allerror = 0;
int error, count, wait, lockreq, allerror = 0;
fs = ump->um_fs;
if (fs->fs_fmod != 0 && fs->fs_ronly != 0) { /* XXX */
@ -918,6 +918,12 @@ ffs_sync(mp, waitfor, cred, p)
/*
* Write back each (modified) inode.
*/
wait = 0;
lockreq = LK_EXCLUSIVE | LK_NOWAIT | LK_INTERLOCK;
if (waitfor == MNT_WAIT) {
wait = 1;
lockreq = LK_EXCLUSIVE | LK_INTERLOCK;
}
simple_lock(&mntvnode_slock);
loop:
for (vp = mp->mnt_vnodelist.lh_first; vp != NULL; vp = nvp) {
@ -938,9 +944,7 @@ ffs_sync(mp, waitfor, cred, p)
}
if (vp->v_type != VCHR) {
simple_unlock(&mntvnode_slock);
error =
vget(vp, LK_EXCLUSIVE | LK_NOWAIT | LK_INTERLOCK, p);
if (error) {
if ((error = vget(vp, lockreq, p)) != 0) {
simple_lock(&mntvnode_slock);
if (error == ENOENT)
goto loop;
@ -948,14 +952,12 @@ ffs_sync(mp, waitfor, cred, p)
}
if ((error = VOP_FSYNC(vp, cred, waitfor, p)) != 0)
allerror = error;
VOP_UNLOCK(vp, 0, p);
vrele(vp);
vput(vp);
simple_lock(&mntvnode_slock);
} else {
simple_unlock(&mntvnode_slock);
simple_unlock(&vp->v_interlock);
/* UFS_UPDATE(vp, waitfor == MNT_WAIT); */
UFS_UPDATE(vp, 0);
UFS_UPDATE(vp, wait);
simple_lock(&mntvnode_slock);
}
}
@ -963,9 +965,16 @@ ffs_sync(mp, waitfor, cred, p)
/*
* Force stale file system control information to be flushed.
*/
if (waitfor != MNT_LAZY) {
if (ump->um_mountp->mnt_flag & MNT_SOFTDEP)
waitfor = MNT_NOWAIT;
if (waitfor == MNT_WAIT) {
if ((error = softdep_flushworklist(ump->um_mountp, &count, p)))
allerror = error;
/* Flushed work items may create new vnodes to clean */
if (count) {
simple_lock(&mntvnode_slock);
goto loop;
}
}
if (waitfor == MNT_NOWAIT) {
vn_lock(ump->um_devvp, LK_EXCLUSIVE | LK_RETRY, p);
if ((error = VOP_FSYNC(ump->um_devvp, cred, waitfor, p)) != 0)
allerror = error;

View File

@ -248,7 +248,7 @@ mfs_mount(mp, path, data, ndp, p)
/* It is not clear that these will get initialized otherwise */
dev->si_bsize_phys = DEV_BSIZE;
dev->si_iosize_max = DFLTPHYS;
addaliasu(devvp, makeudev(253, mfs_minor++));
devvp = addaliasu(devvp, makeudev(253, mfs_minor++));
devvp->v_data = mfsp;
mfsp->mfs_baseoff = args.base;
mfsp->mfs_size = args.size;

View File

@ -147,7 +147,18 @@ ufs_bmaparray(vp, bn, bnp, ap, nump, runp, runb)
num = *nump;
if (num == 0) {
*bnp = blkptrtodb(ump, ip->i_db[bn]);
if (*bnp == 0) {
/*
* Since this is FFS independent code, we are out of
* scope for the definitions of BLK_NOCOPY and
* BLK_SNAP, but we do know that they will fall in
* the range 1..um_seqinc, so we use that test and
* return a request for a zeroed out buffer if attempts
* are made to read a BLK_NOCOPY or BLK_SNAP block.
*/
if ((ip->i_flags & SF_SNAPSHOT) &&
ip->i_db[bn] > 0 && ip->i_db[bn] < ump->um_seqinc) {
*bnp = -1;
} else if (*bnp == 0) {
if (ip->i_flags & SF_SNAPSHOT)
*bnp = blkptrtodb(ump, bn * ump->um_seqinc);
else
@ -230,6 +241,17 @@ ufs_bmaparray(vp, bn, bnp, ap, nump, runp, runb)
if (bp)
bqrelse(bp);
/*
* Since this is FFS independent code, we are out of scope for the
* definitions of BLK_NOCOPY and BLK_SNAP, but we do know that they
* will fall in the range 1..um_seqinc, so we use that test and
* return a request for a zeroed out buffer if attempts are made
* to read a BLK_NOCOPY or BLK_SNAP block.
*/
if ((ip->i_flags & SF_SNAPSHOT) && daddr > 0 && daddr < ump->um_seqinc){
*bnp = -1;
return (0);
}
*bnp = blkptrtodb(ump, daddr);
if (*bnp == 0) {
if (ip->i_flags & SF_SNAPSHOT)

View File

@ -77,7 +77,7 @@ ufs_inactive(ap)
if (ip->i_mode == 0)
goto out;
if (ip->i_nlink <= 0 && (vp->v_mount->mnt_flag & MNT_RDONLY) == 0) {
(void) vn_write_suspend_wait(vp, V_WAIT);
(void) vn_write_suspend_wait(vp, NULL, V_WAIT);
#ifdef QUOTA
if (!getinoquota(ip))
(void)chkiq(ip, -1, NOCRED, 0);
@ -94,10 +94,10 @@ ufs_inactive(ap)
}
if (ip->i_flag & (IN_ACCESS | IN_CHANGE | IN_MODIFIED | IN_UPDATE)) {
if ((ip->i_flag & (IN_CHANGE | IN_UPDATE | IN_MODIFIED)) == 0 &&
vn_write_suspend_wait(vp, V_NOWAIT)) {
vn_write_suspend_wait(vp, NULL, V_NOWAIT)) {
ip->i_flag &= ~IN_ACCESS;
} else {
(void) vn_write_suspend_wait(vp, V_WAIT);
(void) vn_write_suspend_wait(vp, NULL, V_WAIT);
UFS_UPDATE(vp, 0);
}
}

View File

@ -898,7 +898,7 @@ dqsync(vp, dq)
return (0);
if ((dqvp = dq->dq_ump->um_quotas[dq->dq_type]) == NULLVP)
panic("dqsync: file");
(void) vn_write_suspend_wait(dqvp, V_WAIT);
(void) vn_write_suspend_wait(dqvp, NULL, V_WAIT);
if (vp != dqvp)
vn_lock(dqvp, LK_EXCLUSIVE | LK_RETRY, p);
while (dq->dq_flags & DQ_LOCK) {

View File

@ -2036,7 +2036,8 @@ ufs_vinit(mntp, specops, fifoops, vpp)
case VCHR:
case VBLK:
vp->v_op = specops;
addaliasu(vp, ip->i_rdev);
vp = addaliasu(vp, ip->i_rdev);
ip->i_vnode = vp;
break;
case VFIFO:
vp->v_op = fifoops;