When closing the last reference to an unlinked file, it is freed by the
inactive routine. Because the freeing causes the filesystem to be
modified, the close must be held up during periods when the filesystem
is suspended.

For snapshots to be consistent across crashes, they must write blocks
that they copy and claim those written blocks in their on-disk block
pointers before the old blocks that they referenced can be allowed to
be written.

Close a loophole that allowed unwritten blocks to be skipped when doing
ffs_sync with a request to wait for all I/O activity to be completed.
commit 112f737245
parent e69b2bc11c
@@ -235,6 +235,15 @@ vn_close(vp, flags, cred, p)
 	if (flags & FWRITE)
 		vp->v_writecount--;
 	error = VOP_CLOSE(vp, flags, cred, p);
+	/*
+	 * XXX - In certain instances VOP_CLOSE has to do the vrele
+	 * itself. If the vrele has been done, it will return EAGAIN
+	 * to indicate that the vrele should not be done again. When
+	 * this happens, we just return success. The correct thing to
+	 * do would be to have all VOP_CLOSE instances do the vrele.
+	 */
+	if (error == EAGAIN)
+		return (0);
 	vrele(vp);
 	return (error);
 }
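The EAGAIN convention above is easy to get wrong: exactly one of
vn_close() and the VOP_CLOSE() implementation may drop the vnode
reference, or the vnode is released twice. Below is a minimal
user-space sketch of the contract; the toy_* names are invented
stand-ins for the kernel primitives, and only the control flow mirrors
the diff.

    #include <errno.h>
    #include <stdio.h>

    struct toy_vnode {
        int usecount;           /* reference count dropped by toy_vrele() */
    };

    static void
    toy_vrele(struct toy_vnode *vp)
    {
        if (--vp->usecount == 0)
            printf("last reference dropped; inactive routine runs\n");
    }

    /*
     * A close op may need to drop the final reference itself (for
     * example, to block inside a write gate first).  It signals that
     * by returning EAGAIN so the caller does not vrele() again.
     */
    static int
    toy_close_op(struct toy_vnode *vp)
    {
        if (vp->usecount == 1) {
            toy_vrele(vp);      /* op consumed the reference */
            return (EAGAIN);
        }
        return (0);             /* caller still owns the reference */
    }

    static int
    toy_vn_close(struct toy_vnode *vp)
    {
        int error = toy_close_op(vp);

        if (error == EAGAIN)    /* reference already released */
            return (0);
        toy_vrele(vp);
        return (error);
    }

    int
    main(void)
    {
        struct toy_vnode vn = { .usecount = 1 };

        return (toy_vn_close(&vn));
    }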
@@ -198,10 +198,14 @@ restart:
 	}
 	/*
 	 * Allocate shadow blocks to copy all of the other snapshot inodes
-	 * so that we will be able to expunge them from this snapshot.
+	 * so that we will be able to expunge them from this snapshot. Also
+	 * include a copy of ourselves so that we do not deadlock trying
+	 * to copyonwrite ourselves when VOP_FSYNC'ing below.
 	 */
-	for (loc = 0, inoblkcnt = 0; loc < snaploc; loc++) {
+	fs->fs_snapinum[snaploc] = ip->i_number;
+	for (loc = snaploc, inoblkcnt = 0; loc >= 0; loc--) {
 		blkno = fragstoblks(fs, ino_to_fsba(fs, fs->fs_snapinum[loc]));
+		fs->fs_snapinum[snaploc] = 0;
 		for (i = 0; i < inoblkcnt; i++)
 			if (inoblks[i] == blkno)
 				break;
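The subtle point in this hunk: the snapshot temporarily registers its
own inode in fs_snapinum so that the loop also allocates a shadow copy
of the snapshot's own inode block; without it, the VOP_FSYNC later in
ffs_snapshot() would trigger copy-on-write against the snapshot itself
and deadlock on its own lock. A rough user-space sketch of the
"register yourself before the pass, deregister after" pattern follows;
all names in it (TOY_MAXSNAP, snap_inum, shadow_inode_block) are
invented, not the kernel's.

    #include <stdio.h>

    #define TOY_MAXSNAP 20

    static int snap_inum[TOY_MAXSNAP];  /* registered snapshots; 0 = empty */

    static void
    shadow_inode_block(int inum)
    {
        /* stand-in: pre-allocate a private copy of inum's inode block */
        printf("shadowed inode block of inode %d\n", inum);
    }

    static void
    prepare_snapshot(int self_inum, int slot)
    {
        int loc;

        /*
         * Register ourselves first so the pass below shadows our own
         * inode block too; a later self-referential sync then finds
         * the copy already in place instead of recursing into itself.
         */
        snap_inum[slot] = self_inum;
        for (loc = slot; loc >= 0; loc--)
            if (snap_inum[loc] != 0)
                shadow_inode_block(snap_inum[loc]);
        snap_inum[slot] = 0;    /* not officially registered yet */
    }

    int
    main(void)
    {
        snap_inum[0] = 11;      /* a pre-existing snapshot inode */
        prepare_snapshot(42, 1);
        return (0);
    }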
@@ -652,14 +656,14 @@ ffs_snapremove(vp)
 	ip = VTOI(vp);
 	fs = ip->i_fs;
 	/*
-	 * Delete from incore list.
+	 * If active, delete from incore list (this snapshot may
+	 * already have been in the process of being deleted, so
+	 * would not have been active).
+	 *
+	 * Clear copy-on-write flag if last snapshot.
 	 */
-	devvp = ip->i_devvp;
-	if (ip->i_nextsnap.tqe_prev == 0) {
-		printf("ffs_snapremove: lost snapshot vnode %d\n",
-		    ip->i_number);
-	} else {
+	if (ip->i_nextsnap.tqe_prev != 0) {
+		devvp = ip->i_devvp;
 		TAILQ_REMOVE(&devvp->v_rdev->si_snapshots, ip, i_nextsnap);
 		ip->i_nextsnap.tqe_prev = 0;
 		if (TAILQ_FIRST(&devvp->v_rdev->si_snapshots) == 0) {
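The rewritten check relies on an idiom worth making explicit:
TAILQ_REMOVE() does not clear the removed entry's linkage, so the code
nulls i_nextsnap.tqe_prev by hand and afterwards treats tqe_prev == 0
as "not on the snapshot list", which is what lets a second removal
attempt be tolerated quietly. A self-contained sketch of the idiom
using <sys/queue.h>; the node type and list name are invented.

    #include <sys/queue.h>
    #include <stdio.h>

    struct node {
        int id;
        TAILQ_ENTRY(node) link;     /* link.tqe_prev == NULL => off-list */
    };

    TAILQ_HEAD(nodelist, node);

    static void
    node_remove(struct nodelist *head, struct node *n)
    {
        /* Tolerate double removal: only unlink if still on the list. */
        if (n->link.tqe_prev != NULL) {
            TAILQ_REMOVE(head, n, link);
            n->link.tqe_prev = NULL;    /* mark as off-list by hand */
        }
    }

    int
    main(void)
    {
        struct nodelist head = TAILQ_HEAD_INITIALIZER(head);
        struct node n = { .id = 1 };

        TAILQ_INSERT_TAIL(&head, &n, link);
        node_remove(&head, &n);
        node_remove(&head, &n);         /* safe no-op the second time */
        printf("list %s\n", TAILQ_EMPTY(&head) ? "empty" : "non-empty");
        return (0);
    }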
@@ -832,9 +836,10 @@ ffs_snapblkfree(freeip, bno, size)
 		error = VOP_BALLOC(vp, lblktosize(fs, (off_t)lbn),
 		    fs->fs_bsize, KERNCRED, 0, &cbp);
 		p->p_flag &= ~P_COWINPROGRESS;
-		VOP_UNLOCK(vp, 0, p);
-		if (error)
+		if (error) {
+			VOP_UNLOCK(vp, 0, p);
 			break;
+		}
 #ifdef DEBUG
 		if (snapdebug)
 			printf("%s%d lbn %d for inum %d size %ld to blkno %d\n",
@@ -843,22 +848,44 @@ ffs_snapblkfree(freeip, bno, size)
 #endif
 		/*
 		 * If we have already read the old block contents, then
-		 * simply copy them to the new block.
+		 * simply copy them to the new block. Note that we need
+		 * to synchronously write snapshots that have not been
+		 * unlinked, and hence will be visible after a crash,
+		 * to ensure their integrity.
 		 */
 		if (savedcbp != 0) {
 			bcopy(savedcbp->b_data, cbp->b_data, fs->fs_bsize);
 			bawrite(cbp);
+			if (ip->i_effnlink > 0)
+				(void) VOP_FSYNC(vp, KERNCRED, MNT_WAIT, p);
+			VOP_UNLOCK(vp, 0, p);
 			continue;
 		}
 		/*
 		 * Otherwise, read the old block contents into the buffer.
 		 */
-		if ((error = readblock(cbp, lbn)) != 0)
+		if ((error = readblock(cbp, lbn)) != 0) {
+			bzero(cbp->b_data, fs->fs_bsize);
+			bawrite(cbp);
+			if (ip->i_effnlink > 0)
+				(void) VOP_FSYNC(vp, KERNCRED, MNT_WAIT, p);
+			VOP_UNLOCK(vp, 0, p);
 			break;
+		}
 		savedcbp = cbp;
 	}
-	if (savedcbp)
+	/*
+	 * Note that we need to synchronously write snapshots that
+	 * have not been unlinked, and hence will be visible after
+	 * a crash, to ensure their integrity.
+	 */
+	if (savedcbp) {
+		vp = savedcbp->b_vp;
 		bawrite(savedcbp);
+		if (VTOI(vp)->i_effnlink > 0)
+			(void) VOP_FSYNC(vp, KERNCRED, MNT_WAIT, p);
+		VOP_UNLOCK(vp, 0, p);
+	}
 	/*
 	 * If we have been unable to allocate a block in which to do
 	 * the copy, then return non-zero so that the fragment will
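The rule encoded by the repeated i_effnlink > 0 checks: a snapshot that
is still linked will be visible after a crash, so a block just copied
into it must reach stable storage before the caller is allowed to
overwrite or free the old block; an unlinked snapshot dies with the
crash anyway, so the cheaper asynchronous write suffices. A user-space
sketch of that ordering, with an invented commit_copy() helper:

    #include <sys/types.h>
    #include <fcntl.h>
    #include <unistd.h>

    /*
     * Write one copied block and, if the copy will survive a crash,
     * force it to stable storage before the caller may reuse the
     * original block.  Hypothetical helper, not kernel code.
     */
    static int
    commit_copy(int copy_fd, const void *buf, size_t len, off_t off,
        int still_linked)
    {
        if (pwrite(copy_fd, buf, len, off) != (ssize_t)len)
            return (-1);
        if (still_linked && fsync(copy_fd) != 0)
            return (-1);
        return (0);     /* only now may the old block be rewritten */
    }

    int
    main(void)
    {
        int fd = open("copy.bin", O_CREAT | O_WRONLY, 0600);

        if (fd < 0)
            return (1);
        return (commit_copy(fd, "data", 4, 0, 1) != 0);
    }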
@@ -1014,8 +1041,8 @@ retry:
 		error = VOP_BALLOC(vp, lblktosize(fs, (off_t)lbn),
 		    fs->fs_bsize, KERNCRED, B_NOWAIT, &cbp);
 		p->p_flag &= ~P_COWINPROGRESS;
-		VOP_UNLOCK(vp, 0, p);
 		if (error) {
+			VOP_UNLOCK(vp, 0, p);
 			if (error != EWOULDBLOCK)
 				break;
 			tsleep(vp, p->p_pri.pri_user, "nap", 1);
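Moving VOP_UNLOCK inside the error branch matters because of the retry
pattern around it: the allocation is attempted with B_NOWAIT while the
snapshot lock is held, and on EWOULDBLOCK the code must release its own
lock, nap briefly, and retry rather than sleep while holding the lock.
A pthreads sketch of the same try/unlock/nap/retry shape; the lock
names are invented.

    #include <errno.h>
    #include <pthread.h>
    #include <unistd.h>

    static pthread_mutex_t mine = PTHREAD_MUTEX_INITIALIZER;
    static pthread_mutex_t resource = PTHREAD_MUTEX_INITIALIZER;

    static void
    locked_alloc(void)
    {
        pthread_mutex_lock(&mine);
        /* try the resource without blocking while we hold our lock */
        while (pthread_mutex_trylock(&resource) == EBUSY) {
            pthread_mutex_unlock(&mine);    /* let others make progress */
            usleep(1000);                   /* ~ tsleep(vp, ..., "nap", 1) */
            pthread_mutex_lock(&mine);
        }
        /* both locks held: do the allocation here */
        pthread_mutex_unlock(&resource);
        pthread_mutex_unlock(&mine);
    }

    int
    main(void)
    {
        locked_alloc();
        return (0);
    }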
@@ -1035,22 +1062,44 @@ retry:
 #endif
 		/*
 		 * If we have already read the old block contents, then
-		 * simply copy them to the new block.
+		 * simply copy them to the new block. Note that we need
+		 * to synchronously write snapshots that have not been
+		 * unlinked, and hence will be visible after a crash,
+		 * to ensure their integrity.
 		 */
 		if (savedcbp != 0) {
 			bcopy(savedcbp->b_data, cbp->b_data, fs->fs_bsize);
 			bawrite(cbp);
+			if (ip->i_effnlink > 0)
+				(void) VOP_FSYNC(vp, KERNCRED, MNT_WAIT, p);
+			VOP_UNLOCK(vp, 0, p);
 			continue;
 		}
 		/*
 		 * Otherwise, read the old block contents into the buffer.
 		 */
-		if ((error = readblock(cbp, lbn)) != 0)
+		if ((error = readblock(cbp, lbn)) != 0) {
+			bzero(cbp->b_data, fs->fs_bsize);
+			bawrite(cbp);
+			if (ip->i_effnlink > 0)
+				(void) VOP_FSYNC(vp, KERNCRED, MNT_WAIT, p);
+			VOP_UNLOCK(vp, 0, p);
 			break;
+		}
 		savedcbp = cbp;
 	}
-	if (savedcbp)
+	/*
+	 * Note that we need to synchronously write snapshots that
+	 * have not been unlinked, and hence will be visible after
+	 * a crash, to ensure their integrity.
+	 */
+	if (savedcbp) {
+		vp = savedcbp->b_vp;
 		bawrite(savedcbp);
+		if (VTOI(vp)->i_effnlink > 0)
+			(void) VOP_FSYNC(vp, KERNCRED, MNT_WAIT, p);
+		VOP_UNLOCK(vp, 0, p);
+	}
 	return (error);
 }
@@ -952,7 +952,7 @@ ffs_sync(mp, waitfor, cred, p)
 	struct ucred *cred;
 	struct proc *p;
 {
-	struct vnode *nvp, *vp;
+	struct vnode *nvp, *vp, *devvp;
 	struct inode *ip;
 	struct ufsmount *ump = VFSTOUFS(mp);
 	struct fs *fs;
@@ -1026,12 +1026,21 @@ loop:
 #ifdef QUOTA
 	qsync(mp);
 #endif
-	if (waitfor != MNT_LAZY) {
-		vn_lock(ump->um_devvp, LK_EXCLUSIVE | LK_RETRY, p);
-		if ((error = VOP_FSYNC(ump->um_devvp, cred, waitfor, p)) != 0)
+	devvp = ump->um_devvp;
+	mtx_lock(&devvp->v_interlock);
+	if (waitfor != MNT_LAZY &&
+	    (devvp->v_numoutput > 0 || TAILQ_FIRST(&devvp->v_dirtyblkhd))) {
+		mtx_unlock(&devvp->v_interlock);
+		vn_lock(devvp, LK_EXCLUSIVE | LK_RETRY, p);
+		if ((error = VOP_FSYNC(devvp, cred, waitfor, p)) != 0)
 			allerror = error;
-		VOP_UNLOCK(ump->um_devvp, 0, p);
-	}
+		VOP_UNLOCK(devvp, 0, p);
+		if (waitfor == MNT_WAIT) {
+			mtx_lock(&mntvnode_mtx);
+			goto loop;
+		}
+	} else
+		mtx_unlock(&devvp->v_interlock);
 	/*
 	 * Write back modified superblock.
 	 */
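This hunk is the loophole fix from the commit message: a single flush
pass could return while writes to the device vnode were still in
flight or while buffers dirtied during the pass remained queued. Now
the device is flushed whenever v_numoutput or v_dirtyblkhd reports
outstanding work, and a MNT_WAIT caller loops back to the top until a
pass finds the device quiescent. A toy model of that "repeat until a
pass finds nothing to do" shape; the struct and field names are
invented.

    #include <stdio.h>

    struct toy_dev {
        int numoutput;          /* writes still in flight */
        int ndirty;             /* dirty buffers queued */
    };

    /* Stand-in for one flush pass: queue every dirty buffer and wait. */
    static void
    flush(struct toy_dev *d)
    {
        d->numoutput = 0;
        d->ndirty = 0;
    }

    /* A waiting sync may not return while any pass sees pending work. */
    static void
    sync_wait(struct toy_dev *d)
    {
        while (d->numoutput > 0 || d->ndirty > 0)
            flush(d);
    }

    int
    main(void)
    {
        struct toy_dev d = { 1, 3 };

        sync_wait(&d);
        printf("device quiescent\n");
        return (0);
    }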
@@ -292,12 +292,32 @@ ufs_close(ap)
 		struct proc *a_p;
 	} */ *ap;
 {
-	register struct vnode *vp = ap->a_vp;
+	struct vnode *vp = ap->a_vp;
+	struct mount *mp;
 
 	mtx_lock(&vp->v_interlock);
-	if (vp->v_usecount > 1)
+	if (vp->v_usecount > 1) {
 		ufs_itimes(vp);
-	mtx_unlock(&vp->v_interlock);
+		mtx_unlock(&vp->v_interlock);
+	} else {
+		mtx_unlock(&vp->v_interlock);
+		/*
+		 * If we are closing the last reference to an unlinked
+		 * file, then it will be freed by the inactive routine.
+		 * Because the freeing causes the filesystem to be
+		 * modified, it must be held up during periods when the
+		 * filesystem is suspended.
+		 *
+		 * XXX - EAGAIN is returned to prevent vn_close from
+		 * repeating the vrele operation.
+		 */
+		if (vp->v_type == VREG && VTOI(vp)->i_effnlink == 0) {
+			(void) vn_start_write(vp, &mp, V_WAIT);
+			vrele(vp);
+			vn_finished_write(mp);
+			return (EAGAIN);
+		}
+	}
 	return (0);
 }
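The vn_start_write(..., V_WAIT)/vn_finished_write() pair is what
actually "holds up" the close: a writer blocks while the filesystem is
suspended (as it is while a snapshot is being written), so the freeing
triggered by the final vrele() cannot modify a suspended filesystem. A
toy pthreads model of such a write gate; the gate_* names and fields
are invented, with gate_suspend() playing the role the suspension side
(vfs_write_suspend() in FreeBSD) would play.

    #include <pthread.h>

    struct toy_gate {
        pthread_mutex_t mtx;
        pthread_cond_t cv;
        int suspended;          /* nonzero while a snapshot is taken */
        int writers;            /* modifications in progress */
    };

    static void
    gate_start_write(struct toy_gate *g)    /* ~ vn_start_write(V_WAIT) */
    {
        pthread_mutex_lock(&g->mtx);
        while (g->suspended)                /* block while suspended */
            pthread_cond_wait(&g->cv, &g->mtx);
        g->writers++;
        pthread_mutex_unlock(&g->mtx);
    }

    static void
    gate_finished_write(struct toy_gate *g) /* ~ vn_finished_write() */
    {
        pthread_mutex_lock(&g->mtx);
        if (--g->writers == 0)
            pthread_cond_broadcast(&g->cv); /* wake a waiting suspender */
        pthread_mutex_unlock(&g->mtx);
    }

    static void
    gate_suspend(struct toy_gate *g)        /* ~ vfs_write_suspend() */
    {
        pthread_mutex_lock(&g->mtx);
        g->suspended = 1;
        while (g->writers > 0)              /* drain in-progress writers */
            pthread_cond_wait(&g->cv, &g->mtx);
        pthread_mutex_unlock(&g->mtx);
    }

    int
    main(void)
    {
        struct toy_gate g = {
            PTHREAD_MUTEX_INITIALIZER, PTHREAD_COND_INITIALIZER, 0, 0
        };

        gate_start_write(&g);   /* would block if g.suspended were set */
        gate_finished_write(&g);
        gate_suspend(&g);       /* returns once writers have drained */
        return (0);
    }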