From 09f11da5a31d792654cf17c5f7a97bbee1fd9253 Mon Sep 17 00:00:00 2001
From: Jeff Roberson
Date: Thu, 13 Mar 2003 07:19:23 +0000
Subject: [PATCH] - Remove a race between fsync-like functions and
 flushbufqueues() by requiring locked bufs in vfs_bio_awrite().  Previously
 the buf could have been written out by fsync before we acquired the buf
 lock if it weren't for Giant.  cluster_wbuild() handles this race properly,
 but the single write at the end of vfs_bio_awrite() would not.
 - Modify flushbufqueues() so there is only one copy of the loop.  Pass in a
 parameter that says whether or not we should sync bufs with deps.
 - Call flushbufqueues() a second time, and then break, if we couldn't find
 any bufs without deps.
---
 sys/kern/vfs_bio.c      | 76 +++++++++++++++++------------------------
 sys/kern/vfs_default.c  |  1 -
 sys/kern/vfs_subr.c     |  1 -
 sys/ufs/ffs/ffs_vnops.c |  6 ++--
 4 files changed, 34 insertions(+), 50 deletions(-)

diff --git a/sys/kern/vfs_bio.c b/sys/kern/vfs_bio.c
index a2807e952b2b..7785e9de7262 100644
--- a/sys/kern/vfs_bio.c
+++ b/sys/kern/vfs_bio.c
@@ -81,7 +81,7 @@ static void vfs_vmio_release(struct buf *bp);
 static void vfs_backgroundwritedone(struct buf *bp);
 static int vfs_bio_clcheck(struct vnode *vp, int size,
                 daddr_t lblkno, daddr_t blkno);
-static int flushbufqueues(void);
+static int flushbufqueues(int flushdeps);
 static void buf_daemon(void);
 void bremfreel(struct buf * bp);
@@ -994,7 +994,6 @@ bdwrite(struct buf * bp)
                        panic("bdwrite: found ourselves");
                VI_UNLOCK(vp);
                if (nbp->b_flags & B_CLUSTEROK) {
-                       BUF_UNLOCK(nbp);
                        vfs_bio_awrite(nbp);
                } else {
                        bremfree(nbp);
@@ -1672,13 +1671,13 @@ vfs_bio_awrite(struct buf * bp)
                 * this is a possible cluster write
                 */
                if (ncl != 1) {
+                       BUF_UNLOCK(bp);
                        nwritten = cluster_wbuild(vp, size, lblkno - j, ncl);
                        splx(s);
                        return nwritten;
                }
        }
 
-       BUF_LOCK(bp, LK_EXCLUSIVE, NULL);
        bremfree(bp);
        bp->b_flags |= B_ASYNC;
@@ -2051,8 +2050,15 @@ buf_daemon()
                 * normally would so they can run in parallel with our drain.
                 */
                while (numdirtybuffers > lodirtybuffers) {
-                       if (flushbufqueues() == 0)
+                       if (flushbufqueues(0) == 0) {
+                               /*
+                                * Could not find any buffers without rollback
+                                * dependencies, so just write the first one
+                                * in the hopes of eventually making progress.
+                                */
+                               flushbufqueues(1);
                                break;
+                       }
                        waitrunningbufspace();
                        numdirtywakeup((lodirtybuffers + hidirtybuffers) / 2);
                }
@@ -2098,33 +2104,47 @@ int flushwithdeps = 0;
 SYSCTL_INT(_vfs, OID_AUTO, flushwithdeps, CTLFLAG_RW, &flushwithdeps, 0,
     "Number of buffers flushed with dependencies that require rollbacks");
 static int
-flushbufqueues(void)
+flushbufqueues(int flushdeps)
 {
        struct thread *td = curthread;
        struct vnode *vp;
        struct buf *bp;
+       int hasdeps;
 
        mtx_lock(&bqlock);
        TAILQ_FOREACH(bp, &bufqueues[QUEUE_DIRTY], b_freelist) {
+               if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT, NULL) != 0)
+                       continue;
                KASSERT((bp->b_flags & B_DELWRI),
                    ("unexpected clean buffer %p", bp));
-               if ((bp->b_xflags & BX_BKGRDINPROG) != 0)
+               if ((bp->b_xflags & BX_BKGRDINPROG) != 0) {
+                       BUF_UNLOCK(bp);
                        continue;
+               }
                if (bp->b_flags & B_INVAL) {
-                       if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT, NULL) != 0)
-                               panic("flushbufqueues: locked buf");
                        bremfreel(bp);
                        mtx_unlock(&bqlock);
                        brelse(bp);
                        return (1);
                }
-               if (LIST_FIRST(&bp->b_dep) != NULL && buf_countdeps(bp, 0))
-                       continue;
+
+               if (LIST_FIRST(&bp->b_dep) != NULL && buf_countdeps(bp, 0)) {
+                       if (flushdeps == 0) {
+                               BUF_UNLOCK(bp);
+                               continue;
+                       }
+                       hasdeps = 1;
+               } else
+                       hasdeps = 0;
                /*
                 * We must hold the lock on a vnode before writing
                 * one of its buffers.  Otherwise we may confuse, or
                 * in the case of a snapshot vnode, deadlock the
                 * system.
+                *
+                * The lock order here is the reverse of the normal
+                * order of vnode followed by buf lock.  This is ok
+                * because the NOWAIT will prevent deadlock.
                 */
                if ((vp = bp->b_vp) == NULL ||
                    vn_lock(vp, LK_EXCLUSIVE | LK_NOWAIT, td) == 0) {
@@ -2132,42 +2152,10 @@ flushbufqueues(void)
                        vfs_bio_awrite(bp);
                        if (vp != NULL)
                                VOP_UNLOCK(vp, 0, td);
+                       flushwithdeps += hasdeps;
                        return (1);
                }
-       }
-       /*
-        * Could not find any buffers without rollback dependencies,
-        * so just write the first one in the hopes of eventually
-        * making progress.
-        */
-       TAILQ_FOREACH(bp, &bufqueues[QUEUE_DIRTY], b_freelist) {
-               KASSERT((bp->b_flags & B_DELWRI),
-                   ("unexpected clean buffer %p", bp));
-               if ((bp->b_xflags & BX_BKGRDINPROG) != 0)
-                       continue;
-               if (bp->b_flags & B_INVAL) {
-                       if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT, NULL) != 0)
-                               panic("flushbufqueues: locked buf");
-                       bremfreel(bp);
-                       mtx_unlock(&bqlock);
-                       brelse(bp);
-                       return (1);
-               }
-               /*
-                * We must hold the lock on a vnode before writing
-                * one of its buffers.  Otherwise we may confuse, or
-                * in the case of a snapshot vnode, deadlock the
-                * system.
-                */
-               if ((vp = bp->b_vp) == NULL ||
-                   vn_lock(vp, LK_EXCLUSIVE | LK_NOWAIT, td) == 0) {
-                       mtx_unlock(&bqlock);
-                       vfs_bio_awrite(bp);
-                       if (vp != NULL)
-                               VOP_UNLOCK(vp, 0, td);
-                       flushwithdeps += 1;
-                       return (0);
-               }
+               BUF_UNLOCK(bp);
        }
        mtx_unlock(&bqlock);
        return (0);
diff --git a/sys/kern/vfs_default.c b/sys/kern/vfs_default.c
index 1921abf5774c..58f1a8650fd8 100644
--- a/sys/kern/vfs_default.c
+++ b/sys/kern/vfs_default.c
@@ -749,7 +749,6 @@ vop_stdfsync(ap)
                if ((bp->b_flags & B_DELWRI) == 0)
                        panic("fsync: not dirty");
                if ((vp->v_vflag & VV_OBJBUF) && (bp->b_flags & B_CLUSTEROK)) {
-                       BUF_UNLOCK(bp);
                        vfs_bio_awrite(bp);
                        splx(s);
                } else {
diff --git a/sys/kern/vfs_subr.c b/sys/kern/vfs_subr.c
index 5f4b678632d4..3cedcb45b2a3 100644
--- a/sys/kern/vfs_subr.c
+++ b/sys/kern/vfs_subr.c
@@ -1240,7 +1240,6 @@ flushbuflist(blist, flags, vp, slpflag, slptimeo, errorp)
 
                if (bp->b_vp == vp) {
                        if (bp->b_flags & B_CLUSTEROK) {
-                               BUF_UNLOCK(bp);
                                vfs_bio_awrite(bp);
                        } else {
                                bremfree(bp);
diff --git a/sys/ufs/ffs/ffs_vnops.c b/sys/ufs/ffs/ffs_vnops.c
index cc6debc12f0d..8aa3db4cb21b 100644
--- a/sys/ufs/ffs/ffs_vnops.c
+++ b/sys/ufs/ffs/ffs_vnops.c
@@ -227,7 +227,6 @@ ffs_fsync(ap)
                 */
                if (passes > 0 || !wait) {
                        if ((bp->b_flags & B_CLUSTEROK) && !wait) {
-                               BUF_UNLOCK(bp);
                                (void) vfs_bio_awrite(bp);
                        } else {
                                bremfree(bp);
@@ -252,10 +251,9 @@ ffs_fsync(ap)
                        splx(s);
                        brelse(bp);
                        s = splbio();
-               } else {
-                       BUF_UNLOCK(bp);
+               } else
                        vfs_bio_awrite(bp);
-               }
+
                /*
                 * Since we may have slept during the I/O, we need
                 * to start from a known point.
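
The interesting part of this change is the lock-order inversion it introduces deliberately: flushbufqueues() now takes the buf lock first and only then trylocks the vnode, the reverse of the normal vnode-then-buf order, and the new comment argues this is safe because both acquisitions use LK_NOWAIT.  Below is a minimal user-space sketch of that argument, not FreeBSD code: POSIX mutexes stand in for the vnode and buf locks, and the names struct item, write_in_order(), and flush_one() are invented for illustration.

#include <pthread.h>
#include <stdio.h>

struct item {
        pthread_mutex_t vnode_lock;     /* canonical first lock */
        pthread_mutex_t buf_lock;       /* canonical second lock */
        int dirty;
};

/* Canonical order: vnode lock, then buf lock. */
static void
write_in_order(struct item *it)
{
        pthread_mutex_lock(&it->vnode_lock);
        pthread_mutex_lock(&it->buf_lock);
        it->dirty = 0;
        pthread_mutex_unlock(&it->buf_lock);
        pthread_mutex_unlock(&it->vnode_lock);
}

/*
 * Reverse order made safe by never blocking: take the buf lock,
 * then trylock the "vnode", and back off on failure instead of
 * waiting, analogous to BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT, NULL)
 * followed by vn_lock(vp, LK_EXCLUSIVE | LK_NOWAIT, td) in
 * flushbufqueues().  Returns 1 if flushed, 0 if skipped.
 */
static int
flush_one(struct item *it)
{
        if (pthread_mutex_trylock(&it->buf_lock) != 0)
                return (0);             /* buf busy; skip it */
        if (pthread_mutex_trylock(&it->vnode_lock) != 0) {
                pthread_mutex_unlock(&it->buf_lock);
                return (0);             /* would invert order; skip */
        }
        it->dirty = 0;
        pthread_mutex_unlock(&it->vnode_lock);
        pthread_mutex_unlock(&it->buf_lock);
        return (1);
}

int
main(void)
{
        struct item it = {
                PTHREAD_MUTEX_INITIALIZER,
                PTHREAD_MUTEX_INITIALIZER,
                1
        };

        write_in_order(&it);            /* flush via the canonical order */
        it.dirty = 1;
        printf("flushed: %d\n", flush_one(&it));
        return (0);
}

Because the out-of-order path never blocks, the worst case is that a contended buffer is skipped and retried on a later pass rather than deadlocking, which is why buf_daemon() keeps calling flushbufqueues() in a loop while numdirtybuffers remains above lodirtybuffers.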