- Remove a race between fsync like functions and flushbufqueues() by

requiring locked bufs in vfs_bio_awrite(). Previously the buf could have been written out by fsync before we acquired the buf lock if it weren't for giant. The cluster_wbuild() handles this race properly but the single write at the end of vfs_bio_awrite() would not. - Modify flushbufqueues() so there is only one copy of the loop. Pass a parameter in that says whether or not we should sync bufs with deps. - Call flushbufqueues() a second time and then break if we couldn't find any bufs without deps.
svn path=/head/; revision=112181
2003-03-13 07:19:23 +00:00 · 2003-03-13 07:19:23 +00:00 · 09f11da5a3 · 2020-12-20 02:59:44 +00:00
commit 09f11da5a3
parent f5f0dee483
4 changed files with 34 additions and 50 deletions
--- a/sys/kern/vfs_bio.c
+++ b/sys/kern/vfs_bio.c
@ -81,7 +81,7 @@ static void vfs_vmio_release(struct buf *bp);
 static void vfs_backgroundwritedone(struct buf *bp);
 static int vfs_bio_clcheck(struct vnode *vp, int size,
 		daddr_t lblkno, daddr_t blkno);
-static int flushbufqueues(void);
+static int flushbufqueues(int flushdeps);
 static void buf_daemon(void);
 void bremfreel(struct buf * bp);

@ -994,7 +994,6 @@ bdwrite(struct buf * bp)
 				panic("bdwrite: found ourselves");
 			VI_UNLOCK(vp);
 			if (nbp->b_flags & B_CLUSTEROK) {
-				BUF_UNLOCK(nbp);
 				vfs_bio_awrite(nbp);
 			} else {
 				bremfree(nbp);
@ -1672,13 +1671,13 @@ vfs_bio_awrite(struct buf * bp)
 		 * this is a possible cluster write
 		 */
 		if (ncl != 1) {
+			BUF_UNLOCK(bp);
 			nwritten = cluster_wbuild(vp, size, lblkno - j, ncl);
 			splx(s);
 			return nwritten;
 		}
 	}

-	BUF_LOCK(bp, LK_EXCLUSIVE, NULL);
 	bremfree(bp);
 	bp->b_flags |= B_ASYNC;

@ -2051,8 +2050,15 @@ buf_daemon()
 		 * normally would so they can run in parallel with our drain.
 		 */
 		while (numdirtybuffers > lodirtybuffers) {
-			if (flushbufqueues() == 0)
+			if (flushbufqueues(0) == 0) {
+				/*
+				 * Could not find any buffers without rollback
+				 * dependencies, so just write the first one
+				 * in the hopes of eventually making progress.
+				 */
+				flushbufqueues(1);
 				break;
+			}
 			waitrunningbufspace();
 			numdirtywakeup((lodirtybuffers + hidirtybuffers) / 2);
 		}
@ -2098,33 +2104,47 @@ int flushwithdeps = 0;
 SYSCTL_INT(_vfs, OID_AUTO, flushwithdeps, CTLFLAG_RW, &flushwithdeps,
    0, "Number of buffers flushed with dependecies that require rollbacks");
 static int
-flushbufqueues(void)
+flushbufqueues(int flushdeps)
 {
 	struct thread *td = curthread;
 	struct vnode *vp;
 	struct buf *bp;
+	int hasdeps;

 	mtx_lock(&bqlock);
 	TAILQ_FOREACH(bp, &bufqueues[QUEUE_DIRTY], b_freelist) {
+		if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT, NULL) != 0)
+			continue;
 		KASSERT((bp->b_flags & B_DELWRI),
 		    ("unexpected clean buffer %p", bp));
-		if ((bp->b_xflags & BX_BKGRDINPROG) != 0)
+		if ((bp->b_xflags & BX_BKGRDINPROG) != 0) {
+			BUF_UNLOCK(bp);
 			continue;
+		}
 		if (bp->b_flags & B_INVAL) {
-			if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT, NULL) != 0)
-				panic("flushbufqueues: locked buf");
 			bremfreel(bp);
 			mtx_unlock(&bqlock);
 			brelse(bp);
 			return (1);
 		}
-		if (LIST_FIRST(&bp->b_dep) != NULL && buf_countdeps(bp, 0))
-			continue;
+
+		if (LIST_FIRST(&bp->b_dep) != NULL && buf_countdeps(bp, 0)) {
+			if (flushdeps == 0) {
+				BUF_UNLOCK(bp);
+				continue;
+			}
+			hasdeps = 1;
+		} else
+			hasdeps = 0;
 		/*
 		 * We must hold the lock on a vnode before writing
 		 * one of its buffers. Otherwise we may confuse, or
 		 * in the case of a snapshot vnode, deadlock the
 		 * system.
+		 *
+		 * The lock order here is the reverse of the normal
+		 * of vnode followed by buf lock.  This is ok because
+		 * the NOWAIT will prevent deadlock.
 		 */
 		if ((vp = bp->b_vp) == NULL ||
 		    vn_lock(vp, LK_EXCLUSIVE | LK_NOWAIT, td) == 0) {
@ -2132,42 +2152,10 @@ flushbufqueues(void)
 			vfs_bio_awrite(bp);
 			if (vp != NULL)
 				VOP_UNLOCK(vp, 0, td);
+			flushwithdeps += hasdeps;
 			return (1);
 		}
-	}
-	/*
-	 * Could not find any buffers without rollback dependencies,
-	 * so just write the first one in the hopes of eventually
-	 * making progress.
-	 */
-	TAILQ_FOREACH(bp, &bufqueues[QUEUE_DIRTY], b_freelist) {
-		KASSERT((bp->b_flags & B_DELWRI),
-		    ("unexpected clean buffer %p", bp));
-		if ((bp->b_xflags & BX_BKGRDINPROG) != 0)
-			continue;
-		if (bp->b_flags & B_INVAL) {
-			if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT, NULL) != 0)
-				panic("flushbufqueues: locked buf");
-			bremfreel(bp);
-			mtx_unlock(&bqlock);
-			brelse(bp);
-			return (1);
-		}
-		/*
-		 * We must hold the lock on a vnode before writing
-		 * one of its buffers. Otherwise we may confuse, or
-		 * in the case of a snapshot vnode, deadlock the
-		 * system.
-		 */
-		if ((vp = bp->b_vp) == NULL ||
-		    vn_lock(vp, LK_EXCLUSIVE | LK_NOWAIT, td) == 0) {
-			mtx_unlock(&bqlock);
-			vfs_bio_awrite(bp);
-			if (vp != NULL)
-				VOP_UNLOCK(vp, 0, td);
-			flushwithdeps += 1;
-			return (0);
-		}
+		BUF_UNLOCK(bp);
 	}
 	mtx_unlock(&bqlock);
 	return (0);
--- a/sys/kern/vfs_default.c
+++ b/sys/kern/vfs_default.c
@ -749,7 +749,6 @@ vop_stdfsync(ap)
 		if ((bp->b_flags & B_DELWRI) == 0)
 			panic("fsync: not dirty");
 		if ((vp->v_vflag & VV_OBJBUF) && (bp->b_flags & B_CLUSTEROK)) {
-			BUF_UNLOCK(bp);
 			vfs_bio_awrite(bp);
 			splx(s);
 		} else {
--- a/sys/kern/vfs_subr.c
+++ b/sys/kern/vfs_subr.c
@ -1240,7 +1240,6 @@ flushbuflist(blist, flags, vp, slpflag, slptimeo, errorp)

 			if (bp->b_vp == vp) {
 				if (bp->b_flags & B_CLUSTEROK) {
-					BUF_UNLOCK(bp);
 					vfs_bio_awrite(bp);
 				} else {
 					bremfree(bp);
--- a/sys/ufs/ffs/ffs_vnops.c
+++ b/sys/ufs/ffs/ffs_vnops.c
@ -227,7 +227,6 @@ ffs_fsync(ap)
 			 */
 			if (passes > 0 || !wait) {
 				if ((bp->b_flags & B_CLUSTEROK) && !wait) {
-					BUF_UNLOCK(bp);
 					(void) vfs_bio_awrite(bp);
 				} else {
 					bremfree(bp);
@ -252,10 +251,9 @@ ffs_fsync(ap)
 			splx(s);
 			brelse(bp);
 			s = splbio();
-		} else {
-			BUF_UNLOCK(bp);
+		} else
 			vfs_bio_awrite(bp);
-		}
+
 		/*
 		 * Since we may have slept during the I/O, we need 
 		 * to start from a known point.