diff --git a/sys/kern/vfs_bio.c b/sys/kern/vfs_bio.c
index 663a504a331e..296c9a68b9d8 100644
--- a/sys/kern/vfs_bio.c
+++ b/sys/kern/vfs_bio.c
@@ -164,7 +164,7 @@ SYSCTL_INT(_vfs, OID_AUTO, getnewbufrestarts, CTLFLAG_RW, &getnewbufrestarts, 0,
     "Number of times getnewbuf has had to restart a buffer aquisition");
 static int dobkgrdwrite = 1;
 SYSCTL_INT(_debug, OID_AUTO, dobkgrdwrite, CTLFLAG_RW, &dobkgrdwrite, 0,
-    "Do background writes (honoring the BX_BKGRDWRITE flag)?");
+    "Do background writes (honoring the BV_BKGRDWRITE flag)?");
 
 /*
  * Wakeup point for bufdaemon, as well as indicator of whether it is already
@@ -223,14 +223,13 @@ static struct mtx bdonelock;
 /*
  * Definitions for the buffer free lists.
  */
-#define BUFFER_QUEUES	6	/* number of free buffer queues */
+#define BUFFER_QUEUES	5	/* number of free buffer queues */
 
 #define QUEUE_NONE	0	/* on no queue */
-#define QUEUE_LOCKED	1	/* locked buffers */
-#define QUEUE_CLEAN	2	/* non-B_DELWRI buffers */
-#define QUEUE_DIRTY	3	/* B_DELWRI buffers */
-#define QUEUE_EMPTYKVA	4	/* empty buffer headers w/KVA assignment */
-#define QUEUE_EMPTY	5	/* empty buffer headers */
+#define QUEUE_CLEAN	1	/* non-B_DELWRI buffers */
+#define QUEUE_DIRTY	2	/* B_DELWRI buffers */
+#define QUEUE_EMPTYKVA	3	/* empty buffer headers w/KVA assignment */
+#define QUEUE_EMPTY	4	/* empty buffer headers */
 
 /* Queues for free buffers with various properties */
 static TAILQ_HEAD(bqueues, buf) bufqueues[BUFFER_QUEUES] = { { 0 } };
@@ -777,17 +776,19 @@ bwrite(struct buf * bp)
 	 * writing this block if it is asynchronous. Otherwise
 	 * wait for the background write to complete.
 	 */
-	if (bp->b_xflags & BX_BKGRDINPROG) {
+	VI_LOCK(bp->b_vp);
+	if (bp->b_vflags & BV_BKGRDINPROG) {
 		if (bp->b_flags & B_ASYNC) {
 			splx(s);
 			bdwrite(bp);
 			return (0);
 		}
-		bp->b_xflags |= BX_BKGRDWAIT;
-		tsleep(&bp->b_xflags, PRIBIO, "bwrbg", 0);
-		if (bp->b_xflags & BX_BKGRDINPROG)
+		bp->b_vflags |= BV_BKGRDWAIT;
+		msleep(&bp->b_xflags, VI_MTX(bp->b_vp), PRIBIO, "bwrbg", 0);
+		if (bp->b_vflags & BV_BKGRDINPROG)
 			panic("bwrite: still writing");
 	}
+	VI_UNLOCK(bp->b_vp);
 
 	/* Mark the buffer clean */
 	bundirty(bp);
@@ -820,8 +821,8 @@ bwrite(struct buf * bp)
 		memcpy(newbp->b_data, bp->b_data, bp->b_bufsize);
 		newbp->b_lblkno = bp->b_lblkno;
 		newbp->b_xflags |= BX_BKGRDMARKER;
-		/* XXX The BX_ flags need to be protected as well */
 		VI_LOCK(bp->b_vp);
+		bp->b_vflags |= BV_BKGRDINPROG;
 		bgetvp(bp->b_vp, newbp);
 		VI_UNLOCK(bp->b_vp);
 		newbp->b_blkno = bp->b_blkno;
@@ -842,8 +843,6 @@ bwrite(struct buf * bp)
 		 * If the reconstituted buffer were written, we could end up
 		 * with two background copies being written at the same time.
 		 */
-		bp->b_xflags |= BX_BKGRDINPROG;
-		bp->b_flags |= B_LOCKED;
 		bqrelse(bp);
 		bp = newbp;
 	}
@@ -906,6 +905,20 @@ vfs_backgroundwritedone(bp)
 	VI_LOCK(bp->b_vp);
 	if ((origbp = gbincore(bp->b_vp, bp->b_lblkno)) == NULL)
 		panic("backgroundwritedone: lost buffer");
+
+	/*
+	 * Clear the BV_BKGRDINPROG flag in the original buffer
+	 * and awaken it if it is waiting for the write to complete.
+	 * If BV_BKGRDINPROG is not set in the original buffer it must
+	 * have been released and re-instantiated - which is not legal.
+	 */
+	KASSERT((origbp->b_vflags & BV_BKGRDINPROG),
+	    ("backgroundwritedone: lost buffer2"));
+	origbp->b_vflags &= ~BV_BKGRDINPROG;
+	if (origbp->b_vflags & BV_BKGRDWAIT) {
+		origbp->b_vflags &= ~BV_BKGRDWAIT;
+		wakeup(&origbp->b_xflags);
+	}
 	VI_UNLOCK(bp->b_vp);
 	/*
 	 * Process dependencies then return any unfinished ones.
@@ -915,29 +928,6 @@ vfs_backgroundwritedone(bp)
 	 */
 	if (LIST_FIRST(&bp->b_dep) != NULL)
 		buf_movedeps(bp, origbp);
-	/* XXX Find out if origbp can disappear or get inconsistent */
-	/*
-	 * Clear the BX_BKGRDINPROG flag in the original buffer
-	 * and awaken it if it is waiting for the write to complete.
-	 * If BX_BKGRDINPROG is not set in the original buffer it must
-	 * have been released and re-instantiated - which is not legal.
-	 */
-	KASSERT((origbp->b_xflags & BX_BKGRDINPROG),
-	    ("backgroundwritedone: lost buffer2"));
-	origbp->b_xflags &= ~BX_BKGRDINPROG;
-	if (origbp->b_xflags & BX_BKGRDWAIT) {
-		origbp->b_xflags &= ~BX_BKGRDWAIT;
-		wakeup(&origbp->b_xflags);
-	}
-	/*
-	 * Clear the B_LOCKED flag and remove it from the locked
-	 * queue if it currently resides there.
-	 */
-	origbp->b_flags &= ~B_LOCKED;
-	if (BUF_LOCK(origbp, LK_EXCLUSIVE | LK_NOWAIT, NULL) == 0) {
-		bremfree(origbp);
-		bqrelse(origbp);
-	}
 	/*
 	 * This buffer is marked B_NOCACHE, so when it is released
 	 * by biodone, it will be tossed. We mark it with BIO_READ
@@ -997,7 +987,7 @@ bdwrite(struct buf * bp)
 		 * Try to find a buffer to flush.
 		 */
 		TAILQ_FOREACH(nbp, &vp->v_dirtyblkhd, b_vnbufs) {
-			if ((nbp->b_xflags & BX_BKGRDINPROG) ||
+			if ((nbp->b_vflags & BV_BKGRDINPROG) ||
 			    buf_countdeps(nbp, 0) ||
 			    BUF_LOCK(nbp, LK_EXCLUSIVE | LK_NOWAIT, NULL))
 				continue;
@@ -1207,9 +1197,6 @@ brelse(struct buf * bp)
 
 	s = splbio();
 
-	if (bp->b_flags & B_LOCKED)
-		bp->b_ioflags &= ~BIO_ERROR;
-
 	if (bp->b_iocmd == BIO_WRITE &&
 	    (bp->b_ioflags & BIO_ERROR) && !(bp->b_flags & B_INVAL)) {
@@ -1259,8 +1246,17 @@ brelse(struct buf * bp)
 	 */
 	if (bp->b_flags & B_DELWRI)
 		bp->b_flags &= ~B_RELBUF;
-	else if (vm_page_count_severe() && !(bp->b_xflags & BX_BKGRDINPROG))
-		bp->b_flags |= B_RELBUF;
+	else if (vm_page_count_severe()) {
+		/*
+		 * XXX This lock may not be necessary since BKGRDINPROG
+		 * cannot be set while we hold the buf lock, it can only be
+		 * cleared if it is already pending.
+		 */
+		VI_LOCK(bp->b_vp);
+		if (!(bp->b_vflags & BV_BKGRDINPROG))
+			bp->b_flags |= B_RELBUF;
+		VI_UNLOCK(bp->b_vp);
+	}
 
 	/*
 	 * VMIO buffer rundown. It is not very necessary to keep a VMIO buffer
@@ -1389,7 +1385,7 @@ brelse(struct buf * bp)
 	if (bp->b_bufsize == 0) {
 		bp->b_flags |= B_INVAL;
 		bp->b_xflags &= ~(BX_BKGRDWRITE | BX_ALTDATA);
-		if (bp->b_xflags & BX_BKGRDINPROG)
+		if (bp->b_vflags & BV_BKGRDINPROG)
 			panic("losing buffer 1");
 		if (bp->b_kvasize) {
 			bp->b_qindex = QUEUE_EMPTYKVA;
@@ -1403,17 +1399,11 @@ brelse(struct buf * bp)
 	    (bp->b_ioflags & BIO_ERROR)) {
 		bp->b_flags |= B_INVAL;
 		bp->b_xflags &= ~(BX_BKGRDWRITE | BX_ALTDATA);
-		if (bp->b_xflags & BX_BKGRDINPROG)
+		if (bp->b_vflags & BV_BKGRDINPROG)
 			panic("losing buffer 2");
 		bp->b_qindex = QUEUE_CLEAN;
 		TAILQ_INSERT_HEAD(&bufqueues[QUEUE_CLEAN], bp, b_freelist);
 		bp->b_dev = NODEV;
-
-	/* buffers that are locked */
-	} else if (bp->b_flags & B_LOCKED) {
-		bp->b_qindex = QUEUE_LOCKED;
-		TAILQ_INSERT_TAIL(&bufqueues[QUEUE_LOCKED], bp, b_freelist);
-
 	/* remaining buffers */
 	} else {
 		if (bp->b_flags & B_DELWRI)
@@ -1447,7 +1437,7 @@ brelse(struct buf * bp)
 	 * if B_INVAL is set ).
 	 */
-	if ((bp->b_flags & B_LOCKED) == 0 && !(bp->b_flags & B_DELWRI))
+	if (!(bp->b_flags & B_DELWRI))
 		bufcountwakeup();
 
 	/*
@@ -1493,34 +1483,38 @@ bqrelse(struct buf * bp)
 		return;
 	}
 	mtx_lock(&bqlock);
-	if (bp->b_flags & B_LOCKED) {
-		bp->b_ioflags &= ~BIO_ERROR;
-		bp->b_qindex = QUEUE_LOCKED;
-		TAILQ_INSERT_TAIL(&bufqueues[QUEUE_LOCKED], bp, b_freelist);
-	/* buffers with stale but valid contents */
-	} else if (bp->b_flags & B_DELWRI) {
+	/* buffers with stale but valid contents */
+	if (bp->b_flags & B_DELWRI) {
 		bp->b_qindex = QUEUE_DIRTY;
 		TAILQ_INSERT_TAIL(&bufqueues[QUEUE_DIRTY], bp, b_freelist);
-	} else if (vm_page_count_severe()) {
-		/*
-		 * We are too low on memory, we have to try to free the
-		 * buffer (most importantly: the wired pages making up its
-		 * backing store) *now*.
-		 */
-		mtx_unlock(&bqlock);
-		splx(s);
-		brelse(bp);
-		return;
 	} else {
-		bp->b_qindex = QUEUE_CLEAN;
-		TAILQ_INSERT_TAIL(&bufqueues[QUEUE_CLEAN], bp, b_freelist);
+		/*
+		 * XXX This lock may not be necessary since BKGRDINPROG
+		 * cannot be set while we hold the buf lock, it can only be
+		 * cleared if it is already pending.
+		 */
+		VI_LOCK(bp->b_vp);
+		if (!vm_page_count_severe() || bp->b_vflags & BV_BKGRDINPROG) {
+			VI_UNLOCK(bp->b_vp);
+			bp->b_qindex = QUEUE_CLEAN;
+			TAILQ_INSERT_TAIL(&bufqueues[QUEUE_CLEAN], bp,
+			    b_freelist);
+		} else {
+			/*
+			 * We are too low on memory, we have to try to free
+			 * the buffer (most importantly: the wired pages
+			 * making up its backing store) *now*.
+			 */
+			mtx_unlock(&bqlock);
+			splx(s);
+			brelse(bp);
+			return;
+		}
 	}
 	mtx_unlock(&bqlock);
 
-	if ((bp->b_flags & B_LOCKED) == 0 &&
-	    ((bp->b_flags & B_INVAL) || !(bp->b_flags & B_DELWRI))) {
+	if ((bp->b_flags & B_INVAL) || !(bp->b_flags & B_DELWRI))
 		bufcountwakeup();
-	}
 
 	/*
 	 * Something we can maybe free or reuse.
@@ -1826,6 +1820,14 @@ getnewbuf(int slpflag, int slptimeo, int size, int maxsize)
 				break;
 			}
 		}
+		if (bp->b_vp) {
+			VI_LOCK(bp->b_vp);
+			if (bp->b_vflags & BV_BKGRDINPROG) {
+				VI_UNLOCK(bp->b_vp);
+				continue;
+			}
+			VI_UNLOCK(bp->b_vp);
+		}
 
 		/*
 		 * Sanity Checks
@@ -1887,7 +1889,7 @@ getnewbuf(int slpflag, int slptimeo, int size, int maxsize)
 		}
 		if (LIST_FIRST(&bp->b_dep) != NULL)
 			buf_deallocate(bp);
-		if (bp->b_xflags & BX_BKGRDINPROG)
+		if (bp->b_vflags & BV_BKGRDINPROG)
 			panic("losing buffer 3");
 
 		if (bp->b_bufsize)
@@ -2136,10 +2138,13 @@ flushbufqueues(int flushdeps)
 			continue;
 		KASSERT((bp->b_flags & B_DELWRI),
 		    ("unexpected clean buffer %p", bp));
-		if ((bp->b_xflags & BX_BKGRDINPROG) != 0) {
+		VI_LOCK(bp->b_vp);
+		if ((bp->b_vflags & BV_BKGRDINPROG) != 0) {
+			VI_UNLOCK(bp->b_vp);
 			BUF_UNLOCK(bp);
 			continue;
 		}
+		VI_UNLOCK(bp->b_vp);
 		if (bp->b_flags & B_INVAL) {
 			bremfreel(bp);
 			mtx_unlock(&bqlock);
diff --git a/sys/kern/vfs_cluster.c b/sys/kern/vfs_cluster.c
index ad0b55dd2c0a..89de2ef3f191 100644
--- a/sys/kern/vfs_cluster.c
+++ b/sys/kern/vfs_cluster.c
@@ -399,11 +399,15 @@ cluster_rbuild(vp, filesize, lbn, blkno, size, run, fbp)
 			 * VMIO backed. The clustering code can only deal
 			 * with VMIO-backed buffers.
 			 */
-			if ((tbp->b_flags & (B_CACHE|B_LOCKED)) ||
-			    (tbp->b_flags & B_VMIO) == 0) {
+			VI_LOCK(bp->b_vp);
+			if ((tbp->b_vflags & BV_BKGRDINPROG) ||
+			    (tbp->b_flags & B_CACHE) ||
+			    (tbp->b_flags & B_VMIO) == 0) {
+				VI_UNLOCK(bp->b_vp);
 				bqrelse(tbp);
 				break;
 			}
+			VI_UNLOCK(bp->b_vp);
 
 			/*
 			 * The buffer must be completely invalid in order to
@@ -768,7 +772,8 @@ cluster_wbuild(vp, size, start_lbn, len)
 		 * partake in the clustered write.
 		 */
 		VI_LOCK(vp);
-		if ((tbp = gbincore(vp, start_lbn)) == NULL) {
+		if ((tbp = gbincore(vp, start_lbn)) == NULL ||
+		    (tbp->b_vflags & BV_BKGRDINPROG)) {
 			VI_UNLOCK(vp);
 			++start_lbn;
 			--len;
@@ -782,8 +787,7 @@ cluster_wbuild(vp, size, start_lbn, len)
 			splx(s);
 			continue;
 		}
-		if ((tbp->b_flags & (B_LOCKED | B_INVAL | B_DELWRI)) !=
-		    B_DELWRI) {
+		if ((tbp->b_flags & (B_INVAL | B_DELWRI)) != B_DELWRI) {
 			BUF_UNLOCK(tbp);
 			++start_lbn;
 			--len;
@@ -857,7 +861,8 @@ cluster_wbuild(vp, size, start_lbn, len)
 			 * can't need to be written.
 			 */
 			VI_LOCK(vp);
-			if ((tbp = gbincore(vp, start_lbn)) == NULL) {
+			if ((tbp = gbincore(vp, start_lbn)) == NULL ||
+			    (tbp->b_vflags & BV_BKGRDINPROG)) {
 				VI_UNLOCK(vp);
 				splx(s);
 				break;
@@ -881,7 +886,6 @@ cluster_wbuild(vp, size, start_lbn, len)
 			      B_INVAL | B_DELWRI | B_NEEDCOMMIT))
 			    != (B_DELWRI | B_CLUSTEROK |
 			      (bp->b_flags & (B_VMIO | B_NEEDCOMMIT))) ||
-			    (tbp->b_flags & B_LOCKED) ||
 			    tbp->b_wcred != bp->b_wcred) {
 				BUF_UNLOCK(tbp);
 				splx(s);
diff --git a/sys/sys/buf.h b/sys/sys/buf.h
index 73baef19a4b6..3965764f8888 100644
--- a/sys/sys/buf.h
+++ b/sys/sys/buf.h
@@ -217,7 +217,7 @@ struct buf {
 #define	B_00000800	0x00000800	/* Availabel flag. */
 #define	B_00001000	0x00001000	/* Available flag. */
 #define	B_INVAL		0x00002000	/* Does not contain valid info. */
-#define	B_LOCKED	0x00004000	/* Locked in core (not reusable). */
+#define	B_00004000	0x00004000	/* Available flag. */
 #define	B_NOCACHE	0x00008000	/* Do not cache block after use. */
 #define	B_MALLOC	0x00010000	/* malloced b_data */
 #define	B_CLUSTEROK	0x00020000	/* Pagein op, so swap() can count it. */
@@ -247,15 +247,18 @@ struct buf {
  */
 #define	BX_VNDIRTY	0x00000001	/* On vnode dirty list */
 #define	BX_VNCLEAN	0x00000002	/* On vnode clean list */
-#define	BX_BKGRDWRITE	0x00000004	/* Do writes in background */
-#define	BX_BKGRDINPROG	0x00000008	/* Background write in progress */
-#define	BX_BKGRDWAIT	0x00000010	/* Background write waiting */
+#define	BX_BKGRDWRITE	0x00000010	/* Do writes in background */
 #define	BX_BKGRDMARKER	0x00000020	/* Mark buffer for splay tree */
 #define	BX_ALTDATA	0x00000040	/* Holds extended data */
 
 #define	NOOFFSET	(-1LL)		/* No buffer offset calculated yet */
 
-#define	BV_SCANNED	0x00001000	/* VOP_FSYNC funcs mark written bufs */
+/*
+ * These flags are kept in b_vflags.
+ */
+#define	BV_SCANNED	0x00000001	/* VOP_FSYNC funcs mark written bufs */
+#define	BV_BKGRDINPROG	0x00000002	/* Background write in progress */
+#define	BV_BKGRDWAIT	0x00000004	/* Background write waiting */
 
 #ifdef _KERNEL
 /*
diff --git a/sys/ufs/ffs/ffs_softdep.c b/sys/ufs/ffs/ffs_softdep.c
index df149400d0a5..9ca56868743c 100644
--- a/sys/ufs/ffs/ffs_softdep.c
+++ b/sys/ufs/ffs/ffs_softdep.c
@@ -4892,11 +4892,9 @@ softdep_fsync_mountdev(vp)
 		/*
 		 * If it is already scheduled, skip to the next buffer.
 		 */
-		if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT | LK_INTERLOCK,
-		    VI_MTX(vp))) {
-			VI_LOCK(vp);
+		if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT, NULL))
 			continue;
-		}
+
 		if ((bp->b_flags & B_DELWRI) == 0) {
 			FREE_LOCK(&lk);
 			panic("softdep_fsync_mountdev: not dirty");
@@ -4907,11 +4905,11 @@ softdep_fsync_mountdev(vp)
 		 */
 		if ((wk = LIST_FIRST(&bp->b_dep)) == NULL ||
 		    wk->wk_type != D_BMSAFEMAP ||
-		    (bp->b_xflags & BX_BKGRDINPROG)) {
+		    (bp->b_vflags & BV_BKGRDINPROG)) {
 			BUF_UNLOCK(bp);
-			VI_LOCK(vp);
 			continue;
 		}
+		VI_UNLOCK(vp);
 		bremfree(bp);
 		FREE_LOCK(&lk);
 		(void) bawrite(bp);
@@ -5803,21 +5801,31 @@ getdirtybuf(bpp, waitfor)
 	struct buf *bp;
 	int error;
 
+	/*
+	 * XXX This code and the code that calls it need to be reviewed to
+	 * verify its use of the vnode interlock.
+	 */
+
 	for (;;) {
 		if ((bp = *bpp) == NULL)
 			return (0);
-		/* XXX Probably needs interlock */
+		VI_LOCK(bp->b_vp);
 		if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT, NULL) == 0) {
-			if ((bp->b_xflags & BX_BKGRDINPROG) == 0)
+			if ((bp->b_vflags & BV_BKGRDINPROG) == 0) {
+				VI_UNLOCK(bp->b_vp);
 				break;
+			}
 			BUF_UNLOCK(bp);
-			if (waitfor != MNT_WAIT)
+			if (waitfor != MNT_WAIT) {
+				VI_UNLOCK(bp->b_vp);
 				return (0);
-			bp->b_xflags |= BX_BKGRDWAIT;
-			interlocked_sleep(&lk, SLEEP, &bp->b_xflags, NULL,
-			    PRIBIO, "getbuf", 0);
+			}
+			bp->b_vflags |= BV_BKGRDWAIT;
+			interlocked_sleep(&lk, SLEEP, &bp->b_xflags,
+			    VI_MTX(bp->b_vp), PRIBIO|PDROP, "getbuf", 0);
 			continue;
 		}
+		VI_UNLOCK(bp->b_vp);
 		if (waitfor != MNT_WAIT)
 			return (0);
 		error = interlocked_sleep(&lk, LOCKBUF, bp, NULL,
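
The locking protocol this patch institutes is easiest to see in isolation: BV_BKGRDINPROG and
BV_BKGRDWAIT now live in b_vflags, which is only read or written with the vnode interlock held.
vfs_backgroundwritedone() clears INPROG and issues a wakeup() under that lock, while bwrite()
sleeps on &bp->b_xflags via msleep() with VI_MTX(bp->b_vp) so that the flag test and the sleep
are atomic with respect to the writer. The sketch below is a minimal userland model of that
handshake, with a pthread mutex standing in for the vnode interlock and a condition variable for
the msleep()/wakeup() pair; every name in it is illustrative and it is not part of the patch.

/*
 * Userland model of the BV_BKGRDINPROG/BV_BKGRDWAIT handshake.  The mutex
 * models VI_MTX(bp->b_vp); the condition variable models sleeping on and
 * waking the &bp->b_xflags channel.  Not kernel code.
 */
#include <pthread.h>
#include <stdio.h>
#include <unistd.h>

#define	BV_BKGRDINPROG	0x02	/* background write in progress */
#define	BV_BKGRDWAIT	0x04	/* someone is waiting for it to finish */

struct model_buf {
	pthread_mutex_t	interlock;	/* stands in for the vnode interlock */
	pthread_cond_t	done;		/* stands in for the sleep channel */
	int		b_vflags;	/* interlock-protected flags */
};

/* Models vfs_backgroundwritedone(): clear INPROG, wake a waiter if any. */
static void
background_write_done(struct model_buf *bp)
{
	pthread_mutex_lock(&bp->interlock);
	bp->b_vflags &= ~BV_BKGRDINPROG;
	if (bp->b_vflags & BV_BKGRDWAIT) {
		bp->b_vflags &= ~BV_BKGRDWAIT;
		pthread_cond_broadcast(&bp->done);	/* wakeup() */
	}
	pthread_mutex_unlock(&bp->interlock);
}

/* Models the bwrite() path: wait out a pending background write. */
static void
wait_for_background_write(struct model_buf *bp)
{
	pthread_mutex_lock(&bp->interlock);
	while (bp->b_vflags & BV_BKGRDINPROG) {
		bp->b_vflags |= BV_BKGRDWAIT;
		/* Atomically drops and retakes the lock, like msleep(). */
		pthread_cond_wait(&bp->done, &bp->interlock);
	}
	pthread_mutex_unlock(&bp->interlock);
}

static void *
writer(void *arg)
{
	usleep(10000);			/* pretend to do the background I/O */
	background_write_done(arg);
	return (NULL);
}

int
main(void)
{
	struct model_buf buf = {
		PTHREAD_MUTEX_INITIALIZER, PTHREAD_COND_INITIALIZER,
		BV_BKGRDINPROG,		/* a background write is in flight */
	};
	pthread_t t;

	pthread_create(&t, NULL, writer, &buf);
	wait_for_background_write(&buf);
	printf("background write drained, flags now 0x%x\n", buf.b_vflags);
	pthread_join(t, NULL);
	return (0);
}

One difference worth noting: the model re-checks the flag in a loop, the usual condition-variable
idiom, whereas the kernel's bwrite() sleeps once and panics if BV_BKGRDINPROG is still set
afterwards. That is safe for the reason the patch's XXX comments give: the flag cannot be set
while the caller holds the buffer lock, it can only be cleared if a write is already pending.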