diff --git a/sys/kern/vfs_bio.c b/sys/kern/vfs_bio.c index a01230bfd1cd..76f14b2e61ed 100644 --- a/sys/kern/vfs_bio.c +++ b/sys/kern/vfs_bio.c @@ -11,7 +11,7 @@ * 2. Absolutely no warranty of function or purpose is made by the author * John S. Dyson. * - * $Id: vfs_bio.c,v 1.200 1999/03/02 20:26:39 julian Exp $ + * $Id: vfs_bio.c,v 1.201 1999/03/02 21:23:38 julian Exp $ */ /* @@ -83,8 +83,7 @@ static void vfs_clean_pages(struct buf * bp); static void vfs_setdirty(struct buf *bp); static void vfs_vmio_release(struct buf *bp); static void flushdirtybuffers(int slpflag, int slptimeo); - -int needsbuffer; +static int flushbufqueues(void); /* * Internal update daemon, process 3 @@ -92,11 +91,6 @@ int needsbuffer; */ int vfs_update_wakeup; - -/* - * buffers base kva - */ - /* * bogus page -- for I/O to/from partially complete buffers * this is a temporary solution to the problem, but it is not @@ -105,12 +99,13 @@ int vfs_update_wakeup; * but the code is intricate enough already. */ vm_page_t bogus_page; +int runningbufspace; static vm_offset_t bogus_offset; static int bufspace, maxbufspace, vmiospace, maxvmiobufspace, - bufmallocspace, maxbufmallocspace; -int numdirtybuffers; -static int lodirtybuffers, hidirtybuffers; + bufmallocspace, maxbufmallocspace, hibufspace; +static int needsbuffer; +static int numdirtybuffers, lodirtybuffers, hidirtybuffers; static int numfreebuffers, lofreebuffers, hifreebuffers; static int kvafreespace; @@ -126,8 +121,12 @@ SYSCTL_INT(_vfs, OID_AUTO, lofreebuffers, CTLFLAG_RW, &lofreebuffers, 0, ""); SYSCTL_INT(_vfs, OID_AUTO, hifreebuffers, CTLFLAG_RW, &hifreebuffers, 0, ""); +SYSCTL_INT(_vfs, OID_AUTO, runningbufspace, CTLFLAG_RD, + &runningbufspace, 0, ""); SYSCTL_INT(_vfs, OID_AUTO, maxbufspace, CTLFLAG_RW, &maxbufspace, 0, ""); +SYSCTL_INT(_vfs, OID_AUTO, hibufspace, CTLFLAG_RD, + &hibufspace, 0, ""); SYSCTL_INT(_vfs, OID_AUTO, bufspace, CTLFLAG_RD, &bufspace, 0, ""); SYSCTL_INT(_vfs, OID_AUTO, maxvmiobufspace, CTLFLAG_RW, @@ -146,11 +145,81 @@ struct bqueues bufqueues[BUFFER_QUEUES] = { { 0 } }; extern int vm_swap_size; -#define BUF_MAXUSE 24 +#define BUF_MAXUSE 24 -#define VFS_BIO_NEED_ANY 1 -#define VFS_BIO_NEED_LOWLIMIT 2 -#define VFS_BIO_NEED_FREE 4 +#define VFS_BIO_NEED_ANY 0x01 /* any freeable buffer */ +#define VFS_BIO_NEED_RESERVED02 0x02 /* unused */ +#define VFS_BIO_NEED_FREE 0x04 /* wait for free bufs, hi hysteresis */ +#define VFS_BIO_NEED_BUFSPACE 0x08 /* wait for buf space, lo hysteresis */ +#define VFS_BIO_NEED_KVASPACE 0x10 /* wait for buffer_map space, emerg */ + +/* + * kvaspacewakeup: + * + * Called when kva space is potential available for recovery or when + * kva space is recovered in the buffer_map. This function wakes up + * anyone waiting for buffer_map kva space. Even though the buffer_map + * is larger then maxbufspace, this situation will typically occur + * when the buffer_map gets fragmented. + */ + +static __inline void +kvaspacewakeup(void) +{ + /* + * If someone is waiting for KVA space, wake them up. Even + * though we haven't freed the kva space yet, the waiting + * process will be able to now. + */ + if (needsbuffer & VFS_BIO_NEED_KVASPACE) { + needsbuffer &= ~VFS_BIO_NEED_KVASPACE; + wakeup(&needsbuffer); + } +} + +/* + * bufspacewakeup: + * + * Called when buffer space is potentially available for recovery or when + * buffer space is recovered. getnewbuf() will block on this flag when + * it is unable to free sufficient buffer space. Buffer space becomes + * recoverable when bp's get placed back in the queues. 
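[Editorial aside, not part of the patch] For orientation, the waiter side of the handshake the comments above describe looks like the following minimal sketch. The needsbuffer flags, the tsleep()/wakeup() interface and the (PRIBIO + 4) priority are taken from the surrounding code; the function name and the "bufspc" wait message are illustrative only.

	/*
	 * Illustrative sketch: a thread that needs buffer space advertises
	 * the fact in needsbuffer and sleeps on it; bufspacewakeup() clears
	 * the bit and wakes it when space may have become recoverable.
	 */
	static void
	wait_for_bufspace(int slpflag, int slptimeo)
	{
		int s = splbio();

		needsbuffer |= VFS_BIO_NEED_BUFSPACE;
		while (needsbuffer & VFS_BIO_NEED_BUFSPACE)
			tsleep(&needsbuffer, (PRIBIO + 4) | slpflag,
			    "bufspc", slptimeo);
		splx(s);
	}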
+ */ + +static __inline void +bufspacewakeup(void) +{ + /* + * If someone is waiting for BUF space, wake them up. Even + * though we haven't freed the kva space yet, the waiting + * process will be able to now. + */ + if (needsbuffer & VFS_BIO_NEED_BUFSPACE) { + needsbuffer &= ~VFS_BIO_NEED_BUFSPACE; + wakeup(&needsbuffer); + } +} + +/* + * bufcountwakeup: + * + * Called when a buffer has been added to one of the free queues to + * account for the buffer and to wakeup anyone waiting for free buffers. + * This typically occurs when large amounts of metadata are being handled + * by the buffer cache ( else buffer space runs out first, usually ). + */ + +static __inline void +bufcountwakeup(void) +{ + ++numfreebuffers; + if (needsbuffer) { + needsbuffer &= ~VFS_BIO_NEED_ANY; + if (numfreebuffers >= hifreebuffers) + needsbuffer &= ~VFS_BIO_NEED_FREE; + wakeup(&needsbuffer); + } +} /* * Initialize buffer headers and related structures. @@ -186,17 +255,25 @@ bufinit() TAILQ_INSERT_TAIL(&bufqueues[QUEUE_EMPTY], bp, b_freelist); LIST_INSERT_HEAD(&invalhash, bp, b_hash); } -/* - * maxbufspace is currently calculated to support all filesystem blocks - * to be 8K. If you happen to use a 16K filesystem, the size of the buffer - * cache is still the same as it would be for 8K filesystems. This - * keeps the size of the buffer cache "in check" for big block filesystems. - */ + + /* + * maxbufspace is currently calculated to support all filesystem + * blocks to be 8K. If you happen to use a 16K filesystem, the size + * of the buffer cache is still the same as it would be for 8K + * filesystems. This keeps the size of the buffer cache "in check" + * for big block filesystems. + * + * maxbufspace is calculated as around 50% of the KVA available in + * the buffer_map ( DFLTSIZE vs BKVASIZE ), I presume to reduce the + * effect of fragmentation. + */ maxbufspace = (nbuf + 8) * DFLTBSIZE; + if ((hibufspace = maxbufspace - MAXBSIZE * 5) <= MAXBSIZE) + hibufspace = 3 * maxbufspace / 4; /* * reserve 1/3 of the buffers for metadata (VDIR) which might not be VMIO'ed */ - maxvmiobufspace = 2 * maxbufspace / 3; + maxvmiobufspace = 2 * hibufspace / 3; /* * Limit the amount of malloc memory since it is wired permanently into * the kernel space. Even though this is accounted for in the buffer @@ -204,18 +281,24 @@ bufinit() * The malloc scheme improves memory utilization significantly on average * (small) directories. */ - maxbufmallocspace = maxbufspace / 20; + maxbufmallocspace = hibufspace / 20; /* - * Remove the probability of deadlock conditions by limiting the - * number of dirty buffers. + * Reduce the chance of a deadlock occuring by limiting the number + * of delayed-write dirty buffers we allow to stack up. */ - hidirtybuffers = nbuf / 8 + 20; lodirtybuffers = nbuf / 16 + 10; + hidirtybuffers = nbuf / 8 + 20; numdirtybuffers = 0; + +/* + * Try to keep the number of free buffers in the specified range, + * and give the syncer access to an emergency reserve. 
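[Editorial aside, not part of the patch] To make these limits concrete, assume nbuf = 1024 and DFLTBSIZE = 4096 (neither value appears in this patch, so both are assumptions) with MAXBSIZE = 64K: maxbufspace = (1024 + 8) * 4096, a bit over 4 MB; hibufspace = maxbufspace - 5 * 64K, roughly 3.7 MB, so the 3 * maxbufspace / 4 fallback is not taken; lodirtybuffers = 1024/16 + 10 = 74 and hidirtybuffers = 1024/8 + 20 = 148; the free-buffer range set just below works out to lofreebuffers = 61 and hifreebuffers = 122.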
+ */ lofreebuffers = nbuf / 18 + 5; hifreebuffers = 2 * lofreebuffers; numfreebuffers = nbuf; + kvafreespace = 0; bogus_offset = kmem_alloc_pageable(kernel_map, PAGE_SIZE); @@ -233,24 +316,26 @@ bufinit() static void bfreekva(struct buf * bp) { - if (bp->b_kvasize == 0) - return; - - vm_map_delete(buffer_map, - (vm_offset_t) bp->b_kvabase, - (vm_offset_t) bp->b_kvabase + bp->b_kvasize); - - bp->b_kvasize = 0; - + if (bp->b_kvasize) { + vm_map_delete(buffer_map, + (vm_offset_t) bp->b_kvabase, + (vm_offset_t) bp->b_kvabase + bp->b_kvasize + ); + bp->b_kvasize = 0; + kvaspacewakeup(); + } } /* - * remove the buffer from the appropriate free list + * bremfree: + * + * Remove the buffer from the appropriate free list. */ void bremfree(struct buf * bp) { int s = splbio(); + int old_qindex = bp->b_qindex; if (bp->b_qindex != QUEUE_NONE) { if (bp->b_qindex == QUEUE_EMPTY) { @@ -258,14 +343,29 @@ bremfree(struct buf * bp) } TAILQ_REMOVE(&bufqueues[bp->b_qindex], bp, b_freelist); bp->b_qindex = QUEUE_NONE; + runningbufspace += bp->b_bufsize; } else { #if !defined(MAX_PERF) panic("bremfree: removing a buffer when not on a queue"); #endif } - if ((bp->b_flags & B_INVAL) || - (bp->b_flags & (B_DELWRI|B_LOCKED)) == 0) - --numfreebuffers; + + /* + * Fixup numfreebuffers count. If the buffer is invalid or not + * delayed-write, and it was on the EMPTY, LRU, or AGE queues, + * the buffer was free and we must decrement numfreebuffers. + */ + if ((bp->b_flags & B_INVAL) || (bp->b_flags & B_DELWRI) == 0) { + switch(old_qindex) { + case QUEUE_EMPTY: + case QUEUE_LRU: + case QUEUE_AGE: + --numfreebuffers; + break; + default: + break; + } + } splx(s); } @@ -286,6 +386,7 @@ bread(struct vnode * vp, daddr_t blkno, int size, struct ucred * cred, if ((bp->b_flags & B_CACHE) == 0) { if (curproc != NULL) curproc->p_stats->p_ru.ru_inblock++; + KASSERT(!(bp->b_flags & B_ASYNC), ("bread: illegal async bp %p", bp)); bp->b_flags |= B_READ; bp->b_flags &= ~(B_DONE | B_ERROR | B_INVAL); if (bp->b_rcred == NOCRED) { @@ -330,6 +431,7 @@ breadn(struct vnode * vp, daddr_t blkno, int size, VOP_STRATEGY(vp, bp); ++readwait; } + for (i = 0; i < cnt; i++, rablkno++, rabsize++) { if (inmem(vp, *rablkno)) continue; @@ -369,7 +471,6 @@ bwrite(struct buf * bp) struct vnode *vp; struct mount *mp; - if (bp->b_flags & B_INVAL) { brelse(bp); return (0); @@ -381,15 +482,11 @@ bwrite(struct buf * bp) if ((bp->b_flags & B_BUSY) == 0) panic("bwrite: buffer is not busy???"); #endif - - bp->b_flags &= ~(B_READ | B_DONE | B_ERROR | B_DELWRI); - bp->b_flags |= B_WRITEINPROG; - s = splbio(); - if ((oldflags & B_DELWRI) == B_DELWRI) { - --numdirtybuffers; - reassignbuf(bp, bp->b_vp); - } + bundirty(bp); + + bp->b_flags &= ~(B_READ | B_DONE | B_ERROR); + bp->b_flags |= B_WRITEINPROG; bp->b_vp->v_numoutput++; vfs_busy_pages(bp, 1); @@ -420,23 +517,8 @@ bwrite(struct buf * bp) brelse(bp); return (rtval); } - return (0); -} -void -vfs_bio_need_satisfy(void) { - ++numfreebuffers; - if (!needsbuffer) - return; - if (numdirtybuffers < lodirtybuffers) { - needsbuffer &= ~(VFS_BIO_NEED_ANY | VFS_BIO_NEED_LOWLIMIT); - } else { - needsbuffer &= ~VFS_BIO_NEED_ANY; - } - if (numfreebuffers >= hifreebuffers) { - needsbuffer &= ~VFS_BIO_NEED_FREE; - } - wakeup(&needsbuffer); + return (0); } /* @@ -457,12 +539,7 @@ bdwrite(struct buf * bp) brelse(bp); return; } - bp->b_flags &= ~(B_READ|B_RELBUF); - if ((bp->b_flags & B_DELWRI) == 0) { - bp->b_flags |= B_DONE | B_DELWRI; - reassignbuf(bp, bp->b_vp); - ++numdirtybuffers; - } + bdirty(bp); /* * This bmap keeps the 
system from needing to do the bmap later, @@ -506,32 +583,68 @@ bdwrite(struct buf * bp) if (numdirtybuffers >= hidirtybuffers) flushdirtybuffers(0, 0); - - return; } - /* - * Same as first half of bdwrite, mark buffer dirty, but do not release it. - * Check how this compares with vfs_setdirty(); XXX [JRE] + * bdirty: + * + * Turn buffer into delayed write request. We must clear B_READ and + * B_RELBUF, and we must set B_DELWRI. We reassign the buffer to + * itself to properly update it in the dirty/clean lists. We mark it + * B_DONE to ensure that any asynchronization of the buffer properly + * clears B_DONE ( else a panic will occur later ). Note that B_INVALID + * buffers are not considered dirty even if B_DELWRI is set. + * + * Since the buffer is not on a queue, we do not update the numfreebuffers + * count. + * + * Must be called at splbio(). + * The buffer must be on QUEUE_NONE. */ void bdirty(bp) - struct buf *bp; + struct buf *bp; { - - bp->b_flags &= ~(B_READ|B_RELBUF); /* XXX ??? check this */ + KASSERT(bp->b_qindex == QUEUE_NONE, ("bdirty: buffer %p still on queue %d", bp, bp->b_qindex)); + bp->b_flags &= ~(B_READ|B_RELBUF); + if ((bp->b_flags & B_DELWRI) == 0) { - bp->b_flags |= B_DONE | B_DELWRI; /* why done? XXX JRE */ + bp->b_flags |= B_DONE | B_DELWRI; reassignbuf(bp, bp->b_vp); ++numdirtybuffers; } } /* - * Asynchronous write. - * Start output on a buffer, but do not wait for it to complete. - * The buffer is released when the output completes. + * bundirty: + * + * Clear B_DELWRI for buffer. + * + * Since the buffer is not on a queue, we do not update the numfreebuffers + * count. + * + * Must be called at splbio(). + * The buffer must be on QUEUE_NONE. + */ + +void +bundirty(bp) + struct buf *bp; +{ + KASSERT(bp->b_qindex == QUEUE_NONE, ("bundirty: buffer %p still on queue %d", bp, bp->b_qindex)); + + if (bp->b_flags & B_DELWRI) { + bp->b_flags &= ~B_DELWRI; + reassignbuf(bp, bp->b_vp); + --numdirtybuffers; + } +} + +/* + * bawrite: + * + * Asynchronous write. Start output on a buffer, but do not wait for + * it to complete. The buffer is released when the output completes. */ void bawrite(struct buf * bp) @@ -541,39 +654,42 @@ bawrite(struct buf * bp) } /* - * Ordered write. - * Start output on a buffer, and flag it so that the device will write - * it in the order it was queued. The buffer is released when the output - * completes. + * bowrite: + * + * Ordered write. Start output on a buffer, and flag it so that the + * device will write it in the order it was queued. The buffer is + * released when the output completes. */ int bowrite(struct buf * bp) { - bp->b_flags |= B_ORDERED|B_ASYNC; + bp->b_flags |= B_ORDERED | B_ASYNC; return (VOP_BWRITE(bp)); } /* - * Release a buffer. + * brelse: + * + * Release a busy buffer and, if requested, free its resources. The + * buffer will be stashed in the appropriate bufqueue[] allowing it + * to be accessed later as a cache entity or reused for other purposes. */ void brelse(struct buf * bp) { int s; + KASSERT(!(bp->b_flags & (B_CLUSTER|B_PAGING)), ("brelse: inappropriate B_PAGING or B_CLUSTER bp %p", bp)); + +#if 0 if (bp->b_flags & B_CLUSTER) { relpbuf(bp, NULL); return; } +#endif s = splbio(); - /* anyone need this block? 
*/ - if (bp->b_flags & B_WANTED) { - bp->b_flags &= ~(B_WANTED | B_AGE); - wakeup(bp); - } - if (bp->b_flags & B_LOCKED) bp->b_flags &= ~B_ERROR; @@ -717,8 +833,8 @@ brelse(struct buf * bp) if (bp->b_qindex != QUEUE_NONE) panic("brelse: free buffer onto another queue???"); #endif - /* enqueue */ + /* buffers with no memory */ if (bp->b_bufsize == 0) { bp->b_flags |= B_INVAL; @@ -728,7 +844,8 @@ brelse(struct buf * bp) LIST_INSERT_HEAD(&invalhash, bp, b_hash); bp->b_dev = NODEV; kvafreespace += bp->b_kvasize; - + if (bp->b_kvasize) + kvaspacewakeup(); /* buffers with junk contents */ } else if (bp->b_flags & (B_ERROR | B_INVAL | B_NOCACHE | B_RELBUF)) { bp->b_flags |= B_INVAL; @@ -754,15 +871,38 @@ brelse(struct buf * bp) TAILQ_INSERT_TAIL(&bufqueues[QUEUE_LRU], bp, b_freelist); } - if ((bp->b_flags & B_INVAL) || - (bp->b_flags & (B_LOCKED|B_DELWRI)) == 0) { - if (bp->b_flags & B_DELWRI) { - --numdirtybuffers; - bp->b_flags &= ~B_DELWRI; - } - vfs_bio_need_satisfy(); + /* + * If B_INVAL, clear B_DELWRI. + */ + if ((bp->b_flags & (B_INVAL|B_DELWRI)) == (B_INVAL|B_DELWRI)) { + bp->b_flags &= ~B_DELWRI; + --numdirtybuffers; } + runningbufspace -= bp->b_bufsize; + + /* + * Fixup numfreebuffers count. The bp is on an appropriate queue + * unless locked. We then bump numfreebuffers if it is not B_DELWRI. + * We've already handled the B_INVAL case ( B_DELWRI will be clear + * if B_INVAL is set ). + */ + + if ((bp->b_flags & B_LOCKED) == 0 && !(bp->b_flags & B_DELWRI)) + bufcountwakeup(); + + /* + * Something we can maybe free. + */ + + if (bp->b_bufsize) + bufspacewakeup(); + + if (bp->b_flags & B_WANTED) { + bp->b_flags &= ~(B_WANTED | B_AGE); + wakeup(bp); + } + /* unlock */ bp->b_flags &= ~(B_ORDERED | B_WANTED | B_BUSY | B_ASYNC | B_NOCACHE | B_AGE | B_RELBUF); @@ -770,7 +910,8 @@ brelse(struct buf * bp) } /* - * Release a buffer. + * Release a buffer back to the appropriate queue but do not try to free + * it. */ void bqrelse(struct buf * bp) @@ -779,17 +920,12 @@ bqrelse(struct buf * bp) s = splbio(); - /* anyone need this block? */ - if (bp->b_flags & B_WANTED) { - bp->b_flags &= ~(B_WANTED | B_AGE); - wakeup(bp); - } + KASSERT(!(bp->b_flags & (B_CLUSTER|B_PAGING)), ("bqrelse: inappropriate B_PAGING or B_CLUSTER bp %p", bp)); #if !defined(MAX_PERF) if (bp->b_qindex != QUEUE_NONE) panic("bqrelse: free buffer onto another queue???"); #endif - if (bp->b_flags & B_LOCKED) { bp->b_flags &= ~B_ERROR; bp->b_qindex = QUEUE_LOCKED; @@ -800,10 +936,26 @@ bqrelse(struct buf * bp) TAILQ_INSERT_TAIL(&bufqueues[QUEUE_LRU], bp, b_freelist); } - if ((bp->b_flags & (B_LOCKED|B_DELWRI)) == 0) { - vfs_bio_need_satisfy(); + runningbufspace -= bp->b_bufsize; + + if ((bp->b_flags & B_LOCKED) == 0 && + ((bp->b_flags & B_INVAL) || !(bp->b_flags & B_DELWRI)) + ) { + bufcountwakeup(); } + /* + * Something we can maybe wakeup + */ + if (bp->b_bufsize) + bufspacewakeup(); + + /* anyone need this block? 
*/ + if (bp->b_flags & B_WANTED) { + bp->b_flags &= ~(B_WANTED | B_AGE); + wakeup(bp); + } + /* unlock */ bp->b_flags &= ~(B_ORDERED | B_WANTED | B_BUSY | B_ASYNC | B_NOCACHE | B_AGE | B_RELBUF); @@ -847,10 +999,13 @@ vfs_vmio_release(bp) } } } - splx(s); bufspace -= bp->b_bufsize; vmiospace -= bp->b_bufsize; + runningbufspace -= bp->b_bufsize; + splx(s); pmap_qremove(trunc_page((vm_offset_t) bp->b_data), bp->b_npages); + if (bp->b_bufsize) + bufspacewakeup(); bp->b_npages = 0; bp->b_bufsize = 0; bp->b_flags &= ~B_VMIO; @@ -902,7 +1057,8 @@ vfs_bio_awrite(struct buf * bp) s = splbio(); /* - * right now we support clustered writing only to regular files + * right now we support clustered writing only to regular files, and + * then only if our I/O system is not saturated. */ if ((vp->v_type == VREG) && (vp->v_mount != 0) && /* Only on nodes that have the size info */ @@ -943,279 +1099,358 @@ vfs_bio_awrite(struct buf * bp) */ nwritten = bp->b_bufsize; (void) VOP_BWRITE(bp); + return nwritten; } - /* - * Find a buffer header which is available for use. + * getnewbuf: + * + * Find and initialize a new buffer header, freeing up existing buffers + * in the bufqueues as necessary. + * + * We block if: + * We have insufficient buffer headers + * We have insufficient buffer space + * buffer_map is too fragmented ( space reservation fails ) + * + * We do *not* attempt to flush dirty buffers more then one level deep. + * I.e., if P_FLSINPROG is set we do not flush dirty buffers at all. + * + * If P_FLSINPROG is set, we are allowed to dip into our emergency + * reserve. */ static struct buf * getnewbuf(struct vnode *vp, daddr_t blkno, int slpflag, int slptimeo, int size, int maxsize) { - struct buf *bp, *bp1; - int nbyteswritten = 0; - vm_offset_t addr; - static int writerecursion = 0; + struct buf *bp; + struct buf *nbp; + int outofspace; + int nqindex; + int defrag = 0; + int countawrites = 0; + +restart: -start: - if (bufspace >= maxbufspace) - goto trytofreespace; + /* + * Setup for scan. If we do not have enough free buffers, + * we setup a degenerate case that falls through the while. + * + * If we are in the middle of a flush, we can dip into the + * emergency reserve. + */ - /* can we constitute a new buffer? */ - if ((bp = TAILQ_FIRST(&bufqueues[QUEUE_EMPTY]))) { -#if !defined(MAX_PERF) - if (bp->b_qindex != QUEUE_EMPTY) - panic("getnewbuf: inconsistent EMPTY queue, qindex=%d", - bp->b_qindex); -#endif - bp->b_flags |= B_BUSY; - bremfree(bp); - goto fillbuf; - } -trytofreespace: - /* - * We keep the file I/O from hogging metadata I/O - * This is desirable because file data is cached in the - * VM/Buffer cache even if a buffer is freed. - */ - if ((bp = TAILQ_FIRST(&bufqueues[QUEUE_AGE]))) { -#if !defined(MAX_PERF) - if (bp->b_qindex != QUEUE_AGE) - panic("getnewbuf: inconsistent AGE queue, qindex=%d", - bp->b_qindex); -#endif - } else if ((bp = TAILQ_FIRST(&bufqueues[QUEUE_LRU]))) { -#if !defined(MAX_PERF) - if (bp->b_qindex != QUEUE_LRU) - panic("getnewbuf: inconsistent LRU queue, qindex=%d", - bp->b_qindex); -#endif - } - if (!bp) { - /* wait for a free buffer of any kind */ - needsbuffer |= VFS_BIO_NEED_ANY; - do - tsleep(&needsbuffer, (PRIBIO + 4) | slpflag, "newbuf", - slptimeo); - while (needsbuffer & VFS_BIO_NEED_ANY); - return (0); - } - KASSERT(!(bp->b_flags & B_BUSY), - ("getnewbuf: busy buffer on free list\n")); - /* - * We are fairly aggressive about freeing VMIO buffers, but since - * the buffering is intact without buffer headers, there is not - * much loss. 
We gain by maintaining non-VMIOed metadata in buffers. - */ - if ((bp->b_qindex == QUEUE_LRU) && (bp->b_usecount > 0)) { - if ((bp->b_flags & B_VMIO) == 0 || - (vmiospace < maxvmiobufspace)) { - --bp->b_usecount; - TAILQ_REMOVE(&bufqueues[QUEUE_LRU], bp, b_freelist); - if (TAILQ_FIRST(&bufqueues[QUEUE_LRU]) != NULL) { - TAILQ_INSERT_TAIL(&bufqueues[QUEUE_LRU], bp, b_freelist); - goto start; + if ((curproc->p_flag & P_FLSINPROG) == 0 && + numfreebuffers < lofreebuffers + ) { + nqindex = QUEUE_LRU; + nbp = NULL; + } else { + nqindex = QUEUE_EMPTY; + if ((nbp = TAILQ_FIRST(&bufqueues[QUEUE_EMPTY])) == NULL) { + nqindex = QUEUE_AGE; + nbp = TAILQ_FIRST(&bufqueues[QUEUE_AGE]); + if (nbp == NULL) { + nqindex = QUEUE_LRU; + nbp = TAILQ_FIRST(&bufqueues[QUEUE_LRU]); } - TAILQ_INSERT_TAIL(&bufqueues[QUEUE_LRU], bp, b_freelist); } } + /* + * Calculate whether we are out of buffer space. This state is + * recalculated on every restart. If we are out of space, we + * have to turn off defragmentation. The outofspace code will + * defragment too, but the looping conditionals will be messed up + * if both outofspace and defrag are on. + */ - /* if we are a delayed write, convert to an async write */ - if ((bp->b_flags & (B_DELWRI | B_INVAL)) == B_DELWRI) { - - /* - * If our delayed write is likely to be used soon, then - * recycle back onto the LRU queue. - */ - if (vp && (bp->b_vp == vp) && (bp->b_qindex == QUEUE_LRU) && - (bp->b_lblkno >= blkno) && (maxsize > 0)) { - - if (bp->b_usecount > 0) { - if (bp->b_lblkno < blkno + (MAXPHYS / maxsize)) { - - TAILQ_REMOVE(&bufqueues[QUEUE_LRU], bp, b_freelist); - - if (TAILQ_FIRST(&bufqueues[QUEUE_LRU]) != NULL) { - TAILQ_INSERT_TAIL(&bufqueues[QUEUE_LRU], bp, b_freelist); - bp->b_usecount--; - goto start; - } - TAILQ_INSERT_TAIL(&bufqueues[QUEUE_LRU], bp, b_freelist); - } - } - } - - /* - * Certain layered filesystems can recursively re-enter the vfs_bio - * code, due to delayed writes. This helps keep the system from - * deadlocking. 
- */ - if (writerecursion > 0) { - if (writerecursion > 5) { - bp = TAILQ_FIRST(&bufqueues[QUEUE_AGE]); - while (bp) { - if ((bp->b_flags & B_DELWRI) == 0) - break; - bp = TAILQ_NEXT(bp, b_freelist); - } - if (bp == NULL) { - bp = TAILQ_FIRST(&bufqueues[QUEUE_LRU]); - while (bp) { - if ((bp->b_flags & B_DELWRI) == 0) - break; - bp = TAILQ_NEXT(bp, b_freelist); - } - } - if (bp == NULL) - panic("getnewbuf: cannot get buffer, infinite recursion failure"); - } else { - bremfree(bp); - bp->b_flags |= B_BUSY | B_AGE | B_ASYNC; - nbyteswritten += bp->b_bufsize; - ++writerecursion; - VOP_BWRITE(bp); - --writerecursion; - if (!slpflag && !slptimeo) { - return (0); - } - goto start; - } - } else { - ++writerecursion; - nbyteswritten += vfs_bio_awrite(bp); - --writerecursion; - if (!slpflag && !slptimeo) { - return (0); - } - goto start; + outofspace = 0; + if (bufspace >= hibufspace) { + if ((curproc->p_flag & P_FLSINPROG) == 0 || + bufspace >= maxbufspace + ) { + outofspace = 1; + defrag = 0; } } - if (bp->b_flags & B_WANTED) { - bp->b_flags &= ~B_WANTED; - wakeup(bp); - } - bremfree(bp); - bp->b_flags |= B_BUSY; - - if (bp->b_flags & B_VMIO) { - bp->b_flags &= ~B_ASYNC; - vfs_vmio_release(bp); - } - - if (bp->b_vp) - brelvp(bp); - -fillbuf: - - /* we are not free, nor do we contain interesting data */ - if (bp->b_rcred != NOCRED) { - crfree(bp->b_rcred); - bp->b_rcred = NOCRED; - } - if (bp->b_wcred != NOCRED) { - crfree(bp->b_wcred); - bp->b_wcred = NOCRED; - } - if (LIST_FIRST(&bp->b_dep) != NULL && - bioops.io_deallocate) - (*bioops.io_deallocate)(bp); - - LIST_REMOVE(bp, b_hash); - LIST_INSERT_HEAD(&invalhash, bp, b_hash); - if (bp->b_bufsize) { - allocbuf(bp, 0); - } - bp->b_flags = B_BUSY; - bp->b_dev = NODEV; - bp->b_vp = NULL; - bp->b_blkno = bp->b_lblkno = 0; - bp->b_offset = NOOFFSET; - bp->b_iodone = 0; - bp->b_error = 0; - bp->b_resid = 0; - bp->b_bcount = 0; - bp->b_npages = 0; - bp->b_dirtyoff = bp->b_dirtyend = 0; - bp->b_validoff = bp->b_validend = 0; - bp->b_usecount = 5; - /* Here, not kern_physio.c, is where this should be done*/ - LIST_INIT(&bp->b_dep); - - maxsize = (maxsize + PAGE_MASK) & ~PAGE_MASK; - /* - * we assume that buffer_map is not at address 0 + * defrag state is semi-persistant. 1 means we are flagged for + * defragging. -1 means we actually defragged something. */ - addr = 0; - if (maxsize != bp->b_kvasize) { - bfreekva(bp); - -findkvaspace: + /* nop */ + + /* + * Run scan, possibly freeing data and/or kva mappings on the fly + * depending. + */ + + while ((bp = nbp) != NULL) { + int qindex = nqindex; /* - * See if we have buffer kva space + * Calculate next bp ( we can only use it if we do not block + * or do other fancy things ). */ - if (vm_map_findspace(buffer_map, - vm_map_min(buffer_map), maxsize, &addr)) { - if (kvafreespace > 0) { - int totfree = 0, freed; - do { - freed = 0; - for (bp1 = TAILQ_FIRST(&bufqueues[QUEUE_EMPTY]); - bp1 != NULL; bp1 = TAILQ_NEXT(bp1, b_freelist)) { - if (bp1->b_kvasize != 0) { - totfree += bp1->b_kvasize; - freed = bp1->b_kvasize; - bremfree(bp1); - bfreekva(bp1); - brelse(bp1); - break; - } - } - } while (freed); + if ((nbp = TAILQ_NEXT(bp, b_freelist)) == NULL) { + switch(qindex) { + case QUEUE_EMPTY: + nqindex = QUEUE_AGE; + if ((nbp = TAILQ_FIRST(&bufqueues[QUEUE_AGE]))) + break; + /* fall through */ + case QUEUE_AGE: + nqindex = QUEUE_LRU; + if ((nbp = TAILQ_FIRST(&bufqueues[QUEUE_LRU]))) + break; + /* fall through */ + case QUEUE_LRU: /* - * if we found free space, then retry with the same buffer. + * nbp is NULL. 
*/ - if (totfree) - goto findkvaspace; + break; } + } + + /* + * Sanity Checks + */ + KASSERT(!(bp->b_flags & B_BUSY), ("getnewbuf: busy buffer %p on free list", bp)); + KASSERT(bp->b_qindex == qindex, ("getnewbuf: inconsistant queue %d bp %p", qindex, bp)); + + /* + * Here we try to move NON VMIO buffers to the end of the + * LRU queue in order to make VMIO buffers more readily + * freeable. We also try to move buffers with a positive + * usecount to the end. + * + * Note that by moving the bp to the end, we setup a following + * loop. Since we continue to decrement b_usecount this + * is ok and, in fact, desireable. + * + * If we are at the end of the list, we move ourself to the + * same place and need to fixup nbp and nqindex to handle + * the following case. + */ + + if ((qindex == QUEUE_LRU) && bp->b_usecount > 0) { + if ((bp->b_flags & B_VMIO) == 0 || + (vmiospace < maxvmiobufspace) + ) { + --bp->b_usecount; + TAILQ_REMOVE(&bufqueues[QUEUE_LRU], bp, b_freelist); + TAILQ_INSERT_TAIL(&bufqueues[QUEUE_LRU], bp, b_freelist); + if (nbp == NULL) { + nqindex = qindex; + nbp = bp; + } + continue; + } + } + + /* + * If we come across a delayed write and numdirtybuffers should + * be flushed, try to write it out. Only if P_FLSINPROG is + * not set. We can't afford to recursively stack more then + * one deep due to the possibility of having deep VFS call + * stacks. + * + * Limit the number of dirty buffers we are willing to try + * to recover since it really isn't our job here. + */ + if ((bp->b_flags & (B_DELWRI | B_INVAL)) == B_DELWRI) { + if ((curproc->p_flag & P_FLSINPROG) || + numdirtybuffers < hidirtybuffers || + countawrites > 16 + ) { + continue; + } + curproc->p_flag |= P_FLSINPROG; + vfs_bio_awrite(bp); + curproc->p_flag &= ~P_FLSINPROG; + ++countawrites; + goto restart; + } + + if (defrag > 0 && bp->b_kvasize == 0) + continue; + if (outofspace > 0 && bp->b_bufsize == 0) + continue; + + /* + * Start freeing the bp. This is somewhat involved. nbp + * remains valid only for QUEUE_EMPTY bp's. + */ + + bremfree(bp); + bp->b_flags |= B_BUSY; + + if (qindex == QUEUE_LRU || qindex == QUEUE_AGE) { + if (bp->b_flags & B_VMIO) { + bp->b_flags &= ~B_ASYNC; + vfs_vmio_release(bp); + } + if (bp->b_vp) + brelvp(bp); + } + + if (bp->b_flags & B_WANTED) { + bp->b_flags &= ~B_WANTED; + wakeup(bp); + } + + /* + * NOTE: nbp is now entirely invalid. We can only restart + * the scan from this point on. + * + * Get the rest of the buffer freed up. b_kva* is still + * valid after this operation. + */ + + if (bp->b_rcred != NOCRED) { + crfree(bp->b_rcred); + bp->b_rcred = NOCRED; + } + if (bp->b_wcred != NOCRED) { + crfree(bp->b_wcred); + bp->b_wcred = NOCRED; + } + if (LIST_FIRST(&bp->b_dep) != NULL && bioops.io_deallocate) + (*bioops.io_deallocate)(bp); + + LIST_REMOVE(bp, b_hash); + LIST_INSERT_HEAD(&invalhash, bp, b_hash); + + if (bp->b_bufsize) + allocbuf(bp, 0); + + bp->b_flags = B_BUSY; + bp->b_dev = NODEV; + bp->b_vp = NULL; + bp->b_blkno = bp->b_lblkno = 0; + bp->b_offset = NOOFFSET; + bp->b_iodone = 0; + bp->b_error = 0; + bp->b_resid = 0; + bp->b_bcount = 0; + bp->b_npages = 0; + bp->b_dirtyoff = bp->b_dirtyend = 0; + bp->b_validoff = bp->b_validend = 0; + bp->b_usecount = 5; + + LIST_INIT(&bp->b_dep); + + /* + * Ok, now that we have a free buffer, if we are defragging + * we have to recover the kvaspace. 
+ */ + + if (defrag > 0) { + defrag = -1; bp->b_flags |= B_INVAL; + bfreekva(bp); brelse(bp); - goto trytofreespace; + goto restart; + } + + if (outofspace > 0) { + outofspace = -1; + bp->b_flags |= B_INVAL; + bfreekva(bp); + brelse(bp); + goto restart; + } + + /* + * We are done + */ + break; + } + + /* + * If we exhausted our list, sleep as appropriate. + */ + + if (bp == NULL) { + int flags; + +dosleep: + if (defrag > 0) + flags = VFS_BIO_NEED_KVASPACE; + else if (outofspace > 0) + flags = VFS_BIO_NEED_BUFSPACE; + else + flags = VFS_BIO_NEED_ANY; + + if (rushjob < syncdelay / 2) + ++rushjob; + needsbuffer |= flags; + while (needsbuffer & flags) { + tsleep( + &needsbuffer, + (PRIBIO + 4) | slpflag, + "newbuf", + slptimeo + ); + } + } else { + /* + * We finally have a valid bp. We aren't quite out of the + * woods, we still have to reserve kva space. + */ + vm_offset_t addr = 0; + + maxsize = (maxsize + PAGE_MASK) & ~PAGE_MASK; + + if (maxsize != bp->b_kvasize) { + bfreekva(bp); + + if (vm_map_findspace(buffer_map, + vm_map_min(buffer_map), maxsize, &addr) + ) { + /* + * Uh oh. Buffer map is to fragmented. Try + * to defragment. + */ + if (defrag <= 0) { + defrag = 1; + bp->b_flags |= B_INVAL; + brelse(bp); + goto restart; + } + /* + * Uh oh. We couldn't seem to defragment + */ + bp = NULL; + goto dosleep; + } + } + if (addr) { + vm_map_insert(buffer_map, NULL, 0, + addr, addr + maxsize, + VM_PROT_ALL, VM_PROT_ALL, MAP_NOFAULT); + + bp->b_kvabase = (caddr_t) addr; + bp->b_kvasize = maxsize; + bp->b_data = bp->b_kvabase; } } - - /* - * See if we are below are allocated minimum - */ - if (bufspace >= (maxbufspace + nbyteswritten)) { - bp->b_flags |= B_INVAL; - brelse(bp); - goto trytofreespace; - } - - /* - * create a map entry for the buffer -- in essence - * reserving the kva space. - */ - if (addr) { - vm_map_insert(buffer_map, NULL, 0, - addr, addr + maxsize, - VM_PROT_ALL, VM_PROT_ALL, MAP_NOFAULT); - - bp->b_kvabase = (caddr_t) addr; - bp->b_kvasize = maxsize; - } - bp->b_data = bp->b_kvabase; return (bp); } +/* + * waitfreebuffers: + * + * Wait for sufficient free buffers. This routine is not called if + * curproc is the update process so we do not have to do anything + * fancy. + */ + static void -waitfreebuffers(int slpflag, int slptimeo) { +waitfreebuffers(int slpflag, int slptimeo) +{ while (numfreebuffers < hifreebuffers) { flushdirtybuffers(slpflag, slptimeo); if (numfreebuffers < hifreebuffers) @@ -1226,50 +1461,82 @@ waitfreebuffers(int slpflag, int slptimeo) { } } +/* + * flushdirtybuffers: + * + * This routine is called when we get too many dirty buffers. + * + * We have to protect ourselves from recursion, but we also do not want + * other process's flushdirtybuffers() to interfere with the syncer if + * it decides to flushdirtybuffers(). + * + * In order to maximize operations, we allow any process to flush + * dirty buffers and use P_FLSINPROG to prevent recursion. 
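[Editorial aside, not part of the patch] The trigger for this flushing path is worth seeing in one place. The fragment below is an illustrative sketch of the hysteresis, mirroring the check bdwrite() performs earlier in this patch rather than adding new code.

	/*
	 * Illustrative sketch: delayed writes accumulate until the high
	 * water mark is reached, then flushdirtybuffers() drains the count
	 * back below lodirtybuffers before stopping.
	 */
	if (numdirtybuffers >= hidirtybuffers)
		flushdirtybuffers(0, 0);	/* as done at the end of bdwrite() */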
+ */ + static void -flushdirtybuffers(int slpflag, int slptimeo) { +flushdirtybuffers(int slpflag, int slptimeo) +{ int s; - static pid_t flushing = 0; s = splbio(); - if (flushing) { - if (flushing == curproc->p_pid) { - splx(s); - return; - } - while (flushing) { - if (tsleep(&flushing, (PRIBIO + 4)|slpflag, "biofls", slptimeo)) { - splx(s); - return; - } - } + if (curproc->p_flag & P_FLSINPROG) { + splx(s); + return; } - flushing = curproc->p_pid; + curproc->p_flag |= P_FLSINPROG; while (numdirtybuffers > lodirtybuffers) { - struct buf *bp; - needsbuffer |= VFS_BIO_NEED_LOWLIMIT; - bp = TAILQ_FIRST(&bufqueues[QUEUE_AGE]); - if (bp == NULL) - bp = TAILQ_FIRST(&bufqueues[QUEUE_LRU]); - - while (bp && ((bp->b_flags & B_DELWRI) == 0)) { - bp = TAILQ_NEXT(bp, b_freelist); - } - - if (bp) { - vfs_bio_awrite(bp); - continue; - } - break; + if (flushbufqueues() == 0) + break; } - flushing = 0; - wakeup(&flushing); + curproc->p_flag &= ~P_FLSINPROG; + splx(s); } +static int +flushbufqueues(void) +{ + struct buf *bp; + int qindex; + int r = 0; + + qindex = QUEUE_AGE; + bp = TAILQ_FIRST(&bufqueues[QUEUE_AGE]); + + for (;;) { + if (bp == NULL) { + if (qindex == QUEUE_LRU) + break; + qindex = QUEUE_LRU; + if ((bp = TAILQ_FIRST(&bufqueues[QUEUE_LRU])) == NULL) + break; + } + + /* + * XXX NFS does weird things with B_INVAL bps if we bwrite + * them ( vfs_bio_awrite/bawrite/bdwrite/etc ) Why? + * + */ + if ((bp->b_flags & B_DELWRI) != 0) { + if (bp->b_flags & B_INVAL) { + bremfree(bp); + bp->b_flags |= B_BUSY; + brelse(bp); + } else { + vfs_bio_awrite(bp); + } + ++r; + break; + } + bp = TAILQ_NEXT(bp, b_freelist); + } + return(r); +} + /* * Check to see if a block is currently memory resident. */ @@ -1335,21 +1602,29 @@ inmem(struct vnode * vp, daddr_t blkno) * code, and used by the nfs read code. */ static void -vfs_setdirty(struct buf *bp) { +vfs_setdirty(struct buf *bp) +{ int i; vm_object_t object; vm_offset_t boffset; -#if 0 - vm_offset_t offset; -#endif /* * We qualify the scan for modified pages on whether the * object has been flushed yet. The OBJ_WRITEABLE flag * is not cleared simply by protecting pages off. */ - if ((bp->b_flags & B_VMIO) && - ((object = bp->b_pages[0]->object)->flags & (OBJ_WRITEABLE|OBJ_CLEANING))) { + + if ((bp->b_flags & B_VMIO) == 0) + return; + + object = bp->b_pages[0]->object; + + if ((object->flags & OBJ_WRITEABLE) && !(object->flags & OBJ_MIGHTBEDIRTY)) + printf("Warning: object %p writeable but not mightbedirty\n", object); + if (!(object->flags & OBJ_WRITEABLE) && (object->flags & OBJ_MIGHTBEDIRTY)) + printf("Warning: object %p mightbedirty but not writeable\n", object); + + if (object->flags & (OBJ_MIGHTBEDIRTY|OBJ_CLEANING)) { /* * test the pages to see if they have been modified directly * by users through the VM system. @@ -1410,7 +1685,15 @@ getblk(struct vnode * vp, daddr_t blkno, int size, int slpflag, int slptimeo) s = splbio(); loop: - if (numfreebuffers < lofreebuffers) { + /* + * Block if we are low on buffers. The syncer is allowed more + * buffers in order to avoid a deadlock. 
+ */ + if (curproc == updateproc && numfreebuffers == 0) { + needsbuffer |= VFS_BIO_NEED_ANY; + tsleep(&needsbuffer, (PRIBIO + 4) | slpflag, "newbuf", + slptimeo); + } else if (curproc != updateproc && numfreebuffers < lofreebuffers) { waitfreebuffers(slpflag, slptimeo); } @@ -1655,6 +1938,9 @@ allocbuf(struct buf *bp, int size) free(bp->b_data, M_BIOBUF); bufspace -= bp->b_bufsize; bufmallocspace -= bp->b_bufsize; + runningbufspace -= bp->b_bufsize; + if (bp->b_bufsize) + bufspacewakeup(); bp->b_data = bp->b_kvabase; bp->b_bufsize = 0; bp->b_bcount = 0; @@ -1683,6 +1969,7 @@ allocbuf(struct buf *bp, int size) bp->b_flags |= B_MALLOC; bufspace += mbsize; bufmallocspace += mbsize; + runningbufspace += bp->b_bufsize; return 1; } #endif @@ -1699,6 +1986,9 @@ allocbuf(struct buf *bp, int size) bp->b_data = bp->b_kvabase; bufspace -= bp->b_bufsize; bufmallocspace -= bp->b_bufsize; + runningbufspace -= bp->b_bufsize; + if (bp->b_bufsize) + bufspacewakeup(); bp->b_bufsize = 0; bp->b_flags &= ~B_MALLOC; newbsize = round_page(newbsize); @@ -1862,6 +2152,9 @@ allocbuf(struct buf *bp, int size) if (bp->b_flags & B_VMIO) vmiospace += (newbsize - bp->b_bufsize); bufspace += (newbsize - bp->b_bufsize); + runningbufspace += (newbsize - bp->b_bufsize); + if (newbsize < bp->b_bufsize) + bufspacewakeup(); bp->b_bufsize = newbsize; bp->b_bcount = size; return 1; @@ -1909,18 +2202,9 @@ biodone(register struct buf * bp) s = splbio(); -#if !defined(MAX_PERF) - if (!(bp->b_flags & B_BUSY)) - panic("biodone: buffer not busy"); -#endif + KASSERT((bp->b_flags & B_BUSY), ("biodone: bp %p not busy", bp)); + KASSERT(!(bp->b_flags & B_DONE), ("biodone: bp %p already done", bp)); - if (bp->b_flags & B_DONE) { - splx(s); -#if !defined(MAX_PERF) - printf("biodone: buffer already done\n"); -#endif - return; - } bp->b_flags |= B_DONE; if (bp->b_flags & B_FREEBUF) { diff --git a/sys/kern/vfs_cluster.c b/sys/kern/vfs_cluster.c index 27e9167d83cf..f7bd95e2947e 100644 --- a/sys/kern/vfs_cluster.c +++ b/sys/kern/vfs_cluster.c @@ -33,7 +33,7 @@ * SUCH DAMAGE. * * @(#)vfs_cluster.c 8.7 (Berkeley) 2/13/94 - * $Id: vfs_cluster.c,v 1.78 1999/01/21 08:29:05 dillon Exp $ + * $Id: vfs_cluster.c,v 1.79 1999/01/27 21:49:58 dillon Exp $ */ #include "opt_debug_cluster.h" @@ -778,8 +778,8 @@ cluster_wbuild(vp, size, start_lbn, len) bp->b_bufsize += size; s = splbio(); - --numdirtybuffers; - tbp->b_flags &= ~(B_READ | B_DONE | B_ERROR | B_DELWRI); + bundirty(tbp); + tbp->b_flags &= ~(B_READ | B_DONE | B_ERROR); tbp->b_flags |= B_ASYNC; reassignbuf(tbp, tbp->b_vp); /* put on clean list */ ++tbp->b_vp->v_numoutput; diff --git a/sys/kern/vfs_export.c b/sys/kern/vfs_export.c index d718a3f95d18..97edbdb430cf 100644 --- a/sys/kern/vfs_export.c +++ b/sys/kern/vfs_export.c @@ -36,7 +36,7 @@ * SUCH DAMAGE. 
* * @(#)vfs_subr.c 8.31 (Berkeley) 5/26/95 - * $Id: vfs_subr.c,v 1.187 1999/02/19 17:36:58 dillon Exp $ + * $Id: vfs_subr.c,v 1.188 1999/02/25 05:22:29 dillon Exp $ */ /* @@ -901,7 +901,7 @@ vn_syncer_add_to_worklist(struct vnode *vp, int delay) splx(s); } -static struct proc *updateproc; +struct proc *updateproc; static void sched_sync __P((void)); static const struct kproc_desc up_kp = { "syncer", @@ -937,11 +937,19 @@ sched_sync(void) splx(s); while ((vp = LIST_FIRST(slp)) != NULL) { - vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, p); - (void) VOP_FSYNC(vp, p->p_ucred, MNT_LAZY, p); - VOP_UNLOCK(vp, 0, p); + if (VOP_ISLOCKED(vp) == 0) { + vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, p); + (void) VOP_FSYNC(vp, p->p_ucred, MNT_LAZY, p); + VOP_UNLOCK(vp, 0, p); + } s = splbio(); if (LIST_FIRST(slp) == vp) { + /* + * Note: v_tag VT_VFS vps can remain on the + * worklist too with no dirty blocks, but + * since sync_fsync() moves it to a different + * slot we are safe. + */ if (TAILQ_EMPTY(&vp->v_dirtyblkhd) && vp->v_type != VBLK) panic("sched_sync: fsync failed vp %p tag %d", vp, vp->v_tag); @@ -1063,7 +1071,6 @@ reassignbuf(bp, newvp) register struct vnode *newvp; { struct buflists *listheadp; - struct vnode *oldvp; int delay; int s; @@ -1086,14 +1093,16 @@ reassignbuf(bp, newvp) * Delete from old vnode list, if on one. */ if (bp->b_xflags & (B_VNDIRTY|B_VNCLEAN)) { - oldvp = bp->b_vp; if (bp->b_xflags & B_VNDIRTY) - listheadp = &oldvp->v_dirtyblkhd; + listheadp = &bp->b_vp->v_dirtyblkhd; else - listheadp = &oldvp->v_cleanblkhd; + listheadp = &bp->b_vp->v_cleanblkhd; TAILQ_REMOVE(listheadp, bp, b_vnbufs); bp->b_xflags &= ~(B_VNDIRTY|B_VNCLEAN); - vdrop(oldvp); + if (bp->b_vp != newvp) { + vdrop(bp->b_vp); + bp->b_vp = NULL; /* for clarification */ + } } /* * If dirty, put on list of dirty buffers; otherwise insert onto list @@ -1145,8 +1154,10 @@ reassignbuf(bp, newvp) LIST_REMOVE(newvp, v_synclist); } } - bp->b_vp = newvp; - vhold(bp->b_vp); + if (bp->b_vp != newvp) { + bp->b_vp = newvp; + vhold(bp->b_vp); + } splx(s); } diff --git a/sys/kern/vfs_subr.c b/sys/kern/vfs_subr.c index d718a3f95d18..97edbdb430cf 100644 --- a/sys/kern/vfs_subr.c +++ b/sys/kern/vfs_subr.c @@ -36,7 +36,7 @@ * SUCH DAMAGE. * * @(#)vfs_subr.c 8.31 (Berkeley) 5/26/95 - * $Id: vfs_subr.c,v 1.187 1999/02/19 17:36:58 dillon Exp $ + * $Id: vfs_subr.c,v 1.188 1999/02/25 05:22:29 dillon Exp $ */ /* @@ -901,7 +901,7 @@ vn_syncer_add_to_worklist(struct vnode *vp, int delay) splx(s); } -static struct proc *updateproc; +struct proc *updateproc; static void sched_sync __P((void)); static const struct kproc_desc up_kp = { "syncer", @@ -937,11 +937,19 @@ sched_sync(void) splx(s); while ((vp = LIST_FIRST(slp)) != NULL) { - vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, p); - (void) VOP_FSYNC(vp, p->p_ucred, MNT_LAZY, p); - VOP_UNLOCK(vp, 0, p); + if (VOP_ISLOCKED(vp) == 0) { + vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, p); + (void) VOP_FSYNC(vp, p->p_ucred, MNT_LAZY, p); + VOP_UNLOCK(vp, 0, p); + } s = splbio(); if (LIST_FIRST(slp) == vp) { + /* + * Note: v_tag VT_VFS vps can remain on the + * worklist too with no dirty blocks, but + * since sync_fsync() moves it to a different + * slot we are safe. + */ if (TAILQ_EMPTY(&vp->v_dirtyblkhd) && vp->v_type != VBLK) panic("sched_sync: fsync failed vp %p tag %d", vp, vp->v_tag); @@ -1063,7 +1071,6 @@ reassignbuf(bp, newvp) register struct vnode *newvp; { struct buflists *listheadp; - struct vnode *oldvp; int delay; int s; @@ -1086,14 +1093,16 @@ reassignbuf(bp, newvp) * Delete from old vnode list, if on one. 
*/ if (bp->b_xflags & (B_VNDIRTY|B_VNCLEAN)) { - oldvp = bp->b_vp; if (bp->b_xflags & B_VNDIRTY) - listheadp = &oldvp->v_dirtyblkhd; + listheadp = &bp->b_vp->v_dirtyblkhd; else - listheadp = &oldvp->v_cleanblkhd; + listheadp = &bp->b_vp->v_cleanblkhd; TAILQ_REMOVE(listheadp, bp, b_vnbufs); bp->b_xflags &= ~(B_VNDIRTY|B_VNCLEAN); - vdrop(oldvp); + if (bp->b_vp != newvp) { + vdrop(bp->b_vp); + bp->b_vp = NULL; /* for clarification */ + } } /* * If dirty, put on list of dirty buffers; otherwise insert onto list @@ -1145,8 +1154,10 @@ reassignbuf(bp, newvp) LIST_REMOVE(newvp, v_synclist); } } - bp->b_vp = newvp; - vhold(bp->b_vp); + if (bp->b_vp != newvp) { + bp->b_vp = newvp; + vhold(bp->b_vp); + } splx(s); } diff --git a/sys/nfs/nfs_bio.c b/sys/nfs/nfs_bio.c index fb437a530430..2fb535399d3a 100644 --- a/sys/nfs/nfs_bio.c +++ b/sys/nfs/nfs_bio.c @@ -34,7 +34,7 @@ * SUCH DAMAGE. * * @(#)nfs_bio.c 8.9 (Berkeley) 3/30/95 - * $Id: nfs_bio.c,v 1.65 1998/12/14 17:51:30 dt Exp $ + * $Id: nfs_bio.c,v 1.66 1999/01/21 08:29:07 dillon Exp $ */ @@ -418,6 +418,7 @@ nfs_bioread(vp, uio, ioflag, cred, getpages) return (EINTR); if ((rabp->b_flags & (B_CACHE|B_DELWRI)) == 0) { rabp->b_flags |= (B_READ | B_ASYNC); + rabp->b_flags &= ~B_DONE; vfs_busy_pages(rabp, 0); if (nfs_asyncio(rabp, cred)) { rabp->b_flags |= B_INVAL|B_ERROR; @@ -513,6 +514,7 @@ again: return (EINTR); if ((bp->b_flags & B_CACHE) == 0) { bp->b_flags |= B_READ; + bp->b_flags &= ~B_DONE; vfs_busy_pages(bp, 0); error = nfs_doio(bp, cred, p); if (error) { @@ -537,6 +539,7 @@ again: return (EINTR); if ((bp->b_flags & B_CACHE) == 0) { bp->b_flags |= B_READ; + bp->b_flags &= ~B_DONE; vfs_busy_pages(bp, 0); error = nfs_doio(bp, cred, p); if (error) { @@ -560,6 +563,7 @@ again: return (EINTR); if ((bp->b_flags & B_DONE) == 0) { bp->b_flags |= B_READ; + bp->b_flags &= ~B_DONE; vfs_busy_pages(bp, 0); error = nfs_doio(bp, cred, p); if (error == 0 && (bp->b_flags & B_INVAL)) @@ -591,6 +595,7 @@ again: if (rabp) { if ((rabp->b_flags & (B_CACHE|B_DELWRI)) == 0) { rabp->b_flags |= (B_READ | B_ASYNC); + rabp->b_flags &= ~B_DONE; vfs_busy_pages(rabp, 0); if (nfs_asyncio(rabp, cred)) { rabp->b_flags |= B_INVAL|B_ERROR; @@ -840,6 +845,12 @@ again: bp->b_dirtyoff = on; bp->b_dirtyend = on + n; } + /* + * To avoid code complexity, we may have to throw away + * previously valid ranges when merging the new dirty range + * into the valid range. As long as we do not *ADD* an + * invalid valid range, we are ok. + */ if (bp->b_validend == 0 || bp->b_validend < bp->b_dirtyoff || bp->b_validoff > bp->b_dirtyend) { bp->b_validoff = bp->b_dirtyoff; @@ -1004,7 +1015,7 @@ nfs_asyncio(bp, cred) if (nfs_numasync == 0) return (EIO); - + nmp = VFSTONFS(bp->b_vp->v_mount); again: if (nmp->nm_flag & NFSMNT_INT) @@ -1109,12 +1120,12 @@ again: */ int nfs_doio(bp, cr, p) - register struct buf *bp; + struct buf *bp; struct ucred *cr; struct proc *p; { - register struct uio *uiop; - register struct vnode *vp; + struct uio *uiop; + struct vnode *vp; struct nfsnode *np; struct nfsmount *nmp; int error = 0, diff, len, iomode, must_commit = 0; @@ -1130,6 +1141,8 @@ nfs_doio(bp, cr, p) uiop->uio_segflg = UIO_SYSSPACE; uiop->uio_procp = p; + KASSERT(!(bp->b_flags & B_DONE), ("nfs_doio: bp %p already marked done", bp)); + /* * Historically, paging was done with physio, but no more. 
*/ @@ -1236,10 +1249,12 @@ nfs_doio(bp, cr, p) io.iov_base = (char *)bp->b_data + bp->b_dirtyoff; uiop->uio_rw = UIO_WRITE; nfsstats.write_bios++; + if ((bp->b_flags & (B_ASYNC | B_NEEDCOMMIT | B_NOCACHE | B_CLUSTER)) == B_ASYNC) iomode = NFSV3WRITE_UNSTABLE; else iomode = NFSV3WRITE_FILESYNC; + bp->b_flags |= B_WRITEINPROG; error = nfs_writerpc(vp, uiop, cr, &iomode, &must_commit); if (!error && iomode == NFSV3WRITE_UNSTABLE) { @@ -1247,8 +1262,9 @@ nfs_doio(bp, cr, p) if (bp->b_dirtyoff == 0 && bp->b_dirtyend == bp->b_bufsize) bp->b_flags |= B_CLUSTEROK; - } else + } else { bp->b_flags &= ~B_NEEDCOMMIT; + } bp->b_flags &= ~B_WRITEINPROG; /* @@ -1265,31 +1281,30 @@ nfs_doio(bp, cr, p) * the B_DELWRI and B_NEEDCOMMIT flags. * * If the buffer is marked B_PAGING, it does not reside on - * the vp's paging queues so we do not ( and cannot ) reassign - * it. XXX numdirtybuffers should be integrated into - * reassignbuf() call. + * the vp's paging queues so we cannot call bdirty(). The + * bp in this case is not an NFS cache block so we should + * be safe. XXX */ if (error == EINTR || (!error && (bp->b_flags & B_NEEDCOMMIT))) { int s; + s = splbio(); bp->b_flags &= ~(B_INVAL|B_NOCACHE); if ((bp->b_flags & B_PAGING) == 0) { - ++numdirtybuffers; - bp->b_flags |= B_DELWRI; - s = splbio(); - reassignbuf(bp, vp); - splx(s); + bdirty(bp); + bp->b_flags &= ~B_DONE; } if ((bp->b_flags & B_ASYNC) == 0) bp->b_flags |= B_EINTR; + splx(s); } else { - if (error) { - bp->b_flags |= B_ERROR; - bp->b_error = np->n_error = error; - np->n_flag |= NWRITEERR; - } - bp->b_dirtyoff = bp->b_dirtyend = 0; + if (error) { + bp->b_flags |= B_ERROR; + bp->b_error = np->n_error = error; + np->n_flag |= NWRITEERR; + } + bp->b_dirtyoff = bp->b_dirtyend = 0; } } else { bp->b_resid = 0; @@ -1299,7 +1314,7 @@ nfs_doio(bp, cr, p) } bp->b_resid = uiop->uio_resid; if (must_commit) - nfs_clearcommit(vp->v_mount); + nfs_clearcommit(vp->v_mount); biodone(bp); return (error); } diff --git a/sys/nfs/nfs_vnops.c b/sys/nfs/nfs_vnops.c index 4afb4697c56b..a92bb2295811 100644 --- a/sys/nfs/nfs_vnops.c +++ b/sys/nfs/nfs_vnops.c @@ -34,7 +34,7 @@ * SUCH DAMAGE. * * @(#)nfs_vnops.c 8.16 (Berkeley) 5/27/95 - * $Id: nfs_vnops.c,v 1.122 1999/02/13 09:47:30 dillon Exp $ + * $Id: nfs_vnops.c,v 1.123 1999/02/16 10:49:54 dfr Exp $ */ @@ -2648,6 +2648,9 @@ nfs_strategy(ap) struct proc *p; int error = 0; + KASSERT(!(bp->b_flags & B_DONE), ("nfs_strategy: buffer %p unexpectedly marked B_DONE", bp)); + KASSERT((bp->b_flags & B_BUSY), ("nfs_strategy: buffer %p not B_BUSY", bp)); + if (bp->b_flags & B_PHYS) panic("nfs physio"); @@ -2797,6 +2800,10 @@ again: /* * Work out if all buffers are using the same cred * so we can deal with them all with one commit. + * + * NOTE: we are not clearing B_DONE here, so we have + * to do it later on in this routine if we intend to + * initiate I/O on the bp. */ if (wcred == NULL) wcred = bp->b_wcred; @@ -2804,6 +2811,14 @@ again: wcred = NOCRED; bp->b_flags |= (B_BUSY | B_WRITEINPROG); vfs_busy_pages(bp, 1); + + /* + * bp is protected by being B_BUSY, but nbp is not + * and vfs_busy_pages() may sleep. We have to + * recalculate nbp. 
+ */ + nbp = TAILQ_NEXT(bp, b_vnbufs); + /* * A list of these buffers is kept so that the * second loop knows which buffers have actually @@ -2849,6 +2864,7 @@ again: if (retv == NFSERR_STALEWRITEVERF) nfs_clearcommit(vp->v_mount); + /* * Now, either mark the blocks I/O done or mark the * blocks dirty, depending on whether the commit @@ -2858,23 +2874,27 @@ again: bp = bvec[i]; bp->b_flags &= ~(B_NEEDCOMMIT | B_WRITEINPROG); if (retv) { - vfs_unbusy_pages(bp); - brelse(bp); + /* + * Error, leave B_DELWRI intact + */ + vfs_unbusy_pages(bp); + brelse(bp); } else { - s = splbio(); /* XXX check this positionning */ - vp->v_numoutput++; - bp->b_flags |= B_ASYNC; - if (bp->b_flags & B_DELWRI) { - --numdirtybuffers; - if (needsbuffer) { - vfs_bio_need_satisfy(); - } - } - bp->b_flags &= ~(B_READ|B_DONE|B_ERROR|B_DELWRI); - bp->b_dirtyoff = bp->b_dirtyend = 0; - reassignbuf(bp, vp); - splx(s); - biodone(bp); + /* + * Success, remove B_DELWRI ( bundirty() ). + * + * b_dirtyoff/b_dirtyend seem to be NFS + * specific. We should probably move that + * into bundirty(). XXX + */ + s = splbio(); + vp->v_numoutput++; + bp->b_flags |= B_ASYNC; + bundirty(bp); + bp->b_flags &= ~(B_READ|B_DONE|B_ERROR); + bp->b_dirtyoff = bp->b_dirtyend = 0; + splx(s); + biodone(bp); } } } @@ -2999,6 +3019,8 @@ nfs_print(ap) /* * Just call nfs_writebp() with the force argument set to 1. + * + * NOTE: B_DONE may or may not be set in a_bp on call. */ static int nfs_bwrite(ap) @@ -3020,26 +3042,24 @@ nfs_writebp(bp, force) int force; { int s; - register int oldflags = bp->b_flags, retv = 1; + int oldflags = bp->b_flags; + int retv = 1; off_t off; if(!(bp->b_flags & B_BUSY)) panic("bwrite: buffer is not busy???"); if (bp->b_flags & B_INVAL) - bp->b_flags |= B_INVAL | B_NOCACHE; + bp->b_flags |= B_NOCACHE; - if (bp->b_flags & B_DELWRI) { - --numdirtybuffers; - if (needsbuffer) - vfs_bio_need_satisfy(); - } - s = splbio(); /* XXX check if needed */ - bp->b_flags &= ~(B_READ|B_DONE|B_ERROR|B_DELWRI); + /* + * XXX we bundirty() the bp here. Shouldn't we do it later after + * the I/O has completed?? + */ - if ((oldflags & (B_ASYNC|B_DELWRI)) == (B_ASYNC|B_DELWRI)) { - reassignbuf(bp, bp->b_vp); - } + s = splbio(); + bundirty(bp); + bp->b_flags &= ~(B_READ|B_DONE|B_ERROR); bp->b_vp->v_numoutput++; curproc->p_stats->p_ru.ru_oublock++; @@ -3061,8 +3081,9 @@ nfs_writebp(bp, force) bp->b_dirtyoff = bp->b_dirtyend = 0; bp->b_flags &= ~B_NEEDCOMMIT; biodone(bp); - } else if (retv == NFSERR_STALEWRITEVERF) + } else if (retv == NFSERR_STALEWRITEVERF) { nfs_clearcommit(bp->b_vp->v_mount); + } } if (retv) { if (force) diff --git a/sys/nfsclient/nfs_bio.c b/sys/nfsclient/nfs_bio.c index fb437a530430..2fb535399d3a 100644 --- a/sys/nfsclient/nfs_bio.c +++ b/sys/nfsclient/nfs_bio.c @@ -34,7 +34,7 @@ * SUCH DAMAGE. 
* * @(#)nfs_bio.c 8.9 (Berkeley) 3/30/95 - * $Id: nfs_bio.c,v 1.65 1998/12/14 17:51:30 dt Exp $ + * $Id: nfs_bio.c,v 1.66 1999/01/21 08:29:07 dillon Exp $ */ @@ -418,6 +418,7 @@ nfs_bioread(vp, uio, ioflag, cred, getpages) return (EINTR); if ((rabp->b_flags & (B_CACHE|B_DELWRI)) == 0) { rabp->b_flags |= (B_READ | B_ASYNC); + rabp->b_flags &= ~B_DONE; vfs_busy_pages(rabp, 0); if (nfs_asyncio(rabp, cred)) { rabp->b_flags |= B_INVAL|B_ERROR; @@ -513,6 +514,7 @@ again: return (EINTR); if ((bp->b_flags & B_CACHE) == 0) { bp->b_flags |= B_READ; + bp->b_flags &= ~B_DONE; vfs_busy_pages(bp, 0); error = nfs_doio(bp, cred, p); if (error) { @@ -537,6 +539,7 @@ again: return (EINTR); if ((bp->b_flags & B_CACHE) == 0) { bp->b_flags |= B_READ; + bp->b_flags &= ~B_DONE; vfs_busy_pages(bp, 0); error = nfs_doio(bp, cred, p); if (error) { @@ -560,6 +563,7 @@ again: return (EINTR); if ((bp->b_flags & B_DONE) == 0) { bp->b_flags |= B_READ; + bp->b_flags &= ~B_DONE; vfs_busy_pages(bp, 0); error = nfs_doio(bp, cred, p); if (error == 0 && (bp->b_flags & B_INVAL)) @@ -591,6 +595,7 @@ again: if (rabp) { if ((rabp->b_flags & (B_CACHE|B_DELWRI)) == 0) { rabp->b_flags |= (B_READ | B_ASYNC); + rabp->b_flags &= ~B_DONE; vfs_busy_pages(rabp, 0); if (nfs_asyncio(rabp, cred)) { rabp->b_flags |= B_INVAL|B_ERROR; @@ -840,6 +845,12 @@ again: bp->b_dirtyoff = on; bp->b_dirtyend = on + n; } + /* + * To avoid code complexity, we may have to throw away + * previously valid ranges when merging the new dirty range + * into the valid range. As long as we do not *ADD* an + * invalid valid range, we are ok. + */ if (bp->b_validend == 0 || bp->b_validend < bp->b_dirtyoff || bp->b_validoff > bp->b_dirtyend) { bp->b_validoff = bp->b_dirtyoff; @@ -1004,7 +1015,7 @@ nfs_asyncio(bp, cred) if (nfs_numasync == 0) return (EIO); - + nmp = VFSTONFS(bp->b_vp->v_mount); again: if (nmp->nm_flag & NFSMNT_INT) @@ -1109,12 +1120,12 @@ again: */ int nfs_doio(bp, cr, p) - register struct buf *bp; + struct buf *bp; struct ucred *cr; struct proc *p; { - register struct uio *uiop; - register struct vnode *vp; + struct uio *uiop; + struct vnode *vp; struct nfsnode *np; struct nfsmount *nmp; int error = 0, diff, len, iomode, must_commit = 0; @@ -1130,6 +1141,8 @@ nfs_doio(bp, cr, p) uiop->uio_segflg = UIO_SYSSPACE; uiop->uio_procp = p; + KASSERT(!(bp->b_flags & B_DONE), ("nfs_doio: bp %p already marked done", bp)); + /* * Historically, paging was done with physio, but no more. */ @@ -1236,10 +1249,12 @@ nfs_doio(bp, cr, p) io.iov_base = (char *)bp->b_data + bp->b_dirtyoff; uiop->uio_rw = UIO_WRITE; nfsstats.write_bios++; + if ((bp->b_flags & (B_ASYNC | B_NEEDCOMMIT | B_NOCACHE | B_CLUSTER)) == B_ASYNC) iomode = NFSV3WRITE_UNSTABLE; else iomode = NFSV3WRITE_FILESYNC; + bp->b_flags |= B_WRITEINPROG; error = nfs_writerpc(vp, uiop, cr, &iomode, &must_commit); if (!error && iomode == NFSV3WRITE_UNSTABLE) { @@ -1247,8 +1262,9 @@ nfs_doio(bp, cr, p) if (bp->b_dirtyoff == 0 && bp->b_dirtyend == bp->b_bufsize) bp->b_flags |= B_CLUSTEROK; - } else + } else { bp->b_flags &= ~B_NEEDCOMMIT; + } bp->b_flags &= ~B_WRITEINPROG; /* @@ -1265,31 +1281,30 @@ nfs_doio(bp, cr, p) * the B_DELWRI and B_NEEDCOMMIT flags. * * If the buffer is marked B_PAGING, it does not reside on - * the vp's paging queues so we do not ( and cannot ) reassign - * it. XXX numdirtybuffers should be integrated into - * reassignbuf() call. + * the vp's paging queues so we cannot call bdirty(). The + * bp in this case is not an NFS cache block so we should + * be safe. 
XXX */ if (error == EINTR || (!error && (bp->b_flags & B_NEEDCOMMIT))) { int s; + s = splbio(); bp->b_flags &= ~(B_INVAL|B_NOCACHE); if ((bp->b_flags & B_PAGING) == 0) { - ++numdirtybuffers; - bp->b_flags |= B_DELWRI; - s = splbio(); - reassignbuf(bp, vp); - splx(s); + bdirty(bp); + bp->b_flags &= ~B_DONE; } if ((bp->b_flags & B_ASYNC) == 0) bp->b_flags |= B_EINTR; + splx(s); } else { - if (error) { - bp->b_flags |= B_ERROR; - bp->b_error = np->n_error = error; - np->n_flag |= NWRITEERR; - } - bp->b_dirtyoff = bp->b_dirtyend = 0; + if (error) { + bp->b_flags |= B_ERROR; + bp->b_error = np->n_error = error; + np->n_flag |= NWRITEERR; + } + bp->b_dirtyoff = bp->b_dirtyend = 0; } } else { bp->b_resid = 0; @@ -1299,7 +1314,7 @@ nfs_doio(bp, cr, p) } bp->b_resid = uiop->uio_resid; if (must_commit) - nfs_clearcommit(vp->v_mount); + nfs_clearcommit(vp->v_mount); biodone(bp); return (error); } diff --git a/sys/nfsclient/nfs_vnops.c b/sys/nfsclient/nfs_vnops.c index 4afb4697c56b..a92bb2295811 100644 --- a/sys/nfsclient/nfs_vnops.c +++ b/sys/nfsclient/nfs_vnops.c @@ -34,7 +34,7 @@ * SUCH DAMAGE. * * @(#)nfs_vnops.c 8.16 (Berkeley) 5/27/95 - * $Id: nfs_vnops.c,v 1.122 1999/02/13 09:47:30 dillon Exp $ + * $Id: nfs_vnops.c,v 1.123 1999/02/16 10:49:54 dfr Exp $ */ @@ -2648,6 +2648,9 @@ nfs_strategy(ap) struct proc *p; int error = 0; + KASSERT(!(bp->b_flags & B_DONE), ("nfs_strategy: buffer %p unexpectedly marked B_DONE", bp)); + KASSERT((bp->b_flags & B_BUSY), ("nfs_strategy: buffer %p not B_BUSY", bp)); + if (bp->b_flags & B_PHYS) panic("nfs physio"); @@ -2797,6 +2800,10 @@ again: /* * Work out if all buffers are using the same cred * so we can deal with them all with one commit. + * + * NOTE: we are not clearing B_DONE here, so we have + * to do it later on in this routine if we intend to + * initiate I/O on the bp. */ if (wcred == NULL) wcred = bp->b_wcred; @@ -2804,6 +2811,14 @@ again: wcred = NOCRED; bp->b_flags |= (B_BUSY | B_WRITEINPROG); vfs_busy_pages(bp, 1); + + /* + * bp is protected by being B_BUSY, but nbp is not + * and vfs_busy_pages() may sleep. We have to + * recalculate nbp. + */ + nbp = TAILQ_NEXT(bp, b_vnbufs); + /* * A list of these buffers is kept so that the * second loop knows which buffers have actually @@ -2849,6 +2864,7 @@ again: if (retv == NFSERR_STALEWRITEVERF) nfs_clearcommit(vp->v_mount); + /* * Now, either mark the blocks I/O done or mark the * blocks dirty, depending on whether the commit @@ -2858,23 +2874,27 @@ again: bp = bvec[i]; bp->b_flags &= ~(B_NEEDCOMMIT | B_WRITEINPROG); if (retv) { - vfs_unbusy_pages(bp); - brelse(bp); + /* + * Error, leave B_DELWRI intact + */ + vfs_unbusy_pages(bp); + brelse(bp); } else { - s = splbio(); /* XXX check this positionning */ - vp->v_numoutput++; - bp->b_flags |= B_ASYNC; - if (bp->b_flags & B_DELWRI) { - --numdirtybuffers; - if (needsbuffer) { - vfs_bio_need_satisfy(); - } - } - bp->b_flags &= ~(B_READ|B_DONE|B_ERROR|B_DELWRI); - bp->b_dirtyoff = bp->b_dirtyend = 0; - reassignbuf(bp, vp); - splx(s); - biodone(bp); + /* + * Success, remove B_DELWRI ( bundirty() ). + * + * b_dirtyoff/b_dirtyend seem to be NFS + * specific. We should probably move that + * into bundirty(). XXX + */ + s = splbio(); + vp->v_numoutput++; + bp->b_flags |= B_ASYNC; + bundirty(bp); + bp->b_flags &= ~(B_READ|B_DONE|B_ERROR); + bp->b_dirtyoff = bp->b_dirtyend = 0; + splx(s); + biodone(bp); } } } @@ -2999,6 +3019,8 @@ nfs_print(ap) /* * Just call nfs_writebp() with the force argument set to 1. + * + * NOTE: B_DONE may or may not be set in a_bp on call. 
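[Editorial aside, not part of the patch] Because biodone() now asserts that B_DONE is clear, any caller that re-issues I/O on a buffer returned from a previous I/O must clear the flag first. A minimal sketch of that discipline, modeled on the nfs_bioread() hunks earlier in this patch (bp, cred and p are as in that function):

	/*
	 * Illustrative sketch: re-arm a cached buffer for a fresh read.
	 * B_DONE must be cleared before the I/O is started or the new
	 * KASSERTs in biodone()/nfs_doio() will fire.
	 */
	bp->b_flags |= B_READ;
	bp->b_flags &= ~B_DONE;
	vfs_busy_pages(bp, 0);
	error = nfs_doio(bp, cred, p);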
*/ static int nfs_bwrite(ap) @@ -3020,26 +3042,24 @@ nfs_writebp(bp, force) int force; { int s; - register int oldflags = bp->b_flags, retv = 1; + int oldflags = bp->b_flags; + int retv = 1; off_t off; if(!(bp->b_flags & B_BUSY)) panic("bwrite: buffer is not busy???"); if (bp->b_flags & B_INVAL) - bp->b_flags |= B_INVAL | B_NOCACHE; + bp->b_flags |= B_NOCACHE; - if (bp->b_flags & B_DELWRI) { - --numdirtybuffers; - if (needsbuffer) - vfs_bio_need_satisfy(); - } - s = splbio(); /* XXX check if needed */ - bp->b_flags &= ~(B_READ|B_DONE|B_ERROR|B_DELWRI); + /* + * XXX we bundirty() the bp here. Shouldn't we do it later after + * the I/O has completed?? + */ - if ((oldflags & (B_ASYNC|B_DELWRI)) == (B_ASYNC|B_DELWRI)) { - reassignbuf(bp, bp->b_vp); - } + s = splbio(); + bundirty(bp); + bp->b_flags &= ~(B_READ|B_DONE|B_ERROR); bp->b_vp->v_numoutput++; curproc->p_stats->p_ru.ru_oublock++; @@ -3061,8 +3081,9 @@ nfs_writebp(bp, force) bp->b_dirtyoff = bp->b_dirtyend = 0; bp->b_flags &= ~B_NEEDCOMMIT; biodone(bp); - } else if (retv == NFSERR_STALEWRITEVERF) + } else if (retv == NFSERR_STALEWRITEVERF) { nfs_clearcommit(bp->b_vp->v_mount); + } } if (retv) { if (force) diff --git a/sys/sys/bio.h b/sys/sys/bio.h index 5ce4039ce904..d2ce212b4d12 100644 --- a/sys/sys/bio.h +++ b/sys/sys/bio.h @@ -36,7 +36,7 @@ * SUCH DAMAGE. * * @(#)buf.h 8.9 (Berkeley) 3/30/95 - * $Id: buf.h,v 1.63 1999/01/21 13:41:12 peter Exp $ + * $Id: buf.h,v 1.64 1999/03/02 04:04:28 mckusick Exp $ */ #ifndef _SYS_BUF_H_ @@ -127,6 +127,10 @@ struct buf { struct vm_page *b_pages[btoc(MAXPHYS)]; int b_npages; struct workhead b_dep; /* List of filesystem dependencies. */ + struct chain_info { /* buffer chaining */ + struct buf *parent; + int count; + } b_chain; }; #define b_spc b_pager.pg_spc @@ -184,12 +188,12 @@ struct buf { #define B_RAM 0x10000000 /* Read ahead mark (flag) */ #define B_VMIO 0x20000000 /* VMIO flag */ #define B_CLUSTER 0x40000000 /* pagein op, so swap() can count it */ -#define B_AVAIL1 0x80000000 /* Available flag */ +#define B_AUTOCHAINDONE 0x80000000 /* Available flag */ -#define PRINT_BUF_FLAGS "\20\40avail1\37cluster\36vmio\35ram\34ordered" \ +#define PRINT_BUF_FLAGS "\20\40autochain\37cluster\36vmio\35ram\34ordered" \ "\33paging\32xxx\31writeinprog\30wanted\27relbuf\26dirty" \ "\25read\24raw\23phys\22clusterok\21malloc\20nocache" \ - "\17locked\16inval\15avail2\14error\13eintr\12done\11freebuf" \ + "\17locked\16inval\15scanned\14error\13eintr\12done\11freebuf" \ "\10delwri\7call\6cache\5busy\4bad\3async\2needcommit\1age" /* @@ -315,7 +319,6 @@ extern char *buffers; /* The buffer contents. */ extern int bufpages; /* Number of memory pages in the buffer pool. */ extern struct buf *swbuf; /* Swap I/O buffer headers. */ extern int nswbuf; /* Number of swap I/O buffer headers. 
*/ -extern int needsbuffer, numdirtybuffers; extern TAILQ_HEAD(swqueue, buf) bswlist; extern TAILQ_HEAD(bqueues, buf) bufqueues[BUFFER_QUEUES]; @@ -331,6 +334,7 @@ int bwrite __P((struct buf *)); void bdwrite __P((struct buf *)); void bawrite __P((struct buf *)); void bdirty __P((struct buf *)); +void bundirty __P((struct buf *)); int bowrite __P((struct buf *)); void brelse __P((struct buf *)); void bqrelse __P((struct buf *)); @@ -367,7 +371,6 @@ int allocbuf __P((struct buf *bp, int size)); void reassignbuf __P((struct buf *, struct vnode *)); void pbreassignbuf __P((struct buf *, struct vnode *)); struct buf *trypbuf __P((int *)); -void vfs_bio_need_satisfy __P((void)); #endif /* KERNEL */ #endif /* !_SYS_BUF_H_ */ diff --git a/sys/sys/buf.h b/sys/sys/buf.h index 5ce4039ce904..d2ce212b4d12 100644 --- a/sys/sys/buf.h +++ b/sys/sys/buf.h @@ -36,7 +36,7 @@ * SUCH DAMAGE. * * @(#)buf.h 8.9 (Berkeley) 3/30/95 - * $Id: buf.h,v 1.63 1999/01/21 13:41:12 peter Exp $ + * $Id: buf.h,v 1.64 1999/03/02 04:04:28 mckusick Exp $ */ #ifndef _SYS_BUF_H_ @@ -127,6 +127,10 @@ struct buf { struct vm_page *b_pages[btoc(MAXPHYS)]; int b_npages; struct workhead b_dep; /* List of filesystem dependencies. */ + struct chain_info { /* buffer chaining */ + struct buf *parent; + int count; + } b_chain; }; #define b_spc b_pager.pg_spc @@ -184,12 +188,12 @@ struct buf { #define B_RAM 0x10000000 /* Read ahead mark (flag) */ #define B_VMIO 0x20000000 /* VMIO flag */ #define B_CLUSTER 0x40000000 /* pagein op, so swap() can count it */ -#define B_AVAIL1 0x80000000 /* Available flag */ +#define B_AUTOCHAINDONE 0x80000000 /* Available flag */ -#define PRINT_BUF_FLAGS "\20\40avail1\37cluster\36vmio\35ram\34ordered" \ +#define PRINT_BUF_FLAGS "\20\40autochain\37cluster\36vmio\35ram\34ordered" \ "\33paging\32xxx\31writeinprog\30wanted\27relbuf\26dirty" \ "\25read\24raw\23phys\22clusterok\21malloc\20nocache" \ - "\17locked\16inval\15avail2\14error\13eintr\12done\11freebuf" \ + "\17locked\16inval\15scanned\14error\13eintr\12done\11freebuf" \ "\10delwri\7call\6cache\5busy\4bad\3async\2needcommit\1age" /* @@ -315,7 +319,6 @@ extern char *buffers; /* The buffer contents. */ extern int bufpages; /* Number of memory pages in the buffer pool. */ extern struct buf *swbuf; /* Swap I/O buffer headers. */ extern int nswbuf; /* Number of swap I/O buffer headers. */ -extern int needsbuffer, numdirtybuffers; extern TAILQ_HEAD(swqueue, buf) bswlist; extern TAILQ_HEAD(bqueues, buf) bufqueues[BUFFER_QUEUES]; @@ -331,6 +334,7 @@ int bwrite __P((struct buf *)); void bdwrite __P((struct buf *)); void bawrite __P((struct buf *)); void bdirty __P((struct buf *)); +void bundirty __P((struct buf *)); int bowrite __P((struct buf *)); void brelse __P((struct buf *)); void bqrelse __P((struct buf *)); @@ -367,7 +371,6 @@ int allocbuf __P((struct buf *bp, int size)); void reassignbuf __P((struct buf *, struct vnode *)); void pbreassignbuf __P((struct buf *, struct vnode *)); struct buf *trypbuf __P((int *)); -void vfs_bio_need_satisfy __P((void)); #endif /* KERNEL */ #endif /* !_SYS_BUF_H_ */ diff --git a/sys/sys/proc.h b/sys/sys/proc.h index 6f51c57c4922..910760bf199c 100644 --- a/sys/sys/proc.h +++ b/sys/sys/proc.h @@ -36,7 +36,7 @@ * SUCH DAMAGE. 
 *
 * @(#)proc.h 8.15 (Berkeley) 5/19/95
- * $Id: proc.h,v 1.73 1999/03/03 18:15:29 julian Exp $
+ * $Id: proc.h,v 1.74 1999/03/05 16:38:12 bde Exp $
 */

 #ifndef _SYS_PROC_H_
@@ -262,10 +262,11 @@ struct proc {
 #define P_SWAPINREQ 0x80000 /* Swapin request due to wakeup */

 /* Marked a kernel thread */
+#define P_FLSINPROG 0x100000 /* dirty buffers flush is in progress */
 #define P_KTHREADP 0x200000 /* Process is really a kernel thread */

 #define P_NOCLDWAIT 0x400000 /* No zombies if child dies */
-
+#define P_DEADLKTREAT 0x800000 /* lock acquisition - deadlock treatment */

 /*
  * MOVE TO ucred.h?
@@ -336,7 +337,7 @@ extern struct timeval switchtime; /* Uptime at last context switch */
 LIST_HEAD(proclist, proc);
 extern struct proclist allproc; /* List of all processes. */
 extern struct proclist zombproc; /* List of zombie processes. */
-extern struct proc *initproc, *pageproc; /* Process slots for init, pager. */
+extern struct proc *initproc, *pageproc, *updateproc; /* Process slots for init, pager, update. */

 #define NQS 32 /* 32 run queues. */
 extern struct prochd qs[];
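
Reviewer note: the two sketches below are standalone user-space models written for this review; they are not code from the patch or from the kernel. The first shows the bookkeeping that bdirty()/bundirty() centralize, which is why the open-coded "--numdirtybuffers ... reassignbuf()" sequences disappear from nfs_doio(), nfs_flush() and nfs_writebp() above. The real kernel routines additionally run under splbio() and move the buffer between the vnode's clean and dirty lists, which this model omits. The model's main() also shows the convention, asserted by the new KASSERT in nfs_doio(), that B_DONE is cleared before a buffer is handed back in for I/O. The simplified struct buf, the counter and main() are illustrative assumptions; only the flag bits are taken from buf.h.

/*
 * Minimal user-space sketch (not the kernel code) of the accounting that
 * bdirty()/bundirty() centralize.  The flag values follow the B_DELWRI and
 * B_DONE bits listed in PRINT_BUF_FLAGS; everything else is a stand-in.
 */
#include <assert.h>

#define B_DELWRI 0x00000080 /* delayed write is pending */
#define B_DONE   0x00000200 /* I/O has completed */

struct buf {
	int b_flags;
};

static int numdirtybuffers;	/* count of B_DELWRI buffers */

/* Mark a buffer dirty exactly once, bumping the global count. */
static void
bdirty(struct buf *bp)
{
	if ((bp->b_flags & B_DELWRI) == 0) {
		bp->b_flags |= B_DELWRI;
		++numdirtybuffers;
	}
}

/* Undo a delayed write, dropping the global count exactly once. */
static void
bundirty(struct buf *bp)
{
	if (bp->b_flags & B_DELWRI) {
		bp->b_flags &= ~B_DELWRI;
		--numdirtybuffers;
	}
}

int
main(void)
{
	struct buf b = { 0 };

	bdirty(&b);		/* e.g. interrupted NFS write: keep the data dirty */
	assert(numdirtybuffers == 1);

	bundirty(&b);		/* e.g. successful commit: clear the delayed write */
	b.b_flags &= ~B_DONE;	/* and clear B_DONE before re-issuing I/O */
	assert(numdirtybuffers == 0);
	return (0);
}

The second sketch illustrates the nbp recomputation added to the nfs_flush() commit loop: because vfs_busy_pages() may sleep, the cached successor pointer is re-read after the call. The list element type and the fake_busy_pages() stand-in are hypothetical; only the re-read pattern corresponds to the patch.

/*
 * Sketch of the "recompute the next pointer after a possible sleep" fix.
 * Everything here is illustrative; <sys/queue.h> is the only real API used.
 */
#include <sys/queue.h>
#include <stdio.h>

struct xbuf {
	TAILQ_ENTRY(xbuf) b_vnbufs;
	int b_id;
};
TAILQ_HEAD(xbuflist, xbuf);

/* Stand-in for vfs_busy_pages(): another thread could modify the list here. */
static void
fake_busy_pages(struct xbuf *bp)
{
	(void)bp;
}

int
main(void)
{
	struct xbuflist list = TAILQ_HEAD_INITIALIZER(list);
	struct xbuf bufs[3];
	struct xbuf *bp, *nbp;
	int i;

	for (i = 0; i < 3; i++) {
		bufs[i].b_id = i;
		TAILQ_INSERT_TAIL(&list, &bufs[i], b_vnbufs);
	}
	for (bp = TAILQ_FIRST(&list); bp != NULL; bp = nbp) {
		nbp = TAILQ_NEXT(bp, b_vnbufs);
		fake_busy_pages(bp);
		/* bp stayed pinned, but nbp may be stale: re-read it. */
		nbp = TAILQ_NEXT(bp, b_vnbufs);
		printf("flushed buf %d\n", bp->b_id);
	}
	return (0);
}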