diff --git a/sys/gnu/ext2fs/ext2_bmap.c b/sys/gnu/ext2fs/ext2_bmap.c index f40ff338d41a..3ea5965e90d6 100644 --- a/sys/gnu/ext2fs/ext2_bmap.c +++ b/sys/gnu/ext2fs/ext2_bmap.c @@ -36,7 +36,7 @@ * SUCH DAMAGE. * * @(#)ufs_bmap.c 8.7 (Berkeley) 3/21/95 - * $Id: ufs_bmap.c,v 1.24 1998/10/27 11:47:08 bde Exp $ + * $Id: ufs_bmap.c,v 1.25 1999/01/28 00:57:55 dillon Exp $ */ #include @@ -228,6 +228,7 @@ ufs_bmaparray(vp, bn, bnp, ap, nump, runp, runb) #endif bp->b_blkno = blkptrtodb(ump, daddr); bp->b_flags |= B_READ; + bp->b_flags &= ~(B_INVAL|B_ERROR); vfs_busy_pages(bp, 0); VOP_STRATEGY(bp->b_vp, bp); curproc->p_stats->p_ru.ru_inblock++; /* XXX */ diff --git a/sys/gnu/fs/ext2fs/ext2_bmap.c b/sys/gnu/fs/ext2fs/ext2_bmap.c index f40ff338d41a..3ea5965e90d6 100644 --- a/sys/gnu/fs/ext2fs/ext2_bmap.c +++ b/sys/gnu/fs/ext2fs/ext2_bmap.c @@ -36,7 +36,7 @@ * SUCH DAMAGE. * * @(#)ufs_bmap.c 8.7 (Berkeley) 3/21/95 - * $Id: ufs_bmap.c,v 1.24 1998/10/27 11:47:08 bde Exp $ + * $Id: ufs_bmap.c,v 1.25 1999/01/28 00:57:55 dillon Exp $ */ #include @@ -228,6 +228,7 @@ ufs_bmaparray(vp, bn, bnp, ap, nump, runp, runb) #endif bp->b_blkno = blkptrtodb(ump, daddr); bp->b_flags |= B_READ; + bp->b_flags &= ~(B_INVAL|B_ERROR); vfs_busy_pages(bp, 0); VOP_STRATEGY(bp->b_vp, bp); curproc->p_stats->p_ru.ru_inblock++; /* XXX */ diff --git a/sys/kern/vfs_bio.c b/sys/kern/vfs_bio.c index 803aab197bfa..cb183206c90d 100644 --- a/sys/kern/vfs_bio.c +++ b/sys/kern/vfs_bio.c @@ -11,7 +11,7 @@ * 2. Absolutely no warranty of function or purpose is made by the author * John S. Dyson. * - * $Id: vfs_bio.c,v 1.206 1999/04/14 18:51:52 dt Exp $ + * $Id: vfs_bio.c,v 1.207 1999/04/29 18:15:25 alc Exp $ */ /* @@ -74,9 +74,6 @@ static void vm_hold_free_pages(struct buf * bp, vm_offset_t from, vm_offset_t to); static void vm_hold_load_pages(struct buf * bp, vm_offset_t from, vm_offset_t to); -static void vfs_buf_set_valid(struct buf *bp, vm_ooffset_t foff, - vm_offset_t off, vm_offset_t size, - vm_page_t m); static void vfs_page_set_valid(struct buf *bp, vm_ooffset_t off, int pageno, vm_page_t m); static void vfs_clean_pages(struct buf * bp); @@ -221,6 +218,27 @@ bufcountwakeup(void) } } +/* + * vfs_buf_test_cache: + * + * Called when a buffer is extended. This function clears the B_CACHE + * bit if the newly extended portion of the buffer does not contain + * valid data. + */ +static __inline__ +void +vfs_buf_test_cache(struct buf *bp, + vm_ooffset_t foff, vm_offset_t off, vm_offset_t size, + vm_page_t m) +{ + if (bp->b_flags & B_CACHE) { + int base = (foff + off) & PAGE_MASK; + if (vm_page_is_valid(m, base, size) == 0) + bp->b_flags &= ~B_CACHE; + } +} + + /* * Initialize buffer headers and related structures. */ @@ -371,7 +389,10 @@ bremfree(struct buf * bp) /* - * Get a buffer with the specified data. Look in the cache first. + * Get a buffer with the specified data. Look in the cache first. We + * must clear B_ERROR and B_INVAL prior to initiating I/O. If B_CACHE + * is set, the buffer is valid and we do not have to do anything ( see + * getblk() ). 
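
As a minimal illustration of the read pattern described above (and of what getblk() is expected to hand back), the following userland sketch models the B_CACHE decision with simplified stand-in flag values and a stripped-down struct buf; it is not the kernel's code.

#include <stdio.h>

#define B_READ   0x0001
#define B_CACHE  0x0002
#define B_INVAL  0x0004
#define B_ERROR  0x0008
#define B_DONE   0x0010

struct buf { int b_flags; };

/* Model of the bread() decision: only issue I/O when B_CACHE is clear. */
static int
model_bread(struct buf *bp)
{
	if (bp->b_flags & B_CACHE) {
		bp->b_flags |= B_DONE;			/* fully valid, no I/O */
		return (0);
	}
	bp->b_flags |= B_READ;
	bp->b_flags &= ~(B_INVAL | B_ERROR);	/* must be clear before strategy */
	/* ... VOP_STRATEGY() + biowait() would go here ... */
	return (1);				/* 1 == I/O was started */
}

int
main(void)
{
	struct buf cached = { B_CACHE };
	struct buf cold = { B_INVAL };
	int io;

	io = model_bread(&cached);
	printf("cached: io=%d\n", io);
	io = model_bread(&cold);
	printf("cold:   io=%d flags=%#x\n", io, cold.b_flags);
	return (0);
}
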
*/ int bread(struct vnode * vp, daddr_t blkno, int size, struct ucred * cred, @@ -388,7 +409,7 @@ bread(struct vnode * vp, daddr_t blkno, int size, struct ucred * cred, curproc->p_stats->p_ru.ru_inblock++; KASSERT(!(bp->b_flags & B_ASYNC), ("bread: illegal async bp %p", bp)); bp->b_flags |= B_READ; - bp->b_flags &= ~(B_DONE | B_ERROR | B_INVAL); + bp->b_flags &= ~(B_ERROR | B_INVAL); if (bp->b_rcred == NOCRED) { if (cred != NOCRED) crhold(cred); @@ -403,7 +424,9 @@ bread(struct vnode * vp, daddr_t blkno, int size, struct ucred * cred, /* * Operates like bread, but also starts asynchronous I/O on - * read-ahead blocks. + * read-ahead blocks. We must clear B_ERROR and B_INVAL prior + * to initiating I/O . If B_CACHE is set, the buffer is valid + * and we do not have to do anything. */ int breadn(struct vnode * vp, daddr_t blkno, int size, @@ -421,7 +444,7 @@ breadn(struct vnode * vp, daddr_t blkno, int size, if (curproc != NULL) curproc->p_stats->p_ru.ru_inblock++; bp->b_flags |= B_READ; - bp->b_flags &= ~(B_DONE | B_ERROR | B_INVAL); + bp->b_flags &= ~(B_ERROR | B_INVAL); if (bp->b_rcred == NOCRED) { if (cred != NOCRED) crhold(cred); @@ -441,7 +464,7 @@ breadn(struct vnode * vp, daddr_t blkno, int size, if (curproc != NULL) curproc->p_stats->p_ru.ru_inblock++; rabp->b_flags |= B_READ | B_ASYNC; - rabp->b_flags &= ~(B_DONE | B_ERROR | B_INVAL); + rabp->b_flags &= ~(B_ERROR | B_INVAL); if (rabp->b_rcred == NOCRED) { if (cred != NOCRED) crhold(cred); @@ -462,7 +485,14 @@ breadn(struct vnode * vp, daddr_t blkno, int size, /* * Write, release buffer on completion. (Done by iodone - * if async.) + * if async). Do not bother writing anything if the buffer + * is invalid. + * + * Note that we set B_CACHE here, indicating that buffer is + * fully valid and thus cacheable. This is true even of NFS + * now so we set it generally. This could be set either here + * or in biodone() since the I/O is synchronous. We put it + * here. */ int bwrite(struct buf * bp) @@ -486,7 +516,7 @@ bwrite(struct buf * bp) bundirty(bp); bp->b_flags &= ~(B_READ | B_DONE | B_ERROR); - bp->b_flags |= B_WRITEINPROG; + bp->b_flags |= B_WRITEINPROG | B_CACHE; bp->b_vp->v_numoutput++; vfs_busy_pages(bp, 1); @@ -505,11 +535,12 @@ bwrite(struct buf * bp) mp = vp->v_specmountpoint; else mp = vp->v_mount; - if (mp != NULL) + if (mp != NULL) { if ((oldflags & B_ASYNC) == 0) mp->mnt_stat.f_syncwrites++; else mp->mnt_stat.f_asyncwrites++; + } } if ((oldflags & B_ASYNC) == 0) { @@ -522,7 +553,13 @@ bwrite(struct buf * bp) } /* - * Delayed write. (Buffer is marked dirty). + * Delayed write. (Buffer is marked dirty). Do not bother writing + * anything if the buffer is marked invalid. + * + * Note that since the buffer must be completely valid, we can safely + * set B_CACHE. In fact, we have to set B_CACHE here rather then in + * biodone() in order to prevent getblk from writing the buffer + * out synchronously. */ void bdwrite(struct buf * bp) @@ -541,6 +578,12 @@ bdwrite(struct buf * bp) } bdirty(bp); + /* + * Set B_CACHE, indicating that the buffer is fully valid. This is + * true even of NFS now. + */ + bp->b_flags |= B_CACHE; + /* * This bmap keeps the system from needing to do the bmap later, * perhaps when the system is attempting to do a sync. Since it @@ -592,8 +635,11 @@ bdwrite(struct buf * bp) * B_RELBUF, and we must set B_DELWRI. We reassign the buffer to * itself to properly update it in the dirty/clean lists. 
We mark it * B_DONE to ensure that any asynchronization of the buffer properly - * clears B_DONE ( else a panic will occur later ). Note that B_INVALID - * buffers are not considered dirty even if B_DELWRI is set. + * clears B_DONE ( else a panic will occur later ). + * + * bdirty() is kinda like bdwrite() - we have to clear B_INVAL which + * might have been set pre-getblk(). Unlike bwrite/bdwrite, bdirty() + * should only be called if the buffer is known-good. * * Since the buffer is not on a queue, we do not update the numfreebuffers * count. @@ -645,6 +691,9 @@ bundirty(bp) * * Asynchronous write. Start output on a buffer, but do not wait for * it to complete. The buffer is released when the output completes. + * + * bwrite() ( or the VOP routine anyway ) is responsible for handling + * B_INVAL buffers. Not us. */ void bawrite(struct buf * bp) @@ -658,7 +707,8 @@ bawrite(struct buf * bp) * * Ordered write. Start output on a buffer, and flag it so that the * device will write it in the order it was queued. The buffer is - * released when the output completes. + * released when the output completes. bwrite() ( or the VOP routine + * anyway ) is responsible for handling B_INVAL buffers. */ int bowrite(struct buf * bp) @@ -694,10 +744,19 @@ brelse(struct buf * bp) bp->b_flags &= ~B_ERROR; if ((bp->b_flags & (B_READ | B_ERROR)) == B_ERROR) { + /* + * Failed write, redirty. Must clear B_ERROR to prevent + * pages from being scrapped. Note: B_INVAL is ignored + * here but will presumably be dealt with later. + */ bp->b_flags &= ~B_ERROR; bdirty(bp); } else if ((bp->b_flags & (B_NOCACHE | B_INVAL | B_ERROR | B_FREEBUF)) || (bp->b_bufsize <= 0)) { + /* + * Either a failed I/O or we were asked to free or not + * cache the buffer. + */ bp->b_flags |= B_INVAL; if (LIST_FIRST(&bp->b_dep) != NULL && bioops.io_deallocate) (*bioops.io_deallocate)(bp); @@ -727,31 +786,22 @@ brelse(struct buf * bp) /* * VMIO buffer rundown. It is not very necessary to keep a VMIO buffer - * constituted, so the B_INVAL flag is used to *invalidate* the buffer, - * but the VM object is kept around. The B_NOCACHE flag is used to - * invalidate the pages in the VM object. + * constituted, not even NFS buffers now. Two flags effect this. If + * B_INVAL, the struct buf is invalidated but the VM object is kept + * around ( i.e. so it is trivial to reconstitute the buffer later ). * - * The b_{validoff,validend,dirtyoff,dirtyend} values are relative - * to b_offset and currently have byte granularity, whereas the - * valid flags in the vm_pages have only DEV_BSIZE resolution. - * The byte resolution fields are used to avoid unnecessary re-reads - * of the buffer but the code really needs to be genericized so - * other filesystem modules can take advantage of these fields. + * If B_ERROR or B_NOCACHE is set, pages in the VM object will be + * invalidated. B_ERROR cannot be set for a failed write unless the + * buffer is also B_INVAL because it hits the re-dirtying code above. * - * XXX this seems to cause performance problems. + * Normally we can do this whether a buffer is B_DELWRI or not. If + * the buffer is an NFS buffer, it is tracking piecemeal writes or + * the commit state and we cannot afford to lose the buffer. 
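
The two brelse() branches commented above reduce to a small amount of flag logic: a failed write is re-dirtied (with B_ERROR cleared), while anything marked invalid, uncacheable, failed-read or free-on-release is invalidated. Below is a simplified model of just that decision, using stand-in flag values rather than the real <sys/buf.h> definitions.

#include <stdio.h>

#define B_READ     0x0001
#define B_ERROR    0x0002
#define B_INVAL    0x0004
#define B_NOCACHE  0x0008
#define B_FREEBUF  0x0010
#define B_DELWRI   0x0020

struct buf { int b_flags; int b_bufsize; };

/* Model of the brelse() disposition choice described in the comments. */
static const char *
model_brelse(struct buf *bp)
{
	if ((bp->b_flags & (B_READ | B_ERROR)) == B_ERROR) {
		/* failed write: clear the error and keep the dirty data */
		bp->b_flags &= ~B_ERROR;
		bp->b_flags |= B_DELWRI;
		return ("redirty");
	}
	if ((bp->b_flags & (B_NOCACHE | B_INVAL | B_ERROR | B_FREEBUF)) ||
	    bp->b_bufsize <= 0) {
		/* failed read, or caller asked us not to keep it */
		bp->b_flags |= B_INVAL;
		return ("invalidate");
	}
	return ("requeue");
}

int
main(void)
{
	struct buf failed_write = { B_ERROR, 8192 };
	struct buf failed_read  = { B_READ | B_ERROR, 8192 };
	struct buf clean        = { 0, 8192 };

	printf("failed write -> %s\n", model_brelse(&failed_write));
	printf("failed read  -> %s\n", model_brelse(&failed_read));
	printf("clean        -> %s\n", model_brelse(&clean));
	return (0);
}
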
*/ if ((bp->b_flags & B_VMIO) && !(bp->b_vp->v_tag == VT_NFS && bp->b_vp->v_type != VBLK && - (bp->b_flags & B_DELWRI) != 0) -#ifdef notdef - && (bp->b_vp->v_tag != VT_NFS - || bp->b_vp->v_type == VBLK - || (bp->b_flags & (B_NOCACHE | B_INVAL | B_ERROR)) - || bp->b_validend == 0 - || (bp->b_validoff == 0 - && bp->b_validend == bp->b_bufsize)) -#endif + (bp->b_flags & B_DELWRI)) ) { int i, j, resid; @@ -912,6 +962,11 @@ brelse(struct buf * bp) /* * Release a buffer back to the appropriate queue but do not try to free * it. + * + * bqrelse() is used by bdwrite() to requeue a delayed write, and used by + * biodone() to requeue an async I/O on completion. It is also used when + * known good buffers need to be requeued but we think we may need the data + * again soon. */ void bqrelse(struct buf * bp) @@ -1096,6 +1151,8 @@ vfs_bio_awrite(struct buf * bp) splx(s); /* * default (old) behavior, writing out only one block + * + * XXX returns b_bufsize instead of b_bcount for nwritten? */ nwritten = bp->b_bufsize; (void) VOP_BWRITE(bp); @@ -1107,7 +1164,11 @@ vfs_bio_awrite(struct buf * bp) * getnewbuf: * * Find and initialize a new buffer header, freeing up existing buffers - * in the bufqueues as necessary. + * in the bufqueues as necessary. The new buffer is returned with + * flags set to B_BUSY. + * + * Important: B_INVAL is not set. If the caller wishes to throw the + * buffer away, the caller must set B_INVAL prior to calling brelse(). * * We block if: * We have insufficient buffer headers @@ -1368,7 +1429,6 @@ restart: bp->b_bcount = 0; bp->b_npages = 0; bp->b_dirtyoff = bp->b_dirtyend = 0; - bp->b_validoff = bp->b_validend = 0; bp->b_usecount = 5; LIST_INIT(&bp->b_dep); @@ -1465,7 +1525,10 @@ dosleep: } bp->b_data = bp->b_kvabase; } - + + /* + * The bp, if valid, is set to B_BUSY. + */ return (bp); } @@ -1546,9 +1609,10 @@ flushbufqueues(void) } /* - * XXX NFS does weird things with B_INVAL bps if we bwrite - * them ( vfs_bio_awrite/bawrite/bdwrite/etc ) Why? - * + * Try to free up B_INVAL delayed-write buffers rather then + * writing them out. Note also that NFS is somewhat sensitive + * to B_INVAL buffers so it is doubly important that we do + * this. */ if ((bp->b_flags & B_DELWRI) != 0) { if (bp->b_flags & B_INVAL) { @@ -1622,20 +1686,28 @@ inmem(struct vnode * vp, daddr_t blkno) } /* - * now we set the dirty range for the buffer -- - * for NFS -- if the file is mapped and pages have - * been written to, let it know. We want the - * entire range of the buffer to be marked dirty if - * any of the pages have been written to for consistancy - * with the b_validoff, b_validend set in the nfs write - * code, and used by the nfs read code. + * vfs_setdirty: + * + * Sets the dirty range for a buffer based on the status of the dirty + * bits in the pages comprising the buffer. + * + * The range is limited to the size of the buffer. + * + * This routine is primarily used by NFS, but is generalized for the + * B_VMIO case. */ static void vfs_setdirty(struct buf *bp) { int i; vm_object_t object; - vm_offset_t boffset; + + /* + * Degenerate case - empty buffer + */ + + if (bp->b_bufsize == 0) + return; /* * We qualify the scan for modified pages on whether the @@ -1654,6 +1726,9 @@ vfs_setdirty(struct buf *bp) printf("Warning: object %p mightbedirty but not writeable\n", object); if (object->flags & (OBJ_MIGHTBEDIRTY|OBJ_CLEANING)) { + vm_offset_t boffset; + vm_offset_t eoffset; + /* * test the pages to see if they have been modified directly * by users through the VM system. 
@@ -1664,47 +1739,85 @@ vfs_setdirty(struct buf *bp) } /* - * scan forwards for the first page modified + * Calculate the encompassing dirty range, boffset and eoffset, + * (eoffset - boffset) bytes. */ + for (i = 0; i < bp->b_npages; i++) { - if (bp->b_pages[i]->dirty) { + if (bp->b_pages[i]->dirty) break; - } } - boffset = (i << PAGE_SHIFT) - (bp->b_offset & PAGE_MASK); - if (boffset < bp->b_dirtyoff) { - bp->b_dirtyoff = max(boffset, 0); - } - /* - * scan backwards for the last page modified - */ for (i = bp->b_npages - 1; i >= 0; --i) { if (bp->b_pages[i]->dirty) { break; } } - boffset = (i + 1); -#if 0 - offset = boffset + bp->b_pages[0]->pindex; - if (offset >= object->size) - boffset = object->size - bp->b_pages[0]->pindex; -#endif - boffset = (boffset << PAGE_SHIFT) - (bp->b_offset & PAGE_MASK); - if (bp->b_dirtyend < boffset) - bp->b_dirtyend = min(boffset, bp->b_bufsize); + eoffset = ((i + 1) << PAGE_SHIFT) - (bp->b_offset & PAGE_MASK); + + /* + * Fit it to the buffer. + */ + + if (eoffset > bp->b_bcount) + eoffset = bp->b_bcount; + + /* + * If we have a good dirty range, merge with the existing + * dirty range. + */ + + if (boffset < eoffset) { + if (bp->b_dirtyoff > boffset) + bp->b_dirtyoff = boffset; + if (bp->b_dirtyend < eoffset) + bp->b_dirtyend = eoffset; + } } } /* - * Get a block given a specified block and offset into a file/device. + * getblk: + * + * Get a block given a specified block and offset into a file/device. + * The buffers B_DONE bit will be cleared on return, making it almost + * ready for an I/O initiation. B_INVAL may or may not be set on + * return. The caller should clear B_INVAL prior to initiating a + * READ. + * + * For a non-VMIO buffer, B_CACHE is set to the opposite of B_INVAL for + * an existing buffer. + * + * For a VMIO buffer, B_CACHE is modified according to the backing VM. + * If getblk()ing a previously 0-sized invalid buffer, B_CACHE is set + * and then cleared based on the backing VM. If the previous buffer is + * non-0-sized but invalid, B_CACHE will be cleared. + * + * If getblk() must create a new buffer, the new buffer is returned with + * both B_INVAL and B_CACHE clear unless it is a VMIO buffer, in which + * case it is returned with B_INVAL clear and B_CACHE set based on the + * backing VM. + * + * getblk() also forces a VOP_BWRITE() for any B_DELWRI buffer whos + * B_CACHE bit is clear. + * + * What this means, basically, is that the caller should use B_CACHE to + * determine whether the buffer is fully valid or not and should clear + * B_INVAL prior to issuing a read. If the caller intends to validate + * the buffer by loading its data area with something, the caller needs + * to clear B_INVAL. If the caller does this without issuing an I/O, + * the caller should set B_CACHE ( as an optimization ), else the caller + * should issue the I/O and biodone() will set B_CACHE if the I/O was + * a write attempt or if it was a successfull read. If the caller + * intends to issue a READ, the caller must clear B_INVAL and B_ERROR + * prior to issuing the READ. biodone() will *not* clear B_INVAL. 
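
A compact model of the caller contract spelled out above: the caller keys off B_CACHE and either trusts the data, validates the whole buffer itself, or clears B_INVAL/B_ERROR and issues the read. The flag values and struct are simplified stand-ins, not the kernel definitions.

#include <stdio.h>

#define B_READ   0x0001
#define B_CACHE  0x0002
#define B_INVAL  0x0004
#define B_ERROR  0x0008

struct buf { int b_flags; };

/*
 * Model of what a filesystem does with a buffer returned by getblk(),
 * following the B_CACHE rules described in the comment above.
 */
static const char *
model_use_getblk_buffer(struct buf *bp, int will_overwrite_whole_buffer)
{
	if (bp->b_flags & B_CACHE)
		return ("use cached data");		/* fully valid */

	if (will_overwrite_whole_buffer) {
		/* caller supplies every byte itself: no read needed */
		bp->b_flags &= ~(B_INVAL | B_ERROR);
		bp->b_flags |= B_CACHE;
		return ("fill and mark B_CACHE");
	}

	/* must read: B_INVAL and B_ERROR have to be clear first */
	bp->b_flags &= ~(B_INVAL | B_ERROR);
	bp->b_flags |= B_READ;
	return ("issue read");
}

int
main(void)
{
	struct buf hit   = { B_CACHE };
	struct buf miss1 = { B_INVAL };
	struct buf miss2 = { B_INVAL };

	printf("%s\n", model_use_getblk_buffer(&hit, 0));
	printf("%s\n", model_use_getblk_buffer(&miss1, 1));
	printf("%s\n", model_use_getblk_buffer(&miss2, 0));
	return (0);
}
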
*/ struct buf * getblk(struct vnode * vp, daddr_t blkno, int size, int slpflag, int slptimeo) { struct buf *bp; - int i, s; + int s; struct bufhashhdr *bh; #if !defined(MAX_PERF) @@ -1727,6 +1840,10 @@ loop: } if ((bp = gbincore(vp, blkno))) { + /* + * Buffer is in-core + */ + if (bp->b_flags & B_BUSY) { bp->b_flags |= B_WANTED; if (bp->b_usecount < BUF_MAXUSE) @@ -1740,7 +1857,18 @@ loop: splx(s); return (struct buf *) NULL; } - bp->b_flags |= B_BUSY | B_CACHE; + + /* + * Busy the buffer. B_CACHE is cleared if the buffer is + * invalid. Ohterwise, for a non-VMIO buffer, B_CACHE is set + * and for a VMIO buffer B_CACHE is adjusted according to the + * backing VM cache. + */ + bp->b_flags |= B_BUSY; + if (bp->b_flags & B_INVAL) + bp->b_flags &= ~B_CACHE; + else if ((bp->b_flags & (B_VMIO|B_INVAL)) == 0) + bp->b_flags |= B_CACHE; bremfree(bp); /* @@ -1770,7 +1898,9 @@ loop: /* * If the size is inconsistant in the VMIO case, we can resize - * the buffer. This might lead to B_CACHE getting cleared. + * the buffer. This might lead to B_CACHE getting set or + * cleared. If the size has not changed, B_CACHE remains + * unchanged from its previous state. */ if (bp->b_bcount != size) @@ -1780,45 +1910,19 @@ loop: ("getblk: no buffer offset")); /* - * Check that the constituted buffer really deserves for the - * B_CACHE bit to be set. B_VMIO type buffers might not - * contain fully valid pages. Normal (old-style) buffers - * should be fully valid. This might also lead to B_CACHE - * getting clear. + * A buffer with B_DELWRI set and B_CACHE clear must + * be committed before we can return the buffer in + * order to prevent the caller from issuing a read + * ( due to B_CACHE not being set ) and overwriting + * it. * - * If B_CACHE is already clear, don't bother checking to see - * if we have to clear it again. - * - * XXX this code should not be necessary unless the B_CACHE - * handling is broken elsewhere in the kernel. We need to - * check the cases and then turn the clearing part of this - * code into a panic. - */ - if ( - (bp->b_flags & (B_VMIO|B_CACHE)) == (B_VMIO|B_CACHE) && - (bp->b_vp->v_tag != VT_NFS || bp->b_validend <= 0) - ) { - int checksize = bp->b_bufsize; - int poffset = bp->b_offset & PAGE_MASK; - int resid; - for (i = 0; i < bp->b_npages; i++) { - resid = (checksize > (PAGE_SIZE - poffset)) ? - (PAGE_SIZE - poffset) : checksize; - if (!vm_page_is_valid(bp->b_pages[i], poffset, resid)) { - bp->b_flags &= ~(B_CACHE | B_DONE); - break; - } - checksize -= resid; - poffset = 0; - } - } - - /* - * If B_DELWRI is set and B_CACHE got cleared ( or was - * already clear ), we have to commit the write and - * retry. The NFS code absolutely depends on this, - * and so might the FFS code. In anycase, it formalizes - * the B_CACHE rules. See sys/buf.h. + * Most callers, including NFS and FFS, need this to + * operate properly either because they assume they + * can issue a read if B_CACHE is not set, or because + * ( for example ) an uncached B_DELWRI might loop due + * to softupdates re-dirtying the buffer. In the latter + * case, B_CACHE is set after the first write completes, + * preventing further loops. */ if ((bp->b_flags & (B_CACHE|B_DELWRI)) == B_DELWRI) { @@ -1829,8 +1933,14 @@ loop: if (bp->b_usecount < BUF_MAXUSE) ++bp->b_usecount; splx(s); - return (bp); + bp->b_flags &= ~B_DONE; } else { + /* + * Buffer is not in-core, create new buffer. The buffer + * returned by getnewbuf() is marked B_BUSY. Note that the + * returned buffer is also considered valid ( not marked + * B_INVAL ). 
+ */ int bsize, maxsize, vmio; off_t offset; @@ -1849,7 +1959,7 @@ loop: maxsize = imax(maxsize, bsize); if ((bp = getnewbuf(vp, blkno, - slpflag, slptimeo, size, maxsize)) == 0) { + slpflag, slptimeo, size, maxsize)) == NULL) { if (slpflag || slptimeo) { splx(s); return NULL; @@ -1861,6 +1971,10 @@ loop: * This code is used to make sure that a buffer is not * created while the getnewbuf routine is blocked. * This can be a problem whether the vnode is locked or not. + * If the buffer is created out from under us, we have to + * throw away the one we just created. There is now window + * race because we are safely running at splbio() from the + * point of the duplicate buffer creation through to here. */ if (gbincore(vp, blkno)) { bp->b_flags |= B_INVAL; @@ -1880,8 +1994,15 @@ loop: bh = BUFHASH(vp, blkno); LIST_INSERT_HEAD(bh, bp, b_hash); + /* + * set B_VMIO bit. allocbuf() the buffer bigger. Since the + * buffer size starts out as 0, B_CACHE will be set by + * allocbuf() for the VMIO case prior to it testing the + * backing store for validity. + */ + if (vmio) { - bp->b_flags |= (B_VMIO | B_CACHE); + bp->b_flags |= B_VMIO; #if defined(VFS_BIO_DEBUG) if (vp->v_type != VREG && vp->v_type != VBLK) printf("getblk: vmioing file type %d???\n", vp->v_type); @@ -1893,12 +2014,14 @@ loop: allocbuf(bp, size); splx(s); - return (bp); + bp->b_flags &= ~B_DONE; } + return (bp); } /* - * Get an empty, disassociated buffer of given size. + * Get an empty, disassociated buffer of given size. The buffer is initially + * set to B_INVAL. */ struct buf * geteblk(int size) @@ -1910,7 +2033,7 @@ geteblk(int size) while ((bp = getnewbuf(0, (daddr_t) 0, 0, 0, size, MAXBSIZE)) == 0); splx(s); allocbuf(bp, size); - bp->b_flags |= B_INVAL; /* b_dep cleared by getnewbuf() */ + bp->b_flags |= B_INVAL; /* b_dep cleared by getnewbuf() */ return (bp); } @@ -1925,6 +2048,9 @@ geteblk(int size) * deadlock or inconsistant data situations. Tread lightly!!! * There are B_CACHE and B_DELWRI interactions that must be dealt with by * the caller. Calling this code willy nilly can result in the loss of data. + * + * allocbuf() only adjusts B_CACHE for VMIO buffers. getblk() deals with + * B_CACHE for the non-VMIO case. */ int @@ -1945,7 +2071,8 @@ allocbuf(struct buf *bp, int size) caddr_t origbuf; int origbufsize; /* - * Just get anonymous memory from the kernel + * Just get anonymous memory from the kernel. Don't + * mess with B_CACHE. */ mbsize = (size + DEV_BSIZE - 1) & ~(DEV_BSIZE - 1); #if !defined(NO_B_MALLOC) @@ -2046,13 +2173,25 @@ allocbuf(struct buf *bp, int size) if (bp->b_flags & B_MALLOC) panic("allocbuf: VMIO buffer can't be malloced"); #endif + /* + * Set B_CACHE initially if buffer is 0 length or will become + * 0-length. + */ + if (size == 0 || bp->b_bufsize == 0) + bp->b_flags |= B_CACHE; if (newbsize < bp->b_bufsize) { + /* + * DEV_BSIZE aligned new buffer size is less then the + * DEV_BSIZE aligned existing buffer size. Figure out + * if we have to remove any pages. 
+ */ if (desiredpages < bp->b_npages) { for (i = desiredpages; i < bp->b_npages; i++) { /* * the page is not freed here -- it - * is the responsibility of vnode_pager_setsize + * is the responsibility of + * vnode_pager_setsize */ m = bp->b_pages[i]; KASSERT(m != bogus_page, @@ -2067,115 +2206,131 @@ allocbuf(struct buf *bp, int size) (desiredpages << PAGE_SHIFT), (bp->b_npages - desiredpages)); bp->b_npages = desiredpages; } - } else if (newbsize > bp->b_bufsize) { - vm_object_t obj; - vm_offset_t tinc, toff; - vm_ooffset_t off; - vm_pindex_t objoff; - int pageindex, curbpnpages; + } else if (size > bp->b_bcount) { + /* + * We are growing the buffer, possibly in a + * byte-granular fashion. + */ struct vnode *vp; - int bsize; - int orig_validoff = bp->b_validoff; - int orig_validend = bp->b_validend; + vm_object_t obj; + vm_offset_t toff; + vm_offset_t tinc; + + /* + * Step 1, bring in the VM pages from the object, + * allocating them if necessary. We must clear + * B_CACHE if these pages are not valid for the + * range covered by the buffer. + */ vp = bp->b_vp; + obj = vp->v_object; - if (vp->v_type == VBLK) - bsize = DEV_BSIZE; - else - bsize = vp->v_mount->mnt_stat.f_iosize; - - if (bp->b_npages < desiredpages) { - obj = vp->v_object; - tinc = PAGE_SIZE; - - off = bp->b_offset; - KASSERT(bp->b_offset != NOOFFSET, - ("allocbuf: no buffer offset")); - curbpnpages = bp->b_npages; - doretry: - bp->b_validoff = orig_validoff; - bp->b_validend = orig_validend; - bp->b_flags |= B_CACHE; - for (toff = 0; toff < newbsize; toff += tinc) { - objoff = OFF_TO_IDX(off + toff); - pageindex = objoff - OFF_TO_IDX(off); - tinc = PAGE_SIZE - ((off + toff) & PAGE_MASK); - if (pageindex < curbpnpages) { - - m = bp->b_pages[pageindex]; -#ifdef VFS_BIO_DIAG - if (m->pindex != objoff) - panic("allocbuf: page changed offset?!!!?"); -#endif - if (tinc > (newbsize - toff)) - tinc = newbsize - toff; - if (bp->b_flags & B_CACHE) - vfs_buf_set_valid(bp, off, toff, tinc, m); - continue; - } - m = vm_page_lookup(obj, objoff); - if (!m) { - m = vm_page_alloc(obj, objoff, VM_ALLOC_NORMAL); - if (!m) { - VM_WAIT; - vm_pageout_deficit += (desiredpages - curbpnpages); - goto doretry; - } + while (bp->b_npages < desiredpages) { + vm_page_t m; + vm_pindex_t pi; + pi = OFF_TO_IDX(bp->b_offset) + bp->b_npages; + if ((m = vm_page_lookup(obj, pi)) == NULL) { + m = vm_page_alloc(obj, pi, VM_ALLOC_NORMAL); + if (m == NULL) { + VM_WAIT; + vm_pageout_deficit += desiredpages - bp->b_npages; + } else { vm_page_wire(m); vm_page_wakeup(m); bp->b_flags &= ~B_CACHE; + bp->b_pages[bp->b_npages] = m; + ++bp->b_npages; + } + continue; + } - } else if (vm_page_sleep_busy(m, FALSE, "pgtblk")) { - /* - * If we had to sleep, retry. - * - * Also note that we only test - * PG_BUSY here, not m->busy. - * - * We cannot sleep on m->busy - * here because a vm_fault -> - * getpages -> cluster-read -> - * ...-> allocbuf sequence - * will convert PG_BUSY to - * m->busy so we have to let - * m->busy through if we do - * not want to deadlock. - */ - goto doretry; - } else { - if ((curproc != pageproc) && - ((m->queue - m->pc) == PQ_CACHE) && - ((cnt.v_free_count + cnt.v_cache_count) < - (cnt.v_free_min + cnt.v_cache_min))) { - pagedaemon_wakeup(); - } - if (tinc > (newbsize - toff)) - tinc = newbsize - toff; - if (bp->b_flags & B_CACHE) - vfs_buf_set_valid(bp, off, toff, tinc, m); - vm_page_flag_clear(m, PG_ZERO); - vm_page_wire(m); - } - bp->b_pages[pageindex] = m; - curbpnpages = pageindex + 1; + /* + * We found a page. 
If we have to sleep on it, + * retry because it might have gotten freed out + * from under us. + * + * We can only test PG_BUSY here. Blocking on + * m->busy might lead to a deadlock: + * + * vm_fault->getpages->cluster_read->allocbuf + * + */ + + if (vm_page_sleep_busy(m, FALSE, "pgtblk")) + continue; + + /* + * We have a good page. Should we wakeup the + * page daemon? + */ + if ((curproc != pageproc) && + ((m->queue - m->pc) == PQ_CACHE) && + ((cnt.v_free_count + cnt.v_cache_count) < + (cnt.v_free_min + cnt.v_cache_min)) + ) { + pagedaemon_wakeup(); } - if (vp->v_tag == VT_NFS && - vp->v_type != VBLK) { - if (bp->b_dirtyend > 0) { - bp->b_validoff = min(bp->b_validoff, bp->b_dirtyoff); - bp->b_validend = max(bp->b_validend, bp->b_dirtyend); - } - if (bp->b_validend == 0) - bp->b_flags &= ~B_CACHE; - } - bp->b_data = (caddr_t) trunc_page((vm_offset_t)bp->b_data); - bp->b_npages = curbpnpages; - pmap_qenter((vm_offset_t) bp->b_data, - bp->b_pages, bp->b_npages); - ((vm_offset_t) bp->b_data) |= off & PAGE_MASK; + vm_page_flag_clear(m, PG_ZERO); + vm_page_wire(m); + bp->b_pages[bp->b_npages] = m; + ++bp->b_npages; } + + /* + * Step 2. We've loaded the pages into the buffer, + * we have to figure out if we can still have B_CACHE + * set. Note that B_CACHE is set according to the + * byte-granular range ( bcount and size ), new the + * aligned range ( newbsize ). + * + * The VM test is against m->valid, which is DEV_BSIZE + * aligned. Needless to say, the validity of the data + * needs to also be DEV_BSIZE aligned. Note that this + * fails with NFS if the server or some other client + * extends the file's EOF. If our buffer is resized, + * B_CACHE may remain set! XXX + */ + + toff = bp->b_bcount; + tinc = PAGE_SIZE - ((bp->b_offset + toff) & PAGE_MASK); + + while ((bp->b_flags & B_CACHE) && toff < size) { + vm_pindex_t pi; + + if (tinc > (size - toff)) + tinc = size - toff; + + pi = ((bp->b_offset & PAGE_MASK) + toff) >> + PAGE_SHIFT; + + vfs_buf_test_cache( + bp, + bp->b_offset, + toff, + tinc, + bp->b_pages[pi] + ); + toff += tinc; + tinc = PAGE_SIZE; + } + + /* + * Step 3, fixup the KVM pmap. Remember that + * bp->b_data is relative to bp->b_offset, but + * bp->b_offset may be offset into the first page. + */ + + bp->b_data = (caddr_t) + trunc_page((vm_offset_t)bp->b_data); + pmap_qenter( + (vm_offset_t)bp->b_data, + bp->b_pages, + bp->b_npages + ); + bp->b_data = (caddr_t)((vm_offset_t)bp->b_data | + (vm_offset_t)(bp->b_offset & PAGE_MASK)); } } if (bp->b_flags & B_VMIO) @@ -2184,13 +2339,17 @@ allocbuf(struct buf *bp, int size) runningbufspace += (newbsize - bp->b_bufsize); if (newbsize < bp->b_bufsize) bufspacewakeup(); - bp->b_bufsize = newbsize; - bp->b_bcount = size; + bp->b_bufsize = newbsize; /* actual buffer allocation */ + bp->b_bcount = size; /* requested buffer size */ return 1; } /* - * Wait for buffer I/O completion, returning error status. + * biowait: + * + * Wait for buffer I/O completion, returning error status. The buffer + * is left B_BUSY|B_DONE on return. B_EINTR is converted into a EINTR + * error and cleared. */ int biowait(register struct buf * bp) @@ -2220,9 +2379,23 @@ biowait(register struct buf * bp) } /* - * Finish I/O on a buffer, calling an optional function. - * This is usually called from interrupt level, so process blocking - * is not *a good idea*. + * biodone: + * + * Finish I/O on a buffer, optionally calling a completion function. + * This is usually called from an interrupt so process blocking is + * not allowed. 
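
The "Step 2" revalidation loop above walks the newly exposed byte range in page-sized steps, with the first step trimmed so it ends on a page boundary. The standalone sketch below reproduces just that offset arithmetic with assumed example numbers (PAGE_SIZE 4096, b_offset 0x1200, b_bcount growing from 1024 to 9000), printing the (offset, length) pairs that would be handed to vfs_buf_test_cache().

#include <stdio.h>

#define PAGE_SIZE 4096
#define PAGE_MASK (PAGE_SIZE - 1)

int
main(void)
{
	/* assumed example values, not taken from the patch */
	long b_offset = 0x1200;		/* buffer's file offset, not page aligned */
	long b_bcount = 1024;		/* old byte count */
	long size     = 9000;		/* new byte count */

	long toff = b_bcount;
	long tinc = PAGE_SIZE - ((b_offset + toff) & PAGE_MASK);

	while (toff < size) {
		if (tinc > size - toff)
			tinc = size - toff;
		printf("test range: buffer offset %ld, length %ld "
		    "(page index %ld)\n",
		    toff, tinc, ((b_offset & PAGE_MASK) + toff) / PAGE_SIZE);
		toff += tinc;
		tinc = PAGE_SIZE;
	}
	return (0);
}
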
+ * + * biodone is also responsible for setting B_CACHE in a B_VMIO bp. + * In a non-VMIO bp, B_CACHE will be set on the next getblk() + * assuming B_INVAL is clear. + * + * For the VMIO case, we set B_CACHE if the op was a read and no + * read error occured, or if the op was a write. B_CACHE is never + * set if the buffer is invalid or otherwise uncacheable. + * + * biodone does not mess with B_INVAL, allowing the I/O routine or the + * initiator to leave B_INVAL set to brelse the buffer out of existance + * in the biodone routine. */ void biodone(register struct buf * bp) @@ -2295,7 +2468,17 @@ biodone(register struct buf * bp) obj->paging_in_progress, bp->b_npages); } #endif - iosize = bp->b_bufsize; + + /* + * Set B_CACHE if the op was a normal read and no error + * occured. B_CACHE is set for writes in the b*write() + * routines. + */ + iosize = bp->b_bcount; + if ((bp->b_flags & (B_READ|B_FREEBUF|B_INVAL|B_NOCACHE|B_ERROR)) == B_READ) { + bp->b_flags |= B_CACHE; + } + for (i = 0; i < bp->b_npages; i++) { int bogusflag = 0; m = bp->b_pages[i]; @@ -2307,6 +2490,7 @@ biodone(register struct buf * bp) printf("biodone: page disappeared\n"); #endif vm_object_pip_subtract(obj, 1); + bp->b_flags &= ~B_CACHE; continue; } bp->b_pages[i] = m; @@ -2325,8 +2509,8 @@ biodone(register struct buf * bp) /* * In the write case, the valid and clean bits are - * already changed correctly, so we only need to do this - * here in the read case. + * already changed correctly ( see bdwrite() ), so we + * only need to do this here in the read case. */ if ((bp->b_flags & B_READ) && !bogusflag && resid > 0) { vfs_page_set_valid(bp, foff, i, m); @@ -2453,106 +2637,45 @@ vfs_unbusy_pages(struct buf * bp) } /* - * Set NFS' b_validoff and b_validend fields from the valid bits - * of a page. If the consumer is not NFS, and the page is not - * valid for the entire range, clear the B_CACHE flag to force - * the consumer to re-read the page. + * vfs_page_set_valid: * - * B_CACHE interaction is especially tricky. - */ -static void -vfs_buf_set_valid(struct buf *bp, - vm_ooffset_t foff, vm_offset_t off, vm_offset_t size, - vm_page_t m) -{ - if (bp->b_vp->v_tag == VT_NFS && bp->b_vp->v_type != VBLK) { - vm_offset_t svalid, evalid; - int validbits = m->valid >> (((foff+off)&PAGE_MASK)/DEV_BSIZE); - - /* - * This only bothers with the first valid range in the - * page. - */ - svalid = off; - while (validbits && !(validbits & 1)) { - svalid += DEV_BSIZE; - validbits >>= 1; - } - evalid = svalid; - while (validbits & 1) { - evalid += DEV_BSIZE; - validbits >>= 1; - } - evalid = min(evalid, off + size); - /* - * We can only set b_validoff/end if this range is contiguous - * with the range built up already. If we cannot set - * b_validoff/end, we must clear B_CACHE to force an update - * to clean the bp up. - */ - if (svalid == bp->b_validend) { - bp->b_validoff = min(bp->b_validoff, svalid); - bp->b_validend = max(bp->b_validend, evalid); - } else { - bp->b_flags &= ~B_CACHE; - } - } else if (!vm_page_is_valid(m, - (vm_offset_t) ((foff + off) & PAGE_MASK), - size)) { - bp->b_flags &= ~B_CACHE; - } -} - -/* - * Set the valid bits in a page, taking care of the b_validoff, - * b_validend fields which NFS uses to optimise small reads. Off is - * the offset within the file and pageno is the page index within the buf. + * Set the valid bits in a page based on the supplied offset. The + * range is restricted to the buffer's size. 
* - * XXX we have to set the valid & clean bits for all page fragments - * touched by b_validoff/validend, even if the page fragment goes somewhat - * beyond b_validoff/validend due to alignment. + * For NFS, the range is additionally restricted to b_validoff/end. + * validoff/end must be DEV_BSIZE chunky or the end must be at the + * file EOF. If a dirty range exists, set the page's dirty bits + * inclusively. + * + * This routine is typically called after a read completes. */ static void vfs_page_set_valid(struct buf *bp, vm_ooffset_t off, int pageno, vm_page_t m) { - struct vnode *vp = bp->b_vp; vm_ooffset_t soff, eoff; /* * Start and end offsets in buffer. eoff - soff may not cross a - * page boundry or cross the end of the buffer. + * page boundry or cross the end of the buffer. The end of the + * buffer, in this case, is our file EOF, not the allocation size + * of the buffer. */ soff = off; eoff = (off + PAGE_SIZE) & ~PAGE_MASK; - if (eoff > bp->b_offset + bp->b_bufsize) - eoff = bp->b_offset + bp->b_bufsize; + if (eoff > bp->b_offset + bp->b_bcount) + eoff = bp->b_offset + bp->b_bcount; - if (vp->v_tag == VT_NFS && vp->v_type != VBLK) { - vm_ooffset_t sv, ev; - vm_page_set_invalid(m, - (vm_offset_t) (soff & PAGE_MASK), - (vm_offset_t) (eoff - soff)); - /* - * bp->b_validoff and bp->b_validend restrict the valid range - * that we can set. Note that these offsets are not DEV_BSIZE - * aligned. vm_page_set_validclean() must know what - * sub-DEV_BSIZE ranges to clear. - */ -#if 0 - sv = (bp->b_offset + bp->b_validoff + DEV_BSIZE - 1) & ~(DEV_BSIZE - 1); - ev = (bp->b_offset + bp->b_validend + (DEV_BSIZE - 1)) & - ~(DEV_BSIZE - 1); -#endif - sv = bp->b_offset + bp->b_validoff; - ev = bp->b_offset + bp->b_validend; - soff = qmax(sv, soff); - eoff = qmin(ev, eoff); + /* + * Set valid range. This is typically the entire buffer and thus the + * entire page. + */ + if (eoff > soff) { + vm_page_set_validclean( + m, + (vm_offset_t) (soff & PAGE_MASK), + (vm_offset_t) (eoff - soff) + ); } - - if (eoff > soff) - vm_page_set_validclean(m, - (vm_offset_t) (soff & PAGE_MASK), - (vm_offset_t) (eoff - soff)); } /* @@ -2562,6 +2685,10 @@ vfs_page_set_valid(struct buf *bp, vm_ooffset_t off, int pageno, vm_page_t m) * almost as being PG_BUSY. Also the object paging_in_progress * flag is handled to make sure that the object doesn't become * inconsistant. + * + * Since I/O has not been initiated yet, certain buffer flags + * such as B_ERROR or B_INVAL may be in an inconsistant state + * and should be ignored. */ void vfs_busy_pages(struct buf * bp, int clear_modify) @@ -2595,6 +2722,22 @@ retry: vm_page_io_start(m); } + /* + * When readying a buffer for a read ( i.e + * clear_modify == 0 ), it is important to do + * bogus_page replacement for valid pages in + * partially instantiated buffers. Partially + * instantiated buffers can, in turn, occur when + * reconstituting a buffer from its VM backing store + * base. We only have to do this if B_CACHE is + * clear ( which causes the I/O to occur in the + * first place ). The replacement prevents the read + * I/O from overwriting potentially dirty VM-backed + * pages. XXX bogus page replacement is, uh, bogus. + * It may not work properly with small-block devices. + * We need to find a better way. + */ + vm_page_protect(m, VM_PROT_NONE); if (clear_modify) vfs_page_set_valid(bp, foff, i, m); @@ -2614,30 +2757,89 @@ retry: * Tell the VM system that the pages associated with this buffer * are clean. 
This is used for delayed writes where the data is * going to go to disk eventually without additional VM intevention. + * + * Note that while we only really need to clean through to b_bcount, we + * just go ahead and clean through to b_bufsize. */ -void +static void vfs_clean_pages(struct buf * bp) { int i; if (bp->b_flags & B_VMIO) { vm_ooffset_t foff; + foff = bp->b_offset; KASSERT(bp->b_offset != NOOFFSET, ("vfs_clean_pages: no buffer offset")); for (i = 0; i < bp->b_npages; i++) { vm_page_t m = bp->b_pages[i]; + vm_ooffset_t noff = (foff + PAGE_SIZE) & ~PAGE_MASK; + vm_ooffset_t eoff = noff; + + if (eoff > bp->b_offset + bp->b_bufsize) + eoff = bp->b_offset + bp->b_bufsize; vfs_page_set_valid(bp, foff, i, m); - foff = (foff + PAGE_SIZE) & ~PAGE_MASK; + /* vm_page_clear_dirty(m, foff & PAGE_MASK, eoff - foff); */ + foff = noff; } } } +/* + * vfs_bio_set_validclean: + * + * Set the range within the buffer to valid and clean. The range is + * relative to the beginning of the buffer, b_offset. Note that b_offset + * itself may be offset from the beginning of the first page. + */ + +void +vfs_bio_set_validclean(struct buf *bp, int base, int size) +{ + if (bp->b_flags & B_VMIO) { + int i; + int n; + + /* + * Fixup base to be relative to beginning of first page. + * Set initial n to be the maximum number of bytes in the + * first page that can be validated. + */ + + base += (bp->b_offset & PAGE_MASK); + n = PAGE_SIZE - (base & PAGE_MASK); + + for (i = base / PAGE_SIZE; size > 0 && i < bp->b_npages; ++i) { + vm_page_t m = bp->b_pages[i]; + + if (n > size) + n = size; + + vm_page_set_validclean(m, base & PAGE_MASK, n); + base += n; + size -= n; + n = PAGE_SIZE; + } + } +} + +/* + * vfs_bio_clrbuf: + * + * clear a buffer. This routine essentially fakes an I/O, so we need + * to clear B_ERROR and B_INVAL. + * + * Note that while we only theoretically need to clear through b_bcount, + * we go ahead and clear through b_bufsize. + */ + void vfs_bio_clrbuf(struct buf *bp) { int i, mask = 0; caddr_t sa, ea; if ((bp->b_flags & (B_VMIO | B_MALLOC)) == B_VMIO) { + bp->b_flags &= ~(B_INVAL|B_ERROR); if( (bp->b_npages == 1) && (bp->b_bufsize < PAGE_SIZE) && (bp->b_offset & PAGE_MASK) == 0) { mask = (1 << (bp->b_bufsize / DEV_BSIZE)) - 1; diff --git a/sys/kern/vfs_cluster.c b/sys/kern/vfs_cluster.c index f7bd95e2947e..5f7f870b717f 100644 --- a/sys/kern/vfs_cluster.c +++ b/sys/kern/vfs_cluster.c @@ -33,7 +33,7 @@ * SUCH DAMAGE. 
* * @(#)vfs_cluster.c 8.7 (Berkeley) 2/13/94 - * $Id: vfs_cluster.c,v 1.79 1999/01/27 21:49:58 dillon Exp $ + * $Id: vfs_cluster.c,v 1.80 1999/03/12 02:24:56 julian Exp $ */ #include "opt_debug_cluster.h" @@ -251,6 +251,7 @@ single_block_read: #endif if ((bp->b_flags & B_CLUSTER) == 0) vfs_busy_pages(bp, 0); + bp->b_flags &= ~(B_ERROR|B_INVAL); error = VOP_STRATEGY(vp, bp); curproc->p_stats->p_ru.ru_inblock++; } @@ -283,6 +284,7 @@ single_block_read: if ((rbp->b_flags & B_CLUSTER) == 0) vfs_busy_pages(rbp, 0); + rbp->b_flags &= ~(B_ERROR|B_INVAL); (void) VOP_STRATEGY(vp, rbp); curproc->p_stats->p_ru.ru_inblock++; } @@ -473,8 +475,10 @@ cluster_callback(bp) if (error) { tbp->b_flags |= B_ERROR; tbp->b_error = error; - } else - tbp->b_dirtyoff = tbp->b_dirtyend = 0; + } else { + tbp->b_dirtyoff = tbp->b_dirtyend = 0; + tbp->b_flags &= ~(B_ERROR|B_INVAL); + } biodone(tbp); } relpbuf(bp, &cluster_pbuf_freecnt); diff --git a/sys/kern/vfs_default.c b/sys/kern/vfs_default.c index c0565a44a2a0..de5d18d80e98 100644 --- a/sys/kern/vfs_default.c +++ b/sys/kern/vfs_default.c @@ -138,6 +138,18 @@ vop_panic(struct vop_generic_args *ap) panic("illegal vnode op called"); } +/* + * vop_nostrategy: + * + * Strategy routine for VFS devices that have none. + * + * B_ERROR and B_INVAL must be cleared prior to calling any strategy + * routine. Typically this is done for a B_READ strategy call. Typically + * B_INVAL is assumed to already be clear prior to a write and should not + * be cleared manually unless you just made the buffer invalid. B_ERROR + * should be cleared either way. + */ + static int vop_nostrategy (struct vop_strategy_args *ap) { diff --git a/sys/nfs/nfs.h b/sys/nfs/nfs.h index bc15a7c6dc28..78a54a2d6be0 100644 --- a/sys/nfs/nfs.h +++ b/sys/nfs/nfs.h @@ -34,7 +34,7 @@ * SUCH DAMAGE. * * @(#)nfs.h 8.4 (Berkeley) 5/1/95 - * $Id: nfs.h,v 1.44 1998/09/07 05:42:15 bde Exp $ + * $Id: nfs.h,v 1.45 1999/02/25 00:03:50 peter Exp $ */ #ifndef _NFS_NFS_H_ @@ -651,8 +651,7 @@ void nfs_disconnect __P((struct nfsmount *)); void nfs_safedisconnect __P((struct nfsmount *)); int nfs_getattrcache __P((struct vnode *, struct vattr *)); int nfsm_strtmbuf __P((struct mbuf **, char **, const char *, long)); -int nfs_bioread __P((struct vnode *, struct uio *, int, struct ucred *, - int)); +int nfs_bioread __P((struct vnode *, struct uio *, int, struct ucred *)); int nfsm_uiotombuf __P((struct uio *, struct mbuf **, int, caddr_t *)); void nfsrv_init __P((int)); void nfs_clearcommit __P((struct mount *)); diff --git a/sys/nfs/nfs_bio.c b/sys/nfs/nfs_bio.c index cef982bb26fd..0d8a7828141c 100644 --- a/sys/nfs/nfs_bio.c +++ b/sys/nfs/nfs_bio.c @@ -34,7 +34,7 @@ * SUCH DAMAGE. 
* * @(#)nfs_bio.c 8.9 (Berkeley) 3/30/95 - * $Id: nfs_bio.c,v 1.68 1999/04/05 19:38:28 julian Exp $ + * $Id: nfs_bio.c,v 1.69 1999/04/06 03:07:54 peter Exp $ */ @@ -65,7 +65,6 @@ static struct buf *nfs_getcacheblk __P((struct vnode *vp, daddr_t bn, int size, struct proc *p)); -static void nfs_prot_buf __P((struct buf *bp, int off, int n)); extern int nfs_numasync; extern int nfs_pbuf_freecnt; @@ -84,7 +83,7 @@ nfs_getpages(ap) vm_ooffset_t a_offset; } */ *ap; { - int i, error, nextoff, size, toff, npages, count; + int i, error, nextoff, size, toff, count, npages; struct uio uio; struct iovec iov; vm_offset_t kva; @@ -110,13 +109,35 @@ nfs_getpages(ap) if ((nmp->nm_flag & NFSMNT_NFSV3) != 0 && (nmp->nm_state & NFSSTA_GOTFSINFO) == 0) (void)nfs_fsinfo(nmp, vp, cred, p); + + npages = btoc(count); + + /* + * If the requested page is partially valid, just return it and + * allow the pager to zero-out the blanks. Partially valid pages + * can only occur at the file EOF. + */ + + { + vm_page_t m = pages[ap->a_reqpage]; + + if (m->valid != 0) { + /* handled by vm_fault now */ + /* vm_page_zero_invalid(m, TRUE); */ + for (i = 0; i < npages; ++i) { + if (i != ap->a_reqpage) + vnode_pager_freepage(pages[i]); + } + return(0); + } + } + /* * We use only the kva address for the buffer, but this is extremely * convienient and fast. */ bp = getpbuf(&nfs_pbuf_freecnt); - npages = btoc(count); kva = (vm_offset_t) bp->b_data; pmap_qenter(kva, pages, npages); @@ -167,12 +188,12 @@ nfs_getpages(ap) m->dirty = 0; } else if (size > toff) { /* - * Read operation filled a partial page, set valid - * bits properly. validclean will zero out - * any cruft in the buffer when setting a valid bit, - * if the size is not DEV_BSIZE aligned. + * Read operation filled a partial page. */ + m->valid = 0; vm_page_set_validclean(m, 0, size - toff); + /* handled by vm_fault now */ + /* vm_page_zero_invalid(m, TRUE); */ } if (i != ap->a_reqpage) { @@ -197,13 +218,6 @@ nfs_getpages(ap) } else { vnode_pager_freepage(m); } - } else { - /* - * This page is being mapped, clear out any other - * cruft in the invalid areas of the page. - */ - if (m->valid && m->valid != VM_PAGE_BITS_ALL) - vm_page_zero_invalid(m, FALSE); } } return 0; @@ -228,14 +242,17 @@ nfs_putpages(ap) vm_offset_t kva; struct buf *bp; int iomode, must_commit, i, error, npages, count; + off_t offset; int *rtvals; struct vnode *vp; struct proc *p; struct ucred *cred; struct nfsmount *nmp; + struct nfsnode *np; vm_page_t *pages; vp = ap->a_vp; + np = VTONFS(vp); p = curproc; /* XXX */ cred = curproc->p_ucred; /* XXX */ nmp = VFSTONFS(vp->v_mount); @@ -243,6 +260,7 @@ nfs_putpages(ap) count = ap->a_count; rtvals = ap->a_rtvals; npages = btoc(count); + offset = IDX_TO_OFF(pages[0]->pindex); if ((nmp->nm_flag & NFSMNT_NFSV3) != 0 && (nmp->nm_state & NFSSTA_GOTFSINFO) == 0) @@ -252,6 +270,16 @@ nfs_putpages(ap) rtvals[i] = VM_PAGER_AGAIN; } + /* + * When putting pages, do not extend file past EOF. + */ + + if (offset + count > np->n_size) { + count = np->n_size - offset; + if (count < 0) + count = 0; + } + /* * We use only the kva address for the buffer, but this is extremely * convienient and fast. 
@@ -265,7 +293,7 @@ nfs_putpages(ap) iov.iov_len = count; uio.uio_iov = &iov; uio.uio_iovcnt = 1; - uio.uio_offset = IDX_TO_OFF(pages[0]->pindex); + uio.uio_offset = offset; uio.uio_resid = count; uio.uio_segflg = UIO_SYSSPACE; uio.uio_rw = UIO_WRITE; @@ -297,23 +325,21 @@ nfs_putpages(ap) * Vnode op for read using bio */ int -nfs_bioread(vp, uio, ioflag, cred, getpages) +nfs_bioread(vp, uio, ioflag, cred) register struct vnode *vp; register struct uio *uio; int ioflag; struct ucred *cred; - int getpages; { register struct nfsnode *np = VTONFS(vp); register int biosize, i; - off_t diff; struct buf *bp = 0, *rabp; struct vattr vattr; struct proc *p; struct nfsmount *nmp = VFSTONFS(vp->v_mount); daddr_t lbn, rabn; - int bufsize; - int nra, error = 0, n = 0, on = 0, not_readin; + int bcount; + int nra, error = 0, n = 0, on = 0; #ifdef DIAGNOSTIC if (uio->uio_rw != UIO_READ) @@ -424,7 +450,6 @@ nfs_bioread(vp, uio, ioflag, cred, getpages) nfsstats.biocache_reads++; lbn = uio->uio_offset / biosize; on = uio->uio_offset & (biosize - 1); - not_readin = 1; /* * Start the read ahead(s), as required. @@ -439,7 +464,6 @@ nfs_bioread(vp, uio, ioflag, cred, getpages) return (EINTR); if ((rabp->b_flags & (B_CACHE|B_DELWRI)) == 0) { rabp->b_flags |= (B_READ | B_ASYNC); - rabp->b_flags &= ~B_DONE; vfs_busy_pages(rabp, 0); if (nfs_asyncio(rabp, cred)) { rabp->b_flags |= B_INVAL|B_ERROR; @@ -453,47 +477,31 @@ nfs_bioread(vp, uio, ioflag, cred, getpages) } /* - * If the block is in the cache and has the required data - * in a valid region, just copy it out. - * Otherwise, get the block and write back/read in, - * as required. + * Obtain the buffer cache block. Figure out the buffer size + * when we are at EOF. nfs_getcacheblk() will also force + * uncached delayed-writes to be flushed to the server. + * + * Note that bcount is *not* DEV_BSIZE aligned. */ -again: - bufsize = biosize; - if ((off_t)(lbn + 1) * biosize > np->n_size && - (off_t)(lbn + 1) * biosize - np->n_size < biosize) { - bufsize = np->n_size - (off_t)lbn * biosize; - bufsize = (bufsize + DEV_BSIZE - 1) & ~(DEV_BSIZE - 1); + + bcount = biosize; + if ((off_t)lbn * biosize >= np->n_size) { + bcount = 0; + } else if ((off_t)(lbn + 1) * biosize > np->n_size) { + bcount = np->n_size - (off_t)lbn * biosize; } - bp = nfs_getcacheblk(vp, lbn, bufsize, p); + + bp = nfs_getcacheblk(vp, lbn, bcount, p); if (!bp) return (EINTR); /* - * If we are being called from nfs_getpages, we must - * make sure the buffer is a vmio buffer. The vp will - * already be setup for vmio but there may be some old - * non-vmio buffers attached to it. + * If B_CACHE is not set, we must issue the read. If this + * fails, we return an error. 
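
The bcount computation above clamps the buffer to the file EOF and is deliberately not DEV_BSIZE aligned. Here is a small standalone sketch of the same arithmetic, using assumed example numbers (biosize 8192, file size 20000 bytes).

#include <stdio.h>

/* Clamp an NFS buffer's byte count to the file EOF (model of the code above). */
static long
clamp_bcount(long lbn, long biosize, long n_size)
{
	long bcount = biosize;

	if (lbn * biosize >= n_size)
		bcount = 0;				/* block starts at/after EOF */
	else if ((lbn + 1) * biosize > n_size)
		bcount = n_size - lbn * biosize;	/* partial block at EOF */
	return (bcount);
}

int
main(void)
{
	long biosize = 8192, n_size = 20000;	/* assumed example values */
	long lbn;

	for (lbn = 0; lbn < 4; lbn++)
		printf("lbn %ld -> bcount %ld\n", lbn,
		    clamp_bcount(lbn, biosize, n_size));
	return (0);
}
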
*/ - if (getpages && !(bp->b_flags & B_VMIO)) { -#ifdef DIAGNOSTIC - printf("nfs_bioread: non vmio buf found, discarding\n"); -#endif - bp->b_flags |= B_NOCACHE; - bp->b_flags |= B_INVAFTERWRITE; - if (bp->b_dirtyend > 0) { - if ((bp->b_flags & B_DELWRI) == 0) - panic("nfsbioread"); - if (VOP_BWRITE(bp) == EINTR) - return (EINTR); - } else - brelse(bp); - goto again; - } + if ((bp->b_flags & B_CACHE) == 0) { bp->b_flags |= B_READ; - bp->b_flags &= ~(B_DONE | B_ERROR | B_INVAL); - not_readin = 0; vfs_busy_pages(bp, 0); error = nfs_doio(bp, cred, p); if (error) { @@ -501,32 +509,20 @@ again: return (error); } } - if (bufsize > on) { - n = min((unsigned)(bufsize - on), uio->uio_resid); - } else { - n = 0; - } - diff = np->n_size - uio->uio_offset; - if (diff < n) - n = diff; - if (not_readin && n > 0) { - if (on < bp->b_validoff || (on + n) > bp->b_validend) { - bp->b_flags |= B_NOCACHE; - bp->b_flags |= B_INVAFTERWRITE; - if (bp->b_dirtyend > 0) { - if ((bp->b_flags & B_DELWRI) == 0) - panic("nfsbioread"); - if (VOP_BWRITE(bp) == EINTR) - return (EINTR); - } else - brelse(bp); - goto again; - } - } + + /* + * on is the offset into the current bp. Figure out how many + * bytes we can copy out of the bp. Note that bcount is + * NOT DEV_BSIZE aligned. + * + * Then figure out how many bytes we can copy into the uio. + */ + + n = 0; + if (on < bcount) + n = min((unsigned)(bcount - on), uio->uio_resid); + vp->v_lastr = lbn; - diff = (on >= bp->b_validend) ? 0 : (bp->b_validend - on); - if (diff < n) - n = diff; break; case VLNK: nfsstats.biocache_readlinks++; @@ -535,7 +531,6 @@ again: return (EINTR); if ((bp->b_flags & B_CACHE) == 0) { bp->b_flags |= B_READ; - bp->b_flags &= ~B_DONE; vfs_busy_pages(bp, 0); error = nfs_doio(bp, cred, p); if (error) { @@ -560,13 +555,13 @@ again: return (EINTR); if ((bp->b_flags & B_CACHE) == 0) { bp->b_flags |= B_READ; - bp->b_flags &= ~B_DONE; vfs_busy_pages(bp, 0); error = nfs_doio(bp, cred, p); if (error) { brelse(bp); } while (error == NFSERR_BAD_COOKIE) { + printf("got bad cookie vp %p bp %p\n", vp, bp); nfs_invaldir(vp); error = nfs_vinvalbuf(vp, 0, cred, p, 1); /* @@ -574,6 +569,10 @@ again: * server. The only way to get the block is by * reading from the beginning to get all the * offset cookies. + * + * Leave the last bp intact unless there is an error. + * Loop back up to the while if the error is another + * NFSERR_BAD_COOKIE (double yuch!). */ for (i = 0; i <= lbn && !error; i++) { if (np->n_direofoffset @@ -582,21 +581,32 @@ again: bp = nfs_getcacheblk(vp, i, NFS_DIRBLKSIZ, p); if (!bp) return (EINTR); - if ((bp->b_flags & B_DONE) == 0) { - bp->b_flags |= B_READ; - bp->b_flags &= ~B_DONE; - vfs_busy_pages(bp, 0); - error = nfs_doio(bp, cred, p); - if (error == 0 && (bp->b_flags & B_INVAL)) - break; - if (error) { - brelse(bp); - } else if (i < lbn) { - brelse(bp); - } + if ((bp->b_flags & B_CACHE) == 0) { + bp->b_flags |= B_READ; + vfs_busy_pages(bp, 0); + error = nfs_doio(bp, cred, p); + /* + * no error + B_INVAL == directory EOF, + * use the block. + */ + if (error == 0 && (bp->b_flags & B_INVAL)) + break; } + /* + * An error will throw away the block and the + * for loop will break out. If no error and this + * is not the block we want, we throw away the + * block and go for the next one via the for loop. + */ + if (error || i < lbn) + brelse(bp); } } + /* + * The above while is repeated if we hit another cookie + * error. If we hit an error and it wasn't a cookie error, + * we give up. 
+ */ if (error) return (error); } @@ -616,7 +626,6 @@ again: if (rabp) { if ((rabp->b_flags & (B_CACHE|B_DELWRI)) == 0) { rabp->b_flags |= (B_READ | B_ASYNC); - rabp->b_flags &= ~B_DONE; vfs_busy_pages(rabp, 0); if (nfs_asyncio(rabp, cred)) { rabp->b_flags |= B_INVAL|B_ERROR; @@ -629,10 +638,20 @@ again: } } /* - * Make sure we use a signed variant of min() since - * the second term may be negative. + * Unlike VREG files, whos buffer size ( bp->b_bcount ) is + * chopped for the EOF condition, we cannot tell how large + * NFS directories are going to be until we hit EOF. So + * an NFS directory buffer is *not* chopped to its EOF. Now, + * it just so happens that b_resid will effectively chop it + * to EOF. *BUT* this information is lost if the buffer goes + * away and is reconstituted into a B_CACHE state ( due to + * being VMIO ) later. So we keep track of the directory eof + * in np->n_direofoffset and chop it off as an extra step + * right here. */ n = lmin(uio->uio_resid, NFS_DIRBLKSIZ - bp->b_resid - on); + if (np->n_direofoffset && n > np->n_direofoffset - uio->uio_offset) + n = np->n_direofoffset - uio->uio_offset; break; default: printf(" nfs_bioread: type %x unexpected\n",vp->v_type); @@ -649,6 +668,10 @@ again: n = 0; break; case VDIR: + /* + * Invalidate buffer if caching is disabled, forcing a + * re-read from the remote later. + */ if (np->n_flag & NQNFSNONCACHE) bp->b_flags |= B_INVAL; break; @@ -660,24 +683,6 @@ again: return (error); } -static void -nfs_prot_buf(bp, off, n) - struct buf *bp; - int off; - int n; -{ - int pindex, boff, end; - - if ((bp->b_flags & B_VMIO) == 0) - return; - - end = round_page(off + n); - for (boff = trunc_page(off); boff < end; boff += PAGE_SIZE) { - pindex = boff >> PAGE_SHIFT; - vm_page_protect(bp->b_pages[pindex], VM_PROT_NONE); - } -} - /* * Vnode op for write using bio */ @@ -690,18 +695,18 @@ nfs_write(ap) struct ucred *a_cred; } */ *ap; { - register int biosize; - register struct uio *uio = ap->a_uio; + int biosize; + struct uio *uio = ap->a_uio; struct proc *p = uio->uio_procp; - register struct vnode *vp = ap->a_vp; + struct vnode *vp = ap->a_vp; struct nfsnode *np = VTONFS(vp); - register struct ucred *cred = ap->a_cred; + struct ucred *cred = ap->a_cred; int ioflag = ap->a_ioflag; struct buf *bp; struct vattr vattr; struct nfsmount *nmp = VFSTONFS(vp->v_mount); daddr_t lbn; - int bufsize; + int bcount; int n, on, error = 0, iomode, must_commit; #ifdef DIAGNOSTIC @@ -749,12 +754,9 @@ nfs_write(ap) psignal(p, SIGXFSZ); return (EFBIG); } - /* - * I use nm_rsize, not nm_wsize so that all buffer cache blocks - * will be the same size within a filesystem. nfs_writerpc will - * still use nm_wsize when sizing the rpc's. - */ + biosize = vp->v_mount->mnt_stat.f_iosize; + do { /* * Check for a valid write lease. @@ -786,17 +788,74 @@ nfs_write(ap) on = uio->uio_offset & (biosize-1); n = min((unsigned)(biosize - on), uio->uio_resid); again: - if (uio->uio_offset + n > np->n_size) { + /* + * Handle direct append and file extension cases, calculate + * unaligned buffer size. + */ + + if (uio->uio_offset == np->n_size && n) { + /* + * special append case. Obtain buffer prior to + * resizing it to maintain B_CACHE. 
+ */ + long save; + + bcount = on; + bp = nfs_getcacheblk(vp, lbn, bcount, p); + save = bp->b_flags & B_CACHE; + np->n_size = uio->uio_offset + n; np->n_flag |= NMODIFIED; vnode_pager_setsize(vp, np->n_size); + + bcount += n; + allocbuf(bp, bcount); + bp->b_flags |= save; + } else { + if (uio->uio_offset + n > np->n_size) { + np->n_size = uio->uio_offset + n; + np->n_flag |= NMODIFIED; + vnode_pager_setsize(vp, np->n_size); + } + bcount = biosize; + if ((off_t)(lbn + 1) * biosize > np->n_size) + bcount = np->n_size - (off_t)lbn * biosize; + bp = nfs_getcacheblk(vp, lbn, bcount, p); } - bufsize = biosize; - if ((off_t)(lbn + 1) * biosize > np->n_size) { - bufsize = np->n_size - (off_t)lbn * biosize; - bufsize = (bufsize + DEV_BSIZE - 1) & ~(DEV_BSIZE - 1); + + /* + * Issue a READ if B_CACHE is not set. In special-append + * mode, B_CACHE is based on the buffer prior to the write + * op and is typically set, avoiding the read. If a read + * is required in special append mode, the server will + * probably send us a short-read since we extended the file + * on our end, resulting in b_resid == 0 and, thusly, + * B_CACHE getting set. + * + * We can also avoid issuing the read if the write covers + * the entire buffer. We have to make sure the buffer state + * is reasonable in this case since we will not be initiating + * I/O. See the comments in kern/vfs_bio.c's getblk() for + * more information. + * + * B_CACHE may also be set due to the buffer being cached + * normally. + */ + + if (on == 0 && n == bcount) { + bp->b_flags |= B_CACHE; + bp->b_flags &= ~(B_ERROR | B_INVAL); + } + + if ((bp->b_flags & B_CACHE) == 0) { + bp->b_flags |= B_READ; + vfs_busy_pages(bp, 0); + error = nfs_doio(bp, cred, p); + if (error) { + brelse(bp); + return (error); + } } - bp = nfs_getcacheblk(vp, lbn, bufsize, p); if (!bp) return (EINTR); if (bp->b_wcred == NOCRED) { @@ -820,6 +879,17 @@ again: * If the new write will leave a contiguous dirty * area, just update the b_dirtyoff and b_dirtyend, * otherwise force a write rpc of the old dirty area. + * + * While it is possible to merge discontiguous writes due to + * our having a B_CACHE buffer ( and thus valid read data + * for the hole), we don't because it could lead to + * significant cache coherency problems with multiple clients, + * especially if locking is implemented later on. + * + * as an optimization we could theoretically maintain + * a linked list of discontinuous areas, but we would still + * have to commit them separately so there isn't much + * advantage to it except perhaps a bit of asynchronization. */ if (bp->b_dirtyend > 0 && @@ -862,11 +932,6 @@ again: return (error); } - /* - * This will keep the buffer and mmaped regions more coherent. - */ - nfs_prot_buf(bp, on, n); - /* * Only update dirtyoff/dirtyend if not a degenerate * condition. @@ -879,21 +944,7 @@ again: bp->b_dirtyoff = on; bp->b_dirtyend = on + n; } - } - - /* - * To avoid code complexity, we may have to throw away - * previously valid ranges when merging the new dirty range - * into the valid range. As long as we do not *ADD* an - * invalid valid range, we are ok. - */ - if (bp->b_validend == 0 || bp->b_validend < bp->b_dirtyoff || - bp->b_validoff > bp->b_dirtyend) { - bp->b_validoff = bp->b_dirtyoff; - bp->b_validend = bp->b_dirtyend; - } else { - bp->b_validoff = min(bp->b_validoff, bp->b_dirtyoff); - bp->b_validend = max(bp->b_validend, bp->b_dirtyend); + vfs_bio_set_validclean(bp, on, n); } /* @@ -904,11 +955,14 @@ again: /* * If the lease is non-cachable or IO_SYNC do bwrite(). 
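
The contiguity rule discussed above (merge a new write into b_dirtyoff/b_dirtyend only when the result stays one contiguous dirty region, otherwise push the old region out first) can be modelled in isolation as below; the condition is an illustration of the stated rule, not a copy of the kernel's exact test.

#include <stdio.h>

/* One dirty byte range within a buffer, [dirtyoff, dirtyend). */
struct dirty_range { int dirtyoff, dirtyend; };

/*
 * Try to merge a new write [on, on+n) into the existing dirty range.
 * Returns 1 on success, 0 if the old range must be written out first
 * because the result would not be contiguous.
 */
static int
merge_dirty(struct dirty_range *dr, int on, int n)
{
	if (dr->dirtyend > 0 &&
	    (on > dr->dirtyend || on + n < dr->dirtyoff))
		return (0);			/* disjoint: flush old range */

	if (dr->dirtyend > 0) {
		if (on < dr->dirtyoff)
			dr->dirtyoff = on;
		if (on + n > dr->dirtyend)
			dr->dirtyend = on + n;
	} else {
		dr->dirtyoff = on;
		dr->dirtyend = on + n;
	}
	return (1);
}

int
main(void)
{
	struct dirty_range dr = { 0, 0 };
	int ok;

	ok = merge_dirty(&dr, 0, 512);
	printf("write 0..512:     merged=%d [%d,%d)\n", ok, dr.dirtyoff, dr.dirtyend);
	ok = merge_dirty(&dr, 512, 512);
	printf("write 512..1024:  merged=%d [%d,%d)\n", ok, dr.dirtyoff, dr.dirtyend);
	ok = merge_dirty(&dr, 4096, 512);
	printf("write 4096..4608: merged=%d [%d,%d)\n", ok, dr.dirtyoff, dr.dirtyend);
	return (0);
}
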
+ * + * IO_INVAL appears to be unused. The idea appears to be + * to turn off caching in this case. Very odd. XXX */ if ((np->n_flag & NQNFSNONCACHE) || (ioflag & IO_SYNC)) { bp->b_proc = p; if (ioflag & IO_INVAL) - bp->b_flags |= B_INVAL; + bp->b_flags |= B_NOCACHE; error = VOP_BWRITE(bp); if (error) return (error); @@ -922,8 +976,9 @@ again: bp->b_proc = (struct proc *)0; bp->b_flags |= B_ASYNC; (void)nfs_writebp(bp, 0); - } else + } else { bdwrite(bp); + } } while (uio->uio_resid > 0 && n > 0); return (0); } @@ -956,15 +1011,16 @@ nfs_getcacheblk(vp, bn, size, p) return ((struct buf *)0); bp = getblk(vp, bn, size, 0, 2 * hz); } - } else + } else { bp = getblk(vp, bn, size, 0, 0); + } if (vp->v_type == VREG) { int biosize; + biosize = mp->mnt_stat.f_iosize; bp->b_blkno = bn * (biosize / DEV_BSIZE); } - return (bp); } @@ -1036,6 +1092,9 @@ nfs_vinvalbuf(vp, flags, cred, p, intrflg) * Initiate asynchronous I/O. Return an error if no nfsiods are available. * This is mainly to avoid queueing async I/O requests when the nfsiods * are all hung on a dead server. + * + * Note: nfs_asyncio() does not clear (B_ERROR|B_INVAL) but when the bp + * is eventually dequeued by the async daemon, nfs_doio() *will*. */ int nfs_asyncio(bp, cred) @@ -1164,7 +1223,7 @@ nfs_doio(bp, cr, p) struct vnode *vp; struct nfsnode *np; struct nfsmount *nmp; - int error = 0, diff, len, iomode, must_commit = 0; + int error = 0, iomode, must_commit = 0; struct uio uio; struct iovec io; @@ -1177,6 +1236,13 @@ nfs_doio(bp, cr, p) uiop->uio_segflg = UIO_SYSSPACE; uiop->uio_procp = p; + /* + * clear B_ERROR and B_INVAL state prior to initiating the I/O. We + * do this here so we do not have to do it in all the code that + * calls us. + */ + bp->b_flags &= ~(B_ERROR | B_INVAL); + KASSERT(!(bp->b_flags & B_DONE), ("nfs_doio: bp %p already marked done", bp)); /* @@ -1216,25 +1282,22 @@ nfs_doio(bp, cr, p) nfsstats.read_bios++; error = nfs_readrpc(vp, uiop, cr); if (!error) { - bp->b_validoff = 0; if (uiop->uio_resid) { /* - * If len > 0, there is a hole in the file and - * no writes after the hole have been pushed to - * the server yet. - * Just zero fill the rest of the valid area. + * If we had a short read with no error, we must have + * hit a file hole. We should zero-fill the remainder. + * This can also occur if the server hits the file EOF. + * + * Holes used to be able to occur due to pending + * writes, but that is not possible any longer. */ - diff = bp->b_bcount - uiop->uio_resid; - len = np->n_size - (((u_quad_t)bp->b_blkno) * DEV_BSIZE - + diff); - if (len > 0) { - len = min(len, uiop->uio_resid); - bzero((char *)bp->b_data + diff, len); - bp->b_validend = diff + len; - } else - bp->b_validend = diff; - } else - bp->b_validend = bp->b_bcount; + int nread = bp->b_bcount - uiop->uio_resid; + int left = bp->b_bcount - nread; + + if (left > 0) + bzero((char *)bp->b_data + nread, left); + uiop->uio_resid = 0; + } } if (p && (vp->v_flag & VTEXT) && (((nmp->nm_flag & NFSMNT_NQNFS) && @@ -1262,6 +1325,10 @@ nfs_doio(bp, cr, p) } if ((nmp->nm_flag & NFSMNT_RDIRPLUS) == 0) error = nfs_readdirrpc(vp, uiop, cr); + /* + * end-of-directory sets B_INVAL but does not generate an + * error. 
+ */ if (error == 0 && uiop->uio_resid == bp->b_bcount) bp->b_flags |= B_INVAL; break; @@ -1296,7 +1363,7 @@ nfs_doio(bp, cr, p) if (!error && iomode == NFSV3WRITE_UNSTABLE) { bp->b_flags |= B_NEEDCOMMIT; if (bp->b_dirtyoff == 0 - && bp->b_dirtyend == bp->b_bufsize) + && bp->b_dirtyend == bp->b_bcount) bp->b_flags |= B_CLUSTEROK; } else { bp->b_flags &= ~B_NEEDCOMMIT; diff --git a/sys/nfs/nfs_nqlease.c b/sys/nfs/nfs_nqlease.c index 71f692aeadf3..e45c73f559c2 100644 --- a/sys/nfs/nfs_nqlease.c +++ b/sys/nfs/nfs_nqlease.c @@ -34,7 +34,7 @@ * SUCH DAMAGE. * * @(#)nfs_nqlease.c 8.9 (Berkeley) 5/20/95 - * $Id: nfs_nqlease.c,v 1.39 1998/10/31 15:31:25 peter Exp $ + * $Id: nfs_nqlease.c,v 1.40 1999/02/25 00:03:51 peter Exp $ */ @@ -561,6 +561,10 @@ nqsrv_send_eviction(vp, lp, slp, nam, cred) *mtod(m, u_int32_t *) = htonl(0x80000000 | (m->m_pkthdr.len - NFSX_UNSIGNED)); } + /* + * nfs_sndlock if PR_CONNREQUIRED XXX + */ + if (((lph->lph_flag & (LC_UDP | LC_CLTP)) == 0 && (lph->lph_slp->ns_flag & SLP_VALID) == 0) || (nfs_slplock(lph->lph_slp, 0) == 0)) diff --git a/sys/nfs/nfs_socket.c b/sys/nfs/nfs_socket.c index 1490f724a68f..2267629116b3 100644 --- a/sys/nfs/nfs_socket.c +++ b/sys/nfs/nfs_socket.c @@ -34,7 +34,7 @@ * SUCH DAMAGE. * * @(#)nfs_socket.c 8.5 (Berkeley) 3/30/95 - * $Id: nfs_socket.c,v 1.50 1999/02/25 00:03:51 peter Exp $ + * $Id: nfs_socket.c,v 1.51 1999/04/24 11:29:48 dt Exp $ */ /* @@ -54,6 +54,7 @@ #include #include #include +#include #include #include @@ -115,6 +116,15 @@ static int proct[NFS_NPROCS] = { 0, 0, 0, }; +static int nfs_realign_test; +static int nfs_realign_count; + +SYSCTL_DECL(_vfs_nfs); + +SYSCTL_INT(_vfs_nfs, OID_AUTO, realign_test, CTLFLAG_RD, &nfs_realign_test, 0, ""); +SYSCTL_INT(_vfs_nfs, OID_AUTO, realign_count, CTLFLAG_RD, &nfs_realign_count, 0, ""); + + /* * There is a congestion window for outstanding rpcs maintained per mount * point. The cwnd size is adjusted in roughly the way that: @@ -138,7 +148,7 @@ struct callout_handle nfs_timer_handle; static int nfs_msg __P((struct proc *,char *,char *)); static int nfs_rcvlock __P((struct nfsreq *)); static void nfs_rcvunlock __P((struct nfsreq *)); -static void nfs_realign __P((struct mbuf *m, int hsiz)); +static void nfs_realign __P((struct mbuf **pm, int hsiz)); static int nfs_receive __P((struct nfsreq *rep, struct sockaddr **aname, struct mbuf **mp)); static int nfs_reconnect __P((struct nfsreq *rep)); @@ -702,7 +712,7 @@ errout: * These could cause pointer alignment problems, so copy them to * well aligned mbufs. */ - nfs_realign(*mp, 5 * NFSX_UNSIGNED); + nfs_realign(mp, 5 * NFSX_UNSIGNED); return (error); } @@ -1589,92 +1599,56 @@ nfs_rcvunlock(rep) } /* - * Check for badly aligned mbuf data areas and - * realign data in an mbuf list by copying the data areas up, as required. + * nfs_realign: + * + * Check for badly aligned mbuf data and realign by copying the unaligned + * portion of the data into a new mbuf chain and freeing the portions + * of the old chain that were replaced. + * + * We cannot simply realign the data within the existing mbuf chain + * because the underlying buffers may contain other rpc commands and + * we cannot afford to overwrite them. + * + * We would prefer to avoid this situation entirely. The situation does + * not occur with NFS/UDP and is supposed to only occassionally occur + * with TCP. Use vfs.nfs.realign_count and realign_test to check this. 
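/*
 * A minimal user-space sketch of the alignment test that drives the
 * rewritten nfs_realign() below: a chunk needs realigning when either its
 * length or its start address is not a multiple of four.  Assumes a 32-bit
 * alignment requirement as in the kernel test; the function name is
 * illustrative only.
 */
#include <stdint.h>
#include <stddef.h>

static int
chunk_needs_realign(const void *data, size_t len)
{
	/* mirrors: (m->m_len & 0x3) || (mtod(m, intptr_t) & 0x3) */
	return ((len & 0x3) != 0) || (((uintptr_t)data & 0x3) != 0);
}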
*/ static void -nfs_realign(m, hsiz) - register struct mbuf *m; +nfs_realign(pm, hsiz) + register struct mbuf **pm; int hsiz; { - register struct mbuf *m2; - register int siz, mlen, olen; - register caddr_t tcp, fcp; - struct mbuf *mnew; + struct mbuf *m; + struct mbuf *n = NULL; + int off = 0; - while (m) { - /* - * This never happens for UDP, rarely happens for TCP - * but frequently happens for iso transport. - */ - if ((m->m_len & 0x3) || (mtod(m, intptr_t) & 0x3)) { - olen = m->m_len; - fcp = mtod(m, caddr_t); - if ((intptr_t)fcp & 0x3) { - m->m_flags &= ~M_PKTHDR; - if (m->m_flags & M_EXT) - m->m_data = m->m_ext.ext_buf + - ((m->m_ext.ext_size - olen) & ~0x3); - else - m->m_data = m->m_dat; + ++nfs_realign_test; + + while ((m = *pm) != NULL) { + if ((m->m_len & 0x3) || (mtod(m, intptr_t) & 0x3)) { + MGET(n, M_WAIT, MT_DATA); + if (m->m_len >= MINCLSIZE) { + MCLGET(n, M_WAIT); + } + n->m_len = 0; + break; } - m->m_len = 0; - tcp = mtod(m, caddr_t); - mnew = m; - m2 = m->m_next; + pm = &m->m_next; + } - /* - * If possible, only put the first invariant part - * of the RPC header in the first mbuf. - */ - mlen = M_TRAILINGSPACE(m); - if (olen <= hsiz && mlen > hsiz) - mlen = hsiz; - - /* - * Loop through the mbuf list consolidating data. - */ + /* + * If n is non-NULL, loop on m copying data, then replace the + * portion of the chain that had to be realigned. + */ + if (n != NULL) { + ++nfs_realign_count; while (m) { - while (olen > 0) { - if (mlen == 0) { - m2->m_flags &= ~M_PKTHDR; - if (m2->m_flags & M_EXT) - m2->m_data = m2->m_ext.ext_buf; - else - m2->m_data = m2->m_dat; - m2->m_len = 0; - mlen = M_TRAILINGSPACE(m2); - tcp = mtod(m2, caddr_t); - mnew = m2; - m2 = m2->m_next; - } - siz = min(mlen, olen); - if (tcp != fcp) - bcopy(fcp, tcp, siz); - mnew->m_len += siz; - mlen -= siz; - olen -= siz; - tcp += siz; - fcp += siz; - } + m_copyback(n, off, m->m_len, mtod(m, caddr_t)); + off += m->m_len; m = m->m_next; - if (m) { - olen = m->m_len; - fcp = mtod(m, caddr_t); - } } - - /* - * Finally, set m_len == 0 for any trailing mbufs that have - * been copied out of. - */ - while (m2) { - m2->m_len = 0; - m2 = m2->m_next; - } - return; - } - m = m->m_next; + m_freem(*pm); + *pm = n; } } @@ -2040,7 +2014,7 @@ nfsrv_rcv(so, arg, waitflag) m_freem(mp); continue; } - nfs_realign(mp, 10 * NFSX_UNSIGNED); + nfs_realign(&mp, 10 * NFSX_UNSIGNED); rec->nr_address = nam; rec->nr_packet = mp; STAILQ_INSERT_TAIL(&slp->ns_rec, rec, nr_link); @@ -2182,7 +2156,7 @@ nfsrv_getstream(slp, waitflag) if (!rec) { m_freem(slp->ns_frag); } else { - nfs_realign(slp->ns_frag, 10 * NFSX_UNSIGNED); + nfs_realign(&slp->ns_frag, 10 * NFSX_UNSIGNED); rec->nr_address = (struct sockaddr *)0; rec->nr_packet = slp->ns_frag; STAILQ_INSERT_TAIL(&slp->ns_rec, rec, nr_link); diff --git a/sys/nfs/nfs_vnops.c b/sys/nfs/nfs_vnops.c index a92bb2295811..6114d56f340a 100644 --- a/sys/nfs/nfs_vnops.c +++ b/sys/nfs/nfs_vnops.c @@ -34,7 +34,7 @@ * SUCH DAMAGE. 
* * @(#)nfs_vnops.c 8.16 (Berkeley) 5/27/95 - * $Id: nfs_vnops.c,v 1.123 1999/02/16 10:49:54 dfr Exp $ + * $Id: nfs_vnops.c,v 1.124 1999/03/12 02:24:58 julian Exp $ */ @@ -408,9 +408,9 @@ nfs_access(ap) error = nfs_readrpc(vp, &auio, ap->a_cred); else if (vp->v_type == VDIR) { char* bp; - bp = malloc(NFS_DIRBLKSIZ, M_TEMP, M_WAITOK); + bp = malloc(DIRBLKSIZ, M_TEMP, M_WAITOK); aiov.iov_base = bp; - aiov.iov_len = auio.uio_resid = NFS_DIRBLKSIZ; + aiov.iov_len = auio.uio_resid = DIRBLKSIZ; error = nfs_readdirrpc(vp, &auio, ap->a_cred); free(bp, M_TEMP); } else if (vp->v_type == VLNK) @@ -962,7 +962,7 @@ nfs_read(ap) if (vp->v_type != VREG) return (EPERM); - return (nfs_bioread(vp, ap->a_uio, ap->a_ioflag, ap->a_cred, 0)); + return (nfs_bioread(vp, ap->a_uio, ap->a_ioflag, ap->a_cred)); } /* @@ -980,7 +980,7 @@ nfs_readlink(ap) if (vp->v_type != VLNK) return (EINVAL); - return (nfs_bioread(vp, ap->a_uio, 0, ap->a_cred, 0)); + return (nfs_bioread(vp, ap->a_uio, 0, ap->a_cred)); } /* @@ -1985,7 +1985,7 @@ nfs_readdir(ap) * Call nfs_bioread() to do the real work. */ tresid = uio->uio_resid; - error = nfs_bioread(vp, uio, 0, ap->a_cred, 0); + error = nfs_bioread(vp, uio, 0, ap->a_cred); if (!error && uio->uio_resid == tresid) nfsstats.direofcache_misses++; @@ -2004,7 +2004,7 @@ nfs_readdirrpc(vp, uiop, cred) { register int len, left; - register struct dirent *dp; + register struct dirent *dp = NULL; register u_int32_t *tl; register caddr_t cp; register int32_t t1, t2; @@ -2019,12 +2019,9 @@ nfs_readdirrpc(vp, uiop, cred) int attrflag; int v3 = NFS_ISV3(vp); -#ifndef nolint - dp = (struct dirent *)0; -#endif #ifndef DIAGNOSTIC - if (uiop->uio_iovcnt != 1 || (uiop->uio_offset & (NFS_DIRBLKSIZ - 1)) || - (uiop->uio_resid & (NFS_DIRBLKSIZ - 1))) + if (uiop->uio_iovcnt != 1 || (uiop->uio_offset & (DIRBLKSIZ - 1)) || + (uiop->uio_resid & (DIRBLKSIZ - 1))) panic("nfs readdirrpc bad uio"); #endif @@ -2381,7 +2378,7 @@ nfs_readdirplusrpc(vp, uiop, cred) m_freem(mrep); } /* - * Fill last record, iff any, out to a multiple of NFS_DIRBLKSIZ + * Fill last record, iff any, out to a multiple of DIRBLKSIZ * by increasing d_reclen for the last record. */ if (blksiz > 0) { @@ -3028,13 +3025,13 @@ nfs_bwrite(ap) struct vnode *a_bp; } */ *ap; { - return (nfs_writebp(ap->a_bp, 1)); } /* * This is a clone of vn_bwrite(), except that B_WRITEINPROG isn't set unless - * the force flag is one and it also handles the B_NEEDCOMMIT flag. + * the force flag is one and it also handles the B_NEEDCOMMIT flag. We set + * B_CACHE if this is a VMIO buffer. */ int nfs_writebp(bp, force) @@ -3049,12 +3046,15 @@ nfs_writebp(bp, force) if(!(bp->b_flags & B_BUSY)) panic("bwrite: buffer is not busy???"); - if (bp->b_flags & B_INVAL) - bp->b_flags |= B_NOCACHE; + if (bp->b_flags & B_INVAL) { + brelse(bp); + return(0); + } + + bp->b_flags |= B_CACHE; /* - * XXX we bundirty() the bp here. Shouldn't we do it later after - * the I/O has completed?? + * Undirty the bp. We will redirty it later if the I/O fails. */ s = splbio(); diff --git a/sys/nfsclient/nfs.h b/sys/nfsclient/nfs.h index bc15a7c6dc28..78a54a2d6be0 100644 --- a/sys/nfsclient/nfs.h +++ b/sys/nfsclient/nfs.h @@ -34,7 +34,7 @@ * SUCH DAMAGE. 
* * @(#)nfs.h 8.4 (Berkeley) 5/1/95 - * $Id: nfs.h,v 1.44 1998/09/07 05:42:15 bde Exp $ + * $Id: nfs.h,v 1.45 1999/02/25 00:03:50 peter Exp $ */ #ifndef _NFS_NFS_H_ @@ -651,8 +651,7 @@ void nfs_disconnect __P((struct nfsmount *)); void nfs_safedisconnect __P((struct nfsmount *)); int nfs_getattrcache __P((struct vnode *, struct vattr *)); int nfsm_strtmbuf __P((struct mbuf **, char **, const char *, long)); -int nfs_bioread __P((struct vnode *, struct uio *, int, struct ucred *, - int)); +int nfs_bioread __P((struct vnode *, struct uio *, int, struct ucred *)); int nfsm_uiotombuf __P((struct uio *, struct mbuf **, int, caddr_t *)); void nfsrv_init __P((int)); void nfs_clearcommit __P((struct mount *)); diff --git a/sys/nfsclient/nfs_bio.c b/sys/nfsclient/nfs_bio.c index cef982bb26fd..0d8a7828141c 100644 --- a/sys/nfsclient/nfs_bio.c +++ b/sys/nfsclient/nfs_bio.c @@ -34,7 +34,7 @@ * SUCH DAMAGE. * * @(#)nfs_bio.c 8.9 (Berkeley) 3/30/95 - * $Id: nfs_bio.c,v 1.68 1999/04/05 19:38:28 julian Exp $ + * $Id: nfs_bio.c,v 1.69 1999/04/06 03:07:54 peter Exp $ */ @@ -65,7 +65,6 @@ static struct buf *nfs_getcacheblk __P((struct vnode *vp, daddr_t bn, int size, struct proc *p)); -static void nfs_prot_buf __P((struct buf *bp, int off, int n)); extern int nfs_numasync; extern int nfs_pbuf_freecnt; @@ -84,7 +83,7 @@ nfs_getpages(ap) vm_ooffset_t a_offset; } */ *ap; { - int i, error, nextoff, size, toff, npages, count; + int i, error, nextoff, size, toff, count, npages; struct uio uio; struct iovec iov; vm_offset_t kva; @@ -110,13 +109,35 @@ nfs_getpages(ap) if ((nmp->nm_flag & NFSMNT_NFSV3) != 0 && (nmp->nm_state & NFSSTA_GOTFSINFO) == 0) (void)nfs_fsinfo(nmp, vp, cred, p); + + npages = btoc(count); + + /* + * If the requested page is partially valid, just return it and + * allow the pager to zero-out the blanks. Partially valid pages + * can only occur at the file EOF. + */ + + { + vm_page_t m = pages[ap->a_reqpage]; + + if (m->valid != 0) { + /* handled by vm_fault now */ + /* vm_page_zero_invalid(m, TRUE); */ + for (i = 0; i < npages; ++i) { + if (i != ap->a_reqpage) + vnode_pager_freepage(pages[i]); + } + return(0); + } + } + /* * We use only the kva address for the buffer, but this is extremely * convienient and fast. */ bp = getpbuf(&nfs_pbuf_freecnt); - npages = btoc(count); kva = (vm_offset_t) bp->b_data; pmap_qenter(kva, pages, npages); @@ -167,12 +188,12 @@ nfs_getpages(ap) m->dirty = 0; } else if (size > toff) { /* - * Read operation filled a partial page, set valid - * bits properly. validclean will zero out - * any cruft in the buffer when setting a valid bit, - * if the size is not DEV_BSIZE aligned. + * Read operation filled a partial page. */ + m->valid = 0; vm_page_set_validclean(m, 0, size - toff); + /* handled by vm_fault now */ + /* vm_page_zero_invalid(m, TRUE); */ } if (i != ap->a_reqpage) { @@ -197,13 +218,6 @@ nfs_getpages(ap) } else { vnode_pager_freepage(m); } - } else { - /* - * This page is being mapped, clear out any other - * cruft in the invalid areas of the page. 
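/*
 * A user-space sketch of the early-out added to nfs_getpages() above: when
 * the requested page already has valid bits (possible only at EOF), the
 * other pages are released and the read is skipped.  The page set is
 * reduced to an array of valid-bit masks; all names are illustrative.
 */
static int
getpages_can_shortcut(const int *valid, int npages, int reqpage, int *release)
{
	int i;

	if (valid[reqpage] == 0)
		return 0;			/* no shortcut, do the real read */
	for (i = 0; i < npages; i++)
		release[i] = (i != reqpage);	/* everything else is freed */
	return 1;				/* report success to the caller */
}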
- */ - if (m->valid && m->valid != VM_PAGE_BITS_ALL) - vm_page_zero_invalid(m, FALSE); } } return 0; @@ -228,14 +242,17 @@ nfs_putpages(ap) vm_offset_t kva; struct buf *bp; int iomode, must_commit, i, error, npages, count; + off_t offset; int *rtvals; struct vnode *vp; struct proc *p; struct ucred *cred; struct nfsmount *nmp; + struct nfsnode *np; vm_page_t *pages; vp = ap->a_vp; + np = VTONFS(vp); p = curproc; /* XXX */ cred = curproc->p_ucred; /* XXX */ nmp = VFSTONFS(vp->v_mount); @@ -243,6 +260,7 @@ nfs_putpages(ap) count = ap->a_count; rtvals = ap->a_rtvals; npages = btoc(count); + offset = IDX_TO_OFF(pages[0]->pindex); if ((nmp->nm_flag & NFSMNT_NFSV3) != 0 && (nmp->nm_state & NFSSTA_GOTFSINFO) == 0) @@ -252,6 +270,16 @@ nfs_putpages(ap) rtvals[i] = VM_PAGER_AGAIN; } + /* + * When putting pages, do not extend file past EOF. + */ + + if (offset + count > np->n_size) { + count = np->n_size - offset; + if (count < 0) + count = 0; + } + /* * We use only the kva address for the buffer, but this is extremely * convienient and fast. @@ -265,7 +293,7 @@ nfs_putpages(ap) iov.iov_len = count; uio.uio_iov = &iov; uio.uio_iovcnt = 1; - uio.uio_offset = IDX_TO_OFF(pages[0]->pindex); + uio.uio_offset = offset; uio.uio_resid = count; uio.uio_segflg = UIO_SYSSPACE; uio.uio_rw = UIO_WRITE; @@ -297,23 +325,21 @@ nfs_putpages(ap) * Vnode op for read using bio */ int -nfs_bioread(vp, uio, ioflag, cred, getpages) +nfs_bioread(vp, uio, ioflag, cred) register struct vnode *vp; register struct uio *uio; int ioflag; struct ucred *cred; - int getpages; { register struct nfsnode *np = VTONFS(vp); register int biosize, i; - off_t diff; struct buf *bp = 0, *rabp; struct vattr vattr; struct proc *p; struct nfsmount *nmp = VFSTONFS(vp->v_mount); daddr_t lbn, rabn; - int bufsize; - int nra, error = 0, n = 0, on = 0, not_readin; + int bcount; + int nra, error = 0, n = 0, on = 0; #ifdef DIAGNOSTIC if (uio->uio_rw != UIO_READ) @@ -424,7 +450,6 @@ nfs_bioread(vp, uio, ioflag, cred, getpages) nfsstats.biocache_reads++; lbn = uio->uio_offset / biosize; on = uio->uio_offset & (biosize - 1); - not_readin = 1; /* * Start the read ahead(s), as required. @@ -439,7 +464,6 @@ nfs_bioread(vp, uio, ioflag, cred, getpages) return (EINTR); if ((rabp->b_flags & (B_CACHE|B_DELWRI)) == 0) { rabp->b_flags |= (B_READ | B_ASYNC); - rabp->b_flags &= ~B_DONE; vfs_busy_pages(rabp, 0); if (nfs_asyncio(rabp, cred)) { rabp->b_flags |= B_INVAL|B_ERROR; @@ -453,47 +477,31 @@ nfs_bioread(vp, uio, ioflag, cred, getpages) } /* - * If the block is in the cache and has the required data - * in a valid region, just copy it out. - * Otherwise, get the block and write back/read in, - * as required. + * Obtain the buffer cache block. Figure out the buffer size + * when we are at EOF. nfs_getcacheblk() will also force + * uncached delayed-writes to be flushed to the server. + * + * Note that bcount is *not* DEV_BSIZE aligned. */ -again: - bufsize = biosize; - if ((off_t)(lbn + 1) * biosize > np->n_size && - (off_t)(lbn + 1) * biosize - np->n_size < biosize) { - bufsize = np->n_size - (off_t)lbn * biosize; - bufsize = (bufsize + DEV_BSIZE - 1) & ~(DEV_BSIZE - 1); + + bcount = biosize; + if ((off_t)lbn * biosize >= np->n_size) { + bcount = 0; + } else if ((off_t)(lbn + 1) * biosize > np->n_size) { + bcount = np->n_size - (off_t)lbn * biosize; } - bp = nfs_getcacheblk(vp, lbn, bufsize, p); + + bp = nfs_getcacheblk(vp, lbn, bcount, p); if (!bp) return (EINTR); /* - * If we are being called from nfs_getpages, we must - * make sure the buffer is a vmio buffer. 
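/*
 * A stand-alone sketch of the EOF clipping rule introduced above for VREG
 * reads: a buffer is biosize bytes except for the last block, which is
 * clipped to the file size and no longer rounded up to DEV_BSIZE.  Names
 * are illustrative; 64-bit arithmetic is assumed as in the diff.
 */
static long
nfs_read_bcount(long long lbn, long biosize, long long n_size)
{
	if (lbn * biosize >= n_size)
		return 0;				/* block begins at or past EOF */
	if ((lbn + 1) * biosize > n_size)
		return (long)(n_size - lbn * biosize);	/* partial last block */
	return biosize;
}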
The vp will - * already be setup for vmio but there may be some old - * non-vmio buffers attached to it. + * If B_CACHE is not set, we must issue the read. If this + * fails, we return an error. */ - if (getpages && !(bp->b_flags & B_VMIO)) { -#ifdef DIAGNOSTIC - printf("nfs_bioread: non vmio buf found, discarding\n"); -#endif - bp->b_flags |= B_NOCACHE; - bp->b_flags |= B_INVAFTERWRITE; - if (bp->b_dirtyend > 0) { - if ((bp->b_flags & B_DELWRI) == 0) - panic("nfsbioread"); - if (VOP_BWRITE(bp) == EINTR) - return (EINTR); - } else - brelse(bp); - goto again; - } + if ((bp->b_flags & B_CACHE) == 0) { bp->b_flags |= B_READ; - bp->b_flags &= ~(B_DONE | B_ERROR | B_INVAL); - not_readin = 0; vfs_busy_pages(bp, 0); error = nfs_doio(bp, cred, p); if (error) { @@ -501,32 +509,20 @@ again: return (error); } } - if (bufsize > on) { - n = min((unsigned)(bufsize - on), uio->uio_resid); - } else { - n = 0; - } - diff = np->n_size - uio->uio_offset; - if (diff < n) - n = diff; - if (not_readin && n > 0) { - if (on < bp->b_validoff || (on + n) > bp->b_validend) { - bp->b_flags |= B_NOCACHE; - bp->b_flags |= B_INVAFTERWRITE; - if (bp->b_dirtyend > 0) { - if ((bp->b_flags & B_DELWRI) == 0) - panic("nfsbioread"); - if (VOP_BWRITE(bp) == EINTR) - return (EINTR); - } else - brelse(bp); - goto again; - } - } + + /* + * on is the offset into the current bp. Figure out how many + * bytes we can copy out of the bp. Note that bcount is + * NOT DEV_BSIZE aligned. + * + * Then figure out how many bytes we can copy into the uio. + */ + + n = 0; + if (on < bcount) + n = min((unsigned)(bcount - on), uio->uio_resid); + vp->v_lastr = lbn; - diff = (on >= bp->b_validend) ? 0 : (bp->b_validend - on); - if (diff < n) - n = diff; break; case VLNK: nfsstats.biocache_readlinks++; @@ -535,7 +531,6 @@ again: return (EINTR); if ((bp->b_flags & B_CACHE) == 0) { bp->b_flags |= B_READ; - bp->b_flags &= ~B_DONE; vfs_busy_pages(bp, 0); error = nfs_doio(bp, cred, p); if (error) { @@ -560,13 +555,13 @@ again: return (EINTR); if ((bp->b_flags & B_CACHE) == 0) { bp->b_flags |= B_READ; - bp->b_flags &= ~B_DONE; vfs_busy_pages(bp, 0); error = nfs_doio(bp, cred, p); if (error) { brelse(bp); } while (error == NFSERR_BAD_COOKIE) { + printf("got bad cookie vp %p bp %p\n", vp, bp); nfs_invaldir(vp); error = nfs_vinvalbuf(vp, 0, cred, p, 1); /* @@ -574,6 +569,10 @@ again: * server. The only way to get the block is by * reading from the beginning to get all the * offset cookies. + * + * Leave the last bp intact unless there is an error. + * Loop back up to the while if the error is another + * NFSERR_BAD_COOKIE (double yuch!). */ for (i = 0; i <= lbn && !error; i++) { if (np->n_direofoffset @@ -582,21 +581,32 @@ again: bp = nfs_getcacheblk(vp, i, NFS_DIRBLKSIZ, p); if (!bp) return (EINTR); - if ((bp->b_flags & B_DONE) == 0) { - bp->b_flags |= B_READ; - bp->b_flags &= ~B_DONE; - vfs_busy_pages(bp, 0); - error = nfs_doio(bp, cred, p); - if (error == 0 && (bp->b_flags & B_INVAL)) - break; - if (error) { - brelse(bp); - } else if (i < lbn) { - brelse(bp); - } + if ((bp->b_flags & B_CACHE) == 0) { + bp->b_flags |= B_READ; + vfs_busy_pages(bp, 0); + error = nfs_doio(bp, cred, p); + /* + * no error + B_INVAL == directory EOF, + * use the block. + */ + if (error == 0 && (bp->b_flags & B_INVAL)) + break; } + /* + * An error will throw away the block and the + * for loop will break out. If no error and this + * is not the block we want, we throw away the + * block and go for the next one via the for loop. 
+ */ + if (error || i < lbn) + brelse(bp); } } + /* + * The above while is repeated if we hit another cookie + * error. If we hit an error and it wasn't a cookie error, + * we give up. + */ if (error) return (error); } @@ -616,7 +626,6 @@ again: if (rabp) { if ((rabp->b_flags & (B_CACHE|B_DELWRI)) == 0) { rabp->b_flags |= (B_READ | B_ASYNC); - rabp->b_flags &= ~B_DONE; vfs_busy_pages(rabp, 0); if (nfs_asyncio(rabp, cred)) { rabp->b_flags |= B_INVAL|B_ERROR; @@ -629,10 +638,20 @@ again: } } /* - * Make sure we use a signed variant of min() since - * the second term may be negative. + * Unlike VREG files, whos buffer size ( bp->b_bcount ) is + * chopped for the EOF condition, we cannot tell how large + * NFS directories are going to be until we hit EOF. So + * an NFS directory buffer is *not* chopped to its EOF. Now, + * it just so happens that b_resid will effectively chop it + * to EOF. *BUT* this information is lost if the buffer goes + * away and is reconstituted into a B_CACHE state ( due to + * being VMIO ) later. So we keep track of the directory eof + * in np->n_direofoffset and chop it off as an extra step + * right here. */ n = lmin(uio->uio_resid, NFS_DIRBLKSIZ - bp->b_resid - on); + if (np->n_direofoffset && n > np->n_direofoffset - uio->uio_offset) + n = np->n_direofoffset - uio->uio_offset; break; default: printf(" nfs_bioread: type %x unexpected\n",vp->v_type); @@ -649,6 +668,10 @@ again: n = 0; break; case VDIR: + /* + * Invalidate buffer if caching is disabled, forcing a + * re-read from the remote later. + */ if (np->n_flag & NQNFSNONCACHE) bp->b_flags |= B_INVAL; break; @@ -660,24 +683,6 @@ again: return (error); } -static void -nfs_prot_buf(bp, off, n) - struct buf *bp; - int off; - int n; -{ - int pindex, boff, end; - - if ((bp->b_flags & B_VMIO) == 0) - return; - - end = round_page(off + n); - for (boff = trunc_page(off); boff < end; boff += PAGE_SIZE) { - pindex = boff >> PAGE_SHIFT; - vm_page_protect(bp->b_pages[pindex], VM_PROT_NONE); - } -} - /* * Vnode op for write using bio */ @@ -690,18 +695,18 @@ nfs_write(ap) struct ucred *a_cred; } */ *ap; { - register int biosize; - register struct uio *uio = ap->a_uio; + int biosize; + struct uio *uio = ap->a_uio; struct proc *p = uio->uio_procp; - register struct vnode *vp = ap->a_vp; + struct vnode *vp = ap->a_vp; struct nfsnode *np = VTONFS(vp); - register struct ucred *cred = ap->a_cred; + struct ucred *cred = ap->a_cred; int ioflag = ap->a_ioflag; struct buf *bp; struct vattr vattr; struct nfsmount *nmp = VFSTONFS(vp->v_mount); daddr_t lbn; - int bufsize; + int bcount; int n, on, error = 0, iomode, must_commit; #ifdef DIAGNOSTIC @@ -749,12 +754,9 @@ nfs_write(ap) psignal(p, SIGXFSZ); return (EFBIG); } - /* - * I use nm_rsize, not nm_wsize so that all buffer cache blocks - * will be the same size within a filesystem. nfs_writerpc will - * still use nm_wsize when sizing the rpc's. - */ + biosize = vp->v_mount->mnt_stat.f_iosize; + do { /* * Check for a valid write lease. @@ -786,17 +788,74 @@ nfs_write(ap) on = uio->uio_offset & (biosize-1); n = min((unsigned)(biosize - on), uio->uio_resid); again: - if (uio->uio_offset + n > np->n_size) { + /* + * Handle direct append and file extension cases, calculate + * unaligned buffer size. + */ + + if (uio->uio_offset == np->n_size && n) { + /* + * special append case. Obtain buffer prior to + * resizing it to maintain B_CACHE. 
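/*
 * A user-space sketch of the special-append flag handling implemented just
 * below: the buffer is obtained at its pre-write size so B_CACHE reflects
 * the old contents, then grown and the saved B_CACHE state restored.  The
 * flag value, structure and helper are stand-ins, not kernel definitions.
 */
#define SK_B_CACHE	0x01

struct sk_buf {
	int	flags;
	int	bcount;
};

static void
append_extend(struct sk_buf *bp, int on, int n)
{
	int save = bp->flags & SK_B_CACHE;	/* B_CACHE before the resize */

	bp->bcount = on + n;			/* models allocbuf(bp, on + n) */
	bp->flags |= save;			/* keep the pre-write cache state */
}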
+ */ + long save; + + bcount = on; + bp = nfs_getcacheblk(vp, lbn, bcount, p); + save = bp->b_flags & B_CACHE; + np->n_size = uio->uio_offset + n; np->n_flag |= NMODIFIED; vnode_pager_setsize(vp, np->n_size); + + bcount += n; + allocbuf(bp, bcount); + bp->b_flags |= save; + } else { + if (uio->uio_offset + n > np->n_size) { + np->n_size = uio->uio_offset + n; + np->n_flag |= NMODIFIED; + vnode_pager_setsize(vp, np->n_size); + } + bcount = biosize; + if ((off_t)(lbn + 1) * biosize > np->n_size) + bcount = np->n_size - (off_t)lbn * biosize; + bp = nfs_getcacheblk(vp, lbn, bcount, p); } - bufsize = biosize; - if ((off_t)(lbn + 1) * biosize > np->n_size) { - bufsize = np->n_size - (off_t)lbn * biosize; - bufsize = (bufsize + DEV_BSIZE - 1) & ~(DEV_BSIZE - 1); + + /* + * Issue a READ if B_CACHE is not set. In special-append + * mode, B_CACHE is based on the buffer prior to the write + * op and is typically set, avoiding the read. If a read + * is required in special append mode, the server will + * probably send us a short-read since we extended the file + * on our end, resulting in b_resid == 0 and, thusly, + * B_CACHE getting set. + * + * We can also avoid issuing the read if the write covers + * the entire buffer. We have to make sure the buffer state + * is reasonable in this case since we will not be initiating + * I/O. See the comments in kern/vfs_bio.c's getblk() for + * more information. + * + * B_CACHE may also be set due to the buffer being cached + * normally. + */ + + if (on == 0 && n == bcount) { + bp->b_flags |= B_CACHE; + bp->b_flags &= ~(B_ERROR | B_INVAL); + } + + if ((bp->b_flags & B_CACHE) == 0) { + bp->b_flags |= B_READ; + vfs_busy_pages(bp, 0); + error = nfs_doio(bp, cred, p); + if (error) { + brelse(bp); + return (error); + } } - bp = nfs_getcacheblk(vp, lbn, bufsize, p); if (!bp) return (EINTR); if (bp->b_wcred == NOCRED) { @@ -820,6 +879,17 @@ again: * If the new write will leave a contiguous dirty * area, just update the b_dirtyoff and b_dirtyend, * otherwise force a write rpc of the old dirty area. + * + * While it is possible to merge discontiguous writes due to + * our having a B_CACHE buffer ( and thus valid read data + * for the hole), we don't because it could lead to + * significant cache coherency problems with multiple clients, + * especially if locking is implemented later on. + * + * as an optimization we could theoretically maintain + * a linked list of discontinuous areas, but we would still + * have to commit them separately so there isn't much + * advantage to it except perhaps a bit of asynchronization. */ if (bp->b_dirtyend > 0 && @@ -862,11 +932,6 @@ again: return (error); } - /* - * This will keep the buffer and mmaped regions more coherent. - */ - nfs_prot_buf(bp, on, n); - /* * Only update dirtyoff/dirtyend if not a degenerate * condition. @@ -879,21 +944,7 @@ again: bp->b_dirtyoff = on; bp->b_dirtyend = on + n; } - } - - /* - * To avoid code complexity, we may have to throw away - * previously valid ranges when merging the new dirty range - * into the valid range. As long as we do not *ADD* an - * invalid valid range, we are ok. - */ - if (bp->b_validend == 0 || bp->b_validend < bp->b_dirtyoff || - bp->b_validoff > bp->b_dirtyend) { - bp->b_validoff = bp->b_dirtyoff; - bp->b_validend = bp->b_dirtyend; - } else { - bp->b_validoff = min(bp->b_validoff, bp->b_dirtyoff); - bp->b_validend = max(bp->b_validend, bp->b_dirtyend); + vfs_bio_set_validclean(bp, on, n); } /* @@ -904,11 +955,14 @@ again: /* * If the lease is non-cachable or IO_SYNC do bwrite(). 
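/*
 * A minimal sketch of the merge rule described in the comment above: a new
 * write [on, on+n) may simply extend the dirty region when the two ranges
 * touch or overlap; otherwise the old dirty data must be pushed first.
 * This models the prose rule; the kernel test may differ in detail.
 */
static int
dirty_range_mergeable(int dirtyoff, int dirtyend, int on, int n)
{
	if (dirtyend <= 0)
		return 1;		/* nothing dirty yet */
	return (on <= dirtyend && on + n >= dirtyoff);
}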
+ * + * IO_INVAL appears to be unused. The idea appears to be + * to turn off caching in this case. Very odd. XXX */ if ((np->n_flag & NQNFSNONCACHE) || (ioflag & IO_SYNC)) { bp->b_proc = p; if (ioflag & IO_INVAL) - bp->b_flags |= B_INVAL; + bp->b_flags |= B_NOCACHE; error = VOP_BWRITE(bp); if (error) return (error); @@ -922,8 +976,9 @@ again: bp->b_proc = (struct proc *)0; bp->b_flags |= B_ASYNC; (void)nfs_writebp(bp, 0); - } else + } else { bdwrite(bp); + } } while (uio->uio_resid > 0 && n > 0); return (0); } @@ -956,15 +1011,16 @@ nfs_getcacheblk(vp, bn, size, p) return ((struct buf *)0); bp = getblk(vp, bn, size, 0, 2 * hz); } - } else + } else { bp = getblk(vp, bn, size, 0, 0); + } if (vp->v_type == VREG) { int biosize; + biosize = mp->mnt_stat.f_iosize; bp->b_blkno = bn * (biosize / DEV_BSIZE); } - return (bp); } @@ -1036,6 +1092,9 @@ nfs_vinvalbuf(vp, flags, cred, p, intrflg) * Initiate asynchronous I/O. Return an error if no nfsiods are available. * This is mainly to avoid queueing async I/O requests when the nfsiods * are all hung on a dead server. + * + * Note: nfs_asyncio() does not clear (B_ERROR|B_INVAL) but when the bp + * is eventually dequeued by the async daemon, nfs_doio() *will*. */ int nfs_asyncio(bp, cred) @@ -1164,7 +1223,7 @@ nfs_doio(bp, cr, p) struct vnode *vp; struct nfsnode *np; struct nfsmount *nmp; - int error = 0, diff, len, iomode, must_commit = 0; + int error = 0, iomode, must_commit = 0; struct uio uio; struct iovec io; @@ -1177,6 +1236,13 @@ nfs_doio(bp, cr, p) uiop->uio_segflg = UIO_SYSSPACE; uiop->uio_procp = p; + /* + * clear B_ERROR and B_INVAL state prior to initiating the I/O. We + * do this here so we do not have to do it in all the code that + * calls us. + */ + bp->b_flags &= ~(B_ERROR | B_INVAL); + KASSERT(!(bp->b_flags & B_DONE), ("nfs_doio: bp %p already marked done", bp)); /* @@ -1216,25 +1282,22 @@ nfs_doio(bp, cr, p) nfsstats.read_bios++; error = nfs_readrpc(vp, uiop, cr); if (!error) { - bp->b_validoff = 0; if (uiop->uio_resid) { /* - * If len > 0, there is a hole in the file and - * no writes after the hole have been pushed to - * the server yet. - * Just zero fill the rest of the valid area. + * If we had a short read with no error, we must have + * hit a file hole. We should zero-fill the remainder. + * This can also occur if the server hits the file EOF. + * + * Holes used to be able to occur due to pending + * writes, but that is not possible any longer. */ - diff = bp->b_bcount - uiop->uio_resid; - len = np->n_size - (((u_quad_t)bp->b_blkno) * DEV_BSIZE - + diff); - if (len > 0) { - len = min(len, uiop->uio_resid); - bzero((char *)bp->b_data + diff, len); - bp->b_validend = diff + len; - } else - bp->b_validend = diff; - } else - bp->b_validend = bp->b_bcount; + int nread = bp->b_bcount - uiop->uio_resid; + int left = bp->b_bcount - nread; + + if (left > 0) + bzero((char *)bp->b_data + nread, left); + uiop->uio_resid = 0; + } } if (p && (vp->v_flag & VTEXT) && (((nmp->nm_flag & NFSMNT_NQNFS) && @@ -1262,6 +1325,10 @@ nfs_doio(bp, cr, p) } if ((nmp->nm_flag & NFSMNT_RDIRPLUS) == 0) error = nfs_readdirrpc(vp, uiop, cr); + /* + * end-of-directory sets B_INVAL but does not generate an + * error. 
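/*
 * A user-space equivalent of the short-read handling added to nfs_doio()
 * above: a successful READ that returns fewer than b_bcount bytes (a file
 * hole or server-side EOF) has its tail zeroed so the whole buffer range
 * becomes valid.  Names are illustrative.
 */
#include <string.h>

static void
zero_fill_short_read(char *data, int bcount, int resid)
{
	int nread = bcount - resid;	/* bytes actually returned */
	int left = bcount - nread;	/* tail still unfilled (== resid) */

	if (left > 0)
		memset(data + nread, 0, left);
}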
+ */ if (error == 0 && uiop->uio_resid == bp->b_bcount) bp->b_flags |= B_INVAL; break; @@ -1296,7 +1363,7 @@ nfs_doio(bp, cr, p) if (!error && iomode == NFSV3WRITE_UNSTABLE) { bp->b_flags |= B_NEEDCOMMIT; if (bp->b_dirtyoff == 0 - && bp->b_dirtyend == bp->b_bufsize) + && bp->b_dirtyend == bp->b_bcount) bp->b_flags |= B_CLUSTEROK; } else { bp->b_flags &= ~B_NEEDCOMMIT; diff --git a/sys/nfsclient/nfs_socket.c b/sys/nfsclient/nfs_socket.c index 1490f724a68f..2267629116b3 100644 --- a/sys/nfsclient/nfs_socket.c +++ b/sys/nfsclient/nfs_socket.c @@ -34,7 +34,7 @@ * SUCH DAMAGE. * * @(#)nfs_socket.c 8.5 (Berkeley) 3/30/95 - * $Id: nfs_socket.c,v 1.50 1999/02/25 00:03:51 peter Exp $ + * $Id: nfs_socket.c,v 1.51 1999/04/24 11:29:48 dt Exp $ */ /* @@ -54,6 +54,7 @@ #include #include #include +#include #include #include @@ -115,6 +116,15 @@ static int proct[NFS_NPROCS] = { 0, 0, 0, }; +static int nfs_realign_test; +static int nfs_realign_count; + +SYSCTL_DECL(_vfs_nfs); + +SYSCTL_INT(_vfs_nfs, OID_AUTO, realign_test, CTLFLAG_RD, &nfs_realign_test, 0, ""); +SYSCTL_INT(_vfs_nfs, OID_AUTO, realign_count, CTLFLAG_RD, &nfs_realign_count, 0, ""); + + /* * There is a congestion window for outstanding rpcs maintained per mount * point. The cwnd size is adjusted in roughly the way that: @@ -138,7 +148,7 @@ struct callout_handle nfs_timer_handle; static int nfs_msg __P((struct proc *,char *,char *)); static int nfs_rcvlock __P((struct nfsreq *)); static void nfs_rcvunlock __P((struct nfsreq *)); -static void nfs_realign __P((struct mbuf *m, int hsiz)); +static void nfs_realign __P((struct mbuf **pm, int hsiz)); static int nfs_receive __P((struct nfsreq *rep, struct sockaddr **aname, struct mbuf **mp)); static int nfs_reconnect __P((struct nfsreq *rep)); @@ -702,7 +712,7 @@ errout: * These could cause pointer alignment problems, so copy them to * well aligned mbufs. */ - nfs_realign(*mp, 5 * NFSX_UNSIGNED); + nfs_realign(mp, 5 * NFSX_UNSIGNED); return (error); } @@ -1589,92 +1599,56 @@ nfs_rcvunlock(rep) } /* - * Check for badly aligned mbuf data areas and - * realign data in an mbuf list by copying the data areas up, as required. + * nfs_realign: + * + * Check for badly aligned mbuf data and realign by copying the unaligned + * portion of the data into a new mbuf chain and freeing the portions + * of the old chain that were replaced. + * + * We cannot simply realign the data within the existing mbuf chain + * because the underlying buffers may contain other rpc commands and + * we cannot afford to overwrite them. + * + * We would prefer to avoid this situation entirely. The situation does + * not occur with NFS/UDP and is supposed to only occassionally occur + * with TCP. Use vfs.nfs.realign_count and realign_test to check this. */ static void -nfs_realign(m, hsiz) - register struct mbuf *m; +nfs_realign(pm, hsiz) + register struct mbuf **pm; int hsiz; { - register struct mbuf *m2; - register int siz, mlen, olen; - register caddr_t tcp, fcp; - struct mbuf *mnew; + struct mbuf *m; + struct mbuf *n = NULL; + int off = 0; - while (m) { - /* - * This never happens for UDP, rarely happens for TCP - * but frequently happens for iso transport. 
- */ - if ((m->m_len & 0x3) || (mtod(m, intptr_t) & 0x3)) { - olen = m->m_len; - fcp = mtod(m, caddr_t); - if ((intptr_t)fcp & 0x3) { - m->m_flags &= ~M_PKTHDR; - if (m->m_flags & M_EXT) - m->m_data = m->m_ext.ext_buf + - ((m->m_ext.ext_size - olen) & ~0x3); - else - m->m_data = m->m_dat; + ++nfs_realign_test; + + while ((m = *pm) != NULL) { + if ((m->m_len & 0x3) || (mtod(m, intptr_t) & 0x3)) { + MGET(n, M_WAIT, MT_DATA); + if (m->m_len >= MINCLSIZE) { + MCLGET(n, M_WAIT); + } + n->m_len = 0; + break; } - m->m_len = 0; - tcp = mtod(m, caddr_t); - mnew = m; - m2 = m->m_next; + pm = &m->m_next; + } - /* - * If possible, only put the first invariant part - * of the RPC header in the first mbuf. - */ - mlen = M_TRAILINGSPACE(m); - if (olen <= hsiz && mlen > hsiz) - mlen = hsiz; - - /* - * Loop through the mbuf list consolidating data. - */ + /* + * If n is non-NULL, loop on m copying data, then replace the + * portion of the chain that had to be realigned. + */ + if (n != NULL) { + ++nfs_realign_count; while (m) { - while (olen > 0) { - if (mlen == 0) { - m2->m_flags &= ~M_PKTHDR; - if (m2->m_flags & M_EXT) - m2->m_data = m2->m_ext.ext_buf; - else - m2->m_data = m2->m_dat; - m2->m_len = 0; - mlen = M_TRAILINGSPACE(m2); - tcp = mtod(m2, caddr_t); - mnew = m2; - m2 = m2->m_next; - } - siz = min(mlen, olen); - if (tcp != fcp) - bcopy(fcp, tcp, siz); - mnew->m_len += siz; - mlen -= siz; - olen -= siz; - tcp += siz; - fcp += siz; - } + m_copyback(n, off, m->m_len, mtod(m, caddr_t)); + off += m->m_len; m = m->m_next; - if (m) { - olen = m->m_len; - fcp = mtod(m, caddr_t); - } } - - /* - * Finally, set m_len == 0 for any trailing mbufs that have - * been copied out of. - */ - while (m2) { - m2->m_len = 0; - m2 = m2->m_next; - } - return; - } - m = m->m_next; + m_freem(*pm); + *pm = n; } } @@ -2040,7 +2014,7 @@ nfsrv_rcv(so, arg, waitflag) m_freem(mp); continue; } - nfs_realign(mp, 10 * NFSX_UNSIGNED); + nfs_realign(&mp, 10 * NFSX_UNSIGNED); rec->nr_address = nam; rec->nr_packet = mp; STAILQ_INSERT_TAIL(&slp->ns_rec, rec, nr_link); @@ -2182,7 +2156,7 @@ nfsrv_getstream(slp, waitflag) if (!rec) { m_freem(slp->ns_frag); } else { - nfs_realign(slp->ns_frag, 10 * NFSX_UNSIGNED); + nfs_realign(&slp->ns_frag, 10 * NFSX_UNSIGNED); rec->nr_address = (struct sockaddr *)0; rec->nr_packet = slp->ns_frag; STAILQ_INSERT_TAIL(&slp->ns_rec, rec, nr_link); diff --git a/sys/nfsclient/nfs_vnops.c b/sys/nfsclient/nfs_vnops.c index a92bb2295811..6114d56f340a 100644 --- a/sys/nfsclient/nfs_vnops.c +++ b/sys/nfsclient/nfs_vnops.c @@ -34,7 +34,7 @@ * SUCH DAMAGE. 
* * @(#)nfs_vnops.c 8.16 (Berkeley) 5/27/95 - * $Id: nfs_vnops.c,v 1.123 1999/02/16 10:49:54 dfr Exp $ + * $Id: nfs_vnops.c,v 1.124 1999/03/12 02:24:58 julian Exp $ */ @@ -408,9 +408,9 @@ nfs_access(ap) error = nfs_readrpc(vp, &auio, ap->a_cred); else if (vp->v_type == VDIR) { char* bp; - bp = malloc(NFS_DIRBLKSIZ, M_TEMP, M_WAITOK); + bp = malloc(DIRBLKSIZ, M_TEMP, M_WAITOK); aiov.iov_base = bp; - aiov.iov_len = auio.uio_resid = NFS_DIRBLKSIZ; + aiov.iov_len = auio.uio_resid = DIRBLKSIZ; error = nfs_readdirrpc(vp, &auio, ap->a_cred); free(bp, M_TEMP); } else if (vp->v_type == VLNK) @@ -962,7 +962,7 @@ nfs_read(ap) if (vp->v_type != VREG) return (EPERM); - return (nfs_bioread(vp, ap->a_uio, ap->a_ioflag, ap->a_cred, 0)); + return (nfs_bioread(vp, ap->a_uio, ap->a_ioflag, ap->a_cred)); } /* @@ -980,7 +980,7 @@ nfs_readlink(ap) if (vp->v_type != VLNK) return (EINVAL); - return (nfs_bioread(vp, ap->a_uio, 0, ap->a_cred, 0)); + return (nfs_bioread(vp, ap->a_uio, 0, ap->a_cred)); } /* @@ -1985,7 +1985,7 @@ nfs_readdir(ap) * Call nfs_bioread() to do the real work. */ tresid = uio->uio_resid; - error = nfs_bioread(vp, uio, 0, ap->a_cred, 0); + error = nfs_bioread(vp, uio, 0, ap->a_cred); if (!error && uio->uio_resid == tresid) nfsstats.direofcache_misses++; @@ -2004,7 +2004,7 @@ nfs_readdirrpc(vp, uiop, cred) { register int len, left; - register struct dirent *dp; + register struct dirent *dp = NULL; register u_int32_t *tl; register caddr_t cp; register int32_t t1, t2; @@ -2019,12 +2019,9 @@ nfs_readdirrpc(vp, uiop, cred) int attrflag; int v3 = NFS_ISV3(vp); -#ifndef nolint - dp = (struct dirent *)0; -#endif #ifndef DIAGNOSTIC - if (uiop->uio_iovcnt != 1 || (uiop->uio_offset & (NFS_DIRBLKSIZ - 1)) || - (uiop->uio_resid & (NFS_DIRBLKSIZ - 1))) + if (uiop->uio_iovcnt != 1 || (uiop->uio_offset & (DIRBLKSIZ - 1)) || + (uiop->uio_resid & (DIRBLKSIZ - 1))) panic("nfs readdirrpc bad uio"); #endif @@ -2381,7 +2378,7 @@ nfs_readdirplusrpc(vp, uiop, cred) m_freem(mrep); } /* - * Fill last record, iff any, out to a multiple of NFS_DIRBLKSIZ + * Fill last record, iff any, out to a multiple of DIRBLKSIZ * by increasing d_reclen for the last record. */ if (blksiz > 0) { @@ -3028,13 +3025,13 @@ nfs_bwrite(ap) struct vnode *a_bp; } */ *ap; { - return (nfs_writebp(ap->a_bp, 1)); } /* * This is a clone of vn_bwrite(), except that B_WRITEINPROG isn't set unless - * the force flag is one and it also handles the B_NEEDCOMMIT flag. + * the force flag is one and it also handles the B_NEEDCOMMIT flag. We set + * B_CACHE if this is a VMIO buffer. */ int nfs_writebp(bp, force) @@ -3049,12 +3046,15 @@ nfs_writebp(bp, force) if(!(bp->b_flags & B_BUSY)) panic("bwrite: buffer is not busy???"); - if (bp->b_flags & B_INVAL) - bp->b_flags |= B_NOCACHE; + if (bp->b_flags & B_INVAL) { + brelse(bp); + return(0); + } + + bp->b_flags |= B_CACHE; /* - * XXX we bundirty() the bp here. Shouldn't we do it later after - * the I/O has completed?? + * Undirty the bp. We will redirty it later if the I/O fails. */ s = splbio(); diff --git a/sys/nfsclient/nfsargs.h b/sys/nfsclient/nfsargs.h index bc15a7c6dc28..78a54a2d6be0 100644 --- a/sys/nfsclient/nfsargs.h +++ b/sys/nfsclient/nfsargs.h @@ -34,7 +34,7 @@ * SUCH DAMAGE. 
* * @(#)nfs.h 8.4 (Berkeley) 5/1/95 - * $Id: nfs.h,v 1.44 1998/09/07 05:42:15 bde Exp $ + * $Id: nfs.h,v 1.45 1999/02/25 00:03:50 peter Exp $ */ #ifndef _NFS_NFS_H_ @@ -651,8 +651,7 @@ void nfs_disconnect __P((struct nfsmount *)); void nfs_safedisconnect __P((struct nfsmount *)); int nfs_getattrcache __P((struct vnode *, struct vattr *)); int nfsm_strtmbuf __P((struct mbuf **, char **, const char *, long)); -int nfs_bioread __P((struct vnode *, struct uio *, int, struct ucred *, - int)); +int nfs_bioread __P((struct vnode *, struct uio *, int, struct ucred *)); int nfsm_uiotombuf __P((struct uio *, struct mbuf **, int, caddr_t *)); void nfsrv_init __P((int)); void nfs_clearcommit __P((struct mount *)); diff --git a/sys/nfsclient/nfsstats.h b/sys/nfsclient/nfsstats.h index bc15a7c6dc28..78a54a2d6be0 100644 --- a/sys/nfsclient/nfsstats.h +++ b/sys/nfsclient/nfsstats.h @@ -34,7 +34,7 @@ * SUCH DAMAGE. * * @(#)nfs.h 8.4 (Berkeley) 5/1/95 - * $Id: nfs.h,v 1.44 1998/09/07 05:42:15 bde Exp $ + * $Id: nfs.h,v 1.45 1999/02/25 00:03:50 peter Exp $ */ #ifndef _NFS_NFS_H_ @@ -651,8 +651,7 @@ void nfs_disconnect __P((struct nfsmount *)); void nfs_safedisconnect __P((struct nfsmount *)); int nfs_getattrcache __P((struct vnode *, struct vattr *)); int nfsm_strtmbuf __P((struct mbuf **, char **, const char *, long)); -int nfs_bioread __P((struct vnode *, struct uio *, int, struct ucred *, - int)); +int nfs_bioread __P((struct vnode *, struct uio *, int, struct ucred *)); int nfsm_uiotombuf __P((struct uio *, struct mbuf **, int, caddr_t *)); void nfsrv_init __P((int)); void nfs_clearcommit __P((struct mount *)); diff --git a/sys/nfsserver/nfs.h b/sys/nfsserver/nfs.h index bc15a7c6dc28..78a54a2d6be0 100644 --- a/sys/nfsserver/nfs.h +++ b/sys/nfsserver/nfs.h @@ -34,7 +34,7 @@ * SUCH DAMAGE. * * @(#)nfs.h 8.4 (Berkeley) 5/1/95 - * $Id: nfs.h,v 1.44 1998/09/07 05:42:15 bde Exp $ + * $Id: nfs.h,v 1.45 1999/02/25 00:03:50 peter Exp $ */ #ifndef _NFS_NFS_H_ @@ -651,8 +651,7 @@ void nfs_disconnect __P((struct nfsmount *)); void nfs_safedisconnect __P((struct nfsmount *)); int nfs_getattrcache __P((struct vnode *, struct vattr *)); int nfsm_strtmbuf __P((struct mbuf **, char **, const char *, long)); -int nfs_bioread __P((struct vnode *, struct uio *, int, struct ucred *, - int)); +int nfs_bioread __P((struct vnode *, struct uio *, int, struct ucred *)); int nfsm_uiotombuf __P((struct uio *, struct mbuf **, int, caddr_t *)); void nfsrv_init __P((int)); void nfs_clearcommit __P((struct mount *)); diff --git a/sys/nfsserver/nfs_srvsock.c b/sys/nfsserver/nfs_srvsock.c index 1490f724a68f..2267629116b3 100644 --- a/sys/nfsserver/nfs_srvsock.c +++ b/sys/nfsserver/nfs_srvsock.c @@ -34,7 +34,7 @@ * SUCH DAMAGE. * * @(#)nfs_socket.c 8.5 (Berkeley) 3/30/95 - * $Id: nfs_socket.c,v 1.50 1999/02/25 00:03:51 peter Exp $ + * $Id: nfs_socket.c,v 1.51 1999/04/24 11:29:48 dt Exp $ */ /* @@ -54,6 +54,7 @@ #include #include #include +#include #include #include @@ -115,6 +116,15 @@ static int proct[NFS_NPROCS] = { 0, 0, 0, }; +static int nfs_realign_test; +static int nfs_realign_count; + +SYSCTL_DECL(_vfs_nfs); + +SYSCTL_INT(_vfs_nfs, OID_AUTO, realign_test, CTLFLAG_RD, &nfs_realign_test, 0, ""); +SYSCTL_INT(_vfs_nfs, OID_AUTO, realign_count, CTLFLAG_RD, &nfs_realign_count, 0, ""); + + /* * There is a congestion window for outstanding rpcs maintained per mount * point. 
The cwnd size is adjusted in roughly the way that: @@ -138,7 +148,7 @@ struct callout_handle nfs_timer_handle; static int nfs_msg __P((struct proc *,char *,char *)); static int nfs_rcvlock __P((struct nfsreq *)); static void nfs_rcvunlock __P((struct nfsreq *)); -static void nfs_realign __P((struct mbuf *m, int hsiz)); +static void nfs_realign __P((struct mbuf **pm, int hsiz)); static int nfs_receive __P((struct nfsreq *rep, struct sockaddr **aname, struct mbuf **mp)); static int nfs_reconnect __P((struct nfsreq *rep)); @@ -702,7 +712,7 @@ errout: * These could cause pointer alignment problems, so copy them to * well aligned mbufs. */ - nfs_realign(*mp, 5 * NFSX_UNSIGNED); + nfs_realign(mp, 5 * NFSX_UNSIGNED); return (error); } @@ -1589,92 +1599,56 @@ nfs_rcvunlock(rep) } /* - * Check for badly aligned mbuf data areas and - * realign data in an mbuf list by copying the data areas up, as required. + * nfs_realign: + * + * Check for badly aligned mbuf data and realign by copying the unaligned + * portion of the data into a new mbuf chain and freeing the portions + * of the old chain that were replaced. + * + * We cannot simply realign the data within the existing mbuf chain + * because the underlying buffers may contain other rpc commands and + * we cannot afford to overwrite them. + * + * We would prefer to avoid this situation entirely. The situation does + * not occur with NFS/UDP and is supposed to only occassionally occur + * with TCP. Use vfs.nfs.realign_count and realign_test to check this. */ static void -nfs_realign(m, hsiz) - register struct mbuf *m; +nfs_realign(pm, hsiz) + register struct mbuf **pm; int hsiz; { - register struct mbuf *m2; - register int siz, mlen, olen; - register caddr_t tcp, fcp; - struct mbuf *mnew; + struct mbuf *m; + struct mbuf *n = NULL; + int off = 0; - while (m) { - /* - * This never happens for UDP, rarely happens for TCP - * but frequently happens for iso transport. - */ - if ((m->m_len & 0x3) || (mtod(m, intptr_t) & 0x3)) { - olen = m->m_len; - fcp = mtod(m, caddr_t); - if ((intptr_t)fcp & 0x3) { - m->m_flags &= ~M_PKTHDR; - if (m->m_flags & M_EXT) - m->m_data = m->m_ext.ext_buf + - ((m->m_ext.ext_size - olen) & ~0x3); - else - m->m_data = m->m_dat; + ++nfs_realign_test; + + while ((m = *pm) != NULL) { + if ((m->m_len & 0x3) || (mtod(m, intptr_t) & 0x3)) { + MGET(n, M_WAIT, MT_DATA); + if (m->m_len >= MINCLSIZE) { + MCLGET(n, M_WAIT); + } + n->m_len = 0; + break; } - m->m_len = 0; - tcp = mtod(m, caddr_t); - mnew = m; - m2 = m->m_next; + pm = &m->m_next; + } - /* - * If possible, only put the first invariant part - * of the RPC header in the first mbuf. - */ - mlen = M_TRAILINGSPACE(m); - if (olen <= hsiz && mlen > hsiz) - mlen = hsiz; - - /* - * Loop through the mbuf list consolidating data. - */ + /* + * If n is non-NULL, loop on m copying data, then replace the + * portion of the chain that had to be realigned. 
+ */ + if (n != NULL) { + ++nfs_realign_count; while (m) { - while (olen > 0) { - if (mlen == 0) { - m2->m_flags &= ~M_PKTHDR; - if (m2->m_flags & M_EXT) - m2->m_data = m2->m_ext.ext_buf; - else - m2->m_data = m2->m_dat; - m2->m_len = 0; - mlen = M_TRAILINGSPACE(m2); - tcp = mtod(m2, caddr_t); - mnew = m2; - m2 = m2->m_next; - } - siz = min(mlen, olen); - if (tcp != fcp) - bcopy(fcp, tcp, siz); - mnew->m_len += siz; - mlen -= siz; - olen -= siz; - tcp += siz; - fcp += siz; - } + m_copyback(n, off, m->m_len, mtod(m, caddr_t)); + off += m->m_len; m = m->m_next; - if (m) { - olen = m->m_len; - fcp = mtod(m, caddr_t); - } } - - /* - * Finally, set m_len == 0 for any trailing mbufs that have - * been copied out of. - */ - while (m2) { - m2->m_len = 0; - m2 = m2->m_next; - } - return; - } - m = m->m_next; + m_freem(*pm); + *pm = n; } } @@ -2040,7 +2014,7 @@ nfsrv_rcv(so, arg, waitflag) m_freem(mp); continue; } - nfs_realign(mp, 10 * NFSX_UNSIGNED); + nfs_realign(&mp, 10 * NFSX_UNSIGNED); rec->nr_address = nam; rec->nr_packet = mp; STAILQ_INSERT_TAIL(&slp->ns_rec, rec, nr_link); @@ -2182,7 +2156,7 @@ nfsrv_getstream(slp, waitflag) if (!rec) { m_freem(slp->ns_frag); } else { - nfs_realign(slp->ns_frag, 10 * NFSX_UNSIGNED); + nfs_realign(&slp->ns_frag, 10 * NFSX_UNSIGNED); rec->nr_address = (struct sockaddr *)0; rec->nr_packet = slp->ns_frag; STAILQ_INSERT_TAIL(&slp->ns_rec, rec, nr_link); diff --git a/sys/nfsserver/nfsrvstats.h b/sys/nfsserver/nfsrvstats.h index bc15a7c6dc28..78a54a2d6be0 100644 --- a/sys/nfsserver/nfsrvstats.h +++ b/sys/nfsserver/nfsrvstats.h @@ -34,7 +34,7 @@ * SUCH DAMAGE. * * @(#)nfs.h 8.4 (Berkeley) 5/1/95 - * $Id: nfs.h,v 1.44 1998/09/07 05:42:15 bde Exp $ + * $Id: nfs.h,v 1.45 1999/02/25 00:03:50 peter Exp $ */ #ifndef _NFS_NFS_H_ @@ -651,8 +651,7 @@ void nfs_disconnect __P((struct nfsmount *)); void nfs_safedisconnect __P((struct nfsmount *)); int nfs_getattrcache __P((struct vnode *, struct vattr *)); int nfsm_strtmbuf __P((struct mbuf **, char **, const char *, long)); -int nfs_bioread __P((struct vnode *, struct uio *, int, struct ucred *, - int)); +int nfs_bioread __P((struct vnode *, struct uio *, int, struct ucred *)); int nfsm_uiotombuf __P((struct uio *, struct mbuf **, int, caddr_t *)); void nfsrv_init __P((int)); void nfs_clearcommit __P((struct mount *)); diff --git a/sys/sys/bio.h b/sys/sys/bio.h index d2ce212b4d12..2e88ca7fe5d2 100644 --- a/sys/sys/bio.h +++ b/sys/sys/bio.h @@ -36,7 +36,7 @@ * SUCH DAMAGE. * * @(#)buf.h 8.9 (Berkeley) 3/30/95 - * $Id: buf.h,v 1.64 1999/03/02 04:04:28 mckusick Exp $ + * $Id: buf.h,v 1.65 1999/03/12 02:24:55 julian Exp $ */ #ifndef _SYS_BUF_H_ @@ -78,6 +78,19 @@ struct iodone_chain { /* * The buffer header describes an I/O operation in the kernel. + * + * NOTES: + * b_bufsize, b_bcount. b_bufsize is the allocation size of the + * buffer, either DEV_BSIZE or PAGE_SIZE aligned. b_bcount is the + * originally requested buffer size and can serve as a bounds check + * against EOF. For most, but not all uses, b_bcount == b_bufsize. + * + * b_dirtyoff, b_dirtyend. Buffers support piecemeal, unaligned + * ranges of dirty data that need to be written to backing store. + * The range is typically clipped at b_bcount ( not b_bufsize ). + * + * b_resid. Number of bytes remaining in I/O. After an I/O operation + * completes, b_resid is usually 0 indicating 100% success. */ struct buf { LIST_ENTRY(buf) b_hash; /* Hash chain. */ @@ -109,8 +122,10 @@ struct buf { int b_dirtyend; /* Offset of end of dirty region. 
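/*
 * An illustrative clamp for the convention noted above: the dirty range is
 * kept in byte offsets and is normally clipped at b_bcount rather than
 * b_bufsize.  This helper is a sketch, not taken from the kernel sources.
 */
static void
clip_dirty_to_bcount(int *dirtyoff, int *dirtyend, int bcount)
{
	if (*dirtyend > bcount)
		*dirtyend = bcount;
	if (*dirtyoff > *dirtyend)
		*dirtyoff = *dirtyend;	/* degenerate case: nothing dirty */
}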
*/ struct ucred *b_rcred; /* Read credentials reference. */ struct ucred *b_wcred; /* Write credentials reference. */ +#if 0 int b_validoff; /* Offset in buffer of valid region. */ int b_validend; /* Offset of end of valid region. */ +#endif daddr_t b_pblkno; /* physical block number */ void *b_saveaddr; /* Original b_addr for physio. */ caddr_t b_savekva; /* saved kva for transfer while bouncing */ @@ -151,9 +166,24 @@ struct buf { * Buffer vp reassignments are illegal in this case. * * B_CACHE This may only be set if the buffer is entirely valid. - * The situation where B_DELWRI is set and B_CACHE gets - * cleared MUST be committed to disk so B_DELWRI can - * also be cleared. + * The situation where B_DELWRI is set and B_CACHE is + * clear MUST be committed to disk by getblk() so + * B_DELWRI can also be cleared. See the comments for + * getblk() in kern/vfs_bio.c. If B_CACHE is clear, + * the caller is expected to clear B_ERROR|B_INVAL, + * set B_READ, and initiate an I/O. + * + * The 'entire buffer' is defined to be the range from + * 0 through b_bcount. + * + * B_MALLOC Request that the buffer be allocated from the malloc + * pool, DEV_BSIZE aligned instead of PAGE_SIZE aligned. + * + * B_VMIO Indicates that the buffer is tied into an VM object. + * The buffer's data is always PAGE_SIZE aligned even + * if b_bufsize and b_bcount are not. ( b_bufsize is + * always at least DEV_BSIZE aligned, though ). + * */ #define B_AGE 0x00000001 /* Move to age queue when I/O done. */ @@ -356,6 +386,7 @@ void cluster_write __P((struct buf *, u_quad_t)); int physio __P((void (*)(struct buf *), struct buf *, dev_t, int, u_int (*)(struct buf *), struct uio *)); u_int minphys __P((struct buf *)); +void vfs_bio_set_validclean __P((struct buf *, int base, int size)); void vfs_bio_clrbuf __P((struct buf *)); void vfs_busy_pages __P((struct buf *, int clear_modify)); void vfs_unbusy_pages __P((struct buf *)); @@ -371,6 +402,7 @@ int allocbuf __P((struct buf *bp, int size)); void reassignbuf __P((struct buf *, struct vnode *)); void pbreassignbuf __P((struct buf *, struct vnode *)); struct buf *trypbuf __P((int *)); + #endif /* KERNEL */ #endif /* !_SYS_BUF_H_ */ diff --git a/sys/sys/buf.h b/sys/sys/buf.h index d2ce212b4d12..2e88ca7fe5d2 100644 --- a/sys/sys/buf.h +++ b/sys/sys/buf.h @@ -36,7 +36,7 @@ * SUCH DAMAGE. * * @(#)buf.h 8.9 (Berkeley) 3/30/95 - * $Id: buf.h,v 1.64 1999/03/02 04:04:28 mckusick Exp $ + * $Id: buf.h,v 1.65 1999/03/12 02:24:55 julian Exp $ */ #ifndef _SYS_BUF_H_ @@ -78,6 +78,19 @@ struct iodone_chain { /* * The buffer header describes an I/O operation in the kernel. + * + * NOTES: + * b_bufsize, b_bcount. b_bufsize is the allocation size of the + * buffer, either DEV_BSIZE or PAGE_SIZE aligned. b_bcount is the + * originally requested buffer size and can serve as a bounds check + * against EOF. For most, but not all uses, b_bcount == b_bufsize. + * + * b_dirtyoff, b_dirtyend. Buffers support piecemeal, unaligned + * ranges of dirty data that need to be written to backing store. + * The range is typically clipped at b_bcount ( not b_bufsize ). + * + * b_resid. Number of bytes remaining in I/O. After an I/O operation + * completes, b_resid is usually 0 indicating 100% success. */ struct buf { LIST_ENTRY(buf) b_hash; /* Hash chain. */ @@ -109,8 +122,10 @@ struct buf { int b_dirtyend; /* Offset of end of dirty region. */ struct ucred *b_rcred; /* Read credentials reference. */ struct ucred *b_wcred; /* Write credentials reference. 
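/*
 * The B_CACHE contract documented above, reduced to the caller's side as a
 * sketch: when B_CACHE is clear the caller clears B_ERROR|B_INVAL, sets
 * B_READ and initiates the I/O.  The flag values below are stand-ins, not
 * the real sys/buf.h definitions.
 */
#define SKF_READ	0x0001
#define SKF_ERROR	0x0002
#define SKF_INVAL	0x0004
#define SKF_CACHE	0x0008

static int
prepare_buffer_flags(int flags)
{
	if (flags & SKF_CACHE)
		return flags;			/* fully valid, no I/O needed */
	flags &= ~(SKF_ERROR | SKF_INVAL);	/* must be clear before I/O */
	return flags | SKF_READ;
}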
*/ +#if 0 int b_validoff; /* Offset in buffer of valid region. */ int b_validend; /* Offset of end of valid region. */ +#endif daddr_t b_pblkno; /* physical block number */ void *b_saveaddr; /* Original b_addr for physio. */ caddr_t b_savekva; /* saved kva for transfer while bouncing */ @@ -151,9 +166,24 @@ struct buf { * Buffer vp reassignments are illegal in this case. * * B_CACHE This may only be set if the buffer is entirely valid. - * The situation where B_DELWRI is set and B_CACHE gets - * cleared MUST be committed to disk so B_DELWRI can - * also be cleared. + * The situation where B_DELWRI is set and B_CACHE is + * clear MUST be committed to disk by getblk() so + * B_DELWRI can also be cleared. See the comments for + * getblk() in kern/vfs_bio.c. If B_CACHE is clear, + * the caller is expected to clear B_ERROR|B_INVAL, + * set B_READ, and initiate an I/O. + * + * The 'entire buffer' is defined to be the range from + * 0 through b_bcount. + * + * B_MALLOC Request that the buffer be allocated from the malloc + * pool, DEV_BSIZE aligned instead of PAGE_SIZE aligned. + * + * B_VMIO Indicates that the buffer is tied into an VM object. + * The buffer's data is always PAGE_SIZE aligned even + * if b_bufsize and b_bcount are not. ( b_bufsize is + * always at least DEV_BSIZE aligned, though ). + * */ #define B_AGE 0x00000001 /* Move to age queue when I/O done. */ @@ -356,6 +386,7 @@ void cluster_write __P((struct buf *, u_quad_t)); int physio __P((void (*)(struct buf *), struct buf *, dev_t, int, u_int (*)(struct buf *), struct uio *)); u_int minphys __P((struct buf *)); +void vfs_bio_set_validclean __P((struct buf *, int base, int size)); void vfs_bio_clrbuf __P((struct buf *)); void vfs_busy_pages __P((struct buf *, int clear_modify)); void vfs_unbusy_pages __P((struct buf *)); @@ -371,6 +402,7 @@ int allocbuf __P((struct buf *bp, int size)); void reassignbuf __P((struct buf *, struct vnode *)); void pbreassignbuf __P((struct buf *, struct vnode *)); struct buf *trypbuf __P((int *)); + #endif /* KERNEL */ #endif /* !_SYS_BUF_H_ */ diff --git a/sys/ufs/ffs/ffs_inode.c b/sys/ufs/ffs/ffs_inode.c index 882144028690..c80d0a5e4d78 100644 --- a/sys/ufs/ffs/ffs_inode.c +++ b/sys/ufs/ffs/ffs_inode.c @@ -31,7 +31,7 @@ * SUCH DAMAGE. * * @(#)ffs_inode.c 8.13 (Berkeley) 4/21/95 - * $Id: ffs_inode.c,v 1.52 1999/01/07 16:14:16 bde Exp $ + * $Id: ffs_inode.c,v 1.53 1999/01/28 00:57:54 dillon Exp $ */ #include "opt_quota.h" @@ -452,6 +452,7 @@ ffs_indirtrunc(ip, lbn, dbn, lastbn, level, countp) if ((bp->b_flags & B_CACHE) == 0) { curproc->p_stats->p_ru.ru_inblock++; /* pay for read */ bp->b_flags |= B_READ; + bp->b_flags &= ~(B_ERROR|B_INVAL); if (bp->b_bcount > bp->b_bufsize) panic("ffs_indirtrunc: bad buffer size"); bp->b_blkno = dbn; diff --git a/sys/ufs/mfs/mfs_vnops.c b/sys/ufs/mfs/mfs_vnops.c index d4d82f09c1d4..c9ae4dd8483d 100644 --- a/sys/ufs/mfs/mfs_vnops.c +++ b/sys/ufs/mfs/mfs_vnops.c @@ -31,7 +31,7 @@ * SUCH DAMAGE. * * @(#)mfs_vnops.c 8.11 (Berkeley) 5/22/95 - * $Id: mfs_vnops.c,v 1.42 1999/01/28 00:57:55 dillon Exp $ + * $Id: mfs_vnops.c,v 1.43 1999/04/11 02:28:32 eivind Exp $ */ #include @@ -127,6 +127,9 @@ mfs_fsync(ap) * We implement the B_FREEBUF strategy. We can't just madvise() * here because we have to do it in the correct order vs other bio * requests, so we queue it. + * + * Note: geteblk() sets B_INVAL. We leave it set to guarentee buffer + * throw-away on brelse()? 
 */
 static int
diff --git a/sys/ufs/ufs/ufs_bmap.c b/sys/ufs/ufs/ufs_bmap.c
index f40ff338d41a..3ea5965e90d6 100644
--- a/sys/ufs/ufs/ufs_bmap.c
+++ b/sys/ufs/ufs/ufs_bmap.c
@@ -36,7 +36,7 @@
  * SUCH DAMAGE.
  *
  *	@(#)ufs_bmap.c	8.7 (Berkeley) 3/21/95
- * $Id: ufs_bmap.c,v 1.24 1998/10/27 11:47:08 bde Exp $
+ * $Id: ufs_bmap.c,v 1.25 1999/01/28 00:57:55 dillon Exp $
  */
 
 #include 
@@ -228,6 +228,7 @@ ufs_bmaparray(vp, bn, bnp, ap, nump, runp, runb)
 #endif
 		bp->b_blkno = blkptrtodb(ump, daddr);
 		bp->b_flags |= B_READ;
+		bp->b_flags &= ~(B_INVAL|B_ERROR);
 		vfs_busy_pages(bp, 0);
 		VOP_STRATEGY(bp->b_vp, bp);
 		curproc->p_stats->p_ru.ru_inblock++;	/* XXX */
diff --git a/sys/vm/vm_fault.c b/sys/vm/vm_fault.c
index 047f10f6cbe6..882d52efdffd 100644
--- a/sys/vm/vm_fault.c
+++ b/sys/vm/vm_fault.c
@@ -66,7 +66,7 @@
  * any improvements or extensions that they make and grant Carnegie the
  * rights to redistribute these changes.
  *
- * $Id: vm_fault.c,v 1.100 1999/02/17 09:08:29 dillon Exp $
+ * $Id: vm_fault.c,v 1.101 1999/02/25 06:00:52 alc Exp $
  */
 
 /*
@@ -409,6 +409,12 @@ readrest:
 				firstpindex = fs.first_pindex -
 					2*(VM_FAULT_READ_BEHIND + VM_FAULT_READ_AHEAD + 1);
 
+			/*
+			 * note: partially valid pages cannot be
+			 * included in the lookahead - NFS piecemeal
+			 * writes will barf on it badly.
+			 */
+
 			for(tmppindex = fs.first_pindex - 1;
 				tmppindex >= firstpindex;
 				--tmppindex) {
@@ -552,12 +558,16 @@ readrest:
 			}
 			fs.first_m = NULL;
 
+			/*
+			 * Zero the page if necessary and mark it valid.
+			 */
 			if ((fs.m->flags & PG_ZERO) == 0) {
 				vm_page_zero_fill(fs.m);
-			}
-			else
+			} else {
 				cnt.v_ozfod++;
+			}
 			cnt.v_zfod++;
+			fs.m->valid = VM_PAGE_BITS_ALL;
 			break;	/* break to PAGE HAS BEEN FOUND */
 		} else {
 			if (fs.object != fs.first_object) {
@@ -788,14 +798,24 @@ readrest:
 #endif
 	unlock_things(&fs);
 
-	fs.m->valid = VM_PAGE_BITS_ALL;
-	vm_page_flag_clear(fs.m, PG_ZERO);
+
+	/*
+	 * Sanity check: page must be completely valid or it is not fit to
+	 * map into user space.  vm_pager_get_pages() ensures this.
+	 */
+
+	if (fs.m->valid != VM_PAGE_BITS_ALL) {
+		vm_page_zero_invalid(fs.m, TRUE);
+		printf("Warning: page %p partially invalid on fault\n", fs.m);
+	}
 
 	pmap_enter(fs.map->pmap, vaddr, VM_PAGE_TO_PHYS(fs.m), prot, wired);
+
 	if (((fault_flags & VM_FAULT_WIRE_MASK) == 0) && (wired == 0)) {
 		pmap_prefault(fs.map->pmap, vaddr, fs.entry);
 	}
+
 	vm_page_flag_clear(fs.m, PG_ZERO);
 	vm_page_flag_set(fs.m, PG_MAPPED|PG_REFERENCED);
 	if (fault_flags & VM_FAULT_HOLD)
 		vm_page_hold(fs.m);
diff --git a/sys/vm/vm_page.c b/sys/vm/vm_page.c
index e07ea633ed17..0d85a946757c 100644
--- a/sys/vm/vm_page.c
+++ b/sys/vm/vm_page.c
@@ -34,7 +34,7 @@
  * SUCH DAMAGE.
  *
  * from: @(#)vm_page.c	7.4 (Berkeley) 5/7/91
- * $Id: vm_page.c,v 1.128 1999/03/19 05:21:03 alc Exp $
+ * $Id: vm_page.c,v 1.129 1999/04/05 19:38:29 julian Exp $
  */
 
 /*
@@ -1460,14 +1460,16 @@ vm_page_bits(int base, int size)
 }
 
 /*
- *	set a page valid and clean.  May not block.
+ *	vm_page_set_validclean:
  *
- *	In order to maintain consistancy due to the DEV_BSIZE granularity
- *	of the valid bits, we have to zero non-DEV_BSIZE aligned portions of
- *	the page at the beginning and end of the valid range when the
- *	associated valid bits are not already set.
+ *	Sets portions of a page valid and clean.  The arguments are expected
+ *	to be DEV_BSIZE aligned but if they aren't the bitmap is inclusive
+ *	of any partial chunks touched by the range.  The invalid portion of
+ *	such chunks will be zero'd.
  *
- *	(base + size) must be less then or equal to PAGE_SIZE.
+ *	This routine may not block.
+ *
+ *	(base + size) must be less then or equal to PAGE_SIZE.
  */
 void
 vm_page_set_validclean(m, base, size)
@@ -1529,8 +1531,35 @@ vm_page_set_validclean(m, base, size)
 		pmap_clear_modify(VM_PAGE_TO_PHYS(m));
 }
 
+#if 0
+
+void
+vm_page_set_dirty(m, base, size)
+	vm_page_t m;
+	int base;
+	int size;
+{
+	m->dirty |= vm_page_bits(base, size);
+}
+
+#endif
+
+void
+vm_page_clear_dirty(m, base, size)
+	vm_page_t m;
+	int base;
+	int size;
+{
+	m->dirty &= ~vm_page_bits(base, size);
+}
+
 /*
- *	set a page (partially) invalid.  May not block.
+ *	vm_page_set_invalid:
+ *
+ *	Invalidates DEV_BSIZE'd chunks within a page.  Both the
+ *	valid and dirty bits for the effected areas are cleared.
+ *
+ *	May not block.
  */
 void
 vm_page_set_invalid(m, base, size)
@@ -1540,9 +1569,9 @@
 {
 	int bits;
 
-	m->valid &= ~(bits = vm_page_bits(base, size));
-	if (m->valid == 0)
-		m->dirty &= ~bits;
+	bits = vm_page_bits(base, size);
+	m->valid &= ~bits;
+	m->dirty &= ~bits;
 	m->object->generation++;
 }
 
diff --git a/sys/vm/vm_page.h b/sys/vm/vm_page.h
index 8072f666b4c3..abff79486aad 100644
--- a/sys/vm/vm_page.h
+++ b/sys/vm/vm_page.h
@@ -61,7 +61,7 @@
  * any improvements or extensions that they make and grant Carnegie the
  * rights to redistribute these changes.
  *
- * $Id: vm_page.h,v 1.58 1999/03/15 05:09:48 julian Exp $
+ * $Id: vm_page.h,v 1.59 1999/04/05 19:38:29 julian Exp $
  */
 
 /*
@@ -101,6 +101,10 @@
  *	Fields in this structure are locked either by the lock on the
  *	object that the page belongs to (O) or by the lock on the page
  *	queues (P).
+ *
+ *	The 'valid' and 'dirty' fields are distinct.  A page may have dirty
+ *	bits set without having associated valid bits set.  This is used by
+ *	NFS to implement piecemeal writes.
  */
 
 TAILQ_HEAD(pglist, vm_page);
@@ -404,6 +408,8 @@ void vm_page_wire __P((vm_page_t));
 void vm_page_unqueue __P((vm_page_t));
 void vm_page_unqueue_nowakeup __P((vm_page_t));
 void vm_page_set_validclean __P((vm_page_t, int, int));
+void vm_page_set_dirty __P((vm_page_t, int, int));
+void vm_page_clear_dirty __P((vm_page_t, int, int));
 void vm_page_set_invalid __P((vm_page_t, int, int));
 static __inline boolean_t vm_page_zero_fill __P((vm_page_t));
 int vm_page_is_valid __P((vm_page_t, int, int));
diff --git a/sys/vm/vm_pager.c b/sys/vm/vm_pager.c
index 36a905ec6dc9..dbacceb03050 100644
--- a/sys/vm/vm_pager.c
+++ b/sys/vm/vm_pager.c
@@ -61,7 +61,7 @@
  * any improvements or extensions that they make and grant Carnegie the
  * rights to redistribute these changes.
  *
- * $Id: vm_pager.c,v 1.44 1999/03/14 09:20:00 julian Exp $
+ * $Id: vm_pager.c,v 1.45 1999/04/11 02:16:27 eivind Exp $
  */
 
 /*
@@ -523,6 +523,9 @@ vm_pager_chain_iodone(struct buf *nbp)
  *	Obtain a physical buffer and chain it to its parent buffer.  When
  *	I/O completes, the parent buffer will be B_SIGNAL'd.  Errors are
  *	automatically propogated to the parent
+ *
+ *	Since these are brand new buffers, we do not have to clear B_INVAL
+ *	and B_ERROR because they are already clear.
  */
 
 struct buf *
diff --git a/sys/vm/vm_pager.h b/sys/vm/vm_pager.h
index 82b65740f845..aff14ab29aef 100644
--- a/sys/vm/vm_pager.h
+++ b/sys/vm/vm_pager.h
@@ -36,7 +36,7 @@
  * SUCH DAMAGE.
  *
  *	@(#)vm_pager.h	8.4 (Berkeley) 1/12/94
- * $Id: vm_pager.h,v 1.20 1999/01/24 02:32:15 dillon Exp $
+ * $Id: vm_pager.h,v 1.21 1999/03/14 09:20:00 julian Exp $
  */
 
 /*
@@ -110,6 +110,14 @@ void flushchainbuf(struct buf *nbp);
 void waitchainbuf(struct buf *bp, int count, int done);
 void autochaindone(struct buf *bp);
 
+/*
+ * vm_page_get_pages:
+ *
+ *	Retrieve pages from the VM system in order to map them into an object
+ *	( or into VM space somewhere ).  If the pagein was successful, we
+ *	must fully validate it.
+ */
+
 static __inline int
 vm_pager_get_pages(
 	vm_object_t object,
@@ -117,7 +125,13 @@ vm_pager_get_pages(
 	int count,
 	int reqpage
 ) {
-	return ((*pagertab[object->type]->pgo_getpages)(object, m, count, reqpage));
+	int r;
+
+	r = (*pagertab[object->type]->pgo_getpages)(object, m, count, reqpage);
+	if (r == VM_PAGER_OK && m[reqpage]->valid != VM_PAGE_BITS_ALL) {
+		vm_page_zero_invalid(m[reqpage], TRUE);
+	}
+	return(r);
 }
 
 static __inline void
diff --git a/sys/vm/vnode_pager.c b/sys/vm/vnode_pager.c
index 628bec7cf5ab..83f379a206a1 100644
--- a/sys/vm/vnode_pager.c
+++ b/sys/vm/vnode_pager.c
@@ -38,7 +38,7 @@
  * SUCH DAMAGE.
  *
  * from: @(#)vnode_pager.c	7.5 (Berkeley) 4/20/91
- * $Id: vnode_pager.c,v 1.106 1999/04/05 19:38:29 julian Exp $
+ * $Id: vnode_pager.c,v 1.107 1999/04/10 20:52:11 dt Exp $
  */
 
 /*
@@ -789,7 +789,8 @@ vnode_pager_generic_getpages(vp, m, bytecount, reqpage)
 				 * read.
 				 */
 				vm_page_set_validclean(mt, 0, size - tfoff);
-				vm_page_zero_invalid(mt, FALSE);
+				/* handled by vm_fault now */
+				/* vm_page_zero_invalid(mt, FALSE); */
 			}
 
 		vm_page_flag_clear(mt, PG_ZERO);
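
Editor's illustrative sketch, not part of the patch: the hunks above repeat one caller-side pattern that the new buf.h comment spells out -- when B_CACHE is clear, the caller of getblk() clears B_ERROR|B_INVAL, sets B_READ, and starts the I/O itself; when B_CACHE is set the buffer contents are already valid and no I/O is needed. The fragment below condenses that protocol into one place. my_read_block() is a hypothetical helper written for this note (it is not a function from the tree), and the error handling is reduced to the minimum; the ufs_bmaparray() and ffs_indirtrunc() hunks above are the real in-tree instances of the same sequence.

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/buf.h>
#include <sys/vnode.h>

/*
 * Sketch of the B_CACHE read protocol: get the buffer, and only issue a
 * read if its contents are not already fully valid.
 */
static int
my_read_block(struct vnode *vp, daddr_t blkno, int size, struct buf **bpp)
{
	struct buf *bp;
	int error;

	bp = getblk(vp, blkno, size, 0, 0);
	if ((bp->b_flags & B_CACHE) == 0) {
		/* Not fully valid: the caller starts the read itself. */
		bp->b_flags |= B_READ;
		bp->b_flags &= ~(B_ERROR | B_INVAL);
		vfs_busy_pages(bp, 0);
		VOP_STRATEGY(bp->b_vp, bp);
		error = biowait(bp);
		if (error) {
			brelse(bp);
			*bpp = NULL;
			return (error);
		}
	}
	*bpp = bp;
	return (0);
}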
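
Editor's illustrative sketch, not part of the patch: the vm_page.h and vm_page.c hunks above treat a page's valid and dirty fields as independent bitmaps carrying one bit per DEV_BSIZE chunk (so a page can be dirty in chunks that are not valid, which NFS relies on), and vm_page_set_invalid now clears both maps for every chunk a range touches. The stand-alone program below models that arithmetic so the rounding can be seen in isolation. The my_ names and the 4096/512 constants are assumptions made for the example; my_page_bits() imitates the inclusive rounding described in the vm_page_set_validclean() comment, it is not the kernel's vm_page_bits().

#include <stdio.h>

#define MY_PAGE_SIZE	4096	/* assumption: i386-style 4K pages */
#define MY_DEV_BSIZE	512	/* one valid/dirty bit per 512-byte chunk */

/*
 * Convert a byte range within a page into a chunk bitmask.  Any chunk
 * that is even partially covered by the range gets its bit set.
 */
static int
my_page_bits(int base, int size)
{
	int first, last;

	if (size == 0)
		return (0);
	first = base / MY_DEV_BSIZE;
	last = (base + size + MY_DEV_BSIZE - 1) / MY_DEV_BSIZE;
	return (((1 << last) - 1) & ~((1 << first) - 1));
}

int
main(void)
{
	/* Start with all 8 chunks of a 4K page valid and dirty. */
	int valid = (1 << (MY_PAGE_SIZE / MY_DEV_BSIZE)) - 1;
	int dirty = valid;
	int bits;

	/* Invalidate bytes 100-699: chunks 0 and 1 are touched. */
	bits = my_page_bits(100, 600);
	valid &= ~bits;		/* vm_page_set_invalid now clears both ... */
	dirty &= ~bits;		/* ... the valid and the dirty bits       */
	printf("mask 0x%02x  valid 0x%02x  dirty 0x%02x\n", bits, valid, dirty);
	return (0);
}

Running the example prints "mask 0x03  valid 0xfc  dirty 0xfc": a 600-byte invalidation starting at offset 100 knocks out the first two 512-byte chunks of the page, and nothing else.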