The VFS/BIO subsystem contained a number of hacks in order to optimize
piecemeal, middle-of-file writes for NFS.  These hacks have caused no
end of trouble, especially when combined with mmap().  I've removed
them.  Instead, NFS will issue a read-before-write to fully
instantiate the struct buf containing the write.  NFS does, however,
optimize piecemeal appends to files.  For most common file operations,
you will not notice the difference.  The sole remaining fragment in
the VFS/BIO system is b_dirtyoff/end, which NFS uses to avoid cache
coherency issues with read-merge-write style operations.  NFS also
optimizes the write-covers-entire-buffer case by avoiding the
read-before-write.  There is quite a bit of room for further
optimization in these areas.
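
As an illustration, the heart of the new nfs_write() buffer handling looks
roughly like the following (condensed from the nfs_bio.c diff below; "on" is
the offset of the write within the buffer, "n" its length, and "bcount" the
buffer size):

    if (on == 0 && n == bcount) {
        /* The write covers the entire buffer: no read needed. */
        bp->b_flags |= B_CACHE;
        bp->b_flags &= ~(B_ERROR | B_INVAL);
    }
    if ((bp->b_flags & B_CACHE) == 0) {
        /* Read-before-write to fully instantiate the buffer. */
        bp->b_flags |= B_READ;
        vfs_busy_pages(bp, 0);
        error = nfs_doio(bp, cred, p);
        if (error) {
            brelse(bp);
            return (error);
        }
    }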

The VM system marks pages fully-valid (AKA vm_page_t->valid =
VM_PAGE_BITS_ALL) in several places, most notably in vm_fault.  This
is not correct operation.  The vm_pager_get_pages() code is now
responsible for marking VM pages all-valid.  A number of VM helper
routines have been added to aid in zeroing-out the invalid portions of
a VM page prior to the page being marked all-valid.  This operation is
necessary to properly support mmap().  The zeroing occurs most often
when dealing with file-EOF situations.  Several bugs have been fixed
in the NFS subsystem, including the handling of file and directory EOF
situations and buf->b_flags consistency issues relating to clearing
B_ERROR and B_INVAL, and handling B_DONE.
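
For example, the reworked read path in nfs_doio() (see the nfs_bio.c diff
below) now zero-fills whatever portion of the buffer a short read leaves
uninstantiated, so the backing pages never expose stale data once they are
marked valid:

    if (!error && uiop->uio_resid) {
        /* Short read: zero-fill the rest of the buffer (hole or EOF). */
        int nread = bp->b_bcount - uiop->uio_resid;
        int left  = bp->b_bcount - nread;

        if (left > 0)
            bzero((char *)bp->b_data + nread, left);
        uiop->uio_resid = 0;
    }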

getblk() and allocbuf() have been rewritten.  B_CACHE operation is now
formally defined in comments and more straightforward in
implementation.  B_CACHE for VMIO buffers is based on the validity of
the backing store.  B_CACHE for non-VMIO buffers is based simply on
whether the buffer is B_INVAL or not (B_CACHE set if B_INVAL clear,
and vice versa).  biodone() is now responsible for setting B_CACHE
when a successful read completes.  B_CACHE is also set when a bdwrite()
is initiated and when a bwrite() is initiated.  VFS VOP_BWRITE
routines (there are only two - nfs_bwrite() and bwrite()) are now
expected to set B_CACHE.  This means that bowrite() and bawrite() also
set B_CACHE indirectly.
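
A minimal sketch of that rule, assuming a helper that tests the validity of
the backing pages (the helper name below is illustrative only; the real
logic lives in getblk()/allocbuf() in kern/vfs_bio.c, whose diff is
suppressed below because of its size):

    if (bp->b_flags & B_VMIO) {
        /* VMIO: B_CACHE tracks whether the backing store is fully valid. */
        if (vmio_backing_store_valid(bp))   /* illustrative helper */
            bp->b_flags |= B_CACHE;
        else
            bp->b_flags &= ~B_CACHE;
    } else {
        /* Non-VMIO: B_CACHE is simply the inverse of B_INVAL. */
        if (bp->b_flags & B_INVAL)
            bp->b_flags &= ~B_CACHE;
        else
            bp->b_flags |= B_CACHE;
    }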

There are a number of places in the code which were previously using
buf->b_bufsize (which is DEV_BSIZE aligned) when they should have
been using buf->b_bcount.  These have been fixed.  getblk() now clears
B_DONE on return because the rest of the system is so bad about
dealing with B_DONE.
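
For example, the B_CLUSTEROK test in nfs_doio()'s write path now compares
the dirty range against the logical b_bcount rather than the DEV_BSIZE-padded
b_bufsize (taken from the nfs_bio.c diff below):

    if (!error && iomode == NFSV3WRITE_UNSTABLE) {
        bp->b_flags |= B_NEEDCOMMIT;
        /* b_bcount, not the DEV_BSIZE-aligned b_bufsize */
        if (bp->b_dirtyoff == 0 && bp->b_dirtyend == bp->b_bcount)
            bp->b_flags |= B_CLUSTEROK;
    } else {
        bp->b_flags &= ~B_NEEDCOMMIT;
    }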

Major fixes to NFS/TCP have been made.  A server-side bug could cause
requests to be lost when nfs_realign() overwrote other RPCs in the
same TCP mbuf chain.  The server's kernel must be recompiled to get
the benefit of these fixes.
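
The rewritten nfs_realign() (shown in full in the nfs_socket.c diff below)
no longer shifts data around within the existing mbufs; instead it copies
the misaligned tail of the chain into freshly allocated, aligned mbufs and
splices them in, which is what keeps it from clobbering other RPCs that
share the same buffers.  Condensed, the new logic is:

    ++nfs_realign_test;
    while ((m = *pm) != NULL) {
        if ((m->m_len & 0x3) || (mtod(m, intptr_t) & 0x3)) {
            /* Misaligned: start a fresh, aligned mbuf (chain). */
            MGET(n, M_WAIT, MT_DATA);
            if (m->m_len >= MINCLSIZE)
                MCLGET(n, M_WAIT);
            n->m_len = 0;
            break;
        }
        pm = &m->m_next;
    }
    if (n != NULL) {
        ++nfs_realign_count;
        /* Copy the misaligned tail into the new chain... */
        while (m) {
            m_copyback(n, off, m->m_len, mtod(m, caddr_t));
            off += m->m_len;
            m = m->m_next;
        }
        /* ...and splice it in, freeing the old tail. */
        m_freem(*pm);
        *pm = n;
    }

The two counters above are exported as read-only sysctls
(vfs.nfs.realign_test and vfs.nfs.realign_count) so the frequency of
realignment can be monitored.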

Submitted by:	Matthew Dillon <dillon@apollo.backplane.com>
Alan Cox 1999-05-02 23:57:16 +00:00
parent 1b3859ce9a
commit 4221e284a3
Notes: svn2git 2020-12-20 02:59:44 +00:00
svn path=/head/; revision=46349
30 changed files with 1412 additions and 996 deletions

@ -36,7 +36,7 @@
* SUCH DAMAGE.
*
* @(#)ufs_bmap.c 8.7 (Berkeley) 3/21/95
* $Id: ufs_bmap.c,v 1.24 1998/10/27 11:47:08 bde Exp $
* $Id: ufs_bmap.c,v 1.25 1999/01/28 00:57:55 dillon Exp $
*/
#include <sys/param.h>
@ -228,6 +228,7 @@ ufs_bmaparray(vp, bn, bnp, ap, nump, runp, runb)
#endif
bp->b_blkno = blkptrtodb(ump, daddr);
bp->b_flags |= B_READ;
bp->b_flags &= ~(B_INVAL|B_ERROR);
vfs_busy_pages(bp, 0);
VOP_STRATEGY(bp->b_vp, bp);
curproc->p_stats->p_ru.ru_inblock++; /* XXX */

File diff suppressed because it is too large.

@ -33,7 +33,7 @@
* SUCH DAMAGE.
*
* @(#)vfs_cluster.c 8.7 (Berkeley) 2/13/94
* $Id: vfs_cluster.c,v 1.79 1999/01/27 21:49:58 dillon Exp $
* $Id: vfs_cluster.c,v 1.80 1999/03/12 02:24:56 julian Exp $
*/
#include "opt_debug_cluster.h"
@ -251,6 +251,7 @@ cluster_read(vp, filesize, lblkno, size, cred, totread, seqcount, bpp)
#endif
if ((bp->b_flags & B_CLUSTER) == 0)
vfs_busy_pages(bp, 0);
bp->b_flags &= ~(B_ERROR|B_INVAL);
error = VOP_STRATEGY(vp, bp);
curproc->p_stats->p_ru.ru_inblock++;
}
@ -283,6 +284,7 @@ cluster_read(vp, filesize, lblkno, size, cred, totread, seqcount, bpp)
if ((rbp->b_flags & B_CLUSTER) == 0)
vfs_busy_pages(rbp, 0);
rbp->b_flags &= ~(B_ERROR|B_INVAL);
(void) VOP_STRATEGY(vp, rbp);
curproc->p_stats->p_ru.ru_inblock++;
}
@ -473,8 +475,10 @@ cluster_callback(bp)
if (error) {
tbp->b_flags |= B_ERROR;
tbp->b_error = error;
} else
tbp->b_dirtyoff = tbp->b_dirtyend = 0;
} else {
tbp->b_dirtyoff = tbp->b_dirtyend = 0;
tbp->b_flags &= ~(B_ERROR|B_INVAL);
}
biodone(tbp);
}
relpbuf(bp, &cluster_pbuf_freecnt);

@ -138,6 +138,18 @@ vop_panic(struct vop_generic_args *ap)
panic("illegal vnode op called");
}
/*
* vop_nostrategy:
*
* Strategy routine for VFS devices that have none.
*
* B_ERROR and B_INVAL must be cleared prior to calling any strategy
* routine. Typically this is done for a B_READ strategy call. Typically
* B_INVAL is assumed to already be clear prior to a write and should not
* be cleared manually unless you just made the buffer invalid. B_ERROR
* should be cleared either way.
*/
static int
vop_nostrategy (struct vop_strategy_args *ap)
{

@ -34,7 +34,7 @@
* SUCH DAMAGE.
*
* @(#)nfs.h 8.4 (Berkeley) 5/1/95
* $Id: nfs.h,v 1.44 1998/09/07 05:42:15 bde Exp $
* $Id: nfs.h,v 1.45 1999/02/25 00:03:50 peter Exp $
*/
#ifndef _NFS_NFS_H_
@ -651,8 +651,7 @@ void nfs_disconnect __P((struct nfsmount *));
void nfs_safedisconnect __P((struct nfsmount *));
int nfs_getattrcache __P((struct vnode *, struct vattr *));
int nfsm_strtmbuf __P((struct mbuf **, char **, const char *, long));
int nfs_bioread __P((struct vnode *, struct uio *, int, struct ucred *,
int));
int nfs_bioread __P((struct vnode *, struct uio *, int, struct ucred *));
int nfsm_uiotombuf __P((struct uio *, struct mbuf **, int, caddr_t *));
void nfsrv_init __P((int));
void nfs_clearcommit __P((struct mount *));

@ -34,7 +34,7 @@
* SUCH DAMAGE.
*
* @(#)nfs_bio.c 8.9 (Berkeley) 3/30/95
* $Id: nfs_bio.c,v 1.68 1999/04/05 19:38:28 julian Exp $
* $Id: nfs_bio.c,v 1.69 1999/04/06 03:07:54 peter Exp $
*/
@ -65,7 +65,6 @@
static struct buf *nfs_getcacheblk __P((struct vnode *vp, daddr_t bn, int size,
struct proc *p));
static void nfs_prot_buf __P((struct buf *bp, int off, int n));
extern int nfs_numasync;
extern int nfs_pbuf_freecnt;
@ -84,7 +83,7 @@ nfs_getpages(ap)
vm_ooffset_t a_offset;
} */ *ap;
{
int i, error, nextoff, size, toff, npages, count;
int i, error, nextoff, size, toff, count, npages;
struct uio uio;
struct iovec iov;
vm_offset_t kva;
@ -110,13 +109,35 @@ nfs_getpages(ap)
if ((nmp->nm_flag & NFSMNT_NFSV3) != 0 &&
(nmp->nm_state & NFSSTA_GOTFSINFO) == 0)
(void)nfs_fsinfo(nmp, vp, cred, p);
npages = btoc(count);
/*
* If the requested page is partially valid, just return it and
* allow the pager to zero-out the blanks. Partially valid pages
* can only occur at the file EOF.
*/
{
vm_page_t m = pages[ap->a_reqpage];
if (m->valid != 0) {
/* handled by vm_fault now */
/* vm_page_zero_invalid(m, TRUE); */
for (i = 0; i < npages; ++i) {
if (i != ap->a_reqpage)
vnode_pager_freepage(pages[i]);
}
return(0);
}
}
/*
* We use only the kva address for the buffer, but this is extremely
* convienient and fast.
*/
bp = getpbuf(&nfs_pbuf_freecnt);
npages = btoc(count);
kva = (vm_offset_t) bp->b_data;
pmap_qenter(kva, pages, npages);
@ -167,12 +188,12 @@ nfs_getpages(ap)
m->dirty = 0;
} else if (size > toff) {
/*
* Read operation filled a partial page, set valid
* bits properly. validclean will zero out
* any cruft in the buffer when setting a valid bit,
* if the size is not DEV_BSIZE aligned.
* Read operation filled a partial page.
*/
m->valid = 0;
vm_page_set_validclean(m, 0, size - toff);
/* handled by vm_fault now */
/* vm_page_zero_invalid(m, TRUE); */
}
if (i != ap->a_reqpage) {
@ -197,13 +218,6 @@ nfs_getpages(ap)
} else {
vnode_pager_freepage(m);
}
} else {
/*
* This page is being mapped, clear out any other
* cruft in the invalid areas of the page.
*/
if (m->valid && m->valid != VM_PAGE_BITS_ALL)
vm_page_zero_invalid(m, FALSE);
}
}
return 0;
@ -228,14 +242,17 @@ nfs_putpages(ap)
vm_offset_t kva;
struct buf *bp;
int iomode, must_commit, i, error, npages, count;
off_t offset;
int *rtvals;
struct vnode *vp;
struct proc *p;
struct ucred *cred;
struct nfsmount *nmp;
struct nfsnode *np;
vm_page_t *pages;
vp = ap->a_vp;
np = VTONFS(vp);
p = curproc; /* XXX */
cred = curproc->p_ucred; /* XXX */
nmp = VFSTONFS(vp->v_mount);
@ -243,6 +260,7 @@ nfs_putpages(ap)
count = ap->a_count;
rtvals = ap->a_rtvals;
npages = btoc(count);
offset = IDX_TO_OFF(pages[0]->pindex);
if ((nmp->nm_flag & NFSMNT_NFSV3) != 0 &&
(nmp->nm_state & NFSSTA_GOTFSINFO) == 0)
@ -252,6 +270,16 @@ nfs_putpages(ap)
rtvals[i] = VM_PAGER_AGAIN;
}
/*
* When putting pages, do not extend file past EOF.
*/
if (offset + count > np->n_size) {
count = np->n_size - offset;
if (count < 0)
count = 0;
}
/*
* We use only the kva address for the buffer, but this is extremely
* convienient and fast.
@ -265,7 +293,7 @@ nfs_putpages(ap)
iov.iov_len = count;
uio.uio_iov = &iov;
uio.uio_iovcnt = 1;
uio.uio_offset = IDX_TO_OFF(pages[0]->pindex);
uio.uio_offset = offset;
uio.uio_resid = count;
uio.uio_segflg = UIO_SYSSPACE;
uio.uio_rw = UIO_WRITE;
@ -297,23 +325,21 @@ nfs_putpages(ap)
* Vnode op for read using bio
*/
int
nfs_bioread(vp, uio, ioflag, cred, getpages)
nfs_bioread(vp, uio, ioflag, cred)
register struct vnode *vp;
register struct uio *uio;
int ioflag;
struct ucred *cred;
int getpages;
{
register struct nfsnode *np = VTONFS(vp);
register int biosize, i;
off_t diff;
struct buf *bp = 0, *rabp;
struct vattr vattr;
struct proc *p;
struct nfsmount *nmp = VFSTONFS(vp->v_mount);
daddr_t lbn, rabn;
int bufsize;
int nra, error = 0, n = 0, on = 0, not_readin;
int bcount;
int nra, error = 0, n = 0, on = 0;
#ifdef DIAGNOSTIC
if (uio->uio_rw != UIO_READ)
@ -424,7 +450,6 @@ nfs_bioread(vp, uio, ioflag, cred, getpages)
nfsstats.biocache_reads++;
lbn = uio->uio_offset / biosize;
on = uio->uio_offset & (biosize - 1);
not_readin = 1;
/*
* Start the read ahead(s), as required.
@ -439,7 +464,6 @@ nfs_bioread(vp, uio, ioflag, cred, getpages)
return (EINTR);
if ((rabp->b_flags & (B_CACHE|B_DELWRI)) == 0) {
rabp->b_flags |= (B_READ | B_ASYNC);
rabp->b_flags &= ~B_DONE;
vfs_busy_pages(rabp, 0);
if (nfs_asyncio(rabp, cred)) {
rabp->b_flags |= B_INVAL|B_ERROR;
@ -453,47 +477,31 @@ nfs_bioread(vp, uio, ioflag, cred, getpages)
}
/*
* If the block is in the cache and has the required data
* in a valid region, just copy it out.
* Otherwise, get the block and write back/read in,
* as required.
* Obtain the buffer cache block. Figure out the buffer size
* when we are at EOF. nfs_getcacheblk() will also force
* uncached delayed-writes to be flushed to the server.
*
* Note that bcount is *not* DEV_BSIZE aligned.
*/
again:
bufsize = biosize;
if ((off_t)(lbn + 1) * biosize > np->n_size &&
(off_t)(lbn + 1) * biosize - np->n_size < biosize) {
bufsize = np->n_size - (off_t)lbn * biosize;
bufsize = (bufsize + DEV_BSIZE - 1) & ~(DEV_BSIZE - 1);
bcount = biosize;
if ((off_t)lbn * biosize >= np->n_size) {
bcount = 0;
} else if ((off_t)(lbn + 1) * biosize > np->n_size) {
bcount = np->n_size - (off_t)lbn * biosize;
}
bp = nfs_getcacheblk(vp, lbn, bufsize, p);
bp = nfs_getcacheblk(vp, lbn, bcount, p);
if (!bp)
return (EINTR);
/*
* If we are being called from nfs_getpages, we must
* make sure the buffer is a vmio buffer. The vp will
* already be setup for vmio but there may be some old
* non-vmio buffers attached to it.
* If B_CACHE is not set, we must issue the read. If this
* fails, we return an error.
*/
if (getpages && !(bp->b_flags & B_VMIO)) {
#ifdef DIAGNOSTIC
printf("nfs_bioread: non vmio buf found, discarding\n");
#endif
bp->b_flags |= B_NOCACHE;
bp->b_flags |= B_INVAFTERWRITE;
if (bp->b_dirtyend > 0) {
if ((bp->b_flags & B_DELWRI) == 0)
panic("nfsbioread");
if (VOP_BWRITE(bp) == EINTR)
return (EINTR);
} else
brelse(bp);
goto again;
}
if ((bp->b_flags & B_CACHE) == 0) {
bp->b_flags |= B_READ;
bp->b_flags &= ~(B_DONE | B_ERROR | B_INVAL);
not_readin = 0;
vfs_busy_pages(bp, 0);
error = nfs_doio(bp, cred, p);
if (error) {
@ -501,32 +509,20 @@ nfs_bioread(vp, uio, ioflag, cred, getpages)
return (error);
}
}
if (bufsize > on) {
n = min((unsigned)(bufsize - on), uio->uio_resid);
} else {
n = 0;
}
diff = np->n_size - uio->uio_offset;
if (diff < n)
n = diff;
if (not_readin && n > 0) {
if (on < bp->b_validoff || (on + n) > bp->b_validend) {
bp->b_flags |= B_NOCACHE;
bp->b_flags |= B_INVAFTERWRITE;
if (bp->b_dirtyend > 0) {
if ((bp->b_flags & B_DELWRI) == 0)
panic("nfsbioread");
if (VOP_BWRITE(bp) == EINTR)
return (EINTR);
} else
brelse(bp);
goto again;
}
}
/*
* on is the offset into the current bp. Figure out how many
* bytes we can copy out of the bp. Note that bcount is
* NOT DEV_BSIZE aligned.
*
* Then figure out how many bytes we can copy into the uio.
*/
n = 0;
if (on < bcount)
n = min((unsigned)(bcount - on), uio->uio_resid);
vp->v_lastr = lbn;
diff = (on >= bp->b_validend) ? 0 : (bp->b_validend - on);
if (diff < n)
n = diff;
break;
case VLNK:
nfsstats.biocache_readlinks++;
@ -535,7 +531,6 @@ nfs_bioread(vp, uio, ioflag, cred, getpages)
return (EINTR);
if ((bp->b_flags & B_CACHE) == 0) {
bp->b_flags |= B_READ;
bp->b_flags &= ~B_DONE;
vfs_busy_pages(bp, 0);
error = nfs_doio(bp, cred, p);
if (error) {
@ -560,13 +555,13 @@ nfs_bioread(vp, uio, ioflag, cred, getpages)
return (EINTR);
if ((bp->b_flags & B_CACHE) == 0) {
bp->b_flags |= B_READ;
bp->b_flags &= ~B_DONE;
vfs_busy_pages(bp, 0);
error = nfs_doio(bp, cred, p);
if (error) {
brelse(bp);
}
while (error == NFSERR_BAD_COOKIE) {
printf("got bad cookie vp %p bp %p\n", vp, bp);
nfs_invaldir(vp);
error = nfs_vinvalbuf(vp, 0, cred, p, 1);
/*
@ -574,6 +569,10 @@ nfs_bioread(vp, uio, ioflag, cred, getpages)
* server. The only way to get the block is by
* reading from the beginning to get all the
* offset cookies.
*
* Leave the last bp intact unless there is an error.
* Loop back up to the while if the error is another
* NFSERR_BAD_COOKIE (double yuch!).
*/
for (i = 0; i <= lbn && !error; i++) {
if (np->n_direofoffset
@ -582,21 +581,32 @@ nfs_bioread(vp, uio, ioflag, cred, getpages)
bp = nfs_getcacheblk(vp, i, NFS_DIRBLKSIZ, p);
if (!bp)
return (EINTR);
if ((bp->b_flags & B_DONE) == 0) {
bp->b_flags |= B_READ;
bp->b_flags &= ~B_DONE;
vfs_busy_pages(bp, 0);
error = nfs_doio(bp, cred, p);
if (error == 0 && (bp->b_flags & B_INVAL))
break;
if (error) {
brelse(bp);
} else if (i < lbn) {
brelse(bp);
}
if ((bp->b_flags & B_CACHE) == 0) {
bp->b_flags |= B_READ;
vfs_busy_pages(bp, 0);
error = nfs_doio(bp, cred, p);
/*
* no error + B_INVAL == directory EOF,
* use the block.
*/
if (error == 0 && (bp->b_flags & B_INVAL))
break;
}
/*
* An error will throw away the block and the
* for loop will break out. If no error and this
* is not the block we want, we throw away the
* block and go for the next one via the for loop.
*/
if (error || i < lbn)
brelse(bp);
}
}
/*
* The above while is repeated if we hit another cookie
* error. If we hit an error and it wasn't a cookie error,
* we give up.
*/
if (error)
return (error);
}
@ -616,7 +626,6 @@ nfs_bioread(vp, uio, ioflag, cred, getpages)
if (rabp) {
if ((rabp->b_flags & (B_CACHE|B_DELWRI)) == 0) {
rabp->b_flags |= (B_READ | B_ASYNC);
rabp->b_flags &= ~B_DONE;
vfs_busy_pages(rabp, 0);
if (nfs_asyncio(rabp, cred)) {
rabp->b_flags |= B_INVAL|B_ERROR;
@ -629,10 +638,20 @@ nfs_bioread(vp, uio, ioflag, cred, getpages)
}
}
/*
* Make sure we use a signed variant of min() since
* the second term may be negative.
* Unlike VREG files, whos buffer size ( bp->b_bcount ) is
* chopped for the EOF condition, we cannot tell how large
* NFS directories are going to be until we hit EOF. So
* an NFS directory buffer is *not* chopped to its EOF. Now,
* it just so happens that b_resid will effectively chop it
* to EOF. *BUT* this information is lost if the buffer goes
* away and is reconstituted into a B_CACHE state ( due to
* being VMIO ) later. So we keep track of the directory eof
* in np->n_direofoffset and chop it off as an extra step
* right here.
*/
n = lmin(uio->uio_resid, NFS_DIRBLKSIZ - bp->b_resid - on);
if (np->n_direofoffset && n > np->n_direofoffset - uio->uio_offset)
n = np->n_direofoffset - uio->uio_offset;
break;
default:
printf(" nfs_bioread: type %x unexpected\n",vp->v_type);
@ -649,6 +668,10 @@ nfs_bioread(vp, uio, ioflag, cred, getpages)
n = 0;
break;
case VDIR:
/*
* Invalidate buffer if caching is disabled, forcing a
* re-read from the remote later.
*/
if (np->n_flag & NQNFSNONCACHE)
bp->b_flags |= B_INVAL;
break;
@ -660,24 +683,6 @@ nfs_bioread(vp, uio, ioflag, cred, getpages)
return (error);
}
static void
nfs_prot_buf(bp, off, n)
struct buf *bp;
int off;
int n;
{
int pindex, boff, end;
if ((bp->b_flags & B_VMIO) == 0)
return;
end = round_page(off + n);
for (boff = trunc_page(off); boff < end; boff += PAGE_SIZE) {
pindex = boff >> PAGE_SHIFT;
vm_page_protect(bp->b_pages[pindex], VM_PROT_NONE);
}
}
/*
* Vnode op for write using bio
*/
@ -690,18 +695,18 @@ nfs_write(ap)
struct ucred *a_cred;
} */ *ap;
{
register int biosize;
register struct uio *uio = ap->a_uio;
int biosize;
struct uio *uio = ap->a_uio;
struct proc *p = uio->uio_procp;
register struct vnode *vp = ap->a_vp;
struct vnode *vp = ap->a_vp;
struct nfsnode *np = VTONFS(vp);
register struct ucred *cred = ap->a_cred;
struct ucred *cred = ap->a_cred;
int ioflag = ap->a_ioflag;
struct buf *bp;
struct vattr vattr;
struct nfsmount *nmp = VFSTONFS(vp->v_mount);
daddr_t lbn;
int bufsize;
int bcount;
int n, on, error = 0, iomode, must_commit;
#ifdef DIAGNOSTIC
@ -749,12 +754,9 @@ nfs_write(ap)
psignal(p, SIGXFSZ);
return (EFBIG);
}
/*
* I use nm_rsize, not nm_wsize so that all buffer cache blocks
* will be the same size within a filesystem. nfs_writerpc will
* still use nm_wsize when sizing the rpc's.
*/
biosize = vp->v_mount->mnt_stat.f_iosize;
do {
/*
* Check for a valid write lease.
@ -786,17 +788,74 @@ nfs_write(ap)
on = uio->uio_offset & (biosize-1);
n = min((unsigned)(biosize - on), uio->uio_resid);
again:
if (uio->uio_offset + n > np->n_size) {
/*
* Handle direct append and file extension cases, calculate
* unaligned buffer size.
*/
if (uio->uio_offset == np->n_size && n) {
/*
* special append case. Obtain buffer prior to
* resizing it to maintain B_CACHE.
*/
long save;
bcount = on;
bp = nfs_getcacheblk(vp, lbn, bcount, p);
save = bp->b_flags & B_CACHE;
np->n_size = uio->uio_offset + n;
np->n_flag |= NMODIFIED;
vnode_pager_setsize(vp, np->n_size);
bcount += n;
allocbuf(bp, bcount);
bp->b_flags |= save;
} else {
if (uio->uio_offset + n > np->n_size) {
np->n_size = uio->uio_offset + n;
np->n_flag |= NMODIFIED;
vnode_pager_setsize(vp, np->n_size);
}
bcount = biosize;
if ((off_t)(lbn + 1) * biosize > np->n_size)
bcount = np->n_size - (off_t)lbn * biosize;
bp = nfs_getcacheblk(vp, lbn, bcount, p);
}
bufsize = biosize;
if ((off_t)(lbn + 1) * biosize > np->n_size) {
bufsize = np->n_size - (off_t)lbn * biosize;
bufsize = (bufsize + DEV_BSIZE - 1) & ~(DEV_BSIZE - 1);
/*
* Issue a READ if B_CACHE is not set. In special-append
* mode, B_CACHE is based on the buffer prior to the write
* op and is typically set, avoiding the read. If a read
* is required in special append mode, the server will
* probably send us a short-read since we extended the file
* on our end, resulting in b_resid == 0 and, thusly,
* B_CACHE getting set.
*
* We can also avoid issuing the read if the write covers
* the entire buffer. We have to make sure the buffer state
* is reasonable in this case since we will not be initiating
* I/O. See the comments in kern/vfs_bio.c's getblk() for
* more information.
*
* B_CACHE may also be set due to the buffer being cached
* normally.
*/
if (on == 0 && n == bcount) {
bp->b_flags |= B_CACHE;
bp->b_flags &= ~(B_ERROR | B_INVAL);
}
if ((bp->b_flags & B_CACHE) == 0) {
bp->b_flags |= B_READ;
vfs_busy_pages(bp, 0);
error = nfs_doio(bp, cred, p);
if (error) {
brelse(bp);
return (error);
}
}
bp = nfs_getcacheblk(vp, lbn, bufsize, p);
if (!bp)
return (EINTR);
if (bp->b_wcred == NOCRED) {
@ -820,6 +879,17 @@ nfs_write(ap)
* If the new write will leave a contiguous dirty
* area, just update the b_dirtyoff and b_dirtyend,
* otherwise force a write rpc of the old dirty area.
*
* While it is possible to merge discontiguous writes due to
* our having a B_CACHE buffer ( and thus valid read data
* for the hole), we don't because it could lead to
* significant cache coherency problems with multiple clients,
* especially if locking is implemented later on.
*
* as an optimization we could theoretically maintain
* a linked list of discontinuous areas, but we would still
* have to commit them separately so there isn't much
* advantage to it except perhaps a bit of asynchronization.
*/
if (bp->b_dirtyend > 0 &&
@ -862,11 +932,6 @@ nfs_write(ap)
return (error);
}
/*
* This will keep the buffer and mmaped regions more coherent.
*/
nfs_prot_buf(bp, on, n);
/*
* Only update dirtyoff/dirtyend if not a degenerate
* condition.
@ -879,21 +944,7 @@ nfs_write(ap)
bp->b_dirtyoff = on;
bp->b_dirtyend = on + n;
}
}
/*
* To avoid code complexity, we may have to throw away
* previously valid ranges when merging the new dirty range
* into the valid range. As long as we do not *ADD* an
* invalid valid range, we are ok.
*/
if (bp->b_validend == 0 || bp->b_validend < bp->b_dirtyoff ||
bp->b_validoff > bp->b_dirtyend) {
bp->b_validoff = bp->b_dirtyoff;
bp->b_validend = bp->b_dirtyend;
} else {
bp->b_validoff = min(bp->b_validoff, bp->b_dirtyoff);
bp->b_validend = max(bp->b_validend, bp->b_dirtyend);
vfs_bio_set_validclean(bp, on, n);
}
/*
@ -904,11 +955,14 @@ nfs_write(ap)
/*
* If the lease is non-cachable or IO_SYNC do bwrite().
*
* IO_INVAL appears to be unused. The idea appears to be
* to turn off caching in this case. Very odd. XXX
*/
if ((np->n_flag & NQNFSNONCACHE) || (ioflag & IO_SYNC)) {
bp->b_proc = p;
if (ioflag & IO_INVAL)
bp->b_flags |= B_INVAL;
bp->b_flags |= B_NOCACHE;
error = VOP_BWRITE(bp);
if (error)
return (error);
@ -922,8 +976,9 @@ nfs_write(ap)
bp->b_proc = (struct proc *)0;
bp->b_flags |= B_ASYNC;
(void)nfs_writebp(bp, 0);
} else
} else {
bdwrite(bp);
}
} while (uio->uio_resid > 0 && n > 0);
return (0);
}
@ -956,15 +1011,16 @@ nfs_getcacheblk(vp, bn, size, p)
return ((struct buf *)0);
bp = getblk(vp, bn, size, 0, 2 * hz);
}
} else
} else {
bp = getblk(vp, bn, size, 0, 0);
}
if (vp->v_type == VREG) {
int biosize;
biosize = mp->mnt_stat.f_iosize;
bp->b_blkno = bn * (biosize / DEV_BSIZE);
}
return (bp);
}
@ -1036,6 +1092,9 @@ nfs_vinvalbuf(vp, flags, cred, p, intrflg)
* Initiate asynchronous I/O. Return an error if no nfsiods are available.
* This is mainly to avoid queueing async I/O requests when the nfsiods
* are all hung on a dead server.
*
* Note: nfs_asyncio() does not clear (B_ERROR|B_INVAL) but when the bp
* is eventually dequeued by the async daemon, nfs_doio() *will*.
*/
int
nfs_asyncio(bp, cred)
@ -1164,7 +1223,7 @@ nfs_doio(bp, cr, p)
struct vnode *vp;
struct nfsnode *np;
struct nfsmount *nmp;
int error = 0, diff, len, iomode, must_commit = 0;
int error = 0, iomode, must_commit = 0;
struct uio uio;
struct iovec io;
@ -1177,6 +1236,13 @@ nfs_doio(bp, cr, p)
uiop->uio_segflg = UIO_SYSSPACE;
uiop->uio_procp = p;
/*
* clear B_ERROR and B_INVAL state prior to initiating the I/O. We
* do this here so we do not have to do it in all the code that
* calls us.
*/
bp->b_flags &= ~(B_ERROR | B_INVAL);
KASSERT(!(bp->b_flags & B_DONE), ("nfs_doio: bp %p already marked done", bp));
/*
@ -1216,25 +1282,22 @@ nfs_doio(bp, cr, p)
nfsstats.read_bios++;
error = nfs_readrpc(vp, uiop, cr);
if (!error) {
bp->b_validoff = 0;
if (uiop->uio_resid) {
/*
* If len > 0, there is a hole in the file and
* no writes after the hole have been pushed to
* the server yet.
* Just zero fill the rest of the valid area.
* If we had a short read with no error, we must have
* hit a file hole. We should zero-fill the remainder.
* This can also occur if the server hits the file EOF.
*
* Holes used to be able to occur due to pending
* writes, but that is not possible any longer.
*/
diff = bp->b_bcount - uiop->uio_resid;
len = np->n_size - (((u_quad_t)bp->b_blkno) * DEV_BSIZE
+ diff);
if (len > 0) {
len = min(len, uiop->uio_resid);
bzero((char *)bp->b_data + diff, len);
bp->b_validend = diff + len;
} else
bp->b_validend = diff;
} else
bp->b_validend = bp->b_bcount;
int nread = bp->b_bcount - uiop->uio_resid;
int left = bp->b_bcount - nread;
if (left > 0)
bzero((char *)bp->b_data + nread, left);
uiop->uio_resid = 0;
}
}
if (p && (vp->v_flag & VTEXT) &&
(((nmp->nm_flag & NFSMNT_NQNFS) &&
@ -1262,6 +1325,10 @@ nfs_doio(bp, cr, p)
}
if ((nmp->nm_flag & NFSMNT_RDIRPLUS) == 0)
error = nfs_readdirrpc(vp, uiop, cr);
/*
* end-of-directory sets B_INVAL but does not generate an
* error.
*/
if (error == 0 && uiop->uio_resid == bp->b_bcount)
bp->b_flags |= B_INVAL;
break;
@ -1296,7 +1363,7 @@ nfs_doio(bp, cr, p)
if (!error && iomode == NFSV3WRITE_UNSTABLE) {
bp->b_flags |= B_NEEDCOMMIT;
if (bp->b_dirtyoff == 0
&& bp->b_dirtyend == bp->b_bufsize)
&& bp->b_dirtyend == bp->b_bcount)
bp->b_flags |= B_CLUSTEROK;
} else {
bp->b_flags &= ~B_NEEDCOMMIT;

@ -34,7 +34,7 @@
* SUCH DAMAGE.
*
* @(#)nfs_nqlease.c 8.9 (Berkeley) 5/20/95
* $Id: nfs_nqlease.c,v 1.39 1998/10/31 15:31:25 peter Exp $
* $Id: nfs_nqlease.c,v 1.40 1999/02/25 00:03:51 peter Exp $
*/
@ -561,6 +561,10 @@ nqsrv_send_eviction(vp, lp, slp, nam, cred)
*mtod(m, u_int32_t *) = htonl(0x80000000 |
(m->m_pkthdr.len - NFSX_UNSIGNED));
}
/*
* nfs_sndlock if PR_CONNREQUIRED XXX
*/
if (((lph->lph_flag & (LC_UDP | LC_CLTP)) == 0 &&
(lph->lph_slp->ns_flag & SLP_VALID) == 0) ||
(nfs_slplock(lph->lph_slp, 0) == 0))

@ -34,7 +34,7 @@
* SUCH DAMAGE.
*
* @(#)nfs_socket.c 8.5 (Berkeley) 3/30/95
* $Id: nfs_socket.c,v 1.50 1999/02/25 00:03:51 peter Exp $
* $Id: nfs_socket.c,v 1.51 1999/04/24 11:29:48 dt Exp $
*/
/*
@ -54,6 +54,7 @@
#include <sys/socketvar.h>
#include <sys/syslog.h>
#include <sys/tprintf.h>
#include <sys/sysctl.h>
#include <netinet/in.h>
#include <netinet/tcp.h>
@ -115,6 +116,15 @@ static int proct[NFS_NPROCS] = {
0, 0, 0,
};
static int nfs_realign_test;
static int nfs_realign_count;
SYSCTL_DECL(_vfs_nfs);
SYSCTL_INT(_vfs_nfs, OID_AUTO, realign_test, CTLFLAG_RD, &nfs_realign_test, 0, "");
SYSCTL_INT(_vfs_nfs, OID_AUTO, realign_count, CTLFLAG_RD, &nfs_realign_count, 0, "");
/*
* There is a congestion window for outstanding rpcs maintained per mount
* point. The cwnd size is adjusted in roughly the way that:
@ -138,7 +148,7 @@ struct callout_handle nfs_timer_handle;
static int nfs_msg __P((struct proc *,char *,char *));
static int nfs_rcvlock __P((struct nfsreq *));
static void nfs_rcvunlock __P((struct nfsreq *));
static void nfs_realign __P((struct mbuf *m, int hsiz));
static void nfs_realign __P((struct mbuf **pm, int hsiz));
static int nfs_receive __P((struct nfsreq *rep, struct sockaddr **aname,
struct mbuf **mp));
static int nfs_reconnect __P((struct nfsreq *rep));
@ -702,7 +712,7 @@ nfs_receive(rep, aname, mp)
* These could cause pointer alignment problems, so copy them to
* well aligned mbufs.
*/
nfs_realign(*mp, 5 * NFSX_UNSIGNED);
nfs_realign(mp, 5 * NFSX_UNSIGNED);
return (error);
}
@ -1589,92 +1599,56 @@ nfs_rcvunlock(rep)
}
/*
* Check for badly aligned mbuf data areas and
* realign data in an mbuf list by copying the data areas up, as required.
* nfs_realign:
*
* Check for badly aligned mbuf data and realign by copying the unaligned
* portion of the data into a new mbuf chain and freeing the portions
* of the old chain that were replaced.
*
* We cannot simply realign the data within the existing mbuf chain
* because the underlying buffers may contain other rpc commands and
* we cannot afford to overwrite them.
*
* We would prefer to avoid this situation entirely. The situation does
* not occur with NFS/UDP and is supposed to only occassionally occur
* with TCP. Use vfs.nfs.realign_count and realign_test to check this.
*/
static void
nfs_realign(m, hsiz)
register struct mbuf *m;
nfs_realign(pm, hsiz)
register struct mbuf **pm;
int hsiz;
{
register struct mbuf *m2;
register int siz, mlen, olen;
register caddr_t tcp, fcp;
struct mbuf *mnew;
struct mbuf *m;
struct mbuf *n = NULL;
int off = 0;
while (m) {
/*
* This never happens for UDP, rarely happens for TCP
* but frequently happens for iso transport.
*/
if ((m->m_len & 0x3) || (mtod(m, intptr_t) & 0x3)) {
olen = m->m_len;
fcp = mtod(m, caddr_t);
if ((intptr_t)fcp & 0x3) {
m->m_flags &= ~M_PKTHDR;
if (m->m_flags & M_EXT)
m->m_data = m->m_ext.ext_buf +
((m->m_ext.ext_size - olen) & ~0x3);
else
m->m_data = m->m_dat;
++nfs_realign_test;
while ((m = *pm) != NULL) {
if ((m->m_len & 0x3) || (mtod(m, intptr_t) & 0x3)) {
MGET(n, M_WAIT, MT_DATA);
if (m->m_len >= MINCLSIZE) {
MCLGET(n, M_WAIT);
}
n->m_len = 0;
break;
}
m->m_len = 0;
tcp = mtod(m, caddr_t);
mnew = m;
m2 = m->m_next;
pm = &m->m_next;
}
/*
* If possible, only put the first invariant part
* of the RPC header in the first mbuf.
*/
mlen = M_TRAILINGSPACE(m);
if (olen <= hsiz && mlen > hsiz)
mlen = hsiz;
/*
* Loop through the mbuf list consolidating data.
*/
/*
* If n is non-NULL, loop on m copying data, then replace the
* portion of the chain that had to be realigned.
*/
if (n != NULL) {
++nfs_realign_count;
while (m) {
while (olen > 0) {
if (mlen == 0) {
m2->m_flags &= ~M_PKTHDR;
if (m2->m_flags & M_EXT)
m2->m_data = m2->m_ext.ext_buf;
else
m2->m_data = m2->m_dat;
m2->m_len = 0;
mlen = M_TRAILINGSPACE(m2);
tcp = mtod(m2, caddr_t);
mnew = m2;
m2 = m2->m_next;
}
siz = min(mlen, olen);
if (tcp != fcp)
bcopy(fcp, tcp, siz);
mnew->m_len += siz;
mlen -= siz;
olen -= siz;
tcp += siz;
fcp += siz;
}
m_copyback(n, off, m->m_len, mtod(m, caddr_t));
off += m->m_len;
m = m->m_next;
if (m) {
olen = m->m_len;
fcp = mtod(m, caddr_t);
}
}
/*
* Finally, set m_len == 0 for any trailing mbufs that have
* been copied out of.
*/
while (m2) {
m2->m_len = 0;
m2 = m2->m_next;
}
return;
}
m = m->m_next;
m_freem(*pm);
*pm = n;
}
}
@ -2040,7 +2014,7 @@ nfsrv_rcv(so, arg, waitflag)
m_freem(mp);
continue;
}
nfs_realign(mp, 10 * NFSX_UNSIGNED);
nfs_realign(&mp, 10 * NFSX_UNSIGNED);
rec->nr_address = nam;
rec->nr_packet = mp;
STAILQ_INSERT_TAIL(&slp->ns_rec, rec, nr_link);
@ -2182,7 +2156,7 @@ nfsrv_getstream(slp, waitflag)
if (!rec) {
m_freem(slp->ns_frag);
} else {
nfs_realign(slp->ns_frag, 10 * NFSX_UNSIGNED);
nfs_realign(&slp->ns_frag, 10 * NFSX_UNSIGNED);
rec->nr_address = (struct sockaddr *)0;
rec->nr_packet = slp->ns_frag;
STAILQ_INSERT_TAIL(&slp->ns_rec, rec, nr_link);

@ -34,7 +34,7 @@
* SUCH DAMAGE.
*
* @(#)nfs_vnops.c 8.16 (Berkeley) 5/27/95
* $Id: nfs_vnops.c,v 1.123 1999/02/16 10:49:54 dfr Exp $
* $Id: nfs_vnops.c,v 1.124 1999/03/12 02:24:58 julian Exp $
*/
@ -408,9 +408,9 @@ nfs_access(ap)
error = nfs_readrpc(vp, &auio, ap->a_cred);
else if (vp->v_type == VDIR) {
char* bp;
bp = malloc(NFS_DIRBLKSIZ, M_TEMP, M_WAITOK);
bp = malloc(DIRBLKSIZ, M_TEMP, M_WAITOK);
aiov.iov_base = bp;
aiov.iov_len = auio.uio_resid = NFS_DIRBLKSIZ;
aiov.iov_len = auio.uio_resid = DIRBLKSIZ;
error = nfs_readdirrpc(vp, &auio, ap->a_cred);
free(bp, M_TEMP);
} else if (vp->v_type == VLNK)
@ -962,7 +962,7 @@ nfs_read(ap)
if (vp->v_type != VREG)
return (EPERM);
return (nfs_bioread(vp, ap->a_uio, ap->a_ioflag, ap->a_cred, 0));
return (nfs_bioread(vp, ap->a_uio, ap->a_ioflag, ap->a_cred));
}
/*
@ -980,7 +980,7 @@ nfs_readlink(ap)
if (vp->v_type != VLNK)
return (EINVAL);
return (nfs_bioread(vp, ap->a_uio, 0, ap->a_cred, 0));
return (nfs_bioread(vp, ap->a_uio, 0, ap->a_cred));
}
/*
@ -1985,7 +1985,7 @@ nfs_readdir(ap)
* Call nfs_bioread() to do the real work.
*/
tresid = uio->uio_resid;
error = nfs_bioread(vp, uio, 0, ap->a_cred, 0);
error = nfs_bioread(vp, uio, 0, ap->a_cred);
if (!error && uio->uio_resid == tresid)
nfsstats.direofcache_misses++;
@ -2004,7 +2004,7 @@ nfs_readdirrpc(vp, uiop, cred)
{
register int len, left;
register struct dirent *dp;
register struct dirent *dp = NULL;
register u_int32_t *tl;
register caddr_t cp;
register int32_t t1, t2;
@ -2019,12 +2019,9 @@ nfs_readdirrpc(vp, uiop, cred)
int attrflag;
int v3 = NFS_ISV3(vp);
#ifndef nolint
dp = (struct dirent *)0;
#endif
#ifndef DIAGNOSTIC
if (uiop->uio_iovcnt != 1 || (uiop->uio_offset & (NFS_DIRBLKSIZ - 1)) ||
(uiop->uio_resid & (NFS_DIRBLKSIZ - 1)))
if (uiop->uio_iovcnt != 1 || (uiop->uio_offset & (DIRBLKSIZ - 1)) ||
(uiop->uio_resid & (DIRBLKSIZ - 1)))
panic("nfs readdirrpc bad uio");
#endif
@ -2381,7 +2378,7 @@ nfs_readdirplusrpc(vp, uiop, cred)
m_freem(mrep);
}
/*
* Fill last record, iff any, out to a multiple of NFS_DIRBLKSIZ
* Fill last record, iff any, out to a multiple of DIRBLKSIZ
* by increasing d_reclen for the last record.
*/
if (blksiz > 0) {
@ -3028,13 +3025,13 @@ nfs_bwrite(ap)
struct vnode *a_bp;
} */ *ap;
{
return (nfs_writebp(ap->a_bp, 1));
}
/*
* This is a clone of vn_bwrite(), except that B_WRITEINPROG isn't set unless
* the force flag is one and it also handles the B_NEEDCOMMIT flag.
* the force flag is one and it also handles the B_NEEDCOMMIT flag. We set
* B_CACHE if this is a VMIO buffer.
*/
int
nfs_writebp(bp, force)
@ -3049,12 +3046,15 @@ nfs_writebp(bp, force)
if(!(bp->b_flags & B_BUSY))
panic("bwrite: buffer is not busy???");
if (bp->b_flags & B_INVAL)
bp->b_flags |= B_NOCACHE;
if (bp->b_flags & B_INVAL) {
brelse(bp);
return(0);
}
bp->b_flags |= B_CACHE;
/*
* XXX we bundirty() the bp here. Shouldn't we do it later after
* the I/O has completed??
* Undirty the bp. We will redirty it later if the I/O fails.
*/
s = splbio();

View File

@ -34,7 +34,7 @@
* SUCH DAMAGE.
*
* @(#)nfs.h 8.4 (Berkeley) 5/1/95
* $Id: nfs.h,v 1.44 1998/09/07 05:42:15 bde Exp $
* $Id: nfs.h,v 1.45 1999/02/25 00:03:50 peter Exp $
*/
#ifndef _NFS_NFS_H_
@ -651,8 +651,7 @@ void nfs_disconnect __P((struct nfsmount *));
void nfs_safedisconnect __P((struct nfsmount *));
int nfs_getattrcache __P((struct vnode *, struct vattr *));
int nfsm_strtmbuf __P((struct mbuf **, char **, const char *, long));
int nfs_bioread __P((struct vnode *, struct uio *, int, struct ucred *,
int));
int nfs_bioread __P((struct vnode *, struct uio *, int, struct ucred *));
int nfsm_uiotombuf __P((struct uio *, struct mbuf **, int, caddr_t *));
void nfsrv_init __P((int));
void nfs_clearcommit __P((struct mount *));

View File

@ -34,7 +34,7 @@
* SUCH DAMAGE.
*
* @(#)nfs_bio.c 8.9 (Berkeley) 3/30/95
* $Id: nfs_bio.c,v 1.68 1999/04/05 19:38:28 julian Exp $
* $Id: nfs_bio.c,v 1.69 1999/04/06 03:07:54 peter Exp $
*/
@ -65,7 +65,6 @@
static struct buf *nfs_getcacheblk __P((struct vnode *vp, daddr_t bn, int size,
struct proc *p));
static void nfs_prot_buf __P((struct buf *bp, int off, int n));
extern int nfs_numasync;
extern int nfs_pbuf_freecnt;
@ -84,7 +83,7 @@ nfs_getpages(ap)
vm_ooffset_t a_offset;
} */ *ap;
{
int i, error, nextoff, size, toff, npages, count;
int i, error, nextoff, size, toff, count, npages;
struct uio uio;
struct iovec iov;
vm_offset_t kva;
@ -110,13 +109,35 @@ nfs_getpages(ap)
if ((nmp->nm_flag & NFSMNT_NFSV3) != 0 &&
(nmp->nm_state & NFSSTA_GOTFSINFO) == 0)
(void)nfs_fsinfo(nmp, vp, cred, p);
npages = btoc(count);
/*
* If the requested page is partially valid, just return it and
* allow the pager to zero-out the blanks. Partially valid pages
* can only occur at the file EOF.
*/
{
vm_page_t m = pages[ap->a_reqpage];
if (m->valid != 0) {
/* handled by vm_fault now */
/* vm_page_zero_invalid(m, TRUE); */
for (i = 0; i < npages; ++i) {
if (i != ap->a_reqpage)
vnode_pager_freepage(pages[i]);
}
return(0);
}
}
/*
* We use only the kva address for the buffer, but this is extremely
* convienient and fast.
*/
bp = getpbuf(&nfs_pbuf_freecnt);
npages = btoc(count);
kva = (vm_offset_t) bp->b_data;
pmap_qenter(kva, pages, npages);
@ -167,12 +188,12 @@ nfs_getpages(ap)
m->dirty = 0;
} else if (size > toff) {
/*
* Read operation filled a partial page, set valid
* bits properly. validclean will zero out
* any cruft in the buffer when setting a valid bit,
* if the size is not DEV_BSIZE aligned.
* Read operation filled a partial page.
*/
m->valid = 0;
vm_page_set_validclean(m, 0, size - toff);
/* handled by vm_fault now */
/* vm_page_zero_invalid(m, TRUE); */
}
if (i != ap->a_reqpage) {
@ -197,13 +218,6 @@ nfs_getpages(ap)
} else {
vnode_pager_freepage(m);
}
} else {
/*
* This page is being mapped, clear out any other
* cruft in the invalid areas of the page.
*/
if (m->valid && m->valid != VM_PAGE_BITS_ALL)
vm_page_zero_invalid(m, FALSE);
}
}
return 0;
@ -228,14 +242,17 @@ nfs_putpages(ap)
vm_offset_t kva;
struct buf *bp;
int iomode, must_commit, i, error, npages, count;
off_t offset;
int *rtvals;
struct vnode *vp;
struct proc *p;
struct ucred *cred;
struct nfsmount *nmp;
struct nfsnode *np;
vm_page_t *pages;
vp = ap->a_vp;
np = VTONFS(vp);
p = curproc; /* XXX */
cred = curproc->p_ucred; /* XXX */
nmp = VFSTONFS(vp->v_mount);
@ -243,6 +260,7 @@ nfs_putpages(ap)
count = ap->a_count;
rtvals = ap->a_rtvals;
npages = btoc(count);
offset = IDX_TO_OFF(pages[0]->pindex);
if ((nmp->nm_flag & NFSMNT_NFSV3) != 0 &&
(nmp->nm_state & NFSSTA_GOTFSINFO) == 0)
@ -252,6 +270,16 @@ nfs_putpages(ap)
rtvals[i] = VM_PAGER_AGAIN;
}
/*
* When putting pages, do not extend file past EOF.
*/
if (offset + count > np->n_size) {
count = np->n_size - offset;
if (count < 0)
count = 0;
}
/*
* We use only the kva address for the buffer, but this is extremely
* convienient and fast.
@ -265,7 +293,7 @@ nfs_putpages(ap)
iov.iov_len = count;
uio.uio_iov = &iov;
uio.uio_iovcnt = 1;
uio.uio_offset = IDX_TO_OFF(pages[0]->pindex);
uio.uio_offset = offset;
uio.uio_resid = count;
uio.uio_segflg = UIO_SYSSPACE;
uio.uio_rw = UIO_WRITE;
@ -297,23 +325,21 @@ nfs_putpages(ap)
* Vnode op for read using bio
*/
int
nfs_bioread(vp, uio, ioflag, cred, getpages)
nfs_bioread(vp, uio, ioflag, cred)
register struct vnode *vp;
register struct uio *uio;
int ioflag;
struct ucred *cred;
int getpages;
{
register struct nfsnode *np = VTONFS(vp);
register int biosize, i;
off_t diff;
struct buf *bp = 0, *rabp;
struct vattr vattr;
struct proc *p;
struct nfsmount *nmp = VFSTONFS(vp->v_mount);
daddr_t lbn, rabn;
int bufsize;
int nra, error = 0, n = 0, on = 0, not_readin;
int bcount;
int nra, error = 0, n = 0, on = 0;
#ifdef DIAGNOSTIC
if (uio->uio_rw != UIO_READ)
@ -424,7 +450,6 @@ nfs_bioread(vp, uio, ioflag, cred, getpages)
nfsstats.biocache_reads++;
lbn = uio->uio_offset / biosize;
on = uio->uio_offset & (biosize - 1);
not_readin = 1;
/*
* Start the read ahead(s), as required.
@ -439,7 +464,6 @@ nfs_bioread(vp, uio, ioflag, cred, getpages)
return (EINTR);
if ((rabp->b_flags & (B_CACHE|B_DELWRI)) == 0) {
rabp->b_flags |= (B_READ | B_ASYNC);
rabp->b_flags &= ~B_DONE;
vfs_busy_pages(rabp, 0);
if (nfs_asyncio(rabp, cred)) {
rabp->b_flags |= B_INVAL|B_ERROR;
@ -453,47 +477,31 @@ nfs_bioread(vp, uio, ioflag, cred, getpages)
}
/*
* If the block is in the cache and has the required data
* in a valid region, just copy it out.
* Otherwise, get the block and write back/read in,
* as required.
* Obtain the buffer cache block. Figure out the buffer size
* when we are at EOF. nfs_getcacheblk() will also force
* uncached delayed-writes to be flushed to the server.
*
* Note that bcount is *not* DEV_BSIZE aligned.
*/
again:
bufsize = biosize;
if ((off_t)(lbn + 1) * biosize > np->n_size &&
(off_t)(lbn + 1) * biosize - np->n_size < biosize) {
bufsize = np->n_size - (off_t)lbn * biosize;
bufsize = (bufsize + DEV_BSIZE - 1) & ~(DEV_BSIZE - 1);
bcount = biosize;
if ((off_t)lbn * biosize >= np->n_size) {
bcount = 0;
} else if ((off_t)(lbn + 1) * biosize > np->n_size) {
bcount = np->n_size - (off_t)lbn * biosize;
}
bp = nfs_getcacheblk(vp, lbn, bufsize, p);
bp = nfs_getcacheblk(vp, lbn, bcount, p);
if (!bp)
return (EINTR);
/*
* If we are being called from nfs_getpages, we must
* make sure the buffer is a vmio buffer. The vp will
* already be setup for vmio but there may be some old
* non-vmio buffers attached to it.
* If B_CACHE is not set, we must issue the read. If this
* fails, we return an error.
*/
if (getpages && !(bp->b_flags & B_VMIO)) {
#ifdef DIAGNOSTIC
printf("nfs_bioread: non vmio buf found, discarding\n");
#endif
bp->b_flags |= B_NOCACHE;
bp->b_flags |= B_INVAFTERWRITE;
if (bp->b_dirtyend > 0) {
if ((bp->b_flags & B_DELWRI) == 0)
panic("nfsbioread");
if (VOP_BWRITE(bp) == EINTR)
return (EINTR);
} else
brelse(bp);
goto again;
}
if ((bp->b_flags & B_CACHE) == 0) {
bp->b_flags |= B_READ;
bp->b_flags &= ~(B_DONE | B_ERROR | B_INVAL);
not_readin = 0;
vfs_busy_pages(bp, 0);
error = nfs_doio(bp, cred, p);
if (error) {
@ -501,32 +509,20 @@ nfs_bioread(vp, uio, ioflag, cred, getpages)
return (error);
}
}
if (bufsize > on) {
n = min((unsigned)(bufsize - on), uio->uio_resid);
} else {
n = 0;
}
diff = np->n_size - uio->uio_offset;
if (diff < n)
n = diff;
if (not_readin && n > 0) {
if (on < bp->b_validoff || (on + n) > bp->b_validend) {
bp->b_flags |= B_NOCACHE;
bp->b_flags |= B_INVAFTERWRITE;
if (bp->b_dirtyend > 0) {
if ((bp->b_flags & B_DELWRI) == 0)
panic("nfsbioread");
if (VOP_BWRITE(bp) == EINTR)
return (EINTR);
} else
brelse(bp);
goto again;
}
}
/*
* on is the offset into the current bp. Figure out how many
* bytes we can copy out of the bp. Note that bcount is
* NOT DEV_BSIZE aligned.
*
* Then figure out how many bytes we can copy into the uio.
*/
n = 0;
if (on < bcount)
n = min((unsigned)(bcount - on), uio->uio_resid);
vp->v_lastr = lbn;
diff = (on >= bp->b_validend) ? 0 : (bp->b_validend - on);
if (diff < n)
n = diff;
break;
case VLNK:
nfsstats.biocache_readlinks++;
@ -535,7 +531,6 @@ nfs_bioread(vp, uio, ioflag, cred, getpages)
return (EINTR);
if ((bp->b_flags & B_CACHE) == 0) {
bp->b_flags |= B_READ;
bp->b_flags &= ~B_DONE;
vfs_busy_pages(bp, 0);
error = nfs_doio(bp, cred, p);
if (error) {
@ -560,13 +555,13 @@ nfs_bioread(vp, uio, ioflag, cred, getpages)
return (EINTR);
if ((bp->b_flags & B_CACHE) == 0) {
bp->b_flags |= B_READ;
bp->b_flags &= ~B_DONE;
vfs_busy_pages(bp, 0);
error = nfs_doio(bp, cred, p);
if (error) {
brelse(bp);
}
while (error == NFSERR_BAD_COOKIE) {
printf("got bad cookie vp %p bp %p\n", vp, bp);
nfs_invaldir(vp);
error = nfs_vinvalbuf(vp, 0, cred, p, 1);
/*
@ -574,6 +569,10 @@ nfs_bioread(vp, uio, ioflag, cred, getpages)
* server. The only way to get the block is by
* reading from the beginning to get all the
* offset cookies.
*
* Leave the last bp intact unless there is an error.
* Loop back up to the while if the error is another
* NFSERR_BAD_COOKIE (double yuch!).
*/
for (i = 0; i <= lbn && !error; i++) {
if (np->n_direofoffset
@ -582,21 +581,32 @@ nfs_bioread(vp, uio, ioflag, cred, getpages)
bp = nfs_getcacheblk(vp, i, NFS_DIRBLKSIZ, p);
if (!bp)
return (EINTR);
if ((bp->b_flags & B_DONE) == 0) {
bp->b_flags |= B_READ;
bp->b_flags &= ~B_DONE;
vfs_busy_pages(bp, 0);
error = nfs_doio(bp, cred, p);
if (error == 0 && (bp->b_flags & B_INVAL))
break;
if (error) {
brelse(bp);
} else if (i < lbn) {
brelse(bp);
}
if ((bp->b_flags & B_CACHE) == 0) {
bp->b_flags |= B_READ;
vfs_busy_pages(bp, 0);
error = nfs_doio(bp, cred, p);
/*
* no error + B_INVAL == directory EOF,
* use the block.
*/
if (error == 0 && (bp->b_flags & B_INVAL))
break;
}
/*
* An error will throw away the block and the
* for loop will break out. If no error and this
* is not the block we want, we throw away the
* block and go for the next one via the for loop.
*/
if (error || i < lbn)
brelse(bp);
}
}
/*
* The above while is repeated if we hit another cookie
* error. If we hit an error and it wasn't a cookie error,
* we give up.
*/
if (error)
return (error);
}
@ -616,7 +626,6 @@ nfs_bioread(vp, uio, ioflag, cred, getpages)
if (rabp) {
if ((rabp->b_flags & (B_CACHE|B_DELWRI)) == 0) {
rabp->b_flags |= (B_READ | B_ASYNC);
rabp->b_flags &= ~B_DONE;
vfs_busy_pages(rabp, 0);
if (nfs_asyncio(rabp, cred)) {
rabp->b_flags |= B_INVAL|B_ERROR;
@ -629,10 +638,20 @@ nfs_bioread(vp, uio, ioflag, cred, getpages)
}
}
/*
* Make sure we use a signed variant of min() since
* the second term may be negative.
* Unlike VREG files, whos buffer size ( bp->b_bcount ) is
* chopped for the EOF condition, we cannot tell how large
* NFS directories are going to be until we hit EOF. So
* an NFS directory buffer is *not* chopped to its EOF. Now,
* it just so happens that b_resid will effectively chop it
* to EOF. *BUT* this information is lost if the buffer goes
* away and is reconstituted into a B_CACHE state ( due to
* being VMIO ) later. So we keep track of the directory eof
* in np->n_direofoffset and chop it off as an extra step
* right here.
*/
n = lmin(uio->uio_resid, NFS_DIRBLKSIZ - bp->b_resid - on);
if (np->n_direofoffset && n > np->n_direofoffset - uio->uio_offset)
n = np->n_direofoffset - uio->uio_offset;
break;
default:
printf(" nfs_bioread: type %x unexpected\n",vp->v_type);
@ -649,6 +668,10 @@ nfs_bioread(vp, uio, ioflag, cred, getpages)
n = 0;
break;
case VDIR:
/*
* Invalidate buffer if caching is disabled, forcing a
* re-read from the remote later.
*/
if (np->n_flag & NQNFSNONCACHE)
bp->b_flags |= B_INVAL;
break;
@ -660,24 +683,6 @@ nfs_bioread(vp, uio, ioflag, cred, getpages)
return (error);
}
static void
nfs_prot_buf(bp, off, n)
struct buf *bp;
int off;
int n;
{
int pindex, boff, end;
if ((bp->b_flags & B_VMIO) == 0)
return;
end = round_page(off + n);
for (boff = trunc_page(off); boff < end; boff += PAGE_SIZE) {
pindex = boff >> PAGE_SHIFT;
vm_page_protect(bp->b_pages[pindex], VM_PROT_NONE);
}
}
/*
* Vnode op for write using bio
*/
@ -690,18 +695,18 @@ nfs_write(ap)
struct ucred *a_cred;
} */ *ap;
{
register int biosize;
register struct uio *uio = ap->a_uio;
int biosize;
struct uio *uio = ap->a_uio;
struct proc *p = uio->uio_procp;
register struct vnode *vp = ap->a_vp;
struct vnode *vp = ap->a_vp;
struct nfsnode *np = VTONFS(vp);
register struct ucred *cred = ap->a_cred;
struct ucred *cred = ap->a_cred;
int ioflag = ap->a_ioflag;
struct buf *bp;
struct vattr vattr;
struct nfsmount *nmp = VFSTONFS(vp->v_mount);
daddr_t lbn;
int bufsize;
int bcount;
int n, on, error = 0, iomode, must_commit;
#ifdef DIAGNOSTIC
@ -749,12 +754,9 @@ nfs_write(ap)
psignal(p, SIGXFSZ);
return (EFBIG);
}
/*
* I use nm_rsize, not nm_wsize so that all buffer cache blocks
* will be the same size within a filesystem. nfs_writerpc will
* still use nm_wsize when sizing the rpc's.
*/
biosize = vp->v_mount->mnt_stat.f_iosize;
do {
/*
* Check for a valid write lease.
@ -786,17 +788,74 @@ nfs_write(ap)
on = uio->uio_offset & (biosize-1);
n = min((unsigned)(biosize - on), uio->uio_resid);
again:
if (uio->uio_offset + n > np->n_size) {
/*
* Handle direct append and file extension cases, calculate
* unaligned buffer size.
*/
if (uio->uio_offset == np->n_size && n) {
/*
* special append case. Obtain buffer prior to
* resizing it to maintain B_CACHE.
*/
long save;
bcount = on;
bp = nfs_getcacheblk(vp, lbn, bcount, p);
save = bp->b_flags & B_CACHE;
np->n_size = uio->uio_offset + n;
np->n_flag |= NMODIFIED;
vnode_pager_setsize(vp, np->n_size);
bcount += n;
allocbuf(bp, bcount);
bp->b_flags |= save;
} else {
if (uio->uio_offset + n > np->n_size) {
np->n_size = uio->uio_offset + n;
np->n_flag |= NMODIFIED;
vnode_pager_setsize(vp, np->n_size);
}
bcount = biosize;
if ((off_t)(lbn + 1) * biosize > np->n_size)
bcount = np->n_size - (off_t)lbn * biosize;
bp = nfs_getcacheblk(vp, lbn, bcount, p);
}
bufsize = biosize;
if ((off_t)(lbn + 1) * biosize > np->n_size) {
bufsize = np->n_size - (off_t)lbn * biosize;
bufsize = (bufsize + DEV_BSIZE - 1) & ~(DEV_BSIZE - 1);
/*
* Issue a READ if B_CACHE is not set. In special-append
* mode, B_CACHE is based on the buffer prior to the write
* op and is typically set, avoiding the read. If a read
* is required in special append mode, the server will
* probably send us a short-read since we extended the file
* on our end, resulting in b_resid == 0 and, thusly,
* B_CACHE getting set.
*
* We can also avoid issuing the read if the write covers
* the entire buffer. We have to make sure the buffer state
* is reasonable in this case since we will not be initiating
* I/O. See the comments in kern/vfs_bio.c's getblk() for
* more information.
*
* B_CACHE may also be set due to the buffer being cached
* normally.
*/
if (on == 0 && n == bcount) {
bp->b_flags |= B_CACHE;
bp->b_flags &= ~(B_ERROR | B_INVAL);
}
if ((bp->b_flags & B_CACHE) == 0) {
bp->b_flags |= B_READ;
vfs_busy_pages(bp, 0);
error = nfs_doio(bp, cred, p);
if (error) {
brelse(bp);
return (error);
}
}
bp = nfs_getcacheblk(vp, lbn, bufsize, p);
if (!bp)
return (EINTR);
if (bp->b_wcred == NOCRED) {
@ -820,6 +879,17 @@ nfs_write(ap)
* If the new write will leave a contiguous dirty
* area, just update the b_dirtyoff and b_dirtyend,
* otherwise force a write rpc of the old dirty area.
*
* While it is possible to merge discontiguous writes due to
* our having a B_CACHE buffer ( and thus valid read data
* for the hole), we don't because it could lead to
* significant cache coherency problems with multiple clients,
* especially if locking is implemented later on.
*
* as an optimization we could theoretically maintain
* a linked list of discontinuous areas, but we would still
* have to commit them separately so there isn't much
* advantage to it except perhaps a bit of asynchronization.
*/
if (bp->b_dirtyend > 0 &&
@ -862,11 +932,6 @@ nfs_write(ap)
return (error);
}
/*
* This will keep the buffer and mmaped regions more coherent.
*/
nfs_prot_buf(bp, on, n);
/*
* Only update dirtyoff/dirtyend if not a degenerate
* condition.
@ -879,21 +944,7 @@ nfs_write(ap)
bp->b_dirtyoff = on;
bp->b_dirtyend = on + n;
}
}
/*
* To avoid code complexity, we may have to throw away
* previously valid ranges when merging the new dirty range
* into the valid range. As long as we do not *ADD* an
* invalid valid range, we are ok.
*/
if (bp->b_validend == 0 || bp->b_validend < bp->b_dirtyoff ||
bp->b_validoff > bp->b_dirtyend) {
bp->b_validoff = bp->b_dirtyoff;
bp->b_validend = bp->b_dirtyend;
} else {
bp->b_validoff = min(bp->b_validoff, bp->b_dirtyoff);
bp->b_validend = max(bp->b_validend, bp->b_dirtyend);
vfs_bio_set_validclean(bp, on, n);
}
/*
@ -904,11 +955,14 @@ nfs_write(ap)
/*
* If the lease is non-cachable or IO_SYNC do bwrite().
*
* IO_INVAL appears to be unused. The idea appears to be
* to turn off caching in this case. Very odd. XXX
*/
if ((np->n_flag & NQNFSNONCACHE) || (ioflag & IO_SYNC)) {
bp->b_proc = p;
if (ioflag & IO_INVAL)
bp->b_flags |= B_INVAL;
bp->b_flags |= B_NOCACHE;
error = VOP_BWRITE(bp);
if (error)
return (error);
@ -922,8 +976,9 @@ nfs_write(ap)
bp->b_proc = (struct proc *)0;
bp->b_flags |= B_ASYNC;
(void)nfs_writebp(bp, 0);
} else
} else {
bdwrite(bp);
}
} while (uio->uio_resid > 0 && n > 0);
return (0);
}
@ -956,15 +1011,16 @@ nfs_getcacheblk(vp, bn, size, p)
return ((struct buf *)0);
bp = getblk(vp, bn, size, 0, 2 * hz);
}
} else
} else {
bp = getblk(vp, bn, size, 0, 0);
}
if (vp->v_type == VREG) {
int biosize;
biosize = mp->mnt_stat.f_iosize;
bp->b_blkno = bn * (biosize / DEV_BSIZE);
}
return (bp);
}
@ -1036,6 +1092,9 @@ nfs_vinvalbuf(vp, flags, cred, p, intrflg)
* Initiate asynchronous I/O. Return an error if no nfsiods are available.
* This is mainly to avoid queueing async I/O requests when the nfsiods
* are all hung on a dead server.
*
* Note: nfs_asyncio() does not clear (B_ERROR|B_INVAL) but when the bp
* is eventually dequeued by the async daemon, nfs_doio() *will*.
*/
int
nfs_asyncio(bp, cred)
@ -1164,7 +1223,7 @@ nfs_doio(bp, cr, p)
struct vnode *vp;
struct nfsnode *np;
struct nfsmount *nmp;
int error = 0, diff, len, iomode, must_commit = 0;
int error = 0, iomode, must_commit = 0;
struct uio uio;
struct iovec io;
@ -1177,6 +1236,13 @@ nfs_doio(bp, cr, p)
uiop->uio_segflg = UIO_SYSSPACE;
uiop->uio_procp = p;
/*
* clear B_ERROR and B_INVAL state prior to initiating the I/O. We
* do this here so we do not have to do it in all the code that
* calls us.
*/
bp->b_flags &= ~(B_ERROR | B_INVAL);
KASSERT(!(bp->b_flags & B_DONE), ("nfs_doio: bp %p already marked done", bp));
/*
@ -1216,25 +1282,22 @@ nfs_doio(bp, cr, p)
nfsstats.read_bios++;
error = nfs_readrpc(vp, uiop, cr);
if (!error) {
bp->b_validoff = 0;
if (uiop->uio_resid) {
/*
* If len > 0, there is a hole in the file and
* no writes after the hole have been pushed to
* the server yet.
* Just zero fill the rest of the valid area.
* If we had a short read with no error, we must have
* hit a file hole. We should zero-fill the remainder.
* This can also occur if the server hits the file EOF.
*
* Holes used to be able to occur due to pending
* writes, but that is not possible any longer.
*/
diff = bp->b_bcount - uiop->uio_resid;
len = np->n_size - (((u_quad_t)bp->b_blkno) * DEV_BSIZE
+ diff);
if (len > 0) {
len = min(len, uiop->uio_resid);
bzero((char *)bp->b_data + diff, len);
bp->b_validend = diff + len;
} else
bp->b_validend = diff;
} else
bp->b_validend = bp->b_bcount;
int nread = bp->b_bcount - uiop->uio_resid;
int left = bp->b_bcount - nread;
if (left > 0)
bzero((char *)bp->b_data + nread, left);
uiop->uio_resid = 0;
}
}
if (p && (vp->v_flag & VTEXT) &&
(((nmp->nm_flag & NFSMNT_NQNFS) &&
@ -1262,6 +1325,10 @@ nfs_doio(bp, cr, p)
}
if ((nmp->nm_flag & NFSMNT_RDIRPLUS) == 0)
error = nfs_readdirrpc(vp, uiop, cr);
/*
* end-of-directory sets B_INVAL but does not generate an
* error.
*/
if (error == 0 && uiop->uio_resid == bp->b_bcount)
bp->b_flags |= B_INVAL;
break;
@ -1296,7 +1363,7 @@ nfs_doio(bp, cr, p)
if (!error && iomode == NFSV3WRITE_UNSTABLE) {
bp->b_flags |= B_NEEDCOMMIT;
if (bp->b_dirtyoff == 0
&& bp->b_dirtyend == bp->b_bufsize)
&& bp->b_dirtyend == bp->b_bcount)
bp->b_flags |= B_CLUSTEROK;
} else {
bp->b_flags &= ~B_NEEDCOMMIT;

View File

@ -34,7 +34,7 @@
* SUCH DAMAGE.
*
* @(#)nfs_socket.c 8.5 (Berkeley) 3/30/95
* $Id: nfs_socket.c,v 1.50 1999/02/25 00:03:51 peter Exp $
* $Id: nfs_socket.c,v 1.51 1999/04/24 11:29:48 dt Exp $
*/
/*
@ -54,6 +54,7 @@
#include <sys/socketvar.h>
#include <sys/syslog.h>
#include <sys/tprintf.h>
#include <sys/sysctl.h>
#include <netinet/in.h>
#include <netinet/tcp.h>
@ -115,6 +116,15 @@ static int proct[NFS_NPROCS] = {
0, 0, 0,
};
static int nfs_realign_test;
static int nfs_realign_count;
SYSCTL_DECL(_vfs_nfs);
SYSCTL_INT(_vfs_nfs, OID_AUTO, realign_test, CTLFLAG_RD, &nfs_realign_test, 0, "");
SYSCTL_INT(_vfs_nfs, OID_AUTO, realign_count, CTLFLAG_RD, &nfs_realign_count, 0, "");
/*
* There is a congestion window for outstanding rpcs maintained per mount
* point. The cwnd size is adjusted in roughly the way that:
@ -138,7 +148,7 @@ struct callout_handle nfs_timer_handle;
static int nfs_msg __P((struct proc *,char *,char *));
static int nfs_rcvlock __P((struct nfsreq *));
static void nfs_rcvunlock __P((struct nfsreq *));
static void nfs_realign __P((struct mbuf *m, int hsiz));
static void nfs_realign __P((struct mbuf **pm, int hsiz));
static int nfs_receive __P((struct nfsreq *rep, struct sockaddr **aname,
struct mbuf **mp));
static int nfs_reconnect __P((struct nfsreq *rep));
@ -702,7 +712,7 @@ nfs_receive(rep, aname, mp)
* These could cause pointer alignment problems, so copy them to
* well aligned mbufs.
*/
nfs_realign(*mp, 5 * NFSX_UNSIGNED);
nfs_realign(mp, 5 * NFSX_UNSIGNED);
return (error);
}
@ -1589,92 +1599,56 @@ nfs_rcvunlock(rep)
}
/*
* Check for badly aligned mbuf data areas and
* realign data in an mbuf list by copying the data areas up, as required.
* nfs_realign:
*
* Check for badly aligned mbuf data and realign by copying the unaligned
* portion of the data into a new mbuf chain and freeing the portions
* of the old chain that were replaced.
*
* We cannot simply realign the data within the existing mbuf chain
* because the underlying buffers may contain other rpc commands and
* we cannot afford to overwrite them.
*
* We would prefer to avoid this situation entirely. The situation does
* not occur with NFS/UDP and is supposed to only occassionally occur
* with TCP. Use vfs.nfs.realign_count and realign_test to check this.
*/
static void
nfs_realign(m, hsiz)
register struct mbuf *m;
nfs_realign(pm, hsiz)
register struct mbuf **pm;
int hsiz;
{
register struct mbuf *m2;
register int siz, mlen, olen;
register caddr_t tcp, fcp;
struct mbuf *mnew;
struct mbuf *m;
struct mbuf *n = NULL;
int off = 0;
while (m) {
/*
* This never happens for UDP, rarely happens for TCP
* but frequently happens for iso transport.
*/
if ((m->m_len & 0x3) || (mtod(m, intptr_t) & 0x3)) {
olen = m->m_len;
fcp = mtod(m, caddr_t);
if ((intptr_t)fcp & 0x3) {
m->m_flags &= ~M_PKTHDR;
if (m->m_flags & M_EXT)
m->m_data = m->m_ext.ext_buf +
((m->m_ext.ext_size - olen) & ~0x3);
else
m->m_data = m->m_dat;
++nfs_realign_test;
while ((m = *pm) != NULL) {
if ((m->m_len & 0x3) || (mtod(m, intptr_t) & 0x3)) {
MGET(n, M_WAIT, MT_DATA);
if (m->m_len >= MINCLSIZE) {
MCLGET(n, M_WAIT);
}
n->m_len = 0;
break;
}
m->m_len = 0;
tcp = mtod(m, caddr_t);
mnew = m;
m2 = m->m_next;
pm = &m->m_next;
}
/*
* If possible, only put the first invariant part
* of the RPC header in the first mbuf.
*/
mlen = M_TRAILINGSPACE(m);
if (olen <= hsiz && mlen > hsiz)
mlen = hsiz;
/*
* Loop through the mbuf list consolidating data.
*/
/*
* If n is non-NULL, loop on m copying data, then replace the
* portion of the chain that had to be realigned.
*/
if (n != NULL) {
++nfs_realign_count;
while (m) {
while (olen > 0) {
if (mlen == 0) {
m2->m_flags &= ~M_PKTHDR;
if (m2->m_flags & M_EXT)
m2->m_data = m2->m_ext.ext_buf;
else
m2->m_data = m2->m_dat;
m2->m_len = 0;
mlen = M_TRAILINGSPACE(m2);
tcp = mtod(m2, caddr_t);
mnew = m2;
m2 = m2->m_next;
}
siz = min(mlen, olen);
if (tcp != fcp)
bcopy(fcp, tcp, siz);
mnew->m_len += siz;
mlen -= siz;
olen -= siz;
tcp += siz;
fcp += siz;
}
m_copyback(n, off, m->m_len, mtod(m, caddr_t));
off += m->m_len;
m = m->m_next;
if (m) {
olen = m->m_len;
fcp = mtod(m, caddr_t);
}
}
/*
* Finally, set m_len == 0 for any trailing mbufs that have
* been copied out of.
*/
while (m2) {
m2->m_len = 0;
m2 = m2->m_next;
}
return;
}
m = m->m_next;
m_freem(*pm);
*pm = n;
}
}
@ -2040,7 +2014,7 @@ nfsrv_rcv(so, arg, waitflag)
m_freem(mp);
continue;
}
nfs_realign(mp, 10 * NFSX_UNSIGNED);
nfs_realign(&mp, 10 * NFSX_UNSIGNED);
rec->nr_address = nam;
rec->nr_packet = mp;
STAILQ_INSERT_TAIL(&slp->ns_rec, rec, nr_link);
@ -2182,7 +2156,7 @@ nfsrv_getstream(slp, waitflag)
if (!rec) {
m_freem(slp->ns_frag);
} else {
nfs_realign(slp->ns_frag, 10 * NFSX_UNSIGNED);
nfs_realign(&slp->ns_frag, 10 * NFSX_UNSIGNED);
rec->nr_address = (struct sockaddr *)0;
rec->nr_packet = slp->ns_frag;
STAILQ_INSERT_TAIL(&slp->ns_rec, rec, nr_link);
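
The rewritten nfs_realign() above takes a struct mbuf ** precisely so it can
replace the offending part of the chain with freshly allocated storage rather
than shifting data inside buffers that may be shared with other rpc's;
nfs_realign_test counts calls while nfs_realign_count counts chains that
actually had to be copied.  A rough userspace analogue of the detect-and-copy
idea, using a made-up chunk list in place of real mbufs (error handling
omitted):

#include <stdint.h>
#include <stdlib.h>
#include <string.h>

struct chunk {				/* stand-in for struct mbuf */
	struct chunk *next;
	char *data;
	int len;
};

/* Same test as nfs_realign(): odd length or misaligned data pointer. */
static int
chunk_misaligned(struct chunk *c)
{
	return ((c->len & 0x3) || ((uintptr_t)c->data & 0x3));
}

/*
 * Walk the list.  At the first misaligned chunk, copy it and everything
 * after it into one new, naturally aligned chunk and splice that in.
 * (The kernel code also m_freem()s the replaced tail; skipped here.)
 */
static void
realign(struct chunk **pc)
{
	struct chunk *c, *n;
	int total = 0, off = 0;

	while ((c = *pc) != NULL && !chunk_misaligned(c))
		pc = &c->next;
	if (c == NULL)
		return;
	for (n = c; n != NULL; n = n->next)
		total += n->len;
	n = malloc(sizeof(*n));
	n->data = malloc(total);	/* malloc() storage is suitably aligned */
	n->len = total;
	n->next = NULL;
	for (; c != NULL; c = c->next) {
		memcpy(n->data + off, c->data, c->len);
		off += c->len;
	}
	*pc = n;
}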

View File

@ -34,7 +34,7 @@
* SUCH DAMAGE.
*
* @(#)nfs_vnops.c 8.16 (Berkeley) 5/27/95
* $Id: nfs_vnops.c,v 1.123 1999/02/16 10:49:54 dfr Exp $
* $Id: nfs_vnops.c,v 1.124 1999/03/12 02:24:58 julian Exp $
*/
@ -408,9 +408,9 @@ nfs_access(ap)
error = nfs_readrpc(vp, &auio, ap->a_cred);
else if (vp->v_type == VDIR) {
char* bp;
bp = malloc(NFS_DIRBLKSIZ, M_TEMP, M_WAITOK);
bp = malloc(DIRBLKSIZ, M_TEMP, M_WAITOK);
aiov.iov_base = bp;
aiov.iov_len = auio.uio_resid = NFS_DIRBLKSIZ;
aiov.iov_len = auio.uio_resid = DIRBLKSIZ;
error = nfs_readdirrpc(vp, &auio, ap->a_cred);
free(bp, M_TEMP);
} else if (vp->v_type == VLNK)
@ -962,7 +962,7 @@ nfs_read(ap)
if (vp->v_type != VREG)
return (EPERM);
return (nfs_bioread(vp, ap->a_uio, ap->a_ioflag, ap->a_cred, 0));
return (nfs_bioread(vp, ap->a_uio, ap->a_ioflag, ap->a_cred));
}
/*
@ -980,7 +980,7 @@ nfs_readlink(ap)
if (vp->v_type != VLNK)
return (EINVAL);
return (nfs_bioread(vp, ap->a_uio, 0, ap->a_cred, 0));
return (nfs_bioread(vp, ap->a_uio, 0, ap->a_cred));
}
/*
@ -1985,7 +1985,7 @@ nfs_readdir(ap)
* Call nfs_bioread() to do the real work.
*/
tresid = uio->uio_resid;
error = nfs_bioread(vp, uio, 0, ap->a_cred, 0);
error = nfs_bioread(vp, uio, 0, ap->a_cred);
if (!error && uio->uio_resid == tresid)
nfsstats.direofcache_misses++;
@ -2004,7 +2004,7 @@ nfs_readdirrpc(vp, uiop, cred)
{
register int len, left;
register struct dirent *dp;
register struct dirent *dp = NULL;
register u_int32_t *tl;
register caddr_t cp;
register int32_t t1, t2;
@ -2019,12 +2019,9 @@ nfs_readdirrpc(vp, uiop, cred)
int attrflag;
int v3 = NFS_ISV3(vp);
#ifndef nolint
dp = (struct dirent *)0;
#endif
#ifndef DIAGNOSTIC
if (uiop->uio_iovcnt != 1 || (uiop->uio_offset & (NFS_DIRBLKSIZ - 1)) ||
(uiop->uio_resid & (NFS_DIRBLKSIZ - 1)))
if (uiop->uio_iovcnt != 1 || (uiop->uio_offset & (DIRBLKSIZ - 1)) ||
(uiop->uio_resid & (DIRBLKSIZ - 1)))
panic("nfs readdirrpc bad uio");
#endif
@ -2381,7 +2378,7 @@ nfs_readdirplusrpc(vp, uiop, cred)
m_freem(mrep);
}
/*
* Fill last record, iff any, out to a multiple of NFS_DIRBLKSIZ
* Fill last record, iff any, out to a multiple of DIRBLKSIZ
* by increasing d_reclen for the last record.
*/
if (blksiz > 0) {
@ -3028,13 +3025,13 @@ nfs_bwrite(ap)
struct vnode *a_bp;
} */ *ap;
{
return (nfs_writebp(ap->a_bp, 1));
}
/*
* This is a clone of vn_bwrite(), except that B_WRITEINPROG isn't set unless
* the force flag is one and it also handles the B_NEEDCOMMIT flag.
* the force flag is one and it also handles the B_NEEDCOMMIT flag. We set
* B_CACHE if this is a VMIO buffer.
*/
int
nfs_writebp(bp, force)
@ -3049,12 +3046,15 @@ nfs_writebp(bp, force)
if(!(bp->b_flags & B_BUSY))
panic("bwrite: buffer is not busy???");
if (bp->b_flags & B_INVAL)
bp->b_flags |= B_NOCACHE;
if (bp->b_flags & B_INVAL) {
brelse(bp);
return(0);
}
bp->b_flags |= B_CACHE;
/*
* XXX we bundirty() the bp here. Shouldn't we do it later after
* the I/O has completed??
* Undirty the bp. We will redirty it later if the I/O fails.
*/
s = splbio();
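
Read as a whole, the rewritten entry to nfs_writebp() now throws B_INVAL
buffers away instead of tagging them B_NOCACHE, and marks everything it does
push as B_CACHE before undirtying it.  A sketch of how the start of the
function reads after this change (the surrounding declarations are assumed
from the hunk above):

	if (!(bp->b_flags & B_BUSY))
		panic("bwrite: buffer is not busy???");

	if (bp->b_flags & B_INVAL) {
		/* Nothing worth pushing to the server; just release it. */
		brelse(bp);
		return (0);
	}
	bp->b_flags |= B_CACHE;		/* see the B_CACHE notes in sys/buf.h */

	/*
	 * Undirty the bp.  We will redirty it later if the I/O fails.
	 */
	s = splbio();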

View File

@ -34,7 +34,7 @@
* SUCH DAMAGE.
*
* @(#)nfs.h 8.4 (Berkeley) 5/1/95
* $Id: nfs.h,v 1.44 1998/09/07 05:42:15 bde Exp $
* $Id: nfs.h,v 1.45 1999/02/25 00:03:50 peter Exp $
*/
#ifndef _NFS_NFS_H_
@ -651,8 +651,7 @@ void nfs_disconnect __P((struct nfsmount *));
void nfs_safedisconnect __P((struct nfsmount *));
int nfs_getattrcache __P((struct vnode *, struct vattr *));
int nfsm_strtmbuf __P((struct mbuf **, char **, const char *, long));
int nfs_bioread __P((struct vnode *, struct uio *, int, struct ucred *,
int));
int nfs_bioread __P((struct vnode *, struct uio *, int, struct ucred *));
int nfsm_uiotombuf __P((struct uio *, struct mbuf **, int, caddr_t *));
void nfsrv_init __P((int));
void nfs_clearcommit __P((struct mount *));

View File

@ -36,7 +36,7 @@
* SUCH DAMAGE.
*
* @(#)buf.h 8.9 (Berkeley) 3/30/95
* $Id: buf.h,v 1.64 1999/03/02 04:04:28 mckusick Exp $
* $Id: buf.h,v 1.65 1999/03/12 02:24:55 julian Exp $
*/
#ifndef _SYS_BUF_H_
@ -78,6 +78,19 @@ struct iodone_chain {
/*
* The buffer header describes an I/O operation in the kernel.
*
* NOTES:
* b_bufsize, b_bcount. b_bufsize is the allocation size of the
* buffer, either DEV_BSIZE or PAGE_SIZE aligned. b_bcount is the
* originally requested buffer size and can serve as a bounds check
* against EOF. For most, but not all uses, b_bcount == b_bufsize.
*
* b_dirtyoff, b_dirtyend. Buffers support piecemeal, unaligned
* ranges of dirty data that need to be written to backing store.
* The range is typically clipped at b_bcount ( not b_bufsize ).
*
* b_resid. Number of bytes remaining in I/O. After an I/O operation
* completes, b_resid is usually 0 indicating 100% success.
*/
struct buf {
LIST_ENTRY(buf) b_hash; /* Hash chain. */
@ -109,8 +122,10 @@ struct buf {
int b_dirtyend; /* Offset of end of dirty region. */
struct ucred *b_rcred; /* Read credentials reference. */
struct ucred *b_wcred; /* Write credentials reference. */
#if 0
int b_validoff; /* Offset in buffer of valid region. */
int b_validend; /* Offset of end of valid region. */
#endif
daddr_t b_pblkno; /* physical block number */
void *b_saveaddr; /* Original b_addr for physio. */
caddr_t b_savekva; /* saved kva for transfer while bouncing */
@ -151,9 +166,24 @@ struct buf {
* Buffer vp reassignments are illegal in this case.
*
* B_CACHE This may only be set if the buffer is entirely valid.
* The situation where B_DELWRI is set and B_CACHE gets
* cleared MUST be committed to disk so B_DELWRI can
* also be cleared.
* The situation where B_DELWRI is set and B_CACHE is
* clear MUST be committed to disk by getblk() so
* B_DELWRI can also be cleared. See the comments for
* getblk() in kern/vfs_bio.c. If B_CACHE is clear,
* the caller is expected to clear B_ERROR|B_INVAL,
* set B_READ, and initiate an I/O.
*
* The 'entire buffer' is defined to be the range from
* 0 through b_bcount.
*
* B_MALLOC Request that the buffer be allocated from the malloc
* pool, DEV_BSIZE aligned instead of PAGE_SIZE aligned.
*
* B_VMIO Indicates that the buffer is tied into a VM object.
* The buffer's data is always PAGE_SIZE aligned even
* if b_bufsize and b_bcount are not. ( b_bufsize is
* always at least DEV_BSIZE aligned, though ).
*
*/
#define B_AGE 0x00000001 /* Move to age queue when I/O done. */
@ -356,6 +386,7 @@ void cluster_write __P((struct buf *, u_quad_t));
int physio __P((void (*)(struct buf *), struct buf *, dev_t,
int, u_int (*)(struct buf *), struct uio *));
u_int minphys __P((struct buf *));
void vfs_bio_set_validclean __P((struct buf *, int base, int size));
void vfs_bio_clrbuf __P((struct buf *));
void vfs_busy_pages __P((struct buf *, int clear_modify));
void vfs_unbusy_pages __P((struct buf *));
@ -371,6 +402,7 @@ int allocbuf __P((struct buf *bp, int size));
void reassignbuf __P((struct buf *, struct vnode *));
void pbreassignbuf __P((struct buf *, struct vnode *));
struct buf *trypbuf __P((int *));
#endif /* KERNEL */
#endif /* !_SYS_BUF_H_ */
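
The B_CACHE notes above spell out the caller's side of the contract: on a
miss the caller clears B_ERROR|B_INVAL, sets B_READ, and starts the I/O
itself.  The ffs_indirtrunc() hunk later in this commit does exactly that; a
condensed sketch of the pattern (the locals and the biowait() tail here are
assumed, not copied from any single hunk):

	bp = getblk(vp, lbn, size, 0, 0);
	if ((bp->b_flags & B_CACHE) == 0) {
		/* Miss: prepare the buffer and issue the read ourselves. */
		bp->b_flags |= B_READ;
		bp->b_flags &= ~(B_INVAL | B_ERROR);
		vfs_busy_pages(bp, 0);
		VOP_STRATEGY(bp->b_vp, bp);
		error = biowait(bp);
		if (error) {
			brelse(bp);
			return (error);
		}
	}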

View File

@ -31,7 +31,7 @@
* SUCH DAMAGE.
*
* @(#)ffs_inode.c 8.13 (Berkeley) 4/21/95
* $Id: ffs_inode.c,v 1.52 1999/01/07 16:14:16 bde Exp $
* $Id: ffs_inode.c,v 1.53 1999/01/28 00:57:54 dillon Exp $
*/
#include "opt_quota.h"
@ -452,6 +452,7 @@ ffs_indirtrunc(ip, lbn, dbn, lastbn, level, countp)
if ((bp->b_flags & B_CACHE) == 0) {
curproc->p_stats->p_ru.ru_inblock++; /* pay for read */
bp->b_flags |= B_READ;
bp->b_flags &= ~(B_ERROR|B_INVAL);
if (bp->b_bcount > bp->b_bufsize)
panic("ffs_indirtrunc: bad buffer size");
bp->b_blkno = dbn;

View File

@ -31,7 +31,7 @@
* SUCH DAMAGE.
*
* @(#)mfs_vnops.c 8.11 (Berkeley) 5/22/95
* $Id: mfs_vnops.c,v 1.42 1999/01/28 00:57:55 dillon Exp $
* $Id: mfs_vnops.c,v 1.43 1999/04/11 02:28:32 eivind Exp $
*/
#include <sys/param.h>
@ -127,6 +127,9 @@ mfs_fsync(ap)
* We implement the B_FREEBUF strategy. We can't just madvise()
* here because we have to do it in the correct order vs other bio
* requests, so we queue it.
*
* Note: geteblk() sets B_INVAL. We leave it set to guarantee buffer
* throw-away on brelse()? XXX
*/
static int

View File

@ -66,7 +66,7 @@
* any improvements or extensions that they make and grant Carnegie the
* rights to redistribute these changes.
*
* $Id: vm_fault.c,v 1.100 1999/02/17 09:08:29 dillon Exp $
* $Id: vm_fault.c,v 1.101 1999/02/25 06:00:52 alc Exp $
*/
/*
@ -409,6 +409,12 @@ RetryFault:;
firstpindex = fs.first_pindex -
2*(VM_FAULT_READ_BEHIND + VM_FAULT_READ_AHEAD + 1);
/*
* note: partially valid pages cannot be
* included in the lookahead - NFS piecemeal
* writes will barf on it badly.
*/
for(tmppindex = fs.first_pindex - 1;
tmppindex >= firstpindex;
--tmppindex) {
@ -552,12 +558,16 @@ RetryFault:;
}
fs.first_m = NULL;
/*
* Zero the page if necessary and mark it valid.
*/
if ((fs.m->flags & PG_ZERO) == 0) {
vm_page_zero_fill(fs.m);
}
else
} else {
cnt.v_ozfod++;
}
cnt.v_zfod++;
fs.m->valid = VM_PAGE_BITS_ALL;
break; /* break to PAGE HAS BEEN FOUND */
} else {
if (fs.object != fs.first_object) {
@ -788,14 +798,24 @@ RetryFault:;
#endif
unlock_things(&fs);
fs.m->valid = VM_PAGE_BITS_ALL;
vm_page_flag_clear(fs.m, PG_ZERO);
/*
* Sanity check: page must be completely valid or it is not fit to
* map into user space. vm_pager_get_pages() ensures this.
*/
if (fs.m->valid != VM_PAGE_BITS_ALL) {
vm_page_zero_invalid(fs.m, TRUE);
printf("Warning: page %p partially invalid on fault\n", fs.m);
}
pmap_enter(fs.map->pmap, vaddr, VM_PAGE_TO_PHYS(fs.m), prot, wired);
if (((fault_flags & VM_FAULT_WIRE_MASK) == 0) && (wired == 0)) {
pmap_prefault(fs.map->pmap, vaddr, fs.entry);
}
vm_page_flag_clear(fs.m, PG_ZERO);
vm_page_flag_set(fs.m, PG_MAPPED|PG_REFERENCED);
if (fault_flags & VM_FAULT_HOLD)
vm_page_hold(fs.m);

View File

@ -34,7 +34,7 @@
* SUCH DAMAGE.
*
* from: @(#)vm_page.c 7.4 (Berkeley) 5/7/91
* $Id: vm_page.c,v 1.128 1999/03/19 05:21:03 alc Exp $
* $Id: vm_page.c,v 1.129 1999/04/05 19:38:29 julian Exp $
*/
/*
@ -1460,14 +1460,16 @@ vm_page_bits(int base, int size)
}
/*
* set a page valid and clean. May not block.
* vm_page_set_validclean:
*
* In order to maintain consistency due to the DEV_BSIZE granularity
* of the valid bits, we have to zero non-DEV_BSIZE aligned portions of
* the page at the beginning and end of the valid range when the
* associated valid bits are not already set.
* Sets portions of a page valid and clean. The arguments are expected
* to be DEV_BSIZE aligned but if they aren't the bitmap is inclusive
* of any partial chunks touched by the range. The invalid portion of
* such chunks will be zero'd.
*
* (base + size) must be less than or equal to PAGE_SIZE.
* This routine may not block.
*
* (base + size) must be less than or equal to PAGE_SIZE.
*/
void
vm_page_set_validclean(m, base, size)
@ -1529,8 +1531,35 @@ vm_page_set_validclean(m, base, size)
pmap_clear_modify(VM_PAGE_TO_PHYS(m));
}
#if 0
void
vm_page_set_dirty(m, base, size)
vm_page_t m;
int base;
int size;
{
m->dirty |= vm_page_bits(base, size);
}
#endif
void
vm_page_clear_dirty(m, base, size)
vm_page_t m;
int base;
int size;
{
m->dirty &= ~vm_page_bits(base, size);
}
/*
* set a page (partially) invalid. May not block.
* vm_page_set_invalid:
*
* Invalidates DEV_BSIZE'd chunks within a page. Both the
* valid and dirty bits for the affected areas are cleared.
*
* May not block.
*/
void
vm_page_set_invalid(m, base, size)
@ -1540,9 +1569,9 @@ vm_page_set_invalid(m, base, size)
{
int bits;
m->valid &= ~(bits = vm_page_bits(base, size));
if (m->valid == 0)
m->dirty &= ~bits;
bits = vm_page_bits(base, size);
m->valid &= ~bits;
m->dirty &= ~bits;
m->object->generation++;
}
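
Because the valid and dirty masks are kept at DEV_BSIZE granularity,
vm_page_set_validclean() rounds an unaligned (base, size) range out to whole
chunks and zeroes the not-previously-valid bytes of any partial chunk it
touches.  A small standalone illustration of the chunk arithmetic (the mask
math below is a plain re-derivation for illustration, not the vm_page_bits()
source):

#include <stdio.h>

#define DEV_BSIZE	512
#define PAGE_SIZE	4096	/* eight DEV_BSIZE chunks, so eight valid/dirty bits */

int
main(void)
{
	int base = 700, size = 1200;	/* an unaligned range within the page */
	int first = base / DEV_BSIZE;			/* first chunk touched */
	int last = (base + size - 1) / DEV_BSIZE;	/* last chunk touched */
	int bits = ((1 << (last - first + 1)) - 1) << first;

	/* Bytes of the first/last chunks that fall outside [base, base+size). */
	int head_slop = base - first * DEV_BSIZE;
	int tail_slop = (last + 1) * DEV_BSIZE - (base + size);

	printf("chunks %d..%d, mask 0x%02x\n", first, last, bits);
	printf("zero %d leading and %d trailing bytes if those chunks were "
	    "not already valid\n", head_slop, tail_slop);
	return (0);
}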

View File

@ -61,7 +61,7 @@
* any improvements or extensions that they make and grant Carnegie the
* rights to redistribute these changes.
*
* $Id: vm_page.h,v 1.58 1999/03/15 05:09:48 julian Exp $
* $Id: vm_page.h,v 1.59 1999/04/05 19:38:29 julian Exp $
*/
/*
@ -101,6 +101,10 @@
* Fields in this structure are locked either by the lock on the
* object that the page belongs to (O) or by the lock on the page
* queues (P).
*
* The 'valid' and 'dirty' fields are distinct. A page may have dirty
* bits set without having associated valid bits set. This is used by
* NFS to implement piecemeal writes.
*/
TAILQ_HEAD(pglist, vm_page);
@ -404,6 +408,8 @@ void vm_page_wire __P((vm_page_t));
void vm_page_unqueue __P((vm_page_t));
void vm_page_unqueue_nowakeup __P((vm_page_t));
void vm_page_set_validclean __P((vm_page_t, int, int));
void vm_page_set_dirty __P((vm_page_t, int, int));
void vm_page_clear_dirty __P((vm_page_t, int, int));
void vm_page_set_invalid __P((vm_page_t, int, int));
static __inline boolean_t vm_page_zero_fill __P((vm_page_t));
int vm_page_is_valid __P((vm_page_t, int, int));
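
The note above that valid and dirty are tracked independently is the
piecemeal-write case in miniature: a chunk can be dirtied by a write that
never read it in, leaving its dirty bit set while its valid bit is clear.  A
made-up snapshot of the two masks (the values are invented for illustration):

#include <stdio.h>

int
main(void)
{
	unsigned valid = 0x0f;	/* chunks 0-3 contain data read from the server */
	unsigned dirty = 0x34;	/* chunks 2, 4 and 5 have pending local writes */

	/* Chunk 2 is dirty and valid; chunks 4 and 5 are dirty but not valid. */
	printf("dirty-but-invalid mask: 0x%02x\n", dirty & ~valid & 0xff);
	return (0);
}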

View File

@ -61,7 +61,7 @@
* any improvements or extensions that they make and grant Carnegie the
* rights to redistribute these changes.
*
* $Id: vm_pager.c,v 1.44 1999/03/14 09:20:00 julian Exp $
* $Id: vm_pager.c,v 1.45 1999/04/11 02:16:27 eivind Exp $
*/
/*
@ -523,6 +523,9 @@ vm_pager_chain_iodone(struct buf *nbp)
* Obtain a physical buffer and chain it to its parent buffer. When
* I/O completes, the parent buffer will be B_SIGNAL'd. Errors are
* automatically propagated to the parent
*
* Since these are brand new buffers, we do not have to clear B_INVAL
* and B_ERROR because they are already clear.
*/
struct buf *

View File

@ -36,7 +36,7 @@
* SUCH DAMAGE.
*
* @(#)vm_pager.h 8.4 (Berkeley) 1/12/94
* $Id: vm_pager.h,v 1.20 1999/01/24 02:32:15 dillon Exp $
* $Id: vm_pager.h,v 1.21 1999/03/14 09:20:00 julian Exp $
*/
/*
@ -110,6 +110,14 @@ void flushchainbuf(struct buf *nbp);
void waitchainbuf(struct buf *bp, int count, int done);
void autochaindone(struct buf *bp);
/*
* vm_pager_get_pages:
*
* Retrieve pages from the VM system in order to map them into an object
* ( or into VM space somewhere ). If the pagein was successful, we
* must fully validate it.
*/
static __inline int
vm_pager_get_pages(
vm_object_t object,
@ -117,7 +125,13 @@ vm_pager_get_pages(
int count,
int reqpage
) {
return ((*pagertab[object->type]->pgo_getpages)(object, m, count, reqpage));
int r;
r = (*pagertab[object->type]->pgo_getpages)(object, m, count, reqpage);
if (r == VM_PAGER_OK && m[reqpage]->valid != VM_PAGE_BITS_ALL) {
vm_page_zero_invalid(m[reqpage], TRUE);
}
return(r);
}
static __inline void
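
With the fix-up folded into the inline above, a successful pgo_getpages call
hands back a requested page whose invalid portions have already been zeroed,
so the caller may treat it as fully valid.  A sketch of the calling
convention from the fault path's point of view (the rv/marray/faultcount
names mirror vm_fault-style locals and are assumptions here):

	rv = vm_pager_get_pages(fs.object, marray, faultcount, reqpage);
	if (rv == VM_PAGER_OK) {
		fs.m = marray[reqpage];
		/*
		 * The inline has already zeroed any invalid portions of the
		 * requested page, so the fault code can map fs.m without
		 * doing its own zeroing.
		 */
	}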

View File

@ -38,7 +38,7 @@
* SUCH DAMAGE.
*
* from: @(#)vnode_pager.c 7.5 (Berkeley) 4/20/91
* $Id: vnode_pager.c,v 1.106 1999/04/05 19:38:29 julian Exp $
* $Id: vnode_pager.c,v 1.107 1999/04/10 20:52:11 dt Exp $
*/
/*
@ -789,7 +789,8 @@ vnode_pager_generic_getpages(vp, m, bytecount, reqpage)
* read.
*/
vm_page_set_validclean(mt, 0, size - tfoff);
vm_page_zero_invalid(mt, FALSE);
/* handled by vm_fault now */
/* vm_page_zero_invalid(mt, FALSE); */
}
vm_page_flag_clear(mt, PG_ZERO);