diff --git a/sys/nfs/nfs_bio.c b/sys/nfs/nfs_bio.c
index 63cc67e7046d..10be58d709bb 100644
--- a/sys/nfs/nfs_bio.c
+++ b/sys/nfs/nfs_bio.c
@@ -928,7 +928,15 @@ nfs_write(ap)
 		}
 
 		error = uiomove((char *)bp->b_data + on, n, uio);
-		bp->b_flags &= ~B_NEEDCOMMIT;
+
+		/*
+		 * Since this block is being modified, it must be written
+		 * again and not just committed.  Since write clustering does
+		 * not work for the stage 1 data write, only the stage 2
+		 * commit rpc, we have to clear B_CLUSTEROK as well.
+		 */
+		bp->b_flags &= ~(B_NEEDCOMMIT | B_CLUSTEROK);
+
 		if (error) {
 			bp->b_flags |= B_ERROR;
 			brelse(bp);
@@ -950,12 +958,6 @@ nfs_write(ap)
 			vfs_bio_set_validclean(bp, on, n);
 		}
 
-		/*
-		 * Since this block is being modified, it must be written
-		 * again and not just committed.
-		 */
-		bp->b_flags &= ~B_NEEDCOMMIT;
-
 		/*
 		 * If the lease is non-cachable or IO_SYNC do bwrite().
 		 *
@@ -986,10 +988,18 @@ nfs_write(ap)
 
 /*
  * Get an nfs cache block.
+ *
  * Allocate a new one if the block isn't currently in the cache
  * and return the block marked busy. If the calling process is
  * interrupted by a signal for an interruptible mount point, return
  * NULL.
+ *
+ * The caller must carefully deal with the possible B_INVAL state of
+ * the buffer.  nfs_doio() clears B_INVAL (and nfs_asyncio() clears it
+ * indirectly), so synchronous reads can be issued without worrying about
+ * the B_INVAL state.  We have to be a little more careful when dealing
+ * with writes (see comments in nfs_write()) when extending a file past
+ * its EOF.
  */
 static struct buf *
 nfs_getcacheblk(vp, bn, size, p)
@@ -1359,7 +1369,7 @@ nfs_doio(bp, cr, p)
 		    bp->b_flags &= ~B_WRITEINPROG;
 		    if (retv == 0) {
 			    bp->b_dirtyoff = bp->b_dirtyend = 0;
-			    bp->b_flags &= ~B_NEEDCOMMIT;
+			    bp->b_flags &= ~(B_NEEDCOMMIT | B_CLUSTEROK);
 			    bp->b_resid = 0;
 			    biodone(bp);
 			    return (0);
@@ -1397,7 +1407,13 @@ nfs_doio(bp, cr, p)
 		 * When setting B_NEEDCOMMIT also set B_CLUSTEROK to try
 		 * to cluster the buffers needing commit.  This will allow
 		 * the system to submit a single commit rpc for the whole
-		 * cluster.
+		 * cluster.  We can do this even if the buffer is not 100% 
+		 * dirty (relative to the NFS blocksize), so we optimize the
+		 * append-to-file-case.
+		 *
+		 * (when clearing B_NEEDCOMMIT, B_CLUSTEROK must also be
+		 * cleared because write clustering only works for commit
+		 * rpc's, not for the data portion of the write).
 		 */
 
 		if (!error && iomode == NFSV3WRITE_UNSTABLE) {
@@ -1406,7 +1422,7 @@ nfs_doio(bp, cr, p)
 			&& bp->b_dirtyend == bp->b_bcount)
 			bp->b_flags |= B_CLUSTEROK;
 		} else {
-		    bp->b_flags &= ~B_NEEDCOMMIT;
+		    bp->b_flags &= ~(B_NEEDCOMMIT | B_CLUSTEROK);
 		}
 		bp->b_flags &= ~B_WRITEINPROG;
 
diff --git a/sys/nfs/nfs_common.c b/sys/nfs/nfs_common.c
index b62b21d31326..0da996b4c788 100644
--- a/sys/nfs/nfs_common.c
+++ b/sys/nfs/nfs_common.c
@@ -1357,11 +1357,13 @@ nfs_loadattrcache(vpp, mdp, dposp, vaper)
 					vap->va_size = np->n_size;
 				else
 					np->n_size = vap->va_size;
-			} else
+			} else {
 				np->n_size = vap->va_size;
+			}
 			vnode_pager_setsize(vp, np->n_size);
-		} else
+		} else {
 			np->n_size = vap->va_size;
+		}
 	}
 	np->n_attrstamp = time_second;
 	if (vaper != NULL) {
@@ -1444,11 +1446,13 @@ nfs_getattrcache(vp, vaper)
 					vap->va_size = np->n_size;
 				else
 					np->n_size = vap->va_size;
-			} else
+			} else {
 				np->n_size = vap->va_size;
+			}
 			vnode_pager_setsize(vp, np->n_size);
-		} else
+		} else {
 			np->n_size = vap->va_size;
+		}
 	}
 	bcopy((caddr_t)vap, (caddr_t)vaper, sizeof(struct vattr));
 	if (np->n_flag & NCHG) {
@@ -2150,7 +2154,11 @@ nfs_invaldir(vp)
  * The write verifier has changed (probably due to a server reboot), so all
  * B_NEEDCOMMIT blocks will have to be written again. Since they are on the
  * dirty block list as B_DELWRI, all this takes is clearing the B_NEEDCOMMIT
- * flag. Once done the new write verifier can be set for the mount point.
+ * and B_CLUSTEROK flags.  Once done the new write verifier can be set for the
+ * mount point.
+ *
+ * B_CLUSTEROK must be cleared along with B_NEEDCOMMIT because stage 1 data 
+ * writes are not clusterable.
  */
 void
 nfs_clearcommit(mp)
@@ -2171,7 +2179,7 @@ nfs_clearcommit(mp)
 			if (BUF_REFCNT(bp) == 0 &&
 			    (bp->b_flags & (B_DELWRI | B_NEEDCOMMIT))
 				== (B_DELWRI | B_NEEDCOMMIT))
-				bp->b_flags &= ~B_NEEDCOMMIT;
+				bp->b_flags &= ~(B_NEEDCOMMIT | B_CLUSTEROK);
 		}
 	}
 	splx(s);
diff --git a/sys/nfs/nfs_subs.c b/sys/nfs/nfs_subs.c
index b62b21d31326..0da996b4c788 100644
--- a/sys/nfs/nfs_subs.c
+++ b/sys/nfs/nfs_subs.c
@@ -1357,11 +1357,13 @@ nfs_loadattrcache(vpp, mdp, dposp, vaper)
 					vap->va_size = np->n_size;
 				else
 					np->n_size = vap->va_size;
-			} else
+			} else {
 				np->n_size = vap->va_size;
+			}
 			vnode_pager_setsize(vp, np->n_size);
-		} else
+		} else {
 			np->n_size = vap->va_size;
+		}
 	}
 	np->n_attrstamp = time_second;
 	if (vaper != NULL) {
@@ -1444,11 +1446,13 @@ nfs_getattrcache(vp, vaper)
 					vap->va_size = np->n_size;
 				else
 					np->n_size = vap->va_size;
-			} else
+			} else {
 				np->n_size = vap->va_size;
+			}
 			vnode_pager_setsize(vp, np->n_size);
-		} else
+		} else {
 			np->n_size = vap->va_size;
+		}
 	}
 	bcopy((caddr_t)vap, (caddr_t)vaper, sizeof(struct vattr));
 	if (np->n_flag & NCHG) {
@@ -2150,7 +2154,11 @@ nfs_invaldir(vp)
  * The write verifier has changed (probably due to a server reboot), so all
  * B_NEEDCOMMIT blocks will have to be written again. Since they are on the
  * dirty block list as B_DELWRI, all this takes is clearing the B_NEEDCOMMIT
- * flag. Once done the new write verifier can be set for the mount point.
+ * and B_CLUSTEROK flags.  Once done the new write verifier can be set for the
+ * mount point.
+ *
+ * B_CLUSTEROK must be cleared along with B_NEEDCOMMIT because stage 1 data 
+ * writes are not clusterable.
  */
 void
 nfs_clearcommit(mp)
@@ -2171,7 +2179,7 @@ nfs_clearcommit(mp)
 			if (BUF_REFCNT(bp) == 0 &&
 			    (bp->b_flags & (B_DELWRI | B_NEEDCOMMIT))
 				== (B_DELWRI | B_NEEDCOMMIT))
-				bp->b_flags &= ~B_NEEDCOMMIT;
+				bp->b_flags &= ~(B_NEEDCOMMIT | B_CLUSTEROK);
 		}
 	}
 	splx(s);
diff --git a/sys/nfs/nfs_vnops.c b/sys/nfs/nfs_vnops.c
index e53b295d8ca5..7774b8bd9202 100644
--- a/sys/nfs/nfs_vnops.c
+++ b/sys/nfs/nfs_vnops.c
@@ -2924,7 +2924,7 @@ nfs_flush(vp, cred, waitfor, p, commit)
 		 */
 		for (i = 0; i < bvecpos; i++) {
 			bp = bvec[i];
-			bp->b_flags &= ~(B_NEEDCOMMIT | B_WRITEINPROG);
+			bp->b_flags &= ~(B_NEEDCOMMIT | B_WRITEINPROG | B_CLUSTEROK);
 			if (retv) {
 				/*
 				 * Error, leave B_DELWRI intact
@@ -2988,7 +2988,7 @@ nfs_flush(vp, cred, waitfor, p, commit)
 		if (passone || !commit)
 		    bp->b_flags |= B_ASYNC;
 		else
-		    bp->b_flags |= (B_ASYNC | B_WRITEINPROG | B_NEEDCOMMIT);
+		    bp->b_flags |= B_ASYNC | B_WRITEINPROG;
 		splx(s);
 		VOP_BWRITE(bp->b_vp, bp);
 		goto loop;
@@ -3127,30 +3127,6 @@ nfs_writebp(bp, force, procp)
 	splx(s);
 
 	vfs_busy_pages(bp, 1);
-#if 0
-	/*
-	 * XXX removed, moved to nfs_doio XXX
-	 */
-	/*
-	 * If B_NEEDCOMMIT is set, a commit rpc may do the trick. If not
-	 * an actual write will have to be scheduled via. VOP_STRATEGY().
-	 * If B_WRITEINPROG is already set, then push it with a write anyhow.
-	 */
-	if ((oldflags & (B_NEEDCOMMIT | B_WRITEINPROG)) == B_NEEDCOMMIT) {
-		off = ((u_quad_t)bp->b_blkno) * DEV_BSIZE + bp->b_dirtyoff;
-		bp->b_flags |= B_WRITEINPROG;
-		retv = nfs_commit(bp->b_vp, off, bp->b_dirtyend-bp->b_dirtyoff,
-			bp->b_wcred, procp);
-		bp->b_flags &= ~B_WRITEINPROG;
-		if (!retv) {
-			bp->b_dirtyoff = bp->b_dirtyend = 0;
-			bp->b_flags &= ~B_NEEDCOMMIT;
-			biodone(bp);
-		} else if (retv == NFSERR_STALEWRITEVERF) {
-			nfs_clearcommit(bp->b_vp->v_mount);
-		}
-	}
-#endif
 	if (force)
 		bp->b_flags |= B_WRITEINPROG;
 	BUF_KERNPROC(bp);
diff --git a/sys/nfsclient/nfs_bio.c b/sys/nfsclient/nfs_bio.c
index 63cc67e7046d..10be58d709bb 100644
--- a/sys/nfsclient/nfs_bio.c
+++ b/sys/nfsclient/nfs_bio.c
@@ -928,7 +928,15 @@ nfs_write(ap)
 		}
 
 		error = uiomove((char *)bp->b_data + on, n, uio);
-		bp->b_flags &= ~B_NEEDCOMMIT;
+
+		/*
+		 * Since this block is being modified, it must be written
+		 * again and not just committed.  Since write clustering does
+		 * not work for the stage 1 data write, only the stage 2
+		 * commit rpc, we have to clear B_CLUSTEROK as well.
+		 */
+		bp->b_flags &= ~(B_NEEDCOMMIT | B_CLUSTEROK);
+
 		if (error) {
 			bp->b_flags |= B_ERROR;
 			brelse(bp);
@@ -950,12 +958,6 @@ nfs_write(ap)
 			vfs_bio_set_validclean(bp, on, n);
 		}
 
-		/*
-		 * Since this block is being modified, it must be written
-		 * again and not just committed.
-		 */
-		bp->b_flags &= ~B_NEEDCOMMIT;
-
 		/*
 		 * If the lease is non-cachable or IO_SYNC do bwrite().
 		 *
@@ -986,10 +988,18 @@ nfs_write(ap)
 
 /*
  * Get an nfs cache block.
+ *
  * Allocate a new one if the block isn't currently in the cache
  * and return the block marked busy. If the calling process is
  * interrupted by a signal for an interruptible mount point, return
  * NULL.
+ *
+ * The caller must carefully deal with the possible B_INVAL state of
+ * the buffer.  nfs_doio() clears B_INVAL (and nfs_asyncio() clears it
+ * indirectly), so synchronous reads can be issued without worrying about
+ * the B_INVAL state.  We have to be a little more careful when dealing
+ * with writes (see comments in nfs_write()) when extending a file past
+ * its EOF.
  */
 static struct buf *
 nfs_getcacheblk(vp, bn, size, p)
@@ -1359,7 +1369,7 @@ nfs_doio(bp, cr, p)
 		    bp->b_flags &= ~B_WRITEINPROG;
 		    if (retv == 0) {
 			    bp->b_dirtyoff = bp->b_dirtyend = 0;
-			    bp->b_flags &= ~B_NEEDCOMMIT;
+			    bp->b_flags &= ~(B_NEEDCOMMIT | B_CLUSTEROK);
 			    bp->b_resid = 0;
 			    biodone(bp);
 			    return (0);
@@ -1397,7 +1407,13 @@ nfs_doio(bp, cr, p)
 		 * When setting B_NEEDCOMMIT also set B_CLUSTEROK to try
 		 * to cluster the buffers needing commit.  This will allow
 		 * the system to submit a single commit rpc for the whole
-		 * cluster.
+		 * cluster.  We can do this even if the buffer is not 100% 
+		 * dirty (relative to the NFS blocksize), so we optimize the
+		 * append-to-file-case.
+		 *
+		 * (when clearing B_NEEDCOMMIT, B_CLUSTEROK must also be
+		 * cleared because write clustering only works for commit
+		 * rpc's, not for the data portion of the write).
 		 */
 
 		if (!error && iomode == NFSV3WRITE_UNSTABLE) {
@@ -1406,7 +1422,7 @@ nfs_doio(bp, cr, p)
 			&& bp->b_dirtyend == bp->b_bcount)
 			bp->b_flags |= B_CLUSTEROK;
 		} else {
-		    bp->b_flags &= ~B_NEEDCOMMIT;
+		    bp->b_flags &= ~(B_NEEDCOMMIT | B_CLUSTEROK);
 		}
 		bp->b_flags &= ~B_WRITEINPROG;
 
diff --git a/sys/nfsclient/nfs_subs.c b/sys/nfsclient/nfs_subs.c
index b62b21d31326..0da996b4c788 100644
--- a/sys/nfsclient/nfs_subs.c
+++ b/sys/nfsclient/nfs_subs.c
@@ -1357,11 +1357,13 @@ nfs_loadattrcache(vpp, mdp, dposp, vaper)
 					vap->va_size = np->n_size;
 				else
 					np->n_size = vap->va_size;
-			} else
+			} else {
 				np->n_size = vap->va_size;
+			}
 			vnode_pager_setsize(vp, np->n_size);
-		} else
+		} else {
 			np->n_size = vap->va_size;
+		}
 	}
 	np->n_attrstamp = time_second;
 	if (vaper != NULL) {
@@ -1444,11 +1446,13 @@ nfs_getattrcache(vp, vaper)
 					vap->va_size = np->n_size;
 				else
 					np->n_size = vap->va_size;
-			} else
+			} else {
 				np->n_size = vap->va_size;
+			}
 			vnode_pager_setsize(vp, np->n_size);
-		} else
+		} else {
 			np->n_size = vap->va_size;
+		}
 	}
 	bcopy((caddr_t)vap, (caddr_t)vaper, sizeof(struct vattr));
 	if (np->n_flag & NCHG) {
@@ -2150,7 +2154,11 @@ nfs_invaldir(vp)
  * The write verifier has changed (probably due to a server reboot), so all
  * B_NEEDCOMMIT blocks will have to be written again. Since they are on the
  * dirty block list as B_DELWRI, all this takes is clearing the B_NEEDCOMMIT
- * flag. Once done the new write verifier can be set for the mount point.
+ * and B_CLUSTEROK flags.  Once done the new write verifier can be set for the
+ * mount point.
+ *
+ * B_CLUSTEROK must be cleared along with B_NEEDCOMMIT because stage 1 data 
+ * writes are not clusterable.
  */
 void
 nfs_clearcommit(mp)
@@ -2171,7 +2179,7 @@ nfs_clearcommit(mp)
 			if (BUF_REFCNT(bp) == 0 &&
 			    (bp->b_flags & (B_DELWRI | B_NEEDCOMMIT))
 				== (B_DELWRI | B_NEEDCOMMIT))
-				bp->b_flags &= ~B_NEEDCOMMIT;
+				bp->b_flags &= ~(B_NEEDCOMMIT | B_CLUSTEROK);
 		}
 	}
 	splx(s);
diff --git a/sys/nfsclient/nfs_vnops.c b/sys/nfsclient/nfs_vnops.c
index e53b295d8ca5..7774b8bd9202 100644
--- a/sys/nfsclient/nfs_vnops.c
+++ b/sys/nfsclient/nfs_vnops.c
@@ -2924,7 +2924,7 @@ nfs_flush(vp, cred, waitfor, p, commit)
 		 */
 		for (i = 0; i < bvecpos; i++) {
 			bp = bvec[i];
-			bp->b_flags &= ~(B_NEEDCOMMIT | B_WRITEINPROG);
+			bp->b_flags &= ~(B_NEEDCOMMIT | B_WRITEINPROG | B_CLUSTEROK);
 			if (retv) {
 				/*
 				 * Error, leave B_DELWRI intact
@@ -2988,7 +2988,7 @@ nfs_flush(vp, cred, waitfor, p, commit)
 		if (passone || !commit)
 		    bp->b_flags |= B_ASYNC;
 		else
-		    bp->b_flags |= (B_ASYNC | B_WRITEINPROG | B_NEEDCOMMIT);
+		    bp->b_flags |= B_ASYNC | B_WRITEINPROG;
 		splx(s);
 		VOP_BWRITE(bp->b_vp, bp);
 		goto loop;
@@ -3127,30 +3127,6 @@ nfs_writebp(bp, force, procp)
 	splx(s);
 
 	vfs_busy_pages(bp, 1);
-#if 0
-	/*
-	 * XXX removed, moved to nfs_doio XXX
-	 */
-	/*
-	 * If B_NEEDCOMMIT is set, a commit rpc may do the trick. If not
-	 * an actual write will have to be scheduled via. VOP_STRATEGY().
-	 * If B_WRITEINPROG is already set, then push it with a write anyhow.
-	 */
-	if ((oldflags & (B_NEEDCOMMIT | B_WRITEINPROG)) == B_NEEDCOMMIT) {
-		off = ((u_quad_t)bp->b_blkno) * DEV_BSIZE + bp->b_dirtyoff;
-		bp->b_flags |= B_WRITEINPROG;
-		retv = nfs_commit(bp->b_vp, off, bp->b_dirtyend-bp->b_dirtyoff,
-			bp->b_wcred, procp);
-		bp->b_flags &= ~B_WRITEINPROG;
-		if (!retv) {
-			bp->b_dirtyoff = bp->b_dirtyend = 0;
-			bp->b_flags &= ~B_NEEDCOMMIT;
-			biodone(bp);
-		} else if (retv == NFSERR_STALEWRITEVERF) {
-			nfs_clearcommit(bp->b_vp->v_mount);
-		}
-	}
-#endif
 	if (force)
 		bp->b_flags |= B_WRITEINPROG;
 	BUF_KERNPROC(bp);
diff --git a/sys/nfsserver/nfs_srvsubs.c b/sys/nfsserver/nfs_srvsubs.c
index b62b21d31326..0da996b4c788 100644
--- a/sys/nfsserver/nfs_srvsubs.c
+++ b/sys/nfsserver/nfs_srvsubs.c
@@ -1357,11 +1357,13 @@ nfs_loadattrcache(vpp, mdp, dposp, vaper)
 					vap->va_size = np->n_size;
 				else
 					np->n_size = vap->va_size;
-			} else
+			} else {
 				np->n_size = vap->va_size;
+			}
 			vnode_pager_setsize(vp, np->n_size);
-		} else
+		} else {
 			np->n_size = vap->va_size;
+		}
 	}
 	np->n_attrstamp = time_second;
 	if (vaper != NULL) {
@@ -1444,11 +1446,13 @@ nfs_getattrcache(vp, vaper)
 					vap->va_size = np->n_size;
 				else
 					np->n_size = vap->va_size;
-			} else
+			} else {
 				np->n_size = vap->va_size;
+			}
 			vnode_pager_setsize(vp, np->n_size);
-		} else
+		} else {
 			np->n_size = vap->va_size;
+		}
 	}
 	bcopy((caddr_t)vap, (caddr_t)vaper, sizeof(struct vattr));
 	if (np->n_flag & NCHG) {
@@ -2150,7 +2154,11 @@ nfs_invaldir(vp)
  * The write verifier has changed (probably due to a server reboot), so all
  * B_NEEDCOMMIT blocks will have to be written again. Since they are on the
  * dirty block list as B_DELWRI, all this takes is clearing the B_NEEDCOMMIT
- * flag. Once done the new write verifier can be set for the mount point.
+ * and B_CLUSTEROK flags.  Once done the new write verifier can be set for the
+ * mount point.
+ *
+ * B_CLUSTEROK must be cleared along with B_NEEDCOMMIT because stage 1 data 
+ * writes are not clusterable.
  */
 void
 nfs_clearcommit(mp)
@@ -2171,7 +2179,7 @@ nfs_clearcommit(mp)
 			if (BUF_REFCNT(bp) == 0 &&
 			    (bp->b_flags & (B_DELWRI | B_NEEDCOMMIT))
 				== (B_DELWRI | B_NEEDCOMMIT))
-				bp->b_flags &= ~B_NEEDCOMMIT;
+				bp->b_flags &= ~(B_NEEDCOMMIT | B_CLUSTEROK);
 		}
 	}
 	splx(s);
diff --git a/sys/sys/bio.h b/sys/sys/bio.h
index f9a09bb657bc..f5d1879fe164 100644
--- a/sys/sys/bio.h
+++ b/sys/sys/bio.h
@@ -176,6 +176,13 @@ struct buf {
  *	B_MALLOC	Request that the buffer be allocated from the malloc
  *			pool, DEV_BSIZE aligned instead of PAGE_SIZE aligned.
  *
+ *	B_CLUSTEROK	This flag is typically set for B_DELWRI buffers
+ *			by filesystems that allow clustering when the buffer
+ *			is fully dirty and indicates that it may be clustered
+ *			with other adjacent dirty buffers.  Note the clustering
+ *			may not be used with the stage 1 data write under NFS
+ *			but may be used for the commit rpc portion.
+ *
  *	B_VMIO		Indicates that the buffer is tied into an VM object.
  *			The buffer's data is always PAGE_SIZE aligned even
  *			if b_bufsize and b_bcount are not.  ( b_bufsize is 
diff --git a/sys/sys/buf.h b/sys/sys/buf.h
index f9a09bb657bc..f5d1879fe164 100644
--- a/sys/sys/buf.h
+++ b/sys/sys/buf.h
@@ -176,6 +176,13 @@ struct buf {
  *	B_MALLOC	Request that the buffer be allocated from the malloc
  *			pool, DEV_BSIZE aligned instead of PAGE_SIZE aligned.
  *
+ *	B_CLUSTEROK	This flag is typically set for B_DELWRI buffers
+ *			by filesystems that allow clustering when the buffer
+ *			is fully dirty and indicates that it may be clustered
+ *			with other adjacent dirty buffers.  Note the clustering
+ *			may not be used with the stage 1 data write under NFS
+ *			but may be used for the commit rpc portion.
+ *
  *	B_VMIO		Indicates that the buffer is tied into an VM object.
  *			The buffer's data is always PAGE_SIZE aligned even
  *			if b_bufsize and b_bcount are not.  ( b_bufsize is