Synopsis of problem being fixed: Dan Nelson originally reported that

blocks of zeros could wind up in a file written to over NFS by a client. The problem only occurs a few times per several gigabytes of data. This problem turned out to be bug #3 below. bug #1: B_CLUSTEROK must be cleared when an NFS buffer is reverted from stage 2 (ready for commit rpc) to stage 1 (ready for write). Reversions can occur when a dirty NFS buffer is redirtied with new data. Otherwise the VFS/BIO system may end up thinking that a stage 1 NFS buffer is clusterable. Stage 1 NFS buffers are not clusterable. bug #2: B_CLUSTEROK was inappropriately set for a 'short' NFS buffer (short buffers only occur near the EOF of the file). Change to only set when the buffer is a full biosize (usually 8K). This bug has no effect but should be fixed in -current anyway. It need not be backported. bug #3: B_NEEDCOMMIT was inappropriately set in nfs_flush() (which is typically only called by the update daemon). nfs_flush() does a multi-pass loop but due to the lack of vnode locking it is possible for new buffers to be added to the dirtyblkhd list while a flush operation is going on. This may result in nfs_flush() setting B_NEEDCOMMIT on a buffer which has *NOT* yet gone through its stage 1 write, causing only the commit rpc to be made and thus causing the contents of the buffer to be thrown away (never sent to the server). The patch also contains some cleanup, which only applies to the commit into -current. Reviewed by: dg, julian Originally Reported by: Dan Nelson <dnelson@emsphone.com>
svn path=/head/; revision=54480
1999-12-12 06:09:57 +00:00 · 1999-12-12 06:09:57 +00:00 · ea94c7b968 · 2020-12-20 02:59:44 +00:00
commit ea94c7b968
parent 6f6da2f326
10 changed files with 126 additions and 96 deletions
--- a/sys/nfs/nfs_bio.c
+++ b/sys/nfs/nfs_bio.c
@ -928,7 +928,15 @@ nfs_write(ap)
 		}

 		error = uiomove((char *)bp->b_data + on, n, uio);
-		bp->b_flags &= ~B_NEEDCOMMIT;
+
+		/*
+		 * Since this block is being modified, it must be written
+		 * again and not just committed.  Since write clustering does
+		 * not work for the stage 1 data write, only the stage 2
+		 * commit rpc, we have to clear B_CLUSTEROK as well.
+		 */
+		bp->b_flags &= ~(B_NEEDCOMMIT | B_CLUSTEROK);
+
 		if (error) {
 			bp->b_flags |= B_ERROR;
 			brelse(bp);
@ -950,12 +958,6 @@ nfs_write(ap)
 			vfs_bio_set_validclean(bp, on, n);
 		}

-		/*
-		 * Since this block is being modified, it must be written
-		 * again and not just committed.
-		 */
-		bp->b_flags &= ~B_NEEDCOMMIT;
-
 		/*
 		 * If the lease is non-cachable or IO_SYNC do bwrite().
 		 *
@ -986,10 +988,18 @@ nfs_write(ap)

 /*
 * Get an nfs cache block.
+ *
 * Allocate a new one if the block isn't currently in the cache
 * and return the block marked busy. If the calling process is
 * interrupted by a signal for an interruptible mount point, return
 * NULL.
+ *
+ * The caller must carefully deal with the possible B_INVAL state of
+ * the buffer.  nfs_doio() clears B_INVAL (and nfs_asyncio() clears it
+ * indirectly), so synchronous reads can be issued without worrying about
+ * the B_INVAL state.  We have to be a little more careful when dealing
+ * with writes (see comments in nfs_write()) when extending a file past
+ * its EOF.
 */
 static struct buf *
 nfs_getcacheblk(vp, bn, size, p)
@ -1359,7 +1369,7 @@ nfs_doio(bp, cr, p)
 		    bp->b_flags &= ~B_WRITEINPROG;
 		    if (retv == 0) {
 			    bp->b_dirtyoff = bp->b_dirtyend = 0;
-			    bp->b_flags &= ~B_NEEDCOMMIT;
+			    bp->b_flags &= ~(B_NEEDCOMMIT | B_CLUSTEROK);
 			    bp->b_resid = 0;
 			    biodone(bp);
 			    return (0);
@ -1397,7 +1407,13 @@ nfs_doio(bp, cr, p)
 		 * When setting B_NEEDCOMMIT also set B_CLUSTEROK to try
 		 * to cluster the buffers needing commit.  This will allow
 		 * the system to submit a single commit rpc for the whole
-		 * cluster.
+		 * cluster.  We can do this even if the buffer is not 100% 
+		 * dirty (relative to the NFS blocksize), so we optimize the
+		 * append-to-file-case.
+		 *
+		 * (when clearing B_NEEDCOMMIT, B_CLUSTEROK must also be
+		 * cleared because write clustering only works for commit
+		 * rpc's, not for the data portion of the write).
 		 */

 		if (!error && iomode == NFSV3WRITE_UNSTABLE) {
@ -1406,7 +1422,7 @@ nfs_doio(bp, cr, p)
 			&& bp->b_dirtyend == bp->b_bcount)
 			bp->b_flags |= B_CLUSTEROK;
 		} else {
-		    bp->b_flags &= ~B_NEEDCOMMIT;
+		    bp->b_flags &= ~(B_NEEDCOMMIT | B_CLUSTEROK);
 		}
 		bp->b_flags &= ~B_WRITEINPROG;

--- a/sys/nfs/nfs_common.c
+++ b/sys/nfs/nfs_common.c
@ -1357,11 +1357,13 @@ nfs_loadattrcache(vpp, mdp, dposp, vaper)
 					vap->va_size = np->n_size;
 				else
 					np->n_size = vap->va_size;
-			} else
+			} else {
 				np->n_size = vap->va_size;
+			}
 			vnode_pager_setsize(vp, np->n_size);
-		} else
+		} else {
 			np->n_size = vap->va_size;
+		}
 	}
 	np->n_attrstamp = time_second;
 	if (vaper != NULL) {
@ -1444,11 +1446,13 @@ nfs_getattrcache(vp, vaper)
 					vap->va_size = np->n_size;
 				else
 					np->n_size = vap->va_size;
-			} else
+			} else {
 				np->n_size = vap->va_size;
+			}
 			vnode_pager_setsize(vp, np->n_size);
-		} else
+		} else {
 			np->n_size = vap->va_size;
+		}
 	}
 	bcopy((caddr_t)vap, (caddr_t)vaper, sizeof(struct vattr));
 	if (np->n_flag & NCHG) {
@ -2150,7 +2154,11 @@ nfs_invaldir(vp)
 * The write verifier has changed (probably due to a server reboot), so all
 * B_NEEDCOMMIT blocks will have to be written again. Since they are on the
 * dirty block list as B_DELWRI, all this takes is clearing the B_NEEDCOMMIT
- * flag. Once done the new write verifier can be set for the mount point.
+ * and B_CLUSTEROK flags.  Once done the new write verifier can be set for the
+ * mount point.
+ *
+ * B_CLUSTEROK must be cleared along with B_NEEDCOMMIT because stage 1 data 
+ * writes are not clusterable.
 */
 void
 nfs_clearcommit(mp)
@ -2171,7 +2179,7 @@ nfs_clearcommit(mp)
 			if (BUF_REFCNT(bp) == 0 &&
 			    (bp->b_flags & (B_DELWRI | B_NEEDCOMMIT))
 				== (B_DELWRI | B_NEEDCOMMIT))
-				bp->b_flags &= ~B_NEEDCOMMIT;
+				bp->b_flags &= ~(B_NEEDCOMMIT | B_CLUSTEROK);
 		}
 	}
 	splx(s);
--- a/sys/nfs/nfs_subs.c
+++ b/sys/nfs/nfs_subs.c
@ -1357,11 +1357,13 @@ nfs_loadattrcache(vpp, mdp, dposp, vaper)
 					vap->va_size = np->n_size;
 				else
 					np->n_size = vap->va_size;
-			} else
+			} else {
 				np->n_size = vap->va_size;
+			}
 			vnode_pager_setsize(vp, np->n_size);
-		} else
+		} else {
 			np->n_size = vap->va_size;
+		}
 	}
 	np->n_attrstamp = time_second;
 	if (vaper != NULL) {
@ -1444,11 +1446,13 @@ nfs_getattrcache(vp, vaper)
 					vap->va_size = np->n_size;
 				else
 					np->n_size = vap->va_size;
-			} else
+			} else {
 				np->n_size = vap->va_size;
+			}
 			vnode_pager_setsize(vp, np->n_size);
-		} else
+		} else {
 			np->n_size = vap->va_size;
+		}
 	}
 	bcopy((caddr_t)vap, (caddr_t)vaper, sizeof(struct vattr));
 	if (np->n_flag & NCHG) {
@ -2150,7 +2154,11 @@ nfs_invaldir(vp)
 * The write verifier has changed (probably due to a server reboot), so all
 * B_NEEDCOMMIT blocks will have to be written again. Since they are on the
 * dirty block list as B_DELWRI, all this takes is clearing the B_NEEDCOMMIT
- * flag. Once done the new write verifier can be set for the mount point.
+ * and B_CLUSTEROK flags.  Once done the new write verifier can be set for the
+ * mount point.
+ *
+ * B_CLUSTEROK must be cleared along with B_NEEDCOMMIT because stage 1 data 
+ * writes are not clusterable.
 */
 void
 nfs_clearcommit(mp)
@ -2171,7 +2179,7 @@ nfs_clearcommit(mp)
 			if (BUF_REFCNT(bp) == 0 &&
 			    (bp->b_flags & (B_DELWRI | B_NEEDCOMMIT))
 				== (B_DELWRI | B_NEEDCOMMIT))
-				bp->b_flags &= ~B_NEEDCOMMIT;
+				bp->b_flags &= ~(B_NEEDCOMMIT | B_CLUSTEROK);
 		}
 	}
 	splx(s);
--- a/sys/nfs/nfs_vnops.c
+++ b/sys/nfs/nfs_vnops.c
@ -2924,7 +2924,7 @@ nfs_flush(vp, cred, waitfor, p, commit)
 		 */
 		for (i = 0; i < bvecpos; i++) {
 			bp = bvec[i];
-			bp->b_flags &= ~(B_NEEDCOMMIT | B_WRITEINPROG);
+			bp->b_flags &= ~(B_NEEDCOMMIT | B_WRITEINPROG | B_CLUSTEROK);
 			if (retv) {
 				/*
 				 * Error, leave B_DELWRI intact
@ -2988,7 +2988,7 @@ nfs_flush(vp, cred, waitfor, p, commit)
 		if (passone || !commit)
 		    bp->b_flags |= B_ASYNC;
 		else
-		    bp->b_flags |= (B_ASYNC | B_WRITEINPROG | B_NEEDCOMMIT);
+		    bp->b_flags |= B_ASYNC | B_WRITEINPROG;
 		splx(s);
 		VOP_BWRITE(bp->b_vp, bp);
 		goto loop;
@ -3127,30 +3127,6 @@ nfs_writebp(bp, force, procp)
 	splx(s);

 	vfs_busy_pages(bp, 1);
-#if 0
-	/*
-	 * XXX removed, moved to nfs_doio XXX
-	 */
-	/*
-	 * If B_NEEDCOMMIT is set, a commit rpc may do the trick. If not
-	 * an actual write will have to be scheduled via. VOP_STRATEGY().
-	 * If B_WRITEINPROG is already set, then push it with a write anyhow.
-	 */
-	if ((oldflags & (B_NEEDCOMMIT | B_WRITEINPROG)) == B_NEEDCOMMIT) {
-		off = ((u_quad_t)bp->b_blkno) * DEV_BSIZE + bp->b_dirtyoff;
-		bp->b_flags |= B_WRITEINPROG;
-		retv = nfs_commit(bp->b_vp, off, bp->b_dirtyend-bp->b_dirtyoff,
-			bp->b_wcred, procp);
-		bp->b_flags &= ~B_WRITEINPROG;
-		if (!retv) {
-			bp->b_dirtyoff = bp->b_dirtyend = 0;
-			bp->b_flags &= ~B_NEEDCOMMIT;
-			biodone(bp);
-		} else if (retv == NFSERR_STALEWRITEVERF) {
-			nfs_clearcommit(bp->b_vp->v_mount);
-		}
-	}
-#endif
 	if (force)
 		bp->b_flags |= B_WRITEINPROG;
 	BUF_KERNPROC(bp);
--- a/sys/nfsclient/nfs_bio.c
+++ b/sys/nfsclient/nfs_bio.c
@ -928,7 +928,15 @@ nfs_write(ap)
 		}

 		error = uiomove((char *)bp->b_data + on, n, uio);
-		bp->b_flags &= ~B_NEEDCOMMIT;
+
+		/*
+		 * Since this block is being modified, it must be written
+		 * again and not just committed.  Since write clustering does
+		 * not work for the stage 1 data write, only the stage 2
+		 * commit rpc, we have to clear B_CLUSTEROK as well.
+		 */
+		bp->b_flags &= ~(B_NEEDCOMMIT | B_CLUSTEROK);
+
 		if (error) {
 			bp->b_flags |= B_ERROR;
 			brelse(bp);
@ -950,12 +958,6 @@ nfs_write(ap)
 			vfs_bio_set_validclean(bp, on, n);
 		}

-		/*
-		 * Since this block is being modified, it must be written
-		 * again and not just committed.
-		 */
-		bp->b_flags &= ~B_NEEDCOMMIT;
-
 		/*
 		 * If the lease is non-cachable or IO_SYNC do bwrite().
 		 *
@ -986,10 +988,18 @@ nfs_write(ap)

 /*
 * Get an nfs cache block.
+ *
 * Allocate a new one if the block isn't currently in the cache
 * and return the block marked busy. If the calling process is
 * interrupted by a signal for an interruptible mount point, return
 * NULL.
+ *
+ * The caller must carefully deal with the possible B_INVAL state of
+ * the buffer.  nfs_doio() clears B_INVAL (and nfs_asyncio() clears it
+ * indirectly), so synchronous reads can be issued without worrying about
+ * the B_INVAL state.  We have to be a little more careful when dealing
+ * with writes (see comments in nfs_write()) when extending a file past
+ * its EOF.
 */
 static struct buf *
 nfs_getcacheblk(vp, bn, size, p)
@ -1359,7 +1369,7 @@ nfs_doio(bp, cr, p)
 		    bp->b_flags &= ~B_WRITEINPROG;
 		    if (retv == 0) {
 			    bp->b_dirtyoff = bp->b_dirtyend = 0;
-			    bp->b_flags &= ~B_NEEDCOMMIT;
+			    bp->b_flags &= ~(B_NEEDCOMMIT | B_CLUSTEROK);
 			    bp->b_resid = 0;
 			    biodone(bp);
 			    return (0);
@ -1397,7 +1407,13 @@ nfs_doio(bp, cr, p)
 		 * When setting B_NEEDCOMMIT also set B_CLUSTEROK to try
 		 * to cluster the buffers needing commit.  This will allow
 		 * the system to submit a single commit rpc for the whole
-		 * cluster.
+		 * cluster.  We can do this even if the buffer is not 100% 
+		 * dirty (relative to the NFS blocksize), so we optimize the
+		 * append-to-file-case.
+		 *
+		 * (when clearing B_NEEDCOMMIT, B_CLUSTEROK must also be
+		 * cleared because write clustering only works for commit
+		 * rpc's, not for the data portion of the write).
 		 */

 		if (!error && iomode == NFSV3WRITE_UNSTABLE) {
@ -1406,7 +1422,7 @@ nfs_doio(bp, cr, p)
 			&& bp->b_dirtyend == bp->b_bcount)
 			bp->b_flags |= B_CLUSTEROK;
 		} else {
-		    bp->b_flags &= ~B_NEEDCOMMIT;
+		    bp->b_flags &= ~(B_NEEDCOMMIT | B_CLUSTEROK);
 		}
 		bp->b_flags &= ~B_WRITEINPROG;

--- a/sys/nfsclient/nfs_subs.c
+++ b/sys/nfsclient/nfs_subs.c
@ -1357,11 +1357,13 @@ nfs_loadattrcache(vpp, mdp, dposp, vaper)
 					vap->va_size = np->n_size;
 				else
 					np->n_size = vap->va_size;
-			} else
+			} else {
 				np->n_size = vap->va_size;
+			}
 			vnode_pager_setsize(vp, np->n_size);
-		} else
+		} else {
 			np->n_size = vap->va_size;
+		}
 	}
 	np->n_attrstamp = time_second;
 	if (vaper != NULL) {
@ -1444,11 +1446,13 @@ nfs_getattrcache(vp, vaper)
 					vap->va_size = np->n_size;
 				else
 					np->n_size = vap->va_size;
-			} else
+			} else {
 				np->n_size = vap->va_size;
+			}
 			vnode_pager_setsize(vp, np->n_size);
-		} else
+		} else {
 			np->n_size = vap->va_size;
+		}
 	}
 	bcopy((caddr_t)vap, (caddr_t)vaper, sizeof(struct vattr));
 	if (np->n_flag & NCHG) {
@ -2150,7 +2154,11 @@ nfs_invaldir(vp)
 * The write verifier has changed (probably due to a server reboot), so all
 * B_NEEDCOMMIT blocks will have to be written again. Since they are on the
 * dirty block list as B_DELWRI, all this takes is clearing the B_NEEDCOMMIT
- * flag. Once done the new write verifier can be set for the mount point.
+ * and B_CLUSTEROK flags.  Once done the new write verifier can be set for the
+ * mount point.
+ *
+ * B_CLUSTEROK must be cleared along with B_NEEDCOMMIT because stage 1 data 
+ * writes are not clusterable.
 */
 void
 nfs_clearcommit(mp)
@ -2171,7 +2179,7 @@ nfs_clearcommit(mp)
 			if (BUF_REFCNT(bp) == 0 &&
 			    (bp->b_flags & (B_DELWRI | B_NEEDCOMMIT))
 				== (B_DELWRI | B_NEEDCOMMIT))
-				bp->b_flags &= ~B_NEEDCOMMIT;
+				bp->b_flags &= ~(B_NEEDCOMMIT | B_CLUSTEROK);
 		}
 	}
 	splx(s);
--- a/sys/nfsclient/nfs_vnops.c
+++ b/sys/nfsclient/nfs_vnops.c
@ -2924,7 +2924,7 @@ nfs_flush(vp, cred, waitfor, p, commit)
 		 */
 		for (i = 0; i < bvecpos; i++) {
 			bp = bvec[i];
-			bp->b_flags &= ~(B_NEEDCOMMIT | B_WRITEINPROG);
+			bp->b_flags &= ~(B_NEEDCOMMIT | B_WRITEINPROG | B_CLUSTEROK);
 			if (retv) {
 				/*
 				 * Error, leave B_DELWRI intact
@ -2988,7 +2988,7 @@ nfs_flush(vp, cred, waitfor, p, commit)
 		if (passone || !commit)
 		    bp->b_flags |= B_ASYNC;
 		else
-		    bp->b_flags |= (B_ASYNC | B_WRITEINPROG | B_NEEDCOMMIT);
+		    bp->b_flags |= B_ASYNC | B_WRITEINPROG;
 		splx(s);
 		VOP_BWRITE(bp->b_vp, bp);
 		goto loop;
@ -3127,30 +3127,6 @@ nfs_writebp(bp, force, procp)
 	splx(s);

 	vfs_busy_pages(bp, 1);
-#if 0
-	/*
-	 * XXX removed, moved to nfs_doio XXX
-	 */
-	/*
-	 * If B_NEEDCOMMIT is set, a commit rpc may do the trick. If not
-	 * an actual write will have to be scheduled via. VOP_STRATEGY().
-	 * If B_WRITEINPROG is already set, then push it with a write anyhow.
-	 */
-	if ((oldflags & (B_NEEDCOMMIT | B_WRITEINPROG)) == B_NEEDCOMMIT) {
-		off = ((u_quad_t)bp->b_blkno) * DEV_BSIZE + bp->b_dirtyoff;
-		bp->b_flags |= B_WRITEINPROG;
-		retv = nfs_commit(bp->b_vp, off, bp->b_dirtyend-bp->b_dirtyoff,
-			bp->b_wcred, procp);
-		bp->b_flags &= ~B_WRITEINPROG;
-		if (!retv) {
-			bp->b_dirtyoff = bp->b_dirtyend = 0;
-			bp->b_flags &= ~B_NEEDCOMMIT;
-			biodone(bp);
-		} else if (retv == NFSERR_STALEWRITEVERF) {
-			nfs_clearcommit(bp->b_vp->v_mount);
-		}
-	}
-#endif
 	if (force)
 		bp->b_flags |= B_WRITEINPROG;
 	BUF_KERNPROC(bp);
--- a/sys/nfsserver/nfs_srvsubs.c
+++ b/sys/nfsserver/nfs_srvsubs.c
@ -1357,11 +1357,13 @@ nfs_loadattrcache(vpp, mdp, dposp, vaper)
 					vap->va_size = np->n_size;
 				else
 					np->n_size = vap->va_size;
-			} else
+			} else {
 				np->n_size = vap->va_size;
+			}
 			vnode_pager_setsize(vp, np->n_size);
-		} else
+		} else {
 			np->n_size = vap->va_size;
+		}
 	}
 	np->n_attrstamp = time_second;
 	if (vaper != NULL) {
@ -1444,11 +1446,13 @@ nfs_getattrcache(vp, vaper)
 					vap->va_size = np->n_size;
 				else
 					np->n_size = vap->va_size;
-			} else
+			} else {
 				np->n_size = vap->va_size;
+			}
 			vnode_pager_setsize(vp, np->n_size);
-		} else
+		} else {
 			np->n_size = vap->va_size;
+		}
 	}
 	bcopy((caddr_t)vap, (caddr_t)vaper, sizeof(struct vattr));
 	if (np->n_flag & NCHG) {
@ -2150,7 +2154,11 @@ nfs_invaldir(vp)
 * The write verifier has changed (probably due to a server reboot), so all
 * B_NEEDCOMMIT blocks will have to be written again. Since they are on the
 * dirty block list as B_DELWRI, all this takes is clearing the B_NEEDCOMMIT
- * flag. Once done the new write verifier can be set for the mount point.
+ * and B_CLUSTEROK flags.  Once done the new write verifier can be set for the
+ * mount point.
+ *
+ * B_CLUSTEROK must be cleared along with B_NEEDCOMMIT because stage 1 data 
+ * writes are not clusterable.
 */
 void
 nfs_clearcommit(mp)
@ -2171,7 +2179,7 @@ nfs_clearcommit(mp)
 			if (BUF_REFCNT(bp) == 0 &&
 			    (bp->b_flags & (B_DELWRI | B_NEEDCOMMIT))
 				== (B_DELWRI | B_NEEDCOMMIT))
-				bp->b_flags &= ~B_NEEDCOMMIT;
+				bp->b_flags &= ~(B_NEEDCOMMIT | B_CLUSTEROK);
 		}
 	}
 	splx(s);
--- a/sys/sys/bio.h
+++ b/sys/sys/bio.h
@ -176,6 +176,13 @@ struct buf {
 *	B_MALLOC	Request that the buffer be allocated from the malloc
 *			pool, DEV_BSIZE aligned instead of PAGE_SIZE aligned.
 *
+ *	B_CLUSTEROK	This flag is typically set for B_DELWRI buffers
+ *			by filesystems that allow clustering when the buffer
+ *			is fully dirty and indicates that it may be clustered
+ *			with other adjacent dirty buffers.  Note the clustering
+ *			may not be used with the stage 1 data write under NFS
+ *			but may be used for the commit rpc portion.
+ *
 *	B_VMIO		Indicates that the buffer is tied into an VM object.
 *			The buffer's data is always PAGE_SIZE aligned even
 *			if b_bufsize and b_bcount are not.  ( b_bufsize is 
--- a/sys/sys/buf.h
+++ b/sys/sys/buf.h
@ -176,6 +176,13 @@ struct buf {
 *	B_MALLOC	Request that the buffer be allocated from the malloc
 *			pool, DEV_BSIZE aligned instead of PAGE_SIZE aligned.
 *
+ *	B_CLUSTEROK	This flag is typically set for B_DELWRI buffers
+ *			by filesystems that allow clustering when the buffer
+ *			is fully dirty and indicates that it may be clustered
+ *			with other adjacent dirty buffers.  Note the clustering
+ *			may not be used with the stage 1 data write under NFS
+ *			but may be used for the commit rpc portion.
+ *
 *	B_VMIO		Indicates that the buffer is tied into an VM object.
 *			The buffer's data is always PAGE_SIZE aligned even
 *			if b_bufsize and b_bcount are not.  ( b_bufsize is