Synopsis of problem being fixed: Dan Nelson originally reported that

blocks of zeros could wind up in a file written to over NFS by a client.
    The problem only occurs a few times per several gigabytes of data.   This
    problem turned out to be bug #3 below.

    bug #1:

        B_CLUSTEROK must be cleared when an NFS buffer is reverted from
        stage 2 (ready for commit rpc) to stage 1 (ready for write).
        Reversions can occur when a dirty NFS buffer is redirtied with new
        data.

        Otherwise the VFS/BIO system may end up thinking that a stage 1
        NFS buffer is clusterable.  Stage 1 NFS buffers are not clusterable.

    bug #2:

        B_CLUSTEROK was inappropriately set for a 'short' NFS buffer (short
        buffers only occur near the EOF of the file).  Change to only set
        when the buffer is a full biosize (usually 8K).  This bug has no
        effect but should be fixed in -current anyway.  It need not be
        backported.

    bug #3:

        B_NEEDCOMMIT was inappropriately set in nfs_flush() (which is
	typically only called by the update daemon).  nfs_flush()
        does a multi-pass loop but due to the lack of vnode locking it
        is possible for new buffers to be added to the dirtyblkhd list
        while a flush operation is going on.  This may result in nfs_flush()
        setting B_NEEDCOMMIT on a buffer which has *NOT* yet gone through its
        stage 1 write, causing only the commit rpc to be made and thus
        causing the contents of the buffer to be thrown away (never sent to
        the server).

    The patch also contains some cleanup, which only applies to the commit
    into -current.

Reviewed by:	dg, julian
Originally Reported by: Dan Nelson <dnelson@emsphone.com>
This commit is contained in:
Matthew Dillon 1999-12-12 06:09:57 +00:00
parent 6f6da2f326
commit ea94c7b968
Notes: svn2git 2020-12-20 02:59:44 +00:00
svn path=/head/; revision=54480
10 changed files with 126 additions and 96 deletions

View File

@ -928,7 +928,15 @@ nfs_write(ap)
}
error = uiomove((char *)bp->b_data + on, n, uio);
bp->b_flags &= ~B_NEEDCOMMIT;
/*
* Since this block is being modified, it must be written
* again and not just committed. Since write clustering does
* not work for the stage 1 data write, only the stage 2
* commit rpc, we have to clear B_CLUSTEROK as well.
*/
bp->b_flags &= ~(B_NEEDCOMMIT | B_CLUSTEROK);
if (error) {
bp->b_flags |= B_ERROR;
brelse(bp);
@ -950,12 +958,6 @@ nfs_write(ap)
vfs_bio_set_validclean(bp, on, n);
}
/*
* Since this block is being modified, it must be written
* again and not just committed.
*/
bp->b_flags &= ~B_NEEDCOMMIT;
/*
* If the lease is non-cachable or IO_SYNC do bwrite().
*
@ -986,10 +988,18 @@ nfs_write(ap)
/*
* Get an nfs cache block.
*
* Allocate a new one if the block isn't currently in the cache
* and return the block marked busy. If the calling process is
* interrupted by a signal for an interruptible mount point, return
* NULL.
*
* The caller must carefully deal with the possible B_INVAL state of
* the buffer. nfs_doio() clears B_INVAL (and nfs_asyncio() clears it
* indirectly), so synchronous reads can be issued without worrying about
* the B_INVAL state. We have to be a little more careful when dealing
* with writes (see comments in nfs_write()) when extending a file past
* its EOF.
*/
static struct buf *
nfs_getcacheblk(vp, bn, size, p)
@ -1359,7 +1369,7 @@ nfs_doio(bp, cr, p)
bp->b_flags &= ~B_WRITEINPROG;
if (retv == 0) {
bp->b_dirtyoff = bp->b_dirtyend = 0;
bp->b_flags &= ~B_NEEDCOMMIT;
bp->b_flags &= ~(B_NEEDCOMMIT | B_CLUSTEROK);
bp->b_resid = 0;
biodone(bp);
return (0);
@ -1397,7 +1407,13 @@ nfs_doio(bp, cr, p)
* When setting B_NEEDCOMMIT also set B_CLUSTEROK to try
* to cluster the buffers needing commit. This will allow
* the system to submit a single commit rpc for the whole
* cluster.
* cluster. We can do this even if the buffer is not 100%
* dirty (relative to the NFS blocksize), so we optimize the
* append-to-file-case.
*
* (when clearing B_NEEDCOMMIT, B_CLUSTEROK must also be
* cleared because write clustering only works for commit
* rpc's, not for the data portion of the write).
*/
if (!error && iomode == NFSV3WRITE_UNSTABLE) {
@ -1406,7 +1422,7 @@ nfs_doio(bp, cr, p)
&& bp->b_dirtyend == bp->b_bcount)
bp->b_flags |= B_CLUSTEROK;
} else {
bp->b_flags &= ~B_NEEDCOMMIT;
bp->b_flags &= ~(B_NEEDCOMMIT | B_CLUSTEROK);
}
bp->b_flags &= ~B_WRITEINPROG;

View File

@ -1357,11 +1357,13 @@ nfs_loadattrcache(vpp, mdp, dposp, vaper)
vap->va_size = np->n_size;
else
np->n_size = vap->va_size;
} else
} else {
np->n_size = vap->va_size;
}
vnode_pager_setsize(vp, np->n_size);
} else
} else {
np->n_size = vap->va_size;
}
}
np->n_attrstamp = time_second;
if (vaper != NULL) {
@ -1444,11 +1446,13 @@ nfs_getattrcache(vp, vaper)
vap->va_size = np->n_size;
else
np->n_size = vap->va_size;
} else
} else {
np->n_size = vap->va_size;
}
vnode_pager_setsize(vp, np->n_size);
} else
} else {
np->n_size = vap->va_size;
}
}
bcopy((caddr_t)vap, (caddr_t)vaper, sizeof(struct vattr));
if (np->n_flag & NCHG) {
@ -2150,7 +2154,11 @@ nfs_invaldir(vp)
* The write verifier has changed (probably due to a server reboot), so all
* B_NEEDCOMMIT blocks will have to be written again. Since they are on the
* dirty block list as B_DELWRI, all this takes is clearing the B_NEEDCOMMIT
* flag. Once done the new write verifier can be set for the mount point.
* and B_CLUSTEROK flags. Once done the new write verifier can be set for the
* mount point.
*
* B_CLUSTEROK must be cleared along with B_NEEDCOMMIT because stage 1 data
* writes are not clusterable.
*/
void
nfs_clearcommit(mp)
@ -2171,7 +2179,7 @@ nfs_clearcommit(mp)
if (BUF_REFCNT(bp) == 0 &&
(bp->b_flags & (B_DELWRI | B_NEEDCOMMIT))
== (B_DELWRI | B_NEEDCOMMIT))
bp->b_flags &= ~B_NEEDCOMMIT;
bp->b_flags &= ~(B_NEEDCOMMIT | B_CLUSTEROK);
}
}
splx(s);

View File

@ -1357,11 +1357,13 @@ nfs_loadattrcache(vpp, mdp, dposp, vaper)
vap->va_size = np->n_size;
else
np->n_size = vap->va_size;
} else
} else {
np->n_size = vap->va_size;
}
vnode_pager_setsize(vp, np->n_size);
} else
} else {
np->n_size = vap->va_size;
}
}
np->n_attrstamp = time_second;
if (vaper != NULL) {
@ -1444,11 +1446,13 @@ nfs_getattrcache(vp, vaper)
vap->va_size = np->n_size;
else
np->n_size = vap->va_size;
} else
} else {
np->n_size = vap->va_size;
}
vnode_pager_setsize(vp, np->n_size);
} else
} else {
np->n_size = vap->va_size;
}
}
bcopy((caddr_t)vap, (caddr_t)vaper, sizeof(struct vattr));
if (np->n_flag & NCHG) {
@ -2150,7 +2154,11 @@ nfs_invaldir(vp)
* The write verifier has changed (probably due to a server reboot), so all
* B_NEEDCOMMIT blocks will have to be written again. Since they are on the
* dirty block list as B_DELWRI, all this takes is clearing the B_NEEDCOMMIT
* flag. Once done the new write verifier can be set for the mount point.
* and B_CLUSTEROK flags. Once done the new write verifier can be set for the
* mount point.
*
* B_CLUSTEROK must be cleared along with B_NEEDCOMMIT because stage 1 data
* writes are not clusterable.
*/
void
nfs_clearcommit(mp)
@ -2171,7 +2179,7 @@ nfs_clearcommit(mp)
if (BUF_REFCNT(bp) == 0 &&
(bp->b_flags & (B_DELWRI | B_NEEDCOMMIT))
== (B_DELWRI | B_NEEDCOMMIT))
bp->b_flags &= ~B_NEEDCOMMIT;
bp->b_flags &= ~(B_NEEDCOMMIT | B_CLUSTEROK);
}
}
splx(s);

View File

@ -2924,7 +2924,7 @@ nfs_flush(vp, cred, waitfor, p, commit)
*/
for (i = 0; i < bvecpos; i++) {
bp = bvec[i];
bp->b_flags &= ~(B_NEEDCOMMIT | B_WRITEINPROG);
bp->b_flags &= ~(B_NEEDCOMMIT | B_WRITEINPROG | B_CLUSTEROK);
if (retv) {
/*
* Error, leave B_DELWRI intact
@ -2988,7 +2988,7 @@ nfs_flush(vp, cred, waitfor, p, commit)
if (passone || !commit)
bp->b_flags |= B_ASYNC;
else
bp->b_flags |= (B_ASYNC | B_WRITEINPROG | B_NEEDCOMMIT);
bp->b_flags |= B_ASYNC | B_WRITEINPROG;
splx(s);
VOP_BWRITE(bp->b_vp, bp);
goto loop;
@ -3127,30 +3127,6 @@ nfs_writebp(bp, force, procp)
splx(s);
vfs_busy_pages(bp, 1);
#if 0
/*
* XXX removed, moved to nfs_doio XXX
*/
/*
* If B_NEEDCOMMIT is set, a commit rpc may do the trick. If not
* an actual write will have to be scheduled via. VOP_STRATEGY().
* If B_WRITEINPROG is already set, then push it with a write anyhow.
*/
if ((oldflags & (B_NEEDCOMMIT | B_WRITEINPROG)) == B_NEEDCOMMIT) {
off = ((u_quad_t)bp->b_blkno) * DEV_BSIZE + bp->b_dirtyoff;
bp->b_flags |= B_WRITEINPROG;
retv = nfs_commit(bp->b_vp, off, bp->b_dirtyend-bp->b_dirtyoff,
bp->b_wcred, procp);
bp->b_flags &= ~B_WRITEINPROG;
if (!retv) {
bp->b_dirtyoff = bp->b_dirtyend = 0;
bp->b_flags &= ~B_NEEDCOMMIT;
biodone(bp);
} else if (retv == NFSERR_STALEWRITEVERF) {
nfs_clearcommit(bp->b_vp->v_mount);
}
}
#endif
if (force)
bp->b_flags |= B_WRITEINPROG;
BUF_KERNPROC(bp);

View File

@ -928,7 +928,15 @@ nfs_write(ap)
}
error = uiomove((char *)bp->b_data + on, n, uio);
bp->b_flags &= ~B_NEEDCOMMIT;
/*
* Since this block is being modified, it must be written
* again and not just committed. Since write clustering does
* not work for the stage 1 data write, only the stage 2
* commit rpc, we have to clear B_CLUSTEROK as well.
*/
bp->b_flags &= ~(B_NEEDCOMMIT | B_CLUSTEROK);
if (error) {
bp->b_flags |= B_ERROR;
brelse(bp);
@ -950,12 +958,6 @@ nfs_write(ap)
vfs_bio_set_validclean(bp, on, n);
}
/*
* Since this block is being modified, it must be written
* again and not just committed.
*/
bp->b_flags &= ~B_NEEDCOMMIT;
/*
* If the lease is non-cachable or IO_SYNC do bwrite().
*
@ -986,10 +988,18 @@ nfs_write(ap)
/*
* Get an nfs cache block.
*
* Allocate a new one if the block isn't currently in the cache
* and return the block marked busy. If the calling process is
* interrupted by a signal for an interruptible mount point, return
* NULL.
*
* The caller must carefully deal with the possible B_INVAL state of
* the buffer. nfs_doio() clears B_INVAL (and nfs_asyncio() clears it
* indirectly), so synchronous reads can be issued without worrying about
* the B_INVAL state. We have to be a little more careful when dealing
* with writes (see comments in nfs_write()) when extending a file past
* its EOF.
*/
static struct buf *
nfs_getcacheblk(vp, bn, size, p)
@ -1359,7 +1369,7 @@ nfs_doio(bp, cr, p)
bp->b_flags &= ~B_WRITEINPROG;
if (retv == 0) {
bp->b_dirtyoff = bp->b_dirtyend = 0;
bp->b_flags &= ~B_NEEDCOMMIT;
bp->b_flags &= ~(B_NEEDCOMMIT | B_CLUSTEROK);
bp->b_resid = 0;
biodone(bp);
return (0);
@ -1397,7 +1407,13 @@ nfs_doio(bp, cr, p)
* When setting B_NEEDCOMMIT also set B_CLUSTEROK to try
* to cluster the buffers needing commit. This will allow
* the system to submit a single commit rpc for the whole
* cluster.
* cluster. We can do this even if the buffer is not 100%
* dirty (relative to the NFS blocksize), so we optimize the
* append-to-file-case.
*
* (when clearing B_NEEDCOMMIT, B_CLUSTEROK must also be
* cleared because write clustering only works for commit
* rpc's, not for the data portion of the write).
*/
if (!error && iomode == NFSV3WRITE_UNSTABLE) {
@ -1406,7 +1422,7 @@ nfs_doio(bp, cr, p)
&& bp->b_dirtyend == bp->b_bcount)
bp->b_flags |= B_CLUSTEROK;
} else {
bp->b_flags &= ~B_NEEDCOMMIT;
bp->b_flags &= ~(B_NEEDCOMMIT | B_CLUSTEROK);
}
bp->b_flags &= ~B_WRITEINPROG;

View File

@ -1357,11 +1357,13 @@ nfs_loadattrcache(vpp, mdp, dposp, vaper)
vap->va_size = np->n_size;
else
np->n_size = vap->va_size;
} else
} else {
np->n_size = vap->va_size;
}
vnode_pager_setsize(vp, np->n_size);
} else
} else {
np->n_size = vap->va_size;
}
}
np->n_attrstamp = time_second;
if (vaper != NULL) {
@ -1444,11 +1446,13 @@ nfs_getattrcache(vp, vaper)
vap->va_size = np->n_size;
else
np->n_size = vap->va_size;
} else
} else {
np->n_size = vap->va_size;
}
vnode_pager_setsize(vp, np->n_size);
} else
} else {
np->n_size = vap->va_size;
}
}
bcopy((caddr_t)vap, (caddr_t)vaper, sizeof(struct vattr));
if (np->n_flag & NCHG) {
@ -2150,7 +2154,11 @@ nfs_invaldir(vp)
* The write verifier has changed (probably due to a server reboot), so all
* B_NEEDCOMMIT blocks will have to be written again. Since they are on the
* dirty block list as B_DELWRI, all this takes is clearing the B_NEEDCOMMIT
* flag. Once done the new write verifier can be set for the mount point.
* and B_CLUSTEROK flags. Once done the new write verifier can be set for the
* mount point.
*
* B_CLUSTEROK must be cleared along with B_NEEDCOMMIT because stage 1 data
* writes are not clusterable.
*/
void
nfs_clearcommit(mp)
@ -2171,7 +2179,7 @@ nfs_clearcommit(mp)
if (BUF_REFCNT(bp) == 0 &&
(bp->b_flags & (B_DELWRI | B_NEEDCOMMIT))
== (B_DELWRI | B_NEEDCOMMIT))
bp->b_flags &= ~B_NEEDCOMMIT;
bp->b_flags &= ~(B_NEEDCOMMIT | B_CLUSTEROK);
}
}
splx(s);

View File

@ -2924,7 +2924,7 @@ nfs_flush(vp, cred, waitfor, p, commit)
*/
for (i = 0; i < bvecpos; i++) {
bp = bvec[i];
bp->b_flags &= ~(B_NEEDCOMMIT | B_WRITEINPROG);
bp->b_flags &= ~(B_NEEDCOMMIT | B_WRITEINPROG | B_CLUSTEROK);
if (retv) {
/*
* Error, leave B_DELWRI intact
@ -2988,7 +2988,7 @@ nfs_flush(vp, cred, waitfor, p, commit)
if (passone || !commit)
bp->b_flags |= B_ASYNC;
else
bp->b_flags |= (B_ASYNC | B_WRITEINPROG | B_NEEDCOMMIT);
bp->b_flags |= B_ASYNC | B_WRITEINPROG;
splx(s);
VOP_BWRITE(bp->b_vp, bp);
goto loop;
@ -3127,30 +3127,6 @@ nfs_writebp(bp, force, procp)
splx(s);
vfs_busy_pages(bp, 1);
#if 0
/*
* XXX removed, moved to nfs_doio XXX
*/
/*
* If B_NEEDCOMMIT is set, a commit rpc may do the trick. If not
* an actual write will have to be scheduled via. VOP_STRATEGY().
* If B_WRITEINPROG is already set, then push it with a write anyhow.
*/
if ((oldflags & (B_NEEDCOMMIT | B_WRITEINPROG)) == B_NEEDCOMMIT) {
off = ((u_quad_t)bp->b_blkno) * DEV_BSIZE + bp->b_dirtyoff;
bp->b_flags |= B_WRITEINPROG;
retv = nfs_commit(bp->b_vp, off, bp->b_dirtyend-bp->b_dirtyoff,
bp->b_wcred, procp);
bp->b_flags &= ~B_WRITEINPROG;
if (!retv) {
bp->b_dirtyoff = bp->b_dirtyend = 0;
bp->b_flags &= ~B_NEEDCOMMIT;
biodone(bp);
} else if (retv == NFSERR_STALEWRITEVERF) {
nfs_clearcommit(bp->b_vp->v_mount);
}
}
#endif
if (force)
bp->b_flags |= B_WRITEINPROG;
BUF_KERNPROC(bp);

View File

@ -1357,11 +1357,13 @@ nfs_loadattrcache(vpp, mdp, dposp, vaper)
vap->va_size = np->n_size;
else
np->n_size = vap->va_size;
} else
} else {
np->n_size = vap->va_size;
}
vnode_pager_setsize(vp, np->n_size);
} else
} else {
np->n_size = vap->va_size;
}
}
np->n_attrstamp = time_second;
if (vaper != NULL) {
@ -1444,11 +1446,13 @@ nfs_getattrcache(vp, vaper)
vap->va_size = np->n_size;
else
np->n_size = vap->va_size;
} else
} else {
np->n_size = vap->va_size;
}
vnode_pager_setsize(vp, np->n_size);
} else
} else {
np->n_size = vap->va_size;
}
}
bcopy((caddr_t)vap, (caddr_t)vaper, sizeof(struct vattr));
if (np->n_flag & NCHG) {
@ -2150,7 +2154,11 @@ nfs_invaldir(vp)
* The write verifier has changed (probably due to a server reboot), so all
* B_NEEDCOMMIT blocks will have to be written again. Since they are on the
* dirty block list as B_DELWRI, all this takes is clearing the B_NEEDCOMMIT
* flag. Once done the new write verifier can be set for the mount point.
* and B_CLUSTEROK flags. Once done the new write verifier can be set for the
* mount point.
*
* B_CLUSTEROK must be cleared along with B_NEEDCOMMIT because stage 1 data
* writes are not clusterable.
*/
void
nfs_clearcommit(mp)
@ -2171,7 +2179,7 @@ nfs_clearcommit(mp)
if (BUF_REFCNT(bp) == 0 &&
(bp->b_flags & (B_DELWRI | B_NEEDCOMMIT))
== (B_DELWRI | B_NEEDCOMMIT))
bp->b_flags &= ~B_NEEDCOMMIT;
bp->b_flags &= ~(B_NEEDCOMMIT | B_CLUSTEROK);
}
}
splx(s);

View File

@ -176,6 +176,13 @@ struct buf {
* B_MALLOC Request that the buffer be allocated from the malloc
* pool, DEV_BSIZE aligned instead of PAGE_SIZE aligned.
*
* B_CLUSTEROK This flag is typically set for B_DELWRI buffers
* by filesystems that allow clustering when the buffer
* is fully dirty and indicates that it may be clustered
* with other adjacent dirty buffers. Note the clustering
* may not be used with the stage 1 data write under NFS
* but may be used for the commit rpc portion.
*
* B_VMIO Indicates that the buffer is tied into an VM object.
* The buffer's data is always PAGE_SIZE aligned even
* if b_bufsize and b_bcount are not. ( b_bufsize is

View File

@ -176,6 +176,13 @@ struct buf {
* B_MALLOC Request that the buffer be allocated from the malloc
* pool, DEV_BSIZE aligned instead of PAGE_SIZE aligned.
*
* B_CLUSTEROK This flag is typically set for B_DELWRI buffers
* by filesystems that allow clustering when the buffer
* is fully dirty and indicates that it may be clustered
* with other adjacent dirty buffers. Note the clustering
* may not be used with the stage 1 data write under NFS
* but may be used for the commit rpc portion.
*
* B_VMIO Indicates that the buffer is tied into an VM object.
* The buffer's data is always PAGE_SIZE aligned even
* if b_bufsize and b_bcount are not. ( b_bufsize is