diff --git a/sys/nfs/nfs_bio.c b/sys/nfs/nfs_bio.c index 63cc67e7046d..10be58d709bb 100644 --- a/sys/nfs/nfs_bio.c +++ b/sys/nfs/nfs_bio.c @@ -928,7 +928,15 @@ nfs_write(ap) } error = uiomove((char *)bp->b_data + on, n, uio); - bp->b_flags &= ~B_NEEDCOMMIT; + + /* + * Since this block is being modified, it must be written + * again and not just committed. Since write clustering does + * not work for the stage 1 data write, only the stage 2 + * commit rpc, we have to clear B_CLUSTEROK as well. + */ + bp->b_flags &= ~(B_NEEDCOMMIT | B_CLUSTEROK); + if (error) { bp->b_flags |= B_ERROR; brelse(bp); @@ -950,12 +958,6 @@ nfs_write(ap) vfs_bio_set_validclean(bp, on, n); } - /* - * Since this block is being modified, it must be written - * again and not just committed. - */ - bp->b_flags &= ~B_NEEDCOMMIT; - /* * If the lease is non-cachable or IO_SYNC do bwrite(). * @@ -986,10 +988,18 @@ nfs_write(ap) /* * Get an nfs cache block. + * * Allocate a new one if the block isn't currently in the cache * and return the block marked busy. If the calling process is * interrupted by a signal for an interruptible mount point, return * NULL. + * + * The caller must carefully deal with the possible B_INVAL state of + * the buffer. nfs_doio() clears B_INVAL (and nfs_asyncio() clears it + * indirectly), so synchronous reads can be issued without worrying about + * the B_INVAL state. We have to be a little more careful when dealing + * with writes (see comments in nfs_write()) when extending a file past + * its EOF. */ static struct buf * nfs_getcacheblk(vp, bn, size, p) @@ -1359,7 +1369,7 @@ nfs_doio(bp, cr, p) bp->b_flags &= ~B_WRITEINPROG; if (retv == 0) { bp->b_dirtyoff = bp->b_dirtyend = 0; - bp->b_flags &= ~B_NEEDCOMMIT; + bp->b_flags &= ~(B_NEEDCOMMIT | B_CLUSTEROK); bp->b_resid = 0; biodone(bp); return (0); @@ -1397,7 +1407,13 @@ nfs_doio(bp, cr, p) * When setting B_NEEDCOMMIT also set B_CLUSTEROK to try * to cluster the buffers needing commit. This will allow * the system to submit a single commit rpc for the whole - * cluster. + * cluster. We can do this even if the buffer is not 100% + * dirty (relative to the NFS blocksize), so we optimize the + * append-to-file-case. + * + * (when clearing B_NEEDCOMMIT, B_CLUSTEROK must also be + * cleared because write clustering only works for commit + * rpc's, not for the data portion of the write). */ if (!error && iomode == NFSV3WRITE_UNSTABLE) { @@ -1406,7 +1422,7 @@ nfs_doio(bp, cr, p) && bp->b_dirtyend == bp->b_bcount) bp->b_flags |= B_CLUSTEROK; } else { - bp->b_flags &= ~B_NEEDCOMMIT; + bp->b_flags &= ~(B_NEEDCOMMIT | B_CLUSTEROK); } bp->b_flags &= ~B_WRITEINPROG; diff --git a/sys/nfs/nfs_common.c b/sys/nfs/nfs_common.c index b62b21d31326..0da996b4c788 100644 --- a/sys/nfs/nfs_common.c +++ b/sys/nfs/nfs_common.c @@ -1357,11 +1357,13 @@ nfs_loadattrcache(vpp, mdp, dposp, vaper) vap->va_size = np->n_size; else np->n_size = vap->va_size; - } else + } else { np->n_size = vap->va_size; + } vnode_pager_setsize(vp, np->n_size); - } else + } else { np->n_size = vap->va_size; + } } np->n_attrstamp = time_second; if (vaper != NULL) { @@ -1444,11 +1446,13 @@ nfs_getattrcache(vp, vaper) vap->va_size = np->n_size; else np->n_size = vap->va_size; - } else + } else { np->n_size = vap->va_size; + } vnode_pager_setsize(vp, np->n_size); - } else + } else { np->n_size = vap->va_size; + } } bcopy((caddr_t)vap, (caddr_t)vaper, sizeof(struct vattr)); if (np->n_flag & NCHG) { @@ -2150,7 +2154,11 @@ nfs_invaldir(vp) * The write verifier has changed (probably due to a server reboot), so all * B_NEEDCOMMIT blocks will have to be written again. Since they are on the * dirty block list as B_DELWRI, all this takes is clearing the B_NEEDCOMMIT - * flag. Once done the new write verifier can be set for the mount point. + * and B_CLUSTEROK flags. Once done the new write verifier can be set for the + * mount point. + * + * B_CLUSTEROK must be cleared along with B_NEEDCOMMIT because stage 1 data + * writes are not clusterable. */ void nfs_clearcommit(mp) @@ -2171,7 +2179,7 @@ nfs_clearcommit(mp) if (BUF_REFCNT(bp) == 0 && (bp->b_flags & (B_DELWRI | B_NEEDCOMMIT)) == (B_DELWRI | B_NEEDCOMMIT)) - bp->b_flags &= ~B_NEEDCOMMIT; + bp->b_flags &= ~(B_NEEDCOMMIT | B_CLUSTEROK); } } splx(s); diff --git a/sys/nfs/nfs_subs.c b/sys/nfs/nfs_subs.c index b62b21d31326..0da996b4c788 100644 --- a/sys/nfs/nfs_subs.c +++ b/sys/nfs/nfs_subs.c @@ -1357,11 +1357,13 @@ nfs_loadattrcache(vpp, mdp, dposp, vaper) vap->va_size = np->n_size; else np->n_size = vap->va_size; - } else + } else { np->n_size = vap->va_size; + } vnode_pager_setsize(vp, np->n_size); - } else + } else { np->n_size = vap->va_size; + } } np->n_attrstamp = time_second; if (vaper != NULL) { @@ -1444,11 +1446,13 @@ nfs_getattrcache(vp, vaper) vap->va_size = np->n_size; else np->n_size = vap->va_size; - } else + } else { np->n_size = vap->va_size; + } vnode_pager_setsize(vp, np->n_size); - } else + } else { np->n_size = vap->va_size; + } } bcopy((caddr_t)vap, (caddr_t)vaper, sizeof(struct vattr)); if (np->n_flag & NCHG) { @@ -2150,7 +2154,11 @@ nfs_invaldir(vp) * The write verifier has changed (probably due to a server reboot), so all * B_NEEDCOMMIT blocks will have to be written again. Since they are on the * dirty block list as B_DELWRI, all this takes is clearing the B_NEEDCOMMIT - * flag. Once done the new write verifier can be set for the mount point. + * and B_CLUSTEROK flags. Once done the new write verifier can be set for the + * mount point. + * + * B_CLUSTEROK must be cleared along with B_NEEDCOMMIT because stage 1 data + * writes are not clusterable. */ void nfs_clearcommit(mp) @@ -2171,7 +2179,7 @@ nfs_clearcommit(mp) if (BUF_REFCNT(bp) == 0 && (bp->b_flags & (B_DELWRI | B_NEEDCOMMIT)) == (B_DELWRI | B_NEEDCOMMIT)) - bp->b_flags &= ~B_NEEDCOMMIT; + bp->b_flags &= ~(B_NEEDCOMMIT | B_CLUSTEROK); } } splx(s); diff --git a/sys/nfs/nfs_vnops.c b/sys/nfs/nfs_vnops.c index e53b295d8ca5..7774b8bd9202 100644 --- a/sys/nfs/nfs_vnops.c +++ b/sys/nfs/nfs_vnops.c @@ -2924,7 +2924,7 @@ nfs_flush(vp, cred, waitfor, p, commit) */ for (i = 0; i < bvecpos; i++) { bp = bvec[i]; - bp->b_flags &= ~(B_NEEDCOMMIT | B_WRITEINPROG); + bp->b_flags &= ~(B_NEEDCOMMIT | B_WRITEINPROG | B_CLUSTEROK); if (retv) { /* * Error, leave B_DELWRI intact @@ -2988,7 +2988,7 @@ nfs_flush(vp, cred, waitfor, p, commit) if (passone || !commit) bp->b_flags |= B_ASYNC; else - bp->b_flags |= (B_ASYNC | B_WRITEINPROG | B_NEEDCOMMIT); + bp->b_flags |= B_ASYNC | B_WRITEINPROG; splx(s); VOP_BWRITE(bp->b_vp, bp); goto loop; @@ -3127,30 +3127,6 @@ nfs_writebp(bp, force, procp) splx(s); vfs_busy_pages(bp, 1); -#if 0 - /* - * XXX removed, moved to nfs_doio XXX - */ - /* - * If B_NEEDCOMMIT is set, a commit rpc may do the trick. If not - * an actual write will have to be scheduled via. VOP_STRATEGY(). - * If B_WRITEINPROG is already set, then push it with a write anyhow. - */ - if ((oldflags & (B_NEEDCOMMIT | B_WRITEINPROG)) == B_NEEDCOMMIT) { - off = ((u_quad_t)bp->b_blkno) * DEV_BSIZE + bp->b_dirtyoff; - bp->b_flags |= B_WRITEINPROG; - retv = nfs_commit(bp->b_vp, off, bp->b_dirtyend-bp->b_dirtyoff, - bp->b_wcred, procp); - bp->b_flags &= ~B_WRITEINPROG; - if (!retv) { - bp->b_dirtyoff = bp->b_dirtyend = 0; - bp->b_flags &= ~B_NEEDCOMMIT; - biodone(bp); - } else if (retv == NFSERR_STALEWRITEVERF) { - nfs_clearcommit(bp->b_vp->v_mount); - } - } -#endif if (force) bp->b_flags |= B_WRITEINPROG; BUF_KERNPROC(bp); diff --git a/sys/nfsclient/nfs_bio.c b/sys/nfsclient/nfs_bio.c index 63cc67e7046d..10be58d709bb 100644 --- a/sys/nfsclient/nfs_bio.c +++ b/sys/nfsclient/nfs_bio.c @@ -928,7 +928,15 @@ nfs_write(ap) } error = uiomove((char *)bp->b_data + on, n, uio); - bp->b_flags &= ~B_NEEDCOMMIT; + + /* + * Since this block is being modified, it must be written + * again and not just committed. Since write clustering does + * not work for the stage 1 data write, only the stage 2 + * commit rpc, we have to clear B_CLUSTEROK as well. + */ + bp->b_flags &= ~(B_NEEDCOMMIT | B_CLUSTEROK); + if (error) { bp->b_flags |= B_ERROR; brelse(bp); @@ -950,12 +958,6 @@ nfs_write(ap) vfs_bio_set_validclean(bp, on, n); } - /* - * Since this block is being modified, it must be written - * again and not just committed. - */ - bp->b_flags &= ~B_NEEDCOMMIT; - /* * If the lease is non-cachable or IO_SYNC do bwrite(). * @@ -986,10 +988,18 @@ nfs_write(ap) /* * Get an nfs cache block. + * * Allocate a new one if the block isn't currently in the cache * and return the block marked busy. If the calling process is * interrupted by a signal for an interruptible mount point, return * NULL. + * + * The caller must carefully deal with the possible B_INVAL state of + * the buffer. nfs_doio() clears B_INVAL (and nfs_asyncio() clears it + * indirectly), so synchronous reads can be issued without worrying about + * the B_INVAL state. We have to be a little more careful when dealing + * with writes (see comments in nfs_write()) when extending a file past + * its EOF. */ static struct buf * nfs_getcacheblk(vp, bn, size, p) @@ -1359,7 +1369,7 @@ nfs_doio(bp, cr, p) bp->b_flags &= ~B_WRITEINPROG; if (retv == 0) { bp->b_dirtyoff = bp->b_dirtyend = 0; - bp->b_flags &= ~B_NEEDCOMMIT; + bp->b_flags &= ~(B_NEEDCOMMIT | B_CLUSTEROK); bp->b_resid = 0; biodone(bp); return (0); @@ -1397,7 +1407,13 @@ nfs_doio(bp, cr, p) * When setting B_NEEDCOMMIT also set B_CLUSTEROK to try * to cluster the buffers needing commit. This will allow * the system to submit a single commit rpc for the whole - * cluster. + * cluster. We can do this even if the buffer is not 100% + * dirty (relative to the NFS blocksize), so we optimize the + * append-to-file-case. + * + * (when clearing B_NEEDCOMMIT, B_CLUSTEROK must also be + * cleared because write clustering only works for commit + * rpc's, not for the data portion of the write). */ if (!error && iomode == NFSV3WRITE_UNSTABLE) { @@ -1406,7 +1422,7 @@ nfs_doio(bp, cr, p) && bp->b_dirtyend == bp->b_bcount) bp->b_flags |= B_CLUSTEROK; } else { - bp->b_flags &= ~B_NEEDCOMMIT; + bp->b_flags &= ~(B_NEEDCOMMIT | B_CLUSTEROK); } bp->b_flags &= ~B_WRITEINPROG; diff --git a/sys/nfsclient/nfs_subs.c b/sys/nfsclient/nfs_subs.c index b62b21d31326..0da996b4c788 100644 --- a/sys/nfsclient/nfs_subs.c +++ b/sys/nfsclient/nfs_subs.c @@ -1357,11 +1357,13 @@ nfs_loadattrcache(vpp, mdp, dposp, vaper) vap->va_size = np->n_size; else np->n_size = vap->va_size; - } else + } else { np->n_size = vap->va_size; + } vnode_pager_setsize(vp, np->n_size); - } else + } else { np->n_size = vap->va_size; + } } np->n_attrstamp = time_second; if (vaper != NULL) { @@ -1444,11 +1446,13 @@ nfs_getattrcache(vp, vaper) vap->va_size = np->n_size; else np->n_size = vap->va_size; - } else + } else { np->n_size = vap->va_size; + } vnode_pager_setsize(vp, np->n_size); - } else + } else { np->n_size = vap->va_size; + } } bcopy((caddr_t)vap, (caddr_t)vaper, sizeof(struct vattr)); if (np->n_flag & NCHG) { @@ -2150,7 +2154,11 @@ nfs_invaldir(vp) * The write verifier has changed (probably due to a server reboot), so all * B_NEEDCOMMIT blocks will have to be written again. Since they are on the * dirty block list as B_DELWRI, all this takes is clearing the B_NEEDCOMMIT - * flag. Once done the new write verifier can be set for the mount point. + * and B_CLUSTEROK flags. Once done the new write verifier can be set for the + * mount point. + * + * B_CLUSTEROK must be cleared along with B_NEEDCOMMIT because stage 1 data + * writes are not clusterable. */ void nfs_clearcommit(mp) @@ -2171,7 +2179,7 @@ nfs_clearcommit(mp) if (BUF_REFCNT(bp) == 0 && (bp->b_flags & (B_DELWRI | B_NEEDCOMMIT)) == (B_DELWRI | B_NEEDCOMMIT)) - bp->b_flags &= ~B_NEEDCOMMIT; + bp->b_flags &= ~(B_NEEDCOMMIT | B_CLUSTEROK); } } splx(s); diff --git a/sys/nfsclient/nfs_vnops.c b/sys/nfsclient/nfs_vnops.c index e53b295d8ca5..7774b8bd9202 100644 --- a/sys/nfsclient/nfs_vnops.c +++ b/sys/nfsclient/nfs_vnops.c @@ -2924,7 +2924,7 @@ nfs_flush(vp, cred, waitfor, p, commit) */ for (i = 0; i < bvecpos; i++) { bp = bvec[i]; - bp->b_flags &= ~(B_NEEDCOMMIT | B_WRITEINPROG); + bp->b_flags &= ~(B_NEEDCOMMIT | B_WRITEINPROG | B_CLUSTEROK); if (retv) { /* * Error, leave B_DELWRI intact @@ -2988,7 +2988,7 @@ nfs_flush(vp, cred, waitfor, p, commit) if (passone || !commit) bp->b_flags |= B_ASYNC; else - bp->b_flags |= (B_ASYNC | B_WRITEINPROG | B_NEEDCOMMIT); + bp->b_flags |= B_ASYNC | B_WRITEINPROG; splx(s); VOP_BWRITE(bp->b_vp, bp); goto loop; @@ -3127,30 +3127,6 @@ nfs_writebp(bp, force, procp) splx(s); vfs_busy_pages(bp, 1); -#if 0 - /* - * XXX removed, moved to nfs_doio XXX - */ - /* - * If B_NEEDCOMMIT is set, a commit rpc may do the trick. If not - * an actual write will have to be scheduled via. VOP_STRATEGY(). - * If B_WRITEINPROG is already set, then push it with a write anyhow. - */ - if ((oldflags & (B_NEEDCOMMIT | B_WRITEINPROG)) == B_NEEDCOMMIT) { - off = ((u_quad_t)bp->b_blkno) * DEV_BSIZE + bp->b_dirtyoff; - bp->b_flags |= B_WRITEINPROG; - retv = nfs_commit(bp->b_vp, off, bp->b_dirtyend-bp->b_dirtyoff, - bp->b_wcred, procp); - bp->b_flags &= ~B_WRITEINPROG; - if (!retv) { - bp->b_dirtyoff = bp->b_dirtyend = 0; - bp->b_flags &= ~B_NEEDCOMMIT; - biodone(bp); - } else if (retv == NFSERR_STALEWRITEVERF) { - nfs_clearcommit(bp->b_vp->v_mount); - } - } -#endif if (force) bp->b_flags |= B_WRITEINPROG; BUF_KERNPROC(bp); diff --git a/sys/nfsserver/nfs_srvsubs.c b/sys/nfsserver/nfs_srvsubs.c index b62b21d31326..0da996b4c788 100644 --- a/sys/nfsserver/nfs_srvsubs.c +++ b/sys/nfsserver/nfs_srvsubs.c @@ -1357,11 +1357,13 @@ nfs_loadattrcache(vpp, mdp, dposp, vaper) vap->va_size = np->n_size; else np->n_size = vap->va_size; - } else + } else { np->n_size = vap->va_size; + } vnode_pager_setsize(vp, np->n_size); - } else + } else { np->n_size = vap->va_size; + } } np->n_attrstamp = time_second; if (vaper != NULL) { @@ -1444,11 +1446,13 @@ nfs_getattrcache(vp, vaper) vap->va_size = np->n_size; else np->n_size = vap->va_size; - } else + } else { np->n_size = vap->va_size; + } vnode_pager_setsize(vp, np->n_size); - } else + } else { np->n_size = vap->va_size; + } } bcopy((caddr_t)vap, (caddr_t)vaper, sizeof(struct vattr)); if (np->n_flag & NCHG) { @@ -2150,7 +2154,11 @@ nfs_invaldir(vp) * The write verifier has changed (probably due to a server reboot), so all * B_NEEDCOMMIT blocks will have to be written again. Since they are on the * dirty block list as B_DELWRI, all this takes is clearing the B_NEEDCOMMIT - * flag. Once done the new write verifier can be set for the mount point. + * and B_CLUSTEROK flags. Once done the new write verifier can be set for the + * mount point. + * + * B_CLUSTEROK must be cleared along with B_NEEDCOMMIT because stage 1 data + * writes are not clusterable. */ void nfs_clearcommit(mp) @@ -2171,7 +2179,7 @@ nfs_clearcommit(mp) if (BUF_REFCNT(bp) == 0 && (bp->b_flags & (B_DELWRI | B_NEEDCOMMIT)) == (B_DELWRI | B_NEEDCOMMIT)) - bp->b_flags &= ~B_NEEDCOMMIT; + bp->b_flags &= ~(B_NEEDCOMMIT | B_CLUSTEROK); } } splx(s); diff --git a/sys/sys/bio.h b/sys/sys/bio.h index f9a09bb657bc..f5d1879fe164 100644 --- a/sys/sys/bio.h +++ b/sys/sys/bio.h @@ -176,6 +176,13 @@ struct buf { * B_MALLOC Request that the buffer be allocated from the malloc * pool, DEV_BSIZE aligned instead of PAGE_SIZE aligned. * + * B_CLUSTEROK This flag is typically set for B_DELWRI buffers + * by filesystems that allow clustering when the buffer + * is fully dirty and indicates that it may be clustered + * with other adjacent dirty buffers. Note the clustering + * may not be used with the stage 1 data write under NFS + * but may be used for the commit rpc portion. + * * B_VMIO Indicates that the buffer is tied into an VM object. * The buffer's data is always PAGE_SIZE aligned even * if b_bufsize and b_bcount are not. ( b_bufsize is diff --git a/sys/sys/buf.h b/sys/sys/buf.h index f9a09bb657bc..f5d1879fe164 100644 --- a/sys/sys/buf.h +++ b/sys/sys/buf.h @@ -176,6 +176,13 @@ struct buf { * B_MALLOC Request that the buffer be allocated from the malloc * pool, DEV_BSIZE aligned instead of PAGE_SIZE aligned. * + * B_CLUSTEROK This flag is typically set for B_DELWRI buffers + * by filesystems that allow clustering when the buffer + * is fully dirty and indicates that it may be clustered + * with other adjacent dirty buffers. Note the clustering + * may not be used with the stage 1 data write under NFS + * but may be used for the commit rpc portion. + * * B_VMIO Indicates that the buffer is tied into an VM object. * The buffer's data is always PAGE_SIZE aligned even * if b_bufsize and b_bcount are not. ( b_bufsize is