This fixes a large number of bugs in our NFS client side code. A recent

commit by Kirk also fixed a softupdates bug that could easily be triggered
by server side NFS.

	* An edge case with shared R+W mmap()'s and truncate whereby
	  the system would inappropriately clear the dirty bits on
	  still-dirty data.  (applicable to all filesystems)

	  THIS FIX TEMPORARILY DISABLED PENDING FURTHER TESTING.
	  see vm/vm_page.c line 1641

	* The straddle case for VM pages and buffer cache buffers when
	  truncating.  (applicable to NFS client side)

	* Possible SMP database corruption due to vm_pager_unmap_page()
	  not clearing the TLB for the other cpu's.  (applicable to NFS
	  client side but could effect all filesystems).  Note: not
	  considered serious since the corruption occurs beyond the file
	  EOF.

	* When flusing a dirty buffer due to B_CACHE getting cleared,
	  we were accidently setting B_CACHE again (that is, bwrite() sets
	  B_CACHE), when we really want it to stay clear after the write
	  is complete.  This resulted in a corrupt buffer.  (applicable
	  to all filesystems but probably only triggered by NFS)

	* We have to call vtruncbuf() when ftruncate()ing to remove
	  any buffer cache buffers.  This is still tentitive, I may
	  be able to remove it due to the second bug fix.  (applicable
	  to NFS client side)

	* vnode_pager_setsize() race against nfs_vinvalbuf()... we have
	  to set n_size before calling nfs_vinvalbuf or the NFS code
	  may recursively vnode_pager_setsize() to the original value
	  before the truncate.  This is what was causing the user mmap
	  bus faults in the nfs tester program.  (applicable to NFS
	  client side)

	* Fix to softupdates (see ufs/ffs/ffs_inode.c 1.73, commit made
	  by Kirk).

Testing program written by: Avadis Tevanian, Jr.
Testing program supplied by: jkh / Apple (see Dec2001 posting to freebsd-hackers with Subject 'NFS: How to make FreeBS fall on its face in one easy step')
MFC after:	1 week
This commit is contained in:
Matthew Dillon 2001-12-14 01:16:57 +00:00
parent 247e80274d
commit 3ebeaf5984
6 changed files with 128 additions and 11 deletions

View File

@ -2271,9 +2271,21 @@ loop:
* to softupdates re-dirtying the buffer. In the latter
* case, B_CACHE is set after the first write completes,
* preventing further loops.
* NOTE! b*write() sets B_CACHE. If we cleared B_CACHE
* above while extending the buffer, we cannot allow the
* buffer to remain with B_CACHE set after the write
* completes or it will represent a corrupt state. To
* deal with this we set B_NOCACHE to scrap the buffer
* after the write.
*
* We might be able to do something fancy, like setting
* B_CACHE in bwrite() except if B_DELWRI is already set,
* so the below call doesn't set B_CACHE, but that gets real
* confusing. This is much easier.
*/
if ((bp->b_flags & (B_CACHE|B_DELWRI)) == B_DELWRI) {
bp->b_flags |= B_NOCACHE;
BUF_WRITE(bp);
goto loop;
}

View File

@ -288,6 +288,8 @@ void nfs_clearcommit(struct mount *);
int nfs_writebp(struct buf *, int, struct thread *);
int nfs_fsinfo(struct nfsmount *, struct vnode *, struct ucred *,
struct thread *);
int nfs_meta_setsize (struct vnode *, struct ucred *,
struct thread *, u_quad_t);
#endif /* _KERNEL */

View File

@ -206,8 +206,14 @@ nfs_getpages(struct vop_getpages_args *ap)
vm_page_set_validclean(m, 0, size - toff);
/* handled by vm_fault now */
/* vm_page_zero_invalid(m, TRUE); */
} else {
/*
* Read operation was short. If no error occured
* we may have hit a zero-fill section. We simply
* leave valid set to 0.
*/
;
}
if (i != ap->a_reqpage) {
/*
* Whether or not to leave the page activated is up in
@ -831,9 +837,7 @@ again:
else
bcount = np->n_size - (off_t)lbn * biosize;
}
bp = nfs_getcacheblk(vp, lbn, bcount, td);
if (uio->uio_offset + n > np->n_size) {
np->n_size = uio->uio_offset + n;
np->n_flag |= NMODIFIED;
@ -1299,11 +1303,13 @@ nfs_doio(struct buf *bp, struct ucred *cr, struct thread *td)
io.iov_len = uiop->uio_resid = bp->b_bcount;
io.iov_base = bp->b_data;
uiop->uio_rw = UIO_READ;
switch (vp->v_type) {
case VREG:
uiop->uio_offset = ((off_t)bp->b_blkno) * DEV_BSIZE;
nfsstats.read_bios++;
error = nfs_readrpc(vp, uiop, cr);
if (!error) {
if (uiop->uio_resid) {
/*
@ -1315,7 +1321,7 @@ nfs_doio(struct buf *bp, struct ucred *cr, struct thread *td)
* writes, but that is not possible any longer.
*/
int nread = bp->b_bcount - uiop->uio_resid;
int left = bp->b_bcount - nread;
int left = uiop->uio_resid;
if (left > 0)
bzero((char *)bp->b_data + nread, left);
@ -1485,3 +1491,48 @@ nfs_doio(struct buf *bp, struct ucred *cr, struct thread *td)
bufdone(bp);
return (error);
}
/*
* Used to aid in handling ftruncate() operations on the NFS client side.
* Truncation creates a number of special problems for NFS. We have to
* throw away VM pages and buffer cache buffers that are beyond EOF, and
* we have to properly handle VM pages or (potentially dirty) buffers
* that straddle the truncation point.
*/
int
nfs_meta_setsize(struct vnode *vp, struct ucred *cred, struct thread *td, u_quad_t nsize)
{
struct nfsnode *np = VTONFS(vp);
u_quad_t tsize = np->n_size;
int biosize = vp->v_mount->mnt_stat.f_iosize;
int error = 0;
np->n_size = nsize;
if (np->n_size < tsize) {
struct buf *bp;
daddr_t lbn;
int bufsize;
/*
* vtruncbuf() doesn't get the buffer overlapping the
* truncation point. We may have a B_DELWRI and/or B_CACHE
* buffer that now needs to be truncated.
*/
error = vtruncbuf(vp, cred, td, nsize, biosize);
lbn = nsize / biosize;
bufsize = nsize & (biosize - 1);
bp = nfs_getcacheblk(vp, lbn, bufsize, td);
if (bp->b_dirtyoff > bp->b_bcount)
bp->b_dirtyoff = bp->b_bcount;
if (bp->b_dirtyend > bp->b_bcount)
bp->b_dirtyend = bp->b_bcount;
bp->b_flags |= B_RELBUF; /* don't leave garbage around */
brelse(bp);
} else {
vnode_pager_setsize(vp, nsize);
}
return(error);
}

View File

@ -643,7 +643,18 @@ nfs_setattr(struct vop_setattr_args *ap)
*/
if (vp->v_mount->mnt_flag & MNT_RDONLY)
return (EROFS);
vnode_pager_setsize(vp, vap->va_size);
/*
* We run vnode_pager_setsize() early (why?),
* we must set np->n_size now to avoid vinvalbuf
* V_SAVE races that might setsize a lower
* value.
*/
tsize = np->n_size;
error = nfs_meta_setsize(vp, ap->a_cred,
ap->a_td, vap->va_size);
if (np->n_flag & NMODIFIED) {
if (vap->va_size == 0)
error = nfs_vinvalbuf(vp, 0,
@ -656,8 +667,7 @@ nfs_setattr(struct vop_setattr_args *ap)
return (error);
}
}
tsize = np->n_size;
np->n_size = np->n_vattr.va_size = vap->va_size;
np->n_vattr.va_size = vap->va_size;
};
} else if ((vap->va_mtime.tv_sec != VNOVAL ||
vap->va_atime.tv_sec != VNOVAL) && (np->n_flag & NMODIFIED) &&
@ -1049,10 +1059,12 @@ nfs_readrpc(struct vnode *vp, struct uio *uiop, struct ucred *cred)
m_freem(mrep);
tsiz -= retlen;
if (v3) {
if (eof || retlen == 0)
if (eof || retlen == 0) {
tsiz = 0;
} else if (retlen < len)
}
} else if (retlen < len) {
tsiz = 0;
}
}
nfsmout:
return (error);
@ -3114,3 +3126,4 @@ nfsfifo_close(struct vop_close_args *ap)
}
return (VOCALL(fifo_vnodeop_p, VOFFSET(vop_close), ap));
}

View File

@ -1630,10 +1630,24 @@ vm_page_set_validclean(vm_page_t m, int base, int size)
* use this opportunity to clear the PG_NOSYNC flag. If a process
* takes a write fault on a MAP_NOSYNC memory area the flag will
* be set again.
*
* We set valid bits inclusive of any overlap, but we can only
* clear dirty bits for DEV_BSIZE chunks that are fully within
* the range.
*/
pagebits = vm_page_bits(base, size);
m->valid |= pagebits;
#if 0 /* NOT YET */
if ((frag = base & (DEV_BSIZE - 1)) != 0) {
frag = DEV_BSIZE - frag;
base += frag;
size -= frag;
if (size < 0)
size = 0;
}
pagebits = vm_page_bits(base, size & (DEV_BSIZE - 1));
#endif
m->dirty &= ~pagebits;
if (base == 0 && size == PAGE_SIZE) {
pmap_clear_modify(m);

View File

@ -298,14 +298,18 @@ vnode_pager_setsize(vp, nsize)
}
/*
* this gets rid of garbage at the end of a page that is now
* only partially backed by the vnode...
* only partially backed by the vnode.
*
* XXX for some reason (I don't know yet), if we take a
* completely invalid page and mark it partially valid
* it can screw up NFS reads, so we don't allow the case.
*/
if (nsize & PAGE_MASK) {
vm_offset_t kva;
vm_page_t m;
m = vm_page_lookup(object, OFF_TO_IDX(nsize));
if (m) {
if (m && m->valid) {
int base = (int)nsize & PAGE_MASK;
int size = PAGE_SIZE - base;
@ -317,6 +321,20 @@ vnode_pager_setsize(vp, nsize)
bzero((caddr_t)kva + base, size);
vm_pager_unmap_page(kva);
/*
* XXX work around SMP data integrity race
* by unmapping the page from user processes.
* The garbage we just cleared may be mapped
* to a user process running on another cpu
* and this code is not running through normal
* I/O channels which handle SMP issues for
* us, so unmap page to synchronize all cpus.
*
* XXX should vm_pager_unmap_page() have
* dealt with this?
*/
vm_page_protect(m, VM_PROT_NONE);
/*
* Clear out partial-page dirty bits. This
* has the side effect of setting the valid
@ -325,6 +343,10 @@ vnode_pager_setsize(vp, nsize)
* m->dirty == VM_PAGE_BITS_ALL. The file EOF
* case is one of them. If the page is still
* partially dirty, make it fully dirty.
*
* note that we do not clear out the valid
* bits. This would prevent bogus_page
* replacement from working properly.
*/
vm_page_set_validclean(m, base, size);
if (m->dirty != 0)
@ -965,6 +987,9 @@ vnode_pager_generic_putpages(vp, m, bytecount, flags, rtvals)
* may not properly clear the dirty bits for the entire page (which
* could be VM_PAGE_BITS_ALL due to the page having been mmap()d).
* With the page locked we are free to fix-up the dirty bits here.
*
* We do not under any circumstances truncate the valid bits, as
* this will screw up bogus page replacement.
*/
if (maxsize + poffset > object->un_pager.vnp.vnp_size) {
if (object->un_pager.vnp.vnp_size > poffset) {