This fixes a large number of bugs in our NFS client side code.  A recent
commit by Kirk also fixed a softupdates bug that could easily be triggered
by server side NFS.

* An edge case with shared R+W mmap()'s and truncate whereby the system
  would inappropriately clear the dirty bits on still-dirty data.
  (applicable to all filesystems)

  THIS FIX TEMPORARILY DISABLED PENDING FURTHER TESTING.
  see vm/vm_page.c line 1641

* The straddle case for VM pages and buffer cache buffers when truncating.
  (applicable to NFS client side)

* Possible SMP database corruption due to vm_pager_unmap_page() not
  clearing the TLB for the other cpu's.  (applicable to NFS client side
  but could affect all filesystems).  Note: not considered serious since
  the corruption occurs beyond the file EOF.

* When flushing a dirty buffer due to B_CACHE getting cleared, we were
  accidentally setting B_CACHE again (that is, bwrite() sets B_CACHE),
  when we really want it to stay clear after the write is complete.
  This resulted in a corrupt buffer.  (applicable to all filesystems but
  probably only triggered by NFS)

* We have to call vtruncbuf() when ftruncate()ing to remove any buffer
  cache buffers.  This is still tentative, I may be able to remove it
  due to the second bug fix.  (applicable to NFS client side)

* vnode_pager_setsize() race against nfs_vinvalbuf()... we have to set
  n_size before calling nfs_vinvalbuf or the NFS code may recursively
  vnode_pager_setsize() to the original value before the truncate.  This
  is what was causing the user mmap bus faults in the nfs tester program.
  (applicable to NFS client side)

* Fix to softupdates (see ufs/ffs/ffs_inode.c 1.73, commit made by Kirk).

Testing program written by: Avadis Tevanian, Jr.
Testing program supplied by: jkh / Apple (see Dec2001 posting to
freebsd-hackers with Subject 'NFS: How to make FreeBSD fall on its face
in one easy step')

MFC after: 1 week
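As an illustration of the mmap()/truncate edge cases above, here is a
minimal user-level sketch.  This is NOT the tester program credited
above; the file name and sizes are assumptions for demonstration, and
on a kernel with these fixes it should print "ok".

#include <sys/mman.h>
#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

int
main(void)
{
	long pgsz = sysconf(_SC_PAGESIZE);
	char *p;
	int fd;

	/* assumed scratch file, ideally on an NFS mount */
	fd = open("nfstest.dat", O_RDWR | O_CREAT | O_TRUNC, 0644);
	if (fd < 0) {
		perror("open");
		return (1);
	}
	/* create a two-page file and dirty it through a shared mapping */
	if (ftruncate(fd, 2 * pgsz) < 0) {
		perror("ftruncate");
		return (1);
	}
	p = mmap(NULL, 2 * pgsz, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
	if (p == MAP_FAILED) {
		perror("mmap");
		return (1);
	}
	memset(p, 'A', 2 * pgsz);		/* dirty both pages */

	/*
	 * Truncate to a point that straddles the second page, then extend
	 * the file again.  Bytes beyond the truncation point must read
	 * back as zero; with the straddle bugs, stale 'A's could leak
	 * through or still-dirty data could be thrown away.
	 */
	if (ftruncate(fd, pgsz + pgsz / 2) < 0 ||
	    ftruncate(fd, 2 * pgsz) < 0) {
		perror("ftruncate");
		return (1);
	}
	printf("%s\n", p[pgsz + pgsz / 2] == 0 ? "ok" : "garbage past EOF");
	munmap(p, 2 * pgsz);
	close(fd);
	return (0);
}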
commit 3ebeaf5984 (parent 247e80274d)
@@ -2271,9 +2271,21 @@ loop:
 		 * to softupdates re-dirtying the buffer. In the latter
 		 * case, B_CACHE is set after the first write completes,
 		 * preventing further loops.
+		 * NOTE! b*write() sets B_CACHE. If we cleared B_CACHE
+		 * above while extending the buffer, we cannot allow the
+		 * buffer to remain with B_CACHE set after the write
+		 * completes or it will represent a corrupt state. To
+		 * deal with this we set B_NOCACHE to scrap the buffer
+		 * after the write.
+		 *
+		 * We might be able to do something fancy, like setting
+		 * B_CACHE in bwrite() except if B_DELWRI is already set,
+		 * so the below call doesn't set B_CACHE, but that gets real
+		 * confusing. This is much easier.
 		 */
 
 		if ((bp->b_flags & (B_CACHE|B_DELWRI)) == B_DELWRI) {
+			bp->b_flags |= B_NOCACHE;
 			BUF_WRITE(bp);
 			goto loop;
 		}
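The NOTE above can be made concrete with a toy model.  This is a hedged
sketch, not kernel code: the flag values are arbitrary and brelse()'s
scrapping of a B_NOCACHE buffer is compressed into one step.  It only
shows why letting bwrite() re-set B_CACHE on a buffer whose B_CACHE was
deliberately cleared leaves a corrupt "cached" state.

#include <stdio.h>

#define B_CACHE   0x1	/* assumed toy values, not the kernel's */
#define B_DELWRI  0x2
#define B_NOCACHE 0x4

static int
toy_bwrite(int flags)
{
	flags &= ~B_DELWRI;	/* write completed, no longer delayed-dirty */
	flags |= B_CACHE;	/* b*write() unconditionally sets B_CACHE */
	if (flags & B_NOCACHE)
		flags = 0;	/* brelse() scraps the buffer entirely */
	return (flags);
}

int
main(void)
{
	int flags = B_DELWRI;	/* B_CACHE cleared, dirty data pending */

	/* old: ends with B_CACHE set over stale contents (0x1) */
	printf("old: %#x\n", toy_bwrite(flags));
	/* new: B_NOCACHE makes the buffer get scrapped instead (0) */
	printf("new: %#x\n", toy_bwrite(flags | B_NOCACHE));
	return (0);
}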
@@ -288,6 +288,8 @@ void	nfs_clearcommit(struct mount *);
 int	nfs_writebp(struct buf *, int, struct thread *);
 int	nfs_fsinfo(struct nfsmount *, struct vnode *, struct ucred *,
 	    struct thread *);
+int	nfs_meta_setsize (struct vnode *, struct ucred *,
+	    struct thread *, u_quad_t);
 
 #endif /* _KERNEL */
 
@@ -206,8 +206,14 @@ nfs_getpages(struct vop_getpages_args *ap)
 				vm_page_set_validclean(m, 0, size - toff);
-				vm_page_zero_invalid(m, TRUE);
+				/* handled by vm_fault now */
+				/* vm_page_zero_invalid(m, TRUE); */
+			} else {
+				/*
+				 * Read operation was short. If no error occured
+				 * we may have hit a zero-fill section. We simply
+				 * leave valid set to 0.
+				 */
+				;
 			}
 
 			if (i != ap->a_reqpage) {
 				/*
 				 * Whether or not to leave the page activated is up in
@@ -831,9 +837,7 @@ again:
 		else
 			bcount = np->n_size - (off_t)lbn * biosize;
 	}
-
 	bp = nfs_getcacheblk(vp, lbn, bcount, td);
-
 	if (uio->uio_offset + n > np->n_size) {
 		np->n_size = uio->uio_offset + n;
 		np->n_flag |= NMODIFIED;
@@ -1299,11 +1303,13 @@ nfs_doio(struct buf *bp, struct ucred *cr, struct thread *td)
 		io.iov_len = uiop->uio_resid = bp->b_bcount;
 		io.iov_base = bp->b_data;
 		uiop->uio_rw = UIO_READ;
+
 		switch (vp->v_type) {
 		case VREG:
 			uiop->uio_offset = ((off_t)bp->b_blkno) * DEV_BSIZE;
 			nfsstats.read_bios++;
 			error = nfs_readrpc(vp, uiop, cr);
+
 			if (!error) {
 				if (uiop->uio_resid) {
 					/*
@@ -1315,7 +1321,7 @@ nfs_doio(struct buf *bp, struct ucred *cr, struct thread *td)
 				 * writes, but that is not possible any longer.
 				 */
 				int nread = bp->b_bcount - uiop->uio_resid;
-				int left = bp->b_bcount - nread;
+				int left = uiop->uio_resid;
 
 				if (left > 0)
 					bzero((char *)bp->b_data + nread, left);
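The one-line change above is an equivalence: since nread is computed as
b_bcount - uio_resid, the remainder uio_resid is exactly b_bcount -
nread, so the new form just states it directly.  A standalone sketch of
the short-read zero-fill, with assumed sizes:

#include <stdio.h>
#include <string.h>

int
main(void)
{
	char buf[8192];			/* stands in for bp->b_data */
	int b_bcount = sizeof(buf);
	int uio_resid = 3000;		/* assumed: server returned a short read */
	int nread = b_bcount - uio_resid;
	int left = uio_resid;

	memset(buf, 'X', sizeof(buf));	/* pretend stale buffer contents */
	if (left > 0)
		memset(buf + nread, 0, left);	/* zero-fill past the short read */
	printf("nread=%d left=%d\n", nread, left);	/* nread=5192 left=3000 */
	return (0);
}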
@@ -1485,3 +1491,48 @@ nfs_doio(struct buf *bp, struct ucred *cr, struct thread *td)
 	bufdone(bp);
 	return (error);
 }
+
+/*
+ * Used to aid in handling ftruncate() operations on the NFS client side.
+ * Truncation creates a number of special problems for NFS. We have to
+ * throw away VM pages and buffer cache buffers that are beyond EOF, and
+ * we have to properly handle VM pages or (potentially dirty) buffers
+ * that straddle the truncation point.
+ */
+
+int
+nfs_meta_setsize(struct vnode *vp, struct ucred *cred, struct thread *td, u_quad_t nsize)
+{
+	struct nfsnode *np = VTONFS(vp);
+	u_quad_t tsize = np->n_size;
+	int biosize = vp->v_mount->mnt_stat.f_iosize;
+	int error = 0;
+
+	np->n_size = nsize;
+
+	if (np->n_size < tsize) {
+		struct buf *bp;
+		daddr_t lbn;
+		int bufsize;
+
+		/*
+		 * vtruncbuf() doesn't get the buffer overlapping the
+		 * truncation point. We may have a B_DELWRI and/or B_CACHE
+		 * buffer that now needs to be truncated.
+		 */
+		error = vtruncbuf(vp, cred, td, nsize, biosize);
+		lbn = nsize / biosize;
+		bufsize = nsize & (biosize - 1);
+		bp = nfs_getcacheblk(vp, lbn, bufsize, td);
+		if (bp->b_dirtyoff > bp->b_bcount)
+			bp->b_dirtyoff = bp->b_bcount;
+		if (bp->b_dirtyend > bp->b_bcount)
+			bp->b_dirtyend = bp->b_bcount;
+		bp->b_flags |= B_RELBUF;	/* don't leave garbage around */
+		brelse(bp);
+	} else {
+		vnode_pager_setsize(vp, nsize);
+	}
+	return(error);
+}
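The lbn/bufsize arithmetic in nfs_meta_setsize() above picks out the
logical block that straddles the new EOF.  A standalone sketch with
assumed values (f_iosize must be a power of two for the mask to work):

#include <stdio.h>

int
main(void)
{
	unsigned long long nsize = 10000;	/* assumed new file size */
	int biosize = 8192;			/* assumed f_iosize */
	long long lbn = nsize / biosize;	/* block containing new EOF */
	int bufsize = (int)(nsize & (biosize - 1));	/* bytes kept in it */

	/* prints: lbn=1 bufsize=1808 (10000 = 1*8192 + 1808) */
	printf("lbn=%lld bufsize=%d\n", lbn, bufsize);
	return (0);
}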
@@ -643,7 +643,18 @@ nfs_setattr(struct vop_setattr_args *ap)
 			 */
 			if (vp->v_mount->mnt_flag & MNT_RDONLY)
 				return (EROFS);
-			vnode_pager_setsize(vp, vap->va_size);
+
+			/*
+			 * We run vnode_pager_setsize() early (why?),
+			 * we must set np->n_size now to avoid vinvalbuf
+			 * V_SAVE races that might setsize a lower
+			 * value.
+			 */
+
+			tsize = np->n_size;
+			error = nfs_meta_setsize(vp, ap->a_cred,
+			    ap->a_td, vap->va_size);
+
 			if (np->n_flag & NMODIFIED) {
 				if (vap->va_size == 0)
 					error = nfs_vinvalbuf(vp, 0,
@@ -656,8 +667,7 @@ nfs_setattr(struct vop_setattr_args *ap)
 					return (error);
 				}
 			}
-			tsize = np->n_size;
-			np->n_size = np->n_vattr.va_size = vap->va_size;
+			np->n_vattr.va_size = vap->va_size;
 		};
 	} else if ((vap->va_mtime.tv_sec != VNOVAL ||
 	    vap->va_atime.tv_sec != VNOVAL) && (np->n_flag & NMODIFIED) &&
@@ -1049,10 +1059,12 @@ nfs_readrpc(struct vnode *vp, struct uio *uiop, struct ucred *cred)
 		m_freem(mrep);
 		tsiz -= retlen;
 		if (v3) {
-			if (eof || retlen == 0)
+			if (eof || retlen == 0) {
 				tsiz = 0;
-		} else if (retlen < len)
+			}
+		} else if (retlen < len) {
 			tsiz = 0;
+		}
 	}
 nfsmout:
 	return (error);
@@ -3114,3 +3126,4 @@ nfsfifo_close(struct vop_close_args *ap)
 	}
 	return (VOCALL(fifo_vnodeop_p, VOFFSET(vop_close), ap));
 }
+
@@ -1630,10 +1630,24 @@ vm_page_set_validclean(vm_page_t m, int base, int size)
 	 * use this opportunity to clear the PG_NOSYNC flag. If a process
 	 * takes a write fault on a MAP_NOSYNC memory area the flag will
 	 * be set again.
+	 *
+	 * We set valid bits inclusive of any overlap, but we can only
+	 * clear dirty bits for DEV_BSIZE chunks that are fully within
+	 * the range.
 	 */
 
 	pagebits = vm_page_bits(base, size);
 	m->valid |= pagebits;
+#if 0	/* NOT YET */
+	if ((frag = base & (DEV_BSIZE - 1)) != 0) {
+		frag = DEV_BSIZE - frag;
+		base += frag;
+		size -= frag;
+		if (size < 0)
+			size = 0;
+	}
+	pagebits = vm_page_bits(base, size & (DEV_BSIZE - 1));
+#endif
 	m->dirty &= ~pagebits;
 	if (base == 0 && size == PAGE_SIZE) {
 		pmap_clear_modify(m);
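The disabled block above aims to clear dirty bits only for DEV_BSIZE
chunks lying entirely inside [base, base + size), while valid bits may
cover the whole range.  The following is an independent sketch of that
intent with assumed values; it rounds the tail down as the comment
describes and is not a copy of the #if 0 code:

#include <stdio.h>

#define DEV_BSIZE 512

int
main(void)
{
	int base = 100, size = 1000;	/* assumed partially-aligned range */
	int frag = base & (DEV_BSIZE - 1);

	if (frag != 0) {
		frag = DEV_BSIZE - frag;
		base += frag;		/* round base up to a chunk boundary */
		size -= frag;
		if (size < 0)
			size = 0;
	}
	size &= ~(DEV_BSIZE - 1);	/* drop the partial tail chunk */

	/* prints: clear dirty bits only over [512, 1024) */
	printf("clear dirty bits only over [%d, %d)\n", base, base + size);
	return (0);
}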
@@ -298,14 +298,18 @@ vnode_pager_setsize(vp, nsize)
 	}
 	/*
 	 * this gets rid of garbage at the end of a page that is now
-	 * only partially backed by the vnode...
+	 * only partially backed by the vnode.
+	 *
+	 * XXX for some reason (I don't know yet), if we take a
+	 * completely invalid page and mark it partially valid
+	 * it can screw up NFS reads, so we don't allow the case.
 	 */
 	if (nsize & PAGE_MASK) {
 		vm_offset_t kva;
 		vm_page_t m;
 
 		m = vm_page_lookup(object, OFF_TO_IDX(nsize));
-		if (m) {
+		if (m && m->valid) {
 			int base = (int)nsize & PAGE_MASK;
 			int size = PAGE_SIZE - base;
 
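The base/size computation above isolates the garbage tail of the page
that straddles the new EOF.  A standalone sketch, assuming a 4 KB page
and an arbitrary truncation point:

#include <stdio.h>

#define PAGE_SIZE 4096
#define PAGE_MASK (PAGE_SIZE - 1)

int
main(void)
{
	unsigned long long nsize = 10000;	/* assumed truncation point */
	int base = (int)(nsize & PAGE_MASK);	/* offset of EOF in its page */
	int size = PAGE_SIZE - base;		/* tail bytes to bzero */

	/* prints: zero 2288 bytes at offset 1808 of page 2 */
	printf("zero %d bytes at offset %d of page %llu\n",
	    size, base, nsize / PAGE_SIZE);
	return (0);
}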
@@ -317,6 +321,20 @@ vnode_pager_setsize(vp, nsize)
 			bzero((caddr_t)kva + base, size);
 			vm_pager_unmap_page(kva);
 
+			/*
+			 * XXX work around SMP data integrity race
+			 * by unmapping the page from user processes.
+			 * The garbage we just cleared may be mapped
+			 * to a user process running on another cpu
+			 * and this code is not running through normal
+			 * I/O channels which handle SMP issues for
+			 * us, so unmap page to synchronize all cpus.
+			 *
+			 * XXX should vm_pager_unmap_page() have
+			 * dealt with this?
+			 */
+			vm_page_protect(m, VM_PROT_NONE);
+
 			/*
 			 * Clear out partial-page dirty bits. This
 			 * has the side effect of setting the valid
@@ -325,6 +343,10 @@ vnode_pager_setsize(vp, nsize)
 			 * m->dirty == VM_PAGE_BITS_ALL. The file EOF
 			 * case is one of them. If the page is still
 			 * partially dirty, make it fully dirty.
+			 *
+			 * note that we do not clear out the valid
+			 * bits. This would prevent bogus_page
+			 * replacement from working properly.
 			 */
 			vm_page_set_validclean(m, base, size);
 			if (m->dirty != 0)
@@ -965,6 +987,9 @@ vnode_pager_generic_putpages(vp, m, bytecount, flags, rtvals)
 	 * may not properly clear the dirty bits for the entire page (which
 	 * could be VM_PAGE_BITS_ALL due to the page having been mmap()d).
 	 * With the page locked we are free to fix-up the dirty bits here.
+	 *
+	 * We do not under any circumstances truncate the valid bits, as
+	 * this will screw up bogus page replacement.
 	 */
 	if (maxsize + poffset > object->un_pager.vnp.vnp_size) {
 		if (object->un_pager.vnp.vnp_size > poffset) {
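The clamp guarded by the comment above clips a paging write that would
extend past the vnode's EOF, so only bytes below vnp_size reach the
wire; the dirty bits past EOF are fixed up instead of the valid bits
being truncated.  A standalone sketch with assumed values:

#include <stdio.h>

int
main(void)
{
	long long vnp_size = 10000;	/* assumed current file size */
	long long poffset = 8192;	/* file offset of the paging write */
	long long maxsize = 4096;	/* requested write length */

	if (maxsize + poffset > vnp_size) {
		if (vnp_size > poffset)
			maxsize = vnp_size - poffset;	/* clip at EOF */
		else
			maxsize = 0;	/* write entirely past EOF */
	}
	printf("clipped write length = %lld\n", maxsize);	/* 1808 */
	return (0);
}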