This fixes a large number of bugs in our NFS client side code.  A recent
commit by Kirk also fixed a softupdates bug that could easily be triggered
by server side NFS.

* An edge case with shared R+W mmap()'s and truncate whereby the system
  would inappropriately clear the dirty bits on still-dirty data.
  (applicable to all filesystems)

  THIS FIX TEMPORARILY DISABLED PENDING FURTHER TESTING.
  see vm/vm_page.c line 1641

* The straddle case for VM pages and buffer cache buffers when truncating.
  (applicable to NFS client side)

* Possible SMP database corruption due to vm_pager_unmap_page() not
  clearing the TLB for the other cpu's.  (applicable to NFS client side
  but could affect all filesystems).  Note: not considered serious since
  the corruption occurs beyond the file EOF.

* When flushing a dirty buffer due to B_CACHE getting cleared, we were
  accidentally setting B_CACHE again (that is, bwrite() sets B_CACHE),
  when we really want it to stay clear after the write is complete.
  This resulted in a corrupt buffer.  (applicable to all filesystems but
  probably only triggered by NFS)

* We have to call vtruncbuf() when ftruncate()ing to remove any buffer
  cache buffers.  This is still tentative, I may be able to remove it
  due to the second bug fix.  (applicable to NFS client side)

* vnode_pager_setsize() race against nfs_vinvalbuf()... we have to set
  n_size before calling nfs_vinvalbuf or the NFS code may recursively
  vnode_pager_setsize() to the original value before the truncate.  This
  is what was causing the user mmap bus faults in the nfs tester program.
  (applicable to NFS client side)

* Fix to softupdates (see ufs/ffs/ffs_inode.c 1.73, commit made by Kirk).

Testing program written by: Avadis Tevanian, Jr.
Testing program supplied by: jkh / Apple (see Dec2001 posting to
freebsd-hackers with Subject 'NFS: How to make FreeBSD fall on its face
in one easy step')

MFC after: 1 week
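As an illustration of the mmap()/truncate edge cases above, here is a
minimal user-level sketch.  This is NOT the tester program credited
above; the file name and sizes are assumptions for demonstration, and
on a kernel with these fixes it should print "ok".

#include <sys/mman.h>
#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

int
main(void)
{
	long pgsz = sysconf(_SC_PAGESIZE);
	char *p;
	int fd;

	/* assumed scratch file, ideally on an NFS mount */
	fd = open("nfstest.dat", O_RDWR | O_CREAT | O_TRUNC, 0644);
	if (fd < 0) {
		perror("open");
		return (1);
	}
	/* create a two-page file and dirty it through a shared mapping */
	if (ftruncate(fd, 2 * pgsz) < 0) {
		perror("ftruncate");
		return (1);
	}
	p = mmap(NULL, 2 * pgsz, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
	if (p == MAP_FAILED) {
		perror("mmap");
		return (1);
	}
	memset(p, 'A', 2 * pgsz);		/* dirty both pages */

	/*
	 * Truncate to a point that straddles the second page, then extend
	 * the file again.  Bytes beyond the truncation point must read
	 * back as zero; with the straddle bugs, stale 'A's could leak
	 * through or still-dirty data could be thrown away.
	 */
	if (ftruncate(fd, pgsz + pgsz / 2) < 0 ||
	    ftruncate(fd, 2 * pgsz) < 0) {
		perror("ftruncate");
		return (1);
	}
	printf("%s\n", p[pgsz + pgsz / 2] == 0 ? "ok" : "garbage past EOF");
	munmap(p, 2 * pgsz);
	close(fd);
	return (0);
}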
commit 3ebeaf5984 (parent 247e80274d)
@@ -2271,9 +2271,21 @@ loop:
 		 * to softupdates re-dirtying the buffer. In the latter
 		 * case, B_CACHE is set after the first write completes,
 		 * preventing further loops.
+		 * NOTE! b*write() sets B_CACHE. If we cleared B_CACHE
+		 * above while extending the buffer, we cannot allow the
+		 * buffer to remain with B_CACHE set after the write
+		 * completes or it will represent a corrupt state. To
+		 * deal with this we set B_NOCACHE to scrap the buffer
+		 * after the write.
+		 *
+		 * We might be able to do something fancy, like setting
+		 * B_CACHE in bwrite() except if B_DELWRI is already set,
+		 * so the below call doesn't set B_CACHE, but that gets real
+		 * confusing. This is much easier.
 		 */
 
 		if ((bp->b_flags & (B_CACHE|B_DELWRI)) == B_DELWRI) {
+			bp->b_flags |= B_NOCACHE;
 			BUF_WRITE(bp);
 			goto loop;
 		}
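The NOTE above can be made concrete with a toy model.  This is a hedged
sketch, not kernel code: the flag values are arbitrary and brelse()'s
scrapping of a B_NOCACHE buffer is compressed into one step.  It only
shows why letting bwrite() re-set B_CACHE on a buffer whose B_CACHE was
deliberately cleared leaves a corrupt "cached" state.

#include <stdio.h>

#define B_CACHE   0x1	/* assumed toy values, not the kernel's */
#define B_DELWRI  0x2
#define B_NOCACHE 0x4

static int
toy_bwrite(int flags)
{
	flags &= ~B_DELWRI;	/* write completed, no longer delayed-dirty */
	flags |= B_CACHE;	/* b*write() unconditionally sets B_CACHE */
	if (flags & B_NOCACHE)
		flags = 0;	/* brelse() scraps the buffer entirely */
	return (flags);
}

int
main(void)
{
	int flags = B_DELWRI;	/* B_CACHE cleared, dirty data pending */

	/* old: ends with B_CACHE set over stale contents (0x1) */
	printf("old: %#x\n", toy_bwrite(flags));
	/* new: B_NOCACHE makes the buffer get scrapped instead (0) */
	printf("new: %#x\n", toy_bwrite(flags | B_NOCACHE));
	return (0);
}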
@@ -288,6 +288,8 @@ void	nfs_clearcommit(struct mount *);
 int	nfs_writebp(struct buf *, int, struct thread *);
 int	nfs_fsinfo(struct nfsmount *, struct vnode *, struct ucred *,
 	    struct thread *);
+int	nfs_meta_setsize (struct vnode *, struct ucred *,
+	    struct thread *, u_quad_t);
 
 #endif /* _KERNEL */
 
@@ -206,8 +206,14 @@ nfs_getpages(struct vop_getpages_args *ap)
 				vm_page_set_validclean(m, 0, size - toff);
-				vm_page_zero_invalid(m, TRUE);
+				/* handled by vm_fault now */
+				/* vm_page_zero_invalid(m, TRUE); */
+			} else {
+				/*
+				 * Read operation was short. If no error occured
+				 * we may have hit a zero-fill section. We simply
+				 * leave valid set to 0.
+				 */
+				;
 			}
 
 			if (i != ap->a_reqpage) {
 				/*
 				 * Whether or not to leave the page activated is up in
@@ -831,9 +837,7 @@ again:
 		else
 			bcount = np->n_size - (off_t)lbn * biosize;
 	}
-
 	bp = nfs_getcacheblk(vp, lbn, bcount, td);
-
 	if (uio->uio_offset + n > np->n_size) {
 		np->n_size = uio->uio_offset + n;
 		np->n_flag |= NMODIFIED;
@@ -1299,11 +1303,13 @@ nfs_doio(struct buf *bp, struct ucred *cr, struct thread *td)
 		io.iov_len = uiop->uio_resid = bp->b_bcount;
 		io.iov_base = bp->b_data;
 		uiop->uio_rw = UIO_READ;
+
 		switch (vp->v_type) {
 		case VREG:
 			uiop->uio_offset = ((off_t)bp->b_blkno) * DEV_BSIZE;
 			nfsstats.read_bios++;
 			error = nfs_readrpc(vp, uiop, cr);
+
 			if (!error) {
 				if (uiop->uio_resid) {
 					/*
@@ -1315,7 +1321,7 @@ nfs_doio(struct buf *bp, struct ucred *cr, struct thread *td)
 				 * writes, but that is not possible any longer.
 				 */
 				int nread = bp->b_bcount - uiop->uio_resid;
-				int left = bp->b_bcount - nread;
+				int left = uiop->uio_resid;
 
 				if (left > 0)
 					bzero((char *)bp->b_data + nread, left);
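The one-line change above is an equivalence: since nread is computed as
b_bcount - uio_resid, the remainder uio_resid is exactly b_bcount -
nread, so the new form just states it directly.  A standalone sketch of
the short-read zero-fill, with assumed sizes:

#include <stdio.h>
#include <string.h>

int
main(void)
{
	char buf[8192];			/* stands in for bp->b_data */
	int b_bcount = sizeof(buf);
	int uio_resid = 3000;		/* assumed: server returned a short read */
	int nread = b_bcount - uio_resid;
	int left = uio_resid;

	memset(buf, 'X', sizeof(buf));	/* pretend stale buffer contents */
	if (left > 0)
		memset(buf + nread, 0, left);	/* zero-fill past the short read */
	printf("nread=%d left=%d\n", nread, left);	/* nread=5192 left=3000 */
	return (0);
}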
@@ -1485,3 +1491,48 @@ nfs_doio(struct buf *bp, struct ucred *cr, struct thread *td)
 	bufdone(bp);
 	return (error);
 }
+
+/*
+ * Used to aid in handling ftruncate() operations on the NFS client side.
+ * Truncation creates a number of special problems for NFS. We have to
+ * throw away VM pages and buffer cache buffers that are beyond EOF, and
+ * we have to properly handle VM pages or (potentially dirty) buffers
+ * that straddle the truncation point.
+ */
+
+int
+nfs_meta_setsize(struct vnode *vp, struct ucred *cred, struct thread *td, u_quad_t nsize)
+{
+	struct nfsnode *np = VTONFS(vp);
+	u_quad_t tsize = np->n_size;
+	int biosize = vp->v_mount->mnt_stat.f_iosize;
+	int error = 0;
+
+	np->n_size = nsize;
+
+	if (np->n_size < tsize) {
+		struct buf *bp;
+		daddr_t lbn;
+		int bufsize;
+
+		/*
+		 * vtruncbuf() doesn't get the buffer overlapping the
+		 * truncation point. We may have a B_DELWRI and/or B_CACHE
+		 * buffer that now needs to be truncated.
+		 */
+		error = vtruncbuf(vp, cred, td, nsize, biosize);
+		lbn = nsize / biosize;
+		bufsize = nsize & (biosize - 1);
+		bp = nfs_getcacheblk(vp, lbn, bufsize, td);
+		if (bp->b_dirtyoff > bp->b_bcount)
+			bp->b_dirtyoff = bp->b_bcount;
+		if (bp->b_dirtyend > bp->b_bcount)
+			bp->b_dirtyend = bp->b_bcount;
+		bp->b_flags |= B_RELBUF;	/* don't leave garbage around */
+		brelse(bp);
+	} else {
+		vnode_pager_setsize(vp, nsize);
+	}
+	return(error);
+}
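The lbn/bufsize arithmetic in nfs_meta_setsize() above picks out the
logical block that straddles the new EOF.  A standalone sketch with
assumed values (f_iosize must be a power of two for the mask to work):

#include <stdio.h>

int
main(void)
{
	unsigned long long nsize = 10000;	/* assumed new file size */
	int biosize = 8192;			/* assumed f_iosize */
	long long lbn = nsize / biosize;	/* block containing new EOF */
	int bufsize = (int)(nsize & (biosize - 1));	/* bytes kept in it */

	/* prints: lbn=1 bufsize=1808 (10000 = 1*8192 + 1808) */
	printf("lbn=%lld bufsize=%d\n", lbn, bufsize);
	return (0);
}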
@@ -643,7 +643,18 @@ nfs_setattr(struct vop_setattr_args *ap)
 			 */
 			if (vp->v_mount->mnt_flag & MNT_RDONLY)
 				return (EROFS);
-			vnode_pager_setsize(vp, vap->va_size);
+
+			/*
+			 * We run vnode_pager_setsize() early (why?),
+			 * we must set np->n_size now to avoid vinvalbuf
+			 * V_SAVE races that might setsize a lower
+			 * value.
+			 */
+
+			tsize = np->n_size;
+			error = nfs_meta_setsize(vp, ap->a_cred,
+			    ap->a_td, vap->va_size);
+
 			if (np->n_flag & NMODIFIED) {
 				if (vap->va_size == 0)
 					error = nfs_vinvalbuf(vp, 0,
@@ -656,8 +667,7 @@ nfs_setattr(struct vop_setattr_args *ap)
 					return (error);
 				}
 			}
-			tsize = np->n_size;
-			np->n_size = np->n_vattr.va_size = vap->va_size;
+			np->n_vattr.va_size = vap->va_size;
 		};
 	} else if ((vap->va_mtime.tv_sec != VNOVAL ||
 	    vap->va_atime.tv_sec != VNOVAL) && (np->n_flag & NMODIFIED) &&
@@ -1049,10 +1059,12 @@ nfs_readrpc(struct vnode *vp, struct uio *uiop, struct ucred *cred)
 		m_freem(mrep);
 		tsiz -= retlen;
 		if (v3) {
-			if (eof || retlen == 0)
+			if (eof || retlen == 0) {
 				tsiz = 0;
-		} else if (retlen < len)
+			}
+		} else if (retlen < len) {
 			tsiz = 0;
+		}
 	}
 nfsmout:
 	return (error);
@@ -3114,3 +3126,4 @@ nfsfifo_close(struct vop_close_args *ap)
 	}
 	return (VOCALL(fifo_vnodeop_p, VOFFSET(vop_close), ap));
 }
+
@@ -1630,10 +1630,24 @@ vm_page_set_validclean(vm_page_t m, int base, int size)
 	 * use this opportunity to clear the PG_NOSYNC flag. If a process
 	 * takes a write fault on a MAP_NOSYNC memory area the flag will
 	 * be set again.
+	 *
+	 * We set valid bits inclusive of any overlap, but we can only
+	 * clear dirty bits for DEV_BSIZE chunks that are fully within
+	 * the range.
 	 */
 
 	pagebits = vm_page_bits(base, size);
 	m->valid |= pagebits;
+#if 0	/* NOT YET */
+	if ((frag = base & (DEV_BSIZE - 1)) != 0) {
+		frag = DEV_BSIZE - frag;
+		base += frag;
+		size -= frag;
+		if (size < 0)
+			size = 0;
+	}
+	pagebits = vm_page_bits(base, size & (DEV_BSIZE - 1));
+#endif
 	m->dirty &= ~pagebits;
 	if (base == 0 && size == PAGE_SIZE) {
 		pmap_clear_modify(m);
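The disabled block above aims to clear dirty bits only for DEV_BSIZE
chunks lying entirely inside [base, base + size), while valid bits may
cover the whole range.  The following is an independent sketch of that
intent with assumed values; it rounds the tail down as the comment
describes and is not a copy of the #if 0 code:

#include <stdio.h>

#define DEV_BSIZE 512

int
main(void)
{
	int base = 100, size = 1000;	/* assumed partially-aligned range */
	int frag = base & (DEV_BSIZE - 1);

	if (frag != 0) {
		frag = DEV_BSIZE - frag;
		base += frag;		/* round base up to a chunk boundary */
		size -= frag;
		if (size < 0)
			size = 0;
	}
	size &= ~(DEV_BSIZE - 1);	/* drop the partial tail chunk */

	/* prints: clear dirty bits only over [512, 1024) */
	printf("clear dirty bits only over [%d, %d)\n", base, base + size);
	return (0);
}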
@@ -298,14 +298,18 @@ vnode_pager_setsize(vp, nsize)
 	}
 	/*
 	 * this gets rid of garbage at the end of a page that is now
-	 * only partially backed by the vnode...
+	 * only partially backed by the vnode.
+	 *
+	 * XXX for some reason (I don't know yet), if we take a
+	 * completely invalid page and mark it partially valid
+	 * it can screw up NFS reads, so we don't allow the case.
 	 */
 	if (nsize & PAGE_MASK) {
 		vm_offset_t kva;
 		vm_page_t m;
 
 		m = vm_page_lookup(object, OFF_TO_IDX(nsize));
-		if (m) {
+		if (m && m->valid) {
 			int base = (int)nsize & PAGE_MASK;
 			int size = PAGE_SIZE - base;
 
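The base/size computation above isolates the garbage tail of the page
that straddles the new EOF.  A standalone sketch, assuming a 4 KB page
and an arbitrary truncation point:

#include <stdio.h>

#define PAGE_SIZE 4096
#define PAGE_MASK (PAGE_SIZE - 1)

int
main(void)
{
	unsigned long long nsize = 10000;	/* assumed truncation point */
	int base = (int)(nsize & PAGE_MASK);	/* offset of EOF in its page */
	int size = PAGE_SIZE - base;		/* tail bytes to bzero */

	/* prints: zero 2288 bytes at offset 1808 of page 2 */
	printf("zero %d bytes at offset %d of page %llu\n",
	    size, base, nsize / PAGE_SIZE);
	return (0);
}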
@@ -317,6 +321,20 @@ vnode_pager_setsize(vp, nsize)
 			bzero((caddr_t)kva + base, size);
 			vm_pager_unmap_page(kva);
 
+			/*
+			 * XXX work around SMP data integrity race
+			 * by unmapping the page from user processes.
+			 * The garbage we just cleared may be mapped
+			 * to a user process running on another cpu
+			 * and this code is not running through normal
+			 * I/O channels which handle SMP issues for
+			 * us, so unmap page to synchronize all cpus.
+			 *
+			 * XXX should vm_pager_unmap_page() have
+			 * dealt with this?
+			 */
+			vm_page_protect(m, VM_PROT_NONE);
+
 			/*
 			 * Clear out partial-page dirty bits. This
 			 * has the side effect of setting the valid
@@ -325,6 +343,10 @@ vnode_pager_setsize(vp, nsize)
 			 * m->dirty == VM_PAGE_BITS_ALL. The file EOF
 			 * case is one of them. If the page is still
 			 * partially dirty, make it fully dirty.
+			 *
+			 * note that we do not clear out the valid
+			 * bits. This would prevent bogus_page
+			 * replacement from working properly.
 			 */
 			vm_page_set_validclean(m, base, size);
 			if (m->dirty != 0)
@@ -965,6 +987,9 @@ vnode_pager_generic_putpages(vp, m, bytecount, flags, rtvals)
 	 * may not properly clear the dirty bits for the entire page (which
 	 * could be VM_PAGE_BITS_ALL due to the page having been mmap()d).
 	 * With the page locked we are free to fix-up the dirty bits here.
+	 *
+	 * We do not under any circumstances truncate the valid bits, as
+	 * this will screw up bogus page replacement.
 	 */
 	if (maxsize + poffset > object->un_pager.vnp.vnp_size) {
 		if (object->un_pager.vnp.vnp_size > poffset) {
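The clamp guarded by the comment above clips a paging write that would
extend past the vnode's EOF, so only bytes below vnp_size reach the
wire; the dirty bits past EOF are fixed up instead of the valid bits
being truncated.  A standalone sketch with assumed values:

#include <stdio.h>

int
main(void)
{
	long long vnp_size = 10000;	/* assumed current file size */
	long long poffset = 8192;	/* file offset of the paging write */
	long long maxsize = 4096;	/* requested write length */

	if (maxsize + poffset > vnp_size) {
		if (vnp_size > poffset)
			maxsize = vnp_size - poffset;	/* clip at EOF */
		else
			maxsize = 0;	/* write entirely past EOF */
	}
	printf("clipped write length = %lld\n", maxsize);	/* 1808 */
	return (0);
}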