fusefs: rewrite vop_getpages and vop_putpages

Use the standard facilities for getpages and putpages instead of bespoke
implementations that don't work well with the writeback cache.  This has
several corollaries:

* Change the way we handle short reads _again_.  vfs_bio_getpages doesn't
  provide any way to handle unexpected short reads.  Plus, I found some more
  lock-order problems.  So now, when a short read is detected, we just
  clear the vnode's attribute cache, forcing the file size to be requeried
  the next time it's needed.  VOP_GETPAGES doesn't have any way to indicate
  a short read to the "caller", so we just bzero the rest of the page
  whenever a short read happens.

* Change the way we decide when to set the FUSE_WRITE_CACHE bit.  We now set
  it for clustered writes even when the writeback cache is not in use.
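
In outline, the new getpages path is just a thin wrapper around
vfs_bio_getpages(9); the bespoke putpages implementation is removed outright,
so dirty pages now reach the filesystem through the default putpages path and
VOP_WRITE (note the new IO_VMIO handling in fuse_io.c below).  Condensed from
the fuse_vnops.c hunk below:

    static int
    fuse_vnop_getpages(struct vop_getpages_args *ap)
    {
            struct vnode *vp = ap->a_vp;

            if (!fsess_opt_mmap(vnode_mount(vp)))
                    return (VM_PAGER_ERROR);

            return (vfs_bio_getpages(vp, ap->a_m, ap->a_count, ap->a_rbehind,
                ap->a_rahead, fuse_gbp_getblkno, fuse_gbp_getblksz));
    }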
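
The short-read handling from the first bullet reduces to roughly the following
shape in fuse_io_strategy() (condensed from the fuse_io.c hunk below; error
handling, probes, and the dirty-beyond-EOF case are omitted):

    if (!error && uiop->uio_resid) {
            /* The server returned less than requested: EOF moved. */
            int nread = bp->b_bcount - uiop->uio_resid;

            /* Zero the tail so stale data is never exposed later. */
            bzero((char *)bp->b_data + nread, uiop->uio_resid);
            if ((fvdat->flag & FN_SIZECHANGE) == 0) {
                    /*
                     * No cached size change of our own, so treat this as a
                     * server-side truncation and requery the attributes the
                     * next time they are needed.
                     */
                    uiop->uio_resid = 0;
                    fuse_vnode_clear_attr_cache(vp);
            }
    }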
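
The second bullet is carried by a new buf flag: writes that originate from the
buffer cache (where the originating cred and pid are no longer known) are
tagged B_FUSEFS_WRITE_CACHE, and fuse_io_strategy() passes that down so the
request can carry FUSE_WRITE_CACHE.  A minimal sketch of the plumbing; the
write_flags assignment inside fuse_write_directbackend() is assumed here, not
shown in this diff:

    /* In fuse_io_strategy(), when flushing a dirty buffer: */
    bool pages = bp->b_flags & B_FUSEFS_WRITE_CACHE;
    error = fuse_write_directbackend(vp, uiop, cred, fufh, filesize, 0, pages);

    /* Inside fuse_write_directbackend(), roughly (assumed): */
    if (pages)
            fwi->write_flags |= FUSE_WRITE_CACHE;   /* struct fuse_write_in */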

Sponsored by:   The FreeBSD Foundation
Author: Alan Somers
Date:   2019-06-25 17:24:43 +00:00
Parent: 48417ae0ba
Commit: b9e2019755
Notes (svn2git, 2020-12-20 02:59:44 +00:00): svn path=/projects/fuse2/; revision=349378
5 changed files with 181 additions and 242 deletions

File: sys/fs/fuse/fuse_io.c

@@ -100,6 +100,12 @@ __FBSDID("$FreeBSD$");
#include "fuse_ipc.h"
#include "fuse_io.h"
/*
* Set in a struct buf to indicate that the write came from the buffer cache
* and the originating cred and pid are no longer known.
*/
#define B_FUSEFS_WRITE_CACHE B_FS_FLAG1
SDT_PROVIDER_DECLARE(fusefs);
/*
* Fuse trace probe:
@@ -164,10 +170,10 @@ fuse_io_clear_suid_on_write(struct vnode *vp, struct ucred *cred,
SDT_PROBE_DEFINE5(fusefs, , io, io_dispatch, "struct vnode*", "struct uio*",
"int", "struct ucred*", "struct fuse_filehandle*");
SDT_PROBE_DEFINE5(fusefs, , io, io_dispatch_filehandles_closed, "struct vnode*",
"struct uio*", "int", "bool", "struct ucred*");
SDT_PROBE_DEFINE4(fusefs, , io, io_dispatch_filehandles_closed, "struct vnode*",
"struct uio*", "int", "struct ucred*");
int
fuse_io_dispatch(struct vnode *vp, struct uio *uio, int ioflag, bool pages,
fuse_io_dispatch(struct vnode *vp, struct uio *uio, int ioflag,
struct ucred *cred, pid_t pid)
{
struct fuse_filehandle *fufh;
@@ -188,8 +194,8 @@ fuse_io_dispatch(struct vnode *vp, struct uio *uio, int ioflag, bool pages,
closefufh = true;
}
else if (err) {
SDT_PROBE5(fusefs, , io, io_dispatch_filehandles_closed,
vp, uio, ioflag, pages, cred);
SDT_PROBE4(fusefs, , io, io_dispatch_filehandles_closed,
vp, uio, ioflag, cred);
printf("FUSE: io dispatch: filehandles are closed\n");
return err;
}
@@ -236,15 +242,13 @@ fuse_io_dispatch(struct vnode *vp, struct uio *uio, int ioflag, bool pages,
start = uio->uio_offset;
end = start + uio->uio_resid;
/*
* Invalidate the write cache unless we're coming from
* VOP_PUTPAGES, in which case we're writing _from_ the
* write cache
*/
if (!pages )
v_inval_buf_range(vp, start, end, iosize);
KASSERT((ioflag & (IO_VMIO | IO_DIRECT)) !=
(IO_VMIO | IO_DIRECT),
("IO_DIRECT used for a cache flush?"));
/* Invalidate the write cache when writing directly */
v_inval_buf_range(vp, start, end, iosize);
err = fuse_write_directbackend(vp, uio, cred, fufh,
filesize, ioflag, pages);
filesize, ioflag, false);
} else {
SDT_PROBE2(fusefs, , io, trace, 1,
"buffered write of vnode");
@@ -352,8 +356,8 @@ fuse_read_biobackend(struct vnode *vp, struct uio *uio, int ioflag,
*/
n = 0;
if (on < bcount - (intptr_t)bp->b_fsprivate1)
n = MIN((unsigned)(bcount - (intptr_t)bp->b_fsprivate1 - on),
if (on < bcount - bp->b_resid)
n = MIN((unsigned)(bcount - bp->b_resid - on),
uio->uio_resid);
if (n > 0) {
SDT_PROBE2(fusefs, , io, read_bio_backend_feed, n, bp);
@@ -362,10 +366,8 @@ fuse_read_biobackend(struct vnode *vp, struct uio *uio, int ioflag,
vfs_bio_brelse(bp, ioflag);
SDT_PROBE4(fusefs, , io, read_bio_backend_end, err,
uio->uio_resid, n, bp);
if ((intptr_t)bp->b_fsprivate1 > 0) {
if (bp->b_resid > 0) {
/* Short read indicates EOF */
(void)fuse_vnode_setsize(vp, uio->uio_offset);
bp->b_fsprivate1 = (void*)0;
break;
}
}
@@ -725,6 +727,21 @@ fuse_write_biobackend(struct vnode *vp, struct uio *uio,
brelse(bp);
break;
}
if (bp->b_resid > 0) {
/*
* Short read indicates EOF. Update file size
* from the server and try again.
*/
SDT_PROBE2(fusefs, , io, trace, 1,
"Short read during a RMW");
brelse(bp);
err = fuse_vnode_size(vp, &filesize, cred,
curthread);
if (err)
break;
else
goto again;
}
}
if (bp->b_wcred == NOCRED)
bp->b_wcred = crhold(cred);
@@ -805,8 +822,11 @@ fuse_write_biobackend(struct vnode *vp, struct uio *uio,
vfs_bio_set_flags(bp, ioflag);
bp->b_flags |= B_FUSEFS_WRITE_CACHE;
if (ioflag & IO_SYNC) {
SDT_PROBE2(fusefs, , io, write_biobackend_issue, 2, bp);
if (!(ioflag & IO_VMIO))
bp->b_flags &= ~B_FUSEFS_WRITE_CACHE;
err = bwrite(bp);
} else if (vm_page_count_severe() ||
buf_dirty_count_severe() ||
@@ -864,7 +884,6 @@ fuse_io_strategy(struct vnode *vp, struct buf *bp)
fflag = bp->b_iocmd == BIO_READ ? FREAD : FWRITE;
cred = bp->b_iocmd == BIO_READ ? bp->b_rcred : bp->b_wcred;
error = fuse_filehandle_getrw(vp, fflag, &fufh, cred, pid);
bp->b_fsprivate1 = (void*)(intptr_t)0;
if (bp->b_iocmd == BIO_READ && error == EBADF) {
/*
* This may be a read-modify-write operation on a cached file
@@ -905,39 +924,40 @@ fuse_io_strategy(struct vnode *vp, struct buf *bp)
uiop->uio_offset = ((off_t)bp->b_lblkno) * biosize;
error = fuse_read_directbackend(vp, uiop, cred, fufh);
left = uiop->uio_resid;
/*
* Store the amount we failed to read in the buffer's private
* field, so callers can truncate the file if necessary'
*/
bp->b_fsprivate1 = (void*)(intptr_t)left;
if (!error && uiop->uio_resid) {
int nread = bp->b_bcount - uiop->uio_resid;
left = uiop->uio_resid;
bzero((char *)bp->b_data + nread, left);
if (fuse_data_cache_mode != FUSE_CACHE_WB ||
(fvdat->flag & FN_SIZECHANGE) == 0) {
if ((fvdat->flag & FN_SIZECHANGE) == 0) {
/*
* A short read with no error, when not using
* direct io, and when no writes are cached,
* indicates EOF. Update the file size
* accordingly. We must still bzero the
* remaining buffer so uninitialized data
* doesn't get exposed by a future truncate
* that extends the file.
* indicates EOF caused by a server-side
* truncation. Clear the attr cache so we'll
* pick up the new file size and timestamps.
*
* We must still bzero the remaining buffer so
* uninitialized data doesn't get exposed by a
* future truncate that extends the file.
*
* To prevent lock order problems, we must
* truncate the file upstack
* truncate the file upstack, not here.
*/
SDT_PROBE2(fusefs, , io, trace, 1,
"Short read of a clean file");
uiop->uio_resid = 0;
fuse_vnode_clear_attr_cache(vp);
} else {
/*
* If dirty writes _are_ cached beyond EOF,
* that indicates a newly created hole that the
* server doesn't know about.
* server doesn't know about. Those don't pose
* any problem.
* XXX: we don't currently track whether dirty
* writes are cached beyond EOF, before EOF, or
* both.
@@ -976,8 +996,9 @@ fuse_io_strategy(struct vnode *vp, struct buf *bp)
io.iov_base = (char *)bp->b_data + bp->b_dirtyoff;
uiop->uio_rw = UIO_WRITE;
bool pages = bp->b_flags & B_FUSEFS_WRITE_CACHE;
error = fuse_write_directbackend(vp, uiop, cred, fufh,
filesize, 0, false);
filesize, 0, pages);
if (error == EINTR || error == ETIMEDOUT) {
bp->b_flags &= ~(B_INVAL | B_NOCACHE);

File: sys/fs/fuse/fuse_io.h

@@ -60,7 +60,7 @@
#ifndef _FUSE_IO_H_
#define _FUSE_IO_H_
int fuse_io_dispatch(struct vnode *vp, struct uio *uio, int ioflag, bool pages,
int fuse_io_dispatch(struct vnode *vp, struct uio *uio, int ioflag,
struct ucred *cred, pid_t pid);
int fuse_io_strategy(struct vnode *vp, struct buf *bp);
int fuse_io_flushbuf(struct vnode *vp, int waitfor, struct thread *td);

File: sys/fs/fuse/fuse_vnops.c

@@ -150,7 +150,6 @@ static vop_strategy_t fuse_vnop_strategy;
static vop_symlink_t fuse_vnop_symlink;
static vop_write_t fuse_vnop_write;
static vop_getpages_t fuse_vnop_getpages;
static vop_putpages_t fuse_vnop_putpages;
static vop_print_t fuse_vnop_print;
static vop_vptofh_t fuse_vnop_vptofh;
@@ -215,7 +214,6 @@ struct vop_vector fuse_vnops = {
.vop_symlink = fuse_vnop_symlink,
.vop_write = fuse_vnop_write,
.vop_getpages = fuse_vnop_getpages,
.vop_putpages = fuse_vnop_putpages,
.vop_print = fuse_vnop_print,
.vop_vptofh = fuse_vnop_vptofh,
};
@@ -1381,7 +1379,7 @@ fuse_vnop_read(struct vop_read_args *ap)
ioflag |= IO_DIRECT;
}
return fuse_io_dispatch(vp, uio, ioflag, false, cred, pid);
return fuse_io_dispatch(vp, uio, ioflag, cred, pid);
}
/*
@@ -1960,10 +1958,39 @@ fuse_vnop_write(struct vop_write_args *ap)
ioflag |= IO_DIRECT;
}
return fuse_io_dispatch(vp, uio, ioflag, false, cred, pid);
return fuse_io_dispatch(vp, uio, ioflag, cred, pid);
}
static daddr_t
fuse_gbp_getblkno(struct vnode *vp, vm_ooffset_t off)
{
const int biosize = fuse_iosize(vp);
return (off / biosize);
}
static int
fuse_gbp_getblksz(struct vnode *vp, daddr_t lbn)
{
off_t filesize;
int blksz, err;
const int biosize = fuse_iosize(vp);
err = fuse_vnode_size(vp, &filesize, NULL, NULL);
KASSERT(err == 0, ("vfs_bio_getpages can't handle errors here"));
if (err)
return biosize;
if ((off_t)lbn * biosize >= filesize) {
blksz = 0;
} else if ((off_t)(lbn + 1) * biosize > filesize) {
blksz = filesize - (off_t)lbn * biosize;
} else {
blksz = biosize;
}
return (blksz);
}
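
For illustration, assuming fuse_iosize() returns 65536 and the cached file
size is 100000 bytes, the new block-size callback yields:

    fuse_gbp_getblksz(vp, 0) == 65536                   /* wholly before EOF */
    fuse_gbp_getblksz(vp, 1) == 100000 - 65536 == 34464 /* partial last block */
    fuse_gbp_getblksz(vp, 2) == 0                       /* wholly past EOF */
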
SDT_PROBE_DEFINE1(fusefs, , vnops, vnop_getpages_error, "int");
/*
struct vnop_getpages_args {
struct vnode *a_vp;
@@ -1975,23 +2002,7 @@ SDT_PROBE_DEFINE1(fusefs, , vnops, vnop_getpages_error, "int");
static int
fuse_vnop_getpages(struct vop_getpages_args *ap)
{
int i, error, nextoff, size, toff, count, npages;
struct uio uio;
struct iovec iov;
vm_offset_t kva;
struct buf *bp;
struct vnode *vp;
struct thread *td;
struct ucred *cred;
vm_page_t *pages;
pid_t pid = curthread->td_proc->p_pid;
vp = ap->a_vp;
KASSERT(vp->v_object, ("objectless vp passed to getpages"));
td = curthread; /* XXX */
cred = curthread->td_ucred; /* XXX */
pages = ap->a_m;
npages = ap->a_count;
struct vnode *vp = ap->a_vp;
if (!fsess_opt_mmap(vnode_mount(vp))) {
SDT_PROBE2(fusefs, , vnops, trace, 1,
@@ -1999,190 +2010,8 @@ fuse_vnop_getpages(struct vop_getpages_args *ap)
return (VM_PAGER_ERROR);
}
/*
* If the last page is partially valid, just return it and allow
* the pager to zero-out the blanks. Partially valid pages can
* only occur at the file EOF.
*
* XXXGL: is that true for FUSE, which is a local filesystem,
* but still somewhat disconnected from the kernel?
*/
VM_OBJECT_WLOCK(vp->v_object);
if (pages[npages - 1]->valid != 0 && --npages == 0)
goto out;
VM_OBJECT_WUNLOCK(vp->v_object);
/*
* We use only the kva address for the buffer, but this is extremely
* convenient and fast.
*/
bp = uma_zalloc(fuse_pbuf_zone, M_WAITOK);
kva = (vm_offset_t)bp->b_data;
pmap_qenter(kva, pages, npages);
VM_CNT_INC(v_vnodein);
VM_CNT_ADD(v_vnodepgsin, npages);
count = npages << PAGE_SHIFT;
iov.iov_base = (caddr_t)kva;
iov.iov_len = count;
uio.uio_iov = &iov;
uio.uio_iovcnt = 1;
uio.uio_offset = IDX_TO_OFF(pages[0]->pindex);
uio.uio_resid = count;
uio.uio_segflg = UIO_SYSSPACE;
uio.uio_rw = UIO_READ;
uio.uio_td = td;
error = fuse_io_dispatch(vp, &uio, IO_DIRECT, true, cred, pid);
pmap_qremove(kva, npages);
uma_zfree(fuse_pbuf_zone, bp);
if (error && (uio.uio_resid == count)) {
SDT_PROBE1(fusefs, , vnops, vnop_getpages_error, error);
return VM_PAGER_ERROR;
}
/*
* Calculate the number of bytes read and validate only that number
* of bytes. Note that due to pending writes, size may be 0. This
* does not mean that the remaining data is invalid!
*/
size = count - uio.uio_resid;
VM_OBJECT_WLOCK(vp->v_object);
fuse_vm_page_lock_queues();
for (i = 0, toff = 0; i < npages; i++, toff = nextoff) {
vm_page_t m;
nextoff = toff + PAGE_SIZE;
m = pages[i];
if (nextoff <= size) {
/*
* Read operation filled an entire page
*/
m->valid = VM_PAGE_BITS_ALL;
KASSERT(m->dirty == 0,
("fuse_getpages: page %p is dirty", m));
} else if (size > toff) {
/*
* Read operation filled a partial page.
*/
m->valid = 0;
vm_page_set_valid_range(m, 0, size - toff);
KASSERT(m->dirty == 0,
("fuse_getpages: page %p is dirty", m));
} else {
/*
* Read operation was short. If no error occurred
* we may have hit a zero-fill section. We simply
* leave valid set to 0.
*/
;
}
}
fuse_vm_page_unlock_queues();
out:
VM_OBJECT_WUNLOCK(vp->v_object);
if (ap->a_rbehind)
*ap->a_rbehind = 0;
if (ap->a_rahead)
*ap->a_rahead = 0;
return (VM_PAGER_OK);
}
/*
struct vnop_putpages_args {
struct vnode *a_vp;
vm_page_t *a_m;
int a_count;
int a_sync;
int *a_rtvals;
vm_ooffset_t a_offset;
};
*/
static int
fuse_vnop_putpages(struct vop_putpages_args *ap)
{
struct uio uio;
struct iovec iov;
vm_offset_t kva;
struct buf *bp;
int i, error, npages, count;
off_t offset;
int *rtvals;
struct vnode *vp;
struct thread *td;
struct ucred *cred;
vm_page_t *pages;
vm_ooffset_t fsize;
pid_t pid = curthread->td_proc->p_pid;
vp = ap->a_vp;
KASSERT(vp->v_object, ("objectless vp passed to putpages"));
fsize = vp->v_object->un_pager.vnp.vnp_size;
td = curthread; /* XXX */
cred = curthread->td_ucred; /* XXX */
pages = ap->a_m;
count = ap->a_count;
rtvals = ap->a_rtvals;
npages = btoc(count);
offset = IDX_TO_OFF(pages[0]->pindex);
if (!fsess_opt_mmap(vnode_mount(vp))) {
SDT_PROBE2(fusefs, , vnops, trace, 1,
"called on non-cacheable vnode??\n");
}
for (i = 0; i < npages; i++)
rtvals[i] = VM_PAGER_AGAIN;
/*
* When putting pages, do not extend file past EOF.
*/
if (offset + count > fsize) {
count = fsize - offset;
if (count < 0)
count = 0;
}
/*
* We use only the kva address for the buffer, but this is extremely
* convenient and fast.
*/
bp = uma_zalloc(fuse_pbuf_zone, M_WAITOK);
kva = (vm_offset_t)bp->b_data;
pmap_qenter(kva, pages, npages);
VM_CNT_INC(v_vnodeout);
VM_CNT_ADD(v_vnodepgsout, count);
iov.iov_base = (caddr_t)kva;
iov.iov_len = count;
uio.uio_iov = &iov;
uio.uio_iovcnt = 1;
uio.uio_offset = offset;
uio.uio_resid = count;
uio.uio_segflg = UIO_SYSSPACE;
uio.uio_rw = UIO_WRITE;
uio.uio_td = td;
error = fuse_io_dispatch(vp, &uio, IO_DIRECT, true, cred, pid);
pmap_qremove(kva, npages);
uma_zfree(fuse_pbuf_zone, bp);
if (!error) {
int nwritten = round_page(count - uio.uio_resid) / PAGE_SIZE;
for (i = 0; i < nwritten; i++) {
rtvals[i] = VM_PAGER_OK;
VM_OBJECT_WLOCK(pages[i]->object);
vm_page_undirty(pages[i]);
VM_OBJECT_WUNLOCK(pages[i]->object);
}
}
return rtvals[0];
return (vfs_bio_getpages(vp, ap->a_m, ap->a_count, ap->a_rbehind,
ap->a_rahead, fuse_gbp_getblkno, fuse_gbp_getblksz));
}
static const char extattr_namespace_separator = '.';

File: tests/sys/fs/fusefs/read.cc

@@ -413,7 +413,8 @@ TEST_F(Read, eio)
/*
* If the server returns a short read when direct io is not in use, that
* indicates EOF and we should update the file size.
* indicates EOF, because of a server-side truncation. We should invalidate
* all cached attributes. We may update the file size.
*/
TEST_F(ReadCacheable, eof)
{
@@ -425,18 +426,21 @@ TEST_F(ReadCacheable, eof)
uint64_t offset = 100;
ssize_t bufsize = strlen(CONTENTS);
ssize_t partbufsize = 3 * bufsize / 4;
ssize_t r;
char buf[bufsize];
struct stat sb;
expect_lookup(RELPATH, ino, offset + bufsize);
expect_open(ino, 0, 1);
expect_read(ino, 0, offset + bufsize, offset + partbufsize, CONTENTS);
expect_getattr(ino, offset + partbufsize);
fd = open(FULLPATH, O_RDONLY);
ASSERT_LE(0, fd) << strerror(errno);
ASSERT_EQ(partbufsize, pread(fd, buf, bufsize, offset))
<< strerror(errno);
r = pread(fd, buf, bufsize, offset);
ASSERT_LE(0, r) << strerror(errno);
EXPECT_EQ(partbufsize, r) << strerror(errno);
ASSERT_EQ(0, fstat(fd, &sb));
EXPECT_EQ((off_t)(offset + partbufsize), sb.st_size);
/* Deliberately leak fd. close(2) will be tested in release.cc */
@@ -459,6 +463,7 @@ TEST_F(ReadCacheable, eof_of_whole_buffer)
expect_open(ino, 0, 1);
expect_read(ino, 2 * m_maxbcachebuf, bufsize, bufsize, CONTENTS);
expect_read(ino, m_maxbcachebuf, m_maxbcachebuf, 0, CONTENTS);
expect_getattr(ino, m_maxbcachebuf);
fd = open(FULLPATH, O_RDONLY);
ASSERT_LE(0, fd) << strerror(errno);
@@ -586,6 +591,57 @@ TEST_F(ReadCacheable, mmap)
/* Deliberately leak fd. close(2) will be tested in release.cc */
}
/*
* A read via mmap comes up short, indicating that the file was truncated
* server-side.
*/
TEST_F(ReadCacheable, mmap_eof)
{
const char FULLPATH[] = "mountpoint/some_file.txt";
const char RELPATH[] = "some_file.txt";
const char *CONTENTS = "abcdefgh";
uint64_t ino = 42;
int fd;
ssize_t len;
size_t bufsize = strlen(CONTENTS);
struct stat sb;
void *p;
len = getpagesize();
expect_lookup(RELPATH, ino, 100000);
expect_open(ino, 0, 1);
/* mmap may legitimately try to read more data than is available */
EXPECT_CALL(*m_mock, process(
ResultOf([=](auto in) {
return (in.header.opcode == FUSE_READ &&
in.header.nodeid == ino &&
in.body.read.fh == Read::FH &&
in.body.read.offset == 0 &&
in.body.read.size >= bufsize);
}, Eq(true)),
_)
).WillOnce(Invoke(ReturnImmediate([=](auto in __unused, auto& out) {
out.header.len = sizeof(struct fuse_out_header) + bufsize;
memmove(out.body.bytes, CONTENTS, bufsize);
})));
expect_getattr(ino, bufsize);
fd = open(FULLPATH, O_RDONLY);
ASSERT_LE(0, fd) << strerror(errno);
p = mmap(NULL, len, PROT_READ, MAP_SHARED, fd, 0);
ASSERT_NE(MAP_FAILED, p) << strerror(errno);
/* The file size should be automatically truncated */
ASSERT_EQ(0, memcmp(p, CONTENTS, bufsize));
ASSERT_EQ(0, fstat(fd, &sb)) << strerror(errno);
EXPECT_EQ((off_t)bufsize, sb.st_size);
ASSERT_EQ(0, munmap(p, len)) << strerror(errno);
/* Deliberately leak fd. close(2) will be tested in release.cc */
}
/*
* Just as when FOPEN_DIRECT_IO is used, reads with O_DIRECT should bypass
* cache and go straight to the daemon

File: tests/sys/fs/fusefs/write.cc

@@ -528,6 +528,39 @@ TEST_F(Write, rlimit_fsize)
/* Deliberately leak fd. close(2) will be tested in release.cc */
}
/*
* A short read indicates EOF. Test that nothing bad happens if we get EOF
* during the R of a RMW operation.
*/
TEST_F(WriteCacheable, eof_during_rmw)
{
const char FULLPATH[] = "mountpoint/some_file.txt";
const char RELPATH[] = "some_file.txt";
const char *CONTENTS = "abcdefgh";
const char *INITIAL = "XXXXXXXXXX";
uint64_t ino = 42;
uint64_t offset = 1;
ssize_t bufsize = strlen(CONTENTS);
off_t orig_fsize = 10;
off_t truncated_fsize = 5;
off_t final_fsize = bufsize;
int fd;
FuseTest::expect_lookup(RELPATH, ino, S_IFREG | 0644, orig_fsize, 1);
expect_open(ino, 0, 1);
expect_read(ino, 0, orig_fsize, truncated_fsize, INITIAL, O_RDWR);
expect_getattr(ino, truncated_fsize);
expect_read(ino, 0, final_fsize, final_fsize, INITIAL, O_RDWR);
maybe_expect_write(ino, offset, bufsize, CONTENTS);
fd = open(FULLPATH, O_RDWR);
EXPECT_LE(0, fd) << strerror(errno);
ASSERT_EQ(bufsize, pwrite(fd, CONTENTS, bufsize, offset))
<< strerror(errno);
/* Deliberately leak fd. close(2) will be tested in release.cc */
}
/*
* If the kernel cannot be sure which uid, gid, or pid was responsible for a
* write, then it must set the FUSE_WRITE_CACHE bit