fusefs: correctly handle short writes

If a FUSE daemon returns FOPEN_DIRECT_IO when a file is opened, then it's
allowed to write less data than was requested during a FUSE_WRITE operation
on that file handle.  fusefs should simply return a short write to userland.

The old code attempted to resend the unsent data.  Not only was that
incorrect behavior, but it did it in an ineffective way, by attempting to
"rewind" the uio and uiomove the unsent data again.

This commit correctly handles short writes by returning directly to
userland if FOPEN_DIRECT_IO was set.  If it wasn't set (making the short
write technically a protocol violation), then we resend the unsent data.
But instead of rewinding the uio, just resend the data that's already in the
kernel.

That necessitated a few changes to fuse_ipc.c to reduce the amount of bzero
activity.  fusefs may be marginally faster as a result.

PR:		236381
Sponsored by:	The FreeBSD Foundation
This commit is contained in:
Alan Somers 2019-04-04 16:51:34 +00:00
parent 35cf0e7e56
commit 12292a99ac
Notes: svn2git 2020-12-20 02:59:44 +00:00
svn path=/projects/fuse2/; revision=345876
7 changed files with 182 additions and 52 deletions

View File

@ -282,7 +282,7 @@ fuse_internal_fsync(struct vnode *vp,
int waitfor,
bool datasync)
{
struct fuse_fsync_in *ffsi;
struct fuse_fsync_in *ffsi = NULL;
struct fuse_dispatcher fdi;
struct fuse_filehandle *fufh;
struct fuse_vnode_data *fvdat = VTOFUD(vp);
@ -295,15 +295,20 @@ fuse_internal_fsync(struct vnode *vp,
}
if (vnode_isdir(vp))
op = FUSE_FSYNCDIR;
fdisp_init(&fdi, sizeof(*ffsi));
/*
* fsync every open file handle for this file, because we can't be sure
* which file handle the caller is really referring to.
*/
LIST_FOREACH(fufh, &fvdat->handles, next) {
fdisp_init(&fdi, sizeof(*ffsi));
fdisp_make_vp(&fdi, op, vp, td, NULL);
if (ffsi == NULL)
fdisp_make_vp(&fdi, op, vp, td, NULL);
else
fdisp_refresh_vp(&fdi, op, vp, td, NULL);
ffsi = fdi.indata;
ffsi->fh = fufh->fh_id;
ffsi->fsync_flags = 0;
if (datasync)
ffsi->fsync_flags = 1;
@ -315,8 +320,8 @@ fuse_internal_fsync(struct vnode *vp,
fuse_internal_fsync_callback);
fuse_insert_message(fdi.tick);
}
fdisp_destroy(&fdi);
}
fdisp_destroy(&fdi);
return err;
}
@ -331,7 +336,7 @@ fuse_internal_readdir(struct vnode *vp,
{
int err = 0;
struct fuse_dispatcher fdi;
struct fuse_read_in *fri;
struct fuse_read_in *fri = NULL;
if (uio_resid(uio) == 0) {
return 0;
@ -344,9 +349,11 @@ fuse_internal_readdir(struct vnode *vp,
*/
while (uio_resid(uio) > 0) {
fdi.iosize = sizeof(*fri);
fdisp_make_vp(&fdi, FUSE_READDIR, vp, NULL, NULL);
if (fri == NULL)
fdisp_make_vp(&fdi, FUSE_READDIR, vp, NULL, NULL);
else
fdisp_refresh_vp(&fdi, FUSE_READDIR, vp, NULL, NULL);
fri = fdi.indata;
fri->fh = fufh->fh_id;
@ -419,8 +426,8 @@ fuse_internal_readdir_processdata(struct uio *uio,
err = -1;
break;
}
fiov_refresh(cookediov);
fiov_adjust(cookediov, bytesavail);
bzero(cookediov->base, bytesavail);
de = (struct dirent *)cookediov->base;
de->d_fileno = fudge->ino;

View File

@ -341,10 +341,14 @@ fuse_write_directbackend(struct vnode *vp, struct uio *uio,
{
struct fuse_vnode_data *fvdat = VTOFUD(vp);
struct fuse_write_in *fwi;
struct fuse_write_out *fwo;
struct fuse_dispatcher fdi;
size_t chunksize;
void *fwi_data;
off_t as_written_offset;
int diff;
int err = 0;
bool direct_io = fufh->fuse_open_flags & FOPEN_DIRECT_IO;
if (uio->uio_resid == 0)
return (0);
@ -364,36 +368,61 @@ fuse_write_directbackend(struct vnode *vp, struct uio *uio,
fwi->fh = fufh->fh_id;
fwi->offset = uio->uio_offset;
fwi->size = chunksize;
fwi_data = (char *)fdi.indata + sizeof(*fwi);
if ((err = uiomove((char *)fdi.indata + sizeof(*fwi),
chunksize, uio)))
if ((err = uiomove(fwi_data, chunksize, uio)))
break;
retry:
if ((err = fdisp_wait_answ(&fdi)))
break;
fwo = ((struct fuse_write_out *)fdi.answ);
/* Adjust the uio in the case of short writes */
diff = chunksize - ((struct fuse_write_out *)fdi.answ)->size;
diff = fwi->size - fwo->size;
as_written_offset = uio->uio_offset - diff;
if (as_written_offset - diff > fvdat->filesize &&
fuse_data_cache_mode != FUSE_CACHE_UC) {
fuse_vnode_setsize(vp, cred, as_written_offset);
fvdat->flag &= ~FN_SIZECHANGE;
}
if (diff < 0) {
printf("WARNING: misbehaving FUSE filesystem "
"wrote more data than we provided it\n");
err = EINVAL;
break;
} else if (diff > 0 && !(ioflag & IO_DIRECT)) {
/*
* XXX We really should be directly checking whether
* the file was opened with FOPEN_DIRECT_IO, not
* IO_DIRECT. IO_DIRECT can be set in multiple ways.
*/
SDT_PROBE2(fuse, , io, trace, 1,
"misbehaving filesystem: short writes are only "
"allowed with direct_io");
}
uio->uio_resid += diff;
uio->uio_offset -= diff;
if (uio->uio_offset > fvdat->filesize &&
fuse_data_cache_mode != FUSE_CACHE_UC) {
fuse_vnode_setsize(vp, cred, uio->uio_offset);
fvdat->flag &= ~FN_SIZECHANGE;
} else if (diff > 0) {
/* Short write */
if (!direct_io) {
printf("WARNING: misbehaving FUSE filesystem: "
"short writes are only allowed with "
"direct_io\n");
}
if (ioflag & IO_DIRECT) {
/* Return early */
uio->uio_resid += diff;
uio->uio_offset -= diff;
break;
} else {
/* Resend the unwritten portion of data */
fdi.iosize = sizeof(*fwi) + diff;
/* Refresh fdi without clearing data buffer */
fdisp_refresh_vp(&fdi, FUSE_WRITE, vp,
uio->uio_td, cred);
fwi = fdi.indata;
MPASS2(fwi == fdi.indata, "FUSE dispatcher "
"reallocated despite no increase in "
"size?");
void *src = (char*)fwi_data + fwo->size;
memmove(fwi_data, src, diff);
fwi->fh = fufh->fh_id;
fwi->offset = as_written_offset;
fwi->size = diff;
goto retry;
}
}
}

View File

@ -92,6 +92,7 @@ SDT_PROVIDER_DECLARE(fuse);
*/
SDT_PROBE_DEFINE2(fuse, , ipc, trace, "int", "char*");
static void fiov_clear(struct fuse_iov *fiov);
static struct fuse_ticket *fticket_alloc(struct fuse_data *data);
static void fticket_refresh(struct fuse_ticket *ftick);
static void fticket_destroy(struct fuse_ticket *ftick);
@ -185,11 +186,19 @@ fiov_adjust(struct fuse_iov *fiov, size_t size)
fiov->len = size;
}
/* Clear the fiov's data buffer */
static void
fiov_clear(struct fuse_iov *fiov)
{
bzero(fiov->base, fiov->len);
}
/* Resize the fiov if needed, and clear it's buffer */
void
fiov_refresh(struct fuse_iov *fiov)
{
bzero(fiov->base, fiov->len);
fiov_adjust(fiov, 0);
bzero(fiov->base, fiov->len);
}
static int
@ -267,7 +276,7 @@ fticket_destroy(struct fuse_ticket *ftick)
return uma_zfree(ticket_zone, ftick);
}
static inline
static inline
void
fticket_refresh(struct fuse_ticket *ftick)
{
@ -290,6 +299,27 @@ fticket_refresh(struct fuse_ticket *ftick)
ftick->tk_flag = 0;
}
/* Prepar the ticket to be reused, but don't clear its data buffers */
static inline void
fticket_reset(struct fuse_ticket *ftick)
{
FUSE_ASSERT_MS_DONE(ftick);
FUSE_ASSERT_AW_DONE(ftick);
ftick->tk_ms_bufdata = NULL;
ftick->tk_ms_bufsize = 0;
ftick->tk_ms_type = FT_M_FIOV;
bzero(&ftick->tk_aw_ohead, sizeof(struct fuse_out_header));
ftick->tk_aw_errno = 0;
ftick->tk_aw_bufdata = NULL;
ftick->tk_aw_bufsize = 0;
ftick->tk_aw_type = FT_A_FIOV;
ftick->tk_flag = 0;
}
static int
fticket_wait_answer(struct fuse_ticket *ftick)
{
@ -703,7 +733,26 @@ fuse_standard_handler(struct fuse_ticket *ftick, struct uio *uio)
return err;
}
void
/*
* Reinitialize a dispatcher from a pid and node id, without resizing or
* clearing its data buffers
*/
static void
fdisp_refresh_pid(struct fuse_dispatcher *fdip, enum fuse_opcode op,
struct mount *mp, uint64_t nid, pid_t pid, struct ucred *cred)
{
MPASS(fdip->tick);
fticket_reset(fdip->tick);
FUSE_DIMALLOC(&fdip->tick->tk_ms_fiov, fdip->finh,
fdip->indata, fdip->iosize);
fuse_setup_ihead(fdip->finh, fdip->tick, nid, op, fdip->iosize, pid,
cred);
}
/* Initialize a dispatcher from a pid and node id */
static void
fdisp_make_pid(struct fuse_dispatcher *fdip, enum fuse_opcode op,
struct mount *mp, uint64_t nid, pid_t pid, struct ucred *cred)
{
@ -739,6 +788,22 @@ fdisp_make_vp(struct fuse_dispatcher *fdip, enum fuse_opcode op,
td->td_proc->p_pid, cred);
}
/* Refresh a fuse_dispatcher so it can be reused, but don't zero its data */
void
fdisp_refresh_vp(struct fuse_dispatcher *fdip, enum fuse_opcode op,
struct vnode *vp, struct thread *td, struct ucred *cred)
{
RECTIFY_TDCR(td, cred);
return fdisp_refresh_pid(fdip, op, vnode_mount(vp), VTOI(vp),
td->td_proc->p_pid, cred);
}
void
fdisp_refresh(struct fuse_dispatcher *fdip)
{
fticket_refresh(fdip->tick);
}
SDT_PROBE_DEFINE2(fuse, , ipc, fdisp_wait_answ_error, "char*", "int");
int

View File

@ -374,15 +374,17 @@ fdisp_destroy(struct fuse_dispatcher *fdisp)
#endif
}
void fdisp_refresh(struct fuse_dispatcher *fdip);
void fdisp_make(struct fuse_dispatcher *fdip, enum fuse_opcode op,
struct mount *mp, uint64_t nid, struct thread *td, struct ucred *cred);
void fdisp_make_pid(struct fuse_dispatcher *fdip, enum fuse_opcode op,
struct mount *mp, uint64_t nid, pid_t pid, struct ucred *cred);
void fdisp_make_vp(struct fuse_dispatcher *fdip, enum fuse_opcode op,
struct vnode *vp, struct thread *td, struct ucred *cred);
void fdisp_refresh_vp(struct fuse_dispatcher *fdip, enum fuse_opcode op,
struct vnode *vp, struct thread *td, struct ucred *cred);
int fdisp_wait_answ(struct fuse_dispatcher *fdip);
static inline int

View File

@ -2313,9 +2313,10 @@ fuse_vnop_listextattr(struct vop_listextattr_args *ap)
/*
* Retrieve Linux / FUSE compatible list values.
*/
fdisp_make_vp(&fdi, FUSE_LISTXATTR, vp, td, cred);
fdisp_refresh_vp(&fdi, FUSE_LISTXATTR, vp, td, cred);
list_xattr_in = fdi.indata;
list_xattr_in->size = linux_list_len + sizeof(*list_xattr_out);
list_xattr_in->flags = 0;
attr_str = (char *)fdi.indata + sizeof(*list_xattr_in);
snprintf(attr_str, len, "%s%c", prefix, extattr_namespace_separator);

View File

@ -246,7 +246,8 @@ void debug_fuseop(const mockfs_buf_in *in)
printf(" %s=%s", name, value);
break;
case FUSE_WRITE:
printf(" offset=%lu size=%u flags=%u",
printf(" fh=%#lx offset=%lu size=%u flags=%u",
in->body.write.fh,
in->body.write.offset, in->body.write.size,
in->body.write.write_flags);
break;

View File

@ -270,12 +270,13 @@ TEST_F(Write, DISABLED_direct_io_evicts_cache)
/* Deliberately leak fd. close(2) will be tested in release.cc */
}
/*
* When the direct_io option is used, filesystems are allowed to write less
* data than requested
/*
* If the server doesn't return FOPEN_DIRECT_IO during FUSE_OPEN, then it's not
* allowed to return a short write for that file handle. However, if it does
* then we should still do our darndest to handle it by resending the unwritten
* portion.
*/
/* https://bugs.freebsd.org/bugzilla/show_bug.cgi?id=236381 */
TEST_F(Write, DISABLED_direct_io_short_write)
TEST_F(Write, indirect_io_short_write)
{
const char FULLPATH[] = "mountpoint/some_file.txt";
const char RELPATH[] = "some_file.txt";
@ -283,15 +284,16 @@ TEST_F(Write, DISABLED_direct_io_short_write)
uint64_t ino = 42;
int fd;
ssize_t bufsize = strlen(CONTENTS);
ssize_t halfbufsize = bufsize / 2;
const char *halfcontents = CONTENTS + halfbufsize;
ssize_t bufsize0 = 11;
ssize_t bufsize1 = strlen(CONTENTS) - bufsize0;
const char *contents1 = CONTENTS + bufsize0;
expect_lookup(RELPATH, ino, 0);
expect_open(ino, FOPEN_DIRECT_IO, 1);
expect_open(ino, 0, 1);
expect_getattr(ino, 0);
expect_write(ino, 0, bufsize, halfbufsize, 0, CONTENTS);
expect_write(ino, halfbufsize, halfbufsize, halfbufsize, 0,
halfcontents);
expect_write(ino, 0, bufsize, bufsize0, 0, CONTENTS);
expect_write(ino, bufsize0, bufsize1, bufsize1, 0,
contents1);
fd = open(FULLPATH, O_WRONLY);
EXPECT_LE(0, fd) << strerror(errno);
@ -300,20 +302,44 @@ TEST_F(Write, DISABLED_direct_io_short_write)
/* Deliberately leak fd. close(2) will be tested in release.cc */
}
/*
* When the direct_io option is used, filesystems are allowed to write less
* data than requested. We should return the short write to userland.
*/
TEST_F(Write, direct_io_short_write)
{
const char FULLPATH[] = "mountpoint/some_file.txt";
const char RELPATH[] = "some_file.txt";
const char *CONTENTS = "abcdefghijklmnop";
uint64_t ino = 42;
int fd;
ssize_t bufsize = strlen(CONTENTS);
ssize_t halfbufsize = bufsize / 2;
expect_lookup(RELPATH, ino, 0);
expect_open(ino, FOPEN_DIRECT_IO, 1);
expect_getattr(ino, 0);
expect_write(ino, 0, bufsize, halfbufsize, 0, CONTENTS);
fd = open(FULLPATH, O_WRONLY);
EXPECT_LE(0, fd) << strerror(errno);
ASSERT_EQ(halfbufsize, write(fd, CONTENTS, bufsize)) << strerror(errno);
/* Deliberately leak fd. close(2) will be tested in release.cc */
}
/*
* An insidious edge case: the filesystem returns a short write, and the
* difference between what we requested and what it actually wrote crosses an
* iov element boundary
*/
/* https://bugs.freebsd.org/bugzilla/show_bug.cgi?id=236381 */
TEST_F(Write, DISABLED_direct_io_short_write_iov)
TEST_F(Write, direct_io_short_write_iov)
{
const char FULLPATH[] = "mountpoint/some_file.txt";
const char RELPATH[] = "some_file.txt";
const char *CONTENTS0 = "abcdefgh";
const char *CONTENTS1 = "ijklmnop";
const char *EXPECTED0 = "abcdefghijklmnop";
const char *EXPECTED1 = "hijklmnop";
uint64_t ino = 42;
int fd;
ssize_t size0 = strlen(CONTENTS0) - 1;
@ -325,7 +351,6 @@ TEST_F(Write, DISABLED_direct_io_short_write_iov)
expect_open(ino, FOPEN_DIRECT_IO, 1);
expect_getattr(ino, 0);
expect_write(ino, 0, totalsize, size0, 0, EXPECTED0);
expect_write(ino, size0, size1, size1, 0, EXPECTED1);
fd = open(FULLPATH, O_WRONLY);
EXPECT_LE(0, fd) << strerror(errno);
@ -334,7 +359,7 @@ TEST_F(Write, DISABLED_direct_io_short_write_iov)
iov[0].iov_len = strlen(CONTENTS0);
iov[1].iov_base = (void*)CONTENTS1;
iov[1].iov_len = strlen(CONTENTS1);
ASSERT_EQ(totalsize, writev(fd, iov, 2)) << strerror(errno);
ASSERT_EQ(size0, writev(fd, iov, 2)) << strerror(errno);
/* Deliberately leak fd. close(2) will be tested in release.cc */
}