fusefs: fix panic when writing with O_DIRECT and using writeback cache

When a fusefs file system is mounted using the writeback cache, the cache
may still be bypassed by opening a file with O_DIRECT.  When writing with
O_DIRECT, the cache must be invalidated for the affected portion of the
file.  Fix some panics caused by inadvertently invalidating too much.

Sponsored by:	The FreeBSD Foundation
This commit is contained in:
asomers 2019-07-28 15:17:32 +00:00
parent 90daad7031
commit 10eed53afb
2 changed files with 184 additions and 3 deletions

View File

@ -119,9 +119,11 @@ SDT_PROVIDER_DECLARE(fusefs);
*/
SDT_PROBE_DEFINE2(fusefs, , io, trace, "int", "char*");
static int
fuse_inval_buf_range(struct vnode *vp, off_t filesize, off_t start, off_t end);
static void
fuse_io_clear_suid_on_write(struct vnode *vp, struct ucred *cred,
struct thread *td);
struct thread *td);
static int
fuse_read_directbackend(struct vnode *vp, struct uio *uio,
struct ucred *cred, struct fuse_filehandle *fufh);
@ -136,6 +138,58 @@ static int
fuse_write_biobackend(struct vnode *vp, struct uio *uio,
struct ucred *cred, struct fuse_filehandle *fufh, int ioflag, pid_t pid);
/* Invalidate a range of cached data, whether dirty of not */
static int
fuse_inval_buf_range(struct vnode *vp, off_t filesize, off_t start, off_t end)
{
struct buf *bp;
daddr_t left_lbn, end_lbn, right_lbn;
off_t new_filesize;
int iosize, left_on, right_on, right_blksize;
iosize = fuse_iosize(vp);
left_lbn = start / iosize;
end_lbn = howmany(end, iosize);
left_on = start & (iosize - 1);
if (left_on != 0) {
bp = getblk(vp, left_lbn, iosize, PCATCH, 0, 0);
if ((bp->b_flags & B_CACHE) != 0 && bp->b_dirtyend >= left_on) {
/*
* Flush the dirty buffer, because we don't have a
* byte-granular way to record which parts of the
* buffer are valid.
*/
bwrite(bp);
if (bp->b_error)
return (bp->b_error);
} else {
brelse(bp);
}
}
right_on = end & (iosize - 1);
if (right_on != 0) {
right_lbn = end / iosize;
new_filesize = MAX(filesize, end);
right_blksize = MIN(iosize, new_filesize - iosize * right_lbn);
bp = getblk(vp, right_lbn, right_blksize, PCATCH, 0, 0);
if ((bp->b_flags & B_CACHE) != 0 && bp->b_dirtyoff < right_on) {
/*
* Flush the dirty buffer, because we don't have a
* byte-granular way to record which parts of the
* buffer are valid.
*/
bwrite(bp);
if (bp->b_error)
return (bp->b_error);
} else {
brelse(bp);
}
}
v_inval_buf_range(vp, left_lbn, end_lbn, iosize);
return (0);
}
/*
* FreeBSD clears the SUID and SGID bits on any write by a non-root user.
*/
@ -236,7 +290,6 @@ fuse_io_dispatch(struct vnode *vp, struct uio *uio, int ioflag,
case UIO_WRITE:
fuse_vnode_update(vp, FN_MTIMECHANGE | FN_CTIMECHANGE);
if (directio) {
const int iosize = fuse_iosize(vp);
off_t start, end, filesize;
SDT_PROBE2(fusefs, , io, trace, 1,
@ -252,7 +305,9 @@ fuse_io_dispatch(struct vnode *vp, struct uio *uio, int ioflag,
(IO_VMIO | IO_DIRECT),
("IO_DIRECT used for a cache flush?"));
/* Invalidate the write cache when writing directly */
v_inval_buf_range(vp, start, end, iosize);
err = fuse_inval_buf_range(vp, filesize, start, end);
if (err)
return (err);
err = fuse_write_directbackend(vp, uio, cred, fufh,
filesize, ioflag, false);
} else {

View File

@ -950,6 +950,132 @@ TEST_F(WriteBackAsync, delay)
/* Don't close the file because that would flush the cache */
}
/*
* A direct write should not evict dirty cached data from outside of its own
* byte range.
*/
TEST_F(WriteBackAsync, direct_io_ignores_unrelated_cached)
{
const char FULLPATH[] = "mountpoint/some_file.txt";
const char RELPATH[] = "some_file.txt";
const char CONTENTS0[] = "abcdefgh";
const char CONTENTS1[] = "ijklmnop";
uint64_t ino = 42;
int fd;
ssize_t bufsize = strlen(CONTENTS0) + 1;
ssize_t fsize = 2 * m_maxbcachebuf;
char readbuf[bufsize];
void *zeros;
zeros = calloc(1, m_maxbcachebuf);
ASSERT_NE(nullptr, zeros);
expect_lookup(RELPATH, ino, fsize);
expect_open(ino, 0, 1);
expect_read(ino, 0, m_maxbcachebuf, m_maxbcachebuf, zeros);
FuseTest::expect_write(ino, m_maxbcachebuf, bufsize, bufsize, 0, 0,
CONTENTS1);
fd = open(FULLPATH, O_RDWR);
EXPECT_LE(0, fd) << strerror(errno);
// Cache first block with dirty data. This will entail first reading
// the existing data.
ASSERT_EQ(bufsize, pwrite(fd, CONTENTS0, bufsize, 0))
<< strerror(errno);
// Write directly to second block
ASSERT_EQ(0, fcntl(fd, F_SETFL, O_DIRECT)) << strerror(errno);
ASSERT_EQ(bufsize, pwrite(fd, CONTENTS1, bufsize, m_maxbcachebuf))
<< strerror(errno);
// Read from the first block again. Should be serviced by cache.
ASSERT_EQ(0, fcntl(fd, F_SETFL, 0)) << strerror(errno);
ASSERT_EQ(bufsize, pread(fd, readbuf, bufsize, 0)) << strerror(errno);
ASSERT_STREQ(readbuf, CONTENTS0);
leak(fd);
free(zeros);
}
/*
* If a direct io write partially overlaps one or two blocks of dirty cached
* data, No dirty data should be lost. Admittedly this is a weird test,
* because it would be unusual to use O_DIRECT and the writeback cache.
*/
TEST_F(WriteBackAsync, direct_io_partially_overlaps_cached_block)
{
const char FULLPATH[] = "mountpoint/some_file.txt";
const char RELPATH[] = "some_file.txt";
uint64_t ino = 42;
int fd;
off_t bs = m_maxbcachebuf;
ssize_t fsize = 3 * bs;
void *readbuf, *zeros, *ones, *zeroones, *onezeros;
readbuf = malloc(bs);
ASSERT_NE(nullptr, readbuf) << strerror(errno);
zeros = calloc(1, 3 * bs);
ASSERT_NE(nullptr, zeros);
ones = calloc(1, 2 * bs);
ASSERT_NE(nullptr, ones);
memset(ones, 1, 2 * bs);
zeroones = calloc(1, bs);
ASSERT_NE(nullptr, zeroones);
memset((uint8_t*)zeroones + bs / 2, 1, bs / 2);
onezeros = calloc(1, bs);
ASSERT_NE(nullptr, onezeros);
memset(onezeros, 1, bs / 2);
expect_lookup(RELPATH, ino, fsize);
expect_open(ino, 0, 1);
fd = open(FULLPATH, O_RDWR);
EXPECT_LE(0, fd) << strerror(errno);
/* Cache first and third blocks with dirty data. */
ASSERT_EQ(3 * bs, pwrite(fd, zeros, 3 * bs, 0)) << strerror(errno);
/*
* Write directly to all three blocks. The partially written blocks
* will be flushed because they're dirty.
*/
FuseTest::expect_write(ino, 0, bs, bs, 0, 0, zeros);
FuseTest::expect_write(ino, 2 * bs, bs, bs, 0, 0, zeros);
/* The direct write is split in two because of the m_maxwrite value */
FuseTest::expect_write(ino, bs / 2, bs, bs, 0, 0, ones);
FuseTest::expect_write(ino, 3 * bs / 2, bs, bs, 0, 0, ones);
ASSERT_EQ(0, fcntl(fd, F_SETFL, O_DIRECT)) << strerror(errno);
ASSERT_EQ(2 * bs, pwrite(fd, ones, 2 * bs, bs / 2)) << strerror(errno);
/*
* Read from both the valid and invalid portions of the first and third
* blocks again. This will entail FUSE_READ operations because these
* blocks were invalidated by the direct write.
*/
expect_read(ino, 0, bs, bs, zeroones);
expect_read(ino, 2 * bs, bs, bs, onezeros);
ASSERT_EQ(0, fcntl(fd, F_SETFL, 0)) << strerror(errno);
ASSERT_EQ(bs / 2, pread(fd, readbuf, bs / 2, 0)) << strerror(errno);
EXPECT_EQ(0, memcmp(zeros, readbuf, bs / 2));
ASSERT_EQ(bs / 2, pread(fd, readbuf, bs / 2, 5 * bs / 2))
<< strerror(errno);
EXPECT_EQ(0, memcmp(zeros, readbuf, bs / 2));
ASSERT_EQ(bs / 2, pread(fd, readbuf, bs / 2, bs / 2))
<< strerror(errno);
EXPECT_EQ(0, memcmp(ones, readbuf, bs / 2));
ASSERT_EQ(bs / 2, pread(fd, readbuf, bs / 2, 2 * bs))
<< strerror(errno);
EXPECT_EQ(0, memcmp(ones, readbuf, bs / 2));
leak(fd);
free(zeroones);
free(onezeros);
free(ones);
free(zeros);
free(readbuf);
}
/*
* In WriteBack mode, writes may be cached beyond what the server thinks is the
* EOF. In this case, a short read at EOF should _not_ cause fusefs to update