fusefs: WIP fixing writeback cacheing

The current "writeback" cache mode, selected by the
vfs.fusefs.data_cache_mode sysctl, doesn't do writeback cacheing at all.  It
merely goes through the motions of using buf(9), but then writes every
buffer synchronously.  This commit:

* Enables delayed writes when the sysctl is set to writeback cacheing
* Fixes a cache-coherency problem when extending a file whose last page has
  just been written.
* Removes the "sync" mount option, which had been set unconditionally.
* Adjusts some SDT probes
* Adds several new tests that mimic what fsx does but with more control and
  without a real file system.  As I discover failures with fsx, I add
  regression tests to this file.
* Adds a test that ensures we can append to a file without reading any data
  from it.

This change is still incomplete.  Clustered writing is not yet supported,
and there are frequent "panic: vm_fault_hold: fault on nofault entry" panics
that I need to fix.

Sponsored by:	The FreeBSD Foundation
This commit is contained in:
asomers 2019-06-11 16:32:33 +00:00
parent 908ce3e65c
commit d6a303386f
5 changed files with 374 additions and 17 deletions

View File

@ -84,6 +84,7 @@ __FBSDID("$FreeBSD$");
#include <sys/bio.h>
#include <sys/buf.h>
#include <sys/sysctl.h>
#include <sys/vmmeter.h>
#include <vm/vm.h>
#include <vm/vm_extern.h>
@ -265,9 +266,10 @@ out:
return (err);
}
SDT_PROBE_DEFINE3(fusefs, , io, read_bio_backend_start, "int", "int", "int");
SDT_PROBE_DEFINE4(fusefs, , io, read_bio_backend_start, "int", "int", "int", "int");
SDT_PROBE_DEFINE2(fusefs, , io, read_bio_backend_feed, "int", "int");
SDT_PROBE_DEFINE3(fusefs, , io, read_bio_backend_end, "int", "ssize_t", "int");
SDT_PROBE_DEFINE4(fusefs, , io, read_bio_backend_end, "int", "ssize_t", "int",
"struct buf*");
static int
fuse_read_biobackend(struct vnode *vp, struct uio *uio, int ioflag,
struct ucred *cred, struct fuse_filehandle *fufh, pid_t pid)
@ -297,9 +299,6 @@ fuse_read_biobackend(struct vnode *vp, struct uio *uio, int ioflag,
lbn = uio->uio_offset / biosize;
on = uio->uio_offset & (biosize - 1);
SDT_PROBE3(fusefs, , io, read_bio_backend_start,
biosize, (int)lbn, on);
if ((off_t)lbn * biosize >= filesize) {
bcount = 0;
} else if ((off_t)(lbn + 1) * biosize > filesize) {
@ -308,6 +307,9 @@ fuse_read_biobackend(struct vnode *vp, struct uio *uio, int ioflag,
bcount = biosize;
}
SDT_PROBE4(fusefs, , io, read_bio_backend_start,
biosize, (int)lbn, on, bcount);
/* TODO: readahead. See ext2_read for an example */
err = bread(vp, lbn, bcount, NOCRED, &bp);
if (err) {
@ -333,8 +335,8 @@ fuse_read_biobackend(struct vnode *vp, struct uio *uio, int ioflag,
err = uiomove(bp->b_data + on, n, uio);
}
vfs_bio_brelse(bp, ioflag);
SDT_PROBE3(fusefs, , io, read_bio_backend_end, err,
uio->uio_resid, n);
SDT_PROBE4(fusefs, , io, read_bio_backend_end, err,
uio->uio_resid, n, bp);
}
return (err);
@ -564,6 +566,7 @@ fuse_write_biobackend(struct vnode *vp, struct uio *uio,
off_t filesize;
int bcount;
int n, on, err = 0;
bool last_page;
const int biosize = fuse_iosize(vp);
@ -612,6 +615,11 @@ again:
extending = true;
bcount = on + n;
}
if (howmany(((off_t)lbn * biosize + on + n - 1), PAGE_SIZE) >=
howmany(filesize, PAGE_SIZE))
last_page = true;
else
last_page = false;
if (direct_append) {
/*
* Take care to preserve the buffer's B_CACHE state so
@ -642,6 +650,8 @@ again:
break;
}
}
if (biosize > bcount)
vfs_bio_clrbuf(bp);
SDT_PROBE6(fusefs, , io, write_biobackend_start,
lbn, on, n, uio, bcount, direct_append);
@ -689,7 +699,6 @@ again:
* If the chopping creates a reverse-indexed or degenerate
* situation with dirtyoff/end, we 0 both of them.
*/
if (bp->b_dirtyend > bcount) {
SDT_PROBE2(fusefs, , io, write_biobackend_append_race,
(long)bp->b_blkno * biosize,
@ -738,6 +747,7 @@ again:
bp->b_error = err;
brelse(bp);
break;
/* TODO: vfs_bio_clrbuf like ffs_write does? */
}
/*
* Only update dirtyoff/dirtyend if not a degenerate
@ -753,7 +763,42 @@ again:
}
vfs_bio_set_valid(bp, on, n);
}
err = bwrite(bp);
vfs_bio_set_flags(bp, ioflag);
if (last_page) {
/*
* When writing the last page of a file we must write
* synchronously. If we didn't, then a subsequent
* operation could extend the file, making the last
* page of this buffer invalid because it would only be
* partially cached.
*
* As an optimization, it would be allowable to only
* write the last page synchronously. Or, it should be
* possible to synchronously flush the last
* already-written page whenever extending a file with
* ftruncate or another write.
*/
err = bwrite(bp);
} else if (ioflag & IO_SYNC) {
err = bwrite(bp);
} else if (vm_page_count_severe() ||
buf_dirty_count_severe() ||
(ioflag & IO_ASYNC)) {
/* TODO: enable write clustering later */
bawrite(bp);
} else if (on == 0 && n == bcount) {
if ((vp->v_mount->mnt_flag & MNT_NOCLUSTERW) == 0)
bdwrite(bp);
else
bawrite(bp);
} else if (ioflag & IO_DIRECT) {
bawrite(bp);
} else {
bp->b_flags &= ~B_CLUSTEROK;
bdwrite(bp);
}
if (err)
break;
} while (uio->uio_resid > 0 && n > 0);
@ -819,7 +864,7 @@ fuse_io_strategy(struct vnode *vp, struct buf *bp)
io.iov_base = bp->b_data;
uiop->uio_rw = UIO_READ;
uiop->uio_offset = ((off_t)bp->b_blkno) * biosize;
uiop->uio_offset = ((off_t)bp->b_lblkno) * biosize;
error = fuse_read_directbackend(vp, uiop, cred, fufh);
if (!error && uiop->uio_resid) {
@ -854,14 +899,14 @@ fuse_io_strategy(struct vnode *vp, struct buf *bp)
return (error);
}
if ((off_t)bp->b_blkno * biosize + bp->b_dirtyend > filesize)
if ((off_t)bp->b_lblkno * biosize + bp->b_dirtyend > filesize)
bp->b_dirtyend = filesize -
(off_t)bp->b_blkno * biosize;
(off_t)bp->b_lblkno * biosize;
if (bp->b_dirtyend > bp->b_dirtyoff) {
io.iov_len = uiop->uio_resid = bp->b_dirtyend
- bp->b_dirtyoff;
uiop->uio_offset = (off_t)bp->b_blkno * biosize
uiop->uio_offset = (off_t)bp->b_lblkno * biosize
+ bp->b_dirtyoff;
io.iov_base = (char *)bp->b_data + bp->b_dirtyoff;
uiop->uio_rw = UIO_WRITE;

View File

@ -314,9 +314,6 @@ fuse_vfsop_mount(struct mount *mp)
__mntopts = 0;
td = curthread;
MNT_ILOCK(mp);
mp->mnt_flag |= MNT_SYNCHRONOUS;
MNT_IUNLOCK(mp);
/* Get the new options passed to mount */
opts = mp->mnt_optnew;

View File

@ -21,6 +21,7 @@ GTESTS+= fsync
GTESTS+= fsyncdir
GTESTS+= getattr
GTESTS+= interrupt
GTESTS+= io
GTESTS+= link
GTESTS+= locks
GTESTS+= lookup

276
tests/sys/fs/fusefs/io.cc Normal file
View File

@ -0,0 +1,276 @@
/*-
* SPDX-License-Identifier: BSD-2-Clause-FreeBSD
*
* Copyright (c) 2019 The FreeBSD Foundation
*
* This software was developed by BFF Storage Systems, LLC under sponsorship
* from the FreeBSD Foundation.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*/
extern "C" {
#include <fcntl.h>
#include <stdlib.h>
#include <unistd.h>
}
#include "mockfs.hh"
#include "utils.hh"
/*
* For testing I/O like fsx does, but deterministically and without a real
* underlying file system
*
* TODO: after fusefs gains the options to select cache mode for each mount
* point, run each of these tests for all cache modes.
*/
using namespace testing;
const char FULLPATH[] = "mountpoint/some_file.txt";
const char RELPATH[] = "some_file.txt";
const uint64_t ino = 42;
class Io: public FuseTest {
public:
int m_backing_fd, m_control_fd, m_test_fd;
Io(): m_backing_fd(-1), m_control_fd(-1) {};
void SetUp()
{
m_backing_fd = open("backing_file", O_RDWR | O_CREAT | O_TRUNC);
if (m_backing_fd < 0)
FAIL() << strerror(errno);
m_control_fd = open("control", O_RDWR | O_CREAT | O_TRUNC);
if (m_control_fd < 0)
FAIL() << strerror(errno);
srandom(22'9'1982); // Seed with my birthday
FuseTest::SetUp();
if (IsSkipped())
return;
expect_lookup(RELPATH, ino, S_IFREG | 0644, 0, 1);
expect_open(ino, 0, 1);
EXPECT_CALL(*m_mock, process(
ResultOf([=](auto in) {
return (in.header.opcode == FUSE_WRITE &&
in.header.nodeid == ino);
}, Eq(true)),
_)
).WillRepeatedly(Invoke(ReturnImmediate([=](auto in, auto& out) {
const char *buf = (const char*)in.body.bytes +
sizeof(struct fuse_write_in);
ssize_t isize = in.body.write.size;
off_t iofs = in.body.write.offset;
ASSERT_EQ(isize, pwrite(m_backing_fd, buf, isize, iofs))
<< strerror(errno);
SET_OUT_HEADER_LEN(out, write);
out.body.write.size = isize;
})));
EXPECT_CALL(*m_mock, process(
ResultOf([=](auto in) {
return (in.header.opcode == FUSE_READ &&
in.header.nodeid == ino);
}, Eq(true)),
_)
).WillRepeatedly(Invoke(ReturnImmediate([=](auto in, auto& out) {
ssize_t isize = in.body.write.size;
off_t iofs = in.body.write.offset;
void *buf = out.body.bytes;
ASSERT_LE(0, pread(m_backing_fd, buf, isize, iofs))
<< strerror(errno);
out.header.len = sizeof(struct fuse_out_header) + isize;
})));
EXPECT_CALL(*m_mock, process(
ResultOf([=](auto in) {
uint32_t valid = FATTR_SIZE | FATTR_FH;
return (in.header.opcode == FUSE_SETATTR &&
in.header.nodeid == ino &&
in.body.setattr.valid == valid);
}, Eq(true)),
_)
).WillRepeatedly(Invoke(ReturnImmediate([=](auto in, auto& out) {
ASSERT_EQ(0, ftruncate(m_backing_fd, in.body.setattr.size))
<< strerror(errno);
SET_OUT_HEADER_LEN(out, attr);
out.body.attr.attr.ino = ino;
out.body.attr.attr.mode = S_IFREG | 0755;
out.body.attr.attr.size = in.body.setattr.size;
out.body.attr.attr_valid = UINT64_MAX;
})));
/* Any test that close()s will send FUSE_FLUSH and FUSE_RELEASE */
EXPECT_CALL(*m_mock, process(
ResultOf([=](auto in) {
return (in.header.opcode == FUSE_FLUSH &&
in.header.nodeid == ino);
}, Eq(true)),
_)
).WillRepeatedly(Invoke(ReturnErrno(0)));
EXPECT_CALL(*m_mock, process(
ResultOf([=](auto in) {
return (in.header.opcode == FUSE_RELEASE &&
in.header.nodeid == ino);
}, Eq(true)),
_)
).WillRepeatedly(Invoke(ReturnErrno(0)));
m_test_fd = open(FULLPATH, O_RDWR );
EXPECT_LE(0, m_test_fd) << strerror(errno);
}
void TearDown()
{
if (m_backing_fd >= 0)
close(m_backing_fd);
if (m_control_fd >= 0)
close(m_control_fd);
FuseTest::TearDown();
/* Deliberately leak test_fd */
}
void do_ftruncate(off_t offs)
{
ASSERT_EQ(0, ftruncate(m_test_fd, offs)) << strerror(errno);
ASSERT_EQ(0, ftruncate(m_control_fd, offs)) << strerror(errno);
}
void do_read(ssize_t size, off_t offs)
{
void *test_buf, *control_buf;
test_buf = malloc(size);
ASSERT_NE(NULL, test_buf) << strerror(errno);
control_buf = malloc(size);
ASSERT_NE(NULL, control_buf) << strerror(errno);
ASSERT_EQ(size, pread(m_test_fd, test_buf, size, offs))
<< strerror(errno);
ASSERT_EQ(size, pread(m_control_fd, control_buf, size, offs))
<< strerror(errno);
ASSERT_EQ(0, memcmp(test_buf, control_buf, size));
free(control_buf);
free(test_buf);
}
void do_write(ssize_t size, off_t offs)
{
char *buf;
long i;
buf = (char*)malloc(size);
ASSERT_NE(NULL, buf) << strerror(errno);
for (i=0; i < size; i++)
buf[i] = random();
ASSERT_EQ(size, pwrite(m_test_fd, buf, size, offs ))
<< strerror(errno);
ASSERT_EQ(size, pwrite(m_control_fd, buf, size, offs))
<< strerror(errno);
}
};
/*
* Extend a file with dirty data in the last page of the last block.
*
* fsx -WR -P /tmp -S8 -N3 fsx.bin
*/
TEST_F(Io, extend_from_dirty_page)
{
off_t wofs = 0x21a0;
ssize_t wsize = 0xf0a8;
off_t rofs = 0xb284;
ssize_t rsize = 0x9b22;
off_t truncsize = 0x28702;
do_write(wsize, wofs);
do_ftruncate(truncsize);
do_read(rsize, rofs);
}
/*
* When writing the last page of a file, it must be written synchronously.
* Otherwise the cached page can become invalid by a subsequent extend
* operation.
*
* fsx -WR -P /tmp -S642 -N3 fsx.bin
*/
TEST_F(Io, last_page)
{
off_t wofs0 = 0x1134f;
ssize_t wsize0 = 0xcc77;
off_t wofs1 = 0x2096a;
ssize_t wsize1 = 0xdfa7;
off_t rofs = 0x1a3aa;
ssize_t rsize = 0xb5b7;
do_write(wsize0, wofs0);
do_write(wsize1, wofs1);
do_read(rsize, rofs);
}
/*
* Read a hole from a block that contains some cached data.
*
* fsx -WR -P /tmp -S55 fsx.bin
*/
TEST_F(Io, read_hole_from_cached_block)
{
off_t wofs = 0x160c5;
ssize_t wsize = 0xa996;
off_t rofs = 0x472e;
ssize_t rsize = 0xd8d5;
do_write(wsize, wofs);
do_read(rsize, rofs);
}
/*
* Reliable panic; I don't yet know why.
* Disabled because it panics.
*
* fsx -WR -P /tmp -S839 -d -N6 fsx.bin
*/
TEST_F(Io, DISABLED_fault_on_nofault_entry)
{
off_t wofs0 = 0x3bad7;
ssize_t wsize0 = 0x4529;
off_t wofs1 = 0xc30d;
ssize_t wsize1 = 0x5f77;
off_t truncsize0 = 0x10916;
off_t rofs = 0xdf17;
ssize_t rsize = 0x29ff;
off_t truncsize1 = 0x152b4;
do_write(wsize0, wofs0);
do_write(wsize1, wofs1);
do_ftruncate(truncsize0);
do_read(rsize, rofs);
do_ftruncate(truncsize1);
close(m_test_fd);
}

View File

@ -249,6 +249,44 @@ TEST_F(Write, append)
/* Deliberately leak fd. close(2) will be tested in release.cc */
}
/* If a file is cached, then appending to the end should not cause a read */
TEST_F(Write, append_to_cached)
{
const ssize_t BUFSIZE = 9;
const char FULLPATH[] = "mountpoint/some_file.txt";
const char RELPATH[] = "some_file.txt";
char *oldcontents, *oldbuf;
const char CONTENTS[BUFSIZE] = "abcdefgh";
uint64_t ino = 42;
/*
* Set offset in between maxbcachebuf boundary to test buffer handling
*/
uint64_t oldsize = m_maxbcachebuf / 2;
int fd;
oldcontents = (char*)calloc(1, oldsize);
ASSERT_NE(NULL, oldcontents) << strerror(errno);
oldbuf = (char*)malloc(oldsize);
ASSERT_NE(NULL, oldbuf) << strerror(errno);
expect_lookup(RELPATH, ino, oldsize);
expect_open(ino, 0, 1);
expect_read(ino, 0, oldsize, oldsize, oldcontents);
expect_write(ino, oldsize, BUFSIZE, BUFSIZE, CONTENTS);
/* Must open O_RDWR or fuse(4) implicitly sets direct_io */
fd = open(FULLPATH, O_RDWR | O_APPEND);
EXPECT_LE(0, fd) << strerror(errno);
/* Read the old data into the cache */
ASSERT_EQ((ssize_t)oldsize, read(fd, oldbuf, oldsize))
<< strerror(errno);
/* Write the new data. There should be no more read operations */
ASSERT_EQ(BUFSIZE, write(fd, CONTENTS, BUFSIZE)) << strerror(errno);
/* Deliberately leak fd. close(2) will be tested in release.cc */
}
TEST_F(Write, append_direct_io)
{
const ssize_t BUFSIZE = 9;
@ -789,7 +827,7 @@ TEST_F(WriteThrough, DISABLED_writethrough)
EXPECT_LE(0, fd) << strerror(errno);
ASSERT_EQ(bufsize, write(fd, CONTENTS, bufsize)) << strerror(errno);
/*
/*
* A subsequent read should be serviced by cache, without querying the
* filesystem daemon
*/