From a1c9f4ad0d65957158d62d04619f150159d18f4d Mon Sep 17 00:00:00 2001 From: Alan Somers Date: Thu, 20 Jun 2019 17:08:21 +0000 Subject: [PATCH] fusefs: implement VOP_BMAP If the fuse daemon supports FUSE_BMAP, then use that for the block mapping. Otherwise, use the same technique used by vop_stdbmap. Report large values for runp and runb in order to maximize read clustering and minimize upcalls, even if we don't know the true layout. The major result of this change is that sequential reads to FUSE files will now usually happen 128KB at a time instead of 64KB. Sponsored by: The FreeBSD Foundation --- sys/fs/fuse/fuse_internal.c | 3 +- sys/fs/fuse/fuse_io.c | 4 +- sys/fs/fuse/fuse_ipc.c | 4 + sys/fs/fuse/fuse_ipc.h | 2 +- sys/fs/fuse/fuse_vnops.c | 88 +++++++++++++++++++ tests/sys/fs/fusefs/Makefile | 1 + tests/sys/fs/fusefs/bmap.cc | 159 ++++++++++++++++++++++++++++++++++ tests/sys/fs/fusefs/mockfs.cc | 4 + tests/sys/fs/fusefs/mockfs.hh | 2 + tests/sys/fs/fusefs/read.cc | 58 +++---------- tests/sys/fs/fusefs/utils.cc | 14 +++ 11 files changed, 288 insertions(+), 51 deletions(-) create mode 100644 tests/sys/fs/fusefs/bmap.cc diff --git a/sys/fs/fuse/fuse_internal.c b/sys/fs/fuse/fuse_internal.c index cd24acd40355..6f727711b7c8 100644 --- a/sys/fs/fuse/fuse_internal.c +++ b/sys/fs/fuse/fuse_internal.c @@ -908,7 +908,8 @@ fuse_internal_init_callback(struct fuse_ticket *tick, struct uio *uio) if (fuse_libabi_geq(data, 7, 5)) { if (fticket_resp(tick)->len == sizeof(struct fuse_init_out)) { - data->max_readahead = fiio->max_readahead; + data->max_readahead_blocks = fiio->max_readahead / + maxbcachebuf; data->max_write = fiio->max_write; if (fiio->flags & FUSE_ASYNC_READ) data->dataflags |= FSESS_ASYNC_READ; diff --git a/sys/fs/fuse/fuse_io.c b/sys/fs/fuse/fuse_io.c index 60871175ada3..ab5e18508d05 100644 --- a/sys/fs/fuse/fuse_io.c +++ b/sys/fs/fuse/fuse_io.c @@ -321,10 +321,10 @@ fuse_read_biobackend(struct vnode *vp, struct uio *uio, int ioflag, /* Try clustered read */ long totread = uio->uio_resid + on; seqcount = MIN(seqcount, - data->max_readahead / biosize + 1); + data->max_readahead_blocks + 1); err = cluster_read(vp, filesize, lbn, bcount, NOCRED, totread, seqcount, 0, &bp); - } else if (seqcount > 1 && data->max_readahead >= nextsize) { + } else if (seqcount > 1 && data->max_readahead_blocks >= 1) { /* Try non-clustered readahead */ err = breadn(vp, lbn, bcount, &nextlbn, &nextsize, 1, NOCRED, &bp); diff --git a/sys/fs/fuse/fuse_ipc.c b/sys/fs/fuse/fuse_ipc.c index 06dd83b7e08b..b577b5e5579b 100644 --- a/sys/fs/fuse/fuse_ipc.c +++ b/sys/fs/fuse/fuse_ipc.c @@ -711,6 +711,10 @@ fuse_body_audit(struct fuse_ticket *ftick, size_t blen) opcode = fticket_opcode(ftick); switch (opcode) { + case FUSE_BMAP: + err = (blen == sizeof(struct fuse_bmap_out)) ? 0 : EINVAL; + break; + case FUSE_LINK: case FUSE_LOOKUP: case FUSE_MKDIR: diff --git a/sys/fs/fuse/fuse_ipc.h b/sys/fs/fuse/fuse_ipc.h index 440a4f671490..0f9b1180f362 100644 --- a/sys/fs/fuse/fuse_ipc.h +++ b/sys/fs/fuse/fuse_ipc.h @@ -197,7 +197,7 @@ struct fuse_data { uint32_t fuse_libabi_major; uint32_t fuse_libabi_minor; - uint32_t max_readahead; + uint32_t max_readahead_blocks; uint32_t max_write; uint32_t max_read; uint32_t subtype; diff --git a/sys/fs/fuse/fuse_vnops.c b/sys/fs/fuse/fuse_vnops.c index b1d8b5837930..4fcc53290f92 100644 --- a/sys/fs/fuse/fuse_vnops.c +++ b/sys/fs/fuse/fuse_vnops.c @@ -120,6 +120,7 @@ SDT_PROBE_DEFINE2(fusefs, , vnops, trace, "int", "char*"); /* vnode ops */ static vop_access_t fuse_vnop_access; static vop_advlock_t fuse_vnop_advlock; +static vop_bmap_t fuse_vnop_bmap; static vop_close_t fuse_fifo_close; static vop_close_t fuse_vnop_close; static vop_create_t fuse_vnop_create; @@ -174,6 +175,7 @@ struct vop_vector fuse_vnops = { .vop_default = &default_vnodeops, .vop_access = fuse_vnop_access, .vop_advlock = fuse_vnop_advlock, + .vop_bmap = fuse_vnop_bmap, .vop_close = fuse_vnop_close, .vop_create = fuse_vnop_create, .vop_deleteextattr = fuse_vnop_deleteextattr, @@ -466,6 +468,92 @@ fuse_vnop_advlock(struct vop_advlock_args *ap) return err; } +/* { + struct vnode *a_vp; + daddr_t a_bn; + struct bufobj **a_bop; + daddr_t *a_bnp; + int *a_runp; + int *a_runb; +} */ +static int +fuse_vnop_bmap(struct vop_bmap_args *ap) +{ + struct vnode *vp = ap->a_vp; + struct bufobj **bo = ap->a_bop; + struct thread *td = curthread; + struct mount *mp; + struct fuse_dispatcher fdi; + struct fuse_bmap_in *fbi; + struct fuse_bmap_out *fbo; + struct fuse_data *data; + uint64_t biosize; + off_t filesize; + daddr_t lbn = ap->a_bn; + daddr_t *pbn = ap->a_bnp; + int *runp = ap->a_runp; + int *runb = ap->a_runb; + int error = 0; + int maxrun; + + if (fuse_isdeadfs(vp)) { + return ENXIO; + } + + mp = vnode_mount(vp); + data = fuse_get_mpdata(mp); + biosize = fuse_iosize(vp); + maxrun = MIN(vp->v_mount->mnt_iosize_max / biosize - 1, + data->max_readahead_blocks); + + if (bo != NULL) + *bo = &vp->v_bufobj; + + /* + * The FUSE_BMAP operation does not include the runp and runb + * variables, so we must guess. Report nonzero contiguous runs so + * cluster_read will combine adjacent reads. It's worthwhile to reduce + * upcalls even if we don't know the true physical layout of the file. + * + * FUSE file systems may opt out of read clustering in two ways: + * * mounting with -onoclusterr + * * Setting max_readahead <= maxbcachebuf during FUSE_INIT + */ + if (runb != NULL) + *runb = MIN(lbn, maxrun); + if (runp != NULL) { + error = fuse_vnode_size(vp, &filesize, td->td_ucred, td); + if (error == 0) + *runp = MIN(MAX(0, filesize / biosize - lbn - 1), + maxrun); + else + *runp = 0; + } + + if (fsess_isimpl(mp, FUSE_BMAP)) { + fdisp_init(&fdi, sizeof(*fbi)); + fdisp_make_vp(&fdi, FUSE_BMAP, vp, td, td->td_ucred); + fbi = fdi.indata; + fbi->block = lbn; + fbi->blocksize = biosize; + error = fdisp_wait_answ(&fdi); + if (error == ENOSYS) { + fsess_set_notimpl(mp, FUSE_BMAP); + error = 0; + } else { + fbo = fdi.answ; + if (error == 0 && pbn != NULL) + *pbn = fbo->block; + return error; + } + } + + /* If the daemon doesn't support BMAP, make up a sensible default */ + if (pbn != NULL) + *pbn = lbn * btodb(biosize); + return (error); +} + /* struct vop_close_args { struct vnode *a_vp; diff --git a/tests/sys/fs/fusefs/Makefile b/tests/sys/fs/fusefs/Makefile index 8aa07933f602..1a4045e8f2f0 100644 --- a/tests/sys/fs/fusefs/Makefile +++ b/tests/sys/fs/fusefs/Makefile @@ -9,6 +9,7 @@ TESTSDIR= ${TESTSBASE}/sys/fs/fusefs # out, so we get more granular reporting. GTESTS+= access GTESTS+= allow_other +GTESTS+= bmap GTESTS+= create GTESTS+= default_permissions GTESTS+= default_permissions_privileged diff --git a/tests/sys/fs/fusefs/bmap.cc b/tests/sys/fs/fusefs/bmap.cc new file mode 100644 index 000000000000..3ab345a45377 --- /dev/null +++ b/tests/sys/fs/fusefs/bmap.cc @@ -0,0 +1,159 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * + * Copyright (c) 2019 The FreeBSD Foundation + * + * This software was developed by BFF Storage Systems, LLC under sponsorship + * from the FreeBSD Foundation. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +extern "C" { +#include +#include +#include + +#include +} + +#include "mockfs.hh" +#include "utils.hh" + +using namespace testing; + +const static char FULLPATH[] = "mountpoint/foo"; +const static char RELPATH[] = "foo"; + +class Bmap: public FuseTest { +public: +virtual void SetUp() { + m_maxreadahead = UINT32_MAX; + FuseTest::SetUp(); +} +void expect_bmap(uint64_t ino, uint64_t lbn, uint32_t blocksize, uint64_t pbn) +{ + EXPECT_CALL(*m_mock, process( + ResultOf([=](auto in) { + return (in.header.opcode == FUSE_BMAP && + in.header.nodeid == ino && + in.body.bmap.block == lbn && + in.body.bmap.blocksize == blocksize); + }, Eq(true)), + _) + ).WillOnce(Invoke(ReturnImmediate([=](auto i __unused, auto& out) { + SET_OUT_HEADER_LEN(out, bmap); + out.body.bmap.block = pbn; + }))); +} + +void expect_lookup(const char *relpath, uint64_t ino, off_t size) +{ + FuseTest::expect_lookup(relpath, ino, S_IFREG | 0644, size, 1, + UINT64_MAX); +} +}; + +/* + * Test FUSE_BMAP + * XXX The FUSE protocol does not include the runp and runb variables, so those + * must be guessed in-kernel. + */ +TEST_F(Bmap, bmap) +{ + struct fiobmap2_arg arg; + const off_t filesize = 1 << 20; + const ino_t ino = 42; + int64_t lbn = 10; + int64_t pbn = 12345; + int fd; + + expect_lookup(RELPATH, 42, filesize); + expect_open(ino, 0, 1); + expect_bmap(ino, lbn, m_maxbcachebuf, pbn); + + fd = open(FULLPATH, O_RDWR); + ASSERT_LE(0, fd) << strerror(errno); + + arg.bn = lbn; + arg.runp = -1; + arg.runb = -1; + ASSERT_EQ(0, ioctl(fd, FIOBMAP2, &arg)) << strerror(errno); + EXPECT_EQ(arg.bn, pbn); + EXPECT_EQ(arg.runp, MAXPHYS / m_maxbcachebuf - 1); + EXPECT_EQ(arg.runb, MAXPHYS / m_maxbcachebuf - 1); +} + +/* + * If the daemon does not implement VOP_BMAP, fusefs should return sensible + * defaults. + */ +TEST_F(Bmap, default_) +{ + struct fiobmap2_arg arg; + const off_t filesize = 1 << 20; + const ino_t ino = 42; + int64_t lbn; + int fd; + + expect_lookup(RELPATH, 42, filesize); + expect_open(ino, 0, 1); + EXPECT_CALL(*m_mock, process( + ResultOf([=](auto in) { + return (in.header.opcode == FUSE_BMAP); + }, Eq(true)), + _) + ).WillOnce(Invoke(ReturnErrno(ENOSYS))); + + fd = open(FULLPATH, O_RDWR); + ASSERT_LE(0, fd) << strerror(errno); + + /* First block */ + lbn = 0; + arg.bn = lbn; + arg.runp = -1; + arg.runb = -1; + ASSERT_EQ(0, ioctl(fd, FIOBMAP2, &arg)) << strerror(errno); + EXPECT_EQ(arg.bn, 0); + EXPECT_EQ(arg.runp, MAXPHYS / m_maxbcachebuf - 1); + EXPECT_EQ(arg.runb, 0); + + /* In the middle */ + lbn = filesize / m_maxbcachebuf / 2; + arg.bn = lbn; + arg.runp = -1; + arg.runb = -1; + ASSERT_EQ(0, ioctl(fd, FIOBMAP2, &arg)) << strerror(errno); + EXPECT_EQ(arg.bn, lbn * m_maxbcachebuf / DEV_BSIZE); + EXPECT_EQ(arg.runp, MAXPHYS / m_maxbcachebuf - 1); + EXPECT_EQ(arg.runb, MAXPHYS / m_maxbcachebuf - 1); + + /* Last block */ + lbn = filesize / m_maxbcachebuf - 1; + arg.bn = lbn; + arg.runp = -1; + arg.runb = -1; + ASSERT_EQ(0, ioctl(fd, FIOBMAP2, &arg)) << strerror(errno); + EXPECT_EQ(arg.bn, lbn * m_maxbcachebuf / DEV_BSIZE); + EXPECT_EQ(arg.runp, 0); + EXPECT_EQ(arg.runb, MAXPHYS / m_maxbcachebuf - 1); +} diff --git a/tests/sys/fs/fusefs/mockfs.cc b/tests/sys/fs/fusefs/mockfs.cc index 893ff40607f5..7a9e8b66a5c4 100644 --- a/tests/sys/fs/fusefs/mockfs.cc +++ b/tests/sys/fs/fusefs/mockfs.cc @@ -168,6 +168,10 @@ void MockFS::debug_request(const mockfs_buf_in &in) case FUSE_ACCESS: printf(" mask=%#x", in.body.access.mask); break; + case FUSE_BMAP: + printf(" block=%#lx blocksize=%#x", in.body.bmap.block, + in.body.bmap.blocksize); + break; case FUSE_CREATE: if (m_kernel_minor_version >= 12) name = (const char*)in.body.bytes + diff --git a/tests/sys/fs/fusefs/mockfs.hh b/tests/sys/fs/fusefs/mockfs.hh index e37cd750ebde..162df6281932 100644 --- a/tests/sys/fs/fusefs/mockfs.hh +++ b/tests/sys/fs/fusefs/mockfs.hh @@ -124,6 +124,7 @@ struct fuse_create_out_7_8 { union fuse_payloads_in { fuse_access_in access; + fuse_bmap_in bmap; /* value is from fuse_kern_chan.c in fusefs-libs */ uint8_t bytes[0x21000 - sizeof(struct fuse_in_header)]; fuse_create_in create; @@ -164,6 +165,7 @@ struct mockfs_buf_in { union fuse_payloads_out { fuse_attr_out attr; fuse_attr_out_7_8 attr_7_8; + fuse_bmap_out bmap; fuse_create_out create; fuse_create_out_7_8 create_7_8; /* diff --git a/tests/sys/fs/fusefs/read.cc b/tests/sys/fs/fusefs/read.cc index 6146db93f5d2..a0b531ccba74 100644 --- a/tests/sys/fs/fusefs/read.cc +++ b/tests/sys/fs/fusefs/read.cc @@ -29,7 +29,7 @@ */ extern "C" { -#include +#include #include #include #include @@ -749,16 +749,15 @@ TEST_F(ReadCacheable, DISABLED_sendfile_eio) /* Deliberately leak fd. close(2) will be tested in release.cc */ } -/* Large reads should be clustered, even across cache block boundaries */ -/* - * Disabled because clustered reads requires VOP_BMAP, which fusefs does not - * yet support +/* + * Sequential reads should use readahead. And if allowed, large reads should + * be clustered. */ -TEST_P(ReadAhead, DISABLED_cluster) { +TEST_P(ReadAhead, readahead) { const char FULLPATH[] = "mountpoint/some_file.txt"; const char RELPATH[] = "some_file.txt"; uint64_t ino = 42; - int fd, maxcontig; + int fd, maxcontig, clustersize; ssize_t bufsize = 4 * m_maxbcachebuf; ssize_t filesize = bufsize; uint64_t len; @@ -774,8 +773,9 @@ TEST_P(ReadAhead, DISABLED_cluster) { expect_open(ino, 0, 1); maxcontig = m_noclusterr ? m_maxbcachebuf : m_maxbcachebuf + (int)get<1>(GetParam()); - for (offs = 0; offs < bufsize; offs += maxcontig) { - len = std::min((size_t)maxcontig, (size_t)(filesize - offs)); + clustersize = MIN(maxcontig, MAXPHYS); + for (offs = 0; offs < bufsize; offs += clustersize) { + len = std::min((size_t)clustersize, (size_t)(filesize - offs)); expect_read(ino, offs, len, len, contents + offs); } @@ -791,47 +791,11 @@ TEST_P(ReadAhead, DISABLED_cluster) { /* Deliberately leak fd. close(2) will be tested in release.cc */ } -/* fuse(4) should honor the filesystem's requested m_readahead parameter */ -TEST_P(ReadAhead, readahead) { - const char FULLPATH[] = "mountpoint/some_file.txt"; - const char RELPATH[] = "some_file.txt"; - uint64_t ino = 42; - int fd, i; - ssize_t bufsize = m_maxbcachebuf; - ssize_t filesize = m_maxbcachebuf * 6; - char *rbuf, *contents; - - contents = (char*)malloc(filesize); - ASSERT_NE(NULL, contents); - memset(contents, 'X', filesize); - rbuf = (char*)calloc(1, bufsize); - - expect_lookup(RELPATH, ino, filesize); - expect_open(ino, 0, 1); - /* fuse(4) should only read ahead the allowed amount */ - expect_read(ino, 0, m_maxbcachebuf, m_maxbcachebuf, contents); - for (i = 0; i < (int)get<1>(GetParam()) / m_maxbcachebuf; i++) { - off_t offs = (i + 1) * m_maxbcachebuf; - expect_read(ino, offs, m_maxbcachebuf, m_maxbcachebuf, - contents + offs); - } - - fd = open(FULLPATH, O_RDONLY); - ASSERT_LE(0, fd) << strerror(errno); - - /* Set the internal readahead counter to a "large" value */ - ASSERT_EQ(0, fcntl(fd, F_READAHEAD, 1'000'000'000)) << strerror(errno); - - ASSERT_EQ(bufsize, read(fd, rbuf, bufsize)) << strerror(errno); - ASSERT_EQ(0, memcmp(rbuf, contents, bufsize)); - - /* Deliberately leak fd. close(2) will be tested in release.cc */ -} - INSTANTIATE_TEST_CASE_P(RA, ReadAhead, Values(tuple(false, 0u), tuple(false, 0x10000), tuple(false, 0x20000), tuple(false, 0x30000), tuple(true, 0u), - tuple(true, 0x10000))); + tuple(true, 0x10000), + tuple(true, 0x20000))); diff --git a/tests/sys/fs/fusefs/utils.cc b/tests/sys/fs/fusefs/utils.cc index 1b1fdb013bc6..cd8b8be40259 100644 --- a/tests/sys/fs/fusefs/utils.cc +++ b/tests/sys/fs/fusefs/utils.cc @@ -129,6 +129,20 @@ void FuseTest::SetUp() { _) ).Times(AnyNumber()) .WillRepeatedly(Invoke(ReturnErrno(ENOSYS))); + /* + * FUSE_BMAP is called for most test cases that read data. Set + * a default expectation and return ENOSYS. + * + * Individual test cases can override this expectation since + * googlemock evaluates expectations in LIFO order. + */ + EXPECT_CALL(*m_mock, process( + ResultOf([=](auto in) { + return (in.header.opcode == FUSE_BMAP); + }, Eq(true)), + _) + ).Times(AnyNumber()) + .WillRepeatedly(Invoke(ReturnErrno(ENOSYS))); } catch (std::system_error err) { FAIL() << err.what(); }