fusefs: implement VOP_BMAP

If the fuse daemon supports FUSE_BMAP, then use that for the block mapping.
Otherwise, use the same technique used by vop_stdbmap.  Report large values
for runp and runb in order to maximize read clustering and minimize upcalls,
even if we don't know the true layout.

The major result of this change is that sequential reads to FUSE files will
now usually happen 128KB at a time instead of 64KB.

Sponsored by:	The FreeBSD Foundation
This commit is contained in:
Alan Somers 2019-06-20 17:08:21 +00:00
parent e532a99901
commit a1c9f4ad0d
Notes: svn2git 2020-12-20 02:59:44 +00:00
svn path=/projects/fuse2/; revision=349238
11 changed files with 288 additions and 51 deletions

View File

@ -908,7 +908,8 @@ fuse_internal_init_callback(struct fuse_ticket *tick, struct uio *uio)
if (fuse_libabi_geq(data, 7, 5)) {
if (fticket_resp(tick)->len == sizeof(struct fuse_init_out)) {
data->max_readahead = fiio->max_readahead;
data->max_readahead_blocks = fiio->max_readahead /
maxbcachebuf;
data->max_write = fiio->max_write;
if (fiio->flags & FUSE_ASYNC_READ)
data->dataflags |= FSESS_ASYNC_READ;

View File

@ -321,10 +321,10 @@ fuse_read_biobackend(struct vnode *vp, struct uio *uio, int ioflag,
/* Try clustered read */
long totread = uio->uio_resid + on;
seqcount = MIN(seqcount,
data->max_readahead / biosize + 1);
data->max_readahead_blocks + 1);
err = cluster_read(vp, filesize, lbn, bcount, NOCRED,
totread, seqcount, 0, &bp);
} else if (seqcount > 1 && data->max_readahead >= nextsize) {
} else if (seqcount > 1 && data->max_readahead_blocks >= 1) {
/* Try non-clustered readahead */
err = breadn(vp, lbn, bcount, &nextlbn, &nextsize, 1,
NOCRED, &bp);

View File

@ -711,6 +711,10 @@ fuse_body_audit(struct fuse_ticket *ftick, size_t blen)
opcode = fticket_opcode(ftick);
switch (opcode) {
case FUSE_BMAP:
err = (blen == sizeof(struct fuse_bmap_out)) ? 0 : EINVAL;
break;
case FUSE_LINK:
case FUSE_LOOKUP:
case FUSE_MKDIR:

View File

@ -197,7 +197,7 @@ struct fuse_data {
uint32_t fuse_libabi_major;
uint32_t fuse_libabi_minor;
uint32_t max_readahead;
uint32_t max_readahead_blocks;
uint32_t max_write;
uint32_t max_read;
uint32_t subtype;

View File

@ -120,6 +120,7 @@ SDT_PROBE_DEFINE2(fusefs, , vnops, trace, "int", "char*");
/* vnode ops */
static vop_access_t fuse_vnop_access;
static vop_advlock_t fuse_vnop_advlock;
static vop_bmap_t fuse_vnop_bmap;
static vop_close_t fuse_fifo_close;
static vop_close_t fuse_vnop_close;
static vop_create_t fuse_vnop_create;
@ -174,6 +175,7 @@ struct vop_vector fuse_vnops = {
.vop_default = &default_vnodeops,
.vop_access = fuse_vnop_access,
.vop_advlock = fuse_vnop_advlock,
.vop_bmap = fuse_vnop_bmap,
.vop_close = fuse_vnop_close,
.vop_create = fuse_vnop_create,
.vop_deleteextattr = fuse_vnop_deleteextattr,
@ -466,6 +468,92 @@ fuse_vnop_advlock(struct vop_advlock_args *ap)
return err;
}
/* {
struct vnode *a_vp;
daddr_t a_bn;
struct bufobj **a_bop;
daddr_t *a_bnp;
int *a_runp;
int *a_runb;
} */
static int
fuse_vnop_bmap(struct vop_bmap_args *ap)
{
struct vnode *vp = ap->a_vp;
struct bufobj **bo = ap->a_bop;
struct thread *td = curthread;
struct mount *mp;
struct fuse_dispatcher fdi;
struct fuse_bmap_in *fbi;
struct fuse_bmap_out *fbo;
struct fuse_data *data;
uint64_t biosize;
off_t filesize;
daddr_t lbn = ap->a_bn;
daddr_t *pbn = ap->a_bnp;
int *runp = ap->a_runp;
int *runb = ap->a_runb;
int error = 0;
int maxrun;
if (fuse_isdeadfs(vp)) {
return ENXIO;
}
mp = vnode_mount(vp);
data = fuse_get_mpdata(mp);
biosize = fuse_iosize(vp);
maxrun = MIN(vp->v_mount->mnt_iosize_max / biosize - 1,
data->max_readahead_blocks);
if (bo != NULL)
*bo = &vp->v_bufobj;
/*
* The FUSE_BMAP operation does not include the runp and runb
* variables, so we must guess. Report nonzero contiguous runs so
* cluster_read will combine adjacent reads. It's worthwhile to reduce
* upcalls even if we don't know the true physical layout of the file.
*
* FUSE file systems may opt out of read clustering in two ways:
* * mounting with -onoclusterr
* * Setting max_readahead <= maxbcachebuf during FUSE_INIT
*/
if (runb != NULL)
*runb = MIN(lbn, maxrun);
if (runp != NULL) {
error = fuse_vnode_size(vp, &filesize, td->td_ucred, td);
if (error == 0)
*runp = MIN(MAX(0, filesize / biosize - lbn - 1),
maxrun);
else
*runp = 0;
}
if (fsess_isimpl(mp, FUSE_BMAP)) {
fdisp_init(&fdi, sizeof(*fbi));
fdisp_make_vp(&fdi, FUSE_BMAP, vp, td, td->td_ucred);
fbi = fdi.indata;
fbi->block = lbn;
fbi->blocksize = biosize;
error = fdisp_wait_answ(&fdi);
if (error == ENOSYS) {
fsess_set_notimpl(mp, FUSE_BMAP);
error = 0;
} else {
fbo = fdi.answ;
if (error == 0 && pbn != NULL)
*pbn = fbo->block;
return error;
}
}
/* If the daemon doesn't support BMAP, make up a sensible default */
if (pbn != NULL)
*pbn = lbn * btodb(biosize);
return (error);
}
/*
struct vop_close_args {
struct vnode *a_vp;

View File

@ -9,6 +9,7 @@ TESTSDIR= ${TESTSBASE}/sys/fs/fusefs
# out, so we get more granular reporting.
GTESTS+= access
GTESTS+= allow_other
GTESTS+= bmap
GTESTS+= create
GTESTS+= default_permissions
GTESTS+= default_permissions_privileged

159
tests/sys/fs/fusefs/bmap.cc Normal file
View File

@ -0,0 +1,159 @@
/*-
* SPDX-License-Identifier: BSD-2-Clause-FreeBSD
*
* Copyright (c) 2019 The FreeBSD Foundation
*
* This software was developed by BFF Storage Systems, LLC under sponsorship
* from the FreeBSD Foundation.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*/
extern "C" {
#include <sys/param.h>
#include <sys/ioctl.h>
#include <sys/filio.h>
#include <fcntl.h>
}
#include "mockfs.hh"
#include "utils.hh"
using namespace testing;
const static char FULLPATH[] = "mountpoint/foo";
const static char RELPATH[] = "foo";
class Bmap: public FuseTest {
public:
virtual void SetUp() {
m_maxreadahead = UINT32_MAX;
FuseTest::SetUp();
}
void expect_bmap(uint64_t ino, uint64_t lbn, uint32_t blocksize, uint64_t pbn)
{
EXPECT_CALL(*m_mock, process(
ResultOf([=](auto in) {
return (in.header.opcode == FUSE_BMAP &&
in.header.nodeid == ino &&
in.body.bmap.block == lbn &&
in.body.bmap.blocksize == blocksize);
}, Eq(true)),
_)
).WillOnce(Invoke(ReturnImmediate([=](auto i __unused, auto& out) {
SET_OUT_HEADER_LEN(out, bmap);
out.body.bmap.block = pbn;
})));
}
void expect_lookup(const char *relpath, uint64_t ino, off_t size)
{
FuseTest::expect_lookup(relpath, ino, S_IFREG | 0644, size, 1,
UINT64_MAX);
}
};
/*
* Test FUSE_BMAP
* XXX The FUSE protocol does not include the runp and runb variables, so those
* must be guessed in-kernel.
*/
TEST_F(Bmap, bmap)
{
struct fiobmap2_arg arg;
const off_t filesize = 1 << 20;
const ino_t ino = 42;
int64_t lbn = 10;
int64_t pbn = 12345;
int fd;
expect_lookup(RELPATH, 42, filesize);
expect_open(ino, 0, 1);
expect_bmap(ino, lbn, m_maxbcachebuf, pbn);
fd = open(FULLPATH, O_RDWR);
ASSERT_LE(0, fd) << strerror(errno);
arg.bn = lbn;
arg.runp = -1;
arg.runb = -1;
ASSERT_EQ(0, ioctl(fd, FIOBMAP2, &arg)) << strerror(errno);
EXPECT_EQ(arg.bn, pbn);
EXPECT_EQ(arg.runp, MAXPHYS / m_maxbcachebuf - 1);
EXPECT_EQ(arg.runb, MAXPHYS / m_maxbcachebuf - 1);
}
/*
* If the daemon does not implement VOP_BMAP, fusefs should return sensible
* defaults.
*/
TEST_F(Bmap, default_)
{
struct fiobmap2_arg arg;
const off_t filesize = 1 << 20;
const ino_t ino = 42;
int64_t lbn;
int fd;
expect_lookup(RELPATH, 42, filesize);
expect_open(ino, 0, 1);
EXPECT_CALL(*m_mock, process(
ResultOf([=](auto in) {
return (in.header.opcode == FUSE_BMAP);
}, Eq(true)),
_)
).WillOnce(Invoke(ReturnErrno(ENOSYS)));
fd = open(FULLPATH, O_RDWR);
ASSERT_LE(0, fd) << strerror(errno);
/* First block */
lbn = 0;
arg.bn = lbn;
arg.runp = -1;
arg.runb = -1;
ASSERT_EQ(0, ioctl(fd, FIOBMAP2, &arg)) << strerror(errno);
EXPECT_EQ(arg.bn, 0);
EXPECT_EQ(arg.runp, MAXPHYS / m_maxbcachebuf - 1);
EXPECT_EQ(arg.runb, 0);
/* In the middle */
lbn = filesize / m_maxbcachebuf / 2;
arg.bn = lbn;
arg.runp = -1;
arg.runb = -1;
ASSERT_EQ(0, ioctl(fd, FIOBMAP2, &arg)) << strerror(errno);
EXPECT_EQ(arg.bn, lbn * m_maxbcachebuf / DEV_BSIZE);
EXPECT_EQ(arg.runp, MAXPHYS / m_maxbcachebuf - 1);
EXPECT_EQ(arg.runb, MAXPHYS / m_maxbcachebuf - 1);
/* Last block */
lbn = filesize / m_maxbcachebuf - 1;
arg.bn = lbn;
arg.runp = -1;
arg.runb = -1;
ASSERT_EQ(0, ioctl(fd, FIOBMAP2, &arg)) << strerror(errno);
EXPECT_EQ(arg.bn, lbn * m_maxbcachebuf / DEV_BSIZE);
EXPECT_EQ(arg.runp, 0);
EXPECT_EQ(arg.runb, MAXPHYS / m_maxbcachebuf - 1);
}

View File

@ -168,6 +168,10 @@ void MockFS::debug_request(const mockfs_buf_in &in)
case FUSE_ACCESS:
printf(" mask=%#x", in.body.access.mask);
break;
case FUSE_BMAP:
printf(" block=%#lx blocksize=%#x", in.body.bmap.block,
in.body.bmap.blocksize);
break;
case FUSE_CREATE:
if (m_kernel_minor_version >= 12)
name = (const char*)in.body.bytes +

View File

@ -124,6 +124,7 @@ struct fuse_create_out_7_8 {
union fuse_payloads_in {
fuse_access_in access;
fuse_bmap_in bmap;
/* value is from fuse_kern_chan.c in fusefs-libs */
uint8_t bytes[0x21000 - sizeof(struct fuse_in_header)];
fuse_create_in create;
@ -164,6 +165,7 @@ struct mockfs_buf_in {
union fuse_payloads_out {
fuse_attr_out attr;
fuse_attr_out_7_8 attr_7_8;
fuse_bmap_out bmap;
fuse_create_out create;
fuse_create_out_7_8 create_7_8;
/*

View File

@ -29,7 +29,7 @@
*/
extern "C" {
#include <sys/types.h>
#include <sys/param.h>
#include <sys/mman.h>
#include <sys/socket.h>
#include <sys/sysctl.h>
@ -749,16 +749,15 @@ TEST_F(ReadCacheable, DISABLED_sendfile_eio)
/* Deliberately leak fd. close(2) will be tested in release.cc */
}
/* Large reads should be clustered, even across cache block boundaries */
/*
* Disabled because clustered reads requires VOP_BMAP, which fusefs does not
* yet support
/*
* Sequential reads should use readahead. And if allowed, large reads should
* be clustered.
*/
TEST_P(ReadAhead, DISABLED_cluster) {
TEST_P(ReadAhead, readahead) {
const char FULLPATH[] = "mountpoint/some_file.txt";
const char RELPATH[] = "some_file.txt";
uint64_t ino = 42;
int fd, maxcontig;
int fd, maxcontig, clustersize;
ssize_t bufsize = 4 * m_maxbcachebuf;
ssize_t filesize = bufsize;
uint64_t len;
@ -774,8 +773,9 @@ TEST_P(ReadAhead, DISABLED_cluster) {
expect_open(ino, 0, 1);
maxcontig = m_noclusterr ? m_maxbcachebuf :
m_maxbcachebuf + (int)get<1>(GetParam());
for (offs = 0; offs < bufsize; offs += maxcontig) {
len = std::min((size_t)maxcontig, (size_t)(filesize - offs));
clustersize = MIN(maxcontig, MAXPHYS);
for (offs = 0; offs < bufsize; offs += clustersize) {
len = std::min((size_t)clustersize, (size_t)(filesize - offs));
expect_read(ino, offs, len, len, contents + offs);
}
@ -791,47 +791,11 @@ TEST_P(ReadAhead, DISABLED_cluster) {
/* Deliberately leak fd. close(2) will be tested in release.cc */
}
/* fuse(4) should honor the filesystem's requested m_readahead parameter */
TEST_P(ReadAhead, readahead) {
const char FULLPATH[] = "mountpoint/some_file.txt";
const char RELPATH[] = "some_file.txt";
uint64_t ino = 42;
int fd, i;
ssize_t bufsize = m_maxbcachebuf;
ssize_t filesize = m_maxbcachebuf * 6;
char *rbuf, *contents;
contents = (char*)malloc(filesize);
ASSERT_NE(NULL, contents);
memset(contents, 'X', filesize);
rbuf = (char*)calloc(1, bufsize);
expect_lookup(RELPATH, ino, filesize);
expect_open(ino, 0, 1);
/* fuse(4) should only read ahead the allowed amount */
expect_read(ino, 0, m_maxbcachebuf, m_maxbcachebuf, contents);
for (i = 0; i < (int)get<1>(GetParam()) / m_maxbcachebuf; i++) {
off_t offs = (i + 1) * m_maxbcachebuf;
expect_read(ino, offs, m_maxbcachebuf, m_maxbcachebuf,
contents + offs);
}
fd = open(FULLPATH, O_RDONLY);
ASSERT_LE(0, fd) << strerror(errno);
/* Set the internal readahead counter to a "large" value */
ASSERT_EQ(0, fcntl(fd, F_READAHEAD, 1'000'000'000)) << strerror(errno);
ASSERT_EQ(bufsize, read(fd, rbuf, bufsize)) << strerror(errno);
ASSERT_EQ(0, memcmp(rbuf, contents, bufsize));
/* Deliberately leak fd. close(2) will be tested in release.cc */
}
INSTANTIATE_TEST_CASE_P(RA, ReadAhead,
Values(tuple<bool, int>(false, 0u),
tuple<bool, int>(false, 0x10000),
tuple<bool, int>(false, 0x20000),
tuple<bool, int>(false, 0x30000),
tuple<bool, int>(true, 0u),
tuple<bool, int>(true, 0x10000)));
tuple<bool, int>(true, 0x10000),
tuple<bool, int>(true, 0x20000)));

View File

@ -129,6 +129,20 @@ void FuseTest::SetUp() {
_)
).Times(AnyNumber())
.WillRepeatedly(Invoke(ReturnErrno(ENOSYS)));
/*
* FUSE_BMAP is called for most test cases that read data. Set
* a default expectation and return ENOSYS.
*
* Individual test cases can override this expectation since
* googlemock evaluates expectations in LIFO order.
*/
EXPECT_CALL(*m_mock, process(
ResultOf([=](auto in) {
return (in.header.opcode == FUSE_BMAP);
}, Eq(true)),
_)
).Times(AnyNumber())
.WillRepeatedly(Invoke(ReturnErrno(ENOSYS)));
} catch (std::system_error err) {
FAIL() << err.what();
}