zfs_vnops: make zfs_get_data OS-independent
Move zfs_get_data() in to platform-independent code. The only platform-specific aspect of it is the way we release an inode (Linux) / vnode_t (FreeBSD). I am not aware of a platform that could be supported by ZFS that couldn't implement zfs_rele_async itself. It's sibling zvol_get_data already is platform-independent. Reviewed-by: Ryan Moeller <ryan@iXsystems.com> Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov> Signed-off-by: Christian Schwarz <me@cschwarz.com> Closes #10979
This commit is contained in:
parent
09eb36ce3d
commit
ab8c935ea6
@ -173,7 +173,6 @@ extern void zfs_tstamp_update_setup_ext(struct znode *,
|
||||
uint_t, uint64_t [2], uint64_t [2], boolean_t have_tx);
|
||||
extern void zfs_znode_free(struct znode *);
|
||||
|
||||
extern zil_get_data_t zfs_get_data;
|
||||
extern zil_replay_func_t *zfs_replay_vector[TX_MAX_TYPE];
|
||||
extern int zfsfstype;
|
||||
|
||||
|
@ -163,7 +163,6 @@ extern caddr_t zfs_map_page(page_t *, enum seg_rw);
|
||||
extern void zfs_unmap_page(page_t *, caddr_t);
|
||||
#endif /* HAVE_UIO_RW */
|
||||
|
||||
extern zil_get_data_t zfs_get_data;
|
||||
extern zil_replay_func_t *zfs_replay_vector[TX_MAX_TYPE];
|
||||
extern int zfsfstype;
|
||||
|
||||
|
@ -39,4 +39,17 @@ extern int mappedread(znode_t *, int, uio_t *);
|
||||
extern int mappedread_sf(znode_t *, int, uio_t *);
|
||||
extern void update_pages(znode_t *, int64_t, int, objset_t *, uint64_t);
|
||||
|
||||
/*
|
||||
* Platform code that asynchronously drops zp's inode / vnode_t.
|
||||
*
|
||||
* Asynchronous dropping ensures that the caller will never drop the
|
||||
* last reference on an inode / vnode_t in the current context.
|
||||
* Doing so while holding open a tx could result in a deadlock if
|
||||
* the platform calls into filesystem again in the implementation
|
||||
* of inode / vnode_t dropping (e.g. call from iput_final()).
|
||||
*/
|
||||
extern void zfs_zrele_async(znode_t *zp);
|
||||
|
||||
extern zil_get_data_t zfs_get_data;
|
||||
|
||||
#endif
|
||||
|
@ -42,6 +42,7 @@
|
||||
#include <sys/mount.h>
|
||||
#include <sys/cmn_err.h>
|
||||
#include <sys/zfs_znode.h>
|
||||
#include <sys/zfs_vnops.h>
|
||||
#include <sys/zfs_dir.h>
|
||||
#include <sys/zil.h>
|
||||
#include <sys/fs/zfs.h>
|
||||
|
@ -29,6 +29,7 @@
|
||||
/* Portions Copyright 2007 Jeremy Teo */
|
||||
/* Portions Copyright 2010 Robert Milkowski */
|
||||
|
||||
|
||||
#include <sys/types.h>
|
||||
#include <sys/param.h>
|
||||
#include <sys/time.h>
|
||||
@ -669,163 +670,13 @@ zfs_write_simple(znode_t *zp, const void *data, size_t len,
|
||||
return (error);
|
||||
}
|
||||
|
||||
static void
|
||||
zfs_get_done(zgd_t *zgd, int error)
|
||||
void
|
||||
zfs_zrele_async(znode_t *zp)
|
||||
{
|
||||
znode_t *zp = zgd->zgd_private;
|
||||
objset_t *os = zp->z_zfsvfs->z_os;
|
||||
vnode_t *vp = ZTOV(zp);
|
||||
objset_t *os = ITOZSB(vp)->z_os;
|
||||
|
||||
if (zgd->zgd_db)
|
||||
dmu_buf_rele(zgd->zgd_db, zgd);
|
||||
|
||||
zfs_rangelock_exit(zgd->zgd_lr);
|
||||
|
||||
/*
|
||||
* Release the vnode asynchronously as we currently have the
|
||||
* txg stopped from syncing.
|
||||
*/
|
||||
VN_RELE_ASYNC(ZTOV(zp), dsl_pool_zrele_taskq(dmu_objset_pool(os)));
|
||||
|
||||
kmem_free(zgd, sizeof (zgd_t));
|
||||
}
|
||||
|
||||
#ifdef ZFS_DEBUG
|
||||
static int zil_fault_io = 0;
|
||||
#endif
|
||||
|
||||
/*
|
||||
* Get data to generate a TX_WRITE intent log record.
|
||||
*/
|
||||
int
|
||||
zfs_get_data(void *arg, lr_write_t *lr, char *buf, struct lwb *lwb, zio_t *zio)
|
||||
{
|
||||
zfsvfs_t *zfsvfs = arg;
|
||||
objset_t *os = zfsvfs->z_os;
|
||||
znode_t *zp;
|
||||
uint64_t object = lr->lr_foid;
|
||||
uint64_t offset = lr->lr_offset;
|
||||
uint64_t size = lr->lr_length;
|
||||
dmu_buf_t *db;
|
||||
zgd_t *zgd;
|
||||
int error = 0;
|
||||
|
||||
ASSERT3P(lwb, !=, NULL);
|
||||
ASSERT3P(zio, !=, NULL);
|
||||
ASSERT3U(size, !=, 0);
|
||||
|
||||
/*
|
||||
* Nothing to do if the file has been removed
|
||||
*/
|
||||
if (zfs_zget(zfsvfs, object, &zp) != 0)
|
||||
return (SET_ERROR(ENOENT));
|
||||
if (zp->z_unlinked) {
|
||||
/*
|
||||
* Release the vnode asynchronously as we currently have the
|
||||
* txg stopped from syncing.
|
||||
*/
|
||||
VN_RELE_ASYNC(ZTOV(zp),
|
||||
dsl_pool_zrele_taskq(dmu_objset_pool(os)));
|
||||
return (SET_ERROR(ENOENT));
|
||||
}
|
||||
|
||||
zgd = (zgd_t *)kmem_zalloc(sizeof (zgd_t), KM_SLEEP);
|
||||
zgd->zgd_lwb = lwb;
|
||||
zgd->zgd_private = zp;
|
||||
|
||||
/*
|
||||
* Write records come in two flavors: immediate and indirect.
|
||||
* For small writes it's cheaper to store the data with the
|
||||
* log record (immediate); for large writes it's cheaper to
|
||||
* sync the data and get a pointer to it (indirect) so that
|
||||
* we don't have to write the data twice.
|
||||
*/
|
||||
if (buf != NULL) { /* immediate write */
|
||||
zgd->zgd_lr = zfs_rangelock_enter(&zp->z_rangelock, offset,
|
||||
size, RL_READER);
|
||||
/* test for truncation needs to be done while range locked */
|
||||
if (offset >= zp->z_size) {
|
||||
error = SET_ERROR(ENOENT);
|
||||
} else {
|
||||
error = dmu_read(os, object, offset, size, buf,
|
||||
DMU_READ_NO_PREFETCH);
|
||||
}
|
||||
ASSERT(error == 0 || error == ENOENT);
|
||||
} else { /* indirect write */
|
||||
/*
|
||||
* Have to lock the whole block to ensure when it's
|
||||
* written out and its checksum is being calculated
|
||||
* that no one can change the data. We need to re-check
|
||||
* blocksize after we get the lock in case it's changed!
|
||||
*/
|
||||
for (;;) {
|
||||
uint64_t blkoff;
|
||||
size = zp->z_blksz;
|
||||
blkoff = ISP2(size) ? P2PHASE(offset, size) : offset;
|
||||
offset -= blkoff;
|
||||
zgd->zgd_lr = zfs_rangelock_enter(&zp->z_rangelock,
|
||||
offset, size, RL_READER);
|
||||
if (zp->z_blksz == size)
|
||||
break;
|
||||
offset += blkoff;
|
||||
zfs_rangelock_exit(zgd->zgd_lr);
|
||||
}
|
||||
/* test for truncation needs to be done while range locked */
|
||||
if (lr->lr_offset >= zp->z_size)
|
||||
error = SET_ERROR(ENOENT);
|
||||
#ifdef ZFS_DEBUG
|
||||
if (zil_fault_io) {
|
||||
error = SET_ERROR(EIO);
|
||||
zil_fault_io = 0;
|
||||
}
|
||||
#endif
|
||||
if (error == 0)
|
||||
error = dmu_buf_hold(os, object, offset, zgd, &db,
|
||||
DMU_READ_NO_PREFETCH);
|
||||
|
||||
if (error == 0) {
|
||||
blkptr_t *bp = &lr->lr_blkptr;
|
||||
|
||||
zgd->zgd_db = db;
|
||||
zgd->zgd_bp = bp;
|
||||
|
||||
ASSERT(db->db_offset == offset);
|
||||
ASSERT(db->db_size == size);
|
||||
|
||||
error = dmu_sync(zio, lr->lr_common.lrc_txg,
|
||||
zfs_get_done, zgd);
|
||||
ASSERT(error || lr->lr_length <= size);
|
||||
|
||||
/*
|
||||
* On success, we need to wait for the write I/O
|
||||
* initiated by dmu_sync() to complete before we can
|
||||
* release this dbuf. We will finish everything up
|
||||
* in the zfs_get_done() callback.
|
||||
*/
|
||||
if (error == 0)
|
||||
return (0);
|
||||
|
||||
if (error == EALREADY) {
|
||||
lr->lr_common.lrc_txtype = TX_WRITE2;
|
||||
/*
|
||||
* TX_WRITE2 relies on the data previously
|
||||
* written by the TX_WRITE that caused
|
||||
* EALREADY. We zero out the BP because
|
||||
* it is the old, currently-on-disk BP,
|
||||
* so there's no need to zio_flush() its
|
||||
* vdevs (flushing would needlesly hurt
|
||||
* performance, and doesn't work on
|
||||
* indirect vdevs).
|
||||
*/
|
||||
zgd->zgd_bp = NULL;
|
||||
BP_ZERO(bp);
|
||||
error = 0;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
zfs_get_done(zgd, error);
|
||||
|
||||
return (error);
|
||||
VN_RELE_ASYNC(vp, dsl_pool_zrele_taskq(dmu_objset_pool(os)));
|
||||
}
|
||||
|
||||
static int
|
||||
|
@ -381,12 +381,6 @@ zfs_write_simple(znode_t *zp, const void *data, size_t len,
|
||||
return (error);
|
||||
}
|
||||
|
||||
/*
|
||||
* Drop a reference on the passed inode asynchronously. This ensures
|
||||
* that the caller will never drop the last reference on an inode in
|
||||
* the current context. Doing so while holding open a tx could result
|
||||
* in a deadlock if iput_final() re-enters the filesystem code.
|
||||
*/
|
||||
void
|
||||
zfs_zrele_async(znode_t *zp)
|
||||
{
|
||||
@ -403,159 +397,6 @@ zfs_zrele_async(znode_t *zp)
|
||||
zrele(zp);
|
||||
}
|
||||
|
||||
/* ARGSUSED */
|
||||
static void
|
||||
zfs_get_done(zgd_t *zgd, int error)
|
||||
{
|
||||
znode_t *zp = zgd->zgd_private;
|
||||
|
||||
if (zgd->zgd_db)
|
||||
dmu_buf_rele(zgd->zgd_db, zgd);
|
||||
|
||||
zfs_rangelock_exit(zgd->zgd_lr);
|
||||
|
||||
/*
|
||||
* Release the vnode asynchronously as we currently have the
|
||||
* txg stopped from syncing.
|
||||
*/
|
||||
zfs_zrele_async(zp);
|
||||
|
||||
kmem_free(zgd, sizeof (zgd_t));
|
||||
}
|
||||
|
||||
#ifdef ZFS_DEBUG
|
||||
static int zil_fault_io = 0;
|
||||
#endif
|
||||
|
||||
/*
|
||||
* Get data to generate a TX_WRITE intent log record.
|
||||
*/
|
||||
int
|
||||
zfs_get_data(void *arg, lr_write_t *lr, char *buf, struct lwb *lwb, zio_t *zio)
|
||||
{
|
||||
zfsvfs_t *zfsvfs = arg;
|
||||
objset_t *os = zfsvfs->z_os;
|
||||
znode_t *zp;
|
||||
uint64_t object = lr->lr_foid;
|
||||
uint64_t offset = lr->lr_offset;
|
||||
uint64_t size = lr->lr_length;
|
||||
dmu_buf_t *db;
|
||||
zgd_t *zgd;
|
||||
int error = 0;
|
||||
|
||||
ASSERT3P(lwb, !=, NULL);
|
||||
ASSERT3P(zio, !=, NULL);
|
||||
ASSERT3U(size, !=, 0);
|
||||
|
||||
/*
|
||||
* Nothing to do if the file has been removed
|
||||
*/
|
||||
if (zfs_zget(zfsvfs, object, &zp) != 0)
|
||||
return (SET_ERROR(ENOENT));
|
||||
if (zp->z_unlinked) {
|
||||
/*
|
||||
* Release the vnode asynchronously as we currently have the
|
||||
* txg stopped from syncing.
|
||||
*/
|
||||
zfs_zrele_async(zp);
|
||||
return (SET_ERROR(ENOENT));
|
||||
}
|
||||
|
||||
zgd = kmem_zalloc(sizeof (zgd_t), KM_SLEEP);
|
||||
zgd->zgd_lwb = lwb;
|
||||
zgd->zgd_private = zp;
|
||||
|
||||
/*
|
||||
* Write records come in two flavors: immediate and indirect.
|
||||
* For small writes it's cheaper to store the data with the
|
||||
* log record (immediate); for large writes it's cheaper to
|
||||
* sync the data and get a pointer to it (indirect) so that
|
||||
* we don't have to write the data twice.
|
||||
*/
|
||||
if (buf != NULL) { /* immediate write */
|
||||
zgd->zgd_lr = zfs_rangelock_enter(&zp->z_rangelock,
|
||||
offset, size, RL_READER);
|
||||
/* test for truncation needs to be done while range locked */
|
||||
if (offset >= zp->z_size) {
|
||||
error = SET_ERROR(ENOENT);
|
||||
} else {
|
||||
error = dmu_read(os, object, offset, size, buf,
|
||||
DMU_READ_NO_PREFETCH);
|
||||
}
|
||||
ASSERT(error == 0 || error == ENOENT);
|
||||
} else { /* indirect write */
|
||||
/*
|
||||
* Have to lock the whole block to ensure when it's
|
||||
* written out and its checksum is being calculated
|
||||
* that no one can change the data. We need to re-check
|
||||
* blocksize after we get the lock in case it's changed!
|
||||
*/
|
||||
for (;;) {
|
||||
uint64_t blkoff;
|
||||
size = zp->z_blksz;
|
||||
blkoff = ISP2(size) ? P2PHASE(offset, size) : offset;
|
||||
offset -= blkoff;
|
||||
zgd->zgd_lr = zfs_rangelock_enter(&zp->z_rangelock,
|
||||
offset, size, RL_READER);
|
||||
if (zp->z_blksz == size)
|
||||
break;
|
||||
offset += blkoff;
|
||||
zfs_rangelock_exit(zgd->zgd_lr);
|
||||
}
|
||||
/* test for truncation needs to be done while range locked */
|
||||
if (lr->lr_offset >= zp->z_size)
|
||||
error = SET_ERROR(ENOENT);
|
||||
#ifdef ZFS_DEBUG
|
||||
if (zil_fault_io) {
|
||||
error = SET_ERROR(EIO);
|
||||
zil_fault_io = 0;
|
||||
}
|
||||
#endif
|
||||
if (error == 0)
|
||||
error = dmu_buf_hold(os, object, offset, zgd, &db,
|
||||
DMU_READ_NO_PREFETCH);
|
||||
|
||||
if (error == 0) {
|
||||
blkptr_t *bp = &lr->lr_blkptr;
|
||||
|
||||
zgd->zgd_db = db;
|
||||
zgd->zgd_bp = bp;
|
||||
|
||||
ASSERT(db->db_offset == offset);
|
||||
ASSERT(db->db_size == size);
|
||||
|
||||
error = dmu_sync(zio, lr->lr_common.lrc_txg,
|
||||
zfs_get_done, zgd);
|
||||
ASSERT(error || lr->lr_length <= size);
|
||||
|
||||
/*
|
||||
* On success, we need to wait for the write I/O
|
||||
* initiated by dmu_sync() to complete before we can
|
||||
* release this dbuf. We will finish everything up
|
||||
* in the zfs_get_done() callback.
|
||||
*/
|
||||
if (error == 0)
|
||||
return (0);
|
||||
|
||||
if (error == EALREADY) {
|
||||
lr->lr_common.lrc_txtype = TX_WRITE2;
|
||||
/*
|
||||
* TX_WRITE2 relies on the data previously
|
||||
* written by the TX_WRITE that caused
|
||||
* EALREADY. We zero out the BP because
|
||||
* it is the old, currently-on-disk BP.
|
||||
*/
|
||||
zgd->zgd_bp = NULL;
|
||||
BP_ZERO(bp);
|
||||
error = 0;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
zfs_get_done(zgd, error);
|
||||
|
||||
return (error);
|
||||
}
|
||||
|
||||
/*
|
||||
* Lookup an entry in a directory, or an extended attribute directory.
|
||||
|
@ -52,6 +52,8 @@
|
||||
#include <sys/policy.h>
|
||||
#include <sys/zfs_vnops.h>
|
||||
#include <sys/zfs_quota.h>
|
||||
#include <sys/zfs_vfsops.h>
|
||||
#include <sys/zfs_znode.h>
|
||||
|
||||
|
||||
static ulong_t zfs_fsync_sync_cnt = 4;
|
||||
@ -717,6 +719,163 @@ zfs_setsecattr(znode_t *zp, vsecattr_t *vsecp, int flag, cred_t *cr)
|
||||
return (error);
|
||||
}
|
||||
|
||||
#ifdef ZFS_DEBUG
|
||||
static int zil_fault_io = 0;
|
||||
#endif
|
||||
|
||||
static void zfs_get_done(zgd_t *zgd, int error);
|
||||
|
||||
/*
|
||||
* Get data to generate a TX_WRITE intent log record.
|
||||
*/
|
||||
int
|
||||
zfs_get_data(void *arg, lr_write_t *lr, char *buf, struct lwb *lwb, zio_t *zio)
|
||||
{
|
||||
zfsvfs_t *zfsvfs = arg;
|
||||
objset_t *os = zfsvfs->z_os;
|
||||
znode_t *zp;
|
||||
uint64_t object = lr->lr_foid;
|
||||
uint64_t offset = lr->lr_offset;
|
||||
uint64_t size = lr->lr_length;
|
||||
dmu_buf_t *db;
|
||||
zgd_t *zgd;
|
||||
int error = 0;
|
||||
|
||||
ASSERT3P(lwb, !=, NULL);
|
||||
ASSERT3P(zio, !=, NULL);
|
||||
ASSERT3U(size, !=, 0);
|
||||
|
||||
/*
|
||||
* Nothing to do if the file has been removed
|
||||
*/
|
||||
if (zfs_zget(zfsvfs, object, &zp) != 0)
|
||||
return (SET_ERROR(ENOENT));
|
||||
if (zp->z_unlinked) {
|
||||
/*
|
||||
* Release the vnode asynchronously as we currently have the
|
||||
* txg stopped from syncing.
|
||||
*/
|
||||
zfs_zrele_async(zp);
|
||||
return (SET_ERROR(ENOENT));
|
||||
}
|
||||
|
||||
zgd = (zgd_t *)kmem_zalloc(sizeof (zgd_t), KM_SLEEP);
|
||||
zgd->zgd_lwb = lwb;
|
||||
zgd->zgd_private = zp;
|
||||
|
||||
/*
|
||||
* Write records come in two flavors: immediate and indirect.
|
||||
* For small writes it's cheaper to store the data with the
|
||||
* log record (immediate); for large writes it's cheaper to
|
||||
* sync the data and get a pointer to it (indirect) so that
|
||||
* we don't have to write the data twice.
|
||||
*/
|
||||
if (buf != NULL) { /* immediate write */
|
||||
zgd->zgd_lr = zfs_rangelock_enter(&zp->z_rangelock,
|
||||
offset, size, RL_READER);
|
||||
/* test for truncation needs to be done while range locked */
|
||||
if (offset >= zp->z_size) {
|
||||
error = SET_ERROR(ENOENT);
|
||||
} else {
|
||||
error = dmu_read(os, object, offset, size, buf,
|
||||
DMU_READ_NO_PREFETCH);
|
||||
}
|
||||
ASSERT(error == 0 || error == ENOENT);
|
||||
} else { /* indirect write */
|
||||
/*
|
||||
* Have to lock the whole block to ensure when it's
|
||||
* written out and its checksum is being calculated
|
||||
* that no one can change the data. We need to re-check
|
||||
* blocksize after we get the lock in case it's changed!
|
||||
*/
|
||||
for (;;) {
|
||||
uint64_t blkoff;
|
||||
size = zp->z_blksz;
|
||||
blkoff = ISP2(size) ? P2PHASE(offset, size) : offset;
|
||||
offset -= blkoff;
|
||||
zgd->zgd_lr = zfs_rangelock_enter(&zp->z_rangelock,
|
||||
offset, size, RL_READER);
|
||||
if (zp->z_blksz == size)
|
||||
break;
|
||||
offset += blkoff;
|
||||
zfs_rangelock_exit(zgd->zgd_lr);
|
||||
}
|
||||
/* test for truncation needs to be done while range locked */
|
||||
if (lr->lr_offset >= zp->z_size)
|
||||
error = SET_ERROR(ENOENT);
|
||||
#ifdef ZFS_DEBUG
|
||||
if (zil_fault_io) {
|
||||
error = SET_ERROR(EIO);
|
||||
zil_fault_io = 0;
|
||||
}
|
||||
#endif
|
||||
if (error == 0)
|
||||
error = dmu_buf_hold(os, object, offset, zgd, &db,
|
||||
DMU_READ_NO_PREFETCH);
|
||||
|
||||
if (error == 0) {
|
||||
blkptr_t *bp = &lr->lr_blkptr;
|
||||
|
||||
zgd->zgd_db = db;
|
||||
zgd->zgd_bp = bp;
|
||||
|
||||
ASSERT(db->db_offset == offset);
|
||||
ASSERT(db->db_size == size);
|
||||
|
||||
error = dmu_sync(zio, lr->lr_common.lrc_txg,
|
||||
zfs_get_done, zgd);
|
||||
ASSERT(error || lr->lr_length <= size);
|
||||
|
||||
/*
|
||||
* On success, we need to wait for the write I/O
|
||||
* initiated by dmu_sync() to complete before we can
|
||||
* release this dbuf. We will finish everything up
|
||||
* in the zfs_get_done() callback.
|
||||
*/
|
||||
if (error == 0)
|
||||
return (0);
|
||||
|
||||
if (error == EALREADY) {
|
||||
lr->lr_common.lrc_txtype = TX_WRITE2;
|
||||
/*
|
||||
* TX_WRITE2 relies on the data previously
|
||||
* written by the TX_WRITE that caused
|
||||
* EALREADY. We zero out the BP because
|
||||
* it is the old, currently-on-disk BP.
|
||||
*/
|
||||
zgd->zgd_bp = NULL;
|
||||
BP_ZERO(bp);
|
||||
error = 0;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
zfs_get_done(zgd, error);
|
||||
|
||||
return (error);
|
||||
}
|
||||
|
||||
|
||||
/* ARGSUSED */
|
||||
static void
|
||||
zfs_get_done(zgd_t *zgd, int error)
|
||||
{
|
||||
znode_t *zp = zgd->zgd_private;
|
||||
|
||||
if (zgd->zgd_db)
|
||||
dmu_buf_rele(zgd->zgd_db, zgd);
|
||||
|
||||
zfs_rangelock_exit(zgd->zgd_lr);
|
||||
|
||||
/*
|
||||
* Release the vnode asynchronously as we currently have the
|
||||
* txg stopped from syncing.
|
||||
*/
|
||||
zfs_zrele_async(zp);
|
||||
|
||||
kmem_free(zgd, sizeof (zgd_t));
|
||||
}
|
||||
|
||||
EXPORT_SYMBOL(zfs_access);
|
||||
EXPORT_SYMBOL(zfs_fsync);
|
||||
EXPORT_SYMBOL(zfs_holey);
|
||||
|
Loading…
Reference in New Issue
Block a user