Fix zil replay panic when TX_REMOVE followed by TX_CREATE
If TX_REMOVE is followed by TX_CREATE on the same object id, we need to make sure the object removal is completely finished before creation. The current implementation relies on dnode_hold_impl with DNODE_MUST_BE_ALLOCATED returning ENOENT. While this check seems to work fine before, in current version it does not guarantee the object removal is completed. We fix this by checking if DNODE_MUST_BE_FREE returns successful instead. Also add test and remove dead code in dnode_hold_impl. Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov> Reviewed-by: Tom Caputi <tcaputi@datto.com> Signed-off-by: Chunwei Chen <david.chen@nutanix.com> Closes #7151 Closes #8910 Closes #9123 Closes #9145
This commit is contained in:
parent
9c9dcd6e04
commit
035e96118b
@ -46,6 +46,7 @@ extern "C" {
|
||||
*/
|
||||
#define DNODE_MUST_BE_ALLOCATED 1
|
||||
#define DNODE_MUST_BE_FREE 2
|
||||
#define DNODE_DRY_RUN 4
|
||||
|
||||
/*
|
||||
* dnode_next_offset() flags.
|
||||
@ -415,6 +416,7 @@ int dnode_hold_impl(struct objset *dd, uint64_t object, int flag, int dn_slots,
|
||||
boolean_t dnode_add_ref(dnode_t *dn, void *ref);
|
||||
void dnode_rele(dnode_t *dn, void *ref);
|
||||
void dnode_rele_and_unlock(dnode_t *dn, void *tag, boolean_t evicting);
|
||||
int dnode_try_claim(objset_t *os, uint64_t object, int slots);
|
||||
void dnode_setdirty(dnode_t *dn, dmu_tx_t *tx);
|
||||
void dnode_sync(dnode_t *dn, dmu_tx_t *tx);
|
||||
void dnode_allocate(dnode_t *dn, dmu_object_type_t ot, int blocksize, int ibs,
|
||||
@ -531,11 +533,6 @@ typedef struct dnode_stats {
|
||||
* a range of dnode slots which would overflow the dnode_phys_t.
|
||||
*/
|
||||
kstat_named_t dnode_hold_free_overflow;
|
||||
/*
|
||||
* Number of times a dnode_hold(...) was attempted on a dnode
|
||||
* which had already been unlinked in an earlier txg.
|
||||
*/
|
||||
kstat_named_t dnode_hold_free_txg;
|
||||
/*
|
||||
* Number of times dnode_free_interior_slots() needed to retry
|
||||
* acquiring a slot zrl lock due to contention.
|
||||
|
@ -55,7 +55,6 @@ dnode_stats_t dnode_stats = {
|
||||
{ "dnode_hold_free_lock_retry", KSTAT_DATA_UINT64 },
|
||||
{ "dnode_hold_free_overflow", KSTAT_DATA_UINT64 },
|
||||
{ "dnode_hold_free_refcount", KSTAT_DATA_UINT64 },
|
||||
{ "dnode_hold_free_txg", KSTAT_DATA_UINT64 },
|
||||
{ "dnode_free_interior_lock_retry", KSTAT_DATA_UINT64 },
|
||||
{ "dnode_allocate", KSTAT_DATA_UINT64 },
|
||||
{ "dnode_reallocate", KSTAT_DATA_UINT64 },
|
||||
@ -1263,6 +1262,10 @@ dnode_buf_evict_async(void *dbu)
|
||||
* as an extra dnode slot by an large dnode, in which case it returns
|
||||
* ENOENT.
|
||||
*
|
||||
* If the DNODE_DRY_RUN flag is set, we don't actually hold the dnode, just
|
||||
* return whether the hold would succeed or not. tag and dnp should set to
|
||||
* NULL in this case.
|
||||
*
|
||||
* errors:
|
||||
* EINVAL - Invalid object number or flags.
|
||||
* ENOSPC - Hole too small to fulfill "slots" request (DNODE_MUST_BE_FREE)
|
||||
@ -1291,6 +1294,7 @@ dnode_hold_impl(objset_t *os, uint64_t object, int flag, int slots,
|
||||
|
||||
ASSERT(!(flag & DNODE_MUST_BE_ALLOCATED) || (slots == 0));
|
||||
ASSERT(!(flag & DNODE_MUST_BE_FREE) || (slots > 0));
|
||||
IMPLY(flag & DNODE_DRY_RUN, (tag == NULL) && (dnp == NULL));
|
||||
|
||||
/*
|
||||
* If you are holding the spa config lock as writer, you shouldn't
|
||||
@ -1320,8 +1324,11 @@ dnode_hold_impl(objset_t *os, uint64_t object, int flag, int slots,
|
||||
if ((flag & DNODE_MUST_BE_FREE) && type != DMU_OT_NONE)
|
||||
return (SET_ERROR(EEXIST));
|
||||
DNODE_VERIFY(dn);
|
||||
(void) zfs_refcount_add(&dn->dn_holds, tag);
|
||||
*dnp = dn;
|
||||
/* Don't actually hold if dry run, just return 0 */
|
||||
if (!(flag & DNODE_DRY_RUN)) {
|
||||
(void) zfs_refcount_add(&dn->dn_holds, tag);
|
||||
*dnp = dn;
|
||||
}
|
||||
return (0);
|
||||
}
|
||||
|
||||
@ -1462,6 +1469,14 @@ dnode_hold_impl(objset_t *os, uint64_t object, int flag, int slots,
|
||||
return (SET_ERROR(ENOENT));
|
||||
}
|
||||
|
||||
/* Don't actually hold if dry run, just return 0 */
|
||||
if (flag & DNODE_DRY_RUN) {
|
||||
mutex_exit(&dn->dn_mtx);
|
||||
dnode_slots_rele(dnc, idx, slots);
|
||||
dbuf_rele(db, FTAG);
|
||||
return (0);
|
||||
}
|
||||
|
||||
DNODE_STAT_BUMP(dnode_hold_alloc_hits);
|
||||
} else if (flag & DNODE_MUST_BE_FREE) {
|
||||
|
||||
@ -1519,6 +1534,14 @@ dnode_hold_impl(objset_t *os, uint64_t object, int flag, int slots,
|
||||
return (SET_ERROR(EEXIST));
|
||||
}
|
||||
|
||||
/* Don't actually hold if dry run, just return 0 */
|
||||
if (flag & DNODE_DRY_RUN) {
|
||||
mutex_exit(&dn->dn_mtx);
|
||||
dnode_slots_rele(dnc, idx, slots);
|
||||
dbuf_rele(db, FTAG);
|
||||
return (0);
|
||||
}
|
||||
|
||||
dnode_set_slots(dnc, idx + 1, slots - 1, DN_SLOT_INTERIOR);
|
||||
DNODE_STAT_BUMP(dnode_hold_free_hits);
|
||||
} else {
|
||||
@ -1526,15 +1549,7 @@ dnode_hold_impl(objset_t *os, uint64_t object, int flag, int slots,
|
||||
return (SET_ERROR(EINVAL));
|
||||
}
|
||||
|
||||
if (dn->dn_free_txg) {
|
||||
DNODE_STAT_BUMP(dnode_hold_free_txg);
|
||||
type = dn->dn_type;
|
||||
mutex_exit(&dn->dn_mtx);
|
||||
dnode_slots_rele(dnc, idx, slots);
|
||||
dbuf_rele(db, FTAG);
|
||||
return (SET_ERROR((flag & DNODE_MUST_BE_ALLOCATED) ?
|
||||
ENOENT : EEXIST));
|
||||
}
|
||||
ASSERT0(dn->dn_free_txg);
|
||||
|
||||
if (zfs_refcount_add(&dn->dn_holds, tag) == 1)
|
||||
dbuf_add_ref(db, dnh);
|
||||
@ -1625,6 +1640,16 @@ dnode_rele_and_unlock(dnode_t *dn, void *tag, boolean_t evicting)
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
* Test whether we can create a dnode at the specified location.
|
||||
*/
|
||||
int
|
||||
dnode_try_claim(objset_t *os, uint64_t object, int slots)
|
||||
{
|
||||
return (dnode_hold_impl(os, object, DNODE_MUST_BE_FREE | DNODE_DRY_RUN,
|
||||
slots, NULL, NULL));
|
||||
}
|
||||
|
||||
void
|
||||
dnode_setdirty(dnode_t *dn, dmu_tx_t *tx)
|
||||
{
|
||||
|
@ -337,8 +337,8 @@ zfs_replay_create_acl(void *arg1, void *arg2, boolean_t byteswap)
|
||||
xva.xva_vattr.va_nblocks = lr->lr_gen;
|
||||
xva.xva_vattr.va_fsid = dnodesize;
|
||||
|
||||
error = dmu_object_info(zfsvfs->z_os, lr->lr_foid, NULL);
|
||||
if (error != ENOENT)
|
||||
error = dnode_try_claim(zfsvfs->z_os, objid, dnodesize >> DNODE_SHIFT);
|
||||
if (error)
|
||||
goto bail;
|
||||
|
||||
if (lr->lr_common.lrc_txtype & TX_CI)
|
||||
@ -473,8 +473,8 @@ zfs_replay_create(void *arg1, void *arg2, boolean_t byteswap)
|
||||
xva.xva_vattr.va_nblocks = lr->lr_gen;
|
||||
xva.xva_vattr.va_fsid = dnodesize;
|
||||
|
||||
error = dmu_object_info(zfsvfs->z_os, objid, NULL);
|
||||
if (error != ENOENT)
|
||||
error = dnode_try_claim(zfsvfs->z_os, objid, dnodesize >> DNODE_SHIFT);
|
||||
if (error)
|
||||
goto out;
|
||||
|
||||
if (lr->lr_common.lrc_txtype & TX_CI)
|
||||
|
@ -835,8 +835,8 @@ tags = ['functional', 'scrub_mirror']
|
||||
tests = ['slog_001_pos', 'slog_002_pos', 'slog_003_pos', 'slog_004_pos',
|
||||
'slog_005_pos', 'slog_006_pos', 'slog_007_pos', 'slog_008_neg',
|
||||
'slog_009_neg', 'slog_010_neg', 'slog_011_neg', 'slog_012_neg',
|
||||
'slog_013_pos', 'slog_014_pos', 'slog_015_neg', 'slog_replay_fs',
|
||||
'slog_replay_volume']
|
||||
'slog_013_pos', 'slog_014_pos', 'slog_015_neg', 'slog_replay_fs_001',
|
||||
'slog_replay_fs_002', 'slog_replay_volume']
|
||||
tags = ['functional', 'slog']
|
||||
|
||||
[tests/functional/snapshot]
|
||||
|
@ -17,7 +17,8 @@ dist_pkgdata_SCRIPTS = \
|
||||
slog_013_pos.ksh \
|
||||
slog_014_pos.ksh \
|
||||
slog_015_neg.ksh \
|
||||
slog_replay_fs.ksh \
|
||||
slog_replay_fs_001.ksh \
|
||||
slog_replay_fs_002.ksh \
|
||||
slog_replay_volume.ksh
|
||||
|
||||
dist_pkgdata_DATA = \
|
||||
|
137
tests/zfs-tests/tests/functional/slog/slog_replay_fs_002.ksh
Executable file
137
tests/zfs-tests/tests/functional/slog/slog_replay_fs_002.ksh
Executable file
@ -0,0 +1,137 @@
|
||||
#!/bin/ksh -p
|
||||
#
|
||||
# CDDL HEADER START
|
||||
#
|
||||
# The contents of this file are subject to the terms of the
|
||||
# Common Development and Distribution License (the "License").
|
||||
# You may not use this file except in compliance with the License.
|
||||
#
|
||||
# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
|
||||
# or http://www.opensolaris.org/os/licensing.
|
||||
# See the License for the specific language governing permissions
|
||||
# and limitations under the License.
|
||||
#
|
||||
# When distributing Covered Code, include this CDDL HEADER in each
|
||||
# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
|
||||
# If applicable, add the following below this CDDL HEADER, with the
|
||||
# fields enclosed by brackets "[]" replaced with your own identifying
|
||||
# information: Portions Copyright [yyyy] [name of copyright owner]
|
||||
#
|
||||
# CDDL HEADER END
|
||||
#
|
||||
|
||||
#
|
||||
# Copyright 2007 Sun Microsystems, Inc. All rights reserved.
|
||||
# Use is subject to license terms.
|
||||
#
|
||||
|
||||
. $STF_SUITE/tests/functional/slog/slog.kshlib
|
||||
|
||||
#
|
||||
# DESCRIPTION:
|
||||
# Verify slog replay correctly when TX_REMOVEs are followed by
|
||||
# TX_CREATEs.
|
||||
#
|
||||
# STRATEGY:
|
||||
# 1. Create a file system (TESTFS) with a lot of files
|
||||
# 2. Freeze TESTFS
|
||||
# 3. Remove all files then create a lot of files
|
||||
# 4. Copy TESTFS to temporary location (TESTDIR/copy)
|
||||
# 5. Unmount filesystem
|
||||
# <at this stage TESTFS is empty again and unfrozen, and the
|
||||
# intent log contains a complete set of deltas to replay it>
|
||||
# 6. Remount TESTFS <which replays the intent log>
|
||||
# 7. Compare TESTFS against the TESTDIR/copy
|
||||
#
|
||||
|
||||
verify_runnable "global"
|
||||
|
||||
function cleanup_fs
|
||||
{
|
||||
cleanup
|
||||
}
|
||||
|
||||
log_assert "Replay of intent log succeeds."
|
||||
log_onexit cleanup_fs
|
||||
log_must setup
|
||||
|
||||
#
|
||||
# 1. Create a file system (TESTFS) with a lot of files
|
||||
#
|
||||
log_must zpool create $TESTPOOL $VDEV log mirror $LDEV
|
||||
log_must zfs set compression=on $TESTPOOL
|
||||
log_must zfs create $TESTPOOL/$TESTFS
|
||||
|
||||
# Prep for the test of TX_REMOVE followed by TX_CREATE
|
||||
dnsize=(legacy auto 1k 2k 4k 8k 16k)
|
||||
NFILES=200
|
||||
log_must mkdir /$TESTPOOL/$TESTFS/dir0
|
||||
log_must eval 'for i in $(seq $NFILES); do zfs set dnodesize=${dnsize[$RANDOM % ${#dnsize[@]}]} $TESTPOOL/$TESTFS; touch /$TESTPOOL/$TESTFS/dir0/file.$i; done'
|
||||
|
||||
#
|
||||
# Reimport to reset dnode allocation pointer.
|
||||
# This is to make sure we will have TX_REMOVE and TX_CREATE on same id
|
||||
#
|
||||
log_must zpool export $TESTPOOL
|
||||
log_must zpool import -f -d $VDIR $TESTPOOL
|
||||
|
||||
#
|
||||
# This dd command works around an issue where ZIL records aren't created
|
||||
# after freezing the pool unless a ZIL header already exists. Create a file
|
||||
# synchronously to force ZFS to write one out.
|
||||
#
|
||||
log_must dd if=/dev/zero of=/$TESTPOOL/$TESTFS/sync \
|
||||
conv=fdatasync,fsync bs=1 count=1
|
||||
|
||||
#
|
||||
# 2. Freeze TESTFS
|
||||
#
|
||||
log_must zpool freeze $TESTPOOL
|
||||
|
||||
#
|
||||
# 3. Remove all files then create a lot of files
|
||||
#
|
||||
# TX_REMOVE followed by TX_CREATE
|
||||
log_must eval 'rm -f /$TESTPOOL/$TESTFS/dir0/*'
|
||||
log_must eval 'for i in $(seq $NFILES); do zfs set dnodesize=${dnsize[$RANDOM % ${#dnsize[@]}]} $TESTPOOL/$TESTFS; touch /$TESTPOOL/$TESTFS/dir0/file.$i; done'
|
||||
|
||||
#
|
||||
# 4. Copy TESTFS to temporary location (TESTDIR/copy)
|
||||
#
|
||||
log_must mkdir -p $TESTDIR/copy
|
||||
log_must cp -a /$TESTPOOL/$TESTFS/* $TESTDIR/copy/
|
||||
|
||||
#
|
||||
# 5. Unmount filesystem and export the pool
|
||||
#
|
||||
# At this stage TESTFS is empty again and frozen, the intent log contains
|
||||
# a complete set of deltas to replay.
|
||||
#
|
||||
log_must zfs unmount /$TESTPOOL/$TESTFS
|
||||
|
||||
log_note "Verify transactions to replay:"
|
||||
log_must zdb -iv $TESTPOOL/$TESTFS
|
||||
|
||||
log_must zpool export $TESTPOOL
|
||||
|
||||
#
|
||||
# 6. Remount TESTFS <which replays the intent log>
|
||||
#
|
||||
# Import the pool to unfreeze it and claim log blocks. It has to be
|
||||
# `zpool import -f` because we can't write a frozen pool's labels!
|
||||
#
|
||||
log_must zpool import -f -d $VDIR $TESTPOOL
|
||||
|
||||
#
|
||||
# 7. Compare TESTFS against the TESTDIR/copy
|
||||
#
|
||||
log_note "Verify current block usage:"
|
||||
log_must zdb -bcv $TESTPOOL
|
||||
|
||||
log_note "Verify number of files"
|
||||
log_must test "$(ls /$TESTPOOL/$TESTFS/dir0 | wc -l)" -eq $NFILES
|
||||
|
||||
log_note "Verify working set diff:"
|
||||
log_must diff -r /$TESTPOOL/$TESTFS $TESTDIR/copy
|
||||
|
||||
log_pass "Replay of intent log succeeds."
|
Loading…
Reference in New Issue
Block a user