Multiple fixes to the zfsd test suite

* Wait for gnop devices to disappear after "gnop destroy".

  Apparently that process is now asynchronous, or maybe it's just slower
  than it used to be.  Also, after removing a gnop, wait for its pool to
  become DEGRADED; that isn't instant.
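
  As a rough sketch, the waiting pattern amounts to the following (the
  gnop and zpool commands are the ones used in the diff below; the retry
  counts here are only illustrative):

      # Poll until the gnop provider is gone, backing off a bit each time.
      for i in `seq 5`; do
          gnop status ${disk}.nop >/dev/null 2>&1 || break
          sleep $i
      done
      # Then poll until the pool has noticed the missing vdev.
      for i in `seq 10`; do
          $ZPOOL status $TESTPOOL | grep -q 'state: DEGRADED' && break
          sleep 1
      done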

* The zfsd tests no longer require camcontrol.

  This was a harmless oversight from commit 11ed0a95bf.

* Fix the zfsd_degrade_001_pos test for recent ZFS versions.

  ZFS now rate-limits checksum errors to about 20 per second, but
  zfsd's threshold for degrading a disk is 50 errors per minute.  So we
  must alternately corrupt and scrub the pool, spreading the checksum
  errors across multiple one-second windows so that zfsd sees enough of
  them.
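
  To see why a single pass is not enough: ZFS reports at most about 20
  checksum errors in any one-second window, so clearing zfsd's
  50-per-minute threshold needs errors in at least ceil(50/20) = 3
  separate windows.  A condensed sketch of the resulting loop, using the
  same suite helpers and variables that appear in the diff below (the
  60-second deadline is an illustrative bound, and "break" stands in for
  the real helper's early return):

      typeset -li deadline=$(( $(date +%s) + 60 ))
      while (( $(date +%s) < deadline )); do
          # Each pass corrupts the vdev again, then scrubs so the errors
          # land in a fresh rate-limit window.
          log_must $DD if=/dev/urandom of=$vdev bs=1024k count=64 conv=notrunc
          log_must $ZPOOL scrub $pool
          wait_until_scrubbed $pool
          # Stop as soon as zfsd has degraded the vdev.
          check_state "$pool" "$vdev" DEGRADED && break
          $SLEEP 1
      done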

* Fix the zfsd_fault_001_pos test in VMs.

  And, for that matter, when using NVMe or SATA disks.  As originally
  written, the test used the da driver (FreeBSD's SCSI disk driver) to
  inject errors, so it only worked when at least one test disk was a da
  device.  Rewrite it to use gnop vdevs instead: gnop can also inject
  errors, it works on top of any disk device, and it's faster than
  using da.
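
  For reference, the gnop-based injection pattern the rewritten test
  relies on boils down to the following sketch (the error number 5 (EIO)
  and the 100% write-failure rate are the values used in the diff below;
  DISK1 and TESTPOOL are the suite's variables):

      gnop create ${DISK1}                      # exposes ${DISK1}.nop on top of the real disk
      gnop configure -e 5 -w 100 ${DISK1}.nop   # fail 100% of writes with error 5 (EIO)
      # ... do I/O to the pool until zfsd faults the vdev ...
      gnop configure -w 0 ${DISK1}.nop          # stop injecting errors, "healing" the disk
      zpool online $TESTPOOL ${DISK1}.nop       # bring the vdev back online and resilver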

MFC after:	2 weeks
Sponsored by:	Axcient
Differential Revision: https://reviews.freebsd.org/D39437
Alan Somers 2023-04-03 15:43:17 -06:00
parent 92642bba4d
commit dba2e89ea7
7 changed files with 53 additions and 72 deletions


@@ -85,6 +85,12 @@ function destroy_gnop
# Use "-f" so we can destroy a gnop with a consumer (like ZFS)
gnop destroy -f ${disk}.nop
# Wait for it to disappear
for i in `seq 5`; do
gnop status ${disk}.nop >/dev/null 2>/dev/null || break
sleep $i
done
}
# Destroy multiple gnop devices. Attempt to destroy them all, ignoring errors


@@ -72,19 +72,32 @@ function corrupt_pool_vdev
typeset pool=$1
typeset vdev=$2
typeset file=$3
typeset -li start=0
typeset -li now=0
typeset -li timeout=60
# do some IO on the pool
log_must $DD if=/dev/zero of=$file bs=1024k count=64
$FSYNC $file
# scribble on the underlying file to corrupt the vdev
log_must $DD if=/dev/urandom of=$vdev bs=1024k count=64 conv=notrunc
# ZFS rate limits checksum errors to about 20 per second. So in order
# to ensure that we reach zfsd's threshold, we must alternately
# scribble and scrub.
while (( "$now" - "$start" < "$timeout" )); do
# scribble on the underlying file to corrupt the vdev
log_must $DD if=/dev/urandom of=$vdev bs=1024k count=64 conv=notrunc
# Scrub the pool to detect the corruption
log_must $ZPOOL scrub $pool
wait_until_scrubbed $pool
# Scrub the pool to detect and repair the corruption
log_must $ZPOOL scrub $pool
wait_until_scrubbed $pool
now=`date +%s`
if [ "$start" -eq 0 ]; then
start=`date +%s`
fi
check_state "$pool" "$vdev" DEGRADED && return
$SLEEP 1
done
# ZFSD can take up to 60 seconds to degrade an array in response to
# errors (though it's usually faster).
wait_for_pool_dev_state_change 60 $vdev DEGRADED
log_must $ZPOOL status "$pool"
log_fail "ERROR: Disk $vdev not marked as DEGRADED in $pool"
}


@@ -83,6 +83,7 @@ for keyword in "${MY_KEYWORDS[@]}" ; do
log_must $ZPOOL set autoreplace=on $TESTPOOL
log_must destroy_gnop $REMOVAL_DISK
log_must wait_for_pool_removal 20
log_must create_gnop $NEW_DISK $PHYSPATH
verify_assertion
destroy_pool "$TESTPOOL"


@@ -93,6 +93,7 @@ for keyword in "${MY_KEYWORDS[@]}" ; do
log_must $ZPOOL set autoreplace=on $TESTPOOL
log_must destroy_gnop $REMOVAL_DISK
log_must wait_for_pool_removal 20
log_must create_gnop $NEW_DISK $PHYSPATH
verify_assertion
destroy_pool "$TESTPOOL"


@@ -27,6 +27,7 @@
# $FreeBSD$
. $STF_SUITE/include/libtest.kshlib
. $STF_SUITE/include/libgnop.kshlib
################################################################################
#
@@ -39,8 +40,7 @@
#
#
# STRATEGY:
# 1. Create a storage pool. Only use the da driver (FreeBSD's SCSI disk
# driver) because it has a special interface for simulating IO errors.
# 1. Create a storage pool. Use gnop vdevs so we can inject I/O errors.
# 2. Inject IO errors while doing IO to the pool.
# 3. Verify that the vdev becomes FAULTED.
# 4. ONLINE it and verify that it resilvers and joins the pool.
@@ -57,65 +57,28 @@
verify_runnable "global"
function cleanup
{
# Disable error injection, if still active
sysctl kern.cam.da.$TMPDISKNUM.error_inject=0 > /dev/null
if poolexists $TESTPOOL; then
# We should not get here if the test passed. Print the output
# of zpool status to assist in debugging.
$ZPOOL status
# Clear out artificially generated errors and destroy the pool
$ZPOOL clear $TESTPOOL
destroy_pool $TESTPOOL
fi
}
log_assert "ZFS will fault a vdev that produces IO errors"
log_onexit cleanup
ensure_zfsd_running
# Make sure that at least one of the disks is using the da driver, and use
# that disk for inject errors
typeset TMPDISK=""
for d in $DISKS
do
b=`basename $d`
if test ${b%%[0-9]*} == da
then
TMPDISK=$b
TMPDISKNUM=${b##da}
break
fi
done
if test -z $TMPDISK
then
log_unsupported "This test requires at least one disk to use the da driver"
fi
DISK0_NOP=${DISK0}.nop
DISK1_NOP=${DISK1}.nop
log_must create_gnops $DISK0 $DISK1
for type in "raidz" "mirror"; do
log_note "Testing raid type $type"
# Create a pool on the supplied disks
create_pool $TESTPOOL $type $DISKS
create_pool $TESTPOOL $type "$DISK0_NOP" "$DISK1_NOP"
log_must $ZFS create $TESTPOOL/$TESTFS
# Cause some IO errors writing to the pool
while true; do
# Running zpool status after every dd operation is too slow.
# So we will run several dd's in a row before checking zpool
# status. sync between dd operations to ensure that the disk
# gets IO
for ((i=0; $i<64; i=$i+1)); do
sysctl kern.cam.da.$TMPDISKNUM.error_inject=1 > \
/dev/null
$DD if=/dev/zero bs=128k count=1 >> \
/$TESTPOOL/$TESTFS/$TESTFILE 2> /dev/null
$FSYNC /$TESTPOOL/$TESTFS/$TESTFILE
done
log_must gnop configure -e 5 -w 100 "$DISK1_NOP"
$DD if=/dev/zero bs=128k count=1 >> \
/$TESTPOOL/$TESTFS/$TESTFILE 2> /dev/null
$FSYNC /$TESTPOOL/$TESTFS/$TESTFILE
# Check to see if the pool is faulted yet
$ZPOOL status $TESTPOOL | grep -q 'state: DEGRADED'
if [ $? == 0 ]
@@ -127,15 +90,9 @@ for type in "raidz" "mirror"; do
log_must check_state $TESTPOOL $TMPDISK "FAULTED"
#find the failed disk guid
typeset FAILED_VDEV=`$ZPOOL status $TESTPOOL |
awk "/^[[:space:]]*$TMPDISK[[:space:]]*FAULTED/ {print \\$1}"`
# Reattach the failed disk
$ZPOOL online $TESTPOOL $FAILED_VDEV > /dev/null
if [ $? != 0 ]; then
log_fail "Could not reattach $FAILED_VDEV"
fi
# Heal and reattach the failed disk
log_must gnop configure -w 0 "$DISK1_NOP"
log_must $ZPOOL online $TESTPOOL "$DISK1_NOP"
# Verify that the pool resilvers and goes to the ONLINE state
for (( retries=60; $retries>0; retries=$retries+1 ))


@@ -58,6 +58,7 @@ for type in "raidz" "mirror"; do
# Disable the first disk.
log_must destroy_gnop $REMOVAL_DISK
log_must wait_for_pool_removal 20
# Write out data to make sure we can do I/O after the disk failure
log_must $DD if=/dev/zero of=$TESTDIR/$TESTFILE bs=1m count=1


@@ -30,12 +30,14 @@ atf_test_case zfsd_fault_001_pos cleanup
zfsd_fault_001_pos_head()
{
atf_set "descr" "ZFS will fault a vdev that produces IO errors"
atf_set "require.progs" "ksh93 zfs zpool zfsd"
atf_set "require.progs" "ksh93 gnop zfs zpool zfsd"
atf_set "timeout" 300
}
zfsd_fault_001_pos_body()
{
. $(atf_get_srcdir)/../../include/default.cfg
. $(atf_get_srcdir)/../hotspare/hotspare.kshlib
. $(atf_get_srcdir)/../hotspare/hotspare.cfg
. $(atf_get_srcdir)/zfsd.cfg
verify_disk_count "$DISKS" 2
@@ -212,7 +214,7 @@ atf_test_case zfsd_hotspare_004_pos cleanup
zfsd_hotspare_004_pos_head()
{
atf_set "descr" "Removing a disk from a pool results in the spare activating"
atf_set "require.progs" "ksh93 gnop zpool camcontrol zfsd"
atf_set "require.progs" "ksh93 gnop zpool"
atf_set "timeout" 3600
}
zfsd_hotspare_004_pos_body()
@@ -303,7 +305,7 @@ atf_test_case zfsd_hotspare_007_pos cleanup
zfsd_hotspare_007_pos_head()
{
atf_set "descr" "zfsd will swap failed drives at startup"
atf_set "require.progs" "ksh93 gnop zpool camcontrol zfsd"
atf_set "require.progs" "ksh93 gnop zpool"
atf_set "timeout" 3600
}
zfsd_hotspare_007_pos_body()
@@ -364,7 +366,7 @@ atf_test_case zfsd_autoreplace_001_neg cleanup
zfsd_autoreplace_001_neg_head()
{
atf_set "descr" "A pool without autoreplace set will not replace by physical path"
atf_set "require.progs" "ksh93 zpool camcontrol zfsd gnop"
atf_set "require.progs" "ksh93 zpool gnop"
atf_set "timeout" 3600
}
zfsd_autoreplace_001_neg_body()
@@ -425,7 +427,7 @@ atf_test_case zfsd_autoreplace_003_pos cleanup
zfsd_autoreplace_003_pos_head()
{
atf_set "descr" "A pool with autoreplace set will replace by physical path even if a spare is active"
atf_set "require.progs" "ksh93 zpool camcontrol zfsd gnop"
atf_set "require.progs" "ksh93 zpool gnop"
atf_set "timeout" 3600
}
zfsd_autoreplace_003_pos_body()
@@ -456,7 +458,7 @@ atf_test_case zfsd_replace_001_pos cleanup
zfsd_replace_001_pos_head()
{
atf_set "descr" "ZFSD will automatically replace a SAS disk that disappears and reappears in the same location, with the same devname"
atf_set "require.progs" "ksh93 zpool camcontrol zfsd zfs gnop"
atf_set "require.progs" "ksh93 zpool zfs gnop"
}
zfsd_replace_001_pos_body()
{
@@ -485,7 +487,7 @@ atf_test_case zfsd_replace_002_pos cleanup
zfsd_replace_002_pos_head()
{
atf_set "descr" "zfsd will reactivate a pool after all disks are failed and reappeared"
atf_set "require.progs" "ksh93 zpool camcontrol zfsd zfs"
atf_set "require.progs" "ksh93 zpool zfs"
}
zfsd_replace_002_pos_body()
{
@@ -514,7 +516,7 @@ atf_test_case zfsd_replace_003_pos cleanup
zfsd_replace_003_pos_head()
{
atf_set "descr" "ZFSD will correctly replace disks that dissapear and reappear with different devnames"
atf_set "require.progs" "ksh93 zpool camcontrol zfsd zfs gnop"
atf_set "require.progs" "ksh93 zpool zfs gnop"
}
zfsd_replace_003_pos_body()
{