Fix zpool status in case of unloaded keys

When scrubbing an encrypted filesystem with unloaded key still report an error in zpool status. Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov> Reviewed-by: Alek Pinchuk <apinchuk@axcient.com> Signed-off-by: George Amanakis <gamanakis@gmail.com> Closes #13675 Closes #13717
2022-08-23 02:42:01 +02:00 · 2022-08-23 02:42:01 +02:00 · 0c4064d9a0
commit 0c4064d9a0
parent 17e212652d
9 changed files with 190 additions and 34 deletions
--- a/include/sys/dsl_crypt.h
+++ b/include/sys/dsl_crypt.h
@ -222,5 +222,6 @@ int spa_do_crypt_abd(boolean_t encrypt, spa_t *spa, const zbookmark_phys_t *zb,
    dmu_object_type_t ot, boolean_t dedup, boolean_t bswap, uint8_t *salt,
    uint8_t *iv, uint8_t *mac, uint_t datalen, abd_t *pabd, abd_t *cabd,
    boolean_t *no_crypt);
+zfs_keystatus_t dsl_dataset_get_keystatus(dsl_dir_t *dd);

 #endif
--- a/man/man7/zpool-features.7
+++ b/man/man7/zpool-features.7
@ -545,6 +545,9 @@ This feature enables the upgraded version of errlog, which required an on-disk
 error log format change.
 Now the error log of each head dataset is stored separately in the zap object
 and keyed by the head id.
+In case of encrypted filesystems with unloaded keys or unmounted encrypted
+filesystems we are unable to check their snapshots or clones for errors and
+these will not be reported.
 With this feature enabled, every dataset affected by an error block is listed
 in the output of
 .Nm zpool Cm status .
--- a/module/zfs/dsl_crypt.c
+++ b/module/zfs/dsl_crypt.c
@ -1137,7 +1137,7 @@ dmu_objset_check_wkey_loaded(dsl_dir_t *dd)
 	return (0);
 }

-static zfs_keystatus_t
+zfs_keystatus_t
 dsl_dataset_get_keystatus(dsl_dir_t *dd)
 {
 	/* check if this dd has a has a dsl key */
--- a/module/zfs/spa_errlog.c
+++ b/module/zfs/spa_errlog.c
@ -21,8 +21,8 @@
 /*
 * Copyright (c) 2006, 2010, Oracle and/or its affiliates. All rights reserved.
 * Copyright (c) 2013, 2014, Delphix. All rights reserved.
- * Copyright (c) 2021, George Amanakis. All rights reserved.
 * Copyright (c) 2019 Datto Inc.
+ * Copyright (c) 2021, 2022, George Amanakis. All rights reserved.
 */

 /*
@ -68,6 +68,7 @@
 #include <sys/dsl_dir.h>
 #include <sys/dmu_objset.h>
 #include <sys/dbuf.h>
+#include <sys/zfs_znode.h>

 #define	NAME_MAX_LEN 64

@ -175,6 +176,23 @@ get_head_and_birth_txg(spa_t *spa, zbookmark_err_phys_t *zep, uint64_t ds_obj,
 		return (error);
 	}

+	/*
+	 * If the key is not loaded dbuf_dnode_findbp() will error out with
+	 * EACCES. However in that case dnode_hold() will eventually call
+	 * dbuf_read()->zio_wait() which may call spa_log_error(). This will
+	 * lead to a deadlock due to us holding the mutex spa_errlist_lock.
+	 * Avoid this by checking here if the keys are loaded, if not return.
+	 * If the keys are not loaded the head_errlog feature is meaningless
+	 * as we cannot figure out the birth txg of the block pointer.
+	 */
+	if (dsl_dataset_get_keystatus(ds->ds_dir) ==
+	    ZFS_KEYSTATUS_UNAVAILABLE) {
+		zep->zb_birth = 0;
+		dsl_dataset_rele(ds, FTAG);
+		dsl_pool_config_exit(dp, FTAG);
+		return (0);
+	}
+
 	dnode_t *dn;
 	blkptr_t bp;

@ -188,11 +206,22 @@ get_head_and_birth_txg(spa_t *spa, zbookmark_err_phys_t *zep, uint64_t ds_obj,
 	rw_enter(&dn->dn_struct_rwlock, RW_READER);
 	error = dbuf_dnode_findbp(dn, zep->zb_level, zep->zb_blkid, &bp, NULL,
 	    NULL);
-
 	if (error == 0 && BP_IS_HOLE(&bp))
 		error = SET_ERROR(ENOENT);

-	zep->zb_birth = bp.blk_birth;
+	/*
+	 * If the key is loaded but the encrypted filesystem is unmounted when
+	 * a scrub is run, then dbuf_dnode_findbp() will still error out with
+	 * EACCES (possibly due to the key mapping being removed upon
+	 * unmounting). In that case the head_errlog feature is also
+	 * meaningless as we cannot figure out the birth txg of the block
+	 * pointer.
+	 */
+	if (error == EACCES)
+		error = 0;
+	else if (!error)
+		zep->zb_birth = bp.blk_birth;
+
 	rw_exit(&dn->dn_struct_rwlock);
 	dnode_rele(dn, FTAG);
 	dsl_dataset_rele(ds, FTAG);
@ -264,7 +293,6 @@ find_birth_txg(dsl_dataset_t *ds, zbookmark_err_phys_t *zep,
 	rw_enter(&dn->dn_struct_rwlock, RW_READER);
 	error = dbuf_dnode_findbp(dn, zep->zb_level, zep->zb_blkid, &bp, NULL,
 	    NULL);
-
 	if (error == 0 && BP_IS_HOLE(&bp))
 		error = SET_ERROR(ENOENT);

@ -298,27 +326,40 @@ check_filesystem(spa_t *spa, uint64_t head_ds, zbookmark_err_phys_t *zep,
 	uint64_t txg_to_consider = spa->spa_syncing_txg;
 	boolean_t check_snapshot = B_TRUE;
 	error = find_birth_txg(ds, zep, &latest_txg);
-	if (error == 0) {
-		if (zep->zb_birth == latest_txg) {
-			/* Block neither free nor rewritten. */
-			if (!only_count) {
-				zbookmark_phys_t zb;
-				zep_to_zb(head_ds, zep, &zb);
-				if (copyout(&zb, (char *)uaddr + (*count - 1)
-				    * sizeof (zbookmark_phys_t),
-				    sizeof (zbookmark_phys_t)) != 0) {
-					dsl_dataset_rele(ds, FTAG);
-					return (SET_ERROR(EFAULT));
-				}
-				(*count)--;
-			} else {
-				(*count)++;
+
+	/*
+	 * If we cannot figure out the current birth txg of the block pointer
+	 * error out. If the filesystem is encrypted and the key is not loaded
+	 * or the encrypted filesystem is not mounted the error will be EACCES.
+	 * In that case do not return an error.
+	 */
+	if (error == EACCES) {
+		dsl_dataset_rele(ds, FTAG);
+		return (0);
+	}
+	if (error) {
+		dsl_dataset_rele(ds, FTAG);
+		return (error);
+	}
+	if (zep->zb_birth == latest_txg) {
+		/* Block neither free nor rewritten. */
+		if (!only_count) {
+			zbookmark_phys_t zb;
+			zep_to_zb(head_ds, zep, &zb);
+			if (copyout(&zb, (char *)uaddr + (*count - 1)
+			    * sizeof (zbookmark_phys_t),
+			    sizeof (zbookmark_phys_t)) != 0) {
+				dsl_dataset_rele(ds, FTAG);
+				return (SET_ERROR(EFAULT));
 			}
-			check_snapshot = B_FALSE;
+			(*count)--;
 		} else {
-			ASSERT3U(zep->zb_birth, <, latest_txg);
-			txg_to_consider = latest_txg;
+			(*count)++;
 		}
+		check_snapshot = B_FALSE;
+	} else {
+		ASSERT3U(zep->zb_birth, <, latest_txg);
+		txg_to_consider = latest_txg;
 	}

 	/* How many snapshots reference this block. */
@ -439,9 +480,31 @@ process_error_block(spa_t *spa, uint64_t head_ds, zbookmark_err_phys_t *zep,
    uint64_t *count, void *uaddr, boolean_t only_count)
 {
 	dsl_pool_t *dp = spa->spa_dsl_pool;
-	dsl_pool_config_enter(dp, FTAG);
 	uint64_t top_affected_fs;

+	/*
+	 * If the zb_birth is 0 it means we failed to retrieve the birth txg
+	 * of the block pointer. This happens when an encrypted filesystem is
+	 * not mounted or when the key is not loaded. Do not proceed to
+	 * check_filesystem(), instead do the accounting here.
+	 */
+	if (zep->zb_birth == 0) {
+		if (!only_count) {
+			zbookmark_phys_t zb;
+			zep_to_zb(head_ds, zep, &zb);
+			if (copyout(&zb, (char *)uaddr + (*count - 1)
+			    * sizeof (zbookmark_phys_t),
+			    sizeof (zbookmark_phys_t)) != 0) {
+				return (SET_ERROR(EFAULT));
+			}
+			(*count)--;
+		} else {
+			(*count)++;
+		}
+		return (0);
+	}
+
+	dsl_pool_config_enter(dp, FTAG);
 	int error = find_top_affected_fs(spa, head_ds, zep, &top_affected_fs);
 	if (error == 0)
 		error = check_filesystem(spa, top_affected_fs, zep, count,
@ -497,6 +560,7 @@ get_errlist_size(spa_t *spa, avl_tree_t *tree)
 		zep.zb_object = se->se_bookmark.zb_object;
 		zep.zb_level = se->se_bookmark.zb_level;
 		zep.zb_blkid = se->se_bookmark.zb_blkid;
+		zep.zb_birth = 0;

 		/*
 		 * If we cannot find out the head dataset and birth txg of
@ -505,8 +569,10 @@ get_errlist_size(spa_t *spa, avl_tree_t *tree)
 		 * sync_error_list() and written to the on-disk error log.
 		 */
 		uint64_t head_ds_obj;
-		if (get_head_and_birth_txg(spa, &zep,
-		    se->se_bookmark.zb_objset, &head_ds_obj) == 0)
+		int error = get_head_and_birth_txg(spa, &zep,
+		    se->se_bookmark.zb_objset, &head_ds_obj);
+
+		if (!error)
 			(void) process_error_block(spa, head_ds_obj, &zep,
 			    &total, NULL, B_TRUE);
 	}
@ -695,6 +761,7 @@ sync_upgrade_errlog(spa_t *spa, uint64_t spa_err_obj, uint64_t *newobj,
 		zep.zb_object = zb.zb_object;
 		zep.zb_level = zb.zb_level;
 		zep.zb_blkid = zb.zb_blkid;
+		zep.zb_birth = 0;

 		/*
 		 * We cannot use get_head_and_birth_txg() because it will
@ -737,8 +804,11 @@ sync_upgrade_errlog(spa_t *spa, uint64_t spa_err_obj, uint64_t *newobj,
 		rw_enter(&dn->dn_struct_rwlock, RW_READER);
 		error = dbuf_dnode_findbp(dn, zep.zb_level, zep.zb_blkid, &bp,
 		    NULL, NULL);
+		if (error == EACCES)
+			error = 0;
+		else if (!error)
+			zep.zb_birth = bp.blk_birth;

-		zep.zb_birth = bp.blk_birth;
 		rw_exit(&dn->dn_struct_rwlock);
 		dnode_rele(dn, FTAG);
 		dsl_dataset_rele(ds, FTAG);
@ -885,16 +955,16 @@ process_error_list(spa_t *spa, avl_tree_t *list, void *uaddr, uint64_t *count)
 		zep.zb_object = se->se_bookmark.zb_object;
 		zep.zb_level = se->se_bookmark.zb_level;
 		zep.zb_blkid = se->se_bookmark.zb_blkid;
+		zep.zb_birth = 0;

 		uint64_t head_ds_obj;
 		int error = get_head_and_birth_txg(spa, &zep,
 		    se->se_bookmark.zb_objset, &head_ds_obj);
-		if (error != 0)
-			return (error);

-		error = process_error_block(spa, head_ds_obj, &zep, count,
-		    uaddr, B_FALSE);
-		if (error != 0)
+		if (!error)
+			error = process_error_block(spa, head_ds_obj, &zep,
+			    count, uaddr, B_FALSE);
+		if (error)
 			return (error);
 	}
 	return (0);
@ -1013,6 +1083,7 @@ sync_error_list(spa_t *spa, avl_tree_t *t, uint64_t *obj, dmu_tx_t *tx)
 			zep.zb_object = se->se_bookmark.zb_object;
 			zep.zb_level = se->se_bookmark.zb_level;
 			zep.zb_blkid = se->se_bookmark.zb_blkid;
+			zep.zb_birth = 0;

 			/*
 			 * If we cannot find out the head dataset and birth txg
@ -1024,7 +1095,7 @@ sync_error_list(spa_t *spa, avl_tree_t *t, uint64_t *obj, dmu_tx_t *tx)
 			uint64_t head_dataset_obj;
 			int error = get_head_and_birth_txg(spa, &zep,
 			    se->se_bookmark.zb_objset, &head_dataset_obj);
-			if (error != 0)
+			if (error)
 				continue;

 			uint64_t err_obj;
--- a/tests/runfiles/common.run
+++ b/tests/runfiles/common.run
@ -494,7 +494,7 @@ tags = ['functional', 'cli_root', 'zpool_split']
 [tests/functional/cli_root/zpool_status]
 tests = ['zpool_status_001_pos', 'zpool_status_002_pos',
    'zpool_status_003_pos', 'zpool_status_004_pos',
-    'zpool_status_features_001_pos']
+    'zpool_status_005_pos', 'zpool_status_features_001_pos']
 tags = ['functional', 'cli_root', 'zpool_status']

 [tests/functional/cli_root/zpool_sync]
--- a/tests/zfs-tests/tests/Makefile.am
+++ b/tests/zfs-tests/tests/Makefile.am
@ -1161,6 +1161,7 @@ nobase_dist_datadir_zfs_tests_tests_SCRIPTS += \
 	functional/cli_root/zpool_status/zpool_status_002_pos.ksh \
 	functional/cli_root/zpool_status/zpool_status_003_pos.ksh \
 	functional/cli_root/zpool_status/zpool_status_004_pos.ksh \
+	functional/cli_root/zpool_status/zpool_status_005_pos.ksh \
 	functional/cli_root/zpool_status/zpool_status_features_001_pos.ksh \
 	functional/cli_root/zpool_sync/cleanup.ksh \
 	functional/cli_root/zpool_sync/setup.ksh \
--- a/tests/zfs-tests/tests/functional/cli_root/zpool_status/zpool_status_003_pos.ksh
+++ b/tests/zfs-tests/tests/functional/cli_root/zpool_status/zpool_status_003_pos.ksh
@ -63,6 +63,7 @@ log_must zfs snapshot $TESTPOOL2@snap
 log_must zfs clone $TESTPOOL2@snap $TESTPOOL2/clone

 # Look to see that snapshot, clone and filesystem our files report errors
+log_must zpool status -v $TESTPOOL2
 log_must eval "zpool status -v | grep '$TESTPOOL2@snap:/10m_file'"
 log_must eval "zpool status -v | grep '$TESTPOOL2/clone/10m_file'"
 log_must eval "zpool status -v | grep '$TESTPOOL2/10m_file'"
--- a/tests/zfs-tests/tests/functional/cli_root/zpool_status/zpool_status_004_pos.ksh
+++ b/tests/zfs-tests/tests/functional/cli_root/zpool_status/zpool_status_004_pos.ksh
@ -74,6 +74,7 @@ log_must eval "zpool status -v | grep '$TESTPOOL2/10m_file'"

 # Check that enabling the feature reports the error properly.
 log_must zpool set feature@head_errlog=enabled $TESTPOOL2
+log_must zpool status -v $TESTPOOL2
 log_must eval "zpool status -v | grep '$TESTPOOL2@snap:/10m_file'"
 log_must eval "zpool status -v | grep '$TESTPOOL2/clone/10m_file'"
 log_must eval "zpool status -v | grep '$TESTPOOL2/10m_file'"
--- a/tests/zfs-tests/tests/functional/cli_root/zpool_status/zpool_status_005_pos.ksh
+++ b/tests/zfs-tests/tests/functional/cli_root/zpool_status/zpool_status_005_pos.ksh
@ -0,0 +1,78 @@
+#!/bin/ksh -p
+#
+# CDDL HEADER START
+#
+# The contents of this file are subject to the terms of the
+# Common Development and Distribution License (the "License").
+# You may not use this file except in compliance with the License.
+#
+# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+# or https://opensource.org/licenses/CDDL-1.0.
+# See the License for the specific language governing permissions
+# and limitations under the License.
+#
+# When distributing Covered Code, include this CDDL HEADER in each
+# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+# If applicable, add the following below this CDDL HEADER, with the
+# fields enclosed by brackets "[]" replaced with your own identifying
+# information: Portions Copyright [yyyy] [name of copyright owner]
+#
+# CDDL HEADER END
+#
+
+#
+# Copyright (c) 2022 George Amanakis. All rights reserved.
+#
+
+. $STF_SUITE/include/libtest.shlib
+#
+# DESCRIPTION:
+# Verify correct output with 'zpool status -v' after corrupting a file
+#
+# STRATEGY:
+# 1. Create a pool, an ancrypted filesystem and a file
+# 2. zinject checksum errors
+# 3. Unmount the filesystem and unload the key
+# 4. Scrub the pool
+# 5. Verify we report errors in the pool in 'zpool status -v'
+
+verify_runnable "both"
+
+DISK=${DISKS%% *}
+
+function cleanup
+{
+	log_must zinject -c all
+	destroy_pool $TESTPOOL2
+	rm -f $TESTDIR/vdev_a
+}
+
+log_assert "Verify reporting errors with unloaded keys works"
+log_onexit cleanup
+
+typeset passphrase="password"
+typeset file="/$TESTPOOL2/$TESTFS1/$TESTFILE0"
+
+truncate -s $MINVDEVSIZE $TESTDIR/vdev_a
+log_must zpool create -f -o feature@head_errlog=enabled $TESTPOOL2 $TESTDIR/vdev_a
+
+log_must eval "echo $passphrase > /$TESTPOOL2/pwd"
+
+log_must zfs create -o encryption=aes-256-ccm -o keyformat=passphrase \
+    -o keylocation=file:///$TESTPOOL2/pwd -o primarycache=none \
+    $TESTPOOL2/$TESTFS1
+
+log_must dd if=/dev/urandom of=$file bs=1024 count=1024 oflag=sync
+log_must eval "echo 'aaaaaaaa' >> "$file
+
+corrupt_blocks_at_level $file 0
+log_must zfs unmount $TESTPOOL2/$TESTFS1
+log_must zfs unload-key $TESTPOOL2/$TESTFS1
+log_must zpool sync $TESTPOOL2
+log_must zpool scrub $TESTPOOL2
+log_must zpool wait -t scrub $TESTPOOL2
+log_must zpool status -v $TESTPOOL2
+log_must eval "zpool status -v $TESTPOOL2 | \
+    grep \"Permanent errors have been detected\""
+
+log_pass "Verify reporting errors with unloaded keys works"